Python 筛选 csv 数据,并生成 shp 格式地理信息文件

83 阅读3分钟
import os
import glob
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# 注意:首次运行需要安装以下库(取消注释执行)
# !pip install pandas geopandas pyshp shapely

# ================================
# ⚠️ Step 0: user-configurable parameters
# ================================
# Folder containing the raw input CSV files.
input_folder = r"/Users/***/Downloads/python/input"  # ⚠️ Customize: replace with the actual input path

# Folder where all result files are written.
output_folder = r"/Users/***/Downloads/python/output"  # ⚠️ Customize: replace with the desired output path

# Hangzhou administrative-region bounding box (adjust to your study area).
lon_min, lon_max = 119.0, 121.0  # ⚠️ Customize: longitude range
lat_min, lat_max = 29.0, 31.0  # ⚠️ Customize: latitude range

# Columns removed from every input CSV before further processing.
columns_to_drop = [
    'parent',
    'distance',
    'pcode',
    'importance',
    'biz_ext',
    'recommend',
    'photos',
    'discount_num',
    'poiweight',
    'citycode',
    'children',
    'alias',
    'tel',
    'tag',
    'event',
    'entr_location',
    'indoor_map',
    'email',
    'website',
    'biz_type',
    'postcode',
    'match',
    'business_area',
    'indoor_data',
    'childtype',
    'exit_location',
    'shopid',
    'navi_poiid',
    'groupbuy_num',
    'typecode_single',
]  # Customize: replace with the actual column names to drop

# Mapping of original column names to the short names used downstream.
rename_columns = {
    'longitude_wgs84': 'lon',  # Customize: old name -> new name
    'latitude_wgs84': 'lat',  # add more renames in the same format
    '大类': 'big',  # POI major category
    '中类': 'mid',  # POI middle category
    '小类': 'small',  # POI minor category
}

# ================================
# Step 1: create the output folder (no error if it already exists)
# ================================
os.makedirs(output_folder, exist_ok=True)


# ================================
# Step 2: 处理单个CSV文件的函数
# ================================
def process_single_file(csv_path, drop_cols=None, lon_range=None,
                        lat_range=None, renames=None,
                        required=('name', 'big', 'mid', 'small')):
    """Read one POI CSV, drop unwanted columns, filter by bounding box,
    rename columns, and drop rows missing key fields.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read (decoded as gb18030; try
        gbk/utf-8/GB2312 if that raises a decode error).
    drop_cols : list[str] | None
        Columns to remove; defaults to the module-level ``columns_to_drop``.
        Columns absent from the file are ignored.
    lon_range, lat_range : tuple[float, float] | None
        Inclusive (min, max) bounds applied to the original
        ``longitude_wgs84`` / ``latitude_wgs84`` columns; default to the
        module-level ``lon_min``/``lon_max`` and ``lat_min``/``lat_max``.
    renames : dict[str, str] | None
        Column rename mapping; defaults to the module-level
        ``rename_columns``.
    required : tuple[str, ...]
        Rows with a missing value in any of these (post-rename) columns
        are dropped.

    Returns
    -------
    pandas.DataFrame
        The cleaned, filtered data.
    """
    if drop_cols is None:
        drop_cols = columns_to_drop
    if lon_range is None:
        lon_range = (lon_min, lon_max)
    if lat_range is None:
        lat_range = (lat_min, lat_max)
    if renames is None:
        renames = rename_columns

    df = pd.read_csv(csv_path, encoding='gb18030')

    # 1. Drop unneeded columns. Bug fix: errors='ignore' so columns absent
    #    from a particular file no longer raise KeyError (the original
    #    comment promised this but the argument was missing).
    df = df.drop(columns=drop_cols, errors='ignore')

    # 2. Keep only rows inside the bounding box (filter happens before the
    #    rename, so the original WGS84 column names are used here).
    mask = (df['longitude_wgs84'].between(*lon_range)
            & df['latitude_wgs84'].between(*lat_range))
    df = df[mask].copy()

    # 3. Rename columns (e.g. longitude_wgs84 -> lon).
    df = df.rename(columns=renames)

    # 4. Drop rows missing any key field.
    df = df.dropna(subset=list(required))

    return df


# Debug aid: confirm the input path exists and count the CSV files found.
print("输入路径是否存在:", os.path.exists(input_folder))

print("找到的CSV文件数量:", len(glob.glob(os.path.join(input_folder, "*.csv"))))

# ================================
# Step 3: batch-process every CSV file
# ================================
# Collect the paths of all CSV files in the input folder.
csv_files = glob.glob(os.path.join(input_folder, "*.csv"))

# Process each file independently; a failure in one file is reported but
# does not abort the rest of the batch.
all_data = []
for file in csv_files:
    try:
        all_data.append(process_single_file(file))
    except Exception as e:
        print(f"处理文件 {os.path.basename(file)} 失败: {str(e)}")

# Nothing processed at all means the configuration is wrong — stop here.
if not all_data:
    raise ValueError("没有成功处理任何文件,请检查输入路径和文件格式")

# Stack every per-file result into one DataFrame with a fresh index.
final_df = pd.concat(all_data, ignore_index=True)

# ================================
# Step 4: write the tabular result files
# ================================
# CSV output (UTF-8 so downstream tools read it consistently).
csv_output = os.path.join(output_folder, "合并结果.csv")
final_df.to_csv(csv_output, index=False, encoding='utf-8')

# Stata cannot store arbitrary Python objects: convert each object column
# to numeric where possible, otherwise force it to plain strings.
for col in final_df.columns:
    if final_df[col].dtype == 'object':
        try:
            final_df[col] = pd.to_numeric(final_df[col], errors='raise')
        except (ValueError, TypeError):
            # Bug fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. pd.to_numeric raises
            # ValueError/TypeError on non-numeric data.
            final_df[col] = final_df[col].astype(str)

# Stata variable names: at most 31 characters, alphanumerics/underscore only.
final_df.columns = [col[:31] if len(col) > 31 else col for col in final_df.columns]
final_df.columns = [''.join(e for e in col if e.isalnum() or e == '_') for col in final_df.columns]

# Bug fix: the previous str -> bytes -> astype(str) round trip corrupted
# every text value into its "b'...'" repr. version=118 .dta files are
# UTF-8 natively, so no manual re-encoding is needed at all.

# Stata output (.dta); version 118 supports Unicode names and values.
# Best-effort: report a failure but keep going to the shapefile step.
dta_output = os.path.join(output_folder, "合并结果.dta")
try:
    final_df.to_stata(dta_output, write_index=False, version=118)
except Exception as e:
    print(f"Failed to save Stata file: {str(e)}")

# Shapefile output (geospatial data).
# Build point geometries from the renamed lon/lat columns (WGS84).
point_geoms = gpd.points_from_xy(final_df['lon'], final_df['lat'])
gdf = gpd.GeoDataFrame(final_df, geometry=point_geoms, crs="EPSG:4326")

# The Shapefile/DBF format limits field names to 10 characters.
gdf.columns = [col[:10] for col in gdf.columns]

shp_output = os.path.join(output_folder, "空间数据.shp")
gdf.to_file(shp_output, encoding='utf-8')

print(f"处理完成!结果已保存至:{output_folder}")