import os
import glob
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
# Directory scanned for input "*.csv" files, and directory receiving all outputs.
input_folder = r"/Users/***/Downloads/python/input"
output_folder = r"/Users/***/Downloads/python/output"

# Bounds applied to the longitude_wgs84 / latitude_wgs84 columns of each file.
lon_min = 119.0
lon_max = 121.0
lat_min = 29.0
lat_max = 31.0

# Columns discarded from every input table before filtering.
columns_to_drop = [
    "parent", "distance", "pcode", "importance", "biz_ext",
    "recommend", "photos", "discount_num", "poiweight", "citycode",
    "children", "alias", "tel", "tag", "event",
    "entr_location", "indoor_map", "email", "website", "biz_type",
    "postcode", "match", "business_area", "indoor_data", "childtype",
    "exit_location", "shopid", "navi_poiid", "groupbuy_num",
    "typecode_single",
]

# Original column name -> short name used in the merged outputs.
rename_columns = {
    "longitude_wgs84": "lon",
    "latitude_wgs84": "lat",
    "大类": "big",
    "中类": "mid",
    "小类": "small",
}
# Make sure the output directory exists before anything is written to it.
os.makedirs(output_folder, exist_ok=True)


def process_single_file(csv_path):
    """Load one CSV file and return its cleaned, spatially filtered rows.

    Steps, in order:
      1. Read the file as GB18030-encoded CSV.
      2. Remove the columns listed in ``columns_to_drop``.
      3. Keep rows whose ``longitude_wgs84`` / ``latitude_wgs84`` fall inside
         the ``lon_min..lon_max`` / ``lat_min..lat_max`` box.
      4. Rename columns according to ``rename_columns``.
      5. Discard rows with a missing ``name``, ``big``, ``mid`` or ``small``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A new ``pandas.DataFrame`` holding the surviving rows.

    Raises:
        KeyError: If the coordinate columns, or any of ``name`` / ``big`` /
            ``mid`` / ``small`` (after renaming), are absent from the file.
    """
    df = pd.read_csv(csv_path, encoding='gb18030')

    # These columns are discarded anyway, so a file that lacks some of them
    # must not be rejected: errors='ignore' skips labels that are not present.
    df = df.drop(columns=columns_to_drop, errors='ignore')

    inside_box = (
        df['longitude_wgs84'].between(lon_min, lon_max)
        & df['latitude_wgs84'].between(lat_min, lat_max)
    )
    df = df[inside_box].copy()

    df = df.rename(columns=rename_columns)
    return df.dropna(subset=['name', 'big', 'mid', 'small'])
# ---------------------------------------------------------------------------
# Main script: process every CSV in input_folder, merge the results and write
# them as CSV, Stata (.dta) and ESRI Shapefile into output_folder.
# ---------------------------------------------------------------------------

# Glob once and reuse the result for both the diagnostic and the loop.
csv_files = glob.glob(os.path.join(input_folder, "*.csv"))
print("输入路径是否存在:", os.path.exists(input_folder))
print("找到的CSV文件数量:", len(csv_files))

all_data = []
for file in csv_files:
    # Best effort per file: a file that cannot be processed is reported and
    # skipped so that the remaining files are still merged.
    try:
        processed_df = process_single_file(file)
    except Exception as e:
        print(f"处理文件 {os.path.basename(file)} 失败: {str(e)}")
    else:
        all_data.append(processed_df)

if not all_data:
    raise ValueError("没有成功处理任何文件,请检查输入路径和文件格式")

final_df = pd.concat(all_data, ignore_index=True)

# --- CSV output -------------------------------------------------------------
csv_output = os.path.join(output_folder, "合并结果.csv")
final_df.to_csv(csv_output, index=False, encoding='utf-8')

# --- Normalise dtypes and column names for the remaining writers -------------
# Object columns become numeric when every value parses as a number, and plain
# strings otherwise, so each column ends up with a single consistent type.
# NOTE(review): astype(str) renders missing values as the text 'nan'.
for col in final_df.columns:
    if final_df[col].dtype == 'object':
        try:
            final_df[col] = pd.to_numeric(final_df[col], errors='raise')
        except (ValueError, TypeError):
            final_df[col] = final_df[col].astype(str)

# Cap names at 31 characters and keep only alphanumerics and underscores.
final_df.columns = [
    ''.join(ch for ch in col[:31] if ch.isalnum() or ch == '_')
    for col in final_df.columns
]

# The string columns are deliberately left as Python str here. Encoding them
# to bytes and converting back with astype(str) is version-dependent and can
# turn the text into literal "b'...'" representations.

# --- Stata output -----------------------------------------------------------
dta_output = os.path.join(output_folder, "合并结果.dta")
try:
    final_df.to_stata(dta_output, write_index=False, version=118)
except Exception as e:
    # A failed Stata export is reported but does not stop the Shapefile export.
    print(f"Failed to save Stata file: {str(e)}")

# --- Shapefile output -------------------------------------------------------
# Build one point per row from the lon/lat columns, declared as EPSG:4326.
geometry = gpd.points_from_xy(final_df['lon'], final_df['lat'])
gdf = gpd.GeoDataFrame(final_df, geometry=geometry, crs="EPSG:4326")

# Shapefile attribute names are limited to 10 characters.
# NOTE(review): truncation can make two long column names collide.
gdf.columns = [col[:10] for col in gdf.columns]

shp_output = os.path.join(output_folder, "空间数据.shp")
gdf.to_file(shp_output, encoding='utf-8')

print(f"处理完成!结果已保存至:{output_folder}")