将行政区划 Excel 转换为树形 JSON

0 阅读4分钟

表格见 腾讯位置服务 WebService API

image.png

import pandas as pd
import json
import os
import sys


# 增加递归深度限制,防止深层嵌套导致崩溃
sys.setrecursionlimit(5000)

def parse_district_tree(file_path, sheet_name=1, output_file='tree.json', minify_json=False):
    """
    解析行政区划Excel并生成树形JSON结构
    
    参数:
        file_path: 输入Excel文件路径
        sheet_name: Sheet名称或索引
        output_file: 输出文件路径
        minify_json: 
            True  -> 紧凑模式 (去除换行和空格,体积小,适合程序读取/网络传输)
            False -> 美化模式 (带缩进换行,体积大,适合人工查看)
    """
    
    print(f"开始任务...")
    print(f"输入文件: {file_path}")
    print(f"Sheet索引/名称: {sheet_name}")
    print(f"输出文件: {output_file}")
    print(f"模式: {'紧凑压缩 (Minify)' if minify_json else '美化格式 (Pretty)'}")
    print("-" * 50)

    # 1. 读取数据
    if not os.path.exists(file_path):
        print(f"错误:找不到文件 '{file_path}'")
        return

    try:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
    except Exception as e:
        print(f"读取Excel失败: {e}")
        return

    if len(df.columns) < 2:
        print("错误:表格列数不足2列。")
        return
    
    # 标准化列名
    df = df.iloc[:, :2].copy()
    df.columns = ['code', 'full_path']
    
    # 数据清洗
    df = df.dropna(subset=['code', 'full_path'])
    df['code'] = pd.to_numeric(df['code'], errors='coerce').astype('Int64').astype(str)
    df['full_path'] = df['full_path'].astype(str).str.strip()

    # 过滤9位代码
    initial_count = len(df)
    df = df[df['code'].str.len() == 9]
    print(f"原始行数: {initial_count} | 有效9位代码行数: {len(df)}")

    if len(df) == 0:
        print("警告:未找到有效的9位行政区划代码。")
        return

    tree_map = {} 

    def get_node(code, name):
        if code not in tree_map:
            tree_map[code] = {
                "value": code,
                "text": name,
                "children": []
            }
        else:
            if name and (not tree_map[code]['text'] or tree_map[code]['text'] != name):
                tree_map[code]['text'] = name
        return tree_map[code]

    print("正在构建拓扑树...")
    
    for _, row in df.iterrows():
        code_9 = row['code']
        full_path = row['full_path']
        
        parts = [p.strip() for p in full_path.split(',') if p.strip()]
        town_name = parts[-1] if parts else "未知街道"
        
        province_name = city_name = district_name = ""
        province_code = city_code = ""
        district_code = code_9[:6]

        # 直辖市 vs 普通省 判断逻辑
        if len(parts) == 4:
            # 直辖市: 中国, 北京市, 东城区, 街道
            province_name = parts[1]
            city_name = parts[1]
            district_name = parts[2]
            p_prefix = code_9[:2]
            province_code = p_prefix + "0000"
            city_code = province_code 
        elif len(parts) >= 5:
            # 普通省: 中国, 河北省, 石家庄市, 赞皇县, 镇
            province_name = parts[1]
            city_name = parts[2]
            district_name = parts[3]
            p_prefix = code_9[:2]
            c_prefix = code_9[:4]
            province_code = p_prefix + "0000"
            city_code = c_prefix + "00"
        else:
            province_name = "未知省"
            city_name = "未知市"
            district_name = "未知区"
            province_code = code_9[:2] + "0000"
            city_code = code_9[:4] + "00"

        # --- 节点挂载 ---
        node_prov = get_node(province_code, province_name)
        
        if city_code == province_code:
            node_city = node_prov
        else:
            node_city = get_node(city_code, city_name)
            if not any(c['value'] == city_code for c in node_prov['children']):
                node_prov['children'].append(node_city)

        node_dist = get_node(district_code, district_name)
        if district_code != city_code:
            if not any(c['value'] == district_code for c in node_city['children']):
                node_city['children'].append(node_dist)
        else:
            node_dist = node_city

        node_town = get_node(code_9, town_name)
        if code_9 != district_code:
            if not any(c['value'] == code_9 for c in node_dist['children']):
                node_dist['children'].append(node_town)

    # 排序
    root_nodes = [node for code, node in tree_map.items() if code.endswith('0000')]
    root_nodes.sort(key=lambda x: x['value'])
    
    def sort_children(n):
        if not n.get('children'):
            return
        n['children'].sort(key=lambda x: x['value'])
        for child in n['children']:
            if child['value'] == n['value']:
                continue
            sort_children(child)
    
    print("正在排序节点...")
    for r in root_nodes:
        sort_children(r)

    # 写入文件
    print(f"正在写入文件 {output_file} ...")
    try:
        # 核心修改:根据 minify_json 决定 indent 参数
        # None 表示紧凑模式 (无换行无缩进)
        # 2 表示美化模式 (2空格缩进)
        indent_val = None if minify_json else 2
        
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(root_nodes, f, ensure_ascii=False, indent=indent_val)
        
        # 统计大小
        file_size = os.path.getsize(output_file)
        size_mb = file_size / (1024 * 1024)
        
        print("-" * 50)
        print(f"成功完成!")
        print(f"省级节点数: {len(root_nodes)}")
        print(f"总节点数: {len(tree_map)}")
        print(f"文件大小: {size_mb:.2f} MB")
        
        if minify_json:
            print(f"已启用紧凑模式 (去除了换行和空格),文件体积已优化。")
        else:
            print(f"已启用美化模式 (包含换行和缩进),方便人工阅读。")
            
    except Exception as e:
        print(f"写入文件失败: {e}")

if __name__ == '__main__':
    # ================= 配置区域 =================
    
    # 1. 输入文件
    INPUT_FILE = '行政区划编码表_20251127.xlsx'
    
    # 2. Sheet 索引 (0是第一个, 1是第二个)
    SHEET_INDEX = 1 
    
    # 3. 输出文件名
    OUTPUT_FILE = 'district_tree.json'
    
    # 4. 是否压缩 JSON (去除换行和空格)
    # True  -> 紧凑模式
    # False -> 美化模式
    MINIFY_JSON = False 
    
    # ===========================================

    # 自动查找文件逻辑
    if not os.path.exists(INPUT_FILE):
        xlsx_files = [f for f in os.listdir('.') if f.endswith('.xlsx')]
        if xlsx_files:
            print(f"未找到 '{INPUT_FILE}',自动使用: {xlsx_files[0]}")
            INPUT_FILE = xlsx_files[0]
        else:
            print("当前目录下未找到 Excel 文件")
            sys.exit(1)

    parse_district_tree(
        file_path=INPUT_FILE,
        sheet_name=SHEET_INDEX,
        output_file=OUTPUT_FILE,
        minify_json=MINIFY_JSON
    )

image.png