Python爬取网页生成特定格式的API文档

3 阅读4分钟

并非 Swagger 不好用,而是因为要写项目详细设计文档,需要按照文档规定的特定格式填写 API。这活如果靠人工来写,真是吃力不讨好,纯纯浪费时间。既然 AI 这么发达,为什么不用 AI 写一个 Python 脚本,抓取网页上的所有接口并生成特定格式的文档呢?

实现方案

使用 Python + Playwright(抓接口) + OpenPyXL(生成 Excel)

外网:MAC
内网:Windows

操作步骤

安装工具

联网

pip install playwright openpyxl 
playwright install chromium

内网

  1. Python 离线安装包(如果内网没装 Python)
  2. 所有 Python 库的离线包(核心!)

你至少需要下面 4 个 .whl 后缀文件(它们各自的依赖,例如 openpyxl 依赖的 et_xmlfile,会由 pip download 一并下载,请全部拷贝到内网)

  • playwright
  • openpyxl
  • pyee
  • greenlet

外网下载:

pip download --platform win_amd64 --python-version 310 --only-binary=:all: playwright openpyxl

内网安装(在存放 .whl 文件的目录下执行;Windows 下 pip 不会展开 *.whl 通配符,所以直接写包名): pip install --no-index --find-links=. playwright openpyxl

  3. Playwright 浏览器离线包(必须!)
  • 浏览器离线包:chromium.zip
  • 作用:让 playwright 在内网运行浏览器抓接口

外网下载: 先执行 playwright install chromium --dry-run(该参数只打印浏览器版本、下载地址和安装位置,不会实际下载),再按打印出的地址下载压缩包

或者在下面地址里下载 https://playwright.azureedge.net/builds/chromium/1091/chromium-win64.zip

内网安装:

# 解压路径
C:\Users\你的用户名\AppData\Local\ms-playwright\chromium-1091

# 路径结构(如果解压后多出一层 chrome-win 目录,请以 chrome.exe 的实际位置为准,并同步修改脚本里的 CHROME_PATH)
ms-playwright/ 
└── chromium-1091/ 
    └── chrome.exe

都安装好了后生成Python脚本。

生成Python脚本(api_crawl.py)

from playwright.sync_api import sync_playwright
import json
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill, Border, Side

# ====================== [Configuration: adjust as needed] ======================
TARGET_URL = "http://localhost:8080"  # front-end page the browser navigates to
EXCEL_NAME = "全自动API接口文档.xlsx"  # file name the workbook is saved under
# Chromium executable handed to Playwright's launch(); edit the user name part.
CHROME_PATH = r"C:\Users\你的用户名\AppData\Local\ms-playwright\chromium-1091\chrome.exe"
# ============================================================

# Shared cell styles used by export_excel()
header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")  # solid dark-blue background
white_font = Font(color="FFFFFF", bold=True)  # bold white text for header cells
black_font = Font(color="000000")  # black text for data cells
center = Alignment(horizontal="center", vertical="center")  # centered horizontally and vertically
thin = Side(style="thin")
border = Border(left=thin, right=thin, top=thin, bottom=thin)  # thin line on all four sides

# Display names for common field names (extend as needed).
# get_field_name() looks up only the last dot-separated segment of a field
# path here; a segment without an entry is shown unchanged.
FIELD_TRANSLATION = {
    "id": "ID",
    "name": "名称",
    "title": "标题",
    "code": "编码",
    "type": "类型",
    "status": "状态",
    "value": "值",
    "label": "标签",
    "sort": "排序",
    "remark": "备注",
    "desc": "描述",
    "content": "内容",
    "url": "地址",
    "path": "路径",
    "icon": "图标",
    "parentId": "父ID",
    "order": "顺序",
    "createTime": "创建时间",
    "updateTime": "更新时间",
    "creator": "创建人",
    "pageNo": "页码",
    "pageSize": "每页条数",
    "total": "总条数",
    "records": "数据列表",
    "username": "用户名",
    "password": "密码",
    "phone": "手机号",
    "email": "邮箱",
    "sex": "性别",
    "age": "年龄",
    "address": "地址",
    "audit": "审核状态",
    "enable": "启用状态",
}

def get_field_name(key):
    """Return the display name for a (possibly dotted) field path.

    Only the segment after the last "." is looked up in FIELD_TRANSLATION;
    when that segment has no entry, the segment itself is returned.
    """
    _, _, leaf = key.rpartition(".")
    if leaf in FIELD_TRANSLATION:
        return FIELD_TRANSLATION[leaf]
    return leaf

def parse_fields(obj, prefix=""):
    """Flatten a decoded JSON value into table rows, one per field.

    Each row is ``[seq, field_path, display_name, type, "", "", "O", ""]``,
    matching the eight columns written by export_excel(). Nested objects are
    expanded with dotted paths (``parent.child``); an array is expanded using
    its first element when that element is an object.

    Args:
        obj: Decoded JSON value. A dict is walked key by key; a list is
            described by its first element; anything else yields no rows.
        prefix: Dotted path of the parent field, empty at the top level.

    Returns:
        A list of rows whose first column runs 1..N without gaps or repeats.
    """
    rows = []

    # A top-level JSON array (common for list endpoints) is described by its
    # first element instead of producing an empty table.
    if isinstance(obj, list):
        obj = obj[0] if obj else None

    if isinstance(obj, dict):
        for k, v in obj.items():
            full_key = f"{prefix}.{k}" if prefix else k
            cn_name = get_field_name(full_key)
            if isinstance(v, dict):
                rows.append([0, full_key, cn_name, "object", "", "", "O", ""])
                rows.extend(parse_fields(v, full_key))
            elif isinstance(v, list):
                rows.append([0, full_key, cn_name, "array", "", "", "O", ""])
                if v and isinstance(v[0], dict):
                    rows.extend(parse_fields(v[0], full_key))
            else:
                rows.append([0, full_key, cn_name, type(v).__name__, "", "", "O", ""])

    # Number the rows once the list is complete. Numbering while appending
    # made every nested level restart at 1 (e.g. 1, 1, 3), because each
    # recursive call counted only its own rows.
    for seq, row in enumerate(rows, 1):
        row[0] = seq
    return rows

def _style_header(cell):
    """Apply the shared header look: blue fill, bold white text, centered, bordered."""
    cell.fill = header_fill
    cell.font = white_font
    cell.alignment = center
    cell.border = border


def _write_banner(ws, row, text):
    """Merge columns A-H of ``row`` into one header-styled cell holding ``text``."""
    ws.merge_cells(f"A{row}:H{row}")
    ws[f"A{row}"] = text
    _style_header(ws[f"A{row}"])


def _write_table(ws, row, head, lines):
    """Write the ``head`` row followed by ``lines``; return the next free row."""
    for col, val in enumerate(head, 1):
        _style_header(ws.cell(row=row, column=col, value=val))
    row += 1

    for line in lines:
        for col, val in enumerate(line, 1):
            c = ws.cell(row=row, column=col, value=val)
            c.font = black_font
            c.border = border
        row += 1
    return row


def export_excel(api_list):
    """Write one worksheet per captured API and save the workbook to EXCEL_NAME.

    Sheet layout: URL banner (row 1), method banner (row 2), a blank row, the
    request-parameter table, two blank rows, then the response-parameter table.

    Args:
        api_list: Iterable of dicts with the keys ``"url"``, ``"method"``,
            ``"req_data"`` and ``"resp_data"``. When it is empty, a single
            placeholder sheet is written instead.
    """
    wb = Workbook()
    wb.remove(wb.active)
    head = ["序号", "字段编码", "字段名称", "类型", "长度", "格式", "M/O", "备注"]

    for idx, api in enumerate(api_list, 1):
        ws = wb.create_sheet(title=f"接口{idx}")

        _write_banner(ws, 1, f"接口URL:{api['url']}")
        _write_banner(ws, 2, f"请求方式:{api['method']}")

        # Row 3 is left blank as a separator before the request table.
        _write_banner(ws, 4, "请求参数")
        row = _write_table(ws, 5, head, parse_fields(api["req_data"]))

        # Two blank rows separate the request table from the response table.
        row += 2
        _write_banner(ws, row, "响应参数")
        _write_table(ws, row + 1, head, parse_fields(api["resp_data"]))

        # Column widths
        ws.column_dimensions["A"].width = 6
        ws.column_dimensions["B"].width = 28
        ws.column_dimensions["C"].width = 16
        ws.column_dimensions["D"].width = 10

    # The default sheet was removed above, so with no captured APIs the
    # workbook would have no sheet and save() would raise IndexError.
    if not wb.worksheets:
        ws = wb.create_sheet(title="无接口")
        ws["A1"] = "未捕获到任何接口请求"

    wb.save(EXCEL_NAME)

def run():
    """Open TARGET_URL in Chromium, record XHR/fetch traffic, and export it.

    The browser window stays open for 15 seconds after navigation so the user
    can operate the page. Every XHR/fetch request seen in that window is
    stored under the key ``(url, method)`` with its JSON request body and JSON
    response body when they can be parsed; a later request with the same key
    replaces the earlier one. The collected entries are passed to
    export_excel().
    """
    apis = {}
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False, executable_path=CHROME_PATH)
        try:
            ctx = browser.new_context(ignore_https_errors=True)
            page = ctx.new_page()

            def on_req(r):
                if r.resource_type not in ("xhr", "fetch"):
                    return
                data = {}
                try:
                    body = r.post_data
                    if body:
                        data = json.loads(body)
                except Exception:
                    # Form-encoded, multipart or otherwise non-JSON body: keep
                    # the request in the document with an empty parameter
                    # table instead of dropping the whole interface.
                    data = {}
                apis[(r.url, r.method)] = {
                    "url": r.url,
                    "method": r.method,
                    "req_data": data,
                    "resp_data": {},
                }

            def on_res(res):
                key = (res.request.url, res.request.method)
                if key not in apis:
                    return
                try:
                    apis[key]["resp_data"] = res.json()
                except Exception:
                    # Best effort at the callback boundary: a response without
                    # a readable JSON body keeps the empty placeholder.
                    pass

            page.on("request", on_req)
            page.on("response", on_res)
            page.goto(TARGET_URL, timeout=60000)
            print("已打开页面,请操作触发接口,15秒后自动生成文档...")
            # Adjust how long the page stays open for manual operation here.
            page.wait_for_timeout(15000)
        finally:
            # Close the browser even when navigation fails or times out.
            browser.close()

    if not apis:
        # Nothing was captured; say so rather than reporting a finished file.
        print("未捕获到任何接口请求,未生成文档。")
        return

    export_excel(list(apis.values()))
    print(f"完成!文件:{EXCEL_NAME}")

# Start the capture only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()

执行脚本,生成Excel API文档

python api_crawl.py

会自动弹出浏览器,在页面上执行增删改查等操作即可(必须在脚本等待的 15 秒内操作页面,否则脚本抓不到任何接口请求;时间不够可以调大脚本里的 wait_for_timeout)

由于效果在内网,不方便截图,但效果不错,非常符合我的需求。