[1346] Python ijson 用法

36 阅读5分钟

@[toc]

下面详细介绍 Python ijson 库的用法。ijson 是一个用于流式解析大型 JSON 文件的工具。

安装

pip install ijson

基本概念

ijson 的主要优势在于它不需要将整个 JSON 文件加载到内存中，而是边读取边解析，逐个产出 JSON 元素。

基础用法

1. 基本解析

import ijson

import io

# Sample JSON document used by both approaches below.
json_data = '''
{
    "users": [
        {"id": 1, "name": "Alice", "age": 25},
        {"id": 2, "name": "Bob", "age": 30},
        {"id": 3, "name": "Charlie", "age": 35}
    ],
    "total": 3
}
'''

# Approach 1: parse an in-memory document.
# ijson consumes bytes, so the text is encoded and wrapped in a binary stream
# instead of being handed over as a str.
items = ijson.items(io.BytesIO(json_data.encode('utf-8')), 'users.item')
for user in items:
    print(f"ID: {user['id']}, Name: {user['name']}, Age: {user['age']}")

# Approach 2: parse from a file.
with open('data.json', 'w', encoding='utf-8') as f:
    f.write(json_data)

# Binary mode gives ijson the raw bytes it expects; a text-mode file forces an
# on-the-fly re-encoding and triggers ijson's string-reader warning.
with open('data.json', 'rb') as f:
    users = ijson.items(f, 'users.item')
    for user in users:
        print(f"User: {user}")

2. 解析文件中的特定路径

import ijson

# Stand-in for a large_data.json file: a company with nested departments.
json_structure = '''
{
    "company": "Tech Corp",
    "departments": [
        {
            "name": "Engineering",
            "employees": [
                {"id": 101, "name": "John", "salary": 50000},
                {"id": 102, "name": "Jane", "salary": 60000}
            ]
        },
        {
            "name": "Marketing", 
            "employees": [
                {"id": 201, "name": "Mike", "salary": 45000},
                {"id": 202, "name": "Sarah", "salary": 55000}
            ]
        }
    ]
}
'''

with open('large_data.json', 'w', encoding='utf-8') as f:
    f.write(json_structure)

# Each extraction below re-opens the file because an ijson iterator consumes
# the stream as it goes. Files are opened in binary mode: ijson reads bytes
# and only tolerates text-mode readers with a warning.

# Extract the company name (a single scalar at the top level).
with open('large_data.json', 'rb') as f:
    company = ijson.items(f, 'company')
    print(f"Company: {next(company)}")

# Extract the name of every department.
with open('large_data.json', 'rb') as f:
    dept_names = ijson.items(f, 'departments.item.name')
    print("Departments:")
    for name in dept_names:
        print(f" - {name}")

# Extract every employee across all departments.
with open('large_data.json', 'rb') as f:
    employees = ijson.items(f, 'departments.item.employees.item')
    print("\nAll Employees:")
    for emp in employees:
        print(f" - {emp['name']} (ID: {emp['id']})")

高级用法

1. 使用事件解析

import ijson

json_data = '''
{
    "users": [
        {"id": 1, "name": "Alice", "hobbies": ["reading", "swimming"]},
        {"id": 2, "name": "Bob", "hobbies": ["gaming", "coding"]}
    ]
}
'''

with open('events_data.json', 'w', encoding='utf-8') as f:
    f.write(json_data)

# Extra message printed when a container opens or closes, keyed by event name.
container_messages = {
    'start_array': "开始数组",
    'end_array': "结束数组",
    'start_map': "开始对象",
    'end_map': "结束对象",
}

print("解析事件流:")
# Binary mode: ijson reads bytes and warns when given a text-mode reader.
with open('events_data.json', 'rb') as f:
    parser = ijson.parse(f)

    for prefix, event, value in parser:
        print(f"路径: {prefix:20} 事件: {event:10} 值: {value}")

        # Container boundaries get an additional, event-specific line.
        message = container_messages.get(event)
        if message is not None:
            print(f"{message}: {prefix}")

2. 处理大型 JSON 数组

import ijson
import json

# Build 1000 synthetic products and dump them to large_products.json.
large_data = {
    "products": [
        {
            "id": i + 1,
            "name": f"Product {i + 1}",
            "price": i * 10 + 5.99,
            "category": f"Category {i % 5}",
            "in_stock": i % 3 == 0,
        }
        for i in range(1000)
    ]
}

with open('large_products.json', 'w') as out_file:
    json.dump(large_data, out_file)

# Stream the product file with ijson instead of loading it whole.
def process_expensive_products(price_threshold=500.0):
    """Print every product priced above ``price_threshold`` and count them.

    Products are streamed one at a time from ``large_products.json`` (the
    ``products`` array), so the whole file is never held in memory.

    Args:
        price_threshold: Products whose ``price`` is strictly greater than
            this value are reported.

    Returns:
        The number of products that exceeded the threshold.
    """
    count = 0
    # Binary mode: ijson reads bytes and warns when given a text-mode reader.
    with open('large_products.json', 'rb') as f:
        for product in ijson.items(f, 'products.item'):
            if product['price'] > price_threshold:
                count += 1
                print(f"高价商品: {product['name']} - ${product['price']:.2f}")

    return count

# Report how many products cost more than 500.
expensive_count = process_expensive_products(price_threshold=500.0)
print(f"\n高价商品总数: {expensive_count}")

# Tally products per category.
def count_by_category():
    """Count how many products fall into each category.

    Streams the ``products`` array of ``large_products.json`` and tallies the
    ``category`` field of each product.

    Returns:
        A dict mapping category name to the number of products in it, in
        order of first appearance.
    """
    from collections import Counter

    # Binary mode: ijson reads bytes and warns when given a text-mode reader.
    with open('large_products.json', 'rb') as f:
        products = ijson.items(f, 'products.item')
        # The generator is consumed here, while the file is still open.
        tally = Counter(product['category'] for product in products)

    # Return a plain dict, as before, rather than exposing the Counter.
    return dict(tally)

# Print one line per category with its product count.
category_stats = count_by_category()
print("\n按类别统计:")
for label, total in category_stats.items():
    print(f"{label}: {total} 个商品")

3. 处理嵌套结构

import ijson

complex_data = '''
{
    "school": {
        "name": "Central High",
        "classes": [
            {
                "grade": "10A",
                "students": [
                    {"name": "Alice", "scores": {"math": 95, "english": 88}},
                    {"name": "Bob", "scores": {"math": 87, "english": 92}}
                ]
            },
            {
                "grade": "10B", 
                "students": [
                    {"name": "Charlie", "scores": {"math": 78, "english": 85}},
                    {"name": "Diana", "scores": {"math": 92, "english": 90}}
                ]
            }
        ]
    }
}
'''

with open('school_data.json', 'w', encoding='utf-8') as f:
    f.write(complex_data)

# Print each student's math score. The prefix walks school -> classes[] ->
# students[], so every student object in every class is yielded in turn.
print("学生数学成绩:")
# Binary mode: ijson reads bytes and warns when given a text-mode reader.
with open('school_data.json', 'rb') as f:
    students = ijson.items(f, 'school.classes.item.students.item')
    for student in students:
        print(f"{student['name']}: 数学 {student['scores']['math']}分")

# Extract the scores found at one specific path.
def get_english_scores():
    """Collect every student's English score from ``school_data.json``.

    Walks the low-level event stream and pairs each English score with the
    most recently seen student name.

    Returns:
        A list of ``(student_name, score)`` tuples in document order.
    """
    # Exact prefixes, so unrelated keys that merely end in "name" (such as
    # the school's own name) are never mistaken for a student name.
    name_prefix = 'school.classes.item.students.item.name'
    english_prefix = 'school.classes.item.students.item.scores.english'

    scores = []
    # Binary mode: ijson reads bytes and warns when given a text-mode reader.
    with open('school_data.json', 'rb') as f:
        current_student = None

        for prefix, event, value in ijson.parse(f):
            if prefix == name_prefix and event == 'string':
                current_student = value
            elif prefix == english_prefix and event == 'number':
                scores.append((current_student, value))

    return scores

# Print each student's English score on its own line.
english_scores = get_english_scores()
print("\n英语成绩:")
for pair in english_scores:
    name, points = pair
    print(f"{name}: {points}分")

4. 错误处理和性能优化

import ijson
import json
import time

def safe_json_parsing(file_path):
    """Stream a top-level JSON array from ``file_path`` and count its items.

    Every failure is reported with a printed message rather than an
    exception, except for errors from ``open`` other than a missing file,
    which still propagate.

    Args:
        file_path: Path of the JSON file to parse.

    Returns:
        None. Progress and the final item count are printed.
    """
    try:
        # Binary mode: ijson reads bytes and warns when given a text-mode reader.
        f = open(file_path, 'rb')
    except FileNotFoundError:
        print(f"文件不存在: {file_path}")
        return

    with f:
        try:
            # Pull a single event first as a cheap sanity check of the input.
            first_event = next(ijson.parse(f), None)
            if first_event is None:
                print("文件为空")
                return

            # Rewind so the real pass starts from the beginning again.
            f.seek(0)

            count = 0
            for count, _item in enumerate(ijson.items(f, 'item'), 1):
                # Per-item processing would go here.
                if count % 1000 == 0:
                    print(f"已处理 {count} 个项目...")

            print(f"总共处理了 {count} 个项目")

        # Decoding happens while parsing, so the encoding error must be caught
        # here, ahead of the catch-all, for its message to be reachable.
        except UnicodeDecodeError:
            print("文件编码错误")
        except ijson.JSONError as e:
            print(f"JSON 解析错误: {e}")
        except Exception as e:
            # Deliberate best-effort boundary: report and carry on.
            print(f"其他错误: {e}")

# Performance comparison.
def performance_comparison():
    """Time the standard ``json`` module against ijson on the same file.

    Writes 10000 small objects to ``test_large.json``, loads the file once
    with ``json.load`` and once by streaming with ``ijson.items``, then
    prints both elapsed times and the number of items ijson yielded.
    """
    # Create the test data.
    test_data = [{"id": i, "data": "x" * 100} for i in range(10000)]

    with open('test_large.json', 'w') as f:
        json.dump(test_data, f)

    # perf_counter is monotonic and high-resolution, which makes it the
    # right clock for measuring short intervals.

    # Standard json module: parses the whole document in one go.
    start_time = time.perf_counter()
    with open('test_large.json', 'r') as f:
        json.load(f)  # Result discarded; only the load time matters.
    std_time = time.perf_counter() - start_time

    # ijson: streams items one at a time. Binary mode avoids ijson's
    # text-reader warning and the extra re-encoding it implies.
    start_time = time.perf_counter()
    with open('test_large.json', 'rb') as f:
        count = sum(1 for _ in ijson.items(f, 'item'))
    ijson_time = time.perf_counter() - start_time

    print(f"标准 json 库: {std_time:.4f} 秒")
    print(f"ijson 流式解析: {ijson_time:.4f} 秒")
    print(f"处理项目数: {count}")

# Run the examples.
if __name__ == "__main__":
    # Create a small test file: a top-level array of three objects.
    sample_data = [
        {"id": 1, "name": "Item 1", "value": 100},
        {"id": 2, "name": "Item 2", "value": 200},
        {"id": 3, "name": "Item 3", "value": 300},
    ]

    with open('sample.json', 'w') as f:
        json.dump(sample_data, f)

    # Exercise the safe parser.
    safe_json_parsing('sample.json')

    # Run the performance comparison.
    performance_comparison()

实际应用场景

1. 处理 API 响应流

import ijson
import requests

def stream_large_api_response(url="https://api.example.com/large-data", timeout=30.0):
    """Stream a large JSON API response and process it item by item.

    Args:
        url: Endpoint whose response body is a JSON object with an
            ``items`` array.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The context manager releases the connection even if parsing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()

        # response.raw is the undecoded stream; ask urllib3 to undo any
        # gzip/deflate Content-Encoding so ijson receives plain JSON bytes.
        response.raw.decode_content = True

        # Handle one item at a time so the full body never sits in memory.
        for item in ijson.items(response.raw, 'items.item'):
            process_item(item)

def process_item(item):
    """Print the item's id and name, with placeholders for missing keys."""
    item_id = item.get('id', 'N/A')
    item_name = item.get('name', 'Unnamed')
    print(f"处理: {item_id} - {item_name}")

2. 日志文件分析

import ijson

def analyze_json_logs(log_file_path):
    """Count error and warning entries in a JSON-lines log file.

    Each non-blank line is expected to hold one standalone JSON object with
    an optional ``level`` and ``message``. Malformed lines are reported and
    skipped; lines whose top-level value is not an object are ignored.

    Args:
        log_file_path: Path of the log file to analyse.

    Returns:
        None. Error details and the final totals are printed.
    """
    import io

    error_count = 0
    warning_count = 0

    # Binary mode: ijson reads bytes, so each line is passed on undecoded.
    with open(log_file_path, 'rb') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                # The empty prefix selects the top-level value itself;
                # 'item' would only match elements of a top-level array.
                entry = next(ijson.items(io.BytesIO(line), ''), None)
            except ijson.JSONError:
                print(f"第 {line_num} 行 JSON 格式错误")
                continue

            if not isinstance(entry, dict):
                continue

            # Tolerate a missing or null level instead of crashing on it.
            level = str(entry.get('level') or '').lower()
            if level == 'error':
                error_count += 1
                print(f"错误在第 {line_num} 行: {entry.get('message')}")
            elif level == 'warning':
                warning_count += 1

    print("\n统计结果:")
    print(f"错误数: {error_count}")
    print(f"警告数: {warning_count}")

# Example usage: analyse the application's JSON log file.
log_path = 'app_logs.json'
analyze_json_logs(log_path)

ijson 特别适合处理以下几种情况:

  • 非常大的 JSON 文件(GB 级别)
  • 网络流式 JSON 数据
  • 只需要提取部分数据的场景
  • 内存受限的环境

请记住在处理完成后及时关闭文件（推荐使用 with 语句），并在生产环境中添加适当的错误处理。