@[toc]
我来详细介绍一下 Python ijson 库的用法,这是一个用于流式解析大型 JSON 文件的工具。
安装
pip install ijson
基本概念
ijson 的主要优势在于它不需要将整个 JSON 文件加载到内存中,而是逐个读取和解析 JSON 元素。
基础用法
1. 基本解析
import ijson
# Sample JSON document shared by both approaches below.
json_data = '''
{
"users": [
{"id": 1, "name": "Alice", "age": 25},
{"id": 2, "name": "Bob", "age": 30},
{"id": 3, "name": "Charlie", "age": 35}
],
"total": 3
}
'''

# Approach 1: parse data that is already in memory.
# ijson works on bytes; a str source is deprecated (it gets re-encoded
# internally and raises a warning), so encode it explicitly.
items = ijson.items(json_data.encode('utf-8'), 'users.item')
for user in items:
    print(f"ID: {user['id']}, Name: {user['name']}, Age: {user['age']}")

# Approach 2: parse from a file.
with open('data.json', 'w', encoding='utf-8') as f:
    f.write(json_data)

# Binary mode lets ijson read raw bytes and do the UTF-8 decoding itself;
# text-mode file objects are deprecated as an input source.
with open('data.json', 'rb') as f:
    # 'users.item' selects every element of the top-level "users" array.
    users = ijson.items(f, 'users.item')
    for user in users:
        print(f"User: {user}")
2. 解析文件中的特定路径
import ijson
# Stand-in for a large file: a company with nested departments/employees.
json_structure = '''
{
"company": "Tech Corp",
"departments": [
{
"name": "Engineering",
"employees": [
{"id": 101, "name": "John", "salary": 50000},
{"id": 102, "name": "Jane", "salary": 60000}
]
},
{
"name": "Marketing",
"employees": [
{"id": 201, "name": "Mike", "salary": 45000},
{"id": 202, "name": "Sarah", "salary": 55000}
]
}
]
}
'''
with open('large_data.json', 'w', encoding='utf-8') as f:
    f.write(json_structure)

# Each query below re-opens the file because an ijson generator consumes
# the stream it is given. Files are opened in binary mode: ijson expects
# bytes, and text-mode sources are deprecated.

# Company name: a single scalar at the top-level "company" key.
with open('large_data.json', 'rb') as f:
    company = ijson.items(f, 'company')
    print(f"Company: {next(company)}")

# Department names: the "name" of every element of "departments".
with open('large_data.json', 'rb') as f:
    dept_names = ijson.items(f, 'departments.item.name')
    print("Departments:")
    for name in dept_names:
        print(f" - {name}")

# All employees: every element of every department's "employees" array.
with open('large_data.json', 'rb') as f:
    employees = ijson.items(f, 'departments.item.employees.item')
    print("\nAll Employees:")
    for emp in employees:
        print(f" - {emp['name']} (ID: {emp['id']})")
高级用法
1. 使用事件解析
import ijson
# Sample document with nested arrays, used to show the raw event stream.
json_data = '''
{
"users": [
{"id": 1, "name": "Alice", "hobbies": ["reading", "swimming"]},
{"id": 2, "name": "Bob", "hobbies": ["gaming", "coding"]}
]
}
'''
with open('events_data.json', 'w', encoding='utf-8') as f:
    f.write(json_data)

print("解析事件流:")
# Binary mode: ijson expects bytes; text-mode sources are deprecated.
with open('events_data.json', 'rb') as f:
    # ijson.parse yields (prefix, event, value) triples: the dotted path of
    # the current location, the event name, and the associated value.
    for prefix, event, value in ijson.parse(f):
        print(f"路径: {prefix:20} 事件: {event:10} 值: {value}")
        # Structural events can be handled individually by event type.
        if event == 'start_array':
            print(f"开始数组: {prefix}")
        elif event == 'end_array':
            print(f"结束数组: {prefix}")
        elif event == 'start_map':
            print(f"开始对象: {prefix}")
        elif event == 'end_map':
            print(f"结束对象: {prefix}")
2. 处理大型 JSON 数组
import ijson
import json
# Build a synthetic catalogue of 1000 products and persist it as JSON so
# the streaming examples have a file to read.
large_data = {
    "products": [
        {
            "id": idx + 1,
            "name": f"Product {idx + 1}",
            "price": idx * 10 + 5.99,
            "category": f"Category {idx % 5}",
            "in_stock": idx % 3 == 0,
        }
        for idx in range(1000)
    ]
}
with open('large_products.json', 'w') as f:
    json.dump(large_data, f)
# Stream the product file with ijson instead of loading it whole.
def process_expensive_products(price_threshold=500.0,
                               file_path='large_products.json'):
    """Print and count the products priced above *price_threshold*.

    Products are read one at a time from the top-level "products" array,
    so the whole file is never held in memory.

    Args:
        price_threshold: Only products whose "price" is strictly greater
            than this value are reported.
        file_path: JSON file to scan; defaults to the catalogue written
            by the example above.

    Returns:
        The number of products above the threshold.
    """
    count = 0
    # Binary mode: ijson expects bytes; text-mode sources are deprecated.
    with open(file_path, 'rb') as f:
        for product in ijson.items(f, 'products.item'):
            if product['price'] > price_threshold:
                count += 1
                print(f"高价商品: {product['name']} - ${product['price']:.2f}")
    return count
# Run the filter with a 500.0 threshold and report how many products matched.
expensive_count = process_expensive_products(price_threshold=500.0)
print(f"\n高价商品总数: {expensive_count}")
# Tally products per category.
def count_by_category(file_path='large_products.json'):
    """Count how many products fall into each category.

    Args:
        file_path: JSON file whose top-level "products" array is scanned;
            defaults to the catalogue written by the example above.

    Returns:
        A dict mapping each "category" value to its number of products,
        in first-seen order.
    """
    # Local import keeps this snippet self-contained.
    from collections import Counter

    # Binary mode: ijson expects bytes; text-mode sources are deprecated.
    with open(file_path, 'rb') as f:
        tally = Counter(
            product['category']
            for product in ijson.items(f, 'products.item')
        )
    # Hand back a plain dict, as before.
    return dict(tally)
# Collect the per-category totals and print one line per category.
category_stats = count_by_category()
print("\n按类别统计:")
for label, total in category_stats.items():
    print(f"{label}: {total} 个商品")
3. 处理嵌套结构
import ijson
# Deeply nested sample: school -> classes -> students -> scores.
complex_data = '''
{
"school": {
"name": "Central High",
"classes": [
{
"grade": "10A",
"students": [
{"name": "Alice", "scores": {"math": 95, "english": 88}},
{"name": "Bob", "scores": {"math": 87, "english": 92}}
]
},
{
"grade": "10B",
"students": [
{"name": "Charlie", "scores": {"math": 78, "english": 85}},
{"name": "Diana", "scores": {"math": 92, "english": 90}}
]
}
]
}
}
'''
with open('school_data.json', 'w', encoding='utf-8') as f:
    f.write(complex_data)

# Extract each student's math score.
print("学生数学成绩:")
# Binary mode: ijson expects bytes; text-mode sources are deprecated.
with open('school_data.json', 'rb') as f:
    # The prefix walks into every class and yields each student object.
    students = ijson.items(f, 'school.classes.item.students.item')
    for student in students:
        print(f"{student['name']}: 数学 {student['scores']['math']}分")
# Extract the scores found at one specific path.
def get_english_scores():
    """Collect (student name, English score) pairs from school_data.json.

    Walks the low-level event stream and remembers the most recent
    student name, pairing it with the next English score. This relies on
    each student's "name" appearing before their "scores" object.

    Returns:
        A list of (name, score) tuples in document order.
    """
    scores = []
    # Binary mode: ijson expects bytes; text-mode sources are deprecated.
    with open('school_data.json', 'rb') as f:
        current_student = None
        for prefix, event, value in ijson.parse(f):
            # Match student names only; a bare endswith('name') would
            # also pick up the school's own "name" field.
            if event == 'string' and prefix.endswith('students.item.name'):
                current_student = value
            # ijson reports whole numbers as 'integer' and only
            # non-integers as 'number'; testing for 'number' alone
            # misses every integer score.
            elif (event in ('integer', 'number')
                  and prefix.endswith('scores.english')):
                scores.append((current_student, value))
    return scores
# Gather the (name, score) pairs and print one line per student.
english_scores = get_english_scores()
print("\n英语成绩:")
for pupil, mark in english_scores:
    print(f"{pupil}: {mark}分")
4. 错误处理和性能优化
import ijson
import json
import time
def safe_json_parsing(file_path):
    """Stream a top-level JSON array, printing errors instead of raising.

    Reads *file_path* item by item, prints a progress line every 1000
    items and a final total. Missing files, encoding problems, JSON
    errors and any other exception are reported with a printed message.

    Args:
        file_path: Path to a JSON file whose top-level value is an array.

    Returns:
        None.
    """
    try:
        # Binary mode: ijson expects bytes and decodes UTF-8 itself;
        # text-mode sources are deprecated.
        with open(file_path, 'rb') as f:
            # Probe one event first so a document with no content is
            # reported before any item processing starts.
            first_event = next(ijson.parse(f), None)
            if first_event is None:
                print("文件为空")
                return
            # Rewind: the probe consumed part of the stream.
            f.seek(0)

            count = 0
            for count, item in enumerate(ijson.items(f, 'item'), start=1):
                # Per-item processing goes here.
                if count % 1000 == 0:
                    print(f"已处理 {count} 个项目...")
            print(f"总共处理了 {count} 个项目")
    except FileNotFoundError:
        print(f"文件不存在: {file_path}")
    # Specific handlers come before the generic one; previously the
    # encoding handler sat behind an inner `except Exception` and could
    # never run.
    except UnicodeDecodeError:
        print("文件编码错误")
    except ijson.JSONError as e:
        print(f"JSON 解析错误: {e}")
    except Exception as e:
        print(f"其他错误: {e}")
# Performance comparison.
def performance_comparison():
    """Time the standard json module against ijson on the same file.

    Writes a 10000-element array to test_large.json, loads it once with
    json.load and once by streaming with ijson, then prints both elapsed
    times and the number of items ijson yielded.
    """
    # Create the test data.
    test_data = [{"id": i, "data": "x" * 100} for i in range(10000)]
    with open('test_large.json', 'w') as f:
        json.dump(test_data, f)

    # perf_counter is the clock meant for measuring short durations;
    # time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    with open('test_large.json', 'r') as f:
        json.load(f)
    std_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    # Binary mode: ijson expects bytes; text-mode sources are deprecated
    # and add re-encoding overhead that would skew the measurement.
    with open('test_large.json', 'rb') as f:
        count = sum(1 for _ in ijson.items(f, 'item'))
    ijson_time = time.perf_counter() - start_time

    print(f"标准 json 库: {std_time:.4f} 秒")
    print(f"ijson 流式解析: {ijson_time:.4f} 秒")
    print(f"处理项目数: {count}")
# Run the examples.
if __name__ == "__main__":
    # Small top-level array used as input for safe_json_parsing.
    # The third entry previously read `"name": "name": "Item 3"`, which
    # is a syntax error.
    sample_data = [
        {"id": 1, "name": "Item 1", "value": 100},
        {"id": 2, "name": "Item 2", "value": 200},
        {"id": 3, "name": "Item 3", "value": 300},
    ]
    with open('sample.json', 'w') as f:
        json.dump(sample_data, f)

    # Exercise the error-tolerant parser.
    safe_json_parsing('sample.json')

    # Compare json.load against ijson streaming.
    performance_comparison()
实际应用场景
1. 处理 API 响应流
import ijson
import requests
def stream_large_api_response(url="https://api.example.com/large-data",
                              timeout=30):
    """Stream a large JSON API response and process it item by item.

    Args:
        url: Endpoint returning a JSON object with a top-level "items"
            array.
        timeout: Seconds to wait for the server before giving up; without
            one, requests may block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The context manager releases the connection even if parsing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # response.raw bypasses requests' content decoding; ask urllib3
        # to undo gzip/deflate so ijson sees plain JSON bytes.
        response.raw.decode_content = True
        # Each element is handled as soon as it is parsed, so the full
        # body is never held in memory.
        for item in ijson.items(response.raw, 'items.item'):
            process_item(item)
def process_item(item):
    """Print a one-line summary of a single record.

    Missing "id" and "name" keys fall back to 'N/A' and 'Unnamed'.
    """
    item_id = item.get('id', 'N/A')
    item_name = item.get('name', 'Unnamed')
    print(f"处理: {item_id} - {item_name}")
2. 日志文件分析
import ijson
def analyze_json_logs(log_file_path):
    """Count error and warning entries in a JSON-lines log file.

    Each non-blank line is expected to hold one standalone JSON object
    with optional "level" and "message" keys. Error entries are printed
    with their line number, malformed lines are reported and skipped, and
    a summary is printed at the end.

    Args:
        log_file_path: Path to the log file to analyze.
    """
    error_count = 0
    warning_count = 0
    # Binary mode so each line reaches ijson as bytes; str sources are
    # deprecated.
    with open(log_file_path, 'rb') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                # The empty prefix selects the line's top-level value.
                # 'item' only matches array elements, so with one object
                # per line it never yielded anything and nothing was
                # ever counted.
                entry = next(ijson.items(line, ''), None)
            except ijson.JSONError:
                print(f"第 {line_num} 行 JSON 格式错误")
                continue
            # Skip lines that are valid JSON but not objects.
            if not isinstance(entry, dict):
                continue
            # Tolerate a missing, null or non-string level.
            level = str(entry.get('level') or '').lower()
            if level == 'error':
                error_count += 1
                print(f"错误在第 {line_num} 行: {entry.get('message')}")
            elif level == 'warning':
                warning_count += 1
    print("\n统计结果:")
    print(f"错误数: {error_count}")
    print(f"警告数: {warning_count}")
# Usage example: analyze the log file app_logs.json.
analyze_json_logs(log_file_path='app_logs.json')
ijson 特别适合处理以下几种情况:
- 非常大的 JSON 文件(GB 级别)
- 网络流式 JSON 数据
- 只需要提取部分数据的场景
- 内存受限的环境
记住在处理完成后及时关闭文件,并在生产环境中添加适当的错误处理。