为了将从 .docx 文件中提取的结果保存为结构化文件,我们可以选择 JSON 格式,因为它便于读取和处理。以下示例代码用于提取 .docx 文件中的文本、表格和图片,并将其保存为结构化的 JSON 文件:
import os
import json
from docx import Document
def extract_docx_content(file_path):
    """Extract headings, paragraphs, tables and embedded images from a .docx file.

    The result is a flat list built in three passes: every paragraph first
    (headings and non-empty body text), then every table, then every image
    relationship. Items are therefore grouped by kind and do not reflect the
    relative position of tables and images inside the document.

    Side effect: image bytes are written into an ``extracted_images`` folder
    (created if missing) relative to the current working directory.

    Args:
        file_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        A list of dicts, each carrying a ``'type'`` key:
        ``{'type': 'Heading', 'level': int, 'text': str}``,
        ``{'type': 'Paragraph', 'text': str}``,
        ``{'type': 'Table', 'data': list[list[str]]}`` or
        ``{'type': 'Image', 'path': str}``.
    """
    doc = Document(file_path)
    hierarchy = []

    # Pass 1: headings and body paragraphs.
    for para in doc.paragraphs:
        style = para.style.name
        text = para.text.strip()

        level = None
        if 'Heading' in style:
            try:
                level = int(style.split()[-1])
            except ValueError:
                # Style names such as "TOC Heading" contain "Heading" but do
                # not end in a number; treat them as ordinary paragraphs
                # instead of crashing the whole extraction.
                level = None

        if level is not None:
            hierarchy.append({'type': 'Heading', 'level': level, 'text': text})
        elif text:
            hierarchy.append({'type': 'Paragraph', 'text': text})

    # Pass 2: tables, one list of stripped cell texts per row.
    for table in doc.tables:
        table_data = [
            [cell.text.strip() for cell in row.cells]
            for row in table.rows
        ]
        hierarchy.append({'type': 'Table', 'data': table_data})

    # Pass 3: embedded images, saved to disk.
    image_folder = 'extracted_images'
    os.makedirs(image_folder, exist_ok=True)
    for rel in doc.part.rels.values():
        # External relationships (e.g. a hyperlink whose URL contains
        # "image") have no target part to read bytes from, so skip them.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        img_ext = rel.target_ref.split('.')[-1]
        # len(hierarchy) grows with every appended item, so each image gets
        # a distinct file name within one call.
        img_filename = os.path.join(image_folder, f'image_{len(hierarchy)}.{img_ext}')
        with open(img_filename, 'wb') as img_file:
            img_file.write(rel.target_part.blob)
        hierarchy.append({'type': 'Image', 'path': img_filename})

    return hierarchy
def save_hierarchy_to_json(hierarchy, output_file):
    """Write ``hierarchy`` to ``output_file`` as UTF-8, 4-space-indented JSON.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        hierarchy: JSON-serializable structure to store.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, mode='w', encoding='utf-8') as out:
        out.write(json.dumps(hierarchy, ensure_ascii=False, indent=4))
def print_hierarchy(hierarchy):
    """Print a human-readable listing of an extracted hierarchy to stdout.

    Headings are indented by two spaces per level below the first;
    paragraphs, table rows and image paths each go on their own line.
    Items whose ``'type'`` is not one of the four known kinds are ignored.

    Args:
        hierarchy: Iterable of item dicts carrying a ``'type'`` key plus the
            keys read for that type (``'level'``/``'text'``, ``'text'``,
            ``'data'`` or ``'path'``).
    """
    for entry in hierarchy:
        kind = entry['type']

        if kind == 'Heading':
            depth = entry['level']
            indent = ' ' * (depth - 1) * 2
            print(f"{indent}Heading {depth}: {entry['text']}")
            continue

        if kind == 'Paragraph':
            print(f" Paragraph: {entry['text']}")
            continue

        if kind == 'Image':
            print(f"Image: {entry['path']}")
            continue

        if kind != 'Table':
            continue

        print("Table:")
        for cells in entry['data']:
            joined = ' | '.join(cells)
            print(f" {joined}")
# Example usage: extract the document, save the result as JSON, then print it.
# NOTE: 'path/to/your/document.docx' is a placeholder; point it at a real file.
file_path = 'path/to/your/document.docx'
# The JSON file is written relative to the current working directory.
output_file = 'docx_structure.json'
hierarchy = extract_docx_content(file_path)
save_hierarchy_to_json(hierarchy, output_file)
print_hierarchy(hierarchy)
代码解析
- 导入必要的库:
  import os
  import json
  from docx import Document
- 定义 extract_docx_content 函数:
  - 打开并读取 .docx 文件:
    doc = Document(file_path)
    hierarchy = []
  - 提取段落和标题:
for para in doc.paragraphs: style = para.style.name text = para.text.strip() if 'Heading' in style: level = int(style.split()[-1]) hierarchy.append({'type': 'Heading', 'level': level, 'text': text}) elif text: hierarchy.append({'type': 'Paragraph', 'text': text})
  - 提取表格内容:
for table in doc.tables: table_data = [] for row in table.rows: row_data = [cell.text.strip() for cell in row.cells] table_data.append(row_data) hierarchy.append({'type': 'Table', 'data': table_data})
  - 提取图片并保存到本地文件夹:
image_folder = 'extracted_images' if not os.path.exists(image_folder): os.makedirs(image_folder) for rel in doc.part.rels.values(): if "image" in rel.target_ref: img_ext = rel.target_ref.split('.')[-1] img_data = rel.target_part.blob img_filename = os.path.join(image_folder, f'image_{len(hierarchy)}.{img_ext}') with open(img_filename, 'wb') as img_file: img_file.write(img_data) hierarchy.append({'type': 'Image', 'path': img_filename})
- 定义 save_hierarchy_to_json 函数:
  - 将层次结构保存为 JSON 文件:
def save_hierarchy_to_json(hierarchy, output_file):
    """Write ``hierarchy`` to ``output_file`` as UTF-8, 4-space-indented JSON.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        hierarchy: JSON-serializable structure to store.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, mode='w', encoding='utf-8') as out:
        out.write(json.dumps(hierarchy, ensure_ascii=False, indent=4))
- 将层次结构保存为 JSON 文件:
- 定义 print_hierarchy 函数:
  - 打印提取到的层次结构:
def print_hierarchy(hierarchy):
    """Print a human-readable listing of an extracted hierarchy to stdout.

    Headings are indented by two spaces per level below the first;
    paragraphs, table rows and image paths each go on their own line.
    Items whose ``'type'`` is not one of the four known kinds are ignored.

    Args:
        hierarchy: Iterable of item dicts carrying a ``'type'`` key plus the
            keys read for that type (``'level'``/``'text'``, ``'text'``,
            ``'data'`` or ``'path'``).
    """
    for entry in hierarchy:
        kind = entry['type']

        if kind == 'Heading':
            depth = entry['level']
            indent = ' ' * (depth - 1) * 2
            print(f"{indent}Heading {depth}: {entry['text']}")
            continue

        if kind == 'Paragraph':
            print(f" Paragraph: {entry['text']}")
            continue

        if kind == 'Image':
            print(f"Image: {entry['path']}")
            continue

        if kind != 'Table':
            continue

        print("Table:")
        for cells in entry['data']:
            joined = ' | '.join(cells)
            print(f" {joined}")
- 打印提取到的层次结构:
示例输出
假设 .docx 文件包含以下内容:
- 一个一级标题 "Introduction"
- 一个段落 "This is an introduction."
- 一个表格
- 一个图片
输出将会是:
Heading 1: Introduction
Paragraph: This is an introduction.
Table:
Column1 | Column2
Data1 | Data2
Image: extracted_images/image_3.png
生成的 docx_structure.json 文件将包含所有提取的内容:
[
{
"type": "Heading",
"level": 1,
"text": "Introduction"
},
{
"type": "Paragraph",
"text": "This is an introduction."
},
{
"type": "Table",
"data": [
["Column1", "Column2"],
["Data1", "Data2"]
]
},
{
"type": "Image",
"path": "extracted_images/image_3.png"
}
]
这个代码示例展示了如何从 .docx 文件中提取文本、表格和图片,并将其保存为结构化的 JSON 文件。需要注意的是,代码按"段落 → 表格 → 图片"的顺序分三次遍历提取,因此结果列表按类型分组,并不保留表格和图片在文档中的原始相对位置。