将 docx 文件中的文本、表格、图片提取结果保存为结构化文件

为了将 .docx 文件中的提取结果保存为结构化文件，我们可以选择将其保存为 JSON 文件。JSON 格式便于读取和处理。以下是一个示例代码，用于提取 .docx 文件中的文本、表格和图片，并将其保存为结构化 JSON 文件：

import os
import json
from docx import Document

def extract_docx_content(file_path, image_folder='extracted_images'):
    """Extract headings, paragraphs, tables and images from a .docx file.

    Paragraphs are collected first, then tables, then images, so the
    returned list is grouped by kind rather than following document order.

    Args:
        file_path: Path of the .docx file to read.
        image_folder: Directory that receives the extracted image files.
            It is created when missing.

    Returns:
        A list of dicts, each carrying a 'type' key:
        {'type': 'Heading', 'level': int, 'text': str},
        {'type': 'Paragraph', 'text': str},
        {'type': 'Table', 'data': list of rows, each a list of cell strings},
        {'type': 'Image', 'path': str}.
    """
    doc = Document(file_path)
    hierarchy = []

    # Headings and body text.
    for para in doc.paragraphs:
        style = para.style.name
        text = para.text.strip()

        level = None
        if 'Heading' in style:
            # Only names ending in a number ("Heading 2") carry a level.
            # Names such as "TOC Heading" used to make int() raise
            # ValueError; they are now treated as ordinary paragraphs.
            suffix = style.split()[-1]
            if suffix.isdecimal():
                level = int(suffix)

        if level is not None:
            hierarchy.append({'type': 'Heading', 'level': level, 'text': text})
        elif text:
            hierarchy.append({'type': 'Paragraph', 'text': text})

    # Tables: one list of stripped cell texts per row.
    for table in doc.tables:
        table_data = [
            [cell.text.strip() for cell in row.cells]
            for row in table.rows
        ]
        hierarchy.append({'type': 'Table', 'data': table_data})

    # Images: dump every embedded image part into image_folder.
    os.makedirs(image_folder, exist_ok=True)

    for rel in doc.part.rels.values():
        # External relationships (linked pictures, hyperlinks whose URL
        # happens to contain "image") have no embedded part to read.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        img_ext = rel.target_ref.split('.')[-1]
        img_data = rel.target_part.blob
        # len(hierarchy) grows with every appended image, which keeps the
        # generated file names unique within one call.
        img_filename = os.path.join(image_folder, f'image_{len(hierarchy)}.{img_ext}')
        with open(img_filename, 'wb') as img_file:
            img_file.write(img_data)
        hierarchy.append({'type': 'Image', 'path': img_filename})

    return hierarchy

def save_hierarchy_to_json(hierarchy, output_file):
    """Write *hierarchy* to *output_file* as UTF-8 JSON.

    The output is indented with four spaces and non-ASCII characters are
    written verbatim instead of being escaped.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(output_file, mode='w', encoding='utf-8') as json_file:
        json.dump(hierarchy, fp=json_file, **dump_options)

def print_hierarchy(hierarchy):
    """Print a human-readable outline of the extracted items to stdout.

    Headings are indented two spaces per level below the first,
    paragraphs get a fixed two-space indent, table rows are joined with
    ' | ', and images are listed by path. Items of any other type are
    silently skipped.
    """
    for entry in hierarchy:
        kind = entry['type']

        if kind == 'Heading':
            depth = entry['level']
            indent = ' ' * (depth - 1) * 2
            print(f"{indent}Heading {depth}: {entry['text']}")
            continue

        if kind == 'Paragraph':
            print(f"  Paragraph: {entry['text']}")
            continue

        if kind == 'Table':
            print("Table:")
            for cells in entry['data']:
                print("  " + ' | '.join(cells))
            continue

        if kind == 'Image':
            print(f"Image: {entry['path']}")

# Example usage
# Placeholder input path; replace it with a real .docx file.
file_path = 'path/to/your/document.docx'
# Destination of the JSON dump.
output_file = 'docx_structure.json'

# Extract the content, save it as JSON, then print it to stdout.
hierarchy = extract_docx_content(file_path)
save_hierarchy_to_json(hierarchy, output_file)
print_hierarchy(hierarchy)

代码解析

导入必要的库：

import os
import json
from docx import Document

定义 extract_docx_content 函数：

打开并读取 .docx 文件：

# Open the document and start with an empty result list.
doc = Document(file_path)
hierarchy = []

提取段落和标题：

for para in doc.paragraphs:
    style = para.style.name
    text = para.text.strip()

    level = None
    if 'Heading' in style:
        # Only names ending in a number ("Heading 2") carry a level.
        # Names such as "TOC Heading" would make int() raise ValueError,
        # so they are treated as ordinary paragraphs instead.
        suffix = style.split()[-1]
        if suffix.isdecimal():
            level = int(suffix)

    if level is not None:
        hierarchy.append({'type': 'Heading', 'level': level, 'text': text})
    elif text:
        hierarchy.append({'type': 'Paragraph', 'text': text})

提取表格内容：

# Each table becomes a list of rows; each row is a list of stripped cell texts.
for tbl in doc.tables:
    rows = [
        [cell.text.strip() for cell in row.cells]
        for row in tbl.rows
    ]
    hierarchy.append({'type': 'Table', 'data': rows})

提取图片并保存到本地文件夹：

# Make sure the output folder exists; no error if it is already there.
image_folder = 'extracted_images'
os.makedirs(image_folder, exist_ok=True)

for rel in doc.part.rels.values():
    # External relationships (linked pictures, hyperlinks whose URL
    # happens to contain "image") have no embedded part to read.
    if rel.is_external or "image" not in rel.target_ref:
        continue
    img_ext = rel.target_ref.split('.')[-1]
    img_data = rel.target_part.blob
    # len(hierarchy) grows with every appended image, which keeps the
    # generated file names unique.
    img_filename = os.path.join(image_folder, f'image_{len(hierarchy)}.{img_ext}')
    with open(img_filename, 'wb') as img_file:
        img_file.write(img_data)
    hierarchy.append({'type': 'Image', 'path': img_filename})

定义 save_hierarchy_to_json 函数：

将层次结构保存为 JSON 文件：

def save_hierarchy_to_json(hierarchy, output_file):
    """Write *hierarchy* to *output_file* as UTF-8 JSON.

    The output is indented with four spaces and non-ASCII characters are
    written verbatim instead of being escaped.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open(output_file, mode='w', encoding='utf-8') as json_file:
        json.dump(hierarchy, fp=json_file, **dump_options)

定义 print_hierarchy 函数：

打印提取到的层次结构：

def print_hierarchy(hierarchy):
    """Print a human-readable outline of the extracted items to stdout.

    Headings are indented two spaces per level below the first,
    paragraphs get a fixed two-space indent, table rows are joined with
    ' | ', and images are listed by path. Items of any other type are
    silently skipped.
    """
    for entry in hierarchy:
        kind = entry['type']

        if kind == 'Heading':
            depth = entry['level']
            indent = ' ' * (depth - 1) * 2
            print(f"{indent}Heading {depth}: {entry['text']}")
            continue

        if kind == 'Paragraph':
            print(f"  Paragraph: {entry['text']}")
            continue

        if kind == 'Table':
            print("Table:")
            for cells in entry['data']:
                print("  " + ' | '.join(cells))
            continue

        if kind == 'Image':
            print(f"Image: {entry['path']}")

示例输出

假设 .docx 文件包含以下内容：

一个一级标题 "Introduction"
一个段落 "This is an introduction."
一个表格
一个图片

输出将会是：

Heading 1: Introduction
  Paragraph: This is an introduction.
Table:
  Column1 | Column2
  Data1 | Data2
Image: extracted_images/image_3.png

生成的 docx_structure.json 文件将包含所有提取的内容：

[
    {
        "type": "Heading",
        "level": 1,
        "text": "Introduction"
    },
    {
        "type": "Paragraph",
        "text": "This is an introduction."
    },
    {
        "type": "Table",
        "data": [
            ["Column1", "Column2"],
            ["Data1", "Data2"]
        ]
    },
    {
        "type": "Image",
        "path": "extracted_images/image_3.png"
    }
]

这个代码示例展示了如何从 .docx 文件中提取文本、表格和图片，并将其保存为结构化的 JSON 文件。需要注意的是，代码会先遍历所有段落，再遍历所有表格，最后处理图片，因此输出中各项的顺序是按类型分组的，不一定与它们在原文档中出现的先后顺序一致。

将 docx 文件中的文本、表格、图片提取结果保存为结构化文件

代码解析

示例输出

将 docx 文件中的文本、表格、图片提取结果保存为结构化文件