将 docx 文件中的文本、表格、图片提取结果保存为结构化文件

2 阅读2分钟

为了将 .docx 文件中的提取结果保存为结构化文件,可以选择 JSON 格式,它便于后续读取和处理。以下示例代码(依赖 python-docx 库)用于提取 .docx 文件中的文本、表格和图片,并将结果保存为结构化的 JSON 文件:

import os
import json
from docx import Document

def _heading_level(style_name):
    """Return the numeric level of a 'Heading N' style name, or None.

    None is returned when the name is empty, does not contain 'Heading',
    or does not end in an integer (for example 'TOC Heading').
    """
    if not style_name or 'Heading' not in style_name:
        return None
    try:
        return int(style_name.split()[-1])
    except ValueError:
        return None


def extract_docx_content(file_path):
    """Extract headings, paragraphs, tables and images from a .docx file.

    Content is collected in three passes: paragraphs and headings first,
    then tables, then images. The result is therefore grouped by kind and
    is not interleaved in document order. Embedded images are written to
    the ``extracted_images`` folder, which is created relative to the
    current working directory if it does not exist.

    Args:
        file_path: Path of the .docx file, passed to ``Document``.

    Returns:
        A list of dicts, each carrying a ``'type'`` key:
        ``{'type': 'Heading', 'level': int, 'text': str}``,
        ``{'type': 'Paragraph', 'text': str}``,
        ``{'type': 'Table', 'data': list of rows of cell strings}`` or
        ``{'type': 'Image', 'path': str}``.
    """
    doc = Document(file_path)
    hierarchy = []

    # Headings and body text. A paragraph whose style name contains
    # 'Heading' but carries no numeric level is kept as a plain paragraph
    # instead of aborting the whole extraction with a ValueError.
    for para in doc.paragraphs:
        text = para.text.strip()
        style = para.style
        level = _heading_level(style.name if style is not None else None)
        if level is not None:
            hierarchy.append({'type': 'Heading', 'level': level, 'text': text})
        elif text:
            hierarchy.append({'type': 'Paragraph', 'text': text})

    # Tables: one list of stripped cell strings per row.
    for table in doc.tables:
        table_data = [
            [cell.text.strip() for cell in row.cells]
            for row in table.rows
        ]
        hierarchy.append({'type': 'Table', 'data': table_data})

    # Images are saved next to the working directory.
    image_folder = 'extracted_images'
    os.makedirs(image_folder, exist_ok=True)

    for rel in doc.part.rels.values():
        # External relationships (linked pictures, hyperlinks whose URL
        # happens to contain "image") have no embedded part, so reading
        # target_part on them would raise.
        if rel.is_external or "image" not in rel.target_ref:
            continue
        # splitext keeps the leading dot and yields '' when there is no
        # extension, instead of returning the whole path as the "extension".
        img_ext = os.path.splitext(rel.target_ref)[1]
        # The running item count keeps every file name unique, because the
        # list grows by one for each image saved.
        img_filename = os.path.join(image_folder, f'image_{len(hierarchy)}{img_ext}')
        with open(img_filename, 'wb') as img_file:
            img_file.write(rel.target_part.blob)
        hierarchy.append({'type': 'Image', 'path': img_filename})

    return hierarchy

def save_hierarchy_to_json(hierarchy, output_file):
    """Write the extracted items to ``output_file`` as UTF-8 encoded JSON.

    The output is indented by four spaces, and non-ASCII text is written
    as-is instead of being escaped.
    """
    json_options = {'ensure_ascii': False, 'indent': 4}
    with open(output_file, mode='w', encoding='utf-8') as stream:
        json.dump(hierarchy, fp=stream, **json_options)

def print_hierarchy(hierarchy):
    """Print every extracted item to stdout in a readable outline.

    Headings are indented two spaces for each level below 1, paragraphs
    are indented two spaces, a table prints a ``Table:`` line followed by
    one ``|``-joined line per row, and an image prints its saved path.
    Items of any other type are ignored.
    """
    for entry in hierarchy:
        kind = entry['type']

        if kind == 'Heading':
            depth = entry['level']
            indent = ' ' * (2 * (depth - 1))
            print(f"{indent}Heading {depth}: {entry['text']}")
            continue

        if kind == 'Paragraph':
            print(f"  Paragraph: {entry['text']}")
            continue

        if kind == 'Table':
            print("Table:")
            for cells in entry['data']:
                joined = ' | '.join(cells)
                print(f"  {joined}")
            continue

        if kind == 'Image':
            print(f"Image: {entry['path']}")

# Example usage. The entry-point guard keeps the extraction from running
# (and failing on the placeholder path) when this module is imported
# rather than executed as a script.
if __name__ == '__main__':
    file_path = 'path/to/your/document.docx'
    output_file = 'docx_structure.json'

    hierarchy = extract_docx_content(file_path)
    save_hierarchy_to_json(hierarchy, output_file)
    print_hierarchy(hierarchy)

代码解析

  1. 导入必要的库:

    import os
    import json
    from docx import Document
    
  2. 定义 extract_docx_content 函数:

    • 打开并读取 .docx 文件:

      doc = Document(file_path)
      hierarchy = []
      
    • 提取段落和标题:

      for para in doc.paragraphs:
          style = para.style.name
          text = para.text.strip()
          if 'Heading' in style:
              level = int(style.split()[-1])
              hierarchy.append({'type': 'Heading', 'level': level, 'text': text})
          elif text:
              hierarchy.append({'type': 'Paragraph', 'text': text})
      
    • 提取表格内容:

      for table in doc.tables:
          table_data = []
          for row in table.rows:
              row_data = [cell.text.strip() for cell in row.cells]
              table_data.append(row_data)
          hierarchy.append({'type': 'Table', 'data': table_data})
      
    • 提取图片并保存到本地文件夹:

      image_folder = 'extracted_images'
      if not os.path.exists(image_folder):
          os.makedirs(image_folder)
      
      for rel in doc.part.rels.values():
          if "image" in rel.target_ref:
              img_ext = rel.target_ref.split('.')[-1]
              img_data = rel.target_part.blob
              img_filename = os.path.join(image_folder, f'image_{len(hierarchy)}.{img_ext}')
              with open(img_filename, 'wb') as img_file:
                  img_file.write(img_data)
              hierarchy.append({'type': 'Image', 'path': img_filename})
      
  3. 定义 save_hierarchy_to_json 函数:

    • 将层次结构保存为 JSON 文件:
      def save_hierarchy_to_json(hierarchy, output_file):
          """Write the extracted items to ``output_file`` as UTF-8 encoded JSON.

          The output is indented by four spaces, and non-ASCII text is
          written as-is instead of being escaped.
          """
          json_options = {'ensure_ascii': False, 'indent': 4}
          with open(output_file, mode='w', encoding='utf-8') as stream:
              json.dump(hierarchy, fp=stream, **json_options)
      
  4. 定义 print_hierarchy 函数:

    • 打印提取到的层次结构:
      def print_hierarchy(hierarchy):
          """Print every extracted item to stdout in a readable outline.

          Headings are indented two spaces for each level below 1,
          paragraphs are indented two spaces, a table prints a ``Table:``
          line followed by one ``|``-joined line per row, and an image
          prints its saved path. Items of any other type are ignored.
          """
          for entry in hierarchy:
              kind = entry['type']

              if kind == 'Heading':
                  depth = entry['level']
                  indent = ' ' * (2 * (depth - 1))
                  print(f"{indent}Heading {depth}: {entry['text']}")
                  continue

              if kind == 'Paragraph':
                  print(f"  Paragraph: {entry['text']}")
                  continue

              if kind == 'Table':
                  print("Table:")
                  for cells in entry['data']:
                      joined = ' | '.join(cells)
                      print(f"  {joined}")
                  continue

              if kind == 'Image':
                  print(f"Image: {entry['path']}")
      

示例输出

假设 .docx 文件包含以下内容:

  • 一个一级标题 "Introduction"
  • 一个段落 "This is an introduction."
  • 一个表格
  • 一个图片

输出将会是:

Heading 1: Introduction
  Paragraph: This is an introduction.
Table:
  Column1 | Column2
  Data1 | Data2
Image: extracted_images/image_3.png

生成的 docx_structure.json 文件将包含所有提取的内容:

[
    {
        "type": "Heading",
        "level": 1,
        "text": "Introduction"
    },
    {
        "type": "Paragraph",
        "text": "This is an introduction."
    },
    {
        "type": "Table",
        "data": [
            ["Column1", "Column2"],
            ["Data1", "Data2"]
        ]
    },
    {
        "type": "Image",
        "path": "extracted_images/image_3.png"
    }
]

这个代码示例展示了如何从 .docx 文件中提取文本、表格和图片,并将其保存为结构化的 JSON 文件。