我在Python界杀疯了??!!!

1,697 阅读3分钟

自从上次帮助业务部门使用Python开发了个工具,新领导简直对我青睐有加,于是在对全公司各个部门的调研会议中都宣传了我的"作品",现在很多部门提了需求,我成了小工具开发专员!!

今天为大家介绍的工具是:

根据必选词和可选词搜索PDF文件,将符合条件的页面截图输出到表格,避免批量在长达几十页、几百页的PDF文档中逐页查询,也可以起到溯源的作用。

实现效果:

image.png
第一列为读取文件的名称;
第二列为图片链接;
第三列为该截图对应PDF文件中的页码(按代码实现,页码从0开始计数)。

代码如下:

import fitz  # PyMuPDF
from PIL import Image
import io
from paddleocr import PaddleOCR
from openpyxl import Workbook, load_workbook
from openpyxl.worksheet.hyperlink import Hyperlink
import os
import re
import uuid  # 用于生成唯一标识符


def check_keywords_within_interval(text, target_keywords, other_keywords, max_distance=100):
    """Return True when a required keyword occurs close to an optional keyword.

    Keywords are matched as literal substrings. Distance is measured between
    the start offsets of the two occurrences, in characters.

    Args:
        text: The text to search.
        target_keywords: Required keywords; at least one must occur in ``text``.
        other_keywords: Optional keywords. When this list is empty the
            proximity requirement is dropped and the presence of any required
            keyword is sufficient.
        max_distance: Largest allowed gap (inclusive) between the start of a
            required keyword and the start of an optional keyword.

    Returns:
        bool: True if the rule is satisfied, otherwise False.
    """

    def _start_offsets(keywords):
        # re.escape keeps characters such as "+", "." or "(" from being
        # interpreted as regex syntax; keywords are plain words, not patterns.
        return [
            match.start()
            for keyword in keywords
            for match in re.finditer(re.escape(keyword), text)
        ]

    target_offsets = _start_offsets(target_keywords)
    if not target_offsets:
        return False
    if not other_keywords:
        # No optional keywords configured: a required keyword alone qualifies.
        return True

    other_offsets = _start_offsets(other_keywords)
    return any(
        abs(target_offset - other_offset) <= max_distance
        for target_offset in target_offsets
        for other_offset in other_offsets
    )
   

def judge_rules(text, page, ws, page_num, file_name, target_keywords, other_keywords):
    """Screenshot ``page`` and record it in ``ws`` when ``text`` meets the keyword rule.

    A page qualifies when at least one required keyword occurs in ``text`` and
    ``check_keywords_within_interval`` accepts the text. For a qualifying page,
    every occurrence of every keyword that PyMuPDF can find on the page is
    highlighted, the page is rendered to a PNG in the current working
    directory, and a row ``[file_name, image name, page_num]`` is appended to
    ``ws`` with the image cell hyperlinked to the PNG's absolute path.

    The image file name is built from the module-level ``image_prefix``
    variable, which must be defined before this function is called.

    Args:
        text: Text of the page (extracted or OCR'd by the caller).
        page: PyMuPDF page object to annotate and render.
        ws: openpyxl worksheet receiving the result row.
        page_num: Page index as supplied by the caller; written to the sheet
            and embedded in the image file name unchanged.
        file_name: Name of the source PDF, written to the first column.
        target_keywords: Required keywords.
        other_keywords: Optional keywords.
    """
    if not any(keyword in text for keyword in target_keywords):
        return
    if not check_keywords_within_interval(text, target_keywords, other_keywords):
        return

    # Highlight every hit so the reader can spot the keywords in the screenshot.
    for keyword in target_keywords + other_keywords:
        for quad in page.search_for(keyword, quads=True):
            annot = page.add_highlight_annot(quad)
            annot.set_colors({"stroke": (0, 0.5, 0), "fill": (0, 0.5, 0.5, 1)})
            annot.update()

    # When dpi is given, get_pixmap ignores any matrix argument, so only dpi
    # is passed here.
    pix = page.get_pixmap(dpi=300)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    # A short random suffix keeps names unique across files with equal page indexes.
    unique_suffix = str(uuid.uuid4())[:8]
    img_name = f"{image_prefix}_{page_num}_{unique_suffix}.png"
    # PNG is lossless; a JPEG-style "quality" option has no effect and is omitted.
    img.save(img_name, "PNG", optimize=True)

    # NOTE(review): page_num is written exactly as received; the caller in this
    # file passes 0-based indexes — confirm whether the sheet should be 1-based.
    ws.append([file_name, img_name, page_num])
    img_absolute_path = os.path.abspath(img_name)
    img_cell = ws.cell(row=ws.max_row, column=2)
    img_cell.hyperlink = Hyperlink(ref=img_cell.coordinate, target=img_absolute_path)
    img_cell.style = "Hyperlink"


# Open every PDF under a folder tree and process it
def read_files_in_subfolders(base_folder, other_keywords, target_keywords, image_prefix):
    """Scan all PDFs under ``base_folder`` and log matching pages to an Excel file.

    Each PDF is classified as "scanned" when any of its pages contains an
    embedded image; scanned PDFs are read through PaddleOCR, all others through
    PyMuPDF text extraction. The text of every page is handed to
    ``judge_rules`` (defined elsewhere in this file), which saves the
    screenshot and appends the worksheet row. The workbook is finally saved to
    ``D:\\output.xlsx`` and a summary line is printed.

    Args:
        base_folder: Root directory that is walked recursively.
        other_keywords: Optional keywords forwarded to ``judge_rules``.
        target_keywords: Required keywords forwarded to ``judge_rules``.
        image_prefix: Accepted for interface compatibility; it is not read
            inside this function.
    """
    wb = Workbook()
    ws = wb.active
    # Header row
    ws.append(["文件名", "图片链接", "所在PDF页面页数"])

    # Created on first use and shared by all scanned files, so the engine is
    # constructed at most once per run.
    ocr = None

    for root, _dirs, files in os.walk(base_folder):
        for file_name in files:
            # Case-insensitive so that ".PDF" files are not skipped.
            if not file_name.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, file_name)

            # One handle per file, closed automatically when the block exits.
            with fitz.open(pdf_path) as document:
                # Any embedded image marks the document as a scan.
                is_scanned_document = any(page.get_images() for page in document)

                if is_scanned_document:
                    if ocr is None:
                        ocr = PaddleOCR(use_angle_cls=True, lang="ch")
                    result = ocr.ocr(pdf_path)
                    for page_num, pages in enumerate(result or []):
                        # A page without recognised text may come back as None.
                        text = ''.join(line[1][0] for line in pages or [])
                        page = document.load_page(page_num)
                        judge_rules(text, page, ws, page_num, file_name, target_keywords, other_keywords)
                else:
                    for page_num in range(len(document)):
                        page = document.load_page(page_num)
                        text = page.get_text("text")
                        judge_rules(text, page, ws, page_num, file_name, target_keywords, other_keywords)

    # Save the workbook; the raw string avoids the invalid "\o" escape.
    wb.save(r'D:\output.xlsx')
    # Every data row corresponds to one image saved during this run, so stale
    # PNG files from earlier runs are not counted.
    print(f"共保存了{ws.max_row - 1}张图片,相关信息已记录到output.xlsx中。")


# Usage example. The commented-out code below can be enabled to read the
# keywords interactively instead of hard-coding them.
# Read the required keywords from the user: comma-separated, each entry
# stripped of surrounding whitespace and collected into a list.
# target_keywords_input = input("请输入必选词,多个词之间用逗号分隔:")
# target_keywords = [keyword.strip() for keyword in target_keywords_input.split(",")]

# # Read the optional keywords (other_keywords) from the user: comma-separated,
# # each entry stripped of surrounding whitespace and collected into a list.
# other_keywords_input = input("请输入可选词(other_keywords),多个词之间用逗号分隔:")
# other_keywords = [keyword.strip() for keyword in other_keywords_input.split(",")]

# Required keywords
target_keywords = ["表"]
# Optional keywords
other_keywords = ["吞吐量", '货物吞吐量', '公司货物吞吐量','2021']  # adjust as needed, or set to an empty list []
# Folder that is searched for PDF files
pdf_path = r'D:\文件'
# Prefix for the generated screenshot file names (also read as a module-level
# name by judge_rules)
image_prefix = "screenshot"
read_files_in_subfolders(pdf_path, other_keywords, target_keywords, image_prefix)

功能:

1.设置了截图中将关键词高亮的功能;
2.将截图以链接的形式输出到表格;
3.设置了筛选条件:页面文本中必选词与任一可选词的起始位置相距不超过100个字符时,该页面才会被截图输出。

缺点:

运行好慢啊,大家有啥优化建议嘛~欢迎分享。

666 我竟然早俩月做了个本地知识库的功能!!!