自从上次帮助业务部门使用Python开发了个工具,新领导简直对我青睐有加,于是在对全公司各个部门的调研会议中都宣传了我的"作品",现在很多部门提了需求,我成了小工具开发专员!!
今天为大家介绍的工具是:
根据必选词和可选词搜索PDF文件,对符合条件的页面进行截图,并将截图链接输出到Excel表格,避免在长达几十页、几百页的PDF文档中逐页人工查找,同时也便于溯源。
实现效果:
第一列为读取文件的名称;
第二列为图片链接;
第三列为该截图在PDF文件中所在的页码。
代码如下:
import fitz # PyMuPDF
from PIL import Image
import io
from paddleocr import PaddleOCR
from openpyxl import Workbook, load_workbook
from openpyxl.worksheet.hyperlink import Hyperlink
import os
import re
import uuid # 用于生成唯一标识符
def check_keywords_within_interval(text, target_keywords, other_keywords, max_distance=100):
    """Return True when a required keyword occurs close to an optional keyword.

    Two occurrences are "close" when their start offsets in ``text`` differ
    by at most ``max_distance`` characters.

    Args:
        text: The text to search.
        target_keywords: Required keywords.
        other_keywords: Optional keywords. When this list is empty the
            proximity rule cannot apply, so the result is True as soon as any
            required keyword occurs in ``text``.
        max_distance: Largest allowed difference between the two start
            offsets (defaults to 100).

    Returns:
        bool: Whether the rule is satisfied.
    """

    def _start_offsets(keywords):
        # Keywords are literal text, not patterns: escape them so that
        # characters such as "+", "(", or "." cannot raise re.error or match
        # unintended text.
        return [
            match.start()
            for keyword in keywords
            for match in re.finditer(re.escape(keyword), text)
        ]

    target_offsets = _start_offsets(target_keywords)
    if not target_offsets:
        return False
    if not other_keywords:
        # No optional keywords configured: a required keyword alone suffices.
        return True

    other_offsets = _start_offsets(other_keywords)
    return any(
        abs(target_offset - other_offset) <= max_distance
        for target_offset in target_offsets
        for other_offset in other_offsets
    )
def judge_rules(text, page, ws, page_num, file_name, target_keywords, other_keywords, image_prefix=None):
    """Screenshot ``page`` and record it in ``ws`` when its text matches the rule.

    A page matches when ``text`` contains at least one required keyword and
    ``check_keywords_within_interval`` accepts it. On a match, every keyword
    occurrence returned by ``page.search_for`` is highlighted, the page is
    rendered to a PNG in the current working directory, and the row
    ``[file_name, image name, page_num]`` is appended to ``ws`` with the image
    cell hyperlinked to the PNG's absolute path.

    Args:
        text: Text of the page (extracted or OCR'd by the caller).
        page: PyMuPDF page object to annotate and render.
        ws: openpyxl worksheet that receives the result row.
        page_num: Page number; written as given to the sheet and image name.
        file_name: Name of the PDF, written to the first column.
        target_keywords: Required keywords.
        other_keywords: Optional keywords.
        image_prefix: Prefix of the PNG file name. When omitted, a
            module-level ``image_prefix`` is used if one is defined (the
            behaviour older callers relied on), otherwise ``"screenshot"``.
    """
    if image_prefix is None:
        image_prefix = globals().get("image_prefix", "screenshot")

    if not any(keyword in text for keyword in target_keywords):
        return
    if not check_keywords_within_interval(text, target_keywords, other_keywords):
        return

    # dict.fromkeys drops exact duplicates while keeping the original order,
    # so the same keyword is not highlighted twice.
    for keyword in dict.fromkeys(target_keywords + other_keywords):
        if not keyword:
            # An empty keyword has no on-page text to highlight.
            continue
        for quad in page.search_for(keyword, quads=True):
            annot = page.add_highlight_annot(quad)
            annot.set_colors({"stroke": (0, 0.5, 0), "fill": (0, 0.5, 0.5, 1)})
            annot.update()

    # NOTE(review): PyMuPDF documents that `matrix` is ignored when `dpi` is
    # given - confirm against the installed version before removing it.
    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2), dpi=300)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # A random suffix keeps names unique when several PDFs share a page number.
    unique_suffix = str(uuid.uuid4())[:8]
    img_name = f"{image_prefix}_{page_num}_{unique_suffix}.png"
    img.save(img_name, "PNG", optimize=True)

    ws.append([file_name, img_name, page_num])
    img_absolute_path = os.path.abspath(img_name)
    img_cell = ws.cell(row=ws.max_row, column=2)
    img_cell.hyperlink = Hyperlink(ref=img_cell.coordinate, target=img_absolute_path)
    img_cell.style = "Hyperlink"


# Process every PDF below a folder and write the matches to a workbook.
def read_files_in_subfolders(base_folder, other_keywords, target_keywords, image_prefix,
                             output_path=r'D:\output.xlsx'):
    """Search all PDFs under ``base_folder`` and save the matches to a workbook.

    Each PDF found by walking ``base_folder`` is classified: if any page
    contains an embedded image the file is treated as a scan and its text is
    obtained with PaddleOCR; otherwise the text layer is read with PyMuPDF.
    Every page is then handed to ``judge_rules``.

    Args:
        base_folder: Root folder to walk.
        other_keywords: Optional keywords.
        target_keywords: Required keywords.
        image_prefix: Prefix for the screenshot file names.
        output_path: Where the workbook is saved (defaults to the path that
            was previously hard-coded).
    """
    wb = Workbook()
    ws = wb.active
    # Header row.
    ws.append(["文件名", "图片链接", "所在PDF页面页数"])

    # Building the OCR engine is costly, so create it once, and only if a
    # scanned document is actually encountered.
    ocr = None

    for root, _dirs, files in os.walk(base_folder):
        for file_name in files:
            # Compare case-insensitively so that "REPORT.PDF" is not skipped.
            if not file_name.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, file_name)

            # One handle per file, always closed by the context manager.
            with fitz.open(pdf_path) as document:
                # NOTE(review): any embedded image (e.g. a logo) sends the
                # whole file through OCR; checking for a missing text layer
                # may be a better test - confirm with real documents.
                is_scanned_document = any(page.get_images() for page in document)

                if is_scanned_document:
                    if ocr is None:
                        ocr = PaddleOCR(use_angle_cls=True, lang="ch")
                    result = ocr.ocr(pdf_path)
                    for page_index, ocr_lines in enumerate(result):
                        # A page result may be None when no text was
                        # recognised; treat it as an empty page.
                        text = ''.join(line[1][0] for line in (ocr_lines or []))
                        page = document.load_page(page_index)
                        # Report 1-based page numbers, as PDF viewers do.
                        judge_rules(text, page, ws, page_index + 1, file_name,
                                    target_keywords, other_keywords, image_prefix)
                else:
                    for page_index, page in enumerate(document):
                        text = page.get_text("text")
                        judge_rules(text, page, ws, page_index + 1, file_name,
                                    target_keywords, other_keywords, image_prefix)

    # Save the workbook.
    wb.save(output_path)
    # Every saved image adds exactly one row below the header, so the row
    # count is the number of images from this run (old PNGs are not counted).
    saved_images = ws.max_row - 1
    print(f"共保存了{saved_images}张图片,相关信息已记录到{os.path.basename(output_path)}中。")
# Example usage. To take the keywords from the user instead, read a
# comma-separated string with input() and build each list with
# [keyword.strip() for keyword in raw_input_string.split(",")].

# Prefix of the generated screenshot file names.
image_prefix = "screenshot"
# Folder containing the PDF files.
pdf_path = r'D:\文件'
# Required keywords.
target_keywords = ["表"]
# Optional keywords; adjust to your needs.
other_keywords = [
    "吞吐量",
    '货物吞吐量',
    '公司货物吞吐量',
    '2021',
]

read_files_in_subfolders(
    base_folder=pdf_path,
    other_keywords=other_keywords,
    target_keywords=target_keywords,
    image_prefix=image_prefix,
)
功能:
1.设置了截图中将关键词高亮的功能;
2.将截图以链接的形式输出到表格;
3.设置筛选条件:页面中某个必选词与任一可选词的出现位置(起始字符)相距不超过100个字。
缺点:
运行好慢啊,大家有啥优化建议嘛~欢迎分享。
666 我竟然早俩月做了个本地知识库的功能!!!