使用Python提取PDF文件中的图片

322 阅读1分钟

环境

软件 | 版本
python | 3.11.2
pymupdf | 1.22.5

代码


import fitz
import re
import os

# Path of the PDF file to read.
file_path = r'.\test.pdf'
# Folder that receives the extracted images.
dir_path = r'.\output'

def pdf2image1(path, pic_path, min_size=3 * 1024 * 1024):
    """Extract embedded images from a PDF and save them as PNG files.

    Every cross-reference (xref) object of the document, starting at
    xref 1, is inspected. Objects whose definition contains
    ``/Subtype /Image`` are loaded as pixmaps and written into
    ``pic_path`` as ``img_1.png``, ``img_2.png``, ... in xref order.
    One line is printed for each file saved.

    Args:
        path: Path of the PDF file to read.
        pic_path: Directory that receives the PNG files. It is not
            created here, so it must already exist.
        min_size: Pixmaps whose ``size`` is smaller than this value are
            skipped. The default (3 * 1024 * 1024) keeps the original
            filter; pass 0 to save every image.
    """
    # Compiled once instead of on every loop iteration.
    image_marker = re.compile(r"/Subtype(?= */Image)")

    count = 1
    # The context manager closes the document even when loading or saving
    # an image raises.
    with fitz.open(path) as pdf:
        for xref in range(1, pdf.xref_length()):
            # Use the textual object definition to decide whether this
            # xref is an image.
            if not image_marker.search(pdf.xref_object(xref)):
                continue
            pix = fitz.Pixmap(pdf, xref)

            # User-defined filter: ignore images below the size threshold.
            if pix.size < min_size:
                continue

            # PNG output supports only grayscale and RGB; pixmaps with more
            # colour components (e.g. CMYK) must be converted to RGB first.
            if pix.n - pix.alpha > 3:
                pix = fitz.Pixmap(fitz.csRGB, pix)

            new_name = f"img_{count}.png"
            output_filename = os.path.join(pic_path, new_name)
            pix.save(output_filename)
            print("save to [", output_filename,"]", pix.irect.width, "x", pix.irect.height, ", ", pix.size)
            count += 1
            # Drop the reference so the pixel buffer can be freed before
            # the next image is loaded.
            pix = None

# Print the working directory so the user can see which location the
# script is running from.
curPath = os.getcwd()
print("当前目录", curPath)
# exist_ok=True creates the output folder when missing and is a no-op when
# it already exists, avoiding the check-then-create race.
os.makedirs(dir_path, exist_ok=True)
pdf2image1(file_path, dir_path)