背景
每年都要报销一堆的发票,电子发票管理起来比较麻烦,想着自己写一个小玩意,好管理一些
准备工作
这次主要针对PDF版本的发票,用到的开源软件包括PyMuPDF和camelot。PyMuPDF在前面的文章里已经写过,安装也比较简单,可以参考之前的文章。首先,安装camelot:
pip install camelot-py[cv]
过程
之所以用到两个包,主要是因为需要读取的pdf文件格式比较多:camelot擅长读取表格数据,优点是处理比较简单,但对于发票头数据支持不了。因此,用PyMuPDF读取发票头,用camelot读取发票内容。小玩意比较简单,核心的东西主要在发票头的读取和发票内容的读取。
1. 发票头读取
利用PyMuPDF读取发票头信息,包括发票编号及开票日期等,PyMuPDF中对pdf的块读取能将位置信息也提供出来,利用位置的相邻关系,匹配相关的kv对
# Open the PDF in a context manager so the file handle is released as soon
# as the text blocks have been extracted.
with fitz.open(receipt) as doc:
    page = doc.load_page(0)
    # Block layout: x0, y0, x1, y1, "extracted text", block_no, block_type (1: image, 0: text)
    blocks = page.get_textpage().extractBLOCKS()
for block in blocks:
    for other_block in blocks:
        # A block is never paired with itself.
        if other_block[5] == block[5]:
            continue
        # Pair two blocks when the right edge (x1) of one is within 10 units
        # of the left edge (x0) of the other and their top edges (y0) differ
        # by less than 3 units: the left block becomes the key, the right
        # block the value.
        if abs(block[2] - other_block[0]) < 10 and abs(block[1] - other_block[1]) < 3:
            receipt_info[block[4].strip()] = other_block[4]
将首尾位置相近的项组合在一起,形成键值对
2. 发票内容提取
主要是提取发票总金额,利用camelot,将表格转化成dataframe,提取起来比较简单
# Let camelot parse the tables of the PDF; newlines inside cells are stripped.
tables = camelot.read_pdf(receipt, shift_text=[''], strip_text='\n')
# Convert the first detected table into a DataFrame.
table = tables[0].df
# Take the cell at column 10 / row 2 without its first character; when that
# yields an empty string, fall back to the last six characters of the cell
# at column 9 / row 2.
price = table[10][2][1:] or table[9][2][-6:]
3. 界面封装
简单的搞个小界面封装一下,操作起来比较方便,如下图
4. 整体集成
将两块的东西合并在一起,看看效果
结尾
结尾就是把代码贴一下
import os
import sys
import camelot
import fitz
from PyQt6.QtCore import pyqtSlot
from PyQt6.QtGui import QStandardItemModel, QStandardItem
from PyQt6.QtWidgets import QDialog, QApplication, QFileDialog, QHeaderView
from little_tool.ui_receipt_stat import Ui_receipt
class ReceiptStat(QDialog, Ui_receipt):
    """Dialog that scans a folder of PDF invoices and tabulates them.

    For every PDF found, the invoice number, the issue date and the amount
    are shown in a four-column table, and a summary label reports the file
    count and the summed amount.
    """

    def __init__(self):
        """Build the UI and attach an empty four-column model to the table view."""
        super().__init__()
        self.setupUi(self)
        self.model = QStandardItemModel()
        self.model.setHorizontalHeaderLabels(['文件名', '发票号码', '开票日期', '金额'])
        self.tableView_detail.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)
        self.tableView_detail.setModel(self.model)
        # Start directory offered by the folder picker.
        self.cwd = os.getcwd()

    @pyqtSlot()
    def on_pushButton_scan_clicked(self):
        """Let the user pick a folder and copy its path into the path line edit."""
        receipt_dir = QFileDialog.getExistingDirectory(self, "选择文件夹", self.cwd)
        # An empty string means the dialog was cancelled; keep the old path.
        if receipt_dir != "":
            self.lineEdit_receipt_path.setText(receipt_dir)

    @pyqtSlot()
    def on_pushButton_receipt_clicked(self):
        """Read all invoices below the entered folder and refresh table and total."""
        receipt_dir = self.lineEdit_receipt_path.text().strip()
        if not receipt_dir:
            return

        receipt_list = self.read_receipts(receipt_dir)

        # Drop the rows of a previous run; otherwise a second scan that finds
        # fewer files would leave stale rows at the bottom of the table.
        # setRowCount(0) keeps the horizontal header labels.
        self.model.setRowCount(0)

        total_price = 0
        for receipt in receipt_list:
            # The fallback values are the placeholders shown when a field
            # could not be located in the PDF.
            receipt_no = receipt.get('发票号码:', '1').strip()
            receipt_date = str(receipt.get('开票日期:', '2')).replace(" ", ".").strip()
            receipt_price = str(receipt.get('price', '0'))
            receipt_file_path = receipt.get('file_path', '4')

            # An unreadable amount counts as 0 instead of aborting the run;
            # the raw text is still displayed in the table.
            total_price += self._parse_amount(receipt_price)

            # Column order matches the header: file name, number, date, amount.
            self.model.appendRow([
                QStandardItem(os.path.basename(receipt_file_path)),
                QStandardItem(receipt_no),
                QStandardItem(receipt_date),
                QStandardItem(receipt_price),
            ])

        self.label_total.setText(
            f"共计[ {len(receipt_list)} ]个文件,合计金额[ {str(round(total_price, 2))} ]元")

    def read_receipts(self, receipt_dir) -> list:
        """Collect header fields and the amount of every PDF below ``receipt_dir``.

        Args:
            receipt_dir: Folder that is searched recursively for PDF files.
                The extension is matched case-insensitively.

        Returns:
            One dict per PDF containing the key/value pairs found in the
            invoice header plus ``"price"`` (amount text, possibly empty) and
            ``"file_path"``. An empty list if ``receipt_dir`` is not a
            directory.
        """
        if not os.path.isdir(receipt_dir):
            return []

        receipts = [
            os.path.join(root, filename)
            for root, _, filenames in os.walk(receipt_dir)
            for filename in filenames
            if filename.lower().endswith(".pdf")
        ]

        receipt_infos = []
        for receipt in receipts:
            receipt_info = self._read_header(receipt)
            receipt_info["price"] = self._read_price(receipt)
            receipt_info['file_path'] = receipt
            receipt_infos.append(receipt_info)
        return receipt_infos

    @staticmethod
    def _read_header(pdf_path) -> dict:
        """Pair horizontally adjacent text blocks of page 0 into key/value entries."""
        # The context manager closes the document once the blocks are read.
        with fitz.open(pdf_path) as doc:
            page = doc.load_page(0)
            # Block layout: x0, y0, x1, y1, "extracted text", block_no, block_type (1: image, 0: text)
            blocks = page.get_textpage().extractBLOCKS()

        header = {}
        for block in blocks:
            for other_block in blocks:
                if other_block[5] == block[5]:
                    continue
                # The right edge of `block` nearly touches the left edge of
                # `other_block` and both start at almost the same height:
                # treat them as label and value.
                if abs(block[2] - other_block[0]) < 10 and abs(block[1] - other_block[1]) < 3:
                    header[block[4].strip()] = other_block[4]
        return header

    @staticmethod
    def _read_price(pdf_path) -> str:
        """Return the amount text from the first table of the PDF, or '' if unavailable."""
        tables = camelot.read_pdf(pdf_path, shift_text=[''], strip_text='\n')
        # No table detected: there is nothing to index into.
        if len(tables) == 0:
            return ''
        table = tables[0].df  # first table as a DataFrame

        def cell(column, row) -> str:
            # A table with fewer columns/rows than expected yields '' rather
            # than raising.
            try:
                return str(table[column][row])
            except (KeyError, IndexError):
                return ''

        # Primary location: column 10 / row 2 without its first character.
        # Fallback: the last six characters of column 9 / row 2.
        return cell(10, 2)[1:] or cell(9, 2)[-6:]

    @staticmethod
    def _parse_amount(text) -> float:
        """Convert an amount string to float; return 0.0 when it is not numeric."""
        cleaned = str(text).strip().replace(',', '').lstrip('¥¥').strip()
        try:
            return float(cleaned)
        except ValueError:
            return 0.0
def main():
    """Create the Qt application, show the invoice dialog and run the event loop."""
    application = QApplication(sys.argv)
    application.setStyle("Fusion")
    window = ReceiptStat()
    window.show()
    # Hand the event loop's return code to the interpreter as exit status.
    exit_code = application.exec()
    sys.exit(exit_code)


# Start the GUI only when the file is executed as a script.
if __name__ == '__main__':
    main()