electron中完成pdf文件预览及文字提取

283 阅读4分钟

需要完成electron中PDF文本内容的提取,PDF预览,文本复制等功能。记录下技术调研的结果及测试代码。

其中

  • pdf-lib可以完成pdf水印的添加,文本的添加。但是并没有找到获取当前pdf中所有文本内容的方法.
  • pdf-parse可以完成pdf文本解析,但是要写在主进程中。
  • pdfjs-dist建议使用2.16.105版本,可以完成PDF的预览、文本的选取等操作。

pdf-lib

  • 可以完成pdf水印的添加,文本的添加。但是并没有找到获取当前pdf中所有文本内容的方法。
// 渲染进程
  <a-upload-dragger v-model:fileList="fileList" name="file" :multiple="false" :accept="'.pdf'"
        :before-upload="httpRequestWaterMark" @change="handleChange" @drop="handleDrop">
        <p class="ant-upload-drag-icon">
          <inbox-outlined></inbox-outlined>
        </p>
        <p class="ant-upload-text">Click or drag file to this area to upload</p>
        <p class="ant-upload-hint">
          pdf添加水印
        </p>
      </a-upload-dragger>
      
   import { degrees, PDFDocument, rgb, StandardFonts } from 'pdf-lib';
   
   /**
    * `before-upload` hook of the watermark uploader.
    *
    * Reads the picked PDF as a data URL, stores its base64 payload (without the
    * `data:<mime>;base64,` prefix) in `fileLocal`, then runs `getAddFile()`.
    *
    * @param {File} file - The file handed over by the upload component.
    * @returns {Promise<never>} Rejects with `false` when reading the file or
    *   `getAddFile()` fails. On success the promise is left pending, as before.
    *   NOTE(review): presumably the pending promise is what keeps the upload
    *   component from actually sending the file — confirm before settling it.
    */
   const httpRequestWaterMark = (file) => {
     return new Promise((resolve, reject) => {
       wordData.htmlStr = "";
       const reader = new FileReader();
       reader.onload = async (event) => {
         try {
           const dataUrl = String(event.target.result);
           // Keep only what follows the first comma. Base64 never contains a
           // comma, so this strips the prefix whatever MIME type was reported.
           fileLocal = dataUrl.slice(dataUrl.indexOf(",") + 1);
           // Awaited so that a watermarking failure reaches the catch below
           // instead of becoming an unhandled rejection.
           await getAddFile();
         } catch (error) {
           console.error(error);
           reject(false);
         }
       };
       reader.onerror = () => {
         reject(false);
       };
       reader.readAsDataURL(file);
     });
   };

/**
 * Draws a fixed text watermark near the top-right corner of every page of the
 * PDF stored in `fileLocal`, then hands the saved bytes to `saveByteArray`.
 *
 * The text origin is anchored relative to each page's own size, so it stays on
 * the page for formats shorter or narrower than A4. On an A4 page it lands at
 * (500, 800), the position that used to be hard-coded.
 *
 * @returns {Promise<void>} Settles once the download has been triggered;
 *   rejects if loading or saving the document fails.
 */
const getAddFile = async () => {
  // A4 page size in PDF points, used only to derive the corner offsets.
  const A4_WIDTH = 595.28;
  const A4_HEIGHT = 841.89;
  // Distance of the text origin from the right and top page edges.
  const offsetFromRight = A4_WIDTH - 500;
  const offsetFromTop = A4_HEIGHT - 800;

  const pdfDoc = await PDFDocument.load(fileLocal);

  for (const page of pdfDoc.getPages()) {
    const { width, height } = page.getSize();
    page.drawText('124234YT-63', {
      // Clamp at 0 so very small pages still get the text inside their box.
      x: Math.max(0, width - offsetFromRight),
      y: Math.max(0, height - offsetFromTop),
      size: 8,
      color: rgb(0, 0.66, 0),
    });
  }

  const pdfBytes = await pdfDoc.save();
  saveByteArray(" 添加水印" + '.pdf', pdfBytes);
};
//保存pdf并下载
/**
 * Offers the given bytes to the user as a PDF download.
 *
 * Wraps the bytes in an `application/pdf` Blob, points a temporary `<a>`
 * element at an object URL for it and clicks that link. The object URL is
 * revoked afterwards so the Blob is not kept alive.
 *
 * @param {string} reportName - File name suggested for the download.
 * @param {Uint8Array|ArrayBuffer} byte - The PDF content.
 * @returns {void} Errors are logged, not thrown.
 */
const saveByteArray = (reportName, byte) => {
  try {
    const blob = new Blob([byte], { type: "application/pdf" });
    const objectUrl = window.URL.createObjectURL(blob);
    const link = document.createElement('a');
    link.href = objectUrl;
    link.download = reportName;
    link.click();
    // Revoke on the next tick rather than synchronously, so the click has
    // been dispatched before the URL becomes invalid.
    setTimeout(() => window.URL.revokeObjectURL(objectUrl), 0);
  } catch (error) {
    console.log("error", error);
  }
};

pdf-parse

  • pdf-parse - npm (npmjs.com)
  • 安装命令:npm install pdf-parse@^1.1.1。
  • 注意:pdf-parse必须在electron的主进程中使用。在渲染进程中获取到文件数据buffer后,需要传递到主进程中转换成文本,然后把文本从主进程中再传到渲染进程。
  • ipc完成主进程与渲染进程通讯。
// 渲染进程中
 <a-upload-dragger v-model:fileList="fileList" name="file" :multiple="false" :accept="'.pdf'"
        :before-upload="httpRequestPdfGetText" @change="handleChange" @drop="handleDrop">
        <p class="ant-upload-drag-icon">
          <inbox-outlined></inbox-outlined>
        </p>
        <p class="ant-upload-text">Click or drag file to this area to upload</p>
        <p class="ant-upload-hint">
          pdf文本提取
        </p>
      </a-upload-dragger>
    <div class="one-block-2">PDF内容提取</div>
    <div class="one-block-2">{{ wordData.pdfContent }}</div>
      
   let fileLocal = null;
// pdf内容提取
/**
 * `before-upload` hook of the text-extraction uploader.
 *
 * Reads the picked PDF into a `Uint8Array`, sends it over IPC on the
 * `ipcApiRoute.pdfParseOper` route and stores the reply in
 * `wordData.pdfContent`.
 *
 * @param {File} file - The file handed over by the upload component.
 * @returns {Promise<never>} Rejects with `false` when reading the file or the
 *   IPC call fails. On success the promise is left pending, as before.
 *   NOTE(review): presumably the pending promise is what keeps the upload
 *   component from actually sending the file — confirm before settling it.
 */
const httpRequestPdfGetText = (file) => {
  return new Promise((resolve, reject) => {
    wordData.htmlStr = "";
    const reader = new FileReader();
    reader.onload = async (event) => {
      try {
        const pdfBytes = new Uint8Array(event.target.result);
        console.log(`==========`);
        console.log(pdfBytes);
        // Awaited inside the try so an IPC failure is reported through
        // `reject` instead of becoming an unhandled rejection.
        const text = await ipc.invoke(ipcApiRoute.pdfParseOper, { pdfBuffer: pdfBytes });
        console.log(text);
        wordData.pdfContent = text;
      } catch (error) {
        console.error(error);
        reject(false);
      }
    };
    reader.onerror = () => {
      reject(false);
    };
    reader.readAsArrayBuffer(file);
  });
};

// 主进程中
const fs = require('fs');
const PDFParser = require('pdf-parse');
 pdfParseOper(pdfBuffer){    
    try {
      console.log('pdfParse')
      return new Promise((resolve, reject) => {
        PDFParser(pdfBuffer).then((pdfData) => {
          const pages = pdfData.text.split('\n\n');
          console.log(pages);
           let str;
              for(let i=1;i<pages.length;i++)
           str=str+pages[i]
          console.log(str)
          resolve(pdfData.text);
        });
      });    
    } catch (error) {
      console.log(error);
    } 
  }
      
      

pdfjs / pdfjs-dist

参考文章

安装 pdfjs-dist ,此处指定版本为 2.16.105
  • 最新版本已经到4.x,建议用2.16.105版本,亲测,其他版本的兼容性不好。

一、文件预览

1、安装 pdfjs-dist,此处指定版本为 2.16.105:

yarn add pdfjs-dist@2.16.105

注:3.x版本部分功能的实现方法与旧版本存在差异。

2、html 结构内容

<template>
导入部分
  <a-upload-dragger v-model:fileList="fileList" name="file" :multiple="false" :accept="'.pdf'"
        :before-upload="httpRequestPdfView" @change="handleChange" @drop="handleDrop">
        <p class="ant-upload-drag-icon">
          <inbox-outlined></inbox-outlined>
        </p>
        <p class="ant-upload-text">Click or drag file to this area to upload</p>
        <p class="ant-upload-hint">
          pdf预览及文本内容选择
        </p>
      </a-upload-dragger>
      展示部分
    <!-- Viewer container; bound to `refPdfView`, whose width the script reads to compute the scale. -->
    <div id="pdf-view" ref="refPdfView">
        <!-- One canvas per page; the script looks each one up by the id `pdfCanvas<page number>`. -->
        <canvas v-for="page in state.pdfPages" :key="page" :id="'pdfCanvas' + page" />
        <!-- Host element the script appends the selectable text layers to. -->
        <div id="text-view"></div>
    </div>
</template>

3、js 功能实现:

  • 一次完成所有页面的导入,文本提取后覆盖在canvas上。当鼠标选中时,更改文本颜色。
  • 文本复制及选取关键是textLayer元素,初始时是透明的,背景透明,文本span字体透明。只有选中后文本框颜色调整。
  • 还要注意缩放问题,根据页面大小调整整个store的大小,目标是textLayer完整覆盖在canvas上。
import * as PDF from 'pdfjs-dist'
// Getting the worker import to work took some effort.
// NOTE(review): `import()` returns a Promise, so `workerSrc` below is assigned
// a Promise object rather than a URL string — confirm this is what pdf.js
// expects here before changing the order or form of these two statements.
const pdfjsWorker = import('pdfjs-dist/build/pdf.worker.entry')
PDF.GlobalWorkerOptions.workerSrc = pdfjsWorker;
import * as pdfjsViewer from 'pdfjs-dist/web/pdf_viewer.js'
import 'pdfjs-dist/web/pdf_viewer.css'
import { TextLayerBuilder } from 'pdfjs-dist/web/pdf_viewer.js';
// Ref whose `.value.offsetWidth` renderPage reads to compute the page scale.
const refPdfView = ref();

// The currently loaded PDF document; assigned by httpRequestPdfView.
let pdfDoc = null;
// Event bus instance passed to every TextLayerBuilder.
const eventBus = new pdfjsViewer.EventBus();
/**
 * `before-upload` hook of the preview uploader.
 *
 * Reads the picked PDF into a `Uint8Array`, opens it with pdf.js, stores the
 * document in `pdfDoc`, records the file name and page count on `state`, and
 * starts rendering at page 1.
 *
 * @param {File} file - The file handed over by the upload component.
 * @returns {Promise<never>} Rejects with `false` when reading the file or
 *   opening the document fails. On success the promise is left pending, as
 *   before.
 *   NOTE(review): presumably the pending promise is what keeps the upload
 *   component from actually sending the file — confirm before settling it.
 */
const httpRequestPdfView = (file) => {
  return new Promise((resolve, reject) => {
    wordData.htmlStr = "";
    const reader = new FileReader();
    reader.onload = async (event) => {
      try {
        const pdfBytes = new Uint8Array(event.target.result);
        console.log(`==========`);
        console.log(pdfBytes);
        // Awaited so a load failure is caught here; a try/catch around a
        // `.then()` chain never sees asynchronous rejections.
        const pdf = await PDF.getDocument(pdfBytes).promise;
        pdfDoc = pdf;
        console.log(pdf);
        state.pdfPath = file.name;
        state.pdfPages = pdf.numPages;
        renderPage(1);
      } catch (error) {
        console.error(error);
        reject(false);
      }
    };
    reader.onerror = () => {
      reject(false);
    };
    reader.readAsArrayBuffer(file);
  });
};
/**
 * Renders page `num` of `pdfDoc` into the canvas with id `pdfCanvas<num>`,
 * overlays a selectable text layer inside `#text-view`, then continues with
 * the following page until `state.pdfPages` is reached.
 *
 * Pages are processed one after another: the text layer of a page is built
 * only after its canvas render has finished.
 *
 * @param {number} num - 1-based page number to start from.
 * @returns {Promise<void>} Settles once this page and all following pages
 *   have been processed. Failures are logged, not thrown; a failure to fetch
 *   the page stops the sequence, a failure while rendering does not.
 */
const renderPage = async (num) => {
  let page;
  try {
    page = await pdfDoc.getPage(num);
  } catch (error) {
    console.error(error);
    return;
  }

  try {
    const canvas = document.getElementById(`pdfCanvas${num}`);
    if (!canvas) {
      throw new Error(`Canvas element "pdfCanvas${num}" not found`);
    }
    const ctx = canvas.getContext('2d');
    // Pixel ratio between the device and the canvas backing store.
    const dpr = 1;// window.devicePixelRatio || 1
    const bsr
      = ctx.webkitBackingStorePixelRatio
      || ctx.mozBackingStorePixelRatio
      || ctx.msBackingStorePixelRatio
      || ctx.oBackingStorePixelRatio
      || ctx.backingStorePixelRatio
      || 1;
    const ratio = dpr / bsr;

    // Scale the page so that it fills the width of the viewer container.
    // Falls back to the element with id "pdf-view" when the ref is not bound.
    const container = refPdfView.value ?? document.getElementById('pdf-view');
    const dialogWidth = container.offsetWidth;
    const pageWidth = page.view[2] * ratio;
    state.pdfScale = dialogWidth / pageWidth;
    console.log(`page size=${page.view}`);
    console.log(`dpr=${dpr},ratio=${ratio} ,state.pdfScale=${state.pdfScale}`);

    const viewport = page.getViewport({ scale: state.pdfScale });
    canvas.width = viewport.width * ratio;
    canvas.height = viewport.height * ratio;
    console.log(`viewport.width=${viewport.width},viewport.height=${viewport.height}`);
    console.log(`canvas.width=${canvas.width},canvas.height=${canvas.height}`);
    canvas.style.width = '100%';
    canvas.style.height = '100%';
    state.pdfWidth = `${viewport.width}px`;
    console.log(`state.pdfWidth=${state.pdfWidth}`);
    ctx.setTransform(ratio, 0, 0, ratio, 0, 0);

    // `page.render()` returns a task object; its `.promise` is what settles
    // when painting is done, so that is what has to be awaited.
    const [textContent] = await Promise.all([
      page.getTextContent(),
      page.render({ canvasContext: ctx, viewport }).promise,
    ]);
    console.log(textContent);

    const textLayerDiv = document.createElement('div');
    // Do not rename this class: its styles come from the imported viewer CSS.
    textLayerDiv.setAttribute('class', 'textLayer');
    // The original inline style also listed width/height/color, but those
    // declarations were comma-terminated and therefore invalid CSS; width and
    // height are set explicitly here from the canvas box.
    Object.assign(textLayerDiv.style, {
      zIndex: '1',
      opacity: '1',
      backgroundColor: 'transparent',
      transform: 'scale(1)',
      left: `${canvas.offsetLeft}px`,
      top: `${canvas.offsetTop}px`,
      height: `${canvas.offsetHeight}px`,
      width: `${canvas.offsetWidth}px`,
    });

    const textView = document.querySelector('#text-view');
    textView.appendChild(textLayerDiv);

    const textLayer = new TextLayerBuilder({
      textLayerDiv,
      // Zero-based index of the page that was requested as `num`.
      pageIndex: num - 1,
      viewport,
      eventBus,
    });
    textLayer.setTextContent(textContent);
    textLayer.render();
  } catch (error) {
    console.error('Error rendering page:', error);
  }

  if (state.pdfPages > num) {
    await renderPage(num + 1);
  }
};