使用 pdfjs-dist(PDF.js)读取 PDF 文件内容

5,219 阅读1分钟
 "pdfjs-dist": "^2.10.377"

安装

 npm install pdfjs-dist --save

在文件头部引入,并设置 GlobalWorkerOptions.workerSrc,以解决 No "GlobalWorkerOptions.workerSrc" specified 报错

 import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry'
 import { getDocument, GlobalWorkerOptions } from 'pdfjs-dist'
 GlobalWorkerOptions.workerSrc = pdfjsWorker

读取文件

 // 读取page
 const readPDFPage = async(doc, pageNo) => {
   const page = await doc.getPage(pageNo)
   const tokenizedText = await page.getTextContent()
   const pageText = tokenizedText.items.map(token => token.str).join('')
   return pageText.replaceAll(/\s+/g, '\n')
 }
 ​
 // 读取文件
 const readPDFDoc = (file, resolve, reject) => {
   let reader = new FileReader()
   reader.readAsArrayBuffer(file)
   reader.onload = async(event) => {
     try {
       const doc = await getDocument(event.target.result).promise
       const pageTextPromises = []
       for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
         pageTextPromises.push(readPDFPage(doc, pageNo))
       }
       const pageTexts = await Promise.all(pageTextPromises)
       resolve(pageTexts.join(''))
     } catch (err) {
       reject(err)
     }
   }
   reader.onerror = reject
 }

参考

github.com/mozilla/pdf.js