pdf & docx格式文件

107 阅读1分钟

import mammoth from "mammoth";
import * as pdfjsLib from "pdfjs-dist/build/pdf";//"pdfjs-dist": "2.0.943"版本
import pdfjsWorker from "pdfjs-dist/build/pdf.worker.entry";

安装 Babel 插件，用于转译 pdfjs-dist 源码中的类私有方法等新语法：
npm install --save-dev @babel/plugin-proposal-class-properties @babel/plugin-proposal-optional-chaining @babel/plugin-proposal-nullish-coalescing-operator @babel/plugin-proposal-private-methods

babel.config.js中设置

// Babel plugin list (fragment of the exported config object in babel.config.js).
// Each entry enables transpiling one newer syntax feature:
//   - optional chaining (`a?.b`)
//   - nullish coalescing (`a ?? b`)
//   - class private methods (`#method()`)
// NOTE(review): the install command in this article also installs
// @babel/plugin-proposal-class-properties, which is not listed here —
// confirm whether it needs to be added.
plugins: [
    "@babel/plugin-proposal-optional-chaining",
    "@babel/plugin-proposal-nullish-coalescing-operator",
    "@babel/plugin-proposal-private-methods",
  ],

// Point pdf.js at its worker: the imported worker entry is assigned as the
// global `workerSrc` before any document is loaded.
pdfjsLib.GlobalWorkerOptions.workerSrc = pdfjsWorker;

changeFile(file, fileList) {  
  const type = file.name.split(".").pop().toLowerCase();  
  if (file.status !== "ready") return;  

  const reader = new FileReader();  
  
  const handleTextFile = () => {  
    reader.readAsText(file.raw);  
    reader.onload = () => {  
      if (reader.result) {  
        this.textarea = reader.result;  
      }  
    };  
  };  

  const handleDocxFile = () => {  
    reader.readAsArrayBuffer(file.raw);  
    reader.onload = (e) => {  
      const arrayBuffer = e.target.result;  
      mammoth.convertToHtml({ arrayBuffer }).then((result) => {  
        this.textarea = result.value;  
      }).catch(console.error);  
    };  
  };  

  const handlePdfFile = async () => {  
    reader.readAsArrayBuffer(file.raw);  
    reader.onload = async (e) => {  
      const typedarray = new Uint8Array(e.target.result);  
      const pdf = await pdfjsLib.getDocument(typedarray).promise;  
      let text = "";  

      for (let i = 1; i <= pdf.numPages; i++) {  
        const page = await pdf.getPage(i);  
        const content = await page.getTextContent();  
        const pageText = content.items.map(item => item.str).join("");  
        text += `${pageText}<br/>`; // 每页文本用换行分隔  
      }  

      this.textarea = text;  
    };  
  };  
  
  switch (type) {  
    case "txt":  
      handleTextFile();  
      break;  
    case "docx":  
      handleDocxFile();  
      break;  
    case "pdf":  
      handlePdfFile();  
      break;  
    default:  
      return; // 不支持的文件类型  
  }  
}