转换为XML

127 阅读1分钟

实现以下格式文档转换为 XML(.doc、.ppt 格式目前尚未实现):

  .doc,.docx,
  .xls,.xlsx,
  .pdf,
  .txt,.rtf,
  .md,
  .html,.htm,
  .ppt,.pptx
  <div>
    <!-- File picker; the accept list mirrors the extensions the converter dispatches on. -->
    <input type="file" @change="handleFileUpload" accept=".doc,.docx,.xls,.xlsx,.pdf,.txt,.rtf,.md,.html,.htm,.ppt,.pptx">
    <button @click="convertToXml">转换为XML</button>
    <!-- The result panel is rendered only while xmlContent is truthy. -->
    <div v-if="xmlContent">
      <h3>XML 结果:</h3>
      <pre>{{ xmlContent }}</pre>
    </div>
  </div>
import mammoth from 'mammoth'
import * as XLSX from 'xlsx'
import * as pdfjsLib from 'pdfjs-dist/webpack';
import { marked } from 'marked' // .md 转 xml
import { DOMParser } from 'xmldom'
import XMLSerializer from 'xmlserializer'
import JSZip from 'jszip';

// Replacement entities for the five characters that are special in XML text
// and attribute values.
const XML_ENTITY_BY_CHAR = Object.freeze({
    '<': '&lt;',
    '>': '&gt;',
    '&': '&amp;',
    '\'': '&apos;',
    '"': '&quot;',
});

/**
 * Converters that turn an uploaded file into an XML (or XML-like) string.
 *
 * File contents are read through `FileReader` / `file.arrayBuffer()`, and the
 * format-specific work is delegated to the libraries imported at the top of
 * the file (mammoth, xlsx, pdf.js, marked, xmldom, xmlserializer, JSZip).
 * Methods call each other through `this`, so they must be invoked on this
 * object (or on an object that carries all of them).
 */
const fileToXmlConverter = {
    /**
     * Public entry point: picks a converter from the file-name extension.
     *
     * @param {File} file - File to convert; only `file.name` is inspected here.
     * @returns {Promise<string>} Output of the matching converter.
     * @throws {Error} When the file (or its name) is missing, or the
     *     extension is not one of the supported formats.
     */
    async convertToXml(file) {
        // Fail with the same "unsupported" error instead of a TypeError when
        // no usable file object was supplied.
        if (!file || typeof file.name !== 'string') {
            throw new Error('不支持文件格式');
        }
        const extension = file.name.split('.').pop().toLowerCase();
        switch (extension) {
            case 'ppt': // TODO: legacy .ppt has no converter yet (a hint message is returned)
            case 'pptx':
                return this.powerPointToXml(file, extension);
            case 'doc': // TODO: legacy .doc has no converter yet (a hint message is returned)
            case 'docx':
                return this.wordToXml(file, extension);
            case 'xlsx':
            case 'xls':
                return this.excelToXml(file);
            case 'pdf':
                return this.pdfToXml(file);
            case 'md':
                return this.markdownToXml(file);
            case 'html':
            case 'htm':
                return this.htmlToXml(file);
            case 'txt':
            case 'rtf':
                return this.textToXml(file);
            default:
                throw new Error('不支持文件格式');
        }
    },

    /**
     * 1. PPTX -> XML. Legacy `.ppt` is not supported: a hint message is
     * returned instead of XML.
     *
     * Every `ppt/slides/slideN.xml` entry of the archive is read, passed
     * through `cleanXML`, and wrapped in `<slide number="…">` inside a single
     * `<presentation>` root.
     *
     * @param {File} file - The uploaded presentation.
     * @param {string} extension - Lower-cased file extension (`'pptx'` or `'ppt'`).
     * @returns {Promise<string>} The combined document, or the hint message.
     * @throws {Error} When reading or unpacking the archive fails.
     */
    async powerPointToXml(file, extension) {
        try {
            if (extension !== 'pptx') {
                return this.textToXmlString(' 对于 ppt 文件,先转换为 pptx 再处理');
            }

            const arrayBuffer = await this.readFileAsArrayBuffer(file);
            const zip = await JSZip.loadAsync(arrayBuffer);

            // Archive entries come back in zip order, where "slide10" can
            // precede "slide2"; sort by the numeric suffix instead.
            // NOTE(review): the authoritative slide order lives in
            // ppt/presentation.xml, which is not read here — file numbers are
            // presumably close enough for typical decks; confirm if it matters.
            const slidePattern = /^ppt\/slides\/slide(\d+)\.xml$/;
            const slideNumber = (name) => Number(slidePattern.exec(name)[1]);
            const slidePaths = Object.keys(zip.files)
                .filter((name) => slidePattern.test(name))
                .sort((a, b) => slideNumber(a) - slideNumber(b));

            const slides = await Promise.all(
                slidePaths.map((slidePath) => zip.file(slidePath).async('text'))
            );

            const body = slides
                .map((xml, index) => {
                    // A slide part carries its own XML declaration (and maybe
                    // a BOM); neither is allowed inside another element.
                    const withoutDeclaration = xml.replace(/^\uFEFF?\s*<\?xml[^>]*\?>\s*/, '');
                    return `  <slide number="${index + 1}">\n${this.cleanXML(withoutDeclaration)}\n  </slide>`;
                })
                .join('\n');

            // The XML declaration must be the very first thing in the output.
            const result = `<?xml version="1.0" encoding="UTF-8"?>\n<presentation>\n${body}\n</presentation>`;
            return this.textToXmlString(result);
        } catch (error) {
            console.error('powerPoint转换错误:', error);
            throw new Error('powerPoint 转换失败', { cause: error });
        }
    },

    /**
     * 2. DOCX -> text. Legacy `.doc` is not supported: a hint message is
     * logged and returned instead.
     *
     * @param {File} file - The uploaded document.
     * @param {string} extension - Lower-cased file extension (`'docx'` or `'doc'`).
     * @returns {Promise<string>} The `value` mammoth's `extractRawText`
     *     produced, or the hint message.
     * @throws {Error} When reading or extracting the document fails.
     */
    async wordToXml(file, extension) {
        try {
            if (extension === 'docx') {
                const result = await mammoth.extractRawText({ arrayBuffer: await file.arrayBuffer() });
                return this.textToXmlString(result.value);
            }
            console.error('对于DOC文件,先转换为DOCX再处理');
            return this.textToXmlString(' 对于DOC文件,先转换为DOCX再处理');
        } catch (error) {
            console.error('Word转换错误:', error);
            throw new Error('Word文档转换失败', { cause: error });
        }
    },

    /**
     * 3. PDF -> XML. Emits one `<page>` per page and one `<text>` per pdf.js
     * text item, carrying its position (`transform[4]`, `transform[5]`),
     * size, font name, and the string itself in a CDATA section.
     *
     * @param {File} file - The uploaded PDF.
     * @returns {Promise<string>} The generated XML document.
     * @throws {Error} When reading or parsing the PDF fails; the original
     *     error is available as `cause`.
     */
    async pdfToXml(file) {
        try {
            const arrayBuffer = await this.readFileAsArrayBuffer(file);
            const pdf = await pdfjsLib.getDocument(arrayBuffer).promise;

            const lines = ['<?xml version="1.0" encoding="UTF-8"?>', '<pdf>'];
            for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
                // Pages are fetched one at a time, in document order.
                const page = await pdf.getPage(pageNumber);
                const textContent = await page.getTextContent();
                lines.push(`  <page number="${pageNumber}">`);
                textContent.items.forEach((item) => {
                    // "]]>" would close the CDATA section early, so split it
                    // across two adjacent sections.
                    const cdata = String(item.str).replace(/]]>/g, ']]]]><![CDATA[>');
                    lines.push(
                        `    <text x="${item.transform[4]}" y="${item.transform[5]}" ` +
                        `width="${item.width}" height="${item.height}" ` +
                        `font="${this.escapeXmlEntities(item.fontName)}">` +
                        `<![CDATA[${cdata}]]></text>`
                    );
                });
                lines.push('  </page>');
            }
            lines.push('</pdf>');
            return this.textToXmlString(lines.join('\n'));
        } catch (error) {
            console.error('PDF转换失败:', error);
            // This object has no $emit of its own; only notify when a host
            // (e.g. a component these methods were mixed into) provides one.
            if (typeof this.$emit === 'function') {
                this.$emit('error', error);
            }
            // Reject like the other converters instead of resolving to undefined.
            throw new Error('PDF转换失败', { cause: error });
        }
    },

    /**
     * 4. Markdown -> HTML markup, as rendered by `marked`. The markup is
     * returned as-is; it is not re-serialized as XML.
     *
     * @param {File} file - The uploaded Markdown file.
     * @returns {Promise<string>} The markup produced by `marked`.
     */
    async markdownToXml(file) {
        const markdown = await this.readFileAsText(file);
        return marked(markdown);
    },

    /**
     * 5. HTML / HTM -> XML. Parses the file with xmldom's `DOMParser` and
     * serializes the resulting document with `xmlserializer`.
     *
     * @param {File} file - The uploaded HTML file.
     * @returns {Promise<string>} The serialized document.
     */
    async htmlToXml(file) {
        const html = await this.readFileAsText(file);
        const doc = new DOMParser().parseFromString(html, 'text/html');
        return this.textToXmlString(XMLSerializer.serializeToString(doc));
    },

    /**
     * 6. XLSX / XLS -> XML. Only the first sheet of the workbook is
     * converted, as `<excel><row><cell>…</cell></row>…</excel>`.
     *
     * @param {File} file - The uploaded spreadsheet.
     * @returns {Promise<string>} The generated XML.
     */
    async excelToXml(file) {
        const data = await file.arrayBuffer();
        const workbook = XLSX.read(data);
        const firstSheet = workbook.Sheets[workbook.SheetNames[0]];
        // header: 1 yields one array per row.
        const rows = XLSX.utils.sheet_to_json(firstSheet, { header: 1 });

        const body = rows
            .map((row) => {
                // Array.from visits holes too (forEach skips them), so an
                // empty cell keeps its column position as an empty <cell>.
                const cells = Array.from(row, (cell) => {
                    const value = cell == null ? '' : cell;
                    return `<cell>${this.escapeXmlEntities(value)}</cell>`;
                });
                return `<row>${cells.join('')}</row>`;
            })
            .join('');

        return `<excel>${body}</excel>`;
    },

    /**
     * 7. TXT / RTF -> string. The file is read as text and passed through
     * `textToXmlString`.
     *
     * @param {File} file - The uploaded text file.
     * @returns {Promise<string>} The file content.
     */
    async textToXml(file) {
        const text = await this.readFileAsText(file);
        return this.textToXmlString(text);
    },

    /**
     * Reads a file into an ArrayBuffer using `FileReader`.
     *
     * @param {Blob} file - File to read.
     * @returns {Promise<ArrayBuffer>} Resolves with the file content; rejects
     *     with the reader's error when the read fails.
     */
    readFileAsArrayBuffer(file) {
        return new Promise((resolve, reject) => {
            const reader = new FileReader();
            reader.onload = () => resolve(reader.result);
            // Reject with the underlying error rather than the event object.
            reader.onerror = () => reject(reader.error);
            reader.readAsArrayBuffer(file);
        });
    },

    /**
     * Reads a file as text using `FileReader`.
     *
     * @param {Blob} file - File to read.
     * @returns {Promise<string>} Resolves with the text; rejects with the
     *     reader's error when the read fails, so callers never hang.
     */
    readFileAsText(file) {
        return new Promise((resolve, reject) => {
            const reader = new FileReader();
            reader.onload = () => resolve(reader.result);
            reader.onerror = () => reject(reader.error);
            reader.readAsText(file);
        });
    },

    /**
     * Helper through which most converters return their output: stringifies
     * the result of `escapeXml`.
     *
     * @param {*} text - Converter output.
     * @returns {string} The value as a string.
     */
    textToXmlString(text) {
        return `${this.escapeXml(text)}`;
    },

    /**
     * Pass-through kept for compatibility: returns its argument unchanged.
     *
     * Escaping stays disabled here because `textToXmlString` is also called
     * with markup that is already XML (PDF, PPTX and HTML output); escaping
     * at this point would turn that markup into entity text. Use
     * `escapeXmlEntities` when embedding raw values into markup.
     *
     * @param {*} unsafe - Any value.
     * @returns {*} The same value.
     */
    escapeXml(unsafe) {
        return unsafe;
    },

    /**
     * Escapes `<`, `>`, `&`, `'` and `"` so a raw value can be embedded in
     * XML text content or in an attribute value.
     *
     * @param {*} value - Value to embed; converted with `String()` first.
     * @returns {string} The escaped string.
     */
    escapeXmlEntities(value) {
        return String(value).replace(/[<>&'"]/g, (char) => XML_ENTITY_BY_CHAR[char]);
    },

    /**
     * Simplifies slide XML: drops `xmlns` / `xmlns:prefix` declarations and
     * removes every opening and closing `a:`-prefixed tag, leaving the
     * content that was between those tags in place.
     *
     * @param {string} rawXML - XML text of one slide.
     * @returns {string} The simplified XML text.
     */
    cleanXML(rawXML) {
        return rawXML
            .replace(/xmlns(:[a-z0-9]+)?=".*?"/g, '') // namespace declarations
            .replace(/<a:.*?>/g, '')                  // opening / self-closing a: tags
            .replace(/<\/a:.*?>/g, '');               // closing a: tags
    },
};

export default fileToXmlConverter;