将以下格式的文档转换为 XML(.doc 和 .ppt 格式目前尚未实现):
.doc,.docx,
.xls,.xlsx,
.pdf,
.txt,.rtf,
.md,
.html,.htm,
.ppt,.pptx
<div>
<input type="file" @change="handleFileUpload" accept=".doc,.docx,.xls,.xlsx,.pdf,.txt,.rtf,.md,.html,.htm,.ppt,.pptx">
<button @click="convertToXml">转换为XML</button>
<div v-if="xmlContent">
<h3>XML 结果:</h3>
<pre>{{ xmlContent }}</pre>
</div>
</div>
import mammoth from 'mammoth'
import * as XLSX from 'xlsx'
import * as pdfjsLib from 'pdfjs-dist/webpack';
import { marked } from 'marked' // .md 转 xml
import { DOMParser } from 'xmldom'
import XMLSerializer from 'xmlserializer'
import JSZip from 'jszip';
// XML declaration placed at the very start of generated documents.
const XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>';

// Entity replacements for the five characters XML reserves in text and attribute values.
const XML_ENTITIES = {
  '<': '&lt;',
  '>': '&gt;',
  '&': '&amp;',
  "'": '&apos;',
  '"': '&quot;',
};

// Matches the archive path of a slide part (ppt/slides/slideN.xml) and captures N.
const SLIDE_PATH_PATTERN = /^ppt\/slides\/slide(\d+)\.xml$/;

// Escapes XML-reserved characters; null and undefined become an empty string.
function escapeXmlText(value) {
  if (value === null || value === undefined) {
    return '';
  }
  return String(value).replace(/[<>&'"]/g, (char) => XML_ENTITIES[char]);
}

// Keeps only slide parts and orders them by the number in their file name.
function sortSlidePaths(paths) {
  return paths
    .map((path) => ({ path, match: SLIDE_PATH_PATTERN.exec(path) }))
    .filter((entry) => entry.match !== null)
    .sort((a, b) => Number(a.match[1]) - Number(b.match[1]))
    .map((entry) => entry.path);
}

// Wraps already-cleaned slide XML fragments in a <presentation> document.
function buildPresentationXml(slideXmls) {
  const slides = slideXmls.map(
    (xml, index) => `<slide number="${index + 1}">\n${xml}\n</slide>`
  );
  // The declaration must be the first characters of the document, so no leading whitespace.
  return [XML_DECLARATION, '<presentation>', ...slides, '</presentation>'].join('\n');
}

// Serializes one pdf.js text item as a <text> element line.
function pdfTextItemToXml(item) {
  // "]]>" would terminate the CDATA section early, so split it across two sections.
  const safeText = String(item.str).split(']]>').join(']]]]><![CDATA[>');
  return (
    ` <text x="${item.transform[4]}" y="${item.transform[5]}" ` +
    `width="${item.width}" height="${item.height}" ` +
    `font="${escapeXmlText(item.fontName)}">` +
    `<![CDATA[${safeText}]]></text>\n`
  );
}

// Serializes an array of rows (each an array of cell values) as <excel><row><cell>…
function rowsToXml(rows) {
  let xml = '<excel>';
  for (const row of rows) {
    xml += '<row>';
    // for...of also visits holes in sparse rows, so empty cells keep their column position.
    for (const cell of row) {
      xml += `<cell>${escapeXmlText(cell)}</cell>`;
    }
    xml += '</row>';
  }
  return `${xml}</excel>`;
}

/**
 * Converts uploaded documents to an XML (or XML-like) string.
 * Supported extensions: pptx, docx, xlsx/xls, pdf, md, html/htm, txt/rtf.
 * .ppt and .doc are recognised but not converted; a hint message is returned instead.
 */
const documentToXmlConverter = {
  /**
   * Public entry point: dispatches on the file extension.
   * @param {File} file - Object with a `name` and readable content.
   * @returns {Promise<string>} The converted content.
   * @throws {Error} When the extension is missing or unsupported, or a converter fails.
   */
  async convertToXml(file) {
    const name = file && typeof file.name === 'string' ? file.name : '';
    // Require a dot so that a file literally named "pdf" is not treated as a PDF.
    const dotIndex = name.lastIndexOf('.');
    const extension = dotIndex === -1 ? '' : name.slice(dotIndex + 1).toLowerCase();
    switch (extension) {
      case 'ppt': // TODO: no conversion available yet
      case 'pptx':
        return this.powerPointToXml(file, extension);
      case 'doc': // TODO: no conversion available yet
      case 'docx':
        return this.wordToXml(file, extension);
      case 'xlsx':
      case 'xls':
        return this.excelToXml(file);
      case 'pdf':
        return this.pdfToXml(file);
      case 'md':
        return this.markdownToXml(file);
      case 'html':
      case 'htm':
        return this.htmlToXml(file);
      case 'txt':
      case 'rtf':
        return this.textToXml(file);
      default:
        throw new Error('不支持文件格式');
    }
  },

  /**
   * 1. PPTX to XML (.ppt is not supported and yields a hint message).
   * Slides are ordered by the number in their part name; this is presumably
   * the presentation order, but the archive's relationship data is not consulted.
   * @param {File} file
   * @param {string} extension - 'ppt' or 'pptx'.
   * @returns {Promise<string>}
   * @throws {Error} When the archive cannot be read.
   */
  async powerPointToXml(file, extension) {
    try {
      if (extension !== 'pptx') {
        return this.textToXmlString(' 对于 ppt 文件,先转换为 pptx 再处理');
      }
      const arrayBuffer = await this.readFileAsArrayBuffer(file);
      const zip = await JSZip.loadAsync(arrayBuffer);
      const slidePaths = sortSlidePaths(Object.keys(zip.files));
      const slideXmls = await Promise.all(
        slidePaths.map((slidePath) => zip.file(slidePath).async('text'))
      );
      const cleaned = slideXmls.map((xml) => this.cleanXML(xml));
      return this.textToXmlString(buildPresentationXml(cleaned));
    } catch (error) {
      console.error('powerPoint转换错误:', error);
      throw new Error('powerPoint 转换失败', { cause: error });
    }
  },

  /**
   * 2. DOCX to text via mammoth (.doc is not supported and yields a hint message).
   * @param {File} file
   * @param {string} extension - 'doc' or 'docx'.
   * @returns {Promise<string>}
   * @throws {Error} When extraction fails.
   */
  async wordToXml(file, extension) {
    try {
      if (extension !== 'docx') {
        console.error('对于DOC文件,先转换为DOCX再处理');
        return this.textToXmlString(' 对于DOC文件,先转换为DOCX再处理');
      }
      const arrayBuffer = await this.readFileAsArrayBuffer(file);
      const result = await mammoth.extractRawText({ arrayBuffer });
      return this.textToXmlString(result.value);
    } catch (error) {
      console.error('Word转换错误:', error);
      throw new Error('Word文档转换失败', { cause: error });
    }
  },

  /**
   * 3. PDF to XML: one <page> per page, one <text> per pdf.js text item.
   * @param {File} file
   * @returns {Promise<string>}
   * @throws {Error} When the PDF cannot be read or parsed.
   */
  async pdfToXml(file) {
    try {
      const arrayBuffer = await this.readFileAsArrayBuffer(file);
      const pdf = await pdfjsLib.getDocument(arrayBuffer).promise;
      let xmlStructure = `${XML_DECLARATION}\n<pdf>\n`;
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        const page = await pdf.getPage(pageNumber);
        const textContent = await page.getTextContent();
        xmlStructure += ` <page number="${pageNumber}">\n`;
        xmlStructure += textContent.items.map((item) => pdfTextItemToXml(item)).join('');
        xmlStructure += ' </page>\n';
      }
      xmlStructure += '</pdf>';
      return this.textToXmlString(xmlStructure);
    } catch (error) {
      console.error('PDF转换失败:', error);
      // This plain object has no $emit; only notify when a host (e.g. a component
      // these methods were mixed into) actually provides one.
      if (typeof this.$emit === 'function') {
        this.$emit('error', error);
      }
      // Fail like the other converters instead of resolving with undefined.
      throw new Error('PDF 转换失败', { cause: error });
    }
  },

  /**
   * 4. Markdown: returns the HTML produced by `marked` for the file's text.
   * @param {File} file
   * @returns {Promise<string>}
   * @throws {Error} When the file cannot be read or rendered.
   */
  async markdownToXml(file) {
    try {
      const markdown = await this.readFileAsText(file);
      return marked(markdown);
    } catch (error) {
      console.error('Markdown转换错误:', error);
      throw new Error('Markdown 转换失败', { cause: error });
    }
  },

  /**
   * 5. HTML/HTM: parses the markup and re-serializes the resulting document.
   * @param {File} file
   * @returns {Promise<string>}
   * @throws {Error} When the file cannot be read, parsed or serialized.
   */
  async htmlToXml(file) {
    try {
      const html = await this.readFileAsText(file);
      const doc = new DOMParser().parseFromString(html, 'text/html');
      return this.textToXmlString(XMLSerializer.serializeToString(doc));
    } catch (error) {
      console.error('HTML转换错误:', error);
      throw new Error('HTML 转换失败', { cause: error });
    }
  },

  /**
   * 6. XLSX/XLS to XML. Only the first sheet is converted.
   * @param {File} file
   * @returns {Promise<string>} `<excel><row><cell>…</cell></row>…</excel>`
   * @throws {Error} When the workbook cannot be read.
   */
  async excelToXml(file) {
    try {
      const data = await this.readFileAsArrayBuffer(file);
      const workbook = XLSX.read(data);
      const firstSheet = workbook.Sheets[workbook.SheetNames[0]];
      // defval fills empty cells so every row keeps its column alignment.
      const rows = XLSX.utils.sheet_to_json(firstSheet, { header: 1, defval: '' });
      return rowsToXml(rows);
    } catch (error) {
      console.error('Excel转换错误:', error);
      throw new Error('Excel 转换失败', { cause: error });
    }
  },

  /**
   * 7. TXT/RTF: returns the file's text content as-is.
   * @param {File} file
   * @returns {Promise<string>}
   * @throws {Error} When the file cannot be read.
   */
  async textToXml(file) {
    try {
      return this.textToXmlString(await this.readFileAsText(file));
    } catch (error) {
      console.error('文本转换错误:', error);
      throw new Error('文本转换失败', { cause: error });
    }
  },

  /**
   * Reads a file into an ArrayBuffer.
   * @param {Blob} file
   * @returns {Promise<ArrayBuffer>} Rejects when reading fails.
   */
  readFileAsArrayBuffer(file) {
    // Runtimes without FileReader can still read through the Blob API.
    if (typeof FileReader === 'undefined') {
      return file.arrayBuffer();
    }
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      reader.onload = () => resolve(reader.result);
      reader.onerror = () => reject(reader.error);
      reader.readAsArrayBuffer(file);
    });
  },

  /**
   * Reads a file as text.
   * @param {Blob} file
   * @returns {Promise<string>} Rejects when reading fails.
   */
  readFileAsText(file) {
    // Runtimes without FileReader can still read through the Blob API.
    if (typeof FileReader === 'undefined') {
      return file.text();
    }
    return new Promise((resolve, reject) => {
      const reader = new FileReader();
      reader.onload = () => resolve(reader.result);
      // Without this handler a failed read would leave the promise pending forever.
      reader.onerror = () => reject(reader.error);
      reader.readAsText(file);
    });
  },

  /**
   * Helper: final pass applied to converter output; returns the input as a string.
   * It must not escape, because callers pass complete XML documents whose markup
   * has to survive; escaping is done where individual values are inserted.
   * @param {*} text
   * @returns {string}
   */
  textToXmlString(text) {
    return `${text}`;
  },

  /**
   * Escapes the XML special characters < > & ' " in a single text or attribute value.
   * @param {*} unsafe - Value to escape; null/undefined yield ''.
   * @returns {string}
   */
  escapeXml(unsafe) {
    return escapeXmlText(unsafe);
  },

  /**
   * Prepares a slide's XML for embedding: drops a leading XML declaration (only
   * legal at the start of a document), namespace declarations, and all `a:` tags
   * while keeping their content.
   * @param {string} rawXML
   * @returns {string}
   */
  cleanXML(rawXML) {
    return rawXML
      .replace(/^\s*<\?xml\s[^>]*\?>\s*/, '') // drop the embedded XML declaration
      .replace(/xmlns(:[a-z0-9]+)?=".*?"/g, '') // drop namespace declarations
      .replace(/<\/?a:.*?>/g, ''); // drop opening, closing and self-closing a: tags
  },
};

export default documentToXmlConverter;