Java Doc文件转HTML

479 阅读1分钟

原理

使用 Apache POI 的 HWPFDocument 类读取 Doc 文件，使用 WordToHtmlConverter 转换成 HTML，并设置 PicturesManager 处理图片的 src（以下示例将图片转换成 base64）

实现

接口层

/**
 * Accepts an uploaded file and returns the result of {@code service.convertDoc}
 * applied to its contents.
 *
 * @param file the multipart upload bound to the {@code file} request parameter
 * @return the string produced by the service; {@code null} only if reading the
 *         upload failed and {@code BizException.fail} returned normally
 *         (NOTE(review): presumably {@code fail} throws - confirm)
 */
@PostMapping(value = "/convert-doc")
public String convertDoc(@RequestParam("file") MultipartFile file) {
    String result = null;
    // try-with-resources: the upload stream is opened here, so it is closed here
    // once the service has finished reading it.
    try (java.io.InputStream inputStream = file.getInputStream()) {
        result = service.convertDoc(inputStream);
    } catch (IOException e) {
        BizException.fail("文件读取失败");
    }
    return result;
}

服务层

/**
 * Converts a Word document read from the given stream into an HTML string.
 *
 * <p>The stream is loaded as an {@code HWPFDocument}, rendered through
 * {@code WordToHtmlConverter}, and serialized as indented, UTF-8 encoded HTML.
 * Each picture is returned to the converter as a {@code data:} URI carrying the
 * Base64-encoded picture bytes instead of being written to a file.
 *
 * <p>Failures are reported through {@code BizException.fail}.
 * NOTE(review): presumably {@code fail} always throws - confirm. The
 * {@code return null} at the end is reached only if it returns normally.
 *
 * @param inputStream stream positioned at the start of the document; it is not
 *                    closed by this method
 * @return the generated HTML, or {@code null} if conversion failed and
 *         {@code BizException.fail} returned normally
 */
public String convertDoc(InputStream inputStream) {
    HWPFDocument doc = null;
    try {
        doc = new HWPFDocument(inputStream);
    } catch (IOException | IllegalArgumentException e) {
        // POI reports some unsupported inputs (for example OOXML .docx files) with
        // IllegalArgumentException subclasses rather than IOException; both mean
        // the upload is not a usable .doc file.
        BizException.fail("不是doc文件");
    }

    try {
        WordToHtmlConverter converter = new WordToHtmlConverter(
            DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

        // Inline every picture as a Base64 data URI.
        converter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) ->
            "data:" + pictureType.getMime() + ";base64," + Base64.getEncoder().encodeToString(content)
        );

        converter.processDocument(doc);
        Document htmlDocument = converter.getDocument();

        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");

        ByteArrayOutputStream outStream = new ByteArrayOutputStream();
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(outStream));

        // Decode with the same charset the serializer wrote; the no-argument
        // toString() would use the platform default and can garble non-ASCII text.
        return outStream.toString("UTF-8");
    } catch (Exception e) {
        BizException.fail("文件转换失败");
    }

    // Reached only when BizException.fail returned without throwing.
    return null;
}