pdf转html

33 阅读2分钟

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.TextPosition;

// tabula-java classes (the tabula jar must be on the classpath)
import technology.tabula.ObjectExtractor;
import technology.tabula.Page;
import technology.tabula.Rectangle;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import technology.tabula.extractors.BasicExtractionAlgorithm;
import technology.tabula.Table;
import technology.tabula.RectangularTextContainer;

/**
 * Converts a PDF into a single HTML file: for every page, the page's plain text
 * (extracted with PDFBox) followed by the tables detected on that page
 * (extracted with tabula-java).
 *
 * <p>Text and tables are extracted independently and written in a fixed order
 * (text first, then tables). The position of a table inside the page is not
 * preserved; doing so would require comparing each table's bounding box with
 * the y-coordinates of the text lines.
 */
public class PdfToHtmlWithTables {

    /**
     * Command-line entry point.
     *
     * @param args {@code args[0]} is the input PDF path, {@code args[1]} the output HTML path
     * @throws Exception if the PDF cannot be loaded or the HTML file cannot be written
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.out.println("用法: java PdfToHtmlWithTables input.pdf out.html");
            return;
        }
        File pdfFile = new File(args[0]);
        File outHtml = new File(args[1]);

        try (PDDocument doc = PDDocument.load(pdfFile)) {
            // 1) Plain text of every page.
            List<String> pageTexts = extractPageTexts(doc);

            // 2) Tables of every page (possibly none).
            List<List<Table>> allPagesTables = extractTablesWithTabula(doc);

            // 3) Merge: per page, text first and tables afterwards.
            String html = buildHtml(pageTexts, allPagesTables);

            // 4) Write the result as UTF-8, matching the charset declared in the HTML head.
            try (Writer w = new OutputStreamWriter(new FileOutputStream(outHtml), StandardCharsets.UTF_8)) {
                w.write(html);
            }

            System.out.println("已生成 HTML: " + outHtml.getAbsolutePath());
        }
    }

    /**
     * Extracts the text of each page separately using PDFBox.
     *
     * @param doc the loaded PDF document
     * @return one trimmed string per page, in page order; never contains {@code null}
     * @throws IOException if PDFBox fails to read the document
     */
    static List<String> extractPageTexts(PDDocument doc) throws IOException {
        PDFTextStripper stripper = new PDFTextStripper();
        List<String> res = new ArrayList<>();
        int pages = doc.getNumberOfPages();
        // The stripper's page range is 1-based; restrict it to a single page per call.
        for (int p = 1; p <= pages; p++) {
            stripper.setStartPage(p);
            stripper.setEndPage(p);
            String text = stripper.getText(doc);
            res.add(text == null ? "" : text.trim());
        }
        return res;
    }

    /**
     * Extracts the tables of each page using tabula-java.
     *
     * <p>For every page the spreadsheet algorithm is tried first; when it finds
     * nothing or fails, the basic algorithm is used instead. A failure of the
     * basic algorithm is not caught and propagates to the caller.
     *
     * @param doc the loaded PDF document
     * @return one list of tables per page, in page order; a page without tables
     *         yields an empty list
     */
    static List<List<Table>> extractTablesWithTabula(PDDocument doc) {
        List<List<Table>> pagesTables = new ArrayList<>();
        ObjectExtractor oe = new ObjectExtractor(doc);
        int numPages = doc.getNumberOfPages();
        SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
        BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm();

        for (int i = 1; i <= numPages; i++) {
            Page page = oe.extract(i);

            List<Table> found = null;
            try {
                found = sea.extract(page);
            } catch (RuntimeException ex) {
                // Best effort: the spreadsheet algorithm sometimes throws; report it and
                // fall back. Only RuntimeException is caught so that JVM errors
                // (e.g. OutOfMemoryError) are not hidden.
                System.err.println("Warning: spreadsheet table extraction failed on page " + i
                        + ", falling back to basic extraction: " + ex);
            }

            // The fallback runs outside the try so that its own failure is neither
            // swallowed nor retried.
            if (found == null || found.isEmpty()) {
                found = bea.extract(page);
            }

            pagesTables.add(found == null ? new ArrayList<Table>() : new ArrayList<>(found));
        }
        return pagesTables;
    }

    /**
     * Builds the complete HTML document from the per-page text and tables.
     *
     * <p>Each page becomes a {@code <div class='page'>} containing a heading, one
     * {@code <p>} per non-blank text line, and then the page's tables. One page is
     * written per entry of {@code pageTexts}; a page with no corresponding entry in
     * {@code allPagesTables} is treated as having no tables.
     *
     * @param pageTexts      text of each page, in page order; may be {@code null}
     * @param allPagesTables tables of each page, in page order; may be {@code null}
     *                       or shorter than {@code pageTexts}
     * @return the HTML document as a string
     */
    static String buildHtml(List<String> pageTexts, List<List<Table>> allPagesTables) {
        StringBuilder sb = new StringBuilder();
        sb.append("<!doctype html>\n<html><head><meta charset='utf-8'>\n");
        sb.append("<style>\n");
        sb.append("body{font-family: Arial, Helvetica, sans-serif;}\n");
        sb.append("table.pdf-table{border-collapse:collapse;margin:10px 0;}\n");
        sb.append("table.pdf-table td, table.pdf-table th{border:1px solid #444;padding:6px;}\n");
        sb.append(".page{page-break-after:always;padding:12px;border-bottom:1px dashed #ccc;margin-bottom:20px}\n");
        sb.append("</style>\n</head><body>\n");

        int pages = pageTexts == null ? 0 : pageTexts.size();
        for (int p = 0; p < pages; p++) {
            sb.append("<div class='page'>\n");
            sb.append("<h4>Page ").append(p + 1).append("</h4>\n");

            // Page text: one paragraph per non-blank line.
            String text = pageTexts.get(p);
            if (text != null && !text.isEmpty()) {
                for (String line : text.split("\\r?\\n")) {
                    String escaped = escapeHtml(line.trim());
                    if (!escaped.isEmpty()) {
                        sb.append("<p>").append(escaped).append("</p>\n");
                    }
                }
            }

            // Tables of this page; guard the lookup so mismatched list sizes cannot throw.
            List<Table> tables = (allPagesTables != null && p < allPagesTables.size())
                    ? allPagesTables.get(p)
                    : null;
            if (tables != null && !tables.isEmpty()) {
                for (Table t : tables) {
                    sb.append("<div class='pdf-table-wrap'>\n");
                    sb.append(convertTableToHtml(t));
                    sb.append("</div>\n");
                }
            } else {
                sb.append("<!-- no table on this page -->\n");
            }

            sb.append("</div>\n");
        }

        sb.append("</body></html>");
        return sb.toString();
    }

    /**
     * Renders one tabula table as an HTML {@code <table class='pdf-table'>}.
     *
     * <p>Every cell is written as its own {@code <td>}; merged cells are not
     * reconstructed (that would require inspecting the cells' bounding boxes), so
     * rows may contain empty cells. Line breaks inside a cell's text are rendered
     * as {@code <br>} so that multi-line cells keep their line structure.
     *
     * @param t the table to render
     * @return the HTML markup of the table, terminated by a newline
     */
    static String convertTableToHtml(Table t) {
        StringBuilder sb = new StringBuilder();
        sb.append("<table class='pdf-table'>\n");
        List<List<RectangularTextContainer>> rows = t.getRows();
        for (List<RectangularTextContainer> row : rows) {
            if (row == null) {
                continue;
            }
            sb.append("<tr>");
            for (RectangularTextContainer cell : row) {
                // escapeHtml maps null text to "", so no further null check is needed.
                String txt = cell == null ? "" : escapeHtml(cell.getText());
                // Escape first, then insert markup, so the <br> tags are not escaped.
                txt = txt.replace("\r\n", "\n").replace('\r', '\n').replace("\n", "<br>");
                sb.append("<td>").append(txt).append("</td>");
            }
            sb.append("</tr>\n");
        }
        sb.append("</table>\n");
        return sb.toString();
    }

    /**
     * Escapes the five HTML-significant characters
     * ({@code & < > " '}) in a single pass.
     *
     * @param s the raw text; may be {@code null}
     * @return the escaped text, or an empty string when {@code s} is {@code null}
     */
    static String escapeHtml(String s) {
        if (s == null) return "";
        StringBuilder out = new StringBuilder(s.length() + 16);
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            switch (c) {
                case '&':
                    out.append("&amp;");
                    break;
                case '<':
                    out.append("&lt;");
                    break;
                case '>':
                    out.append("&gt;");
                    break;
                case '"':
                    out.append("&quot;");
                    break;
                case '\'':
                    out.append("&#39;");
                    break;
                default:
                    out.append(c);
            }
        }
        return out.toString();
    }

}