import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

// tabula-java classes (the tabula jar must be on the classpath)
import technology.tabula.ObjectExtractor;
import technology.tabula.Page;
import technology.tabula.Rectangle;
import technology.tabula.RectangularTextContainer;
import technology.tabula.Table;
import technology.tabula.extractors.BasicExtractionAlgorithm;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
public class PdfToHtmlWithTables {
public static void main(String[] args) throws Exception {
if (args.length < 2) {
System.out.println("用法: java PdfToHtmlWithTables input.pdf out.html");
return;
}
File pdfFile = new File(args[0]);
File outHtml = new File(args[1]);
try (PDDocument doc = PDDocument.load(pdfFile)) {
// 1) 提取每页的纯文本(按行)
List<String> pageTexts = extractPageTexts(doc);
// 2) 用 tabula 提取每页的表格(如果有的话)
List<List<Table>> allPagesTables = extractTablesWithTabula(doc);
// 3) 把文本和表格合并(这里做一个简单的按顺序插入策略:先输出页面文本,再输出表格)
// 更复杂的做法:使用坐标比较(表格的 bounding box 与文本 y 坐标)决定插入位置——下面给出扩展提示
String html = buildHtml(pageTexts, allPagesTables);
// 4) 输出文件
try (Writer w = new OutputStreamWriter(new FileOutputStream(outHtml), StandardCharsets.UTF_8)) {
w.write(html);
}
System.out.println("已生成 HTML: " + outHtml.getAbsolutePath());
}
}
// 使用 PDFBox 提取每页文本(按页)
static List<String> extractPageTexts(PDDocument doc) throws IOException {
PDFTextStripper stripper = new PDFTextStripper();
List<String> res = new ArrayList<>();
int pages = doc.getNumberOfPages();
for (int p = 1; p <= pages; p++) {
stripper.setStartPage(p);
stripper.setEndPage(p);
String text = stripper.getText(doc);
res.add(text == null ? "" : text.trim());
}
return res;
}
// 使用 tabula-java 提取表格(每页一个 List<Table>)
static List<List<Table>> extractTablesWithTabula(PDDocument doc) {
List<List<Table>> pagesTables = new ArrayList<>();
ObjectExtractor oe = new ObjectExtractor(doc);
int numPages = doc.getNumberOfPages();
SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm();
for (int i = 1; i <= numPages; i++) {
Page page = oe.extract(i);
// 尝试 spreadsheet 模式(对规则表格效果好),再 fallback 到 basic 模式
List<Table> tables = new ArrayList<>();
try {
List<Table> t1 = sea.extract(page);
if (t1 != null && !t1.isEmpty()) tables.addAll(t1);
else {
List<Table> t2 = bea.extract(page);
if (t2 != null && !t2.isEmpty()) tables.addAll(t2);
}
} catch (Throwable ex) {
// 有时候 sea.extract 会抛异常,fallback
List<Table> t2 = bea.extract(page);
if (t2 != null && !t2.isEmpty()) tables.addAll(t2);
}
pagesTables.add(tables);
}
return pagesTables;
}
// 把表格转换为 HTML table(简单样式),并把页面文本一起拼接
static String buildHtml(List<String> pageTexts, List<List<Table>> allPagesTables) {
StringBuilder sb = new StringBuilder();
sb.append("<!doctype html>\n<html><head><meta charset='utf-8'>\n");
sb.append("<style>\n");
sb.append("body{font-family: Arial, Helvetica, sans-serif;}\n");
sb.append("table.pdf-table{border-collapse:collapse;margin:10px 0;}\n");
sb.append("table.pdf-table td, table.pdf-table th{border:1px solid #444;padding:6px;}\n");
sb.append(".page{page-break-after:always;padding:12px;border-bottom:1px dashed #ccc;margin-bottom:20px}\n");
sb.append("</style>\n</head><body>\n");
int pages = pageTexts.size();
for (int p = 0; p < pages; p++) {
sb.append("<div class='page'>\n");
sb.append("<h4>Page ").append(p + 1).append("</h4>\n");
// 页面文本(简单输出,保留换行)
String text = pageTexts.get(p);
if (text != null && !text.isEmpty()) {
// 把换行转换为 <p> 或 <br>
String[] lines = text.split("\\r?\\n");
for (String line : lines) {
String escaped = escapeHtml(line.trim());
if (!escaped.isEmpty()) sb.append("<p>").append(escaped).append("</p>\n");
}
}
// 把提取到的表格输出
List<Table> tables = allPagesTables.get(p);
if (tables != null && !tables.isEmpty()) {
for (int tIdx = 0; tIdx < tables.size(); tIdx++) {
Table t = tables.get(tIdx);
sb.append("<div class='pdf-table-wrap'>\n");
sb.append(convertTableToHtml(t));
sb.append("</div>\n");
}
} else {
sb.append("<!-- no table on this page -->\n");
}
sb.append("</div>\n");
}
sb.append("</body></html>");
return sb.toString();
}
// 将 tabula Table 转换成 HTML(简单处理 cell 合并:tabula 返回的行列可能包含空 cell)
static String convertTableToHtml(Table t) {
StringBuilder sb = new StringBuilder();
sb.append("<table class='pdf-table'>\n");
List<List<RectangularTextContainer>> rows = t.getRows();
for (List<RectangularTextContainer> row : rows) {
sb.append("<tr>");
for (RectangularTextContainer cell : row) {
String txt = cell == null ? "" : escapeHtml(cell.getText());
// NOTE: tabula 的 Table API 有时候会把空单元放置为空字符串——要判断合并单元格需要用 cell 的 bounding box, 复杂,下面只输出内容不 merge
sb.append("<td>").append(txt == null ? "" : txt).append("</td>");
}
sb.append("</tr>\n");
}
sb.append("</table>\n");
return sb.toString();
}
static String escapeHtml(String s) {
if (s == null) return "";
return s.replace("&", "&").replace("<", "<").replace(">", ">")
.replace("\"", """).replace("'", "'");
}
}