读取PDF表格数据,提取特定列的内容:使用 springboot + pdfbox 读取全部文字内容时,若表格单元格内含有回车符,回车符会被自动转成换行符,导致内容顺序错乱。解决办法:先用 pdfbox 加载PDF文档,再用 tabula 基于该文档创建 ObjectExtractor(对象提取器),通过它提供的分页迭代器逐页遍历并读取各个表格的内容;读取单元格文本时,关键是把回车符 '\r' 去掉。
需求:读取PDF表格数据,提取特定列的内容
问题:使用 springboot + pdfbox 读取全部文字内容时,若表格单元格内含有回车符,回车符会被自动转成换行符,导致提取出的内容顺序错乱
解决:先用 pdfbox 加载PDF文档,再用 tabula 基于该文档创建 ObjectExtractor(对象提取器);它提供分页迭代器,逐页遍历并读取每页中各个表格的内容即可。读取单元格文本时,关键是把回车符 '\r' 去掉
<!-- PDF parsing dependencies: pdfbox loads/parses the PDF document; tabula extracts tables from it -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.27</version>
</dependency>
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.5</version>
</dependency>
public static void readPdfTable() {
String filePath = "C:\Users\Administrator\Desktop\测试.pdf";
try (PDDocument document = PDDocument.load(new File(filePath))) {
ObjectExtractor extractor = new ObjectExtractor(document);
SpreadsheetExtractionAlgorithm algorithm = new SpreadsheetExtractionAlgorithm();
PageIterator extract = extractor.extract();
Page page = extract.next();
while (page != null) {
System.out.println("当前页数:" + page.getPageNumber());
for (Table table : algorithm.extract(page)) {
List<List<RectangularTextContainer>> rows = table.getRows();
for (List<RectangularTextContainer> row : rows) {
for (RectangularTextContainer cell : row) {
// 获取单元格完整内容(自动处理换行)
String text = cell.getText().replace("\r", "");
System.out.println("单元格内容: " + text);
}
}
}
page = extract.next();
}
} catch (Exception e) {
}
}复制