Java利用pdfbox,获取PDF文件指定内容
最近有一个小需求,就是从一个几百页的PDF文件内,获取到指定的内容,比如我想获取到该文件内所有的手机号码等等。本来用Python来提取是更好的方式,因为Python有现成的PDF第三方库,简洁又方便。但因为本人只会Java(菜鸡一枚),所以只能用Java来获取PDF文件的指定内容。
思路是利用pdfbox第三方库,来获取指定内容。
Maven:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Coordinates of this example project (unchanged). -->
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-examples</artifactId>
    <version>2.12.1</version>
    <packaging>jar</packaging>

    <properties>
        <!-- PDFBox 2.x is published as 2.0.N; there is no 2.12.1 release. -->
        <pdfbox.version>2.0.27</pdfbox.version>
        <!-- The example uses try-with-resources and the diamond operator (Java 7+). -->
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!--
          The core PDFBox library is the only dependency needed to read text from a PDF.
          Its artifactId is "pdfbox" (not "pdfbox-main"); fontbox and commons-logging
          are pulled in transitively. "pdfbox-app" is the standalone command-line bundle
          and must not be combined with the library jar, because it repackages the same classes.
        -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>${pdfbox.version}</version>
        </dependency>
    </dependencies>
</project>
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
// NOTE(review): the next four imports do not appear to exist in PDFBox 2.0.x
// (PDRectangle and PDStream live in org.apache.pdfbox.pdmodel.common) - confirm and remove.
import org.apache.pdfbox.pdmodel.graphics.PDGraphics2D;
import org.apache.pdfbox.pdmodel.graphics.shapes.PDRectangle;
import org.apache.pdfbox.pdmodel.io.PDStream;
import org.apache.pdfbox.pdmodel.pdf.PDFWriter;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Extracts every mobile phone number that appears in the text of a PDF file.
 *
 * <p>The text of each page is pulled out with PDFBox's {@code PDFTextStripper} and scanned with a
 * regular expression. Each distinct number is printed once, in order of first appearance.
 *
 * <p>Written against the PDFBox 2.0.x API ({@code PDDocument.load(File)}); PDFBox 3.x replaced that
 * call with {@code Loader.loadPDF(File)}.
 */
public class PDFBoxExample {

    /** PDF that is read when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH = "example.pdf";

    /**
     * Mainland-China mobile number: 11 digits, a leading 1 followed by 3-9. The look-behind and
     * look-ahead keep the pattern from matching inside a longer run of digits (IDs, order numbers).
     * Compiled once because the same pattern is applied to every page.
     */
    private static final Pattern PHONE_PATTERN = Pattern.compile("(?<!\\d)1[3-9]\\d{9}(?!\\d)");

    /**
     * Prints every distinct phone number found in the PDF, one per line.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to scan, defaulting to
     *     {@code example.pdf} in the working directory
     * @throws IOException if the file cannot be read or is not a parsable PDF
     */
    public static void main(String[] args) throws IOException {
        String pdfPath = args.length > 0 ? args[0] : DEFAULT_PDF_PATH;
        for (String phone : extractPhoneNumbers(new File(pdfPath))) {
            System.out.println(phone);
        }
    }

    /** Returns the distinct phone numbers found in {@code pdfFile}, in order of first appearance. */
    private static Set<String> extractPhoneNumbers(File pdfFile) throws IOException {
        Set<String> phones = new LinkedHashSet<>();
        // try-with-resources: PDDocument holds file handles and scratch buffers until closed.
        try (PDDocument document = PDDocument.load(pdfFile)) {
            PDFTextStripper stripper = new PDFTextStripper();
            int pageCount = document.getNumberOfPages();
            // Extract one page at a time so a document with hundreds of pages never has its whole
            // text held in memory at once. Page numbers are 1-based and the end page is inclusive.
            // A number split across a page break would not be matched.
            for (int page = 1; page <= pageCount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                collectPhoneNumbers(stripper.getText(document), phones);
            }
        }
        return phones;
    }

    /** Adds every phone number matched in {@code text} to {@code sink}. */
    private static void collectPhoneNumbers(String text, Set<String> sink) {
        Matcher matcher = PHONE_PATTERN.matcher(text);
        while (matcher.find()) {
            sink.add(matcher.group());
        }
    }
}