Java利用pdfbox,获取PDF文件指定内容

426 阅读1分钟

Java利用pdfbox,获取PDF文件指定内容

最近有一个小需求,就是从一个几百页的PDF文件内,获取到指定的内容,比如我想获取到该文件内所有的手机号码等等。本来用python来处理是比较好的方式,因为python有现成的 pdf 第三方库,简洁又方便。但因为本人只会Java(菜鸡一枚),所以只能用Java来获取PDF文件的指定内容。

思路是利用pdfbox第三方库,来获取指定内容。

Maven:

<!-- Minimal build for the PDFBoxExample class: the only library it needs is Apache PDFBox. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.apache.pdfbox</groupId>
  <artifactId>pdfbox-examples</artifactId>
  <version>2.12.1</version>
  <packaging>jar</packaging>

  <properties>
    <!-- Stay on the 2.0.x line: it still provides PDDocument.load(...), which the example calls. -->
    <pdfbox.version>2.0.30</pdfbox.version>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <!-- Core PDFBox library; its transitive dependencies are resolved by Maven. -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>${pdfbox.version}</version>
    </dependency>
  </dependencies>
</project>
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
// NOTE(review): the next four imports do not appear to exist in PDFBox 2.0.x
// (PDRectangle and PDStream live in org.apache.pdfbox.pdmodel.common) - confirm and
// remove them, otherwise the file will not compile.
import org.apache.pdfbox.pdmodel.graphics.PDGraphics2D;
import org.apache.pdfbox.pdmodel.graphics.shapes.PDRectangle;
import org.apache.pdfbox.pdmodel.io.PDStream;
import org.apache.pdfbox.pdmodel.pdf.PDFWriter;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts every piece of text matching a pattern from a PDF file and prints it.
 *
 * <p>Usage: {@code java PDFBoxExample [pdfPath] [regex]}.
 * <ul>
 *   <li>{@code pdfPath} - PDF to read; defaults to {@code example.pdf}.</li>
 *   <li>{@code regex} - what to look for; defaults to an 11-digit mobile number
 *       starting with 13-19.</li>
 * </ul>
 * Each distinct match is printed once, on its own line, in order of first appearance.
 */
public class PDFBoxExample {

    /** PDF that is read when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH = "example.pdf";

    /**
     * Default search pattern: 11 digits beginning with 13-19. The look-arounds keep the
     * pattern from matching inside a longer run of digits (e.g. an ID or order number).
     */
    private static final Pattern DEFAULT_PATTERN =
            Pattern.compile("(?<!\\d)1[3-9]\\d{9}(?!\\d)");

    /**
     * Program entry point.
     *
     * @param args optional {@code [pdfPath, regex]}; see the class documentation
     * @throws IOException if the PDF cannot be opened or its text cannot be extracted
     * @throws java.util.regex.PatternSyntaxException if the supplied regex is invalid
     */
    public static void main(String[] args) throws IOException {
        String pdfPath = args.length > 0 ? args[0] : DEFAULT_PDF_PATH;
        Pattern pattern = args.length > 1 ? Pattern.compile(args[1]) : DEFAULT_PATTERN;

        for (String match : findMatches(extractText(new File(pdfPath)), pattern)) {
            System.out.println(match);
        }
    }

    /**
     * Reads the text of all pages of a PDF.
     *
     * @param pdfFile the PDF to read
     * @return the extracted text
     * @throws IOException if the file cannot be loaded or parsed
     */
    private static String extractText(File pdfFile) throws IOException {
        // try-with-resources: the document holds file handles/buffers and must be closed
        // even when text extraction fails.
        try (PDDocument document = PDDocument.load(pdfFile)) {
            return new PDFTextStripper().getText(document);
        }
    }

    /**
     * Collects the distinct matches of {@code pattern} in {@code text}.
     *
     * @param text the text to search
     * @param pattern the pattern to look for
     * @return distinct matches in order of first appearance; empty if there are none
     */
    private static List<String> findMatches(String text, Pattern pattern) {
        // LinkedHashSet removes repeats while keeping document order.
        Set<String> distinct = new LinkedHashSet<>();
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            distinct.add(matcher.group());
        }
        return new ArrayList<>(distinct);
    }
}