Add Dependencies
Before we go to the coding section, we need to add Spire.PDF to our Java project. There are two ways to add the needed dependencies to our project.
For Maven projects, we need to add the following code to our project’s pom.xml file.
- <repositories>
- <repository>
- <id>com.e-iceblue</id>
- <name>e-iceblue</name>
- <url>https://repo.e-iceblue.com/nexus/content/groups/public/</url>
- </repository>
- </repositories>
- <dependencies>
- <dependency>
- <groupId>e-iceblue</groupId>
- <artifactId>spire.pdf.free</artifactId>
- <version>3.9.0</version>
- </dependency>
- </dependencies>
For non-Maven projects, download the Free Spire.PDF for Java package from
this website and add the Spire.Pdf.jar file found in its lib folder to the project as a dependency.
Example 1. Extract all texts from the whole PDF
The Spire.PDF for Java library provides the page.extractText() method, which extracts the text of a single page of a PDF document. The following example loops over every page of a PDF document and uses this method to extract all of its text.
- import com.spire.pdf.*;
- import com.spire.pdf.PdfPageBase;
- import java.io.*;
- public class extractAllTexts {
- public static void main(String[] args) throws Exception {
- String input = "Sample.pdf";
-
- PdfDocument pdf = new PdfDocument();
- pdf.loadFromFile(input);
-
- String result = "output/extractAllText.txt";
- File file = new File(result);
- if (!file.exists()) {
- file.delete();
- }
- file.createNewFile();
- FileWriter fw = new FileWriter(file, true);
- BufferedWriter bw = new BufferedWriter(fw);
-
- PdfPageBase page;
- for (int i = 0; i < pdf.getPages().getCount(); i++) {
- page = pdf.getPages().get(i);
- String text = page.extractText(true);
- bw.write(text);
- }
- bw.flush();
- bw.close();
- fw.close();
- }
- }
Example 2. Extract text from specific area
Spire.PDF for Java also lets developers extract the text that lies inside a specific rectangular area of a PDF page by passing that rectangle to the method, for example page.extractText(new Rectangle2D.Float(80, 200, 500, 200)).
- import com.spire.pdf.*;
- import java.awt.geom.Rectangle2D;
- import java.io.*;
- public class extractTextFromSpecificArea {
- public static void main(String[] args) throws Exception {
- String input = "Sample.pdf";
-
- PdfDocument pdf = new PdfDocument();
- pdf.loadFromFile(input);
-
- String result = "output/extractText.txt";
- File file = new File(result);
- if (!file.exists()) {
- file.delete();
- }
- file.createNewFile();
- FileWriter fw = new FileWriter(file, true);
- BufferedWriter bw = new BufferedWriter(fw);
-
- PdfPageBase page = pdf.getPages().get(0);
-
- String text = page.extractText(new Rectangle2D.Float(80, 200, 500, 200));
- bw.write(text);
- bw.flush();
- bw.close();
- fw.close();
- }
- }
Example 3. Extract highlighted text from PDF
Some PDF documents mark passages of text with a colored highlight annotation. Passing the bounds of such an annotation to the method, as in page.extractText(textMarkupAnnotation.getBounds()), extracts the highlighted text from the PDF.
- import com.spire.pdf.*;
- import java.io.*;
- import com.spire.pdf.annotations.*;
- import com.spire.pdf.graphics.*;
- public class extractHighlightedText {
- public static void main(String[] args) throws Exception {
- String input = "Sample.pdf";
-
- PdfDocument pdf = new PdfDocument();
- pdf.loadFromFile(input);
-
- String result = "output/extractText1.txt";
- File file = new File(result);
- if (!file.exists()) {
- file.delete();
- }
- file.createNewFile();
- FileWriter fw = new FileWriter(file, true);
- BufferedWriter bw = new BufferedWriter(fw);
- bw.write("Extracted highlighted text:");
- PdfPageBase page = pdf.getPages().get(0);
- for (int i = 0; i < page.getAnnotationsWidget().getCount(); i++) {
- if (page.getAnnotationsWidget().get(i) instanceof PdfTextMarkupAnnotationWidget) {
- PdfTextMarkupAnnotationWidget textMarkupAnnotation = (PdfTextMarkupAnnotationWidget) page.getAnnotationsWidget().get(i);
- bw.write(page.extractText(textMarkupAnnotation.getBounds()));
-
- PdfRGBColor color = textMarkupAnnotation.getColor();
- bw.write("Color=" + (color.getR() & 0XFF) + "," + (color.getG() & 0XFF) + "," + (color.getB() & 0XFF) + "\n");
- }
- }
- bw.flush();
- bw.close();
- fw.close();
- }
- }
Example 4. Extract images from PDF
Spire.PDF for Java offers a page.extractImages() method to extract images from the PDF file.
- import com.spire.pdf.*;
- import javax.imageio.ImageIO;
- import java.awt.image.BufferedImage;
- import java.io.*;
- import java.util.ArrayList;
- public class extractImages {
- public static void main(String[] args) throws Exception {
-
- PdfDocument doc = new PdfDocument();
- doc.loadFromFile("Sample.pdf");
- StringBuilder buffer = new StringBuilder();
- ArrayList < BufferedImage > images = new ArrayList < BufferedImage > ();
-
- for (PdfPageBase page: (Iterable < PdfPageBase > ) doc.getPages()) {
-
- for (BufferedImage image: page.extractImages()) {
-
- int index = 0;
-
- File output = new File("output/" + String.format("Image_%d.png", index++));
-
- ImageIO.write(image, "PNG", output);
- }
- }
- }
- }