How To Extract Text And Image From PDF In Java Applications

C# Curator
4y
18k
0
1

Article

Add Dependencies

Before we go to the coding section, we need to add Spire.PDF to our Java project. There are two ways to add the needed dependencies to our project.

For Maven projects, we need to add the following code to our project’s pom.xml file.

<!-- Repository that hosts the e-iceblue artifacts. HTTPS is required:
     Maven 3.8.1 and later block plain-HTTP repositories by default. -->
<repositories>
    <repository>
        <id>com.e-iceblue</id>
        <name>e-iceblue</name>
        <url>https://repo.e-iceblue.com/nexus/content/groups/public/</url>
    </repository>
</repositories>
<!-- Free edition of Spire.PDF for Java -->
<dependencies>
    <dependency>
        <groupId>e-iceblue</groupId>
        <artifactId>spire.pdf.free</artifactId>
        <version>3.9.0</version>
    </dependency>
</dependencies>

For non-Maven projects, download the Free Spire.PDF for Java package from this website and add the Spire.Pdf.jar file from its lib folder to our project as a dependency.

Example 1. Extract all text from the whole PDF

The Spire.PDF for Java library provides the page.extractText() method, which can be used to extract text from each page of a PDF document. In the following example, you will learn how to extract all text from a single PDF document by using this API.

import com.spire.pdf.*;
import com.spire.pdf.PdfPageBase;
import java.io.*;
/**
 * Example: extracts the text of every page of {@code Sample.pdf} and saves it
 * to {@code output/extractAllText.txt}.
 */
public class extractAllTexts {
    /**
     * Loads the PDF and writes the text of each page, in page order, to the result file.
     * The result file is recreated on every run.
     *
     * @param args unused
     * @throws Exception if the PDF cannot be loaded or the result file cannot be written
     */
    public static void main(String[] args) throws Exception {
        String input = "Sample.pdf";
        //Load the PDF file
        PdfDocument pdf = new PdfDocument();
        pdf.loadFromFile(input);
        //Create a new txt file to save the extracted text
        String result = "output/extractAllText.txt";
        File file = new File(result);
        //Make sure the output folder exists; createNewFile() throws if it is missing
        File folder = file.getParentFile();
        if (folder != null && !folder.isDirectory() && !folder.mkdirs()) {
            throw new IOException("Cannot create output folder: " + folder);
        }
        //Remove the file left by a previous run; the writer below appends,
        //so a stale file would otherwise keep its old content
        if (file.exists() && !file.delete()) {
            throw new IOException("Cannot replace existing file: " + file);
        }
        file.createNewFile();
        //try-with-resources closes both writers even if extraction fails
        try (FileWriter fw = new FileWriter(file, true);
             BufferedWriter bw = new BufferedWriter(fw)) {
            //Extract text from all the pages on the PDF
            for (int i = 0; i < pdf.getPages().getCount(); i++) {
                PdfPageBase page = pdf.getPages().get(i);
                String text = page.extractText(true);
                bw.write(text);
            }
            bw.flush();
        }
    }
}

Example 2. Extract text from specific area

Spire.PDF for Java enables developers to extract text from a specific area of a PDF page by using the page.extractText(new Rectangle2D.Float(80, 200, 500, 200)) method.

import com.spire.pdf.*;
import java.awt.geom.Rectangle2D;
import java.io.*;
/**
 * Example: extracts the text inside a fixed rectangle on the first page of
 * {@code Sample.pdf} and saves it to {@code output/extractText.txt}.
 */
public class extractTextFromSpecificArea {
    /**
     * Loads the PDF and writes the text found in the rectangle
     * {@code (80, 200, 500, 200)} of the first page to the result file.
     * The result file is recreated on every run.
     *
     * @param args unused
     * @throws Exception if the PDF cannot be loaded or the result file cannot be written
     */
    public static void main(String[] args) throws Exception {
        String input = "Sample.pdf";
        //Load the PDF file
        PdfDocument pdf = new PdfDocument();
        pdf.loadFromFile(input);
        //Create a new txt file to save the extracted text
        String result = "output/extractText.txt";
        File file = new File(result);
        //Make sure the output folder exists; createNewFile() throws if it is missing
        File folder = file.getParentFile();
        if (folder != null && !folder.isDirectory() && !folder.mkdirs()) {
            throw new IOException("Cannot create output folder: " + folder);
        }
        //Remove the file left by a previous run; the writer below appends,
        //so a stale file would otherwise keep its old content
        if (file.exists() && !file.delete()) {
            throw new IOException("Cannot replace existing file: " + file);
        }
        file.createNewFile();
        //try-with-resources closes both writers even if extraction fails
        try (FileWriter fw = new FileWriter(file, true);
             BufferedWriter bw = new BufferedWriter(fw)) {
            //Get the first page
            PdfPageBase page = pdf.getPages().get(0);
            //Extract text from a specific rectangle area within the page
            String text = page.extractText(new Rectangle2D.Float(80, 200, 500, 200));
            bw.write(text);
            bw.flush();
        }
    }
}

Example 3. Extract highlighted text from PDF

Some PDFs highlight certain text with a color. Spire.PDF offers the page.extractText(textMarkupAnnotation.getBounds()) method to extract the highlighted text from the PDF.

import com.spire.pdf.*;
import java.io.*;
import com.spire.pdf.annotations.*;
import com.spire.pdf.graphics.*;
/**
 * Example: extracts the text covered by text-markup annotations on the first
 * page of {@code Sample.pdf}, together with each annotation's color, and saves
 * the result to {@code output/extractText1.txt}.
 */
public class extractHighlightedText {
    /**
     * Loads the PDF, walks the annotations of the first page and, for every
     * text-markup annotation, writes the text within its bounds followed by a
     * {@code Color=r,g,b} line. The result file is recreated on every run.
     *
     * @param args unused
     * @throws Exception if the PDF cannot be loaded or the result file cannot be written
     */
    public static void main(String[] args) throws Exception {
        String input = "Sample.pdf";
        //Load the PDF file
        PdfDocument pdf = new PdfDocument();
        pdf.loadFromFile(input);
        //Create a new txt file to save the extracted text
        String result = "output/extractText1.txt";
        File file = new File(result);
        //Make sure the output folder exists; createNewFile() throws if it is missing
        File folder = file.getParentFile();
        if (folder != null && !folder.isDirectory() && !folder.mkdirs()) {
            throw new IOException("Cannot create output folder: " + folder);
        }
        //Remove the file left by a previous run; the writer below appends,
        //so a stale file would otherwise keep its old content
        if (file.exists() && !file.delete()) {
            throw new IOException("Cannot replace existing file: " + file);
        }
        file.createNewFile();
        //try-with-resources closes both writers even if extraction fails
        try (FileWriter fw = new FileWriter(file, true);
             BufferedWriter bw = new BufferedWriter(fw)) {
            bw.write("Extracted highlighted text:");
            PdfPageBase page = pdf.getPages().get(0);
            for (int i = 0; i < page.getAnnotationsWidget().getCount(); i++) {
                //Fetch the annotation once instead of once for the check and once for the cast
                Object annotation = page.getAnnotationsWidget().get(i);
                if (annotation instanceof PdfTextMarkupAnnotationWidget) {
                    PdfTextMarkupAnnotationWidget textMarkupAnnotation = (PdfTextMarkupAnnotationWidget) annotation;
                    bw.write(page.extractText(textMarkupAnnotation.getBounds()));
                    //Get the highlighted color
                    PdfRGBColor color = textMarkupAnnotation.getColor();
                    //Mask each component with 0XFF so it is printed in the 0-255 range
                    bw.write("Color=" + (color.getR() & 0XFF) + "," + (color.getG() & 0XFF) + "," + (color.getB() & 0XFF) + "\n");
                }
            }
            bw.flush();
        }
    }
}

Example 4. Extract images from PDF

Spire.PDF for Java offers a page.extractImages() method to extract images from the PDF file.

import com.spire.pdf.*;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
/**
 * Example: extracts every image found on every page of {@code Sample.pdf} and
 * saves each one as {@code output/Image_<n>.png}.
 */
public class extractImages {
    /**
     * Loads the PDF and writes each extracted image to its own PNG file.
     * Files are numbered consecutively across the whole document, starting at 0.
     *
     * @param args unused
     * @throws Exception if the PDF cannot be loaded or an image cannot be written
     */
    public static void main(String[] args) throws Exception {
        //Load the PDF File
        PdfDocument doc = new PdfDocument();
        doc.loadFromFile("Sample.pdf");
        //Make sure the output folder exists before any image is written
        File folder = new File("output");
        if (!folder.isDirectory() && !folder.mkdirs()) {
            throw new IOException("Cannot create output folder: " + folder);
        }
        //Running image counter; declared outside both loops so every image
        //gets a unique file name instead of overwriting Image_0.png
        int index = 0;
        //loop through the pages
        for (PdfPageBase page : (Iterable<PdfPageBase>) doc.getPages()) {
            //extract images from a particular page
            for (BufferedImage image : page.extractImages()) {
                //specify the file path and name
                File output = new File("output/" + String.format("Image_%d.png", index++));
                //save image as .png file
                ImageIO.write(image, "PNG", output);
            }
        }
    }
}