- package com.example.demo.Image;
-
- import net.sourceforge.tess4j.ITesseract;
- import net.sourceforge.tess4j.Tesseract;
- import net.sourceforge.tess4j.TesseractException;
- import org.apache.pdfbox.pdmodel.PDDocument;
-
- import org.apache.pdfbox.rendering.PDFRenderer;
- import org.apache.pdfbox.rendering.ImageType;
-
-
-
- import java.awt.image.BufferedImage;
- import java.io.File;
- import java.io.IOException;
-
- public class PdfImageToText {
- public static void main(String[] args) {
- try {
- // 载入PDF文件
- PDDocument document = PDDocument.load(new File("D:\\foucus\\img\\01.pdf"));
-
- // 创建Tesseract实例
- ITesseract tesseract = new Tesseract();
-
-
- // 设置Tesseract OCR数据文件的路径(根据您的安装位置)
- tesseract.setDatapath("D:\\foucus\\tessdata-main\\");
- // 指定要识别的语言(中文)
- tesseract.setLanguage("chi_sim");
-
-
- // 创建PDF渲染器
- PDFRenderer pdfRenderer = new PDFRenderer(document);
-
- for (int page = 0; page < document.getNumberOfPages(); ++page) {
- // 渲染PDF页为图像
- BufferedImage bufferedImage = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
-
- // 使用Tesseract进行OCR识别
- String text = tesseract.doOCR(bufferedImage);
-
- // 输出识别的文本
- System.out.println("Page " + (page + 1) + " 识别的文本: \n" + text);
- }
-
- // 关闭PDF文档
- document.close();
- } catch (IOException | TesseractException e) {
- e.printStackTrace();
- }
- }
- }
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>3.1.4</version>
        <relativePath/>
    </parent>
    <groupId>com.example</groupId>
    <artifactId>demo</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>demo</name>
    <description>demo</description>
    <properties>
        <java.version>17</java.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>net.sourceforge.tess4j</groupId>
            <artifactId>tess4j</artifactId>
            <version>5.8.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.29</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
        </plugins>
    </build>

</project>
注意:运行前需要先下载 Tesseract 的语言训练数据包(例如中文的 chi_sim),并放到代码中 setDatapath 指定的目录下。
要改写代码以实现递归处理指定目录下的PDF文件,把每个PDF文件的每一页(渲染为图像后)识别并保存为一个文本文件,并将每个PDF的所有文本文件保存在一个单独的目录中,需要进行一些修改。以下是修改后的Java代码示例:
- public class RecursivePdfImageToText {
- public static void main(String[] args) {
- String inputDirectory = "D:\\foucus\\input"; // 指定输入目录
- String outputBaseDirectory = "D:\\foucus\\output"; // 指定输出基础目录
-
- // 创建Tesseract实例
- ITesseract tesseract = new Tesseract();
-
- // 设置Tesseract OCR数据文件的路径(根据您的安装位置)
- tesseract.setDatapath("D:\\foucus\\tessdata-main\\");
- // 指定要识别的语言(中文)
- tesseract.setLanguage("chi_sim");
-
- processDirectory(new File(inputDirectory), new File(outputBaseDirectory), tesseract);
- }
-
- private static void processDirectory(File inputDir, File outputBaseDir, ITesseract tesseract) {
- if (inputDir.isDirectory()) {
- File[] files = inputDir.listFiles();
- if (files != null) {
- for (File file : files) {
- if (file.isDirectory()) {
- // 递归处理子目录
- processDirectory(file, outputBaseDir, tesseract);
- } else if (file.isFile() && file.getName().toLowerCase().endsWith(".pdf")) {
- // 处理PDF文件
- processPDF(file, outputBaseDir, tesseract);
- }
- }
- }
- }
- }
-
- private static void processPDF(File pdfFile, File outputBaseDir, ITesseract tesseract) {
- try {
- // 创建输出目录
- String pdfName = pdfFile.getName().replace(".pdf", "");
- File pdfOutputDir = new File(outputBaseDir, pdfName);
- pdfOutputDir.mkdirs();
-
- // 读取PDF文件
- PDDocument document = PDDocument.load(pdfFile);
- PDFRenderer pdfRenderer = new PDFRenderer(document);
-
- for (int page = 0; page < document.getNumberOfPages(); ++page) {
- // 渲染PDF页为图像
- BufferedImage bufferedImage = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
-
- // 使用Tesseract进行OCR识别
- String text = tesseract.doOCR(bufferedImage);
-
- // 输出识别的文本
- System.out.println("Page " + (page + 1) + " 识别的文本: \n" + text);
-
- // 指定要保存文本的文件路径
- String outputFilePath = new File(pdfOutputDir, "page_" + (page + 1) + ".txt").getAbsolutePath();
-
- // 将识别到的文本保存到文本文件
- try (FileWriter writer = new FileWriter(outputFilePath)) {
- writer.write(text);
- System.out.println("识别的文本已保存到: " + outputFilePath);
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
-
- // 关闭PDF文档
- document.close();
- } catch (IOException | TesseractException e) {
- e.printStackTrace();
- }
- }
- }
按目录汇总:对指定的每个根目录递归处理,把每个目录中所有PDF识别出的内容合并写入该目录下的一个txt文件(文件名为目录名)。
- package com.example.demo.Image;
-
- import net.sourceforge.tess4j.ITesseract;
- import net.sourceforge.tess4j.Tesseract;
- import net.sourceforge.tess4j.TesseractException;
- import org.apache.pdfbox.pdmodel.PDDocument;
- import org.apache.pdfbox.rendering.PDFRenderer;
- import org.apache.pdfbox.rendering.ImageType;
-
- import java.awt.image.BufferedImage;
- import java.io.File;
- import java.io.FileWriter;
- import java.io.IOException;
-
- public class ExtractPdfTextToSingleTxt {
- public static void main(String[] args) {
- // 指定要处理的根目录
- String rootDirectory1 = "D:\\foucus\\input\\f1"; // 请替换成您的目录路径 31
- processDirectory(new File(rootDirectory1));
- // 指定要处理的根目录
- String rootDirectory2 = "D:\\foucus\\input\\f2"; // 请替换成您的目录路径
- processDirectory(new File(rootDirectory2));
- // 指定要处理的根目录
- String rootDirectory3 = "D:\\foucus\\input\\f3"; // 请替换成您的目录路径
- processDirectory(new File(rootDirectory3));
- }
-
- private static void processDirectory(File directory) {
- File[] files = directory.listFiles();
- if (files != null) {
- for (File file : files) {
- if (file.isDirectory()) {
- // 如果是子目录,递归处理
- processDirectory(file);
- } else if (file.isFile() && file.getName().toLowerCase().endsWith(".pdf")) {
- // 如果是PDF文件,提取文字并保存到txt文件
- extractPdfTextToTxt(file, directory.getName());
- }
- }
- }
- }
-
- private static void extractPdfTextToTxt(File pdfFile, String directoryName) {
- try {
- // 载入PDF文件
- PDDocument document = PDDocument.load(pdfFile);
-
- // 创建Tesseract实例
- ITesseract tesseract = new Tesseract();
- // 设置Tesseract OCR数据文件的路径(根据您的安装位置)
- tesseract.setDatapath("D:\\foucus\\tessdata-main\\");
- // 指定要识别的语言(中文)
- tesseract.setLanguage("chi_sim");
-
- // 创建PDF渲染器
- PDFRenderer pdfRenderer = new PDFRenderer(document);
-
- StringBuilder textContent = new StringBuilder();
-
- for (int page = 0; page < document.getNumberOfPages(); ++page) {
- // 渲染PDF页为图像
- BufferedImage bufferedImage = pdfRenderer.renderImageWithDPI(page, 300, ImageType.GRAY);
-
- // 使用Tesseract进行OCR识别
- String text = tesseract.doOCR(bufferedImage);
-
- // 将识别的文本追加到内容中
- textContent.append("Page ").append(page + 1).append(" 识别的文本:\n").append(text).append("\n");
- }
-
- // 关闭PDF文档
- document.close();
-
- // 构建txt文件路径,以目录名称作为文件名
- String txtFileName = directoryName + ".txt";
- String txtFilePath = new File(pdfFile.getParent(), txtFileName).getAbsolutePath();
-
- // 将所有页面的文字内容保存到txt文件中
- try (FileWriter writer = new FileWriter(txtFilePath,true)) {
- writer.write(textContent.toString());
- System.out.println("PDF文本已保存到: " + txtFilePath);
- } catch (IOException e) {
- e.printStackTrace();
- }
-
- } catch (IOException | TesseractException e) {
- e.printStackTrace();
- }
- }
- }