java 读取pdf文件内容

一、引入maven

<dependency>
    <groupId>org.apache.pdfboxgroupId>
    <artifactId>pdfboxartifactId>
    <version>2.0.25version>
dependency>
1
2
3
4
5

二、代码工具类

package com.jiayou.peis.utils;

//import com.itextpdf.text.pdf.PdfReader;
//import com.itextpdf.text.pdf.parser.PdfTextExtractor;
//import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;

import com.google.common.collect.Lists;
import com.jiayou.peis.entity.ImageObject;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * PDF处理
 *
 * @author Bob Ren (Copyright © 2015-2029 贵州家有在线网络有限公司)
 * @version 1.0.0
 * @date 2022-02-07 16:21
 */
public class PdfUtils {
    //    /**
//     * 使用itextpdf提取PDF文本（解析不靠谱）
//     *
//     * @param inputStream
//     * @return
//     * @throws IOException
//     */
//    @Deprecated
//    public static String toText(InputStream inputStream) throws IOException {
//        try {
//            StringBuilder buf = new StringBuilder();
//            PdfReader reader = new PdfReader(inputStream);
//            int pageNum = reader.getNumberOfPages();
//            for(int i=1;i<=pageNum;i++){
//                // 读取第i页的文档内容
//                buf.append(PdfTextExtractor.getTextFromPage(reader, i, new SimpleTextExtractionStrategy()));
//            }
            return buf.toString();
//            return StrUtils.removeReturnChar(buf.toString());
//        } finally {
//            CloseUtils.closeQuietly(inputStream);
//        }
//    }
    public static String text(byte[] data) throws IOException {
        return PdfUtils.text(data, true);
    }
    public static String text(byte[] data, boolean sortByPosition) throws IOException {
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    /**
     * 使用pdfbox提取PDF文本（解析正常，可使用）
     *
     * @param file
     * @return
     * @throws IOException
     */
    public static String text(File file, boolean sortByPosition) throws IOException {
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.text(inputStream, sortByPosition);
    }
    public static String text(File file) throws IOException {
        return PdfUtils.text(file, true);
    }
    public static String text(InputStream inputStream) throws IOException {
        return text(inputStream, true);
    }
    /**
     * 使用pdfbox提取PDF文本（解析正常，可使用）
     *
     * @param inputStream
     * @return
     * @throws IOException
     */
    public static String text(InputStream inputStream, boolean sortByPosition) throws IOException {
        PDDocument document = null;
        try {
//            document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            PDFTextStripper textStripper = new PDFTextStripper();
            // Get total page count of the PDF document
            int numberOfPages = document.getNumberOfPages();
            //set the first page to be extracted
            textStripper.setStartPage(1);
            // set the last page to be extracted
            textStripper.setEndPage(numberOfPages);
            // 获取文本内容
            textStripper.setSortByPosition(sortByPosition);
            textStripper.setShouldSeparateByBeads(true);
            return StrUtils.removeReturnChar(textStripper.getText(document));
        } finally {
            CloseUtils.closeQuietly(document, inputStream);
        }
    }

    /**
     * 使用pdfbox提取PDF文本（解析正常，可使用）
     *
     * @param file
     * @return
     * @throws IOException
     */
    public static List<ImageObject> images(File file) throws IOException {
        InputStream inputStream = new FileInputStream(file);
        return PdfUtils.images(inputStream);
    }

    public static List<ImageObject> images(byte[] data) throws IOException {
        ByteArrayInputStream inputStream = null;
        try {
            inputStream = new ByteArrayInputStream(data);
            return PdfUtils.images(inputStream);
        } finally {
            CloseUtils.closeQuietly(inputStream);
        }
    }

    /**
     * 使用pdfbox提取PDF图片列表
     *
     * @param inputStream
     * @return
     * @throws IOException
     */
    public static List<ImageObject> images(InputStream inputStream) throws IOException {
        List<ImageObject> imageList = Lists.newArrayList();
        PDDocument document = null;
        try {
//            document = PDDocument.load(inputStream);
            document = Loader.loadPDF(inputStream);
            // get resources for a page
            PDResources pdResources = document.getPage(0).getResources();
            int i = 0;
            for (COSName csName : pdResources.getXObjectNames()) {
//                System.out.println(i+":"+csName);
                PDXObject pdxObject = pdResources.getXObject(csName);
                if (pdxObject instanceof PDImageXObject) {
//                    i++;
                    PDStream pdStream = pdxObject.getStream();
                    PDImageXObject image = new PDImageXObject(pdStream, pdResources);
                    String imageSuffix = imageSuffix(image);
                    // image storage location and image name
                    BufferedImage bufferedImage = image.getImage();
                    ImageObject object = new ImageObject();
                    object.setIndex(i++);
                    object.setImage(bufferedImage);
                    object.setSuffix(imageSuffix);
                    imageList.add(object);
                }
            }
        } finally {
            CloseUtils.closeQuietly(document, inputStream);
        }
        return imageList;
    }

    /**
     * 获取图片后缀
     *
     * @param pdImage
     * @return
     * @throws IOException
     */
    private static String imageSuffix(PDImageXObject pdImage) throws IOException {
        String suffix = pdImage.getSuffix();
        if (suffix == null || "jb2".equals(suffix)) {
            suffix = "png";
        } else if ("jpx".equals(suffix)) {
            // use jp2 suffix for file because jpx not known by windows
            suffix = "jp2";
        }

        if (hasMasks(pdImage)) {
            // TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG
            suffix = "png";
        }
        return suffix;
    }

    private static boolean hasMasks(PDImage pdImage) throws IOException {
        if (pdImage instanceof PDImageXObject) {
            PDImageXObject ximg = (PDImageXObject) pdImage;
            return ximg.getMask() != null || ximg.getSoftMask() != null;
        }
        return false;
    }

    /**
     * 保存图片到指定文件夹
     *
     * @param imageList
     * @param dir
     * @param prefixName
     * @throws IOException
     */
    public static void saveImage(List<ImageObject> imageList, String dir, String prefixName) throws IOException {
        File imgDir = new File(dir);
        FileUtils.forceMkdir(imgDir);
        for(ImageObject image:imageList){
            File imgFile = new File(dir, prefixName+"_"+image.getIndex()+"."+image.getSuffix());
            ImageIO.write(image.getImage(), image.getSuffix(), imgFile);
        }
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218

三、方式二

https://blog.csdn.net/ThinkPet/article/details/131256428

相关阅读:
23 种设计模式之单例模式
 vue 前端参值后端接收的几种方式
 国内有什么比较好用的量化交易接口？
MongoDB 中查询(find) 指南
 基于JAVA医院管理系统计算机毕业设计源码+系统+mysql数据库+lw文档+部署
 【高并发】ScheduledThreadPoolExecutor与Timer的区别和简单示例
 wflow模型sbm
linux 安装配置github
EV代码签名证书申请的具体步骤是什么？
（中）苹果有开源，但又怎样呢？
原文地址：https://blog.csdn.net/qq_30272167/article/details/134019256