依赖
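<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.4</version>
</dependency>
<dependency>
    <groupId>net.coobird</groupId>
    <artifactId>thumbnailator</artifactId>
    <version>0.4.8</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.9</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.9</version>
</dependency>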
处理图片的工具-代码
- package com.example.pdf.Pdf2wordNew;
-
- import org.apache.poi.openxml4j.opc.OPCPackage;
- import org.apache.poi.xwpf.usermodel.XWPFDocument;
- import org.apache.poi.xwpf.usermodel.XWPFParagraph;
- import org.apache.xmlbeans.XmlException;
- import org.apache.xmlbeans.XmlToken;
- import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
- import org.openxmlformats.schemas.drawingml.x2006.main.CTPositiveSize2D;
- import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline;
-
- import java.io.InputStream;
-
- /**
- * @program: pdf
- * @author: xlk
- * @create: 2022-11-21 10:30
- */
- public class MyXWPFDocument extends XWPFDocument {
- public MyXWPFDocument(InputStream in) throws Exception {
- super(in);
- }
-
- public MyXWPFDocument() {
- super();
- }
-
- public MyXWPFDocument(OPCPackage pkg) throws Exception {
- super(pkg);
- }
-
- /**
- * 处理图片工具
- * @param id
- * @param width 宽
- * @param height 高
- * @param paragraph 段落
- */
- public void createPicture(int id, int width, int height, XWPFParagraph paragraph) {
- final int EMU = 9525;
- width *= EMU;
- height *= EMU;
- String blipId = getAllPictures().get(id).getPackageRelationship().getId();
- CTInline inline = paragraph.createRun().getCTR().addNewDrawing().addNewInline();
- String picXml = ""
- + "
" - + "
" - + "
" - + "
" + " - + id
- + "\" name=\"Generated\"/>"
- + "
" - + " "
- + "
" - + "
- + blipId
- + "\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\"/>"
- + "
" - + "
" - + " "
- + " "
- + "
" - + "
" - + "
" - + "
- + width
- + "\" cy=\""
- + height
- + "\"/>"
- + " "
- + "
" - + "
" - + " "
- + " "
- + " "
- + " " + "";
-
- inline.addNewGraphic().addNewGraphicData();
- XmlToken xmlToken = null;
- try {
- xmlToken = XmlToken.Factory.parse(picXml);
- } catch (XmlException xe) {
- xe.printStackTrace();
- }
- inline.set(xmlToken);
-
- inline.setDistT(0);
- inline.setDistB(0);
- inline.setDistL(0);
- inline.setDistR(0);
-
- CTPositiveSize2D extent = inline.addNewExtent();
- extent.setCx(width);
- extent.setCy(height);
-
- CTNonVisualDrawingProps docPr = inline.addNewDocPr();
- docPr.setId(id);
- docPr.setName("图片名称");
- docPr.setDescr("描述信息");
- }
- }
开始转换
- package com.example.pdf.Pdf2wordNew;
-
- import net.coobird.thumbnailator.Thumbnails;
- import org.apache.pdfbox.cos.COSName;
- import org.apache.pdfbox.pdmodel.PDDocument;
- import org.apache.pdfbox.pdmodel.PDPage;
- import org.apache.pdfbox.pdmodel.PDResources;
- import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
- import org.apache.pdfbox.text.PDFTextStripper;
- import org.apache.poi.xwpf.usermodel.XWPFParagraph;
- import org.apache.poi.xwpf.usermodel.XWPFRun;
-
- import javax.imageio.ImageIO;
- import java.awt.image.BufferedImage;
- import java.io.*;
- import java.util.Arrays;
- import java.util.Iterator;
- import java.util.List;
-
- public class Pdf2wordNew {
-
- public static void main(String[] args) throws Exception {
-
- try {
-
- String pdfFileName = "C:\\Users\\11949\\Desktop\\新建文件夹 (2)\\面试题.pdf";
-
- PDDocument pdf = PDDocument.load(new File(pdfFileName));
- int pageNumber = pdf.getNumberOfPages();
-
- String docFileName = pdfFileName.substring(0, pdfFileName.lastIndexOf(".")) + ".doc";
-
- File file = new File(docFileName);
- if (!file.exists()) {
- file.createNewFile();
- }
- MyXWPFDocument document = new MyXWPFDocument();
- FileOutputStream fos = new FileOutputStream(docFileName);
-
- //提取每一页的图片和文字,添加到 word 中
- for (int i = 0; i < pageNumber; i++) {
- PDPage page = pdf.getPage(i);
- PDResources resources = page.getResources();
- Iterable
names = resources.getXObjectNames(); - Iterator
iterator = names.iterator(); - while (iterator.hasNext()) {
- COSName cosName = iterator.next();
- if (resources.isImageXObject(cosName)) {
- PDImageXObject imageXObject = (PDImageXObject) resources.getXObject(cosName);
- File outImgFile = new File("C:\\Users\\11949\\Desktop\\新建文件夹 (2)\\"
- + System.currentTimeMillis() + ".jpg");
- Thumbnails.of(imageXObject.getImage()).scale(1).rotate(0).toFile(outImgFile);
- BufferedImage bufferedImage = ImageIO.read(outImgFile);
- int width = bufferedImage.getWidth();
- int height = bufferedImage.getHeight();
- if (width > 600) {
- double ratio = Math.round((double) width / 550.0);
- System.out.println("缩放比ratio:" + ratio);
- width = (int) (width / ratio);
- height = (int) (height / ratio);
- }
- System.out.println("width: " + width + ", height: " + height);
- FileInputStream in = new FileInputStream(outImgFile);
- byte[] ba = new byte[in.available()];
- in.read(ba);
- ByteArrayInputStream byteInputStream = new ByteArrayInputStream(ba);
- XWPFParagraph picture = document.createParagraph();
- //添加图片
- document.addPictureData(byteInputStream, MyXWPFDocument.PICTURE_TYPE_JPEG);
- //图片大小、位置
- document.createPicture(document.getAllPictures().size() - 1, width, height, picture);
- }
- }
- PDFTextStripper stripper = new PDFTextStripper();
- stripper.setSortByPosition(true);
- stripper.setStartPage(i);
- stripper.setEndPage(i);
- //当前页中的文字
- String text = stripper.getText(pdf);
- System.out.println(" ========== " + text);
- XWPFParagraph textParagraph = document.createParagraph();
- XWPFRun textRun = textParagraph.createRun();
- // 处理换行问题
- if (text.contains("\r\n")) {
- String[] split = text.split("\r\n");
- List
strsToList1 = Arrays.asList(split); - for (String str : strsToList1) {
- System.out.println(str);
- textRun.setText(str);
- textRun.addCarriageReturn();
- }
- }
- // textRun.setText(text);
- textRun.setFontFamily("仿宋");
- textRun.setFontSize(10);
- //换行
- // 插入换行符
- textParagraph.setWordWrap(true);
- }
- document.write(fos);
- fos.close();
- pdf.close();
- System.out.println("pdf转换解析结束!!----");
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }