• Java PDF 转 Word:支持将图片转换到 Word(最大程度地还原原 PDF)


    依赖

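    <dependency>
        <groupId>org.apache.pdfbox</groupId>
        <artifactId>pdfbox</artifactId>
        <version>2.0.4</version>
    </dependency>
    <dependency>
        <groupId>net.coobird</groupId>
        <artifactId>thumbnailator</artifactId>
        <version>0.4.8</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>3.9</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>3.9</version>
    </dependency>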

    处理图片的工具-代码

    1. package com.example.pdf.Pdf2wordNew;
    2. import org.apache.poi.openxml4j.opc.OPCPackage;
    3. import org.apache.poi.xwpf.usermodel.XWPFDocument;
    4. import org.apache.poi.xwpf.usermodel.XWPFParagraph;
    5. import org.apache.xmlbeans.XmlException;
    6. import org.apache.xmlbeans.XmlToken;
    7. import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
    8. import org.openxmlformats.schemas.drawingml.x2006.main.CTPositiveSize2D;
    9. import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline;
    10. import java.io.InputStream;
    11. /**
    12. * @program: pdf
    13. * @author: xlk
    14. * @create: 2022-11-21 10:30
    15. */
    16. public class MyXWPFDocument extends XWPFDocument {
    17. public MyXWPFDocument(InputStream in) throws Exception {
    18. super(in);
    19. }
    20. public MyXWPFDocument() {
    21. super();
    22. }
    23. public MyXWPFDocument(OPCPackage pkg) throws Exception {
    24. super(pkg);
    25. }
    26. /**
    27. * 处理图片工具
    28. * @param id
    29. * @param width 宽
    30. * @param height 高
    31. * @param paragraph 段落
    32. */
    33. public void createPicture(int id, int width, int height, XWPFParagraph paragraph) {
    34. final int EMU = 9525;
    35. width *= EMU;
    36. height *= EMU;
    37. String blipId = getAllPictures().get(id).getPackageRelationship().getId();
    38. CTInline inline = paragraph.createRun().getCTR().addNewDrawing().addNewInline();
    39. String picXml = ""
    40. + ""
    41. + " "
    42. + " "
    43. + " " + "
    44. + id
    45. + "\" name=\"Generated\"/>"
    46. + " "
    47. + " "
    48. + " "
    49. + "
    50. + blipId
    51. + "\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\"/>"
    52. + " "
    53. + " "
    54. + " "
    55. + " "
    56. + " "
    57. + " "
    58. + " "
    59. + "
    60. + width
    61. + "\" cy=\""
    62. + height
    63. + "\"/>"
    64. + " "
    65. + " "
    66. + " "
    67. + " "
    68. + " "
    69. + " "
    70. + " " + "";
    71. inline.addNewGraphic().addNewGraphicData();
    72. XmlToken xmlToken = null;
    73. try {
    74. xmlToken = XmlToken.Factory.parse(picXml);
    75. } catch (XmlException xe) {
    76. xe.printStackTrace();
    77. }
    78. inline.set(xmlToken);
    79. inline.setDistT(0);
    80. inline.setDistB(0);
    81. inline.setDistL(0);
    82. inline.setDistR(0);
    83. CTPositiveSize2D extent = inline.addNewExtent();
    84. extent.setCx(width);
    85. extent.setCy(height);
    86. CTNonVisualDrawingProps docPr = inline.addNewDocPr();
    87. docPr.setId(id);
    88. docPr.setName("图片名称");
    89. docPr.setDescr("描述信息");
    90. }
    91. }

    开始转换

    1. package com.example.pdf.Pdf2wordNew;
    2. import net.coobird.thumbnailator.Thumbnails;
    3. import org.apache.pdfbox.cos.COSName;
    4. import org.apache.pdfbox.pdmodel.PDDocument;
    5. import org.apache.pdfbox.pdmodel.PDPage;
    6. import org.apache.pdfbox.pdmodel.PDResources;
    7. import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
    8. import org.apache.pdfbox.text.PDFTextStripper;
    9. import org.apache.poi.xwpf.usermodel.XWPFParagraph;
    10. import org.apache.poi.xwpf.usermodel.XWPFRun;
    11. import javax.imageio.ImageIO;
    12. import java.awt.image.BufferedImage;
    13. import java.io.*;
    14. import java.util.Arrays;
    15. import java.util.Iterator;
    16. import java.util.List;
    17. public class Pdf2wordNew {
    18. public static void main(String[] args) throws Exception {
    19. try {
    20. String pdfFileName = "C:\\Users\\11949\\Desktop\\新建文件夹 (2)\\面试题.pdf";
    21. PDDocument pdf = PDDocument.load(new File(pdfFileName));
    22. int pageNumber = pdf.getNumberOfPages();
    23. String docFileName = pdfFileName.substring(0, pdfFileName.lastIndexOf(".")) + ".doc";
    24. File file = new File(docFileName);
    25. if (!file.exists()) {
    26. file.createNewFile();
    27. }
    28. MyXWPFDocument document = new MyXWPFDocument();
    29. FileOutputStream fos = new FileOutputStream(docFileName);
    30. //提取每一页的图片和文字,添加到 word 中
    31. for (int i = 0; i < pageNumber; i++) {
    32. PDPage page = pdf.getPage(i);
    33. PDResources resources = page.getResources();
    34. Iterable names = resources.getXObjectNames();
    35. Iterator iterator = names.iterator();
    36. while (iterator.hasNext()) {
    37. COSName cosName = iterator.next();
    38. if (resources.isImageXObject(cosName)) {
    39. PDImageXObject imageXObject = (PDImageXObject) resources.getXObject(cosName);
    40. File outImgFile = new File("C:\\Users\\11949\\Desktop\\新建文件夹 (2)\\"
    41. + System.currentTimeMillis() + ".jpg");
    42. Thumbnails.of(imageXObject.getImage()).scale(1).rotate(0).toFile(outImgFile);
    43. BufferedImage bufferedImage = ImageIO.read(outImgFile);
    44. int width = bufferedImage.getWidth();
    45. int height = bufferedImage.getHeight();
    46. if (width > 600) {
    47. double ratio = Math.round((double) width / 550.0);
    48. System.out.println("缩放比ratio:" + ratio);
    49. width = (int) (width / ratio);
    50. height = (int) (height / ratio);
    51. }
    52. System.out.println("width: " + width + ", height: " + height);
    53. FileInputStream in = new FileInputStream(outImgFile);
    54. byte[] ba = new byte[in.available()];
    55. in.read(ba);
    56. ByteArrayInputStream byteInputStream = new ByteArrayInputStream(ba);
    57. XWPFParagraph picture = document.createParagraph();
    58. //添加图片
    59. document.addPictureData(byteInputStream, MyXWPFDocument.PICTURE_TYPE_JPEG);
    60. //图片大小、位置
    61. document.createPicture(document.getAllPictures().size() - 1, width, height, picture);
    62. }
    63. }
    64. PDFTextStripper stripper = new PDFTextStripper();
    65. stripper.setSortByPosition(true);
    66. stripper.setStartPage(i);
    67. stripper.setEndPage(i);
    68. //当前页中的文字
    69. String text = stripper.getText(pdf);
    70. System.out.println(" ========== " + text);
    71. XWPFParagraph textParagraph = document.createParagraph();
    72. XWPFRun textRun = textParagraph.createRun();
    73. // 处理换行问题
    74. if (text.contains("\r\n")) {
    75. String[] split = text.split("\r\n");
    76. List strsToList1 = Arrays.asList(split);
    77. for (String str : strsToList1) {
    78. System.out.println(str);
    79. textRun.setText(str);
    80. textRun.addCarriageReturn();
    81. }
    82. }
    83. // textRun.setText(text);
    84. textRun.setFontFamily("仿宋");
    85. textRun.setFontSize(10);
    86. //换行
    87. // 插入换行符
    88. textParagraph.setWordWrap(true);
    89. }
    90. document.write(fos);
    91. fos.close();
    92. pdf.close();
    93. System.out.println("pdf转换解析结束!!----");
    94. } catch (IOException e) {
    95. e.printStackTrace();
    96. }
    97. }
    98. }

  • 相关阅读:
    首都博物京韵展,监测系统实现文物科技保护
    基于VC++的包过滤防火墙系统设计与实现
    微信小程序支付及退款整体流程
    搭建Prometheus+Grafana框架监控Hyperledger Fabric的运行
    万字整理 | 深入理解编译系统
    Java高级进阶训练营 目录
    基于高阶微分器的无模型滑模控制器及其在自动电压调节器中的应用
    MySQL-触发器
    [补题记录] Atcoder Beginner Contest 300(E)
    Hadoop2.8 安装心得
  • 原文地址:https://blog.csdn.net/qq_36961226/article/details/127961676