• doc与docx文档转html,格式样式不变(包含图片转换)


            最近做一个富文本的需求,要求把文档内容转换到富文本内,文档中的格式也好,样式也好,图片啥的都要一致展示;踩了不少坑,据说word文档其实是一个压缩包,我不是特别清楚但是也能理解,自己借鉴参考凑合看的,大佬勿喷

            啥都不说了,看代码吧。其中图片的导出有两种方式:生成结果比较大的那种是用 JDK 8 自带的 Base64 搞的;两种方式的大小有差别,同一张图片我实测大约差 200k 左右,有要求的话可以换着来用。需要在 pom 中引用的 jar 如下:

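    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi</artifactId>
        <version>4.1.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-scratchpad</artifactId>
        <version>4.1.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.poi</groupId>
        <artifactId>poi-ooxml</artifactId>
        <version>4.1.2</version>
    </dependency>
    <dependency>
        <groupId>fr.opensagres.xdocreport</groupId>
        <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
        <version>2.0.2</version>
    </dependency>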
    1. import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
    2. import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
    3. import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
    4. import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
    5. import lombok.extern.slf4j.Slf4j;
    6. import org.apache.poi.hwpf.HWPFDocument;
    7. import org.apache.poi.hwpf.converter.WordToHtmlConverter;
    8. import org.apache.poi.hwpf.usermodel.PictureType;
    9. import org.apache.poi.xwpf.usermodel.XWPFDocument;
    10. import org.springframework.web.multipart.MultipartFile;
    11. import org.w3c.dom.Document;
    12. import sun.misc.BASE64Encoder;
    13. import javax.imageio.ImageIO;
    14. import javax.xml.parsers.DocumentBuilderFactory;
    15. import javax.xml.parsers.ParserConfigurationException;
    16. import javax.xml.transform.OutputKeys;
    17. import javax.xml.transform.Transformer;
    18. import javax.xml.transform.TransformerException;
    19. import javax.xml.transform.TransformerFactory;
    20. import javax.xml.transform.dom.DOMSource;
    21. import javax.xml.transform.stream.StreamResult;
    22. import java.awt.image.BufferedImage;
    23. import java.io.*;
    24. import java.util.Base64;
    25. /**
    26. * @author :Xiaoning Fan
    27. * @date :Created in 2023-10-16 下午 3:49
    28. * @description: 上传word文档并转换为html字符串返回,保持样式不变,图片替换为base64
    29. * @version: 1.0
    30. */
    31. @Slf4j
    32. public class WordToHtmlStringConverter {
    33. /**
    34. * wordToHtml
    35. *
    36. * @return
    37. * @throws IOException
    38. * @throws ParserConfigurationException
    39. * @throws TransformerException
    40. */
    41. public static String wordToHtml(MultipartFile file) {
    42. // 提取出word文档名称和后缀
    43. String filename = file.getOriginalFilename();
    44. try {
    45. if (filename.endsWith(".docx")) {
    46. // 将上传的文件传入Document转换
    47. return new WordToHtmlStringConverter().docxToHtmlText(file);
    48. } else if (filename.endsWith(".doc")) {
    49. return new WordToHtmlStringConverter().docToHtmlText(file);
    50. } else {
    51. log.error("不支持的文件格式!");
    52. return null;
    53. }
    54. } catch (FileNotFoundException e) {
    55. log.error("文件找不到异常!");
    56. e.printStackTrace();
    57. } catch (IOException e) {
    58. log.error("io转换异常!");
    59. e.printStackTrace();
    60. } catch (Exception e) {
    61. log.error("文件转换异常!");
    62. e.printStackTrace();
    63. }
    64. return null;
    65. }
    66. /**
    67. * 上传Word文档,返回解析后的Html
    68. */
    69. public static String docToHtmlText(MultipartFile file) throws Exception {
    70. //使用字符数组流获取解析的内容
    71. ByteArrayOutputStream baos = new ByteArrayOutputStream();
    72. OutputStream outStream = new BufferedOutputStream(baos);
    73. try {
    74. //将上传的文件传入Document转换
    75. HWPFDocument wordDocument = new HWPFDocument(file.getInputStream());
    76. Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
    77. WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
    78. //将读取到的图片上传并添加链接地址
    79. wordToHtmlConverter.setPicturesManager((imageStream, pictureType, name, width, height) -> {
    80. try {
    81. //首先要判断图片是否能识别
    82. if (pictureType.equals(PictureType.UNKNOWN)) {
    83. return "[不能识别的图片]";
    84. }
    85. //此处转换图片文件为Base64
    86. return Base64.getEncoder().encodeToString(imageStream).trim();
    87. } catch (Exception e) {
    88. log.info("upload exception", e);
    89. }
    90. return "[图片上传失败]";
    91. });
    92. // word文档转Html文档
    93. wordToHtmlConverter.processDocument(wordDocument);
    94. Document htmlDocument = wordToHtmlConverter.getDocument();
    95. DOMSource domSource = new DOMSource(htmlDocument);
    96. StreamResult streamResult = new StreamResult(outStream);
    97. TransformerFactory factory = TransformerFactory.newInstance();
    98. Transformer serializer = factory.newTransformer();
    99. serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    100. serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    101. serializer.setOutputProperty(OutputKeys.METHOD, "html");
    102. serializer.transform(domSource, streamResult);
    103. String content = baos.toString();
    104. log.info("docToHtmlText--->{}", content);
    105. return content;
    106. } catch (Exception e) {
    107. log.error("docToHtmlText 异常", e);
    108. } finally {
    109. baos.close();
    110. outStream.close();
    111. }
    112. return null;
    113. }
    114. /**
    115. * 上传docx文档,返回解析后的Html
    116. */
    117. public static String docxToHtmlText(MultipartFile file) throws Exception {
    118. ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
    119. ByteArrayOutputStream htmlImg = new ByteArrayOutputStream();
    120. String htmlStr = null;
    121. try {
    122. // 将上传的文件传入Document转换
    123. XWPFDocument docxDocument = new XWPFDocument(file.getInputStream());
    124. XHTMLOptions options = XHTMLOptions.create();
    125. // 设置图片存储路径
    126. String path = System.getProperty("java.io.tmpdir");
    127. String firstImagePathStr = path + "/" + System.currentTimeMillis();
    128. options.setExtractor(new FileImageExtractor(new File(firstImagePathStr)));
    129. options.URIResolver(new BasicURIResolver(firstImagePathStr));
    130. // 转换html
    131. docxDocument.createNumbering();
    132. XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
    133. htmlStr = htmlStream.toString();
    134. String middleImageDirStr = "/word/media";
    135. String imageDirStr = firstImagePathStr + middleImageDirStr;
    136. File imageDir = new File(imageDirStr);
    137. String[] imageList = imageDir.list();
    138. if (imageList != null) {
    139. for (int i = 0; i < imageList.length; i++) {
    140. try {
    141. String oneImagePathStr = imageDirStr + "/" + imageList[i];
    142. File fileImage = new File(oneImagePathStr);
    143. if (fileImage.exists()) {
    144. log.info("处理图片开始。。。。。。。。");
    145. // 处理图片成为Base64格式
    146. // 读取图片字节数组
    147. InputStream in = new FileInputStream(fileImage);
    148. byte[] data = new byte[in.available()];
    149. in.read(data);
    150. String encode = new BASE64Encoder().encode(data);
    151. log.info("处理图片结束。。。。。。。" + encode);
    152. //修改文档中的图片信息
    153. htmlStr = htmlStr.replace(oneImagePathStr, "data:image/png;base64,"+encode);
    154. /* BufferedImage bi = ImageIO.read(fileImage);// 图片存储大小比较大
    155. ByteArrayOutputStream baos = new ByteArrayOutputStream();
    156. ImageIO.write(bi, "png", baos);
    157. byte[] bytes = baos.toByteArray();
    158. String sd = Base64.getEncoder().encodeToString(bytes).trim();
    159. log.info("处理图片结束。。。。。。。" + sd);
    160. htmlStr = htmlStr.replace(oneImagePathStr, "data:image/png;base64,"+sd);*/
    161. }
    162. } catch (Exception e) {
    163. log.info("upload docxToHtmlText exception", e);
    164. }
    165. }
    166. }
    167. log.info("处理结果:{}", htmlStr);
    168. } catch (Exception e) {
    169. log.error("docxToHtmlText 解析异常", e);
    170. } finally {
    171. if (htmlStream != null) {
    172. htmlStream.close();
    173. }
    174. return htmlStr;
    175. }
    176. }
    177. }

    直接引用就行。但有一点一定要注意:如果接口是把结果直接返回给页面,接口方法上要加 @ResponseBody,不然就悲剧了;当然,如果是直接存库,那就无所谓了。

    这次就先这样,自娱自乐,手下留情勿喷!!

  • 相关阅读:
    非零基础自学Java (老师:韩顺平) 第14章 集合 14.12 Map 接口和常用方法
    php 时区查看和设置
    RabbitMQ-管理界面介绍
    Flink SQL Window TopN 详解
    神经网络和图神经网络,神经网络背景图高清
    【通义千问】Qwen从本地加载分词器报错‘‘tokenizer class not exist‘‘
    spring ExpressionParser 四则运算表达式解析参数提取
    Kafka部署、原理和使用介绍
    数据库规范化理论
    软过过程与管理学习之(2)Process Control(流程控制)
  • 原文地址:https://blog.csdn.net/u011205527/article/details/133942632