最近做了一个富文本的需求:要求把 Word 文档的内容转换到富文本编辑器内,文档中的格式、样式、图片等都要保持一致展示。过程中踩了不少坑。据说 Word 文档(.docx)其实是一个压缩包,我不是特别清楚,但也能理解;下面的代码是自己参考各种资料拼凑出来的,大佬勿喷。
啥都不说了,看代码吧。其中图片的导出有两种方式:结果比较大的那种是用 JDK 8 自带的 Base64 实现的;同一张图片用两种方式转换后大小有差别,我实测相差 200K 左右,有要求的话可以换着用。所需 jar 在 pom 中的依赖如下:
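<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
    <version>2.0.2</version>
</dependency>
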
- import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
- import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
- import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
- import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
- import lombok.extern.slf4j.Slf4j;
- import org.apache.poi.hwpf.HWPFDocument;
- import org.apache.poi.hwpf.converter.WordToHtmlConverter;
- import org.apache.poi.hwpf.usermodel.PictureType;
- import org.apache.poi.xwpf.usermodel.XWPFDocument;
- import org.springframework.web.multipart.MultipartFile;
- import org.w3c.dom.Document;
- import sun.misc.BASE64Encoder;
-
- import javax.imageio.ImageIO;
- import javax.xml.parsers.DocumentBuilderFactory;
- import javax.xml.parsers.ParserConfigurationException;
- import javax.xml.transform.OutputKeys;
- import javax.xml.transform.Transformer;
- import javax.xml.transform.TransformerException;
- import javax.xml.transform.TransformerFactory;
- import javax.xml.transform.dom.DOMSource;
- import javax.xml.transform.stream.StreamResult;
- import java.awt.image.BufferedImage;
- import java.io.*;
- import java.util.Base64;
-
- /**
- * @author :Xiaoning Fan
- * @date :Created in 2023-10-16 下午 3:49
- * @description: 上传word文档并转换为html字符串返回,保持样式不变,图片替换为base64
- * @version: 1.0
- */
- @Slf4j
- public class WordToHtmlStringConverter {
-
- /**
- * wordToHtml
- *
- * @return
- * @throws IOException
- * @throws ParserConfigurationException
- * @throws TransformerException
- */
- public static String wordToHtml(MultipartFile file) {
- // 提取出word文档名称和后缀
- String filename = file.getOriginalFilename();
- try {
- if (filename.endsWith(".docx")) {
- // 将上传的文件传入Document转换
- return new WordToHtmlStringConverter().docxToHtmlText(file);
- } else if (filename.endsWith(".doc")) {
- return new WordToHtmlStringConverter().docToHtmlText(file);
- } else {
- log.error("不支持的文件格式!");
- return null;
- }
- } catch (FileNotFoundException e) {
- log.error("文件找不到异常!");
- e.printStackTrace();
- } catch (IOException e) {
- log.error("io转换异常!");
- e.printStackTrace();
- } catch (Exception e) {
- log.error("文件转换异常!");
- e.printStackTrace();
- }
-
- return null;
- }
-
- /**
- * 上传Word文档,返回解析后的Html
- */
- public static String docToHtmlText(MultipartFile file) throws Exception {
- //使用字符数组流获取解析的内容
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- OutputStream outStream = new BufferedOutputStream(baos);
- try {
- //将上传的文件传入Document转换
- HWPFDocument wordDocument = new HWPFDocument(file.getInputStream());
- Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
- WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
- //将读取到的图片上传并添加链接地址
- wordToHtmlConverter.setPicturesManager((imageStream, pictureType, name, width, height) -> {
- try {
- //首先要判断图片是否能识别
- if (pictureType.equals(PictureType.UNKNOWN)) {
- return "[不能识别的图片]";
- }
- //此处转换图片文件为Base64
- return Base64.getEncoder().encodeToString(imageStream).trim();
-
- } catch (Exception e) {
- log.info("upload exception", e);
- }
- return "[图片上传失败]";
- });
- // word文档转Html文档
- wordToHtmlConverter.processDocument(wordDocument);
- Document htmlDocument = wordToHtmlConverter.getDocument();
- DOMSource domSource = new DOMSource(htmlDocument);
- StreamResult streamResult = new StreamResult(outStream);
- TransformerFactory factory = TransformerFactory.newInstance();
- Transformer serializer = factory.newTransformer();
- serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
- serializer.setOutputProperty(OutputKeys.INDENT, "yes");
- serializer.setOutputProperty(OutputKeys.METHOD, "html");
- serializer.transform(domSource, streamResult);
- String content = baos.toString();
- log.info("docToHtmlText--->{}", content);
- return content;
- } catch (Exception e) {
- log.error("docToHtmlText 异常", e);
- } finally {
- baos.close();
- outStream.close();
- }
- return null;
- }
-
- /**
- * 上传docx文档,返回解析后的Html
- */
- public static String docxToHtmlText(MultipartFile file) throws Exception {
- ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
- ByteArrayOutputStream htmlImg = new ByteArrayOutputStream();
- String htmlStr = null;
- try {
- // 将上传的文件传入Document转换
- XWPFDocument docxDocument = new XWPFDocument(file.getInputStream());
- XHTMLOptions options = XHTMLOptions.create();
- // 设置图片存储路径
- String path = System.getProperty("java.io.tmpdir");
- String firstImagePathStr = path + "/" + System.currentTimeMillis();
- options.setExtractor(new FileImageExtractor(new File(firstImagePathStr)));
- options.URIResolver(new BasicURIResolver(firstImagePathStr));
- // 转换html
- docxDocument.createNumbering();
- XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
- htmlStr = htmlStream.toString();
-
- String middleImageDirStr = "/word/media";
- String imageDirStr = firstImagePathStr + middleImageDirStr;
- File imageDir = new File(imageDirStr);
- String[] imageList = imageDir.list();
- if (imageList != null) {
- for (int i = 0; i < imageList.length; i++) {
- try {
- String oneImagePathStr = imageDirStr + "/" + imageList[i];
- File fileImage = new File(oneImagePathStr);
- if (fileImage.exists()) {
- log.info("处理图片开始。。。。。。。。");
- // 处理图片成为Base64格式
- // 读取图片字节数组
- InputStream in = new FileInputStream(fileImage);
- byte[] data = new byte[in.available()];
- in.read(data);
- String encode = new BASE64Encoder().encode(data);
- log.info("处理图片结束。。。。。。。" + encode);
- //修改文档中的图片信息
- htmlStr = htmlStr.replace(oneImagePathStr, "data:image/png;base64,"+encode);
-
- /* BufferedImage bi = ImageIO.read(fileImage);// 图片存储大小比较大
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- ImageIO.write(bi, "png", baos);
- byte[] bytes = baos.toByteArray();
- String sd = Base64.getEncoder().encodeToString(bytes).trim();
- log.info("处理图片结束。。。。。。。" + sd);
- htmlStr = htmlStr.replace(oneImagePathStr, "data:image/png;base64,"+sd);*/
-
-
- }
- } catch (Exception e) {
- log.info("upload docxToHtmlText exception", e);
- }
- }
- }
- log.info("处理结果:{}", htmlStr);
- } catch (Exception e) {
- log.error("docxToHtmlText 解析异常", e);
- } finally {
- if (htmlStream != null) {
- htmlStream.close();
- }
- return htmlStr;
- }
- }
- }
直接引用就行。但有一点一定要注意:如果接口要把转换得到的 HTML 字符串直接返回给页面,Controller 方法上要加 @ResponseBody,不然就悲剧了(返回的字符串会被当作视图名去解析);当然,如果是直接存库,那就无所谓了。

这次就先这样,自娱自乐,手下留情勿喷!!