java判断word文件是否正常

最近在做word合并的时候，发现合并之后的word文档打不开。检查官方的word合并代码后没有发现问题，于是把word解析出来查看，发现有问题的word里面的图片无法解析，而正常word里面的图片可以解析。由此可以判断一个word文档是否异常：先将word解析，再看解析过程是否正常。下面提供具体的代码示例，内容仅供大家参考。
1.导入pom.xml依赖包（poi-scratchpad 用于解析 doc 文件；示例中解析 docx 所用的 XWPFDocument 位于 poi-ooxml，还需引入相同版本的 poi-ooxml 依赖）

 <!-- POI-word文件处理需要 -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.2</version>
        </dependency>

1
2
3
4
5
6
7

2.具体代码示例：

package com.common.utils;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.impl.util.Base64;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Example utility that parses Word documents in order to tell whether they are readable.
 *
 * <p>Every {@code getContent*} method returns a map with two entries:
 * <ul>
 *   <li>{@code "result"} - {@code "0"} when parsing succeeded, {@code "1"} when any exception
 *       was raised while opening or parsing the file;</li>
 *   <li>{@code "content"} - the text collected before parsing finished or failed.</li>
 * </ul>
 * The methods never propagate parsing exceptions; failures are logged and reported through
 * the {@code "result"} entry.
 *
 * @author mischen
 * @version 1.0
 */
public class WordAlansis {

    private static final Logger LOGGER = LoggerFactory.getLogger(WordAlansis.class);

    /** Value of the {@code "result"} entry when parsing succeeded. */
    private static final String RESULT_OK = "0";

    /** Value of the {@code "result"} entry when parsing raised an exception. */
    private static final String RESULT_ERROR = "1";

    /** Paragraphs starting with this prefix are appended verbatim instead of being trimmed. */
    private static final String INDENT = "    ";

    /**
     * Parses a .docx file and prints the resulting map.
     *
     * @param args optional; {@code args[0]} is the path of the .docx file to parse. When it is
     *             absent the original sample path is used.
     */
    public static void main(String[] args) {
        String path = (args != null && args.length > 0)
                ? args[0]
                : "C:\\Users\\Administrator\\Desktop\\公司资料\\面试\\1.docx";
        Map<String, String> parsed = getContentDocx(new File(path));
        System.out.println(parsed);
    }

    /**
     * Reads the body text of a .docx (OOXML) file and resolves every picture embedded in its
     * paragraph runs.
     *
     * <p>Each resolved picture is printed to standard output as a Base64 string. A picture
     * whose data cannot be resolved makes the whole document count as abnormal.
     *
     * @param file the .docx file to parse
     * @return a map with {@code "result"} ({@code "0"} on success, {@code "1"} on failure) and
     *         {@code "content"} (the text collected so far)
     */
    public static Map<String, String> getContentDocx(File file) {
        StringBuilder content = new StringBuilder();
        String result = RESULT_OK;
        // XWPFDocument only handles the 2007+ (.docx) format.
        try (InputStream is = new FileInputStream(file);
             XWPFDocument xwpf = new XWPFDocument(is)) {
            for (XWPFParagraph paragraph : xwpf.getParagraphs()) {
                appendParagraph(content, paragraph.getParagraphText());
                for (XWPFRun run : paragraph.getRuns()) {
                    for (XWPFPicture picture : run.getEmbeddedPictures()) {
                        XWPFPictureData pictureData = picture.getPictureData();
                        if (pictureData == null) {
                            // An unresolved picture is the symptom of a broken document;
                            // fail explicitly instead of relying on a later NullPointerException.
                            throw new IllegalStateException("embedded picture data cannot be resolved");
                        }
                        // Base64.encode returns bytes; decode them so the text itself is printed.
                        System.out.println(
                                new String(Base64.encode(pictureData.getData()), StandardCharsets.US_ASCII));
                    }
                }
            }
        } catch (Exception e) {
            LOGGER.error("docx解析正文异常:{}", e.toString(), e);
            result = RESULT_ERROR;
        }
        return buildResult(result, content);
    }

    /**
     * Reads the body text of a .doc (Word 97-2003) file through {@link WordExtractor}.
     *
     * @param path path of the .doc file; .docx files cannot be handled by this method
     * @return a map with {@code "result"} ({@code "0"} on success, {@code "1"} on failure) and
     *         {@code "content"} (the text collected so far)
     */
    public static Map<String, String> getContentDoc(String path) {
        StringBuilder content = new StringBuilder();
        String result = RESULT_OK;
        try (InputStream is = new FileInputStream(new File(path));
             WordExtractor extractor = new WordExtractor(is)) {
            // Paragraph indentation is not available from the extractor.
            appendParagraphs(content, extractor.getParagraphText());
        } catch (Exception e) {
            LOGGER.error("doc解析正文异常:{}", e.toString(), e);
            result = RESULT_ERROR;
        }
        return buildResult(result, content);
    }

    /**
     * Reads the body text of a WPS-produced document by loading it as an {@link HWPFDocument}
     * and extracting its paragraphs.
     *
     * @param path path of the file to parse
     * @return a map with {@code "result"} ({@code "0"} on success, {@code "1"} on failure) and
     *         {@code "content"} (the text collected so far)
     */
    public static Map<String, String> getContentWps(String path) {
        StringBuilder content = new StringBuilder();
        String result = RESULT_OK;
        try (InputStream is = new FileInputStream(new File(path));
             HWPFDocument hwpf = new HWPFDocument(is);
             WordExtractor wordExtractor = new WordExtractor(hwpf)) {
            appendParagraphs(content, wordExtractor.getParagraphText());
        } catch (Exception e) {
            LOGGER.error("wps解析正文异常:{}", e.toString(), e);
            result = RESULT_ERROR;
        }
        return buildResult(result, content);
    }

    /** Appends every paragraph of {@code paragraphs} to {@code content}; a null array is ignored. */
    private static void appendParagraphs(StringBuilder content, String[] paragraphs) {
        if (paragraphs == null) {
            return;
        }
        for (String paragraph : paragraphs) {
            appendParagraph(content, paragraph);
        }
    }

    /**
     * Appends one paragraph: text starting with four spaces is kept verbatim, anything else is
     * trimmed and terminated with CRLF.
     */
    private static void appendParagraph(StringBuilder content, String paragraph) {
        if (paragraph.startsWith(INDENT)) {
            // NOTE(review): no line break is added on this branch, so an indented paragraph
            // that carries no terminator of its own runs into the next one - confirm intended.
            content.append(paragraph);
        } else {
            content.append(paragraph.trim()).append("\r\n");
        }
    }

    /** Builds the two-entry result map shared by all parsing methods. */
    private static Map<String, String> buildResult(String result, StringBuilder content) {
        Map<String, String> map = new HashMap<>();
        map.put("result", result);
        map.put("content", content.toString());
        return map;
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189

相关阅读:
darknet 结构体汇总
 信息系统项目管理师Part16-物联网
 测试为什么分白盒、黑盒、单元、集成测试？
C语言中，可变参数函数调用的过程？！
springboot 项目起步讲解及自动装配原理
 本周Github有趣的项目、工具和库：Radius等
 编译原理—x86汇编指令
 【内存拷贝函数：memcpy与memmove】
多路彩灯控制器LED流水灯花型verilog仿真图视频、源代码
 ping命令网络抓包分析
原文地址：https://blog.csdn.net/miachen520/article/details/128043636