C# 将PDF文档转换为Word文档

一.开发框架：

.NET 6.0（.NET Core）
工具：Visual Studio 2022

二.思路：

1.使用文件的SHA-256散列值（SHA256Hash）标识文档转换记录；若数据库中已经存在对应散列值，则直接返回已经转换过的文档
2.数据库没有对应散列值记录的话，则保存上传PDF文档，并进行文档转换，保留Word
3.转换成功，则在数据库记录对应文档的转换记录，用散列值做标识

三.C#后台包：

1.方法一：Spire.PDF转换包（免费的只能一次转换10页）
在这里插入图片描述
2.方法二：iTextSharp包（示例代码中使用的是 iText.Kernel.Pdf 命名空间，即 iText 7），没有10页转换限制

3.mssql数据库连接包

4.使用iTextSharp包提取PDF文本后，用于生成Word文档的文档格式包（示例代码中为 Spire.Doc）

四：C#代码案例：

1.PDF转Word方法：
a.方法一：Spire.PDF包，PDF转Word方法（旧版，有页数限制）：

	 /// <summary>
    /// Converts a PDF file to a Word (DOCX) file using Spire.PDF.
    /// </summary>
    /// <param name="pdfFilePath">Path of the PDF file to load.</param>
    /// <param name="wordFilePath">Path the DOCX output is saved to.</param>
    /// <remarks>
    /// Failures are not propagated: any exception is written to the console and
    /// the method returns normally, so callers cannot detect a failed conversion
    /// from the return alone.
    /// </remarks>
    public static void ConvertPdfToWord(string pdfFilePath, string wordFilePath)
    {
        try
        {
            Spire.Pdf.PdfDocument pdfDoc = new Spire.Pdf.PdfDocument();
            try
            {
                pdfDoc.LoadFromFile(pdfFilePath);
                pdfDoc.SaveToFile(wordFilePath, Spire.Pdf.FileFormat.DOCX);
            }
            finally
            {
                // Close even when loading or saving throws, so the document
                // is always released instead of only on the success path.
                pdfDoc.Close();
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine("Error converting PDF to Word: " + ex.Message);
        }
    }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

b.iTextSharp包，没有页数限制：

	 /// <summary>
    /// Converts a PDF file to a Word (DOCX) file by extracting the text of each
    /// page with iText and writing it into a Spire.Doc document.
    /// </summary>
    /// <param name="pdfFilePath">Path of the PDF file to read.</param>
    /// <param name="wordFilePath">Path the DOCX output is saved to.</param>
    /// <remarks>
    /// Only text is carried over: each PDF page becomes one section holding a
    /// single paragraph with that page's extracted text. Exceptions are not
    /// caught here and propagate to the caller.
    /// </remarks>
    public static void ConvertPdfToWordByText(string pdfFilePath, string wordFilePath)
    {
        using (iText.Kernel.Pdf.PdfReader reader = new iText.Kernel.Pdf.PdfReader(pdfFilePath))
        using (iText.Kernel.Pdf.PdfDocument pdfDoc = new iText.Kernel.Pdf.PdfDocument(reader))
        {
            Spire.Doc.Document doc = new Spire.Doc.Document();
            try
            {
                // Query the page count once instead of on every loop iteration.
                int pageCount = pdfDoc.GetNumberOfPages();

                // iText page numbers are 1-based.
                for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                {
                    // A fresh strategy per page so text from earlier pages is not accumulated.
                    var strategy = new iText.Kernel.Pdf.Canvas.Parser.Listener.LocationTextExtractionStrategy();
                    string textFromPage = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(
                        pdfDoc.GetPage(pageNumber), strategy);

                    var paragraph = doc.AddSection().AddParagraph();
                    paragraph.AppendText(textFromPage);
                }

                doc.SaveToFile(wordFilePath, Spire.Doc.FileFormat.Docx);
            }
            finally
            {
                // Release the Word document whether or not extraction/saving succeeded.
                doc.Close();
            }
        }
    }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30

2.获取文件散列值方法（两种）：

a.根据上传文件，获取散列值

	/// <summary>
    /// Computes the SHA-256 hash of an uploaded file's content.
    /// </summary>
    /// <param name="file">Uploaded file whose read stream is hashed.</param>
    /// <returns>
    /// The hash as an uppercase hexadecimal string without separators, or
    /// <c>null</c> when an exception occurred (the message is written to the console).
    /// </returns>
    public string CalculateSHA256Hash(IFormFile file)
    {
        try
        {
            using var hasher = SHA256.Create();
            using var content = file.OpenReadStream();

            byte[] digest = hasher.ComputeHash(content);
            return Convert.ToHexString(digest);
        }
        catch (Exception ex)
        {
            Console.WriteLine("Error calculating SHA256 hash: " + ex.Message);
            return null;
        }
    }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

b.根据文件路径，获取散列值

    /// <summary>
    /// Computes the SHA-256 hash of the file at the given path.
    /// </summary>
    /// <param name="filePath">Path of the file to hash.</param>
    /// <returns>
    /// The hash as an uppercase hexadecimal string without separators, or
    /// <c>null</c> when an exception occurred (for example the file cannot be
    /// opened); the message is written to the console.
    /// </returns>
    public string CalculateSHA256Hash(string filePath)
    {
        try
        {
            using (FileStream stream = System.IO.File.OpenRead(filePath))
            using (SHA256 sha = SHA256.Create()) // disposed together with the stream
            {
                byte[] hash = sha.ComputeHash(stream);
                return BitConverter.ToString(hash).Replace("-", String.Empty);
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine("Error calculating SHA256 hash: " + ex.Message);
            return null;
        }
    }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

3.上传PDF文件，转化为Word文件方法：

    /// <summary>
    /// Handles a PDF upload and returns the path of the converted Word file.
    /// When a conversion record with the same SHA-256 hash and file size already
    /// exists, the stored Word path is returned; otherwise the PDF is saved,
    /// converted, and a new conversion record is added.
    /// </summary>
    /// <returns>
    /// JSON with <c>code = 0</c> and the Word file path in <c>data</c> on success,
    /// <c>code = -1</c> when no file or a non-PDF file was uploaded, and
    /// <c>code = -2</c> when hashing, saving, converting or recording fails.
    /// </returns>
    public ActionResult UploadFile()
    {
        var files = HttpContext.Request.Form.Files;
        if (files == null || files.Count <= 0)
        {
            return Json(new { code = -1, msg = "请上传文件！" });
        }

        // Only the first uploaded file is processed.
        var file = files[0];
        if (file.ContentType != "application/pdf")
        {
            return Json(new { code = -1, msg = "不是PDF文件！" });
        }

        var SHA256Hash = CalculateSHA256Hash(file);
        if (string.IsNullOrEmpty(SHA256Hash))
        {
            // The hash helper returns null on failure; without a hash the
            // upload cannot be looked up or recorded.
            return Json(new {code = -2,msg = "出错了！"});
        }

        long fileSize = file.Length;
        var conversionBll = new FileConversionBll();
        if (conversionBll.ExistsSHA256Hash(SHA256Hash, fileSize))
        {
            var model = conversionBll.GetFileBySHA256HashAndSize(SHA256Hash, fileSize);
            return Json(new { code = 0, msg = "",data = model.WordFilePath });
        }

        // A timestamp alone can collide when two uploads arrive within the same
        // millisecond, so a GUID suffix keeps the generated names unique.
        string fileName = DateTime.Now.ToString("yyyyMMddHHmmssfff") + "_" + Guid.NewGuid().ToString("N");
        string pdfFilePath = System.IO.Path.Combine("PDF文档路径", fileName + ".pdf");
        string wordFilePath = System.IO.Path.Combine("Word文档路径", fileName + ".docx");

        try
        {
            using (var fileStream = new FileStream(pdfFilePath, FileMode.Create))
            {
                file.CopyTo(fileStream);
            }

            // ConvertPdfToWord (Spire.PDF) is the alternative converter; the
            // text-based converter is used here.
            ConvertPdfToWordByText(pdfFilePath, wordFilePath);
        }
        catch (Exception ex)
        {
            // Report save/conversion failures through the normal error response
            // instead of letting the exception escape the action.
            Console.WriteLine("Error saving or converting uploaded PDF: " + ex.Message);
            return Json(new {code = -2,msg = "出错了！"});
        }

        var res = conversionBll.AddFileConversion(
            new FileConversion()
            {
                PdfFilePath = pdfFilePath,
                WordFilePath = wordFilePath,
                PdfSHA256Hash = SHA256Hash,
                FileSize = fileSize
            }
        );

        if (res)
        {
            return Json(new { code = 0, msg = "",data = wordFilePath });
        }

        return Json(new {code = -2,msg = "出错了！"});
    }
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

五.效果图：

旧版Spire.PDF包，只转化了10页：在这里插入图片描述
新版iTextSharp包，全部转化完成（但是只提取了文本，丢失了原文档的排版格式）：

相关阅读:
MySQL（4）索引实践（2）
程序员兼职社区招募（内含技术指导）
Smart Community(1)之设计规范
 算法：只使用一个int类型变量表示日期
 JAVA Steam原理和常见操作
 人工神经网络理论及应用pdf,人工智能的相关书籍
 day11-Servlet01
canvas 基础和动图案例
 （附源码）springboot自习室座位预约系统毕业设计674156
搭建 3D 智慧农场可视化
原文地址：https://blog.csdn.net/wuyanEdwardElrid/article/details/134383379