【pdf翻译中文】免费自用日语pdf转中文、韩语pdf转中文（也可以转英文）或者其他小语种法语、德语pdf转中文

不是很精确、有点慢，但是也够用，胜在免费free

效果图：

一些对比：
在这里插入图片描述

模型来自于：

https://huggingface.co/models

文件目录

在这里插入图片描述

调用模型的代码：

运行此段代码，执行翻译
在这里插入图片描述
一些简单的设置在这里控制
pdf2chines.py

import os

import cv2
import easyocr
from PIL import Image
from PIL import Image, ImageDraw, ImageFont

import rect_dealer
from img_text import ImgText

# Directory handed to easyocr as its model storage directory.
cut_model_path = r"F:\ocr\cut_model"
detect_model_jap_path = r"F:\ocr\meta_model\manga-ocr-base"  # manga text-detection model; the author found it worse than easyocr
trans_model_path = r"F:\ocr\meta_model\m2m100_1.2B"  # Meta's translation model
# Folder that receives one PNG per PDF page.
pdf2png_save_path = r"F:\ocr\pdf2png"
# Folder containing the input PDFs (the output PDF is written here too).
pdf_path = r"F:\ocr\pdfs"
# Paragraphs whose accumulated OCR confidence is below this are not translated.
pass_point = 0.05
blank_png_path = r"F:\ocr\blank.png"
# Upper and lower bounds for the font size used when drawing translated text.
DEFUALT_FONT_SIZE = 60
MIN_FONT_SIZE = 20

height_sub = 0.1  # after a box is detected, shrink its height slightly to find each text block
# Tolerances (as a fraction of the box height / width) used when matching an OCR box to a paragraph.
include_height_sub = 0.3
include_width_sub = 0.3
# Text file listing the page images that have already been processed.
finished_list = "finished_list.txt"


def generate_mask(png, graph_infos):
    """
    Build a black/white mask image marking where OCR found text.

    The whole page is painted black and every detected text box is painted
    white; the mask is then saved next to the page image.

    :param png: path of the page image (it supplies the mask size)
    :param graph_infos: OCR results; element 0 of each entry holds the box
        corners ordered top-left, top-right, bottom-right, bottom-left, each
        given as ``[x, y]``
    :return: path of the saved mask, ``"<png>_filled.png"``
    """
    mask_path = "{}_filled.png".format(png)
    # The context manager releases the file handle once the mask is written.
    with Image.open(png) as image:
        # Paint everything black first.
        image.paste(Image.new('RGBA', image.size, (0, 0, 0)), (0, 0))
        for info in graph_infos:
            corners = info[0]
            left = corners[0][0]
            top = corners[0][1]
            width = corners[1][0] - left
            height = corners[2][1] - corners[1][1]
            # Shrink the box vertically by the module-level ``height_sub`` fraction.
            # The trim moves the top edge down; the original code added it to
            # the x coordinate instead, which shifted the box sideways.
            trim = height * height_sub
            box_w = int(width)
            box_h = int(height - trim)
            if box_w <= 0 or box_h <= 0:
                # Degenerate box: nothing to paint.
                continue
            white_box = Image.new('RGBA', (box_w, box_h), (255, 255, 255))
            image.paste(white_box, (int(left), int(top + trim)))
        image.save(mask_path)
    return mask_path


def merge_neighbers(png, graph_infos):
    """
    Merge neighbouring OCR lines into paragraph rectangles.

    A mask of the detected boxes is generated, the rectangles found in that
    mask are fetched from ``rect_dealer``, and every OCR entry is then offered
    to ``detect_include`` so its text lands in a matching rectangle.

    :param png: path of the page image
    :param graph_infos: OCR results; element 0 is the box, the last two
        elements are the text and its confidence
    :return: the rectangles, with words / accuracy / line count filled in
    """
    mask_path = generate_mask(png, graph_infos)
    paragraph_rects = rect_dealer.getHoleRects(mask_path)
    for ocr_item in graph_infos:
        box, text, confidence = ocr_item[0], ocr_item[-2], ocr_item[-1]
        detect_include(paragraph_rects, box, text, confidence)
    return paragraph_rects


def detect_include(rects, pos_info, words, acc):
    """
    Attach one OCR line to the first rectangle that contains it.

    A rectangle matches when it is at least as large as the line and the
    line's top-left and bottom-right corners both fall inside it. Both checks
    allow a tolerance derived from ``include_width_sub`` / ``include_height_sub``,
    capped at 10 px horizontally and 20 px vertically. On a match the
    rectangle's ``words``, ``acc`` and ``line_num`` are updated in place.

    :param rects: candidate rectangles (objects with x, y, w, h, words, acc, line_num)
    :param pos_info: box corners ordered top-left, top-right, bottom-right,
        bottom-left, each ``[x, y]``
    :param words: recognised text of the line
    :param acc: OCR confidence of the line
    :return: None
    """
    left_up_point = pos_info[0]
    right_up_point = pos_info[1]
    right_down_point = pos_info[2]
    left, top = left_up_point[0], left_up_point[1]
    right, bottom = right_down_point[0], right_down_point[1]
    width = right_up_point[0] - left
    height = bottom - right_up_point[1]
    height_sub_num = min(height * include_height_sub, 20)
    width_sub_num = min(include_width_sub * width, 10)
    for rect in rects:
        if not (width - width_sub_num < rect.w and height - height_sub_num < rect.h):
            continue
        print("minus:{},{}".format(rect, pos_info))
        starts_inside = left > rect.x - width_sub_num and top > rect.y - height_sub_num
        # Also require the far corner to be inside. Testing only the top-left
        # corner would let a line match any large rectangle above/left of it.
        ends_inside = (right <= rect.x + rect.w + width_sub_num
                       and bottom <= rect.y + rect.h + height_sub_num)
        if starts_inside and ends_inside:
            rect.words += words
            # Running mean over the merged lines. Averaging against the initial
            # 0 would halve the confidence of the first line.
            rect.acc = (rect.acc * rect.line_num + float(acc)) / (rect.line_num + 1)
            rect.line_num += 1  # one more line merged into this rectangle
            return


# easyocr readers keyed by language tuple, so models are loaded once rather than once per page.
_OCR_READERS = {}


def change_graph2words(graph_path, languages):
    """
    Run OCR on an image and return the recognised text entries.

    The ``easyocr.Reader`` for a given language list is created on first use
    and reused afterwards, because constructing it loads the models.

    :param graph_path: path of the image to read
    :param languages: list of easyocr language codes, e.g. ``["ja", "en"]``
    :return: whatever ``reader.readtext`` returns for the image
    """
    cache_key = tuple(languages)
    reader = _OCR_READERS.get(cache_key)
    if reader is None:
        reader = easyocr.Reader(languages, model_storage_directory=cut_model_path, download_enabled=False, gpu=True)
        _OCR_READERS[cache_key] = reader
    return reader.readtext(graph_path)


# Translation pipelines keyed by model path, so the model is loaded once rather than once per paragraph.
_TRANSLATORS = {}


def words2chinese(words, from_lang, tgt_lang):
    """
    Translate a piece of text with the model at ``trans_model_path``.

    :param words: text to translate (a string, or an iterable of strings that is joined)
    :param from_lang: source language code passed to the pipeline as ``src_lang``
    :param tgt_lang: target language code passed to the pipeline as ``tgt_lang``
    :return: the pipeline output; the caller reads ``"translation_text"`` from each item
    """
    translator = _TRANSLATORS.get(trans_model_path)
    if translator is None:
        # Imported lazily, as in the original, so transformers is only needed when translating.
        from transformers import pipeline
        translator = pipeline("translation", model=trans_model_path)
        _TRANSLATORS[trans_model_path] = translator
    to_trans = "".join(words)
    output = translator(to_trans, src_lang=from_lang, tgt_lang=tgt_lang)
    print("翻译原文：{}\n翻译结果：{}".format(to_trans, output))
    return output


def pdf2png(pdf_name):
    """
    Render every page of a PDF to ``<pdf2png_save_path>/<page index>.png``.

    :param pdf_name: path of the PDF file
    :return: list of the written PNG paths, in page order
    """
    import fitz

    # Make sure the output folder exists before the first page is saved.
    os.makedirs(pdf2png_save_path, exist_ok=True)
    # Per-axis zoom factor. 1.0 renders at the default resolution; 2.0 on both
    # axes would give an image with four times as many pixels.
    zoom_x = 1.0
    zoom_y = 1.0
    rotate = 0
    # The transform is the same for every page, so build it once.
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
    png_paths = []
    # Open the PDF as a context manager so the document is closed when done.
    with fitz.open('{}'.format(pdf_name)) as doc:
        for pg in range(doc.page_count):
            pm = doc[pg].get_pixmap(matrix=trans, alpha=False)
            graph_path = os.path.join(pdf2png_save_path, '%s.png' % pg)
            pm.save(graph_path, output="png")
            png_paths.append(graph_path)
    return png_paths


def line_sep(sentense, line_num):
    """
    Split a sentence into at most ``line_num`` lines of near-equal length.

    The text is cut into consecutive chunks of ``ceil(len / line_num)``
    characters; each chunk is stripped and the chunks are joined with newlines.

    :param sentense: text to split
    :param line_num: desired number of lines
    :return: the text with newlines inserted; returned unchanged when it is
        empty or ``line_num`` is not positive
    """
    if not sentense or line_num <= 0:
        return sentense
    # Ceiling division, so the text never needs more than line_num chunks.
    chunk_len = -(-len(sentense) // line_num)
    chunks = [
        sentense[start:start + chunk_len].strip()
        for start in range(0, len(sentense), chunk_len)
    ]
    return "\n".join(chunks)


def draw_text(png, infos):
    """
    Paint the text of each rectangle onto the page image, in place.

    For every rectangle the area is first covered with white, then its words
    are drawn with a font size derived from the rectangle size and line count,
    clamped between ``MIN_FONT_SIZE`` and ``DEFUALT_FONT_SIZE``. The image is
    saved back to the same path.

    :param png: path of the page image (overwritten)
    :param infos: rectangles exposing x, y, w, h, words and line_num
    """
    page = Image.open(png)
    for block in infos:
        # Cover the original text before writing the replacement.
        eraser = Image.new('RGBA', (block.w, block.h), (255, 255, 255))
        page.paste(eraser, (block.x, block.y))
        fitted_size = int(min(block.w, block.h) / (1.5 * block.line_num))
        font_size = max(min(fitted_size, DEFUALT_FONT_SIZE), MIN_FONT_SIZE)
        writer = ImgText(block.words, font_size, block.w)
        writer.draw_text(page, block.x, block.y)
    page.save("{}".format(png))


def clear_png_files():
    """Placeholder for clearing the PNG folder; currently does nothing."""
    return None


def translate_a_pdf(pdf_path, detectlang: list, translate_from_lang: str, translate_to_lang: str):
    """
    Translate the text of every page of a PDF, rendering the result into page PNGs.

    Each page is rendered to PNG, OCR'd, grouped into paragraph rectangles,
    translated, and redrawn. Finished pages are appended to ``finished_list``
    so they are skipped on a later run.

    :param pdf_path: path of the PDF to translate
    :param detectlang: language codes handed to the OCR step
    :param translate_from_lang: source language code for the translator
    :param translate_to_lang: target language code for the translator
    """
    try:
        with open(finished_list, "r") as f:
            finished = {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        # First run: the list does not exist yet, so nothing is finished.
        finished = set()
    clear_png_files()  # meant to empty the png folder before converting this pdf
    # NOTE(review): pdf2png re-renders every page, including pages listed as
    # finished, so a skipped page's translated PNG has been overwritten by this
    # point. Confirm whether resuming is expected to work.
    png_paths = pdf2png(pdf_path)

    for png in png_paths:
        if png in finished:
            continue
        img_changes = []
        words_result = change_graph2words(png, detectlang)
        print("查找到的文本：{}".format(words_result))
        rects = merge_neighbers(png, words_result)  # merge lines into paragraphs
        for rect in rects:
            if float(rect.acc) < pass_point:
                print("认为这个词正确度{}极低,不进行翻译：{}".format(rect.acc, rect.words))
                continue
            transed_words = words2chinese(rect.words, translate_from_lang, translate_to_lang)
            rect.words = "".join(trans["translation_text"] for trans in transed_words)
            print("存储位置：{}".format(str(rect)))
            img_changes.append(rect)
        # Draw once per page. Redrawing after every paragraph repainted all
        # earlier paragraphs each time without changing the final image.
        draw_text(png, img_changes)
        print("输出图片：{}".format(png))
        with open(finished_list, "a+") as f:
            f.write(png + "\n")


from PIL import Image
import os


def _page_sort_key(path):
    """Order page images by numeric file stem so ``10.png`` comes after ``9.png``."""
    stem = os.path.splitext(os.path.basename(path))[0]
    if stem.isdecimal():
        return (0, int(stem), path)
    # Names that are not plain numbers go last, in alphabetical order.
    return (1, 0, path)


def combine_imgs_pdf(folder_path, pdf_file_path):
    """
    Combine the page images listed in ``finished_list`` into one PDF.

    The finished list is emptied after the PDF has been written.

    Args:
        folder_path (str): source folder (currently unused; the image paths
            come from ``finished_list``)
        pdf_file_path (str): output path of the PDF
    """
    with open(finished_list, "r") as f:
        png_list = [line.strip() for line in f if line.strip()]
    if not png_list:
        print("没有可合并的图片，未生成pdf：{}".format(pdf_file_path))
        return
    # A plain string sort would put "10.png" before "2.png".
    png_list.sort(key=_page_sort_key)
    opened = []
    try:
        pages = []
        for file in png_list:
            png_file = Image.open(file)
            opened.append(png_file)
            # Convert every page that is not already RGB (the first page too).
            if png_file.mode != "RGB":
                png_file = png_file.convert("RGB")
            pages.append(png_file)
        pages[0].save(pdf_file_path, "pdf", save_all=True, append_images=pages[1:])
    finally:
        for png_file in opened:
            png_file.close()
    with open(finished_list, "w") as f:
        f.write("")


if __name__ == '__main__':
    # OCR language codes, followed by the (unused) target-language list.
    from_lang = ["ja", "en"]
    to_lang = ["zh"]
    pdf_name = "ポーズの定理_ダイジェスト.pdf"
    source_pdf = os.path.join(pdf_path, pdf_name)
    output_pdf = os.path.join(pdf_path, "changed_" + pdf_name)
    # Translate the pages, then stitch the page images into a new PDF.
    translate_a_pdf(source_pdf, from_lang, "ja", "zh")
    combine_imgs_pdf(pdf2png_save_path, output_pdf)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228

在这里插入图片描述

处理一下一些段落，按照段落去识别
rect_dealer.py

import math

import cv2
from PIL import Image
from PIL import Image, ImageDraw, ImageFont


# 定义一个边界表示
class Rec:
    """A rectangle (x, y, w, h) plus the text accumulated inside it."""

    def __init__(self, x, y, w, h):
        self.x, self.y = x, y
        self.w, self.h = w, h
        self.words = ""    # text collected for this rectangle
        self.acc = 0       # accumulated OCR confidence
        self.line_num = 0  # number of OCR lines merged in

    def __repr__(self):
        return "x:{} y:{} w:{} h:{} words: {}".format(self.x, self.y, self.w, self.h, self.words)

    # str() and repr() produce the same text.
    __str__ = __repr__


def include_other_recs(rec_in: Rec, recs):
    """
    Tell whether ``rec_in`` encloses any other rectangle in ``recs``.

    Another rectangle counts as enclosed when ``rec_in`` covers it on the
    left, right and top edges and ``rec_in``'s bottom edge lies at least
    5 units below the other's bottom edge.

    :param rec_in: rectangle to test
    :param recs: rectangles to compare against (``rec_in`` itself is ignored)
    :return: True if some other rectangle is enclosed, otherwise False
    """
    def _encloses(other):
        return (
            rec_in != other
            and rec_in.x <= other.x
            and rec_in.x + rec_in.w >= other.x + other.w
            and rec_in.y <= other.y
            and rec_in.y + rec_in.h >= other.y + other.h + 5
        )

    return any(_encloses(other) for other in recs)


def hole_select(recs):
    """Return the rectangles in ``recs`` that do not enclose any other one, in order."""
    return [candidate for candidate in recs if not include_other_recs(candidate, recs)]


class detectWords(object):
    """Dilate an image with an elliptical kernel."""

    def __init__(self, src_img, width_max_scale=15, height_max_scale=15):
        """
        :param src_img: image array; ``shape`` must have 2 (grayscale) or 3 (BGR) dimensions
        :param width_max_scale: kernel width in pixels
        :param height_max_scale: kernel height in pixels
        """
        self.src_img = src_img
        self.width_scale = width_max_scale
        self.height_scale = height_max_scale

    def run(self):
        """
        Convert the image to grayscale if needed and dilate it once.

        :return: the dilated grayscale image
        :raises ValueError: if the image is neither 2- nor 3-dimensional
        """
        ndim = len(self.src_img.shape)
        if ndim == 2:  # already grayscale
            gray_img = self.src_img
        elif ndim == 3:
            gray_img = cv2.cvtColor(self.src_img, cv2.COLOR_BGR2GRAY)
        else:
            # Without this branch gray_img stays unbound and fails obscurely below.
            raise ValueError("expected a 2-D or 3-D image, got {} dimensions".format(ndim))

        # ksize is (width, height); the original passed the two values swapped.
        dilated_kernel = cv2.getStructuringElement(
            cv2.MORPH_ELLIPSE, (self.width_scale, self.height_scale))
        # One dilation pass. The original passed a positional ``10`` here, which
        # binds to ``dst`` rather than ``iterations``, so it never ran 10 passes.
        return cv2.dilate(gray_img, dilated_kernel)


# Find candidate regions: returns the bounding rectangles of the contours
# (the rectangles may overlap or contain one another).
def region_hole(image):
    """
    Collect bounding rectangles of the contours found in ``image``.

    A debug picture with the contours drawn is written to ``region_table.png``.
    Contours with an area below 150 are ignored as noise.

    :param image: single-channel image to search for contours
    :return: list of ``Rec`` objects, one per kept contour
    """
    contours, _hierarchy = cv2.findContours(image, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    debug_img = cv2.drawContours(image.copy(), contours, -1, (153, 153, 0), 2, maxLevel=2)
    cv2.imwrite("region_table.png", debug_img)

    found = []
    for outline in contours:
        if cv2.contourArea(outline) >= 150:
            # Simplify the outline, then take its bounding box.
            polygon = cv2.approxPolyDP(outline, 3, True)
            found.append(Rec(*cv2.boundingRect(polygon)))
    return found


def draw_rects(png, recs):
    """
    Save a black image the size of ``png`` with each rectangle painted white.

    :param png: path of the image that supplies the canvas; the result is
        written to ``"<png>_filled.png"``
    :param recs: rectangles exposing x, y, w and h
    """
    canvas = Image.open(png)
    # Black out the whole picture first.
    canvas.paste(Image.new('RGBA', canvas.size, (0, 0, 0)), (0, 0))
    for rec in recs:
        white_patch = Image.new('RGBA', (rec.w, rec.h), (255, 255, 255))
        canvas.paste(white_patch, (rec.x, rec.y))
    canvas.save("{}_filled.png".format(png))


def getHoleRects(png_path):
    """
    Find the rectangles in a mask image that do not enclose other rectangles.

    The image is dilated, contour bounding boxes are collected, boxes that
    enclose another box are dropped, and the result is drawn to
    ``region_table.png_filled.png`` for inspection.

    :param png_path: path of the mask image
    :return: list of ``Rec`` objects
    :raises FileNotFoundError: if the image cannot be read
    """
    origin_image = cv2.imread(png_path)
    if origin_image is None:
        # cv2.imread returns None instead of raising when the file is missing or unreadable.
        raise FileNotFoundError("cannot read image: {}".format(png_path))
    h_dilated_img = detectWords(origin_image).run()  # dilate slightly
    recs = region_hole(h_dilated_img)  # candidate regions
    results = hole_select(recs)  # keep those that enclose no other region
    draw_rects('region_table.png', results)
    return results

if __name__ == '__main__':
    # Manual check against one previously generated mask image.
    getHoleRects(r'F:\ocr\pdf2png\1.png_filled.png')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

在这里插入图片描述

img_text.py （这段代码抄的网上、实现了图片文本换行的效果）

from PIL import Image, ImageDraw, ImageFont


class ImgText:
    """Wrap text to a fixed pixel width and draw it onto a PIL image."""

    def __init__(self, text, font_size, width):
        """
        :param text: text to draw; ``\\n`` separates paragraphs
        :param font_size: font size passed to ``ImageFont.truetype``
        :param width: maximum line width in pixels
        """
        # Path typed cleanly: the original literal carried an invisible
        # character in front of "C:", so it was not a valid absolute path.
        self.font = ImageFont.truetype(r'C:\Windows\Fonts\simhei.ttf', font_size)
        # Preset width; set it to the width of the target area.
        self.width = width
        # Text to draw.
        self.text = text
        # Wrapped paragraphs, total text height, line height.
        self.duanluo, self.note_height, self.line_height = self.split_text()

    def _char_size(self, draw, char):
        """Return ``(width, height)`` of ``char`` in the current font."""
        if hasattr(draw, "textbbox"):
            # ImageDraw.textsize was removed in Pillow 10. The right/bottom
            # edges of the bounding box anchored at (0, 0) give the same extent.
            _left, _top, right, bottom = draw.textbbox((0, 0), char, font=self.font)
            return right, bottom
        # Older Pillow without textbbox.
        return draw.textsize(char, self.font)

    def get_duanluo(self, text):
        """
        Wrap one paragraph so that no line exceeds ``self.width``.

        :param text: a paragraph without newlines
        :return: ``(wrapped_text, line_height, line_count)``; the wrapped text
            always ends with a newline
        """
        txt = Image.new('RGBA', (100, 100), (255, 255, 255, 0))
        draw = ImageDraw.Draw(txt)
        pieces = []
        line_width = 0  # width of the line being built
        line_count = 1
        line_height = 0
        for char in text:
            width, height = self._char_size(draw, char)
            # Break before a character that would overflow, but never on an empty line.
            if line_width and line_width + width > self.width:
                line_count += 1
                line_width = 0
                pieces.append('\n')
            pieces.append(char)
            # Count the character that opens a new line as well; the original
            # reset the sum to 0 and lost that character's width.
            line_width += width
            line_height = max(height, line_height)
        duanluo = "".join(pieces)
        if not duanluo.endswith('\n'):
            duanluo += '\n'
        return duanluo, line_height, line_count

    def split_text(self):
        """
        Wrap every paragraph of ``self.text``.

        :return: ``(paragraphs, total_height, line_height)`` where
            ``paragraphs`` is a list of ``(wrapped_text, line_count)`` pairs
        """
        max_line_height, total_lines = 0, 0
        allText = []
        for text in self.text.split('\n'):
            duanluo, line_height, line_count = self.get_duanluo(text)
            max_line_height = max(line_height, max_line_height)
            total_lines += line_count
            allText.append((duanluo, line_count))
        line_height = max_line_height
        total_height = total_lines * line_height
        return allText, total_height, line_height

    def draw_text(self, note_img, x, y):
        """
        Draw the wrapped text in red onto ``note_img``, starting at ``(x, y)``.

        The image is also saved to ``result.png`` in the working directory.

        :param note_img: PIL image to draw on (modified in place)
        :param x: left coordinate of the text
        :param y: top coordinate of the first line
        """
        draw = ImageDraw.Draw(note_img)
        # Start at the top-left corner and move down one paragraph at a time.
        for duanluo, line_count in self.duanluo:
            draw.text((x, y), duanluo, fill=(255, 0, 0), font=self.font)
            y += self.line_height * line_count
        note_img.save("result.png")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

步骤：

1.先用easyocr识别文本，easyocr需要下载easyocr的模型，放在cut_model文件夹里
在这里插入图片描述
下载地址：https://www.jaided.ai/easyocr/modelhub/ （可能需要科学上网）

2.在这里可以控制easyocr识别的文本语言：
在这里插入图片描述
我这里输入ja、en，代表日语（japanese）和英语（english），所以会从图片中检测出日语和英语的文本

3.简单地处理一下块，把一个段落的文本，合并起来
在这里插入图片描述
4.输入到翻译模型中，这里可以是任何模型，我试过下面几个模型
绿色框住的是好用的，其他的由于各种原因，比如太慢、比如性能太差，被我残忍抛弃，
（ps：opus-mt-XX的模型是真的好用，又小又准确，但是它！没有ja-zh，所以……好气！）
在这里插入图片描述
例如：m2m100_418M，这个模型在：https://toscode.gitee.com/mirrors_UKPLab/EasyNMT 可以看到，

它的参数量和模型体积都比m2m100_1.2B小，我下载了试了试，真的不能用

在这里插入图片描述
这俩的翻译对比：左边m2m100_418M，右边m2m100_1.2B

性能差了很多，而且会出现奇怪的表现，速度也没有快多少。

模型排行榜：
（排行靠前的一大堆，没一个开源的，我只能说，感谢meta，小扎还是良心企业嗷）
在这里插入图片描述
网易有道词典小语种翻译实现思路
网易有道的小语种翻译真的很牛，微信在它面前被揍得像个弟弟，可惜模型都不公开，毕竟都是核心资源……

其他语种模型可以去下面的笑脸中心找，很牛的企业，可能需要科学上网，模型太大的话可以用迅雷下载器（或者用别的下载器），
在这里插入图片描述
下载器下载能快许多：

也可以用讯飞的api直接就翻译日语了
https://www.xfyun.cn/services/xftrans
在这里插入图片描述
给的200万字免费调用，够用一段时间了

m2m100_1.2B模型翻译日文还是有很多不尽如人意的地方，

例如：

1.速度很慢：慢的我有点受不了了
2.正确率还不够好（虽然也不太差了）：
在这里插入图片描述

—————————————————————————————
后来换了讯飞的接口试了下，也不怎么样（调用接口还很麻烦）
讯飞翻译：
在这里插入图片描述
唯一好使的只有有道图片翻译，感觉错误率明显低；而且提供了任意体验的服务，真的很好，如果不是想一键pdf2pdf，那么用有道去翻译一下也可以。

在这里插入图片描述

相关阅读:
css列表
 jpa Repository的常用写法总结
 学习nginx,这一篇就够了
 Spring支持人工智能应用框架-SpringAi
第十四届蓝桥杯大赛软件赛决赛 C/C++ 大学 B 组试题 C: 班级活动
 设计模式-10--多例模式（Multition pattern）
java线程池详解
 uniapp 声音提示、震动提示、语音播报插件 Ba-Beep
React.memo()、useCallback()和useMemo()的用法--性能优化--缓存
 Bob 的生存概率问题
原文地址：https://blog.csdn.net/qinglingLS/article/details/128063881