文档：mht格式转txt

꧂ 两个地方都保存꧁

import os
import codecs
from bs4 import BeautifulSoup

def generate_output_filename(file_path, save_path):
    """Return the path of the ``.txt`` file that corresponds to *file_path*.

    The directory part and the last extension of *file_path* are dropped,
    ``.txt`` is appended to what remains, and the result is joined onto
    *save_path*.
    """
    # Base name without its last extension.
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    # Target path inside the save directory.
    return os.path.join(save_path, stem + '.txt')

def get_content_from_mht(soup):
    """Return the text found under the ``<body>`` tag of *soup*.

    This is only a sample extraction strategy; adapt it to the structure of
    the MHT files being processed. An empty string is returned when
    ``soup.body`` is missing or falsy.
    """
    body_tag = soup.body
    if not body_tag:
        return ""
    return body_tag.get_text()

def _convert_single_mht(file_path, save_path_1, save_path_2):
    """Extract the text of one .mht file and write it to each save location."""
    # Parse the input completely before opening any output file, so that a
    # read or decode error does not leave freshly truncated, empty .txt files.
    with open(file_path, 'r', encoding='utf-8') as f_in:
        soup = BeautifulSoup(f_in, 'html.parser')
    content = get_content_from_mht(soup)

    # dict.fromkeys keeps the order and drops a repeated path, so the same
    # file is never opened for writing twice when both locations are equal.
    output_filenames = dict.fromkeys(
        generate_output_filename(file_path, save_path)
        for save_path in (save_path_1, save_path_2)
    )
    for output_filename in output_filenames:
        output_dir = os.path.dirname(output_filename)
        if output_dir:
            # Create the destination folder instead of failing when it is missing.
            os.makedirs(output_dir, exist_ok=True)
        with codecs.open(output_filename, 'w', 'utf-8') as f_out:
            f_out.write(content)


def convert_mht_to_txt(path, save_path_1, save_path_2):
    """Convert .mht files to UTF-8 text files in two save locations.

    Args:
        path: A single ``.mht`` file, or a directory that is walked
            recursively for ``.mht`` files. The extension is matched
            case-insensitively. Any other path is silently ignored.
        save_path_1: First directory that receives ``<name>.txt``.
        save_path_2: Second directory that receives ``<name>.txt``. It may be
            the same as *save_path_1*, in which case the file is written once.

    Raises:
        UnicodeDecodeError: If an input file is not valid UTF-8.
        OSError: If an input cannot be read or an output cannot be written.

    Output names are built from the input's base name only, so two inputs
    with the same name in different subdirectories write to the same file.

    NOTE(review): each file is handed to the HTML parser as-is. MHT archives
    are usually MIME containers whose parts may be quoted-printable or base64
    encoded; nothing here decodes that. Confirm against real inputs.
    """
    if os.path.isdir(path):
        for root, _dirs, files in os.walk(path):
            for file in files:
                if file.lower().endswith('.mht'):
                    _convert_single_mht(os.path.join(root, file), save_path_1, save_path_2)
    elif os.path.isfile(path) and path.lower().endswith('.mht'):
        _convert_single_mht(path, save_path_1, save_path_2)

# Example usage: ask for the source path and up to two destination folders.
path = input("请输入要处理的文件路径：")
save_path_1 = input("请输入第一个保存文件的位置（按回车默认保存）：")
save_path_2 = input("请输入第二个保存文件的位置（按回车默认不保存）：")

# An empty first answer falls back to the hard-coded default folder.
if not save_path_1:
    save_path_1 = '/storage/emulated/0/文件/代码print相关/结构图/字符图/'

# An empty second answer reuses the first folder for both destinations.
convert_mht_to_txt(path, save_path_1, save_path_2 if save_path_2 else save_path_1)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

注意：以上代码仅适用于处理 .mht 格式的文件。

꧂添加转义成文字 ꧁

如果您想要将文本中残留的 HTML 转义字符（HTML 实体，例如 &lt;、&amp;）转换为相应的文字形式，可以借助 Python 标准库中的 html 模块。以下是修改后的代码：

import os
import codecs
from bs4 import BeautifulSoup
import html

def generate_output_filename(file_path, save_path):
    """Build the ``.txt`` output path for *file_path* inside *save_path*.

    Only the base name of *file_path* is used; its last extension is
    replaced by ``.txt``.
    """
    base_name = os.path.basename(file_path)
    # Strip the last extension, keeping everything before it.
    name_without_ext = os.path.splitext(base_name)[0]
    txt_name = name_without_ext + '.txt'
    return os.path.join(save_path, txt_name)

def get_content_from_mht(soup):
    """Return the ``<body>`` text of *soup* with HTML entities unescaped.

    This is only a sample extraction strategy; adapt it to the structure of
    the MHT files being processed. The text from ``body.get_text()`` is
    passed through ``html.unescape`` so that any escape sequences still
    present in it are turned into plain characters. An empty string is
    returned when ``soup.body`` is missing or falsy.
    """
    body_tag = soup.body
    if not body_tag:
        return ""
    # Convert remaining escape sequences into their literal characters.
    return html.unescape(body_tag.get_text())

def _convert_single_mht(file_path, save_path_1, save_path_2):
    """Extract the text of one .mht file and write it to each save location."""
    # Parse the input completely before opening any output file, so that a
    # read or decode error does not leave freshly truncated, empty .txt files.
    with open(file_path, 'r', encoding='utf-8') as f_in:
        soup = BeautifulSoup(f_in, 'html.parser')
    content = get_content_from_mht(soup)

    # dict.fromkeys keeps the order and drops a repeated path, so the same
    # file is never opened for writing twice when both locations are equal.
    output_filenames = dict.fromkeys(
        generate_output_filename(file_path, save_path)
        for save_path in (save_path_1, save_path_2)
    )
    for output_filename in output_filenames:
        output_dir = os.path.dirname(output_filename)
        if output_dir:
            # Create the destination folder instead of failing when it is missing.
            os.makedirs(output_dir, exist_ok=True)
        with codecs.open(output_filename, 'w', 'utf-8') as f_out:
            f_out.write(content)


def convert_mht_to_txt(path, save_path_1, save_path_2):
    """Convert .mht files to UTF-8 text files in two save locations.

    Args:
        path: A single ``.mht`` file, or a directory that is walked
            recursively for ``.mht`` files. The extension is matched
            case-insensitively. Any other path is silently ignored.
        save_path_1: First directory that receives ``<name>.txt``.
        save_path_2: Second directory that receives ``<name>.txt``. It may be
            the same as *save_path_1*, in which case the file is written once.

    Raises:
        UnicodeDecodeError: If an input file is not valid UTF-8.
        OSError: If an input cannot be read or an output cannot be written.

    Output names are built from the input's base name only, so two inputs
    with the same name in different subdirectories write to the same file.

    NOTE(review): each file is handed to the HTML parser as-is. MHT archives
    are usually MIME containers whose parts may be quoted-printable or base64
    encoded; nothing here decodes that. Confirm against real inputs.
    """
    if os.path.isdir(path):
        for root, _dirs, files in os.walk(path):
            for file in files:
                if file.lower().endswith('.mht'):
                    _convert_single_mht(os.path.join(root, file), save_path_1, save_path_2)
    elif os.path.isfile(path) and path.lower().endswith('.mht'):
        _convert_single_mht(path, save_path_1, save_path_2)

# Example usage: prompt for the input path and the two save locations.
path = input("请输入要处理的文件路径：")
save_path_1 = input("请输入第一个保存文件的位置（按回车默认保存）：")
save_path_2 = input("请输入第二个保存文件的位置（按回车默认不保存）：")

# Pressing Enter at the first prompt selects the hard-coded default folder.
if not save_path_1:
    save_path_1 = '/storage/emulated/0/文件/代码print相关/结构图/字符图/'

# Pressing Enter at the second prompt sends both copies to the first folder.
second_target = save_path_2 if save_path_2 else save_path_1
convert_mht_to_txt(path, save_path_1, second_target)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63

在这个版本的代码中，使用 html.unescape() 函数将转义字符转换为相应的文字形式。这样在保存为文本文件时，转义字符就会以正常的文本形式显示。

相关阅读:
Linux免密登录
 Java小树的参天成长（构造方法重载）
Mysql配置参数
 excel 导出单元格换行
 【Python百日进阶-数据分析】Day124 - Plotly Figure参数:饼图(二)
CMSC5724-数据挖掘之线性分类问题与感知机
 怎样在PDF上直接编辑文字？这几种编辑方法需要掌握
 Linux基础入门到精通之Linux系统配置IP
【Flink入门修炼】2-1 Flink 四大基石
 LVS-DR集群部署
原文地址：https://blog.csdn.net/weixin_73675558/article/details/133870938