Python 如何给正文内容加富文本标签

给正文加富文本标签，给图片加网址

import os
import shutil
import hashlib
import time
import pandas as pd
import requests
import re

# Name of the staging/merge sub-folder; also used as the path segment of the
# generated image URLs.
file_dir_ = "903"

# Root folder that is scanned for per-author sub-folders and workbooks.
file_dir = r"C:\Users\Administrator\Desktop\download\9.3"

# Last backslash-separated component of the root folder; used as the base
# name of the workbooks this script writes.
excel_name = file_dir.rsplit("\\", 1)[-1]


def file_name(file_dir_):
    """
    List the immediate files and sub-directories of a directory.

    Only the top level is inspected; nested folders are not descended into.

    :param file_dir_: directory to list
    :return: tuple ``(files, dirs)`` of name lists; two empty lists when the
        directory cannot be walked (for example, it does not exist)
    """
    # os.walk yields nothing for a missing directory; the default keeps
    # callers that index the result from failing on an implicit None.
    _, dirs, files = next(os.walk(file_dir_), (None, [], []))
    return files, dirs


def md5_content(content_):
    """
    Return the hexadecimal MD5 digest of a text.

    :param content_: text to hash; it is encoded as UTF-8 first
    :return: 32-character lowercase hex digest
    """
    return hashlib.md5(content_.encode(encoding='utf-8')).hexdigest()


def copy_path(source_path, target_path):
    """
    Replace ``target_path`` with a fresh recursive copy of ``source_path``.

    Any existing content of ``target_path`` is removed first.

    :param source_path: directory to copy from
    :param target_path: directory to (re)create as the copy
    :raises FileNotFoundError: if ``source_path`` does not exist; nothing is
        created or deleted in that case
    """
    # Fail before touching the target so a wrong source path neither leaves
    # an empty target behind nor wipes an existing one.
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"source path does not exist: {source_path}")
    # copytree requires that the destination does not exist yet.
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.copytree(source_path, target_path)


def rename_f(src_file, dst_file):
    """
    Rename ``src_file`` to ``dst_file`` by delegating to ``os.rename``.

    :param src_file: existing path
    :param dst_file: new path
    :return: None
    """
    return os.rename(src_file, dst_file)


def get_img_title_(name_):
    """
    Stage one sub-folder's files under hashed names and list their URLs.

    The sub-folder ``name_`` of ``file_dir`` is copied into its own
    ``file_dir_`` sub-folder; every top-level file of that copy is renamed to
    an MD5-based name (extension kept) and paired with the URL built from it.

    :param name_: name of the sub-folder under ``file_dir``
    :return: list of ``("<name_>|<original name without extension>", url)``
    """
    source_dir = file_dir + '\\' + name_
    file_dir_copy = source_dir + '\\' + file_dir_
    copy_path(source_path=source_dir, target_path=file_dir_copy)
    dd = []
    for t in file_name(file_dir_=file_dir_copy)[0]:
        # splitext cuts at the last dot only, so names with several dots or
        # without any extension are handled instead of being mis-split.
        _title, ext = os.path.splitext(t)
        # The timestamp makes the hashed name differ from run to run.
        id_ = md5_content(t + str(time.time()))
        src_file_ = file_dir_copy + "\\" + t
        # Build the new name directly; replacing the title inside the old
        # name would also rewrite matches in the extension.
        dst_file_ = file_dir_copy + "\\" + id_ + ext
        rename_f(src_file_, dst_file_)
        url = f"https://wwww.aliyuncs.com/{file_dir_}/{id_}{ext}"
        dd.append((name_ + "|" + _title, url))
    return dd


def get_all_title():
    """
    Build the image-key to URL table for every sub-folder of ``file_dir``.

    Each sub-folder is processed with ``get_img_title_`` and the collected
    pairs are written to ``<excel_name>.xlsx`` in the working directory.
    """
    d_l = []
    for n in file_name(file_dir_=file_dir)[1]:
        # The merged-output folder is not an article folder; the other
        # folder loops in this file skip it too (matters on a re-run).
        if n == file_dir_:
            continue
        d_l.extend(get_img_title_(name_=n))
    # Naming the columns here (rather than via to_excel's header aliases)
    # keeps the export working when no image was found at all.
    ddd = pd.DataFrame(d_l, columns=["图片标识", "图片url"])
    ddd.to_excel(f"{excel_name}.xlsx", index=False)


def createdir(path):
    """
    Make sure ``path`` exists as an empty directory.

    A missing directory is created; an existing one is deleted with all of
    its content and created again. A status line is printed in both cases.

    :param path: directory path to (re)create
    """
    if os.path.exists(path):
        print(path + ' 目录已存在')
        # Start from a clean folder: drop the old tree before recreating it.
        shutil.rmtree(path)
        os.makedirs(path)
        return
    os.makedirs(path)
    print(path + ' 目录创建成功')


def copy_file(filepath, new_path):
    """
    Copy every file below ``filepath`` directly into ``new_path``.

    Sub-folders are traversed recursively but not recreated: all files end
    up side by side in ``new_path``.

    :param filepath: folder to read from
    :param new_path: existing folder that receives the files
    """
    for entry in os.listdir(filepath):
        source = filepath + '/' + entry
        if not os.path.isfile(source):
            # Anything that is not a regular file is treated as a folder
            # and descended into.
            copy_file(source, new_path)
            continue
        # copyfile needs file paths on both sides, not directories.
        shutil.copyfile(source, new_path + '/' + entry)


def copy_file_all():
    """
    Merge the staged files of every sub-folder into one shared folder.

    The folder ``file_dir\\file_dir_`` is (re)created empty. For each other
    sub-folder of ``file_dir`` one of its sub-folders is flattened into the
    shared folder and then deleted.
    """
    # Create (or reset) the shared output folder.
    path_ = file_dir + "\\" + file_dir_
    createdir(path_)
    for file_d in file_name(file_dir_=file_dir)[1]:
        if file_d == file_dir_:
            continue
        parent = file_dir + "\\" + file_d
        sub_dirs = file_name(file_dir_=parent)[1]
        if file_dir_ in sub_dirs:
            # Prefer the staging folder by name: listing order is arbitrary
            # and the chosen folder is deleted below.
            file_name_two = file_dir_
        elif sub_dirs:
            file_name_two = sub_dirs[0]
        else:
            # Nothing to merge for this folder; previously an IndexError.
            print(parent + ' 没有可合并的子目录，已跳过')
            continue
        copy_file(parent + "\\" + file_name_two, path_)
        shutil.rmtree(parent + "\\" + file_name_two)


def get_content(content_, url_l, name_, title):
    """
    Convert plain article text to HTML paragraphs with inline images.

    Every line of ``content_`` becomes a ``<p>`` element. A line consisting
    only of ``<title>_<i>`` (i from 1 to 9) is an image placeholder and is
    replaced by an ``<img>`` tag; a credit paragraph is appended at the end.

    :param content_: article body, one paragraph per line
    :param url_l: mapping ``"<name_>|<title without '?'>_<i>"`` to image URL
    :param name_: first part of the image key
    :param title: article title as used in the placeholder lines
    :return: tuple ``(html, url of image 1)``
    :raises KeyError: if a placeholder's image, or image 1, is not in ``url_l``
    """
    # Image keys are built from the title with "?" removed.
    title_ = title.replace("?", "")
    cont_ = "".join(f"<p>{line}</p>\n" for line in content_.split("\n"))
    for i in range(1, 10):
        placeholder = f"<p>{title}_{i}</p>"
        # Look the URL up only for a real placeholder paragraph; a mere
        # mention of "<title>_<i>" inside a sentence must not require an image.
        if placeholder not in cont_:
            continue
        img_url = url_l[f"{name_}|{title_}_{i}"]
        cont_ = cont_.replace(placeholder, f'<p></p>\n<img src="{img_url}">\n<p></p>')

    cont_ += "<p>《图片来源于网络，如有问题请联系作者》</p>"
    # The first image is returned alongside the HTML.
    return cont_, url_l[f"{name_}|{title_}_1"]


def get_keyword(title):
    """
    Fetch the tag keywords for a title over HTTP.

    :param title: value sent as the ``title`` query parameter
    :return: the element at index 2 of the decoded JSON response
    :raises requests.RequestException: on timeout, connection failure or an
        HTTP error status
    """
    # NOTE(review): the URL has no scheme or host as written; presumably the
    # service address was stripped before publishing. Fill it in before use.
    # The timeout keeps one stalled request from hanging the whole export.
    res = requests.get(f"?title={title}", timeout=10)
    res.raise_for_status()
    return res.json()[2]


def get_excel_one(name_):
    """
    Build the export rows for the workbook named after ``name_``.

    Reads ``<name_>.xlsx`` from ``file_dir`` and the image table
    ``<excel_name>.xlsx`` from the working directory, then derives the rich
    text, first-image URL and keywords for every row.

    :param name_: workbook base name; also stored as the last field of each row
    :return: list of 9-tuples, one per workbook row
    """
    articles = pd.read_excel(f"{file_dir}\\{name_}.xlsx")  # per-author article sheet
    images = pd.read_excel(f"{excel_name}.xlsx")  # image key -> URL table
    img_d = dict(zip(images['图片标识'], images['图片url']))
    rows = []
    for row in articles.itertuples():
        query_id, title, body, headline = row.queryID, row.query, row.正文, row.标题
        rich_text, cover_url = get_content(content_=body, url_l=img_d, name_=name_, title=title)
        tags = get_keyword(title=title)
        # The fifth field is left empty here.
        rows.append((query_id, title, headline, body, "", cover_url, rich_text, tags, name_))
    return rows


def get_all_excel():
    """
    Export the rows of every sub-folder of ``file_dir`` into one workbook.

    The merged-output folder ``file_dir_`` is skipped. The result is written
    to ``<excel_name>_all.xlsx`` in the working directory.
    """
    rows = []
    for p in file_name(file_dir_=file_dir)[1]:
        if p == file_dir_:
            continue
        rows.extend(get_excel_one(name_=p))
    # Naming the columns here (rather than via to_excel's header aliases)
    # keeps the export working when there are no rows at all.
    ddd = pd.DataFrame(
        rows,
        columns=["queryID", "query", "标题", "正文", "图片比例", "封面图片", "富文本", "项目标签", "作者"],
    )
    ddd.to_excel(f"{excel_name}_all.xlsx", index=False)


def main():
    """
    Run the whole pipeline: build the image table, wait two seconds, merge
    the staged images into one folder, then export the combined workbook.
    """
    get_all_title()
    time.sleep(2)
    for step in (copy_file_all, get_all_excel):
        step()


if __name__ == "__main__":
    main()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184

给图片加网址

import os
import shutil
import hashlib
import time
import pandas as pd

# Suffix of the staging folder; also the path segment of the generated URLs.
file_dir_ = "726_6"

# Folder holding the original images.
file_dir = r"C:\Users\Administrator\Desktop\download\7.22苏苏8图"
# The suffix is appended without a path separator, so the staging folder sits
# next to the source folder rather than inside it.
file_dir_copy = file_dir + file_dir_


def file_name(file_dir_):
    """
    List the files directly inside a directory.

    Only the top level is inspected; nested folders are not descended into.

    :param file_dir_: directory to list
    :return: list of file names; empty when the directory cannot be walked
        (for example, it does not exist)
    """
    # os.walk yields nothing for a missing directory; the default keeps
    # callers that iterate the result from failing on an implicit None.
    _, _, files = next(os.walk(file_dir_), (None, [], []))
    return files


def md5_content(content_):
    """
    Compute the MD5 hex digest of a piece of text.

    :param content_: text to hash; encoded as UTF-8 before hashing
    :return: 32-character lowercase hex digest
    """
    digest = hashlib.md5(content_.encode(encoding='utf-8'))
    return digest.hexdigest()


def copy_path(source_path, target_path):
    """
    Replace ``target_path`` with a fresh recursive copy of ``source_path``.

    Any existing content of ``target_path`` is removed first.

    :param source_path: directory to copy from
    :param target_path: directory to (re)create as the copy
    :raises FileNotFoundError: if ``source_path`` does not exist; nothing is
        created or deleted in that case
    """
    # Fail before touching the target so a wrong source path neither leaves
    # an empty target behind nor wipes an existing one.
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"source path does not exist: {source_path}")
    # copytree requires that the destination does not exist yet.
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.copytree(source_path, target_path)


def rename_f(src_file, dst_file):
    """
    Rename ``src_file`` to ``dst_file`` by delegating to ``os.rename``.

    :param src_file: existing path
    :param dst_file: new path
    :return: None
    """
    return os.rename(src_file, dst_file)


# Work on a copy so the original images keep their names.
copy_path(source_path=file_dir, target_path=file_dir_copy)
all_p = file_name(file_dir_=file_dir_copy)

dd = []
for t in all_p:
    # splitext cuts at the last dot only, so names with several dots or
    # without any extension are handled instead of being mis-split.
    _title, ext = os.path.splitext(t)
    # The timestamp makes the hashed name differ from run to run.
    id_ = md5_content(t + str(time.time()))
    src_file_ = file_dir_copy + "\\" + t
    # Build the new name directly; replacing the title inside the old name
    # would also rewrite matches in the extension.
    dst_file_ = file_dir_copy + "\\" + id_ + ext
    rename_f(src_file_, dst_file_)
    # Use the file's real extension so the URL matches the renamed file
    # (a fixed ".jpg" was wrong for any other image type).
    url = f"https://wwww.aliyuncs.com/{file_dir_}/{id_}{ext}"
    dd.append((_title, url))

# Write the (original name, URL) pairs to a workbook named after the source
# folder, without a header row.
ddd = pd.DataFrame(dd)
name = file_dir.split("\\")[-1]
ddd.to_excel(f"{name}.xlsx", index=False, header=False)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

相关阅读:
python之排列组合1
机器学习中的数学原理——最小二乘法
 Go-命令行参数解析
 DNS域名解析服务
 解决win10因为WSL问题无法正常启动docker
HMI 出色的 UI 风格
 【网络知识必知必会】再谈Cookie和Session
如何kill一条TCP连接？
二叉排序树（BST）
【故障公告】cc攻击又来了，雪上加霜的三月
原文地址：https://blog.csdn.net/qq_45396577/article/details/125621373