Python 网页爬虫：多线程批量下载视频

在网上找到了一个视频网站，可是手动一个一个地下载视频太麻烦了，怎么办？用某雷？已经 out 了。
网页爬虫多线程下载视频步骤：

引入 requests 访问网页内容，用正则表达式解析并提取 url
分析 html 页面，得到 mp4 地址
把 url 存入线程安全的 queue
多个线程从 queue 中取出 mp4 地址，同时下载

import requests
import re
import os
import queue
import threading
import shutil

def download_start():
    """Crawl a video site's listing pages and download every mp4 it exposes.

    The calling thread is the producer: starting at listing page 10 it
    extracts the play-page links, reads the ``data-code`` attributes from each
    play page, exchanges every code for an mp4 URL through the API endpoint
    and puts that URL on a bounded queue. Five worker threads consume the
    queue and stream each file into ``store_location``.

    The crawl stops after ``max_consecutive_misses`` listing pages in a row
    fail to load or yield no links. The workers are then told to exit and the
    function returns once all queued downloads have finished.
    """
    download_url_queue = queue.Queue(3)  # bounded, so the crawler cannot run far ahead of the workers
    mp4_code_set = set()                 # codes already handled during this run
    page = 10
    store_location = '/Users/Downloads/.dyxx/'    # directory the videos are stored in
    download_site_home = "https://xxxxxx.com/"   # site to download from; you have to find one yourself
    mp4_api_url = 'https://api.xxxxxx.com/get-mp4-url?code='  # a play page yields a code; the code is exchanged for the mp4 address
    worker_count = 5
    request_timeout = 30          # seconds; without it a stalled connection blocks forever
    max_consecutive_misses = 3    # failed/empty listing pages in a row before giving up

    # Compiled once instead of on every loop iteration.
    re_href = re.compile(r'href="/\d{4}/[^"]*')
    re_play_code = re.compile(r'data-code="[^"]*')

    os.makedirs(store_location, exist_ok=True)

    def target_path(mp4_url):
        # The file name is simply the last 15 characters of the URL.
        return store_location + mp4_url[-15:]

    def discard_partial(path):
        # Best-effort cleanup: the partial file may never have been created.
        try:
            os.remove(path)
        except OSError:
            pass

    def fetch_text(url):
        """Return the body of a successful GET, or None on any failure."""
        try:
            res = requests.get(url, timeout=request_timeout)
        except requests.RequestException as err:
            print(str(err))
            return None
        if res.status_code != 200:
            return None
        return res.text

    def download():
        """Worker loop: download queued URLs until a None sentinel arrives."""
        while True:
            # Blocking get() replaces polling empty() in a busy loop.
            mp4_url = download_url_queue.get()
            if mp4_url is None:
                return
            file_path = target_path(mp4_url)
            if os.path.exists(file_path):
                continue
            # Download into a per-thread temporary name and rename on success,
            # so an interrupted transfer never looks like a finished file.
            part_path = '{}.{}.part'.format(file_path, threading.get_ident())
            try:
                print('Download start::::' + mp4_url)
                # One streaming GET replaces the HEAD + GET pair; unlike
                # requests.head it follows redirects by default.
                with requests.get(mp4_url, stream=True, timeout=request_timeout) as res:
                    content_type = res.headers.get('Content-Type', '')
                    is_mp4 = content_type.split(';')[0].strip() == 'video/mp4'
                    if res.status_code != 200 or not is_mp4:
                        print('Download skipped::::' + mp4_url)
                        continue
                    with open(part_path, "wb") as f:
                        shutil.copyfileobj(res.raw, f)
                os.replace(part_path, file_path)
                print('Download end::::' + mp4_url)
            except Exception as ee:
                # Thread boundary: one failed download must not kill the
                # worker, otherwise the bounded queue would eventually block
                # the producer forever.
                print(str(ee))
                discard_partial(part_path)

    def enqueue_links(links):
        """Resolve every play-page link to mp4 URLs and queue the new ones."""
        for href_item in links:
            play_page = download_site_home + href_item.replace('href="/', '')
            play_page_text = fetch_text(play_page)
            if play_page_text is None:
                continue
            for code in set(re_play_code.findall(play_page_text)):
                param_code = code.replace('data-code="', '')
                # `continue`, not `break`: an already-known code must not
                # hide the remaining codes of the same page.
                if param_code in mp4_code_set:
                    continue
                mp4_code_set.add(param_code)
                mp4_url = fetch_text(mp4_api_url + param_code)
                if mp4_url is None:
                    continue
                mp4_url = mp4_url.strip()
                if not mp4_url or os.path.exists(target_path(mp4_url)):
                    continue
                print(mp4_url + '   ' + param_code)
                download_url_queue.put(mp4_url)

    workers = [threading.Thread(target=download) for _ in range(worker_count)]
    for worker in workers:
        worker.start()

    try:
        misses = 0
        while misses < max_consecutive_misses:
            page_text = fetch_text(download_site_home + '?page=' + str(page))
            # The first and last 15 matches are dropped, as in the original;
            # presumably they are navigation links rather than videos.
            links = set(re_href.findall(page_text)[15:-15]) if page_text else set()
            if links:
                misses = 0
                enqueue_links(links)
            else:
                misses += 1
            # Always advance, so a broken page is never retried forever.
            page = page + 1
    finally:
        # One sentinel per worker, then wait for the pending downloads.
        for _ in workers:
            download_url_queue.put(None)
        for worker in workers:
            worker.join()


# Start the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    download_start()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

相关阅读:
Jmeter扩展---自定义取样器
弃用 ifconfig 吧，你值得收藏的 IpRoute2 简明指南
量子计算（八）：观测量和计算基下的测量
循环队列解析
手把手教你从安装CentOS7.4镜像开始，搭建IoT视频监控系统
图像处理之图像的几何变换
Mysql主从复制数据架构全面解读
Java lambda 动态查询
RUST 和 GO 如何管理它们的内存
【逗老师的无线电】Debian Linux手工编译安装MMDVM

原文地址：https://blog.csdn.net/zhou8622/article/details/126805695