• Python crawler: multithreading and the producer-consumer pattern


    • Use a queue to implement the producer-consumer pattern
    • Create threads by subclassing Thread to speed up the crawler
    '''
    https://sc.chinaz.com/tupian/index.html
    https://sc.chinaz.com/tupian/index_2.html
    https://sc.chinaz.com/tupian/index_3.html
    '''
    
    from threading import Thread
    from queue import Queue, Empty
    import requests
    from bs4 import BeautifulSoup
    import os
    
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69',
    }
    class Put_Thread(Thread):
        '''Producer: fetch the listing pages and put (title, image URL) pairs into img_queue.'''
        def __init__(self, url_queue, img_queue):
            super().__init__()
            self.url_queue = url_queue
            self.img_queue = img_queue
    
        def run(self):
            while True:
                try:
                    # empty() followed by get() is racy when many threads share the queue,
                    # so take a URL without blocking and stop once none are left
                    url = self.url_queue.get_nowait()
                except Empty:
                    break
                self.fetch_url(url)
    
        def fetch_url(self, url):
            response = requests.get(url, headers=headers)
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'lxml')
            data_list = soup.find_all('img', class_='lazy')
            for i in data_list:
                title = i.get('alt')
                # data-original holds the lazy-loaded, protocol-relative thumbnail URL;
                # stripping the '_s' suffix points at the full-size image
                href = 'https:' + i.get('data-original').replace('_s', '')
                self.img_queue.put((title, href))
    
    class Get_Thread(Thread):
        '''Consumer: take (title, image URL) pairs off img_queue and save the images to ./image.'''
        def __init__(self, img_queue):
            super().__init__()
            self.img_queue = img_queue
    
        def run(self):
            while True:
                try:
                    # treat 3 seconds of silence on the queue as "the producers are done"
                    img_data = self.img_queue.get(timeout=3)
                except Empty:
                    break
                title, href = img_data
                # exist_ok avoids the race when several threads create the folder at once
                os.makedirs('./image', exist_ok=True)
                with open('./image/' + title + '.jpg', 'wb') as f:
                    resp = requests.get(href, headers=headers).content
                    f.write(resp)
                print(title, 'saved successfully!')
    
    def main():
        # queue of listing-page URLs (input for the producers)
        url_queue = Queue()
        # queue of (title, image URL) pairs (input for the consumers)
        img_queue = Queue()
    
        # the first page is index.html; later pages are index_2.html, index_3.html, ...
        url_queue.put('https://sc.chinaz.com/tupian/index.html')
        for i in range(2, 11):
            url = 'https://sc.chinaz.com/tupian/index_{}.html'.format(i)
            url_queue.put(url)
    
        # start 41 producer threads and 41 consumer threads (far more than ~10 URLs need; see the note below)
        for i in range(41):
            t1 = Put_Thread(url_queue, img_queue)
            t1.start()
            t2 = Get_Thread(img_queue)
            t2.start()
    
    if __name__ == '__main__':
        main()
        # main() returns as soon as the threads are started; the worker threads keep running
        print('\n************ main thread finished ************\n')
    
    • A Queue lets threads pass data to each other safely; it handles the locking internally
    • Subclassing Thread creates a worker thread whose run() method executes when start() is called
    • Spawning 41 producers plus 41 consumers for about 10 pages is heavy on the CPU; a leaner sentinel-based variant is sketched below
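    • A minimal sketch of the same producer-consumer pattern with a sentinel-based shutdown instead of the 3-second timeout, and a small fixed number of consumers. The page fetching and image saving are replaced by hypothetical stand-ins, and Thread(target=...) is used instead of subclassing, so treat it as a pattern sketch rather than a drop-in replacement:

    from threading import Thread
    from queue import Queue

    SENTINEL = object()   # unique marker that tells the workers to stop
    NUM_CONSUMERS = 4     # a handful of threads is plenty for network-bound work

    def producer(url_queue, img_queue):
        while True:
            url = url_queue.get()
            if url is SENTINEL:
                img_queue.put(SENTINEL)                   # forward the stop signal downstream
                break
            img_queue.put(('title-for-' + url, url))      # stand-in for the real page parsing

    def consumer(img_queue):
        while True:
            item = img_queue.get()
            if item is SENTINEL:
                img_queue.put(SENTINEL)                   # re-queue it so the other consumers stop too
                break
            title, href = item
            print('would download', title, 'from', href)  # stand-in for the real image download

    def run():
        url_queue, img_queue = Queue(), Queue()
        for i in range(10):
            url_queue.put('page-{}'.format(i))
        url_queue.put(SENTINEL)                           # one producer, so one sentinel is enough

        threads = [Thread(target=producer, args=(url_queue, img_queue))]
        threads += [Thread(target=consumer, args=(img_queue,)) for _ in range(NUM_CONSUMERS)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()                                      # here the main thread waits for every worker

    if __name__ == '__main__':
        run()

    • Compared with the timeout-based exit above, the sentinel makes shutdown deterministic: the consumers stop exactly when the work is done rather than after an arbitrary quiet period, and join() lets the main thread wait for all downloads before it exits.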
  • Original post: https://blog.csdn.net/qq248606117/article/details/132747677