• Crawler: building and using a proxy IP pool


    Checking proxy IP validity

    1. Checking free proxies

    # Testing a free proxy (or any proxy that needs no credentials)
    import requests

    url = 'http://httpbin.org/get'

    proxy = '127.0.0.0:8000'

    proxies = {
        'http': 'http://' + proxy,
        'https': 'https://' + proxy,
    }

    # verify=False skips TLS certificate checks, which some proxies break
    response = requests.get(url, proxies=proxies, verify=False)
    print(response.text)
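
    With verify=False, urllib3 emits an InsecureRequestWarning on every request. If the output gets noisy, the warning can be silenced explicitly (a small optional addition, not part of the original snippet):

    import urllib3

    # Silence the warning that verify=False triggers
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)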
    

    2. Checking paid (authenticated) proxies:

    # Testing a paid proxy that requires username/password authentication
    import requests

    url = 'http://httpbin.org/get'

    proxy_host = '127.0.0.0'
    proxy_port = '8000'

    proxy_user = 'root'
    proxy_pass = 'root'

    # Credentials are embedded in the proxy URL: http://user:pass@host:port
    proxy_meta = 'http://%(user)s:%(pass)s@%(host)s:%(port)s' % {
        'host': proxy_host,
        'port': proxy_port,
        'user': proxy_user,
        'pass': proxy_pass,
    }

    proxies = {
        'http': proxy_meta,
        'https': proxy_meta,
    }

    response = requests.get(url, proxies=proxies)
    print(response.text)
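
    The %-style formatting above works; on Python 3.6+ the same proxy URL can be written more readably as an f-string (equivalent, purely a style choice):

    proxy_meta = f'http://{proxy_user}:{proxy_pass}@{proxy_host}:{proxy_port}'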
    
    
    

    Full code

    Directory layout:
    └─ip_pool
    │  proxies_ip.py
    │  test.py
    │  __init__.py

    └─com
       comm.py
       setting.py
       __init__.py

    proxies_ip.py: the main logic

    import requests
    import time
    import random
    from time import sleep
    from com.comm import PyMongo
    from com.setting import PROXIES_URL


    class ProxiesIP():
        def __init__(self):
            self.mongo_obj = PyMongo()

        def get_ip(self):
            """Fetch one proxy from the vendor endpoint; return a record dict, or False on failure."""
            res = requests.get(PROXIES_URL)
            if not res:
                return False
            d = dict()
            d["ip"] = res.text.strip()   # expected format: "host:port"
            d["flag"] = 1                # 1 = alive, 0 = dead
            d["ct_time"] = time.strftime('%Y%m%d', time.localtime(time.time()))
            return d

        def ip_to_mongo(self):
            """Fetch a proxy from the vendor and store it in MongoDB."""
            ip = self.get_ip()
            if not ip:
                return
            flag = self.mongo_obj.save_mongo(ip)
            if flag:
                print(f'{ip} saved')
            else:
                print(f'{ip} save failed')

        def choice_ip(self):
            """Return a random live proxy, refilling the pool until it holds more than 2."""
            while True:
                ip_list = self.mongo_obj.find_mongo()
                if len(ip_list) > 2:
                    break
                print(f"IPs in pool: {len(ip_list)}")
                self.ip_to_mongo()
            ip_coll = random.choice(ip_list)
            ip = ip_coll.get("ip")
            return ip

        def update_one(self, ip):
            """Mark a proxy as dead (flag = 0)."""
            self.mongo_obj.update_many(ip)

        def check_ip(self, ip):
            """Probe one proxy against httpbin.org; True if it answers within 3 seconds."""
            headers = {
                # Lightly randomized User-Agent
                "User-Agent": f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/5{random.randint(20, 50)}.36"
            }
            test_url = 'http://httpbin.org/get'
            proxy_ip = {
                "http": "http://" + ip,
                "https": "https://" + ip,
            }

            try:
                response = requests.get(test_url, headers=headers, proxies=proxy_ip, timeout=3)
                print(response.text)
                if response.ok:
                    return True
                return False
            except Exception:
                return False

        def check_proxies(self):
            """Loop forever, re-testing every live proxy and flagging dead ones."""
            while True:
                sleep(2)
                ip_list = self.mongo_obj.find_mongo()
                print(f"Live proxies in pool: {len(ip_list)}")
                for coll_ip in ip_list:
                    ip = coll_ip.get("ip")
                    flag = self.check_ip(ip)
                    if not flag:
                        self.update_one(ip)
                        print(f"{ip} flagged as dead")


    if __name__ == '__main__':
        obj_ip = ProxiesIP()
        # obj_ip.update_one("123.182.59.13:8888")
        obj_ip.check_proxies()
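
    For plain requests-based crawlers (outside Scrapy), consuming the pool looks roughly like this; a minimal sketch, with target_url as a placeholder:

    import requests
    from python_code.ip_pool.proxies_ip import ProxiesIP

    pool = ProxiesIP()
    ip = pool.choice_ip()   # blocks until the pool holds more than 2 live proxies

    proxies = {
        'http': 'http://' + ip,
        'https': 'https://' + ip,
    }

    target_url = 'http://httpbin.org/get'   # placeholder target
    resp = requests.get(target_url, proxies=proxies, timeout=5)
    print(resp.text)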
    
    

    com/comm.py: shared database helpers

    from pymongo import MongoClient
    from com.setting import MONGO_URL, DB_NAME, COLLECTION_NAME


    class PyMongo(object):
        def save_mongo(self, data) -> bool:
            '''
            Insert data into MongoDB.
            :param data: a dict (single document) or a list of dicts
            '''
            try:
                conn = MongoClient(MONGO_URL)
                db = conn[DB_NAME]
                collection = db[COLLECTION_NAME]
                if isinstance(data, dict):
                    collection.insert_one(data)
                elif isinstance(data, list):
                    collection.insert_many(data)
                conn.close()
                return True
            except Exception as ex:
                print(ex)
                return False

        def find_mongo(self):
            '''Return all documents with flag = 1 (live proxies).'''
            try:
                conn = MongoClient(MONGO_URL)
                db = conn[DB_NAME]
                collection = db[COLLECTION_NAME]
                result = collection.find({"flag": 1})
                result_list = list(result)
                conn.close()
                return result_list
            except Exception as ex:
                print(ex)
                return []   # callers take len() of the result, so never return False here

        def update_many(self, ip):
            '''Set flag to 0 for every document matching this ip (mark it dead).'''
            try:
                conn = MongoClient(MONGO_URL)
                db = conn[DB_NAME]
                collection = db[COLLECTION_NAME]
                collection.update_many({"ip": ip}, {"$set": {"flag": 0}})
                conn.close()
                return True
            except Exception as ex:
                print(ex)
                return False
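
    Each method above opens and closes its own MongoClient, so the polling loop in check_proxies reconnects every two seconds. A common refinement, sketched here under the assumption that behaviour stays the same, is to create the client once per PyMongo instance and rely on pymongo's built-in connection pooling:

    class PyMongo(object):
        def __init__(self):
            # One client for the object's lifetime; pymongo pools connections internally
            self.conn = MongoClient(MONGO_URL)
            self.collection = self.conn[DB_NAME][COLLECTION_NAME]

        def find_mongo(self):
            try:
                return list(self.collection.find({"flag": 1}))
            except Exception as ex:
                print(ex)
                return []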
    

    setting.py: configuration

    MONGO_URL = "mongodb://localhost:27017"
    DB_NAME = "proxies"
    COLLECTION_NAME = "ip_pool"
    PROXIES_URL = "http://127.0.0.1:5012/proxies/ip"  # vendor endpoint that hands out proxy IPs
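
    Note that get_ip() reads the response body with res.text.strip() and stores it as a single "host:port" string, so PROXIES_URL is expected to return exactly that as plain text. For local testing without a vendor, a hypothetical stand-in server (Flask shown, but any framework works) could be:

    # Hypothetical local stand-in for the vendor endpoint on port 5012
    from flask import Flask

    app = Flask(__name__)

    @app.route('/proxies/ip')
    def give_ip():
        return '127.0.0.1:8000'  # plain-text host:port, as get_ip() expects

    if __name__ == '__main__':
        app.run(port=5012)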
    
    

    Using the proxy pool in Scrapy

    Method 1: via a downloader middleware

    1. Add the following to middlewares.py:

    
    from python_code.ip_pool.proxies_ip import ProxiesIP


    class ProxyMiddleware(object):
        def process_request(self, request, spider):
            # Pick a random live proxy from the pool for each outgoing request
            ip_pool = ProxiesIP()
            ip = ip_pool.choice_ip()
            if request.url.startswith("http://"):
                request.meta['proxy'] = "http://" + str(ip)    # HTTP proxy
            elif request.url.startswith("https://"):
                request.meta['proxy'] = "https://" + str(ip)   # HTTPS proxy
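
    An optional refinement, not part of the original middleware: Scrapy also calls process_exception on the same class when a download fails, which is a natural place to flag a dead proxy so check_proxies does not have to find it first. A sketch:

        def process_exception(self, request, exception, spider):
            # Flag the failing proxy as dead, then return None so Scrapy's
            # retry middleware can reschedule the request (process_request
            # picks a fresh proxy on the retry)
            proxy = request.meta.get('proxy', '')
            ip = proxy.replace('http://', '').replace('https://', '')
            if ip:
                ProxiesIP().update_one(ip)
            return None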
    
    

    2. Enable the middleware in settings.py:

    # Enable or disable downloader middlewares
    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
    DOWNLOADER_MIDDLEWARES = {
       # 'JinDong.middlewares.JindongDownloaderMiddleware': 543,
       'JinDong.middlewares.ProxyMiddleware': 543,
    }
    

    3. The spider itself is written as usual; no changes or additions are needed:

    import scrapy


    class ProxySpider(scrapy.Spider):
        name = 'proxy'
        allowed_domains = ["httpbin.org"]

        def start_requests(self):
            url = 'http://httpbin.org/get'
            yield scrapy.Request(url, callback=self.parse)

        def parse(self, response):
            print(response.text)
    

    Test result as shown in the figure: [screenshot: httpbin.org/get response]

    Method 2: set the proxy directly in the spider

    Pass the proxy through the request's meta dict. In the spider code:

    import scrapy


    class ProxySpider(scrapy.Spider):
        name = 'proxy'
        allowed_domains = ["httpbin.org"]

        def start_requests(self):
            url = 'http://httpbin.org/get'
            proxy = '127.0.0.0:8000'

            proxies = ""
            if url.startswith("http://"):
                proxies = "http://" + str(proxy)
            elif url.startswith("https://"):
                proxies = "https://" + str(proxy)
            # Note: the meta key must be exactly 'proxy' (no other key works),
            # and the value must be a plain string like 'http://host:port'.
            yield scrapy.Request(url, callback=self.parse, meta={'proxy': proxies})

        def parse(self, response):
            print(response.text)
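
    To confirm the proxy is really in use, httpbin's JSON reply can be parsed: the origin field should show the proxy address rather than your own IP. A minimal sketch of such a parse method:

    import json

    def parse(self, response):
        # httpbin echoes the caller's IP in "origin"; through a working
        # proxy this should be the proxy host, not your own address
        origin = json.loads(response.text).get('origin')
        print('request origin:', origin)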
    

    Full code download: download link

  • Original article: https://blog.csdn.net/weixin_43335288/article/details/126813330