1. Checking a free proxy
# A free proxy, or a proxy that requires no password
import requests

url = 'http://httpbin.org/get'
proxy = '127.0.0.0:8000'
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy,
}
# verify=False skips TLS certificate verification for this test request
response = requests.get(url, proxies=proxies, verify=False)
print(response.text)
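To confirm that the request actually went through the proxy, compare the `origin` field of httpbin's JSON response with your real public IP. A minimal sketch, assuming the placeholder proxy above is reachable:

import requests

proxy = '127.0.0.0:8000'   # placeholder address, same as in the snippet above
proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}

# httpbin echoes the IP it sees in the "origin" field, so when the proxy is in
# use this prints the proxy's exit IP rather than your own public IP.
resp = requests.get('http://httpbin.org/get', proxies=proxies, timeout=5)
print(resp.json().get('origin'))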
2. Checking a paid proxy:
# Test a paid (password-protected) proxy
import requests

url = 'http://httpbin.org/get'
proxy_host = '127.0.0.0'
proxy_port = '8000'
proxy_user = 'root'
proxy_pass = 'root'
# Build a "http://user:pass@host:port" proxy URL
proxy_meta = 'http://%(user)s:%(pass)s@%(host)s:%(port)s' % {
    'host': proxy_host,
    'port': proxy_port,
    'user': proxy_user,
    'pass': proxy_pass,
}
proxies = {
    'http': proxy_meta,
    'https': proxy_meta,
}
response = requests.get(url, proxies=proxies)
print(response.text)
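Both checks can be wrapped in a small helper that treats timeouts and connection errors as a dead proxy instead of letting the exception propagate. A minimal sketch (the proxy URL passed in is a placeholder):

import requests

def probe_proxy(proxy_url, timeout=5):
    """Return the exit IP reported by httpbin, or None if the proxy is unusable."""
    proxies = {'http': proxy_url, 'https': proxy_url}
    try:
        resp = requests.get('http://httpbin.org/get', proxies=proxies, timeout=timeout)
        if resp.ok:
            return resp.json().get('origin')
    except requests.RequestException:
        pass
    return None

# Example: probe_proxy('http://root:root@127.0.0.0:8000')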
Complete code
Directory structure
└─ ip_pool
   │  proxies_ip.py
   │  test.py
   │  __init__.py
   │
   └─ com
         comm.py
         setting.py
         __init__.py
proxies_ip.py (main logic)
import requests
import time
import random
from time import sleep

from com.comm import PyMongo
from com.setting import PROXIES_URL


class ProxiesIP():
    def __init__(self):
        self.mongo_obj = PyMongo()

    def get_ip(self):
        """Fetch one ip:port string from the vendor API and wrap it in a record."""
        res = requests.get(PROXIES_URL)
        if not res:
            return False
        d = dict()
        d["ip"] = res.text.strip()
        d["flag"] = 1      # 1 = usable, 0 = dead
        d["ct_time"] = time.strftime('%Y%m%d', time.localtime(time.time()))
        return d

    def ip_to_mango(self):
        """Save a freshly fetched proxy record to MongoDB."""
        ip = self.get_ip()
        flag = self.mongo_obj.save_mongo(ip)
        if flag:
            print(f'{ip} saved successfully')
        else:
            print(f'{ip} failed to save')

    def choice_ip(self):
        """Return a random usable proxy, refilling the pool until it holds more than 2."""
        while True:
            ip_list = self.mongo_obj.find_mongo()
            if len(ip_list) > 2:
                break
            print(f"Number of IPs in the pool: {len(ip_list)}")
            self.ip_to_mango()
        ip_coll = random.choice(ip_list)
        ip = ip_coll.get("ip")
        return ip

    def update_one(self, ip):
        """Mark a proxy as dead (flag=0)."""
        self.mongo_obj.update_many(ip)

    def check_ip(self, ip):
        """Return True if the proxy answers httpbin within 3 seconds."""
        headers = {
            "User-Agent": f"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/5{random.randint(20, 50)}.36"
        }
        test_url = 'http://httpbin.org/get'
        proxy_ip = {
            "http": "http://" + ip,
            "https": "https://" + ip,
        }
        try:
            response = requests.get(test_url, headers=headers, proxies=proxy_ip, timeout=3)
            print(response.text)
            if response.ok:
                return True
        except Exception:
            return False

    def check_proxies(self):
        """Periodically re-test every proxy in the pool and flag the dead ones."""
        while True:
            sleep(2)
            ip_list = self.mongo_obj.find_mongo()
            print(f"The pool currently holds {len(ip_list)} working proxies")
            for coll_ip in ip_list:
                ip = coll_ip.get("ip")
                flag = self.check_ip(ip)
                if not flag:
                    self.update_one(ip)
                    print(f"{ip} updated successfully")


if __name__ == '__main__':
    obj_ip = ProxiesIP()
    # obj_ip.update_one("123.182.59.13:8888")
    obj_ip.check_proxies()
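A typical consumer of this pool picks an IP with choice_ip(), retries the request a few times, and calls update_one() to flag the proxy when it fails. A hedged sketch of that pattern (the import path may need adjusting to your project layout):

import requests
from proxies_ip import ProxiesIP   # adjust this import to your package layout

def fetch_with_pool(url, retries=3):
    """Fetch url through the pool, flagging any proxy that fails."""
    pool = ProxiesIP()
    for _ in range(retries):
        ip = pool.choice_ip()   # random usable ip:port from MongoDB
        proxies = {'http': 'http://' + ip, 'https': 'https://' + ip}
        try:
            return requests.get(url, proxies=proxies, timeout=5)
        except requests.RequestException:
            pool.update_one(ip)  # mark this proxy as dead (flag=0)
    return None

# Example: resp = fetch_with_pool('http://httpbin.org/get')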
com/comm.py (shared helpers)
from pymongo import MongoClient

from com.setting import MONGO_URL, DB_NAME, COLLECTION_NAME


class PyMongo(object):
    def save_mongo(self, data) -> bool:
        """
        Insert data into MongoDB.
        :param data: a single record (dict) or a list of records
        """
        try:
            conn = MongoClient(MONGO_URL)
            db = conn[DB_NAME]
            collection = db[COLLECTION_NAME]
            if isinstance(data, dict):
                collection.insert_one(data)
            elif isinstance(data, list):
                collection.insert_many(data)
            conn.close()
            return True
        except Exception as ex:
            print(ex)
            return False

    def find_mongo(self):
        """Return all records still flagged as usable (flag=1)."""
        try:
            conn = MongoClient(MONGO_URL)
            db = conn[DB_NAME]
            collection = db[COLLECTION_NAME]
            result = collection.find({"flag": 1})
            result_list = list(result)
            conn.close()
            return result_list
        except Exception as ex:
            print(ex)
            return False

    def update_many(self, ip):
        """Mark every record with this ip as dead (flag=0)."""
        try:
            conn = MongoClient(MONGO_URL)
            db = conn[DB_NAME]
            collection = db[COLLECTION_NAME]
            collection.update_many({"ip": ip}, {"$set": {"flag": 0}})
            conn.close()
            return True
        except Exception as ex:
            print(ex)
            return False
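Each method above opens and closes a fresh MongoClient, which keeps the code simple but pays the connection cost on every call; pymongo clients are designed to be created once and reused. A possible variant (named PyMongoShared here to make clear it is not part of the original code) that shares a single client:

from pymongo import MongoClient
from com.setting import MONGO_URL, DB_NAME, COLLECTION_NAME

class PyMongoShared(object):
    """Hypothetical variant of PyMongo that reuses one client and collection handle."""
    def __init__(self):
        self.client = MongoClient(MONGO_URL)
        self.collection = self.client[DB_NAME][COLLECTION_NAME]

    def save_mongo(self, data) -> bool:
        if isinstance(data, dict):
            self.collection.insert_one(data)
        elif isinstance(data, list):
            self.collection.insert_many(data)
        return True

    def find_mongo(self):
        return list(self.collection.find({"flag": 1}))

    def update_many(self, ip):
        self.collection.update_many({"ip": ip}, {"$set": {"flag": 0}})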
com/setting.py (configuration)
MONGO_URL = "mongodb://localhost:27017"
DB_NAME = "proxies"
COLLECTION_NAME = "ip_pool"
PROXIES_URL = "http://127.0.0.1:5012/proxies/ip"  # vendor URL that returns one ip:port
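PROXIES_URL is expected to return a bare ip:port string, because get_ip() stores res.text.strip() directly. If no vendor is available during development, a small local stand-in can serve that route; the sketch below uses Flask, an extra dependency not used elsewhere in this project, and a placeholder address:

from flask import Flask

app = Flask(__name__)

@app.route('/proxies/ip')
def give_ip():
    # A real vendor would hand out a fresh proxy here; this mock returns a
    # fixed placeholder ip:port as plain text, which is what get_ip() expects.
    return '127.0.0.1:8888'

if __name__ == '__main__':
    app.run(port=5012)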
Using proxies in Scrapy
Method 1: via a downloader middleware
1. Add the following code to middlewares.py:
from python_code.ip_pool.proxies_ip import ProxiesIP


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        ip_pool = ProxiesIP()
        ip = ip_pool.choice_ip()
        if request.url.startswith("http://"):
            request.meta['proxy'] = "http://" + str(ip)   # HTTP proxy
        elif request.url.startswith("https://"):
            request.meta['proxy'] = "https://" + str(ip)  # HTTPS proxy
2. Enable the middleware in settings.py:
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # 'JinDong.middlewares.JindongDownloaderMiddleware': 543,
    'JinDong.middlewares.ProxyMiddleware': 543,
}
3. Write the spider code as usual; nothing needs to be modified or added:
import scrapy


class ProxySpider(scrapy.Spider):
    name = 'proxy'
    allowed_domains = ["httpbin.org"]
    # start_urls = ['http://httpbin.org/get']

    def start_requests(self):
        url = 'http://httpbin.org/get'
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        print(response.text)
The verification result is shown in the figure:
Method 2: setting the proxy per request
1. Write the proxy handling directly in the spider code.
Carry the proxy in the request's meta dict.
Spider code:
import scrapy


class ProxySpider(scrapy.Spider):
    name = 'proxy'
    allowed_domains = ["httpbin.org"]

    def start_requests(self):
        url = 'http://httpbin.org/get'
        proxy = '127.0.0.0:8000'
        proxies = ""
        if url.startswith("http://"):
            proxies = "http://" + str(proxy)
        elif url.startswith("https://"):
            proxies = "https://" + str(proxy)
        # Note: the key in meta={'proxy': proxies} must be exactly 'proxy', and the
        # value must be a plain string ("http://ip:port"); other keys or value types will not work.
        yield scrapy.Request(url, callback=self.parse, meta={'proxy': proxies})

    def parse(self, response):
        print(response.text)
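The per-request approach can also be combined with the ProxiesIP pool built earlier, so the spider picks a validated proxy from MongoDB instead of a hard-coded address. A hedged sketch, assuming the same import path as in Method 1:

import scrapy
from python_code.ip_pool.proxies_ip import ProxiesIP

class PoolProxySpider(scrapy.Spider):
    name = 'pool_proxy'
    allowed_domains = ['httpbin.org']

    def start_requests(self):
        url = 'http://httpbin.org/get'
        ip = ProxiesIP().choice_ip()     # validated ip:port from the MongoDB pool
        scheme = 'https://' if url.startswith('https://') else 'http://'
        yield scrapy.Request(url, callback=self.parse, meta={'proxy': scheme + str(ip)})

    def parse(self, response):
        print(response.text)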