import requests
from fake_useragent import UserAgent
class URLManager(object):
    """Keep track of URLs waiting to be crawled and URLs already handed out.

    Attributes:
        new_url: List of pending URLs. URLs are taken from the end of the
            list, so the most recently added URL is returned first.
        old_url: List of URLs that have already been returned by
            ``get_new_url``.
    """

    def __init__(self):
        """Create a manager with no pending and no visited URLs."""
        self.new_url = []
        self.old_url = []

    def get_new_url(self):
        """Remove and return one pending URL, recording it as visited.

        Returns:
            The most recently added pending URL.

        Raises:
            IndexError: If there is no pending URL. Check ``has_new_url()``
                before calling.
        """
        url = self.new_url.pop()
        self.old_url.append(url)
        return url

    def add_new_url(self, url):
        """Queue a single URL unless it is empty or already known.

        Args:
            url: The URL to queue. Falsy values (``None``, ``""``) are
                ignored, as are URLs that are already pending or visited.
        """
        if not url:
            return
        if url in self.new_url or url in self.old_url:
            return
        self.new_url.append(url)

    def add_new_urls(self, urls):
        """Queue several URLs, applying the same filtering as ``add_new_url``.

        Args:
            urls: An iterable of URLs. ``None`` is treated as "nothing to
                add" so a caller that found no links does not crash here.
        """
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        """Return ``True`` if at least one URL is still pending."""
        return self.get_new_url_size() > 0

    def get_new_url_size(self):
        """Return the number of URLs still pending."""
        return len(self.new_url)

    def get_old_url_size(self):
        """Return the number of URLs already handed out."""
        return len(self.old_url)
# Fetching
class Downloader:
    """Fetch a single URL over HTTP with a randomized User-Agent header.

    Attributes:
        url: The URL that ``download`` requests.
        timeout: Seconds to wait for the server before giving up.
    """

    def __init__(self, url, timeout=10):
        """Store the target URL and request timeout.

        Args:
            url: The URL to fetch.
            timeout: Seconds passed to ``requests.get`` as its ``timeout``
                argument. Without one, a stalled server would block forever.
        """
        self.url = url
        self.timeout = timeout

    def download(self):
        """Issue a GET request for ``self.url`` and return the response.

        A fresh random User-Agent string is generated for every call.

        Returns:
            The response object returned by ``requests.get``. The status
            code is not checked here; callers decide how to handle errors.
        """
        headers = {"User-Agent": UserAgent().random}
        response = requests.get(self.url, headers=headers, timeout=self.timeout)
        return response
# Parsing
# Data processing
# Scheduling