Contents
1. Crawling a website with multiple threads
2. Saving the scraped data to Excel
3. Target site (for testing only), an online-business article site: https://www.maomp.com/
4. Result


The entry script wires the three stages together: listing-page URLs go into `page_queue`, extracted detail-page URLs into `detail_queue`, and fetched HTML into `data_queue`, which the parser threads drain into an Excel sheet.

```python
# coding:utf-8
import threading
import time
from queue import Queue

import xlsxwriter

from PageSpider import PageSpider
from DetailSpider import DetailSpider
from DataParse import DataParse

"""
Target site: https://www.maomp.com/wzjc/
Scrape the articles and save them to Excel.
"""

def start_page(threadsize, page_queue, detail_queue):
    # Start the listing-page crawler threads
    page_spider_list = []
    for i in range(1, threadsize + 1):
        page_spider = PageSpider(thread_name="PageSpider-" + str(i),
                                 page_queue=page_queue,
                                 detail_queue=detail_queue)
        page_spider.start()
        page_spider_list.append(page_spider)
    # The workers exit on their own once page_queue is drained,
    # so we only need to join each thread.
    for page_spider in page_spider_list:
        page_spider.join()


def start_detail(threadsize, detail_queue, data_queue):
    # Start the detail-page crawler threads
    detail_spider_list = []
    for i in range(1, threadsize + 1):
        detail_spider = DetailSpider(thread_name="DetailSpider-" + str(i),
                                     detail_queue=detail_queue,
                                     data_queue=data_queue)
        detail_spider.start()
        detail_spider_list.append(detail_spider)
    for detail_spider in detail_spider_list:
        detail_spider.join()


def start_data_parse(threadsize, data_queue, book):
    # Start the parser threads; they share one worksheet,
    # so a lock serializes the writes
    lock = threading.Lock()
    sheet1 = book.add_worksheet("sheet1")
    # Write the header row
    title_data = ("URL", "Title", "Publish date", "Content")
    for index, title_datum in enumerate(title_data):
        sheet1.write(0, index, title_datum)

    spider_list = []
    for i in range(1, threadsize + 1):
        thread = DataParse(thread_name="DataParse-" + str(i),
                           data_queue=data_queue, lock=lock, sheet=sheet1)
        thread.start()
        spider_list.append(thread)
    for parse in spider_list:
        parse.join()


def main():
    # Queue of listing-page URLs
    page_queue = Queue()
    # Queue of detail-page URLs
    detail_queue = Queue()
    # Queue of fetched detail-page HTML
    data_queue = Queue()
    page_start = 1
    page_end = 1
    for i in range(page_start, page_end + 1):
        page_url = "https://www.maomp.com/wzjc/page/{}/".format(i)
        page_queue.put(page_url)
    print("Page queue size:", page_queue.qsize())

    # Stage 1: crawl the listing pages
    start_page(threadsize=3, page_queue=page_queue, detail_queue=detail_queue)
    # Stage 2: crawl the detail pages
    start_detail(threadsize=3, detail_queue=detail_queue, data_queue=data_queue)
    # Stage 3: parse the HTML and write the workbook, named with a timestamp
    book = xlsxwriter.Workbook(time.strftime("%Y%m%d%H%M%S", time.gmtime()) + "-data.xlsx")
    start_data_parse(threadsize=5, data_queue=data_queue, book=book)
    book.close()
    print("Items left in page queue:", page_queue.qsize())
    print("Items left in detail queue:", detail_queue.qsize())
    print("Items left in data queue:", data_queue.qsize())


if __name__ == '__main__':
    main()
```
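A note on the original `while not page_queue: pass` checks: a `Queue` object is always truthy, so those loops never ran, and polling `empty()` is racy when producers and consumers overlap. Because each stage here only starts after the previous one has fully finished, joining the threads is enough. If you ever run the stages concurrently, the standard-library `task_done()`/`join()` pattern is more robust; here is a minimal, self-contained sketch (the names are illustrative, not part of this project):

```python
# Minimal sketch of the task_done()/join() pattern, an alternative to
# polling empty(). Worker and queue names here are illustrative only.
import threading
from queue import Queue

def worker(q: Queue):
    while True:
        item = q.get()          # blocks until an item is available
        if item is None:        # sentinel: tells this worker to exit
            q.task_done()
            break
        print("processing", item)
        q.task_done()           # mark the item as fully handled

q = Queue()
for url in ["page/1/", "page/2/", "page/3/"]:
    q.put(url)

threads = [threading.Thread(target=worker, args=(q,)) for _ in range(3)]
for t in threads:
    t.start()

q.join()                        # returns once every item is task_done()
for _ in threads:
    q.put(None)                 # one sentinel per worker
for t in threads:
    t.join()
```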
PageSpider.py, which pulls listing-page URLs from `page_queue` and pushes the extracted detail-page URLs into `detail_queue`:

```python
# coding:utf-8
import threading
from lxml import etree
import HanderRequest


class PageSpider(threading.Thread):
    """
    Worker thread that requests listing pages.
    """

    def __init__(self, thread_name, page_queue, detail_queue):
        super(PageSpider, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.detail_queue = detail_queue

    def parse_detail_url(self, content):
        """
        Parse a listing page and collect the detail-page URLs.
        :param content: listing-page HTML text
        """
        # Build an HTML tree from the response body
        item_html = etree.HTML(content)
        # Extract the detail-page URLs
        detail_urls = item_html.xpath("//h2[@class='entry-title']/a/@href")
        for url in detail_urls:
            # Queue each detail-page URL
            self.detail_queue.put(url)

    def run(self):
        print("{} started".format(self.thread_name))
        # Consume URLs from page_queue until it is empty
        try:
            while not self.page_queue.empty():
                # Non-blocking get; raises queue.Empty once drained
                page_url = self.page_queue.get(block=False)
                # Request the listing page
                response_text = HanderRequest.send_request(page_url)
                if response_text:
                    # Extract the detail-page URLs
                    self.parse_detail_url(response_text)
        except Exception as e:
            print("{} error: {}".format(self.thread_name, e))

        print("{} finished".format(self.thread_name))
```
DetailSpider.py, which fetches each detail page and queues the raw HTML for parsing:

```python
# coding:utf-8
import threading
import HanderRequest


class DetailSpider(threading.Thread):
    """
    Worker thread that requests detail pages.
    """

    def __init__(self, thread_name, detail_queue, data_queue):
        super(DetailSpider, self).__init__()
        self.thread_name = thread_name
        self.detail_queue = detail_queue
        self.data_queue = data_queue

    def run(self):
        print("{} started".format(self.thread_name))
        # Consume URLs from detail_queue until it is empty
        try:
            while not self.detail_queue.empty():
                # Non-blocking get; raises queue.Empty once drained
                detail_url = self.detail_queue.get(block=False)
                # Request the detail page
                response_text = HanderRequest.send_request(detail_url)
                if response_text:
                    data = {
                        "url": detail_url,
                        "html_content": response_text
                    }
                    # Queue the raw HTML for the parser threads
                    self.data_queue.put(data)

        except Exception as e:
            print("{} error: {}".format(self.thread_name, e))

        print("{} finished".format(self.thread_name))
```
DataParse.py, which extracts the fields with XPath and writes one row per article under a shared lock:

```python
# coding:utf-8
import threading
from lxml import etree
import Constant


class DataParse(threading.Thread):
    """
    Worker thread that parses detail-page HTML.
    """

    def __init__(self, thread_name, data_queue, lock, sheet):
        super(DataParse, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue
        self.lock = lock
        self.sheet = sheet

    def __list_join(self, items):
        return "".join(items)

    def __parse(self, data):
        """
        Parse one data_queue item and write it to the Excel sheet.
        """
        html = etree.HTML(data.get("html_content"))
        record = {
            "url": data.get("url"),
            "title": self.__list_join(html.xpath("//h1[@class='entry-title']/text()")),
            "put_date": self.__list_join(html.xpath("//span[@class='my-date']/text()")),
            "content_html": self.__list_join(html.xpath("//div[@class='single-content']//p/text()"))
        }
        # The sheet and the row counter are shared across threads,
        # so both are updated under the lock
        with self.lock:
            for index, key in enumerate(record):
                self.sheet.write(Constant.CURR_EXCEL_ROW, index, record.get(key))
            Constant.CURR_EXCEL_ROW += 1

    def run(self):
        print("{} started".format(self.thread_name))
        # Consume items from data_queue until it is empty
        try:
            while not self.data_queue.empty():
                # Non-blocking get; raises queue.Empty once drained
                data_content = self.data_queue.get(block=False)
                # Parse the HTML and write the row
                self.__parse(data_content)

        except Exception as e:
            print("{} error: {}".format(self.thread_name, e))

        print("{} finished".format(self.thread_name))
```
Constant.py, the module-level row counter shared by the parser threads (the original name `CURR_EXCEL_COL` was misleading, since `sheet.write(row, col, ...)` takes the row first; it is renamed to `CURR_EXCEL_ROW` here and in DataParse.py):

```python
# coding:utf-8

# Next Excel row to write to (row 0 holds the header)
CURR_EXCEL_ROW = 1
```
HanderRequest.py, the shared request helper. Remember to replace the Cookie value with one from your own browser session before running.
```python
# coding:utf-8

import requests


def send_request(url):
    # Send a GET request with a browser-like User-Agent and your cookie
    headers = {
        "Cookie": "xxx",  # replace with your own cookie
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    # Only return the body on a successful response
    if response.status_code == 200:
        return response.text
```
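The helper above has no timeout, so a stalled connection can hang a worker thread forever, and every call opens a fresh connection. A hardened variant, as a sketch under the same placeholder headers (the `timeout` value and error handling are my additions, not from the original post):

```python
# Hardened variant of send_request: adds a timeout, explicit error
# handling, and a shared Session for connection reuse. Sketch only.
import requests

session = requests.Session()
session.headers.update({
    "Cookie": "xxx",  # replace with your own cookie
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/121.0.0.0 Safari/537.36",
})

def send_request(url, timeout=10):
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()   # raise on 4xx/5xx responses
        return response.text
    except requests.RequestException as e:
        print("request failed:", url, e)
        return None
```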