• 第十五天-爬虫项目实战


    目录

    1.介绍

    2.代码

    1.main.py

    2.PageSpider.py

    3.DetailSpider.py

    4.DataParse.py

    5.Constant.py

    6.HanderRequest.py


    1.介绍

    1. 使用多线程爬取网站

    2.爬取数据后保存至excel

    3.爬取网站(仅做测试)网创类项目爬取:https://www.maomp.com/

    4.实现效果

    2.代码

    1.main.py

    1. # coding:utf-8
    2. import threading
    3. import requests
    4. from queue import Queue
    5. from PageSpider import PageSpider
    6. from DetailSpider import DetailSpider
    7. from DataParse import DataParse
    8. import xlsxwriter
    9. import time
    10. """
    11. 爬取网站:https://www.maomp.com/wzjc/
    12. 爬取信息,保存至Excel
    13. """
    14. def start_page(threadsize,page_queue,detail_queue):
    15. # 开启线程,开始采集page页面
    16. page_spider_threadsize = threadsize
    17. page_spider_list = []
    18. for i in range(1,page_spider_threadsize+1):
    19. pageSpiderThread = PageSpider(thread_name="页面采集线程"+str(i), page_queue=page_queue, detail_queue=detail_queue)
    20. # 启动线程
    21. pageSpiderThread.start()
    22. page_spider_list.append(pageSpiderThread)
    23. # 查看队列是否有数据
    24. while not page_queue:
    25. pass
    26. # 释放资源
    27. for page_spider in page_spider_list:
    28. if page_spider.is_alive():
    29. page_spider.join()
    30. def start_detail(threadsize,detail_queue,data_queue):
    31. # 开启线程,开始采集page页面
    32. detail_spider_threadsize = threadsize
    33. detail_spider_list = []
    34. for i in range(1, detail_spider_threadsize + 1):
    35. detailSpiderThread = DetailSpider(thread_name="详情页采集线程" + str(i), detail_queue=detail_queue,
    36. data_queue=data_queue)
    37. # 启动线程
    38. detailSpiderThread.start()
    39. detail_spider_list.append(detailSpiderThread)
    40. # 查看队列是否有数据
    41. while not detail_queue:
    42. pass
    43. # 释放资源
    44. for detail_spider in detail_spider_list:
    45. if detail_spider.is_alive():
    46. detail_spider.join()
    47. def start_data_parse(threadsize,data_queue,book):
    48. # 开启线程,开始采集page页面
    49. lock=threading.Lock()
    50. sheet1 = book.add_worksheet("sheet1")
    51. title_data = ("网址", "标题", "发布时间", "内容")
    52. # 添加表头
    53. for index, title_datum in enumerate(title_data):
    54. sheet1.write(0, index, title_datum)
    55. spider_list = []
    56. for i in range(1, threadsize + 1):
    57. thread = DataParse(thread_name="数据解析线程" + str(i), data_queue=data_queue,lock=lock,sheet=sheet1)
    58. # 启动线程
    59. thread.start()
    60. spider_list.append(thread)
    61. # 查看队列是否有数据
    62. while not data_queue:
    63. pass
    64. # 释放资源
    65. for parse in spider_list:
    66. if parse.is_alive():
    67. parse.join()
    68. def main(xlswriter=None):
    69. #定义页面队列,存放page页信息
    70. page_queue = Queue()
    71. #定义详情页队列
    72. detail_queue = Queue()
    73. #定义详情页数据队列
    74. data_queue = Queue()
    75. page_start=1
    76. page_end=1
    77. for i in range(page_start,page_end+1):
    78. page_url="https://www.maomp.com/wzjc/page/{}/".format(i)
    79. page_queue.put(page_url)
    80. print("页面队列:",page_queue.qsize())
    81. #启动采集分页
    82. start_page(threadsize=3,page_queue=page_queue,detail_queue=detail_queue)
    83. #启动详情页采集
    84. start_detail(threadsize=3, detail_queue=detail_queue, data_queue=data_queue)
    85. # 启动数据解析
    86. #创建存放excel文件夹
    87. book = xlsxwriter.Workbook(time.strftime("%Y%m%d%H%M%S",time.gmtime())+"文件.xlsx")
    88. start_data_parse(threadsize=5,data_queue=data_queue,book=book)
    89. book.close()
    90. print("分页数据个数:",page_queue.qsize())
    91. print("详情页数据个数:", detail_queue.qsize())
    92. print("数据数据个数:", data_queue.qsize())
    93. if __name__ == '__main__':
    94. main()

    2.PageSpider.py

    1. # coding:utf-8
    2. import threading
    3. from lxml import etree
    4. import HanderRequest
    5. class PageSpider(threading.Thread):
    6. """
    7. 页面url,请求多线程类
    8. """
    9. def __init__(self,thread_name,page_queue,detail_queue):
    10. super(PageSpider,self).__init__()
    11. self.thread_name=thread_name
    12. self.page_queue=page_queue
    13. self.detail_queue=detail_queue
    14. def parse_detail_url(self,content):
    15. """
    16. 解析page页获取详情页url
    17. :param content: page页text
    18. :return: 返回详情页url
    19. """
    20. #页码返回数据html实例化
    21. item_html=etree.HTML(content)
    22. #解析出索引详情页URL
    23. detail_urls=item_html.xpath("//h2[@class='entry-title']/a/@href")
    24. for url in detail_urls:
    25. #将详情页url存放到队列中
    26. self.detail_queue.put(url)
    27. def run(self):
    28. #实际发送请求
    29. print("{}启动".format(self.thread_name))
    30. #需要从page_queue队列中获取数据
    31. try:
    32. while not self.page_queue.empty():
    33. #从队列中获取数据,并设置为非阻塞状态
    34. page_url= self.page_queue.get(block=False)
    35. #请求页面链接
    36. response_text=HanderRequest.send_reqeust(page_url)
    37. if response_text:
    38. #解析详情url
    39. self.parse_detail_url(response_text)
    40. except Exception as e:
    41. print("{} 执行异常:{}".format(self.thread_name,e))
    42. print("{}结束".format(self.thread_name))

    3.DetailSpider.py

    1. # coding:utf-8
    2. import threading
    3. from lxml import etree
    4. import HanderRequest
    5. class DetailSpider(threading.Thread):
    6. """
    7. 详情页url,请求详情页
    8. """
    9. def __init__(self,thread_name,detail_queue,data_queue):
    10. super(DetailSpider,self).__init__()
    11. self.thread_name=thread_name
    12. self.data_queue=data_queue
    13. self.detail_queue=detail_queue
    14. def run(self):
    15. #实际发送请求
    16. print("{}启动".format(self.thread_name))
    17. #需要从page_queue队列中获取数据
    18. try:
    19. while not self.detail_queue.empty():
    20. #从队列中获取数据,并设置为非阻塞状态
    21. detail_url= self.detail_queue.get(block=False)
    22. #请求页面链接
    23. response_text=HanderRequest.send_reqeust(detail_url)
    24. if response_text:
    25. data={
    26. "url":detail_url,
    27. "html_content":response_text
    28. }
    29. #存放data_queuq数据
    30. self.data_queue.put(data)
    31. except Exception as e:
    32. print("{} 执行异常:{}".format(self.thread_name,e))
    33. print("{}结束".format(self.thread_name))

    4.DataParse.py

    1. # coding:utf-8
    2. import threading
    3. from lxml import etree
    4. import Constant
    5. class DataParse(threading.Thread):
    6. """
    7. 详情页数据处理
    8. """
    9. def __init__(self,thread_name,data_queue,lock,sheet):
    10. super(DataParse,self).__init__()
    11. self.thread_name=thread_name
    12. self.data_queue=data_queue
    13. self.lock=lock
    14. self.sheet=sheet
    15. def __list_join(self,list):
    16. return "".join(list)
    17. def __parse(self,data):
    18. """
    19. 解析data_queue数据
    20. 保存至excel中
    21. :return:
    22. """
    23. html= etree.HTML(data.get("html_content"))
    24. data={
    25. "url":data.get("url"),
    26. "title": self.__list_join(html.xpath("//h1[@class='entry-title']/text()")),
    27. "put_date":self.__list_join(html.xpath("//span[@class='my-date']/text()")),
    28. "content_html":self.__list_join(html.xpath("//div[@class='single-content']//p/text()"))
    29. }
    30. #多线程,使用lock来进行控制并发
    31. with self.lock:
    32. #写入Excel
    33. for index,e in enumerate(data):
    34. self.sheet.write(Constant.CURR_EXCEL_COL,index,data.get(e))
    35. Constant.CURR_EXCEL_COL += 1
    36. def run(self):
    37. #实际发送请求
    38. print("{}启动".format(self.thread_name))
    39. #需要从page_queue队列中获取数据
    40. try:
    41. while not self.data_queue.empty():
    42. #从队列中获取数据,并设置为非阻塞状态
    43. data_content= self.data_queue.get(block=False)
    44. #解析html数据
    45. self.__parse(data_content)
    46. except Exception as e:
    47. print("{} 执行异常:{}".format(self.thread_name,e))
    48. print("{}结束".format(self.thread_name))

    5.Constant.py

# coding:utf-8
# Next Excel row to write to, shared by all DataParse threads (mutated under
# their shared lock). Starts at 1 because row 0 holds the header.
# NOTE(review): named COL but used as the *row* index in DataParse.__parse --
# renaming would break that module, so the name is kept.
CURR_EXCEL_COL=1

    6.HanderRequest.py

    注意修改cookie

    1. # coding:utf-8
    2. import requests
    3. def send_reqeust(url):
    4. #发送数据
    5. headers={
    6. "Cookie":"xxx",
    7. "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
    8. }
    9. response=requests.get(url,headers=headers)
    10. if response.status_code==200 and response:
    11. return response.text

  • 相关阅读:
    WPF dataGrid初步使用案例
    石英砂过滤器 多介质过滤器 活性炭过滤器
    DDR SDRAM 学习笔记
    web课程设计使用html+css+javascript+jquery技术制作个人介绍6页
    Spring Boot 项目的常用注解与依赖
    禹晶、肖创柏、廖庆敏《数字图像处理(面向新工科的电工电子信息基础课程系列教材)》Chapter 7插图
    基于start.spring.io,定制你的Java脚手架
    数据治理-GDPR准则
    当线程池任务抛出异常
    JAVA Annotation 详解
  • 原文地址:https://blog.csdn.net/qq_34960590/article/details/136408405