• Day 15: Hands-On Web Crawler Project


Table of Contents

    1. Introduction

    2. Code

    1. main.py

    2. PageSpider.py

    3. DetailSpider.py

    4. DataParse.py

    5. Constant.py

    6. HanderRequest.py


1. Introduction

    1. Crawl the site with multiple worker threads (a minimal sketch of the pattern follows this list).

    2. Save the scraped data to an Excel file.

    3. Target site (crawled for testing only), posts on online money-making projects: https://www.maomp.com/

    4. Results (screenshot omitted).
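
    The whole project is a three-stage producer-consumer pipeline: page-collector threads feed a queue of detail URLs, detail-collector threads feed a queue of raw HTML, and parser threads drain that queue into Excel. The snippet below is a minimal, self-contained sketch of that pattern (the names and the toy "work" are illustrative, not taken from the project code):

    # Minimal sketch of the queue-based pipeline the project uses.
    import threading
    from queue import Queue, Empty

    def worker(in_queue, out_queue):
        # Each worker drains its input queue and feeds the next stage.
        while True:
            try:
                item = in_queue.get(block=False)
            except Empty:
                break  # input exhausted, this thread exits
            out_queue.put(item.upper())  # stand-in for "fetch and parse"

    if __name__ == "__main__":
        stage1, stage2 = Queue(), Queue()
        for url in ("page-1", "page-2", "page-3"):
            stage1.put(url)
        threads = [threading.Thread(target=worker, args=(stage1, stage2)) for _ in range(3)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        print(stage2.qsize())  # 3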

2. Code

    1. main.py

# coding:utf-8
    import threading
    import time
    from queue import Queue

    import xlsxwriter

    from PageSpider import PageSpider
    from DetailSpider import DetailSpider
    from DataParse import DataParse

    """
    Crawls https://www.maomp.com/wzjc/
    and saves the scraped information to an Excel file.
    """

    def start_page(threadsize, page_queue, detail_queue):
        # Start the threads that collect the listing pages
        page_spider_list = []
        for i in range(1, threadsize + 1):
            page_spider = PageSpider(thread_name="PageSpider-" + str(i),
                                     page_queue=page_queue,
                                     detail_queue=detail_queue)
            # Start the thread
            page_spider.start()
            page_spider_list.append(page_spider)
        # Wait until the page queue has been drained
        while not page_queue.empty():
            pass
        # Wait for the worker threads to exit
        for page_spider in page_spider_list:
            if page_spider.is_alive():
                page_spider.join()

    def start_detail(threadsize, detail_queue, data_queue):
        # Start the threads that download the detail pages
        detail_spider_list = []
        for i in range(1, threadsize + 1):
            detail_spider = DetailSpider(thread_name="DetailSpider-" + str(i),
                                         detail_queue=detail_queue,
                                         data_queue=data_queue)
            # Start the thread
            detail_spider.start()
            detail_spider_list.append(detail_spider)
        # Wait until the detail queue has been drained
        while not detail_queue.empty():
            pass
        # Wait for the worker threads to exit
        for detail_spider in detail_spider_list:
            if detail_spider.is_alive():
                detail_spider.join()

    def start_data_parse(threadsize, data_queue, book):
        # Start the threads that parse the raw pages and write to Excel
        lock = threading.Lock()
        sheet1 = book.add_worksheet("sheet1")
        title_data = ("URL", "Title", "Publish date", "Content")
        # Write the header row
        for index, title_datum in enumerate(title_data):
            sheet1.write(0, index, title_datum)
        spider_list = []
        for i in range(1, threadsize + 1):
            thread = DataParse(thread_name="DataParse-" + str(i),
                               data_queue=data_queue, lock=lock, sheet=sheet1)
            # Start the thread
            thread.start()
            spider_list.append(thread)
        # Wait until the data queue has been drained
        while not data_queue.empty():
            pass
        # Wait for the worker threads to exit
        for parse in spider_list:
            if parse.is_alive():
                parse.join()

    def main():
        # Queue of listing-page URLs
        page_queue = Queue()
        # Queue of detail-page URLs
        detail_queue = Queue()
        # Queue of raw detail-page HTML
        data_queue = Queue()
        page_start = 1
        page_end = 1
        for i in range(page_start, page_end + 1):
            page_url = "https://www.maomp.com/wzjc/page/{}/".format(i)
            page_queue.put(page_url)
        print("Page queue size:", page_queue.qsize())
        # Stage 1: collect the listing pages
        start_page(threadsize=3, page_queue=page_queue, detail_queue=detail_queue)
        # Stage 2: download the detail pages
        start_detail(threadsize=3, detail_queue=detail_queue, data_queue=data_queue)
        # Stage 3: parse the data into a timestamped Excel workbook
        book = xlsxwriter.Workbook(time.strftime("%Y%m%d%H%M%S", time.gmtime()) + "-data.xlsx")
        start_data_parse(threadsize=5, data_queue=data_queue, book=book)
        book.close()
        print("Remaining page queue size:", page_queue.qsize())
        print("Remaining detail queue size:", detail_queue.qsize())
        print("Remaining data queue size:", data_queue.qsize())

    if __name__ == '__main__':
        main()
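
    The `while not queue.empty(): pass` loops above busy-wait, which burns CPU (the original code's `while not page_queue:` never looped at all, since a `Queue` object is always truthy). A common alternative is the built-in `Queue.task_done()`/`Queue.join()` handshake; the sketch below is a generic illustration of it (not the author's code), using a "poison pill" to stop the workers:

    # Sketch: blocking alternative to the busy-wait, via Queue.join()/task_done().
    import threading
    from queue import Queue

    def consume(q):
        while True:
            item = q.get()  # blocks until an item is available
            if item is None:
                q.task_done()
                break  # poison pill: this worker exits
            print("processed", item)
            q.task_done()  # mark the item as fully handled

    q = Queue()
    for i in range(5):
        q.put(i)
    workers = [threading.Thread(target=consume, args=(q,)) for _ in range(2)]
    for w in workers:
        w.start()
    q.join()  # returns once every put() item has been task_done()
    for _ in workers:
        q.put(None)  # one poison pill per worker
    for w in workers:
        w.join()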

2. PageSpider.py

    # coding:utf-8
    import threading

    from lxml import etree

    import HanderRequest

    class PageSpider(threading.Thread):
        """
        Worker thread: fetches listing pages and queues the detail-page URLs.
        """
        def __init__(self, thread_name, page_queue, detail_queue):
            super(PageSpider, self).__init__()
            self.thread_name = thread_name
            self.page_queue = page_queue
            self.detail_queue = detail_queue

        def parse_detail_url(self, content):
            """
            Parse a listing page and push each detail-page URL onto detail_queue.
            :param content: HTML text of the listing page
            """
            # Build an lxml document from the listing-page HTML
            item_html = etree.HTML(content)
            # Extract the detail-page URLs from the article titles
            detail_urls = item_html.xpath("//h2[@class='entry-title']/a/@href")
            for url in detail_urls:
                # Queue each detail URL for the next stage
                self.detail_queue.put(url)

        def run(self):
            # Where the requests are actually sent
            print("{} started".format(self.thread_name))
            # Consume listing-page URLs from page_queue
            try:
                while not self.page_queue.empty():
                    # Non-blocking get; raises queue.Empty once the queue is drained
                    page_url = self.page_queue.get(block=False)
                    # Request the listing page
                    response_text = HanderRequest.send_reqeust(page_url)
                    if response_text:
                        # Extract the detail-page URLs
                        self.parse_detail_url(response_text)
            except Exception as e:
                print("{} error: {}".format(self.thread_name, e))
            print("{} finished".format(self.thread_name))
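
    To see what `parse_detail_url` extracts, here is a standalone demo running the same XPath against a made-up fragment (the markup mirrors what the listing pages appear to use; the real site may differ):

    # Standalone demo of the entry-title XPath (sample HTML, not the live site).
    from lxml import etree

    sample = """
    <div>
      <h2 class="entry-title"><a href="https://example.com/post-1/">Post 1</a></h2>
      <h2 class="entry-title"><a href="https://example.com/post-2/">Post 2</a></h2>
    </div>
    """
    doc = etree.HTML(sample)
    print(doc.xpath("//h2[@class='entry-title']/a/@href"))
    # ['https://example.com/post-1/', 'https://example.com/post-2/']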

3. DetailSpider.py

    # coding:utf-8
    import threading

    import HanderRequest

    class DetailSpider(threading.Thread):
        """
        Worker thread: downloads each detail page and queues its raw HTML.
        """
        def __init__(self, thread_name, detail_queue, data_queue):
            super(DetailSpider, self).__init__()
            self.thread_name = thread_name
            self.data_queue = data_queue
            self.detail_queue = detail_queue

        def run(self):
            # Where the requests are actually sent
            print("{} started".format(self.thread_name))
            # Consume detail-page URLs from detail_queue
            try:
                while not self.detail_queue.empty():
                    # Non-blocking get; raises queue.Empty once the queue is drained
                    detail_url = self.detail_queue.get(block=False)
                    # Request the detail page
                    response_text = HanderRequest.send_reqeust(detail_url)
                    if response_text:
                        data = {
                            "url": detail_url,
                            "html_content": response_text
                        }
                        # Queue the raw page data for the parser stage
                        self.data_queue.put(data)
            except Exception as e:
                print("{} error: {}".format(self.thread_name, e))
            print("{} finished".format(self.thread_name))
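
    The site is crawled for testing only, so be gentle if you raise the page range: several threads calling `requests.get` in a tight loop can hammer a small site. One possible tweak (a sketch, not part of the original code) is to wrap the fetch in a small random delay:

    # Sketch: polite fetching with a random delay before each request.
    import random
    import time

    def polite_fetch(url, fetch):
        # fetch is any callable such as HanderRequest.send_reqeust
        time.sleep(random.uniform(0.5, 1.5))  # pause 0.5-1.5 s per request
        return fetch(url)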

4. DataParse.py

    # coding:utf-8
    import threading

    from lxml import etree

    import Constant

    class DataParse(threading.Thread):
        """
        Worker thread: parses the detail-page HTML and writes rows to Excel.
        """
        def __init__(self, thread_name, data_queue, lock, sheet):
            super(DataParse, self).__init__()
            self.thread_name = thread_name
            self.data_queue = data_queue
            self.lock = lock
            self.sheet = sheet

        def __list_join(self, items):
            return "".join(items)

        def __parse(self, data):
            """
            Parse one entry from data_queue and save it to the Excel sheet.
            """
            html = etree.HTML(data.get("html_content"))
            row = {
                "url": data.get("url"),
                "title": self.__list_join(html.xpath("//h1[@class='entry-title']/text()")),
                "put_date": self.__list_join(html.xpath("//span[@class='my-date']/text()")),
                "content_html": self.__list_join(html.xpath("//div[@class='single-content']//p/text()"))
            }
            # Several threads write to the same sheet, so serialize with the lock
            with self.lock:
                # Write one Excel row; CURR_EXCEL_COL holds the next free row index
                for index, key in enumerate(row):
                    self.sheet.write(Constant.CURR_EXCEL_COL, index, row.get(key))
                Constant.CURR_EXCEL_COL += 1

        def run(self):
            print("{} started".format(self.thread_name))
            # Consume raw page data from data_queue
            try:
                while not self.data_queue.empty():
                    # Non-blocking get; raises queue.Empty once the queue is drained
                    data_content = self.data_queue.get(block=False)
                    # Parse the HTML and write the row
                    self.__parse(data_content)
            except Exception as e:
                print("{} error: {}".format(self.thread_name, e))
            print("{} finished".format(self.thread_name))
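
    xlsxwriter worksheets are not safe for concurrent writes, which is why both the `sheet.write` calls and the row-counter increment happen under the shared lock. An alternative design (a sketch, not the author's approach) is to let parser threads only collect rows and to write the sheet from a single thread after they have all joined:

    # Sketch: collect rows thread-safely, then write them single-threaded.
    import threading

    rows = []
    rows_lock = threading.Lock()

    def collect(row):
        # Called from parser threads; append under the lock is brief and cheap.
        with rows_lock:
            rows.append(row)

    def flush(sheet):
        # Called once from the main thread after all parser threads have joined.
        for r, row in enumerate(rows, start=1):  # row 0 holds the header
            for c, value in enumerate(row):
                sheet.write(r, c, value)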

5. Constant.py

    # coding:utf-8
    # Next Excel row to write (row 0 holds the header; despite the name, this is a row index)
    CURR_EXCEL_COL = 1
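
    Note that `+=` on a module-level global is not atomic, so this counter must only be touched while holding the lock, as `DataParse` does. A self-locking counter is one way to make that contract explicit (a sketch, not from the original project):

    # Sketch: a row counter that carries its own lock.
    import threading

    class RowCounter:
        def __init__(self, start=1):  # row 0 holds the header
            self._value = start
            self._lock = threading.Lock()

        def next_row(self):
            # Atomically hand out the current row index and advance it.
            with self._lock:
                row = self._value
                self._value += 1
                return row

    The Excel writes themselves would still need serializing, since xlsxwriter is not thread-safe.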

6. HanderRequest.py

    Note: replace the Cookie value with your own before running.

    # coding:utf-8
    import requests

    def send_reqeust(url):
        # Send the GET request with browser-like headers
        headers = {
            "Cookie": "xxx",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
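
    One caveat: `requests.get` without a `timeout` can hang a worker thread indefinitely, and a single transient error loses the URL. A more defensive variant might look like the sketch below (the retry count, back-off, and timeout values are arbitrary choices, not from the original post):

    # Sketch: a send_reqeust variant with a timeout and simple retries.
    import time

    import requests

    def send_request_safe(url, retries=3, timeout=10):
        headers = {
            "Cookie": "xxx",  # replace with your own cookie, as in the original
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
        }
        for attempt in range(1, retries + 1):
            try:
                response = requests.get(url, headers=headers, timeout=timeout)
                if response.status_code == 200:
                    return response.text
            except requests.RequestException as e:
                print("attempt {} for {} failed: {}".format(attempt, url, e))
            time.sleep(attempt)  # back off a little more on each retry
        return None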

• Original article: https://blog.csdn.net/qq_34960590/article/details/136408405