• python爬虫之feapder.AirSpider轻量爬虫案例:豆瓣


    1. 创建feapderSpider项目:feapder create -p feapderSpider,已创建可忽略
    2. 进入feapderSpider的spiders目录:cd .\feapderSpider\spiders
    3. 创建爬虫:feapder create -s airSpiderDouban,选择AirSpider爬虫模板,可跳过1、2直接创建爬虫文件
    4. 配置邮件报警:报警配置163邮箱,https://feapder.com/#/source_code/%E6%8A%A5%E8%AD%A6%E5%8F%8A%E7%9B%91%E6%8E%A7
      在这里插入图片描述
      在这里插入图片描述
    5. 在setting.py中打开mysql等配置;若没有setting.py文件,先运行命令feapder create --setting生成
      # NOTE(review): the credentials below look like tutorial placeholders;
      # confirm no real secrets are kept in this file.

      # # MySQL connection
      MYSQL_IP = "localhost"
      MYSQL_PORT = 3306
      MYSQL_DB = "video"
      MYSQL_USER_NAME = "root"
      MYSQL_USER_PASS = "root"

      # # Redis connection
      # # ip:port; several nodes may be given comma-separated ("ip1:port1,ip2:port2")
      # # or as a list (["ip1:port1", "ip2:port2"])
      REDISDB_IP_PORTS = "localhost:6379"
      REDISDB_USER_PASS = ""
      REDISDB_DB = 0
      # Extra keyword arguments passed when connecting to redis, e.g. ssl=True
      REDISDB_KWARGS = {}
      # Used for redis sentinel mode
      REDISDB_SERVICE_NAME = ""

      # # Deduplication
      # Turn on item deduplication
      ITEM_FILTER_ENABLE = True
      ITEM_FILTER_SETTING = {
          # 1 = permanent (BloomFilter), 2 = in-memory (MemoryFilter),
          # 3 = temporary (ExpireFilter), 4 = lightweight (LiteFilter)
          "filter_type": 1,
          "name": "douban",
      }

      # # Email alerts
      EMAIL_SENDER = "123123123@163.com"  # sender address
      EMAIL_PASSWORD = "EYNXMBWJKMLZFTKQ"  # authorization code
      EMAIL_RECEIVER = ["123123123@163.com"]  # recipients; a list, so several may be given
      EMAIL_SMTPSERVER = "smtp.163.com"  # mail server; 163 mail is the default
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10
      • 11
      • 12
      • 13
      • 14
      • 15
      • 16
      • 17
      • 18
      • 19
      • 20
      • 21
      • 22
      • 23
      • 24
      • 25
      • 26
      • 27
      • 28
      • 29
    6. 创建item:feapder create -i douban,选择item,需要先在数据库创建表
      -- Table that stores the scraped Douban movie records.
      CREATE TABLE IF NOT EXISTS douban (
          id     INT AUTO_INCREMENT PRIMARY KEY,
          title  VARCHAR(255),
          rating FLOAT,
          quote  VARCHAR(255),
          intro  TEXT
      )
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
    7. 修改douban_item.py文件
      # -*- coding: utf-8 -*-
      """
      Created on 2023-10-08 16:17:51
      ---------
      @summary:
      ---------
      @author: Administrator
      """
      
      from feapder import Item
      
      
      class DoubanItem(Item):
          """
          Item carrying one Douban movie record destined for the ``douban`` table.

          This class was generated by feapder
          command: feapder create -i douban
          """

          __table_name__ = "douban"
          # Fields used as the deduplication key: title, quote and rating. The
          # fingerprint is an md5 computed jointly from the values of these fields.
          # Each field is listed once; the earlier list repeated "title".
          __unique_key__ = ["title", "quote", "rating"]

          def __init__(self, *args, **kwargs):
              """
              Create an item with every column initialised to ``None``.

              Positional arguments are accepted but not forwarded; only keyword
              arguments are passed on to ``Item.__init__``.
              """
              super().__init__(**kwargs)
              # self.id = None
              self.intro = None
              self.quote = None
              self.rating = None
              self.title = None
      
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10
      • 11
      • 12
      • 13
      • 14
      • 15
      • 16
      • 17
      • 18
      • 19
      • 20
      • 21
      • 22
      • 23
      • 24
      • 25
      • 26
      • 27
      • 28
      • 29
    8. 爬虫文件:air_spider_douban.py
      # -*- coding: utf-8 -*-
      """
      Created on 2023-10-06 15:36:09
      ---------
      @summary:
      ---------
      @author: Administrator
      """
      
      import feapder
      from items.douban_item import DoubanItem
      from feapder.network.user_agent import get as get_ua
      from requests.exceptions import ConnectTimeout,ProxyError
      from feapder.utils.email_sender import EmailSender
      import feapder.setting as setting
      
      class AirSpiderDouban(feapder.AirSpider):
          """AirSpider that walks the Douban movie Top 250 list and each movie's detail page."""

          def __init__(self, thread_count=None):
              """
              Args:
                  thread_count: passed through unchanged to ``feapder.AirSpider``.
              """
              super().__init__(thread_count)
              self.request_url = 'https://movie.douban.com/top250'

          def start_requests(self):
              """Yield the seed request for the first list page."""
              yield feapder.Request(self.request_url)

          def download_midware(self, request):
              """Replace the request headers with a single User-Agent from ``get_ua()``.

              Returns:
                  The same request object, with its headers overwritten.
              """
              request.headers = {
                  'User-Agent': get_ua()
              }
              return request

          def parse(self, request, response):
              """Parse one list page.

              For every movie entry, fill title/rating/quote on a ``DoubanItem`` and
              yield a detail-page request that carries the item along. Entries with
              no detail link are not yielded. Finally yield a request for the next
              list page when a next link is present.
              """
              video_list = response.xpath('//ol[@class="grid_view"]/li')
              for li in video_list:
                  item = DoubanItem()
                  item['title'] = li.xpath('.//div[@class="hd"]/a/span[1]/text()').extract_first()
                  item['rating'] = li.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()').extract_first()
                  item['quote'] = li.xpath('.//div[@class="bd"]//p[@class="quote"]/span/text()').extract_first()
                  detail_url = li.xpath('.//div[@class="hd"]/a/@href').extract_first()
                  if detail_url:
                      # The partially filled item travels with the request so the
                      # detail callback can complete it.
                      yield feapder.Request(detail_url, callback=self.get_detail_info, item=item)
              # Fetch the next page of the list, if there is one
              next_page_url = response.xpath('//div[@class="paginator"]//link[@rel="next"]/@href').extract_first()
              if next_page_url:
                  yield feapder.Request(next_page_url, callback=self.parse)

          def get_detail_info(self, request, response):
              """Parse a detail page, set ``intro`` on the carried item, and yield it.

              The first XPath is tried first; when it yields nothing, the second
              one is used. ``intro`` is an empty string if neither matches.
              """
              item = request.item
              detail = response.xpath('//span[@class="all hidden"]/text()').extract_first() or ''
              if not detail:
                  detail = response.xpath('//div[@id="link-report-intra"]/span[1]/text()').extract_first() or ''
              item['intro'] = detail.strip()
              yield item

          def exception_request(self, request, response, e):
              """Drop the request's proxy when the failure was a connect timeout or a proxy error.

              Args:
                  request: the request that failed.
                  response: the response passed in by the framework (unused here).
                  e: the exception that was raised.
              """
              # isinstance, unlike an exact type comparison, also matches subclasses
              # of these exception types.
              if isinstance(e, (ConnectTimeout, ProxyError)):
                  request.del_proxy()

          def end_callback(self):
              """Send a notification e-mail using the sender, password and receiver from ``setting``."""
              with EmailSender(setting.EMAIL_SENDER, setting.EMAIL_PASSWORD) as email_sender:
                  email_sender.send(setting.EMAIL_RECEIVER, title='python', content="爬虫结束")
      
      
      if __name__ == "__main__":
          # Start the spider with five threads when this file is run directly.
          spider = AirSpiderDouban(thread_count=5)
          spider.start()
      
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10
      • 11
      • 12
      • 13
      • 14
      • 15
      • 16
      • 17
      • 18
      • 19
      • 20
      • 21
      • 22
      • 23
      • 24
      • 25
      • 26
      • 27
      • 28
      • 29
      • 30
      • 31
      • 32
      • 33
      • 34
      • 35
      • 36
      • 37
      • 38
      • 39
      • 40
      • 41
      • 42
      • 43
      • 44
      • 45
      • 46
      • 47
      • 48
      • 49
      • 50
      • 51
      • 52
      • 53
      • 54
      • 55
      • 56
      • 57
      • 58
      • 59
      • 60
      • 61
      • 62
      • 63
      • 64
      • 65
      • 66
    9. 通过feapder create -p feapderSpider命令创建的项目下会有main文件;除了单独运行爬虫文件外,也可以在main文件中运行,一般用于统一运行多个爬虫
      from feapder import ArgumentParser
      from spiders import *
      
      def crawl_air_spider_douban():
          """
          Create the Douban AirSpider and start it.
          """
          air_spider_douban.AirSpiderDouban().start()
      
      if __name__ == "__main__":
          # Register the crawl function under its flag, then call parser.run
          # with the registered name.
          parser = ArgumentParser(description="爬虫练习")
          parser.add_argument(
              "--crawl_air_spider_douban",
              action="store_true",
              help="豆瓣AirSpide",
              function=crawl_air_spider_douban,
          )
          parser.run("crawl_air_spider_douban")
      
      • 1
      • 2
      • 3
      • 4
      • 5
      • 6
      • 7
      • 8
      • 9
      • 10
      • 11
      • 12
      • 13
      • 14
      • 15
      • 16
      • 17
      • 18
      • 19
      • 20
  • 相关阅读:
    Qt 容器类
    (附源码)ssm华立学院门户网站 毕业设计 290011
    Linux环境搭建SVN服务器并实现公网访问 - cpolar端口映射
    自定义数字键盘(kotlin)
    Unity & PS Linear Workflow - Unity 和 PS 的线性工作流实践 - 简单配置示例(后续补上渲染差异图)
    【go】linux服务器 go从1.18.x版本升级1.19.x
    双token三验证和微信登录流程图
    决策树与随机森林
    C# range
    Python 数据可视化 boxplot
  • 原文地址:https://blog.csdn.net/randy521520/article/details/133826889