• Python Scrapy crawler example: Douban


    1. Run this command to create the project: scrapy startproject scrapySpider
    2. Enter the project directory: cd .\scrapySpider\
    3. Run this command to create the spider: scrapy genspider douban movie.douban.com (a sketch of the generated stub follows the directory listing below)
    4. Directory structure
      |-- scrapySpider project root directory
      |   |-- scrapySpider project package directory
      |   |   |-- spiders spider files directory
      |   |   |   |-- douban.py spider file
      |   |   |-- items.py defines the data models, similar to a database table schema or a data-structure definition
      |   |   |-- middlewares.py defines middlewares, used to process and modify requests and responses
      |   |   |-- pipelines.py defines item pipelines, the components that process the scraped data
      |   |   |-- settings.py configuration file for the Scrapy project's settings and options
      |   |-- scrapy.cfg framework configuration file specifying the project's structure and metadata
      
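      For reference, the spiders/douban.py stub generated by the genspider command in step 3 looks roughly like this (the exact template depends on the Scrapy version):

      import scrapy


      class DoubanSpider(scrapy.Spider):
          name = "douban"
          allowed_domains = ["movie.douban.com"]
          start_urls = ["https://movie.douban.com"]

          def parse(self, response):
              pass
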
    5. Create the Kuaidaili proxy file scrapySpider>kuaidaili.py (proxy provider: https://www.kuaidaili.com/)
      import requests
      
      
      class Kuaidaili():
          request_url = {
              # Endpoint for obtaining the proxy IP signature (secret token)
              'getIpSignature': 'https://auth.kdlapi.com/api/get_secret_token',
              # Endpoint for obtaining a proxy IP
              'getIp': 'https://dps.kdlapi.com/api/getdps?secret_id=oy2q5xu76k4s8olx59et&num=1&signature={}'
          }
      
          headers = {
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
          }
      
          ip_use = 'purchased proxy username'
          ip_password = 'purchased proxy password'
      
          def __init__(self):
              '''Create a requests session object'''
              self.request_session = requests.Session()
              self.request_session.headers.update(self.headers)
      
          # Get the proxy IP signature and build the signed fetch URL
          @classmethod
          def get_ip_url(cls):
              par = {
                  'secret_id': 'oy2q5xu76k4s8olx59et',
                  'secret_key': '5xg6gvouc0vszfw0kxs1a8vrw1r6ity7'
              }
              response = requests.post(cls.request_url['getIpSignature'],data=par)
              response_data = response.json()
              return cls.request_url['getIp'].format(response_data['data']['secret_token'])
      
          @classmethod
          def get_ip(cls):
              url = cls.get_ip_url()
              response = requests.get(url)
              return f'http://{cls.ip_use}:{cls.ip_password}@{response.text}/'
      
      if __name__ == '__main__':
          kuaidaili = Kuaidaili()
          print(kuaidaili.get_ip())
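
      Before wiring this into Scrapy, the purchased proxy can be sanity-checked with plain requests (a minimal sketch; it assumes the credentials above are filled in and that the script sits next to kuaidaili.py):

      import requests

      from kuaidaili import Kuaidaili  # local import, assuming the script lives in the same directory as kuaidaili.py

      proxy = Kuaidaili.get_ip()  # e.g. 'http://user:password@ip:port/'
      response = requests.get('https://movie.douban.com/top250',
                              proxies={'http': proxy, 'https': proxy},
                              timeout=5)
      print(proxy, response.status_code)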
      
    6. Douban crawling example
    • douban.py
      import scrapy
      from scrapy import cmdline
      from scrapy.http import HtmlResponse,Request
      from scrapySpider.items import DoubanItem
      
      class DoubanSpider(scrapy.Spider):
          name = 'douban'
          allowed_domains = ['movie.douban.com']
          start_urls = ['https://movie.douban.com/top250']
      
          def parse(self, response: HtmlResponse,**kwargs):
              video_list = response.xpath('//ol[@class="grid_view"]/li')
              for li in video_list:
                  item = DoubanItem()
                  item['title'] = li.xpath('.//div[@class="hd"]/a/span[1]/text()').extract_first()
                  item['rating'] = li.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()').extract_first()
                  item['quote'] = li.xpath('.//div[@class="bd"]//p[@class="quote"]/span/text()').extract_first()
                  detail_url = li.xpath('.//div[@class="hd"]/a/@href').extract_first()
                  yield Request(url=detail_url,callback=self.get_detail_info,meta={'item':item})
              # Fetch the next page
              next_page_url = response.xpath('//div[@class="paginator"]//link[@rel="next"]/@href').extract_first()
              if next_page_url:
                  yield Request(url=response.urljoin(next_page_url),callback=self.parse)
      
          # Alternative: override start_requests to fetch multiple pages
          # def start_requests(self):
          #     for i in range(0,2):
          #         yield Request(url=f'{self.start_urls[0]}?start={i*25}&filter=',dont_filter=True,callback=self.parse)
      
          def get_detail_info(self,response:HtmlResponse):
              item = response.meta['item']
              detail = response.xpath('//span[@class="all hidden"]/text()').extract_first()
              if not detail:
                  detail = response.xpath('//div[@id="link-report-intra"]/span[1]/text()').extract_first()
              item['intro'] = detail.strip() if detail else ''
              yield item
      
      if __name__ == '__main__':
          cmdline.execute('scrapy crawl douban'.split())
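
      Besides cmdline.execute, the crawl can also be launched programmatically with CrawlerProcess, which loads settings.py through get_project_settings (a minimal sketch; the file name run.py is just an example, placed next to scrapy.cfg):

      from scrapy.crawler import CrawlerProcess
      from scrapy.utils.project import get_project_settings

      process = CrawlerProcess(get_project_settings())
      process.crawl('douban')  # spider name, as declared in DoubanSpider.name
      process.start()          # blocks until the crawl finishes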
      
    • settings.py
      # Scrapy settings for scrapySpider project
      #
      # For simplicity, this file contains only settings considered important or
      # commonly used. You can find more settings consulting the documentation:
      #
      #     https://docs.scrapy.org/en/latest/topics/settings.html
      #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
      #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
      
      BOT_NAME = "scrapySpider"
      
      SPIDER_MODULES = ["scrapySpider.spiders"]
      NEWSPIDER_MODULE = "scrapySpider.spiders"
      
      
      # Crawl responsibly by identifying yourself (and your website) on the user-agent
      # USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
      
      # Obey robots.txt rules
      ROBOTSTXT_OBEY = False
      
      # Configure maximum concurrent requests performed by Scrapy (default: 16)
      #CONCURRENT_REQUESTS = 32
      
      # Configure a delay for requests for the same website (default: 0)
      # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
      # See also autothrottle settings and docs
      #DOWNLOAD_DELAY = 3
      # The download delay setting will honor only one of:
      #CONCURRENT_REQUESTS_PER_DOMAIN = 16
      #CONCURRENT_REQUESTS_PER_IP = 16
      
      # Disable cookies (enabled by default)
      #COOKIES_ENABLED = False
      
      # Disable Telnet Console (enabled by default)
      #TELNETCONSOLE_ENABLED = False
      
      # Override the default request headers:
      DEFAULT_REQUEST_HEADERS = {
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
         "Accept-Language": "en",
      }
      
      # Enable or disable spider middlewares
      # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
      #SPIDER_MIDDLEWARES = {
      #    "scrapySpider.middlewares.ScrapyspiderSpiderMiddleware": 543,
      #}
      
      # Enable or disable downloader middlewares
      # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
      DOWNLOADER_MIDDLEWARES = {
         "scrapySpider.middlewares.DoubanDownloaderMiddleware": 543,
      }
      
      # Enable or disable extensions
      # See https://docs.scrapy.org/en/latest/topics/extensions.html
      # EXTENSIONS = {
      #    'scrapeops_scrapy.extension.ScrapeOpsMonitor': 500,
      # }
      
      # Configure item pipelines
      # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
      ITEM_PIPELINES = {
         "scrapySpider.pipelines.MysqlPipeLine": 300,
         "scrapySpider.pipelines.MongoPipeLine": 301,
      }
      
      # Enable and configure the AutoThrottle extension (disabled by default)
      # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
      #AUTOTHROTTLE_ENABLED = True
      # The initial download delay
      #AUTOTHROTTLE_START_DELAY = 5
      # The maximum download delay to be set in case of high latencies
      #AUTOTHROTTLE_MAX_DELAY = 60
      # The average number of requests Scrapy should be sending in parallel to
      # each remote server
      #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
      # Enable showing throttling stats for every response received:
      #AUTOTHROTTLE_DEBUG = False
      
      # Enable and configure HTTP caching (disabled by default)
      # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
      #HTTPCACHE_ENABLED = True
      #HTTPCACHE_EXPIRATION_SECS = 0
      #HTTPCACHE_DIR = "httpcache"
      #HTTPCACHE_IGNORE_HTTP_CODES = []
      #HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
      
      # Set settings whose default value is deprecated to a future-proof value
      REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
      TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
      FEED_EXPORT_ENCODING = "utf-8"
      
      # Logging configuration
      # LOG_FILE = 'log.log'
      # LOG_FILE_APPEND = False
      # LOG_LEVEL = 'INFO'
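
      The DOWNLOADER_MIDDLEWARES and ITEM_PIPELINES entries above apply project-wide. If you prefer to scope them to the Douban spider only, the same keys can go into the spider's custom_settings class attribute (a sketch of the idea, not part of the original project):

      import scrapy

      class DoubanSpider(scrapy.Spider):
          name = 'douban'
          custom_settings = {
              'DOWNLOADER_MIDDLEWARES': {'scrapySpider.middlewares.DoubanDownloaderMiddleware': 543},
              'ITEM_PIPELINES': {
                  'scrapySpider.pipelines.MysqlPipeLine': 300,
                  'scrapySpider.pipelines.MongoPipeLine': 301,
              },
          }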
      
      
    • items.py
      # Define here the models for your scraped items
      #
      # See documentation in:
      # https://docs.scrapy.org/en/latest/topics/items.html
      
      import scrapy
      
      
      class DoubanItem(scrapy.Item):
          # define the fields for your item here like:
          # name = scrapy.Field()
          title = scrapy.Field()
          rating = scrapy.Field()
          quote = scrapy.Field()
          intro = scrapy.Field()
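
      pipelines.py below imports ItemAdapter from itemadapter; a minimal sketch of the dict-like access it provides over a DoubanItem (the field values are placeholders):

      from itemadapter import ItemAdapter
      from scrapySpider.items import DoubanItem

      item = DoubanItem(title='some movie', rating='9.0', quote='a short quote', intro='a longer synopsis')
      adapter = ItemAdapter(item)
      print(adapter.get('title'))  # access a single field by name
      print(adapter.asdict())      # plain dict, handy for database inserts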
      
    • pipelines.py
      # Define your item pipelines here
      #
      # Don't forget to add your pipeline to the ITEM_PIPELINES setting
      # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
      
      
      # useful for handling different item types with a single interface
      from itemadapter import ItemAdapter
      import pymysql
      import pymongo
      
      video_spider = ['douban']
      
      class DoubanPipeline:
          def process_item(self, item, spider):
              print(item)
              return item
      
      
      class MysqlPipeLine:
      
          def open_spider(self, spider):
              self.spider = spider
              self.mysql = pymysql.connect(host='localhost',port=3306,user='root',password='root')
              self.cursor = self.mysql.cursor()
              # Create the video database and the spider's table
              if self.spider.name in video_spider:
                  self.create_db('video')
      
          '''Create the database'''
          def create_db(self, db_name):
              sql = f'''CREATE DATABASE IF NOT EXISTS {db_name}'''
              try:
                  self.cursor.execute(sql)
                  self.mysql.select_db(db_name)
                  if self.spider.name == 'douban':
                      self.create_douban_table()
              except Exception as e:
                  print(f'Failed to create database {db_name}: {e}')
      
          '''Create the douban table'''
          def create_douban_table(self):
              sql = f'''
              CREATE TABLE IF NOT EXISTS {self.spider.name}(
              id INT AUTO_INCREMENT,
              title VARCHAR(255),
              rating FLOAT,
              quote VARCHAR(255),
              intro TEXT,
              PRIMARY KEY(id)
              )
              '''
              try:
                  self.cursor.execute(sql)
              except Exception as e:
                  print(f'Failed to create the douban table: {e}')
      
          def process_item(self, item, spider):
              if spider.name == 'douban':
                  sql = f'''INSERT INTO {spider.name}(title,rating,quote,intro) VALUES(%(title)s,%(rating)s,%(quote)s,%(intro)s)'''
                  try:
                      item['rating'] = float(item['rating'])
                      self.cursor.execute(sql,dict(item))
                      self.mysql.commit()
                  except Exception as e:
                      print(f'Failed to insert "{item["title"]}": {e}')
                      self.mysql.rollback()
              return item
      
          def close_spider(self,spider):
              self.mysql.close()
      
      
      class MongoPipeLine:
      
          def open_spider(self, spider):
              self.spider = spider
              self.mongo = pymongo.MongoClient(host='localhost',port=27017)
              # Select the video database and the spider's collection
              if self.spider.name in video_spider:
                  self.cursor = self.mongo['video'][self.spider.name]
      
          def process_item(self, item, spider):
              try:
                  self.cursor.insert_one(dict(item))
              except Exception as e:
                  print(f'Failed to insert "{item["title"]}": {e}')
      
              return item
      
          def close_spider(self, spider):
              self.mongo.close()
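
      After a crawl, the documents written by MongoPipeLine can be spot-checked with a few lines of pymongo (a minimal sketch against the same localhost instance):

      import pymongo

      client = pymongo.MongoClient(host='localhost', port=27017)
      for doc in client['video']['douban'].find().limit(5):  # first five stored movies
          print(doc)
      client.close()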
      
    • middlewares.py
      # Define here the models for your spider middleware
      #
      # See documentation in:
      # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
      
      from scrapy import signals
      from fake_useragent import UserAgent
      from scrapy.http import Request,HtmlResponse
      from scrapySpider.kuaidaili import Kuaidaili
      
      # useful for handling different item types with a single interface
      from itemadapter import is_item, ItemAdapter
      
      class DoubanDownloaderMiddleware:
      
          def __init__(self):
              self.ua = UserAgent()
              self.kuaidaili = Kuaidaili()
              # Fetch an initial proxy IP
              self.first_ip = self.kuaidaili.get_ip()
      
          @classmethod
          def from_crawler(cls, crawler):
              s = cls()
              crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
              return s
      
          def process_request(self, request:Request, spider):
              # Set a random User-Agent
              request.headers['User-Agent'] = self.ua.random
              # Set the proxy
              request.meta['proxy'] = self.first_ip
              request.meta['download_timeout'] = 5
              spider.logger.info(f'ip:{request.meta["proxy"]}')
              return None
      
          def process_response(self, request, response:HtmlResponse, spider):
              spider.logger.info(f'ip:{request.meta["proxy"]}')
              if response.status == 200:
                  return response
              # The proxy has failed: fetch a new one and return the request so it is retried
              request.meta['proxy'] = self.kuaidaili.get_ip()
              request.meta['download_timeout'] = 2
              return request
      
      
          def spider_opened(self, spider):
              spider.logger.info(f'Spider opened: {spider.name}')
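
      The middleware above only rotates the proxy in process_response, i.e. after a response comes back. If the download fails outright (for example a timeout), Scrapy calls process_exception instead, so a similar hook could be added; a sketch of the idea (not in the original code), written as a subclass:

      from scrapy.http import Request
      from scrapySpider.middlewares import DoubanDownloaderMiddleware

      class ProxyRetryDownloaderMiddleware(DoubanDownloaderMiddleware):
          def process_exception(self, request: Request, exception, spider):
              # Called when the download raised an exception; rotate the proxy and retry.
              spider.logger.info(f'download error for {request.url}: {exception}')
              request.meta['proxy'] = self.kuaidaili.get_ip()
              return request  # returning a Request asks Scrapy to reschedule it

      To use it, point the DOWNLOADER_MIDDLEWARES entry in settings.py at this class instead of DoubanDownloaderMiddleware.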
      
      
  • Original article: https://blog.csdn.net/randy521520/article/details/133826803