• Scrapy 下载多层请求、多页数据


    思路

      	1. 确定数据结构 item
      	2. 写爬虫程序 spider
      		① 每一页的每一个详情页 url
      		② 翻页
      		③ 详情页匹配目标数据
      	3. 管道处理数据 pipelines
      		① 保存到 excel
      		② 下载图片
      	4. 配置设置 settings
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

    ①items.py

    import scrapy
    
    class Ftb2Item(scrapy.Item):
        """Fields collected for one match: a title plus name and logo URL for two teams."""
        game_title = scrapy.Field()  # title of the match detail page
        name_1 = scrapy.Field()  # home team name
        logo_src_1 = scrapy.Field()  # home team logo (image URL)
        name_2 = scrapy.Field()  # away team name
        logo_src_2 = scrapy.Field()  # away team logo (image URL)
        
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9

    ② ftb.py (spiders)

    import scrapy
    from ..items import Ftb2Item
    
    
    class FtbSpider(scrapy.Spider):
        """Crawl the mynba.tv video listing and scrape every match detail page.

        The listing is walked page by page (``?page=1`` .. ``?page=max_page``).
        Each listing page yields one request per match detail link, and each
        detail page yields one ``Ftb2Item``.
        """

        name = 'ftb'
        allowed_domains = ['mynba.tv']

        domain = 'http://www.mynba.tv'
        base_page_url = 'http://www.mynba.tv/video/?page='
        page = 1  # listing page currently being processed
        max_page = 10  # last listing page to request (inclusive)

        def start_requests(self):
            """Seed the crawl with the first listing page."""
            yield scrapy.Request(self.base_page_url + str(self.page), callback=self.parse)

        def parse(self, response):
            """Handle one listing page.

            Yields a request for every match detail link found on the page,
            followed by a request for the next listing page while the page
            limit has not been reached.
            """
            detail_urls = response.xpath('/html/body/div[1]/div[2]/div[2]/div[1]/div/a/@href').extract()

            # Follow every match detail link on this listing page.
            for detail_url in detail_urls:
                # urljoin resolves relative and absolute hrefs alike, unlike
                # plain string concatenation with the site root.
                detail_url = response.urljoin(detail_url)
                print("请求 " + detail_url + '详情页...')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail_info)

            # Move on to the next listing page. Comparing against max_page
            # *before* incrementing keeps the crawl to exactly max_page pages.
            if self.page < self.max_page:
                self.page += 1
                page_url = self.base_page_url + str(self.page)
                print('切换页面至:', page_url)
                yield scrapy.Request(url=page_url, callback=self.parse)

        def parse_detail_info(self, response):
            """Extract the title, team names and logo URLs from a detail page.

            Any value whose XPath matches nothing is ``None`` (the behaviour of
            ``extract_first``); the item is yielded to the pipelines regardless.
            """
            game_title = response.xpath('//*[@id="app"]/div/div[4]/div/div[2]/p[1]/text()').extract_first()
            name_1 = response.xpath('//*[@id="app"]/div/div[4]/div/div[1]/p/a/text()').extract_first()
            logo_src_1 = response.xpath('//*[@id="app"]/div/div[4]/div/div[1]/img/@src').extract_first()
            name_2 = response.xpath('//*[@id="app"]/div/div[4]/div/div[3]/a/text()').extract_first()
            logo_src_2 = response.xpath('//*[@id="app"]/div/div[4]/div/div[3]/img/@src').extract_first()
            # Hand the populated item over to the item pipelines.
            yield Ftb2Item(
                game_title=game_title,
                name_1=name_1,
                logo_src_1=logo_src_1,
                name_2=name_2,
                logo_src_2=logo_src_2,
            )
    
    
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47

    ③ pipelines.py

    import xlwt
    
    
    class Ftb2Pipeline:
        """Item pipeline that stores every item as one row of ``football_data.xls``."""

        def open_spider(self, spider):
            """Create the workbook and write the header row when the spider starts."""
            self.workbook = xlwt.Workbook()
            self.worksheet = self.workbook.add_sheet('sheet1')
            self.line_cnt = 0  # index of the next row to write
            self.col_name = ['game_title', 'name_1', 'logo_src_1', 'name_2', 'logo_src_2']
            # Header row: iterate over the column list itself so that every
            # column is written, however many there are.
            for col, title in enumerate(self.col_name):
                self.worksheet.write(self.line_cnt, col, title)
            self.line_cnt += 1

        def process_item(self, item, spider):
            """Append ``item`` as a new row and save the workbook.

            The item is always returned, even when writing fails, so that
            pipelines running after this one still receive it.
            """
            try:
                # Read every value before touching the sheet: a missing key
                # must not leave a half-written row behind.
                row = [item[field] for field in self.col_name]
                for col, value in enumerate(row):
                    self.worksheet.write(self.line_cnt, col, value)
                # Advance before saving so that a failed save cannot make the
                # next item write onto this already-populated row.
                self.line_cnt += 1
                # Saved after every item so that rows collected so far are on
                # disk even if the crawl is interrupted.
                self.workbook.save('football_data.xls')
            except Exception as e:
                # Deliberate best-effort: report the problem, keep crawling.
                print(e)
                print('写入失败!有残缺数据!已自动跳过!')
            return item
    
    
    
    import urllib.request  # 用于下载图片
    class Ftb2Pipeline_2:
        """Item pipeline that downloads both team logos of an item into ``img/``."""

        def process_item(self, item, spider):
            """Download the two logo images referenced by ``item``.

            Each file is named after the last path segment of its URL. The
            item is always returned, even when a download fails, so that
            pipelines running after this one still receive it.
            """
            import os  # local import: only needed to create the target directory

            try:
                img_src_1 = item['logo_src_1']
                img_src_2 = item['logo_src_2']
                name_1 = img_src_1.split('/')[-1]
                name_2 = img_src_2.split('/')[-1]
                # urlretrieve does not create missing directories.
                os.makedirs('img', exist_ok=True)
                # NOTE: urlretrieve is a blocking call and was observed to be
                # unreliable for this site.
                urllib.request.urlretrieve(img_src_1, filename=f'img/{name_1}')
                urllib.request.urlretrieve(img_src_2, filename=f'img/{name_2}')
                print('over!!')
            except Exception as e:
                # Deliberate best-effort: a missing logo URL or a failed
                # download is reported and the crawl carries on.
                print(e)
                print('下载失败!队徽或队名内容不存在!已自动跳过!')
            return item
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43

    ④ settings.py

    BOT_NAME = 'ftb_2'

    SPIDER_MODULES = ['ftb_2.spiders']
    NEWSPIDER_MODULE = 'ftb_2.spiders'

    # robots.txt is not consulted for this crawl.
    ROBOTSTXT_OBEY = False

    # Throttle requests: a base delay of 3 seconds, randomized by Scrapy so
    # that the request rhythm is not perfectly regular.
    DOWNLOAD_DELAY = 3
    RANDOMIZE_DOWNLOAD_DELAY = True

    # Pipelines run in ascending order of their value. Distinct values make
    # the order explicit: write the Excel row first, then download the logos.
    ITEM_PIPELINES = {
        'ftb_2.pipelines.Ftb2Pipeline': 300,
        'ftb_2.pipelines.Ftb2Pipeline_2': 400,
    }
    
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15

    ⑤ start.py (启动文件)

    from scrapy import cmdline

    # Launch the 'ftb' spider, exactly as `scrapy crawl ftb` would from a shell.
    crawl_argv = ['scrapy', 'crawl', 'ftb']
    cmdline.execute(crawl_argv)
    
    
    • 1
    • 2
    • 3

    Scrapy 基础链接: Python爬虫|Scrapy 基础用法

  • 相关阅读:
    走进乌镇峰会,《个人信息保护法》实施一周年实践与展望
    模板学堂|DataEase协助电商企业开展用户运营
    mysql之主从复制和读写分离搭建
    设计模式之访问者模式
    攻防世界-WEB-php_rce
    stm32 st7735驱动 详解
    Java编程练习题:Demo96 - Demo105(多维数组)
    帆软报表实现通过js查询数据库设置表格数据
    【Java面试】如何理解Spring Boot中的Starter?
    链接元宇宙,开启新纪元
  • 原文地址:https://blog.csdn.net/Syc1102g/article/details/126155280