• Advanced Deep Dive -- day40

    This installment builds a small Scrapy project (Cosplay) that crawls cosplay posts from bcy.net and downloads every image in each post.


    items.py

        import scrapy

        class CoserItem(scrapy.Item):
            url = scrapy.Field()          # source post URL
            name = scrapy.Field()         # post title
            info = scrapy.Field()         # post metadata text
            image_urls = scrapy.Field()   # image source URLs collected by the spider
            images = scrapy.Field()       # local file paths filled in by the pipeline
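    The image_urls/images pair follows the field-name convention of Scrapy's built-in ImagesPipeline: the spider fills image_urls, and a pipeline later fills images with the resulting local paths. A quick sanity check of the item definition in a Python shell (the values are illustrative only):

        >>> from Cosplay.items import CoserItem
        >>> item = CoserItem(name=u'example', image_urls=['http://img.bcy.net/example.jpg'])
        >>> dict(item)
        {'image_urls': ['http://img.bcy.net/example.jpg'], 'name': u'example'}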

    spiders/coser.py

        # -*- coding: utf-8 -*-
        import scrapy
        from scrapy.selector import Selector
        # In old Scrapy versions this lived at scrapy.contrib.loader; the
        # contrib package has since been removed:
        from scrapy.loader import ItemLoader
        from Cosplay.items import CoserItem


        class CoserSpider(scrapy.Spider):
            name = "coser"
            allowed_domains = ["bcy.net"]
            start_urls = (
                'http://bcy.net/cn125101',
                'http://bcy.net/cn126487',
                'http://bcy.net/cn126173',
            )

            def parse(self, response):
                sel = Selector(response)
                # Collect the link of every work listed on the profile page.
                for link in sel.xpath("//ul[@class='js-articles l-works']"
                                      "/li[@class='l-work--big']"
                                      "/article[@class='work work--second-created']"
                                      "/h2[@class='work__title']/a/@href").extract():
                    link = 'http://bcy.net%s' % link
                    request = scrapy.Request(link, callback=self.parse_item)
                    yield request

            def parse_item(self, response):
                item = ItemLoader(item=CoserItem(), response=response)
                item.add_xpath('name', "//h1[@class='js-post-title']/text()")
                item.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
                urls = item.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
                # Drop the /w650 suffix to get the full-resolution image URL.
                urls = [url.replace('/w650', '') for url in urls]
                item.add_value('image_urls', urls)
                item.add_value('url', response.url)
                return item.load_item()
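    The XPath expressions above are tied to bcy.net's markup at the time of writing and will break if the site changes, so it helps to verify them interactively before a full crawl. A sketch using scrapy shell (the expression is copied from parse above; an empty list means the selector no longer matches):

        scrapy shell 'http://bcy.net/cn125101'
        >>> response.xpath("//ul[@class='js-articles l-works']"
        ...                "/li[@class='l-work--big']"
        ...                "/article[@class='work work--second-created']"
        ...                "/h2[@class='work__title']/a/@href").extract()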
    pipelines.py

        import os

        import requests

        from Cosplay import settings


        class ImageDownloadPipeline(object):
            def process_item(self, item, spider):
                if 'image_urls' in item:
                    images = []
                    # One sub-directory per spider under IMAGES_STORE.
                    dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
                    if not os.path.exists(dir_path):
                        os.makedirs(dir_path)
                    for image_url in item['image_urls']:
                        # Build a file name from the URL path segments.
                        us = image_url.split('/')[3:]
                        image_file_name = '_'.join(us)
                        file_path = '%s/%s' % (dir_path, image_file_name)
                        images.append(file_path)
                        if os.path.exists(file_path):
                            continue  # already downloaded
                        # Stream the image to disk in 1 KB chunks.
                        with open(file_path, 'wb') as handle:
                            response = requests.get(image_url, stream=True)
                            for block in response.iter_content(1024):
                                if not block:
                                    break
                                handle.write(block)
                    item['images'] = images
                return item
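    Note that this hand-rolled pipeline downloads with blocking requests calls, which stalls Scrapy's event loop on every image. Scrapy's built-in ImagesPipeline consumes the same image_urls/images field pair, downloads through Scrapy's asynchronous downloader, and names files by a hash of the URL. Swapping it in would be a settings-only change (a sketch; it additionally requires Pillow):

        # settings.py -- alternative to the custom pipeline above
        ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
        IMAGES_STORE = '../Images'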
    settings.py

        ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}
        IMAGES_STORE = '../Images'
        DOWNLOAD_DELAY = 0.25  # 250 ms of delay
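    The integer assigned to each pipeline in ITEM_PIPELINES is its run order: lower values run earlier, and the conventional range is 0-1000. DOWNLOAD_DELAY throttles successive requests to the same site. If more pipelines were added later, the dict would grow accordingly; a purely illustrative sketch (SaveInfoPipeline is hypothetical and does not exist in this project):

        ITEM_PIPELINES = {
            'Cosplay.pipelines.ImageDownloadPipeline': 1,   # runs first
            # 'Cosplay.pipelines.SaveInfoPipeline': 300,    # hypothetical later stage
        }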
    Create a main.py file in the project root for debugging:

        from scrapy import cmdline

        cmdline.execute('scrapy crawl coser'.split())
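    cmdline.execute re-runs Scrapy's command line in the current process, which makes the crawl steppable from an IDE debugger. An equivalent, more explicit variant (a sketch using Scrapy's crawler API):

        from scrapy.crawler import CrawlerProcess
        from scrapy.utils.project import get_project_settings

        from Cosplay.spiders.coser import CoserSpider

        # Load the project's settings.py and run the spider to completion.
        process = CrawlerProcess(get_project_settings())
        process.crawl(CoserSpider)
        process.start()  # blocks until the crawl finishes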
    Run the program (the code above targets Python 2):

        python2 main.py
  • Original article: https://blog.csdn.net/qq_41813416/article/details/134066443