items.py
class CoserItem(scrapy.Item):
    """Container for one scraped cosplay post.

    Declares every field the spider's ItemLoader populates; adding a
    value for an undeclared field raises KeyError at load time.
    """
    # Post title from the work page's <h1>.
    name = scrapy.Field()
    # Post metadata text (type/info spans).
    info = scrapy.Field()
    # Full-size image URLs consumed by ImageDownloadPipeline.
    image_urls = scrapy.Field()
    # URL of the page the item was scraped from.
    url = scrapy.Field()
spiders/coser.py
from scrapy.selector import Selector
from scrapy.contrib.loader import ItemLoader
from Cosplay.items import CoserItem
class CoserSpider(scrapy.Spider):
    """Crawl bcy.net coser profile pages and scrape each work's images."""

    # Spider identifier used by `scrapy crawl coser`.
    name = "coser"
    allowed_domains = ["bcy.net"]
    # Profile pages to start from; the URL list must actually be
    # assigned to start_urls for Scrapy to schedule it.
    start_urls = [
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173',
    ]

    def parse(self, response):
        """Follow every work link on a profile page to parse_item."""
        # Select the href of each work entry in the profile listing.
        # (`response.xpath`, not an undefined standalone selector.)
        for link in response.xpath(
            "//ul[@class='js-articles l-works']/li[@class='l-work--big']"
            "/article[@class='work work--second-created']"
            "/h2[@class='work__title']/a/@href"
        ).extract():
            # hrefs are site-relative; prefix the host.
            link = 'http://bcy.net%s' % link
            # Requests must be yielded back to the engine to be crawled.
            yield scrapy.Request(link, callback=self.parse_item)

    def parse_item(self, response):
        """Extract title, info, and full-size image URLs from a work page."""
        loader = ItemLoader(item=CoserItem(), response=response)
        loader.add_xpath('name', "//h1[@class='js-post-title']/text()")
        loader.add_xpath(
            'info',
            "//div[@class='post__info']/div[@class='post__type "
            "post__info-group']/span/text()")
        urls = loader.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        # Strip the '/w650' thumbnail suffix to get the original-size image.
        urls = [url.replace('/w650', '') for url in urls]
        loader.add_value('image_urls', urls)
        loader.add_value('url', response.url)
        # Yield the loaded item so it reaches the item pipelines.
        yield loader.load_item()
pipelines.py
from Cosplay import settings
class ImageDownloadPipeline(object):
    """Download each item's image_urls into IMAGES_STORE/<spider name>/."""

    def process_item(self, item, spider):
        """Stream-download every image, skipping files that already exist.

        Returns the item unchanged so later pipelines still receive it.
        """
        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        # Create the per-spider directory on first use.
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for image_url in item['image_urls']:
            # Build a filename from the URL path components
            # (split drops 'http:', '', and the host at indices 0-2).
            us = image_url.split('/')[3:]
            image_file_name = '_'.join(us)
            file_path = '%s/%s' % (dir_path, image_file_name)
            # Skip images downloaded on a previous run; the original
            # check was inverted and would never write new files.
            if os.path.exists(file_path):
                continue
            with open(file_path, 'wb') as handle:
                response = requests.get(image_url, stream=True)
                for block in response.iter_content(1024):
                    if not block:
                        break
                    # Actually persist the chunk to disk.
                    handle.write(block)
        return item
settings.py
# Enable the custom download pipeline (lower number = higher priority).
ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}
# Root directory where downloaded images are stored, relative to the
# directory the crawl is launched from.
IMAGES_STORE = '../Images'
在项目根目录下新建 main.py 文件，用于调试。
from scrapy import cmdline

# Debug entry point: launches `scrapy crawl coser` through the Scrapy CLI.
# Guarded so importing this module does not start a crawl as a side effect.
if __name__ == '__main__':
    cmdline.execute('scrapy crawl coser'.split())
执行程序
python2 main.py