A Python Crawler Demo, Using a Movie Website as an Example


    Author homepage: 编程指南针

    About the author: quality creator in the Java field, CSDN blog expert, invited author on Juejin, many years of experience in architecture design, resident instructor at Tencent Classroom

    Main content: Java projects, graduation projects, résumé templates, study materials, interview question banks, technical help

    Source code available at the end of the article

    1. Project Overview

       The project uses Python to scrape Douban movie data and store it in a local MySQL database.

       Database schema preparation (a small helper sketch for loading this script follows the SQL below):

       

    CREATE TABLE IF NOT EXISTS `categories` (
      `id` int(11) NOT NULL PRIMARY KEY,
      `type` varchar(255) NOT NULL DEFAULT ''
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;

    CREATE TABLE IF NOT EXISTS `movies` (
      `id` int(11) NOT NULL PRIMARY KEY AUTO_INCREMENT,
      `cover` varchar(255) NOT NULL DEFAULT '',
      `title` varchar(50) NOT NULL DEFAULT '',
      `date` varchar(10) NOT NULL DEFAULT '',
      `rate` float DEFAULT 0,
      `director` varchar(100) NOT NULL DEFAULT '',
      `scriptwriter` varchar(100) NOT NULL DEFAULT '',
      `actors` text,
      `district` varchar(255) DEFAULT '',
      `language` varchar(255) DEFAULT '',
      `duration` varchar(100) DEFAULT '',
      `abs` text,
      UNIQUE (`title`)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;

    CREATE TABLE IF NOT EXISTS `movie-category` (
      `id` BIGINT NOT NULL PRIMARY KEY AUTO_INCREMENT,
      `mid` int(11) NOT NULL,
      `cid` int(11) NOT NULL,
      KEY `fk_on_movie_id` (`mid`),
      CONSTRAINT `fk_on_movie_id` FOREIGN KEY (`mid`) REFERENCES `movies` (`id`) ON DELETE CASCADE ON UPDATE CASCADE,
      KEY `fk_on_category_id` (`cid`),
      CONSTRAINT `fk_on_category_id` FOREIGN KEY (`cid`) REFERENCES `categories` (`id`) ON DELETE CASCADE ON UPDATE CASCADE
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8;

    INSERT INTO `categories` VALUES (1,'剧情');
    INSERT INTO `categories` VALUES (2,'喜剧');
    INSERT INTO `categories` VALUES (3,'动作');
    INSERT INTO `categories` VALUES (4,'爱情');
    INSERT INTO `categories` VALUES (5,'科幻');
    INSERT INTO `categories` VALUES (6,'动画');
    INSERT INTO `categories` VALUES (7,'悬疑');
    INSERT INTO `categories` VALUES (8,'惊悚');
    INSERT INTO `categories` VALUES (9,'恐怖');
    INSERT INTO `categories` VALUES (10,'犯罪');
    INSERT INTO `categories` VALUES (11,'同性');
    INSERT INTO `categories` VALUES (12,'音乐');
    INSERT INTO `categories` VALUES (13,'歌舞');
    INSERT INTO `categories` VALUES (14,'传记');
    INSERT INTO `categories` VALUES (15,'历史');
    INSERT INTO `categories` VALUES (16,'战争');
    INSERT INTO `categories` VALUES (17,'西部');
    INSERT INTO `categories` VALUES (18,'奇幻');
    INSERT INTO `categories` VALUES (19,'冒险');
    INSERT INTO `categories` VALUES (20,'灾难');
    INSERT INTO `categories` VALUES (21,'武侠');
    INSERT INTO `categories` VALUES (22,'情色');
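
    The schema and seed data above can be loaded with any MySQL client. As a convenience, the sketch below (a hypothetical helper script, not part of the original project) creates the `fivesix` database that pipelines.py later connects to and executes the statements, assuming they are saved to a local file named schema.sql:

    import pymysql

    # Hypothetical helper: create the database used by pipelines.py and run the
    # DDL/INSERT script shown above (assumed to be saved as "schema.sql").
    connection = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                 password='root', charset='utf8')
    try:
        with connection.cursor() as cursor:
            cursor.execute("CREATE DATABASE IF NOT EXISTS `fivesix` DEFAULT CHARSET utf8")
            cursor.execute("USE `fivesix`")
            with open('schema.sql', encoding='utf8') as f:
                # a naive split on ';' is enough here because the script
                # contains no semicolons inside string literals
                for statement in f.read().split(';'):
                    if statement.strip():
                        cursor.execute(statement)
        connection.commit()
    finally:
        connection.close()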

    2. Environment

    Language: Python 3.7 + Scrapy

    Database: MySQL 5.7

    Development tools: IDEA or Eclipse

    3. Core Code Walkthrough

    Data model: items.py

    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/items.html
    import scrapy


    class DoubanItem(scrapy.Item):
        # define the fields for your item here like:
        # name = scrapy.Field()
        # movie title
        title = scrapy.Field()
        # director
        director = scrapy.Field()
        # scriptwriter
        scriptwriter = scrapy.Field()
        # actors
        actors = scrapy.Field()
        # release date
        date = scrapy.Field()
        # rating
        rate = scrapy.Field()
        # country/region
        district = scrapy.Field()
        # language
        language = scrapy.Field()
        # cover image
        cover = scrapy.Field()
        # synopsis
        abs = scrapy.Field()
        # genres
        categories = scrapy.Field()
        # runtime
        duration = scrapy.Field()
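
    A quick usage note: scrapy.Item instances behave like dictionaries but only accept the fields declared above, which is why the spider later keeps only labels that map to declared fields before calling update(). A tiny illustration with made-up values:

    from douban.items import DoubanItem

    item = DoubanItem()
    item['title'] = '示例电影'        # declared field: accepted
    item['rate'] = '8.5'
    # item['imdb'] = 'tt0000000'      # undeclared field: would raise KeyError
    print(dict(item))                 # {'title': '示例电影', 'rate': '8.5'}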

    Item pipelines (data storage): pipelines.py

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    # useful for handling different item types with a single interface
    from scrapy.exceptions import DropItem
    from scrapy.http import Request
    from scrapy.pipelines.images import ImagesPipeline
    import pymysql


    class DoubanPipeline:
        def process_item(self, item, spider):
            return item


    # Re-request the scraped cover URL and download the image to local storage
    class DownloadImagePipeline(ImagesPipeline):
        default_headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
            # "Cookie": '_vwo_uuid_v2=D65EBF690D9454DE4C13354E37DC5B9AA|3bb7e6e65f20e31141b871b4fea88dc2; __yadk_uid=QBp8bLKHjCn5zS2J5r8xV7327R0wnqkU; douban-fav-remind=1; gr_user_id=0a41d8d1-fe39-4619-827a-17961cf31795; viewed="35013197_10769749_23008813_26282806_34912177_22139960_35003794_30249691_26616244_27035127"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.21320; bid=gplG4aEN4Xc; ll="108288"; ap_v=0,6.0; __utma=30149280.819011260.1572087992.1604448803.1604453561.105; __utmc=30149280; __utmz=30149280.1604453561.105.65.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gads=ID=eddb65558a1da756-223ab4f88bc400c8:T=1604453562:RT=1604453562:S=ALNI_MZGB_I69qmiL2tt3lm57JVX1i4r2w; __utmb=30149280.4.10.1604453561; dbcl2="213202515:Ip9mjwUAab4"; ck=wxUS; __utma=223695111.897479705.1572088003.1604448803.1604455298.71; __utmb=223695111.0.10.1604455298; __utmc=223695111; __utmz=223695111.1604455298.71.42.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1604455298%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _pk_id.100001.4cf6=e11874c5506d4ab1.1572088003.71.1604455342.1604450364.'
        }

        def get_media_requests(self, item, info):
            image_url = item['cover']
            yield Request(
                image_url,
                headers=self.default_headers)

        # called once get_media_requests has finished downloading
        def item_completed(self, results, item, info):
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem("Item contains no images")
            # The returned path has the form "full/<filename>". Images are downloaded
            # one item at a time here, so there is only a single path, but the API is
            # designed for multiple images and therefore returns a list. "full" is
            # replaced with the address of my own backend file API so that the value
            # stored in the database can be served directly.
            image_paths = str(image_paths[0]).replace('full', 'http://localhost:8443/api/file')
            item['cover'] = image_paths
            return item


    # Store the movie information in the database
    class DBPipeline(object):
        def __init__(self):
            # connect to the database: host, user, password, database name, charset, port
            self.connect = pymysql.connect(host='127.0.0.1', user='root', password='root',
                                           db='fivesix', charset='utf8', port=3306)
            # get cursors
            self.cursor_1 = self.connect.cursor()
            self.cursor_2 = self.connect.cursor()
            # genre name -> id in the `categories` table
            self.type_to_id = {
                '剧情': 1, '喜剧': 2, '动作': 3,
                '爱情': 4, '科幻': 5, '动画': 6,
                '悬疑': 7, '惊悚': 8, '恐怖': 9,
                '犯罪': 10, '同性': 11, '音乐': 12,
                '歌舞': 13, '传记': 14, '历史': 15,
                '战争': 16, '西部': 17, '奇幻': 18,
                '冒险': 19, '灾难': 20, '武侠': 21, '情色': 22
            }
            print("Database connection established")

        def process_item(self, item, spider):
            if item['title'] == '':
                return item
            # SQL statements
            insert_movie_sql = """
                insert ignore into `movies`(cover,title,director,scriptwriter,actors,district,rate,date,language,duration,abs)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """
            insert_mc_sql = """
                insert into `movie-category` (mid,cid) values (%s,%s)
            """
            # insert the movie row
            self.cursor_1.execute(insert_movie_sql, (item['cover'], item['title'], item['director'], item['scriptwriter'],
                                                     item['actors'], item['district'], item['rate'],
                                                     item['date'], item['language'], item['duration'], item['abs']))
            mid = self.cursor_1.lastrowid
            # map genre names to category ids
            cids = []
            categories = item['categories'].split('/')
            for c in categories:
                if c not in self.type_to_id.keys():
                    continue
                cids.append(self.type_to_id.get(c))
            # insert the movie-category association rows
            print(cids)
            for cid in cids:
                self.cursor_2.execute(insert_mc_sql, (mid, cid))
            # commit, otherwise nothing is persisted
            self.connect.commit()
            return item

        def close_spider(self, spider):
            # close the cursors and the connection
            self.cursor_1.close()
            self.cursor_2.close()
            self.connect.close()
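
    These pipelines only run if they are registered in the project's settings.py. The sketch below shows the relevant settings; the module path douban.pipelines follows from the douban package used in the imports, while the priorities, image directory and download delay are assumed values, not taken from the original project:

    # settings.py (sketch with assumed values)
    BOT_NAME = 'douban'
    SPIDER_MODULES = ['douban.spiders']
    NEWSPIDER_MODULE = 'douban.spiders'

    ITEM_PIPELINES = {
        'douban.pipelines.DownloadImagePipeline': 100,   # download the cover first
        'douban.pipelines.DBPipeline': 300,              # then write the item to MySQL
    }

    # required by ImagesPipeline: local directory for the downloaded covers
    IMAGES_STORE = './images'

    # slow the crawl down a little to reduce the chance of the IP being blocked
    DOWNLOAD_DELAY = 3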

    Spider core code: movies.py

    # -*- coding: utf-8 -*-
    import scrapy
    import json
    import re
    import time
    from douban.items import DoubanItem


    class MovieHotSpider(scrapy.Spider):
        # spider name, used to run the spider from the command line
        name = "movie_hot"
        allowed_domains = ["movie.douban.com"]
        # pro = ['139.224.37.83','115.223.7.110','221.122.91.75']
        # template for building the Douban movie list API URL
        BASE_URL = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%s&sort=recommend&page_limit=%s&page_start=%s'
        MOVIE_TAG = '华语'
        PAGE_LIMIT = 20
        page_start = 0
        domains = BASE_URL % (MOVIE_TAG, PAGE_LIMIT, page_start)
        # pretend to be a regular browser
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
            # ,"Cookie": '_vwo_uuid_v2=D65EBF690D9454DE4C13354E37DC5B9AA|3bb7e6e65f20e31141b871b4fea88dc2; __yadk_uid=QBp8bLKHjCn5zS2J5r8xV7327R0wnqkU; douban-fav-remind=1; gr_user_id=0a41d8d1-fe39-4619-827a-17961cf31795; viewed="35013197_10769749_23008813_26282806_34912177_22139960_35003794_30249691_26616244_27035127"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.21320; bid=gplG4aEN4Xc; ll="108288"; ap_v=0,6.0; __utma=30149280.819011260.1572087992.1604448803.1604453561.105; __utmc=30149280; __utmz=30149280.1604453561.105.65.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __gads=ID=eddb65558a1da756-223ab4f88bc400c8:T=1604453562:RT=1604453562:S=ALNI_MZGB_I69qmiL2tt3lm57JVX1i4r2w; __utmb=30149280.4.10.1604453561; dbcl2="213202515:Ip9mjwUAab4"; ck=wxUS; __utma=223695111.897479705.1572088003.1604448803.1604455298.71; __utmb=223695111.0.10.1604455298; __utmc=223695111; __utmz=223695111.1604455298.71.42.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1604455298%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; _pk_id.100001.4cf6=e11874c5506d4ab1.1572088003.71.1604455342.1604450364.'
        }
        # total number of list pages to crawl
        pages = 100

        # crawling starts here
        def start_requests(self):
            print('~~~~ crawling list: ' + self.domains)
            yield scrapy.Request(
                url=self.domains,
                headers=self.headers,
                callback=self.request_movies
            )

        # parse a list page
        def request_movies(self, response):
            infos = response.text
            # parse the JSON response
            infos = json.loads(infos)
            # iterate over the movie summaries
            for movie_info in infos['subjects']:
                print('~~~ crawling movie: ' + movie_info['title'] + '/' + movie_info['rate'])
                # request the detail page of each movie; request_movie parses it
                yield scrapy.Request(
                    url=str(movie_info['url']),
                    headers=self.headers,
                    callback=self.request_movie,
                    dont_filter=True
                )
            # request the next list page unless the page budget is used up
            # or the current tag returned fewer movies than a full page
            if self.pages > 0 and len(infos['subjects']) == self.PAGE_LIMIT:
                self.pages -= 1
                self.page_start += self.PAGE_LIMIT
                url = self.BASE_URL % (self.MOVIE_TAG, self.PAGE_LIMIT, self.page_start)
                time.sleep(5)
                print('----- crawling list: ' + url)
                yield scrapy.Request(
                    url=url,
                    headers=self.headers,
                    callback=self.request_movies,
                    dont_filter=True
                )

        # parse a detail page
        def request_movie(self, response):
            # assemble the item
            movie_item = DoubanItem()
            # fields that live outside the #info block
            title = response.css('div#content>h1>span:nth-child(1)::text').extract_first()
            t = re.findall('[\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\u4e00-\u9fa5_0-9]', title)
            movie_item['title'] = ''.join(t)
            movie_item['date'] = response.css('div#content>h1>span.year::text').extract_first()[1:-1]
            movie_item['rate'] = response.css('strong.rating_num::text').extract_first()
            # movie_item['commentCount'] = response.css('div.rating_sum>a.rating_people>span::text').extract_first()
            # movie_item['start'] = '/'.join(response.css('span.rating_per::text').extract())
            # movie_item['better'] = '/'.join(response.css('div.rating_betterthan>a::text').extract())
            movie_item['abs'] = response.css('#link-report>span::text').extract_first().strip()
            movie_item['cover'] = response.css('#mainpic>a>img::attr(src)').extract_first()
            # the whole #info block as a single string
            info = response.css('div.subject div#info').xpath('string(.)').extract_first()
            # extract all field labels
            fields = [s.strip().replace(':', '') for s in response.css('div#info span.pl::text').extract()]
            # extract the value of each field
            values = [re.sub(r'\s+', '', s.strip()) for s in re.split(r'\s*(?:%s):\s*' % '|'.join(fields), info)][1:]
            # map the Chinese labels to the item field names
            label_to_field = {
                '导演': 'director', '编剧': 'scriptwriter', '主演': 'actors',
                '类型': 'categories', '制片国家/地区': 'district',
                '语言': 'language', '片长': 'duration',
            }
            fields = [label_to_field.get(f, f) for f in fields]
            # fill the mapped fields into the item; labels that are not declared on
            # DoubanItem (e.g. 'IMDb链接', '上映日期', '官方网站', '又名') are skipped,
            # which also avoids the pitfall of removing elements from a list while
            # iterating over it
            other_info = list(zip(fields, values))
            final_info = {field: value for field, value in other_info if field in DoubanItem.fields}
            movie_item.update(final_info)
            # fill in defaults for fields missing from the page
            for field in ('director', 'scriptwriter', 'actors', 'categories',
                          'district', 'language', 'duration'):
                if field not in movie_item.keys():
                    movie_item[field] = '/'
            print('~ finished movie: ' + movie_item['title'] + '/' + movie_item['rate'])
            # hand the item over to the item pipelines
            yield movie_item
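
    The spider is normally started from the project root with the command scrapy crawl movie_hot. If you prefer launching it from a plain Python script, a sketch using Scrapy's CrawlerProcess API looks like this (assuming movies.py lives under douban/spiders/):

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # assumed module path: douban/spiders/movies.py
    from douban.spiders.movies import MovieHotSpider

    # run the spider with the project's settings.py, so the pipelines are applied
    process = CrawlerProcess(get_project_settings())
    process.crawl(MovieHotSpider)
    process.start()   # blocks until the crawl finishes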

    4. Project Summary

      The scraped data ends up in the MySQL tables defined above, and a separate program can be written to present it. Note that Douban rate-limits by IP: after roughly 200 requests the IP is blocked, so you need to switch to another IP (or route requests through proxies, as sketched below) to keep crawling. The project mainly demonstrates the basic conventions and syntax of writing a crawler; it is relatively simple and is intended for learning and reference.
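
      One common way to keep crawling after an IP has been throttled is to rotate requests across a pool of proxies. Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy'], so a small downloader middleware is enough. The sketch below is an illustration only, not part of the original project, and the proxy addresses are placeholders, not working proxies:

    # middlewares.py (sketch): rotate requests across a pool of HTTP proxies
    import random

    class RandomProxyMiddleware:
        # placeholder addresses -- replace with real proxies before use
        PROXIES = [
            'http://127.0.0.1:8001',
            'http://127.0.0.1:8002',
        ]

        def process_request(self, request, spider):
            # the built-in HttpProxyMiddleware picks the proxy up from request.meta
            request.meta['proxy'] = random.choice(self.PROXIES)
            return None   # let the request continue through the normal download path

    # enable it in settings.py, for example:
    # DOWNLOADER_MIDDLEWARES = {'douban.middlewares.RandomProxyMiddleware': 350}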

    Original article: https://blog.csdn.net/whirlwind526/article/details/126366643