• Scrapy + Selenium + Chaojiying (超级鹰) CAPTCHA recognition: crawling a website


    I. Installing Scrapy

    1. Install on Windows

    pip install Scrapy

    2. Install Selenium

    pip install selenium

    3. Download the Chrome driver

             a. Check your Google Chrome browser version

                    ChromeDriver download page: http://chromedriver.storage.googleapis.com/index.html

             b. Find the package whose version is closest to your browser's

     

             c. After downloading, place chromedriver in the same directory as your Python installation

             d. Add that directory to your PATH environment variable (a quick sanity check is sketched below)
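
    A quick way to confirm the driver is set up correctly is to let Selenium start Chrome once. The snippet below is only a minimal check, assuming chromedriver is on your PATH (the target URL is just an example):

    # Minimal check: start Chrome via Selenium, load a page, print its title, then quit.
    from selenium.webdriver import Chrome

    web = Chrome()                     # picks up chromedriver from PATH
    web.get('https://www.baidu.com')   # any reachable page will do
    print(web.title)                   # prints the page title if driver and browser versions match
    web.quit()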

     4. Chaojiying (超级鹰) CAPTCHA recognition

            a. Chaojiying website: https://www.chaojiying.com/

            b. Register and log in

            c. Generate a software ID

            d. Download their Python SDK and place it in the spider project directory (a usage sketch follows)
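
    The SDK is a chaojiying.py module exposing a Chaojiying_Client class; the spider later in this post imports it from the spiders package. Here is a minimal standalone usage sketch, assuming you have saved a CAPTCHA image as captcha.png and filled in your own account, password, and software ID:

    # Sketch of calling the Chaojiying SDK directly (account/password/soft_id are placeholders).
    from chaojiying import Chaojiying_Client

    client = Chaojiying_Client('(Chaojiying account)', '(Chaojiying password)', '(software id)')
    with open('captcha.png', 'rb') as f:
        img = f.read()                  # raw image bytes, same form as screenshot_as_png returns
    result = client.PostPic(img, 1902)  # 1902 is the CAPTCHA type code used later in this post
    print(result['pic_str'])            # the recognized CAPTCHA text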

    II. Generating the Scrapy project

    1. Press Win + R and open a command prompt

    2. Enter the following commands

    # change to whatever directory you want
    cd C:\Users\(username)\Desktop\spider
    # create the project
    scrapy startproject (project name)
    # change into the newly created folder
    cd hellospider
    # generate the spider
    scrapy genspider (spider name) (domain of the target site)
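
    For orientation, scrapy startproject lays out the project roughly like this (using the hellospider name from the commands above; the exact boilerplate may vary slightly with the Scrapy version):

    hellospider/
        scrapy.cfg                # deploy configuration
        hellospider/
            __init__.py
            items.py              # item field definitions
            middlewares.py
            pipelines.py          # item pipelines (database storage below)
            settings.py           # project settings
            spiders/
                __init__.py       # generated spiders live here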

     3. Open the project in PyCharm

    4. Switch the project interpreter to a virtual environment

    File -> Settings

     Then, in PyCharm's built-in terminal, install Scrapy and Selenium again so they are available inside the virtual environment, as shown below.
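
    pip install scrapy selenium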

    III. Crawling a website (the project below is one I created earlier, not the one just generated)

    1. Modify settings.py

    # Ignore the robots.txt protocol
    ROBOTSTXT_OBEY = False
    # Download delay
    DOWNLOAD_DELAY = 1
    # When enabled, Scrapy waits a random amount of time between requests to the same site:
    # a random value between 0.5 and 1.5 multiplied by DOWNLOAD_DELAY
    RANDOMIZE_DOWNLOAD_DELAY = True
    # If a request takes longer than 60 seconds, a timeout exception is raised
    # and the retry mechanism will re-issue the request
    DOWNLOAD_TIMEOUT = 60
    # Default request headers
    DEFAULT_REQUEST_HEADERS = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    }
    # Enable the item pipeline
    ITEM_PIPELINES = {
        # '(project name).pipelines.(pipeline class)': 300,
        'chuxiongfilespider.pipelines.ChuxiongfilespiderPipeline': 300,
    }

     2. The items.py file

    Define the fields the spider needs (a sketch follows).
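
    A minimal sketch of items.py covering the fields referenced by the spider and pipeline below (the class name ChuxiongfilespiderItem matches the import used in the spider):

    import scrapy

    class ChuxiongfilespiderItem(scrapy.Item):
        # Fields used by the spider and pipeline in this post
        name = scrapy.Field()            # document title
        policy_id = scrapy.Field()       # UUID derived from the title
        attachment_id = scrapy.Field()
        url = scrapy.Field()             # detail page URL
        attachment_url = scrapy.Field()  # attachment download URL
        netloc = scrapy.Field()          # site identifier
        content = scrapy.Field()         # detail page HTML content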

    3. Write the spider file, chuxiongfile.py

    import copy
    import time
    import uuid

    import scrapy
    from pymysql.converters import escape_string
    from scrapy.http import HtmlResponse
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver import Chrome
    from selenium.webdriver.common.by import By

    from chuxiongfilespider.items import ChuxiongfilespiderItem
    from chuxiongfilespider.spiders.chaojiying import Chaojiying_Client


    class ChuxiongfileSpider(scrapy.Spider):
        name = 'chuxiongfile'
        allowed_domains = ['(domain)']
        start_urls = ['(URL to crawl)']
        page = 1

        def start_requests(self):
            web = Chrome()
            web.get(self.start_urls[0])
            try:
                # Selenium was updated: the old find_element_by_xpath must be rewritten
                # as find_element with the By class imported
                web.find_element(By.XPATH, '/html/body/form/div/img')
                # screenshot_as_png returns a screenshot of the element as binary PNG data
                img = web.find_element(By.XPATH, '/html/body/form/div/img').screenshot_as_png
                # Let Chaojiying recognize the CAPTCHA
                chaojiying = Chaojiying_Client('(Chaojiying account)', '(Chaojiying password)', '(software id)')
                # 1902 is the CAPTCHA type code
                dic = chaojiying.PostPic(img, 1902)
                verify_code = dic['pic_str']
                # Fill in the CAPTCHA
                web.find_element(By.XPATH, '//*[@id="visitcode"]').send_keys(verify_code)
                # Click confirm
                time.sleep(2)
                web.find_element(By.XPATH, '/html/body/form/div/input[4]').click()
                # Grab the cookies set after the CAPTCHA was accepted
                cookies_dict = {cookie['name']: cookie['value'] for cookie in web.get_cookies()}
                web.close()
                yield scrapy.Request(url=self.start_urls[0], cookies=cookies_dict, callback=self.parse)
            except NoSuchElementException:
                # No CAPTCHA on the page: request it directly
                yield scrapy.Request(url=self.start_urls[0], callback=self.parse)

        def parse(self, response: HtmlResponse, **kwargs):
            items = ChuxiongfilespiderItem()
            for item in response.css('.tml'):
                items['name'] = item.css('.tcc a::text').extract()[0]
                items['policy_id'] = ''.join(str(uuid.uuid5(uuid.NAMESPACE_DNS, items['name'])).split('-'))
                items['attachment_id'] = '123'
                items['url'] = response.urljoin(item.css('.tcc a::attr(href)').extract_first())
                if item.css('.d a::attr(href)').extract_first() == '':
                    items['attachment_url'] = '无下载选项'  # "no download available"
                else:
                    items['attachment_url'] = response.urljoin(item.css('.d a::attr(href)').extract_first())
                items['netloc'] = '(domain)'
                # deepcopy so every request carries its own snapshot of the shared item
                yield scrapy.Request(url=items['url'], callback=self.get_details,
                                     meta={"items": copy.deepcopy(items)})

        def get_details(self, response):
            items = response.meta['items']
            items['content'] = escape_string(" ".join(response.css('.xzgfwrap').getall()))
            yield items
            if self.page < 2:
                self.page += 1
                url = f'http://(domain)?totalpage=3&PAGENUM={str(self.page)}&urltype' \
                      f'=tree.TreeTempUrl&wbtreeid=3494'
                yield scrapy.Request(url=url, callback=self.parse)  # callback to parse the next page

     4. Store the data in the database: pipelines.py

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    from itemadapter import ItemAdapter
    import pymysql


    class ChuxiongfilespiderPipeline(object):
        mysql = None
        cursor = None  # cursor used to execute SQL statements

        def open_spider(self, spider):
            self.mysql = pymysql.Connect(host='localhost', user='(database user)', password='(database password)',
                                         port=3306, charset='utf8', database='(database name)')
            self.cursor = self.mysql.cursor()

        def process_item(self, items, spider):
            # Table creation statements
            table = 'create table if not exists cx_other(' \
                    'id int not null primary key auto_increment' \
                    ',policy_id varchar(100)' \
                    ',url varchar(1000)' \
                    ',attachment_id varchar(100)' \
                    ',attachment_url varchar(100)' \
                    ',name varchar(150)' \
                    ',netloc varchar(50)' \
                    ');'
            table_1 = 'create table if not exists cx_other_content(' \
                      'id int not null primary key auto_increment' \
                      ',policy_id varchar(100)' \
                      ',content MEDIUMTEXT NOT NULL' \
                      ');'
            insert = 'insert into cx_other(policy_id,url,attachment_id,attachment_url,name,netloc) ' \
                     'values("%s","%s","%s","%s","%s","%s")' \
                     % (items['policy_id'], items['url'], items['attachment_id'],
                        items['attachment_url'], items['name'], items['netloc'])
            insert_1 = 'insert into cx_other_content(policy_id,content) values("%s","%s")' % (
                items['policy_id'], items['content'])
            try:
                # Reconnect if the database connection was dropped
                self.mysql.ping(reconnect=True)
                # Create the tables
                self.cursor.execute(table)
                self.cursor.execute(table_1)
                # Insert the data
                self.cursor.execute(insert)
                self.cursor.execute(insert_1)
                self.mysql.commit()
                print('=============== insert succeeded ===============')
            except Exception as e:
                print('=============== insert failed ===============', e)
                self.mysql.rollback()
            return items

        def close_spider(self, spider):
            self.cursor.close()
            self.mysql.close()
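
    With settings.py, items.py, the spider, and the pipeline in place, the crawl can be started from the project root; the spider name comes from the name attribute defined above:

    scrapy crawl chuxiongfile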

  • Original article: https://blog.csdn.net/weixin_41586246/article/details/126697920