• 百度文库旋转验证码识别


       最近研究了一下图像识别,一直没找到很好的应用场景,今天我发现可以用百度的旋转验证码来做一个实验。没想到效果还挺好,下面就是实际的识别效果。

    1、效果演示

    2、如何识别

    2.1准备数据集

    首先需要使用爬虫,对验证码图片进行采集,尽量每一种类型都要采集到。

    2.2图像矫正

    接下来对采集的数据进行人工校正

    2.3数据清洗

    (1)对数据进行旋转,达到增加数据量的目的。

    (2)对数据进行灰度化处理,将三维图片降为二维。

    (3)对图片大小进行resize,可以提高训练速度。

    # Excerpt of the per-image preprocessing step. `img_path`, `Picture`,
    # `result_list` and `complete_list` are defined elsewhere (not shown here).

    # Image conversion part: produce x
    picture = Picture(path=img_path)
    # Convert the image to grayscale
    temp_img = picture.gray()
    # Resize the image
    temp_img = temp_img.resize((50, 50), Image.LANCZOS)
    # Derive y: the label is the part of the file name before the first '-'.
    # Separators are normalised first so that both '\\' and '/' paths work;
    # splitting on '\\' alone only handled backslash-separated paths.
    file_name = img_path.replace('\\', '/').rsplit('/', 1)[-1]
    word = file_name.split('-')[0]

    # Wrap x and y in one list so they stay paired and are not shuffled apart
    res = [np.array(temp_img), np.array(word)]
    # Hand the result over to the global result list
    result_list.append(res)
    # Record this image as completed
    complete_list.append(img_path)
    2.4划分训练集与测试集

    一般训练集占数据量的80%,测试集占总数据量的20%,当然也可以根据自己的情况调整比例。

    2.5训练模型

    这里可以使用CNN神经网络模型进行训练,效果非常不错。

    2.6实战测试

    下面直接上代码。其中的滑动系数可能需要自行调整,这个变动不会太频繁,可能几个月某度变一次。

    1. __author__ = "dengxinyan"
    2. import os
    3. import sys
    4. import time
    5. import base64
    6. import random
    7. import requests
    8. from PIL import Image
    9. from io import BytesIO
    10. sys.path.append(os.path.abspath(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))))
    11. from selenium import webdriver
    12. from selenium.webdriver.support.wait import WebDriverWait
    13. from selenium.webdriver import ActionChains
    # Encode a PIL image as a base64 data URI
    def PIL_base64(img, coding='utf-8'):
        """Encode a PIL image as a ``data:image/<type>;base64,...`` string.

        PNG and GIF sources keep their format, RGBA images are always written
        as PNG, and everything else (including images with no recorded format)
        is re-encoded as JPEG.

        Args:
            img: The image to encode; its ``format``, ``mode``, ``convert`` and
                ``save`` members are used.
            coding: Codec used to turn the base64 bytes into ``str``.

        Returns:
            The data-URI string for the encoded image.
        """
        # Images without a recorded format are treated as JPEG.
        source_format = (img.format or 'JPEG').upper()
        format_str = source_format if source_format in ('PNG', 'GIF') else 'JPEG'
        if img.mode == 'P':
            # Palette images are expanded to RGB before saving.
            img = img.convert('RGB')
        if img.mode == 'RGBA':
            # JPEG cannot store an alpha channel, so write PNG instead.
            format_str = 'PNG'
        output_buffer = BytesIO()
        img.save(output_buffer, quality=100, format=format_str)
        byte_data = output_buffer.getvalue()
        # Label the URI with the format that was actually written. Previously
        # the source format (e.g. WEBP, BMP) was advertised even though the
        # payload had been re-encoded as JPEG, giving a mismatched MIME type.
        mime_subtype = format_str.lower()
        encoded = base64.b64encode(byte_data).decode(coding)
        return 'data:image/' + mime_subtype + ';base64,' + encoded
    # Download the rotated captcha image from its URL
    def get_img(url):
        """Download the image at *url* and return it as a base64 data URI.

        Args:
            url: Address of the captcha image.

        Returns:
            The string produced by ``PIL_base64`` when the server answers with
            HTTP 200, otherwise ``None``.
        """
        # NOTE(review): the Cookie below embeds what look like account session
        # tokens (BDUSS / STOKEN / PTOKEN). Consider loading it from
        # configuration instead of keeping it in source.
        header = {
            "Host": "passport.baidu.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
            "Accept": "image/webp,*/*",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            # The query string originally read "...b518×tamp=..."; that is an
            # HTML-entity mangling of "&timestamp=" and is restored here.
            "Referer": "https://wappass.baidu.com/static/captcha/tuxing.html?&ak=c27bbc89afca0463650ac9bde68ebe06&backurl=https%3A%2F%2Fwww.baidu.com%2Fs%3Fcl%3D3%26tn%3Dbaidutop10%26fr%3Dtop1000%26wd%3D%25E6%25B6%2588%25E9%2598%25B2%25E6%2588%2598%25E5%25A3%25AB%25E8%25BF%259E%25E5%25A4%259C%25E7%25AD%2591%25E5%259D%259D%25E5%25BA%2594%25E5%25AF%25B9%25E6%25B4%25AA%25E5%25B3%25B0%25E8%25BF%2587%25E5%25A2%2583%26rsv_idx%3D2%26rsv_dl%3Dfyb_n_homepage%26hisfilter%3D1&logid=8309940529500911554&signature=4bce59041938b160b7c24423bde0b518&timestamp=1624535702",
            # Kept on a single line: a string literal broken across two lines is
            # a SyntaxError.
            "Cookie": "BAIDUID=A0621DC238F4D936B38F699B70A7E41F:SL=0:NR=10:FG=1; BIDUPSID=A0621DC238F4D9360CD42C9C31352635; PSTM=1667351865; HOSUPPORT=1; UBI=fi_PncwhpxZ%7ETaKAanh2ue0vFk6vHMY02DgvigILJIFul8Z1nzMr9do3SYLtjAUqHSpUz7LvOKV27cIr18-YJryP0Q8j92oo93%7E6hGa0CLdraAlaHUZG-0PW9QrpZkW7MTyUn-yrAq7OmSRBIJ7%7E8gM9pv-; USERNAMETYPE=2; SAVEUSERID=3cd458184c56c2fe28174e594101f074d63463446d; HISTORY=0ece87e30ec8ecccd52ff3d5c42f98002a893bfb73ff358893; BDUSS_BFESS=NOcWd6YWJRbmFVUVBBaWVkaHJNSm5tRUpUaUVMaTNHOHcwZVVaVDdsYXlLZmxrSVFBQUFBJCQAAAAAAAAAAAEAAAC13Mct0KHQwl9keHkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALKc0WSynNFkRD; H_WISE_SIDS=219946_216846_213346_219942_213039_230178_204909_230288_110085_236307_243888_244730_245412_243706_232281_249910_247148_250889_249892_252577_234296_253427_253705_240590_254471_179345_254689_254884_254864_253213_255713_254765_255939_255959_255982_107317_256062_256093_256083_255803_253993_256257_255661_256025_256223_256439_256446_254831_253151_256252_256196_256726_256739_251973_256230_256611_256996_257068_257079_257047_254075_257110_257208_251196_254144_257290_251068_256095_257287_254317_251059_251133_254299_257454_257302_255317_255907_255324_257481_244258_257582_257542_257503_255177_257745_257786_257937_257167_257904_197096_257586_257402_255231_257790_258193_258248_258165_8000084_8000115_8000114_8000126_8000140_8000149_8000166_8000172_8000178_8000181_8000185_8000204; ZFY=SxMcCdU3pSsmienZSgA2BTmHLR9S6caVmiP5Ic:Awuz0:C; BAIDUID_BFESS=A0621DC238F4D936B38F699B70A7E41F:SL=0:NR=10:FG=1; Hm_lvt_90056b3f84f90da57dc0f40150f005d5=1690961642,1692328306; STOKEN=01dbff3d6ff696219b39c9fb730c31c34e032c0eebff4fe535d2f1dde0c7b45b; BDUSS=NOcWd6YWJRbmFVUVBBaWVkaHJNSm5tRUpUaUVMaTNHOHcwZVVaVDdsYXlLZmxrSVFBQUFBJCQAAAAAAAAAAAEAAAC13Mct0KHQwl9keHkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALKc0WSynNFkRD; PTOKEN=92e828db8120372a7baa2557ea4ec476; MAWEBCUID=web_VYfxPuQDaKjEzVgXMFgoHouACkpXyjcDpcWwhATKqELuuwEtNy; __bid_n=18a4ab547aa11525d249ea",
        }
        # A timeout keeps the script from hanging forever on a stalled connection.
        response = requests.get(url=url, headers=header, timeout=10)
        if response.status_code != 200:
            # Callers must check for None before using the result.
            return None
        img = Image.open(BytesIO(response.content))
        # Convert the image to a base64 string and return it
        return PIL_base64(img)
    # Captcha recognition API
    def shibie(img_base64):
        """Ask the recognition service for the rotation angle of a captcha image.

        Args:
            img_base64: The captcha image as a base64 string.

        Returns:
            The angle as an ``int``, parsed from a ``res_str`` of the form
            "顺时针旋转<N>度" (rotate clockwise by N degrees).

        Raises:
            requests.HTTPError: If the service answers with an error status.
            ValueError: If ``res_str`` does not reduce to an integer.
        """
        url = "http://www.detayun.cn/openapi/verify_code_identify/"
        data = {
            # User key
            # NOTE(review): hard-coded API key; consider reading it from
            # configuration or an environment variable.
            "key": "JxSfP4E1yfcmJZB6ynOB",
            # Captcha type
            "verify_idf_id": "16",
            # Sample image
            "img_base64": img_base64,
            "img_byte": None,
            # Text description for Chinese click-select / spatial-semantics
            # captchas (left as an empty string here)
            "words": ""
        }
        header = {"Content-Type": "application/json"}
        # Send the request; the timeout prevents an indefinite hang if the
        # service never answers.
        response = requests.post(url=url, json=data, headers=header, timeout=30)
        print(response.text)
        # Fail with a clear HTTP error instead of an opaque JSON/Key error below.
        response.raise_for_status()
        res_str = str(response.json()['data']['res_str'])
        return int(res_str.replace('顺时针旋转', '').replace('度', ''))
    def _solve_rotation_captcha(driver, slide_factor=0.661):
        """Recognise the rotation captcha shown in *driver* and drag the slider once.

        Args:
            driver: The Selenium WebDriver showing the captcha page.
            slide_factor: Multiplier that converts the recognised angle into a
                horizontal drag distance; may need re-tuning when the page
                changes.

        Raises:
            RuntimeError: If the captcha image could not be downloaded.
        """
        from selenium.webdriver.common.by import By

        button_xpath = '//div[contains(@class,"passMod_slide-btn")]'
        image_xpath = '//img[contains(@class,"passMod_spin-background")]'
        wait = WebDriverWait(driver, 10)

        # Wait for the slider button to appear
        yzm_button = wait.until(lambda d: d.find_element(By.XPATH, button_xpath))
        time.sleep(1)
        # Wait for the captcha image to appear
        img_element = wait.until(lambda d: d.find_element(By.XPATH, image_xpath))
        img_src = img_element.get_attribute('src')
        # Download the image and convert it to base64
        img_base64 = get_img(img_src)
        if img_base64 is None:
            # get_img returns None on a non-200 response; do not send that on
            # to the recognition API.
            raise RuntimeError('failed to download captcha image: ' + str(img_src))
        # Recognise the rotation angle of the image
        angle = shibie(img_base64)
        # rotation angle * slide factor = slide distance
        move_x = angle * slide_factor
        # Start sliding
        action = ActionChains(driver)
        action.click_and_hold(yzm_button).perform()  # press and hold the left mouse button
        action.move_by_offset(move_x, 0).perform()
        action.release().perform()  # release the mouse


    if __name__ == '__main__':
        # Load the anti-detection JS. Raw strings keep the Windows paths
        # byte-identical while avoiding invalid escape sequences ('\w', '\s', '\c').
        with open(r'.\webdriver\stealth.min.js', encoding='utf-8') as f:
            js = f.read()
        options = webdriver.ChromeOptions()
        # NOTE(review): `executable_path` is not accepted by newer Selenium 4
        # releases; confirm the Selenium version this script is pinned to.
        driver = webdriver.Chrome(executable_path=r'.\webdriver\chromedriver.exe', options=options)
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": js
        })
        # Open the Baidu captcha page ("×tamp" in the original URL was a mangled
        # "&timestamp" and is restored here)
        driver.get('https://wappass.baidu.com/static/captcha/tuxing.html?&ak=c27bbc89afca0463650ac9bde68ebe06&backurl=https%3A%2F%2Fwww.baidu.com%2Fs%3Fcl%3D3%26tn%3Dbaidutop10%26fr%3Dtop1000%26wd%3D%25E6%25B6%2588%25E9%2598%25B2%25E6%2588%2598%25E5%25A3%25AB%25E8%25BF%259E%25E5%25A4%259C%25E7%25AD%2591%25E5%259D%259D%25E5%25BA%2594%25E5%25AF%25B9%25E6%25B4%25AA%25E5%25B3%25B0%25E8%25BF%2587%25E5%25A2%2583%26rsv_idx%3D2%26rsv_dl%3Dfyb_n_homepage%26hisfilter%3D1&logid=8309940529500911554&signature=4bce59041938b160b7c24423bde0b518&timestamp=1624535702')
        # First slide
        _solve_rotation_captcha(driver)
        time.sleep(2)
        # Second slide
        _solve_rotation_captcha(driver)

    3、总结

    这个旋转验证码非常有特色,而且有很大的难度。特别是在标记训练图片的时候,非常耗费时间。

    现在我也把识别模型封装成了接口,感兴趣的小伙伴可以免费使用:得塔云

  • 相关阅读:
    基础篇-SpringBoot HTTP接口实战
    零时科技 || DPC攻击事件分析
    mongodb入门(五)
    本地客户端连接阿里云Redis服务器
    故障管理:鼓励做事,而不是处罚错误
    ATF启动(六):bl32(OP-TEE)-->bl33 ATF ending
    Python爬虫:scrapy从项目创建到部署可视化定时任务运行
    Python开发工具PyCharm全新版本V2022.2正式发布——支持 Python 3.11
    Java-华为真题-预定酒店
    进阶C++__STL__stack和queue | deque与priority queue
  • 原文地址:https://blog.csdn.net/Dxy1239310216/article/details/136440543