day5 - Selenium advanced usage and practice


    3 Selenium advanced usage and practice

    3.1 Using a proxy IP with Selenium

    from selenium.webdriver import Chrome, ChromeOptions
    # 1. Create the options object
    options = ChromeOptions()
    # 2. Add the option
    options.add_argument('--proxy-server=http://42.54.95.139:4531')  # proxy ip
    # 3. Create the browser object with the options
    b = Chrome(options=options)
    b.get('https://movie.douban.com/top250')
    
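    To confirm that traffic really goes through the proxy, a quick check is to open a page that echoes the requesting IP. This sketch is not part of the original notes; it assumes httpbin.org/ip is reachable and reuses the placeholder proxy address from above.

    from selenium.webdriver import Chrome, ChromeOptions

    options = ChromeOptions()
    options.add_argument('--proxy-server=http://42.54.95.139:4531')  # placeholder proxy ip
    b = Chrome(options=options)
    b.get('https://httpbin.org/ip')  # this page simply echoes back the IP it sees
    print(b.page_source)             # should show the proxy IP, not your own
    b.quit()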

    3.2 Basic Selenium configuration

    from selenium.webdriver import Chrome, ChromeOptions
    options = ChromeOptions()
    # 1. Hide the "controlled by automated test software" banner
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # 2. Disable image loading
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    b = Chrome(options=options)
    b.get('https://www.jd.com')
    
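    Besides hiding the automation banner and disabling images, two other arguments come up often: running headless and overriding the User-Agent. The sketch below is an optional extension, not part of the original notes.

    from selenium.webdriver import Chrome, ChromeOptions

    options = ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    # Run without a visible browser window (useful on a server)
    options.add_argument('--headless')
    # Send a custom User-Agent string
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
    b = Chrome(options=options)
    b.get('https://www.jd.com')
    print(b.title)
    b.quit()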

    3.3 Selenium waits

    1. Implicit waits

    Without an implicit wait: when looking up an element through the browser, an error is raised immediately if the element does not exist.

    With an implicit wait: when looking up an element through the browser, no error is raised right away if the element does not exist;

    instead the lookup keeps retrying within the specified time until the element is found or the time runs out (an error is raised only on timeout).

    The implicit wait time only needs to be set once per browser; it then applies to every element lookup made with that browser.

    from selenium.webdriver import Chrome, ChromeOptions
    
    options = ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    
    b = Chrome(options=options)
    b.get('https://www.jd.com')
    
    # 1) Set the implicit wait time
    b.implicitly_wait(5)
    print('=====')
    # 2) The implicit wait only takes effect when an element is looked up
    input_tag = b.find_element_by_id('key')
    input_tag.send_keys('钱包\n')
    
    
    
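    Note that once the implicit wait time runs out, the lookup still raises an error. Continuing from the snippet above (same browser object b, implicit wait already set), a minimal sketch of catching that case; the id 'does-not-exist' is made up:

    from selenium.common.exceptions import NoSuchElementException

    try:
        # With implicitly_wait(5) set above, this keeps retrying for up to 5 seconds
        missing = b.find_element_by_id('does-not-exist')  # made-up id
    except NoSuchElementException:
        print('The element did not appear within the wait time')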
    2. Explicit waits - wait until some condition becomes true (or stops being true)

      1) Create a wait object: WebDriverWait(browser object, timeout)

      2) Add a wait condition:

      wait_object.until(condition) - wait until the condition is true

      wait_object.until_not(condition) - wait until the condition is no longer true

      How to write a condition:

      presence_of_element_located(locator) - the specified element appears on the page

      text_to_be_present_in_element_value(locator, value) - the specified element's value attribute contains the given value

      text_to_be_present_in_element(locator, value) - the specified element's text content contains the given value

      Note: inside a condition the element is given as a locator tuple:

      (By.xxx, value)

    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By
    wait = WebDriverWait(b, 10)
    # wait.until(EC.text_to_be_present_in_element_value((By.ID, 'key'), '电脑'))
    # print('继续执行')
    
    # Wait until a div with class gl-i-wrap appears inside the element with id J_goodsList
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_goodsList div.gl-i-wrap')))  # the locator must be passed as a tuple
    
    print(b.page_source)
    
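    until_not works the same way with the condition inverted, and the value-based conditions take the expected text as the second argument. The sketch below continues with the same browser object b; the '.loading' selector is made up, while the search-box id 'key' and the keyword '电脑' come from the commented-out line above.

    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.common.by import By

    wait = WebDriverWait(b, 10)
    # Wait until an element matching the (made-up) '.loading' selector is gone
    wait.until_not(EC.presence_of_element_located((By.CSS_SELECTOR, '.loading')))
    # Wait until the search box's value attribute contains '电脑'
    wait.until(EC.text_to_be_present_in_element_value((By.ID, 'key'), '电脑'))
    print('Both conditions were met')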

    3.4 51job (proxy IP + requests + json)

    import requests
    from time import sleep
    from re import search
    from json import loads
    
    
    def get_ip():
        """Get a proxy ip from the extraction API"""
        url = 'http://d.jghttp.alicloudecs.com/getip?num=1&type=1&pro=&city=0&yys=0&port=11&time=2&ts=0&ys=0&cs=0&lb=4&sb=0&pb=4&mr=1&regions='
        while True:
            response = requests.get(url)
            # On success the API returns a plain "ip:port" string; a response
            # starting with '{' is a JSON error message, so wait and retry
            if response.text[0] == '{':
                print('Failed to get a proxy ip, retrying')
                sleep(1)
                continue
            return response.text
    
    
    def get_net_data():
        # 1. Set the request url and headers
        url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,数据分析,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=&u_atoken=540000ca-581a-4002-aa0f-42aedfba6d36&u_asession=01kn7UwIxmYtNszVTAVFzCGxdHP0qhcha0-osvr6eita2YJVTYgshzhQTv11v_wW-pX0KNBwm7Lovlpxjd_P_q4JsKWYrT3W_NKPr8w6oU7K_1z5GupMvpS3-qdJzJEdUlymCvuFU2gNCRIRJqGpb9omBkFo3NEHBv0PZUm6pbxQU&u_asig=05qFfvfDNOQSPq9NIuzj4ViQi8mdNU0nfqkKpbY1NwgFn7IEzf-HWbIdx1PF5izag9Sr4_jabxb30W4ZvB3yc7qrv6vHeKYQ6JhUhmM1Fblfaq7AKpvOAd9zDedY2USM7SW6KPpoDNo4Zd_KsztAAIVcznu7ABWKDx-WZGW4j2y7b9JS7q8ZD7Xtz2Ly-b0kmuyAKRFSVJkkdwVUnyHAIJzbYXEd-kO_Sj5Qq2OkC0QQLEUWQ0Hrqe4W-P1gzc9X5bb4DvBakBj6x1SID70OM96u3h9VXwMyh6PgyDIVSG1W9f_GWun2RGoF0qWZlOBtVzMcP86Dpk-9o5WW2hT6bxqnfLDaWAmdBzT8FTmMacz4pQSRQmGmppZNXPg0hY8XcdmWspDxyAEEo4kbsryBKb9Q&u_aref=diUHZDkF4ZhqqQRmaoboTC71KAg%3D'
        headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
        }
    
        # 2. Send the request; if it fails, get a new ip and request again
        while True:
            ip = get_ip()
            print(ip)
            proxies = {
                'http': ip,
                'https': ip
            }
            response = requests.get(url, headers=headers, proxies=proxies)
            # The job list is embedded in the page as JSON inside a <script> tag
            result = search(r'window\.__SEARCH_RESULT__ = (.+?)</script>', response.text)
            # If the data is present in the response, parse it and stop retrying
            if result:
                analysis_data(result.group(1))
                break
    
    
    def analysis_data(json_data: str):
        """Parse the json job list"""
        data = loads(json_data)
        for x in data['engine_jds']:
            print(x['job_name'], x['providesalary_text'])
    
    
    if __name__ == '__main__':
        get_net_data()
    
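    Free proxy IPs fail often, so in practice the request inside the loop may time out or be refused instead of returning a page. A small sketch, not in the original code, of guarding a single request so that a bad proxy simply leads to fetching a new ip on the next iteration (the helper name fetch_with_proxy is made up):

    import requests
    from requests.exceptions import RequestException


    def fetch_with_proxy(url: str, headers: dict, ip: str):
        """Try one request through the given proxy; return None if the proxy fails."""
        proxies = {'http': ip, 'https': ip}
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=10)
        except RequestException:
            # Connection refused, timeout, proxy error, ... - the caller should get a new ip
            return None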

    3.5 Getting JD comment counts (requests + bs4 + selenium)

    import requests
    from bs4 import BeautifulSoup
    from re import findall
    from time import sleep
    from selenium.webdriver import Chrome, ChromeOptions
    
    # Reuse the basic configuration and headers from the earlier sections
    # (defined here so the snippet runs on its own)
    options = ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    
    
    def get_one_data(page: int, key_word='菇凉果'):
        # 1. Request the search result page
        url = f'https://search.jd.com/Search?keyword={key_word}&qrst=1&suggest=1.def.0.SAK7%7CMIXTAG_SAK7R%2CSAK7_M_AM_L5377%2CSAK7_M_COL_R%2CSAK7_S_AM_R%2CSAK7_SC_PD_R%2CSAK7_SM_PB_R%2CSAK7_SS_PM_R%2CSAK7_CS_PRF_R%2Ctsabtest_base64_U2VhcmNobGlzdF80MzkyfGJhc2Ux_tsabtest%7C&wq=%E8%8F%87%E5%87%89%E6%9E%9C&stock=1&pvid=85a91554bc964bdc8d2c0ddd18852678&page={page}&s=1&click=0'
        response = requests.get(url, headers=headers)
    
        # 2. Parse out the detail-page url of every product
        soup = BeautifulSoup(response.text, 'lxml')
        goods_list = soup.select('#J_goodsList div.gl-i-wrap .p-img>a')
        for x in goods_list:
            goods_url = 'https:' + x.attrs['href']
            get_goods_details(goods_url)
    
    
    def get_goods_details(url: str):
        # 1. Open the detail page and click the review tab so the comment data is loaded
        b = Chrome(options=options)
        b.get(url)
        b.implicitly_wait(5)
        btn = b.find_element_by_css_selector('#detail > div.tab-main.large > ul > li:nth-child(5)')
        btn.click()
        sleep(1)
        result = b.page_source
    
        # 2. Parse the data
        soup = BeautifulSoup(result, 'lxml')
        name = soup.select_one('.itemInfo-wrap>.sku-name').text.strip()
        print(name)
        price = soup.select_one('.p-price>.price').text
        print(price)
        comment_count = soup.select_one('#comment-count .count').text
        print(comment_count)
        comment_info = soup.select('.J-comments-list ul.filter-list>li')
        print(comment_info)
        if comment_info:
            good_comment = comment_info[4].text
            good_comment = findall(r'好评\((.+)\)', good_comment)[0]
            middle_comment = comment_info[5].text
            middle_comment = findall(r'中评\((.+)\)', middle_comment)[0]
            negative_comment = comment_info[6].text
            negative_comment = findall(r'差评\((.+)\)', negative_comment)[0]
            print(good_comment, middle_comment, negative_comment)
        b.close()
    
    
    if __name__ == '__main__':
        get_one_data(1, '鼠标')
    
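    get_one_data only covers a single result page. A small driver loop, not in the original, that walks through the first few pages for one keyword, reusing get_one_data and sleep from the code above; it would replace the __main__ block there (the function name crawl_pages is made up):

    def crawl_pages(key_word: str, page_count: int):
        """Crawl the first page_count search result pages for one keyword."""
        for page in range(1, page_count + 1):
            get_one_data(page, key_word)
            sleep(2)  # short pause between result pages


    if __name__ == '__main__':
        crawl_pages('鼠标', 3)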
  • Original article: https://blog.csdn.net/qq_59778168/article/details/126410444