• Python爬虫_某宝网案例


    Python爬虫_某宝网案例

    一、导入第三方库,确定url,定义headers,将爬虫请求伪装成浏览器请求

    # requests is the third-party HTTP library this tutorial imports.
    import requests
    # Taobao search-results URL; the q parameter is the URL-encoded search
    # keyword (it decodes to the Chinese word for "graphics card").
    url = 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220811&ie=utf8'
    
    • 1
    • 2
    # Request headers sent with the search request so that it looks like a
    # normal browser page visit rather than a script.
    # NOTE(review): the 'cookie' value looks like a real logged-in session
    # credential; it should probably be replaced by the reader's own cookie
    # and not be published -- confirm.
    headers = {
        'cookie':'cna=zIsvG8QofGgCAXAc0HQF5jMC; ariaDefaultTheme=undefined; t=9ac1f71719420207d1f87d27eb676a4c; _m_h5_tk=adcc3c021e3b87caf717886de2956b4f_1660197714179; _m_h5_tk_enc=1af4dc9e2bf60884ef3d0e255253f6b2; xlly_s=1; cookie2=16aa0d04efd876db9a0a6ea3a6201798; _tb_token_=e8f30e5eeeaee; _samesite_flag_=true; sgcookie=E100lJaxeK%2FAPyj3QKfLcL9nnFAvbSQ1NVa%2Fj5KnkOmbyuRuRVi5UIhuo%2F950QL5HA5pu7UW1W7o5e1gKyskjeASeiG%2Fu8b%2Bx2w%2BNK1TNfbC3%2BY%3D; unb=3403337303; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoeyDt7VJs5rtg%3D%3D&existShop=false&pas=0&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9ainBZ; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNQ3HL3rNGIh9Q%3D%3D&vt3=F8dCv4G1KArg9Z5EDnI%3D&nk2=py7xJGsI3wn8W4Q%3D; csg=abea7184; lgc=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cancelledSubSites=empty; cookie17=UNQ3HL3rNGIh9Q%3D%3D; dnk=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; skt=7eb00df2545b28f1; existShop=MTY2MDE4Nzg5Mw%3D%3D; uc4=id4=0%40UgP8IaO4dk7rKbnRwpAL1RCASure&nk4=0%40pRj%2BYG91XDR4VZfDtp5sZkTvbfnKjg%3D%3D; tracknick=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=f3f; _nk_=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cookie1=URmvlmqe9vvqj4%2FetXdyS32Np7aof75Ji3WJNOrxmAo%3D; enc=Wc21Ym4ZtT2bAKugjrg4mga24om36KJRqmV58dwu1eCI9NiOMGxoPn%2BuEfXDf82wAhxp6sq2XAkI8TAxsuD0CQ%3D%3D; JSESSIONID=110B64FBCE3C522DA285BDE7FEF11591; tfstk=cun5BPOtj_fSjuRbgz928VtWelqCZadghwVxFImyTdyXp5M5i5ja1Iq4G_qUp-1..; l=eB_Q_LVPLdI5ulzEBOfwnurza77tsIRAguPzaNbMiOCPO-1p5S3FW6YRMrT9CnGVh6kvR3k0hWaBBeYBqIv4n5U62j-lasDmn; isg=BBoasw0KLOE0w6BNINhb8iDla8A8S54lfo04kySTwK14l7rRDNnmNRflY2MLRxa9',
        'referer':'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306',
        'sec-ch-ua':'"Chromium";v="104", " Not A;Brand";v="99", "Microsoft Edge";v="104"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"',
        'sec-fetch-dest':'document',
        'sec-fetch-mode':'navigate',
        'sec-fetch-site':'same-origin',
        'sec-fetch-user':'?1',
        'upgrade-insecure-requests':'1',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
    }
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13

    注:选中请求头文本后,按 Ctrl+R 打开替换并启用正则表达式,将 (.*?):(.*) 全部替换为 '$1':'$2',(替换内容末尾带英文逗号),这样就给每个字段名和字段值都加上了单引号。
    替换过后,务必将多余的空格删除(原文本冒号后面的空格会被带进字段值的开头),否则发送请求时会报错

    二、版本一完整代码(数据保存至CSV文件)

    import re
    import json
    import pprint
    import requests
    import csv # 写入csv文件中
    
    # Version one: download one page of Taobao search results, pull the
    # embedded g_page_config JSON out of the HTML, and save one CSV row per
    # listing to taobao.csv.
    url = 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220811&ie=utf8'
    # Browser-like request headers so the request resembles a normal page visit.
    # NOTE(review): the cookie looks like a real logged-in session credential;
    # it should probably be replaced by the reader's own cookie -- confirm.
    headers = {
        'cookie':'cna=zIsvG8QofGgCAXAc0HQF5jMC; ariaDefaultTheme=undefined; t=9ac1f71719420207d1f87d27eb676a4c; _m_h5_tk=adcc3c021e3b87caf717886de2956b4f_1660197714179; _m_h5_tk_enc=1af4dc9e2bf60884ef3d0e255253f6b2; xlly_s=1; cookie2=16aa0d04efd876db9a0a6ea3a6201798; _tb_token_=e8f30e5eeeaee; _samesite_flag_=true; sgcookie=E100lJaxeK%2FAPyj3QKfLcL9nnFAvbSQ1NVa%2Fj5KnkOmbyuRuRVi5UIhuo%2F950QL5HA5pu7UW1W7o5e1gKyskjeASeiG%2Fu8b%2Bx2w%2BNK1TNfbC3%2BY%3D; unb=3403337303; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoeyDt7VJs5rtg%3D%3D&existShop=false&pas=0&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9ainBZ; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNQ3HL3rNGIh9Q%3D%3D&vt3=F8dCv4G1KArg9Z5EDnI%3D&nk2=py7xJGsI3wn8W4Q%3D; csg=abea7184; lgc=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cancelledSubSites=empty; cookie17=UNQ3HL3rNGIh9Q%3D%3D; dnk=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; skt=7eb00df2545b28f1; existShop=MTY2MDE4Nzg5Mw%3D%3D; uc4=id4=0%40UgP8IaO4dk7rKbnRwpAL1RCASure&nk4=0%40pRj%2BYG91XDR4VZfDtp5sZkTvbfnKjg%3D%3D; tracknick=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=f3f; _nk_=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cookie1=URmvlmqe9vvqj4%2FetXdyS32Np7aof75Ji3WJNOrxmAo%3D; enc=Wc21Ym4ZtT2bAKugjrg4mga24om36KJRqmV58dwu1eCI9NiOMGxoPn%2BuEfXDf82wAhxp6sq2XAkI8TAxsuD0CQ%3D%3D; JSESSIONID=110B64FBCE3C522DA285BDE7FEF11591; tfstk=cun5BPOtj_fSjuRbgz928VtWelqCZadghwVxFImyTdyXp5M5i5ja1Iq4G_qUp-1..; l=eB_Q_LVPLdI5ulzEBOfwnurza77tsIRAguPzaNbMiOCPO-1p5S3FW6YRMrT9CnGVh6kvR3k0hWaBBeYBqIv4n5U62j-lasDmn; isg=BBoasw0KLOE0w6BNINhb8iDla8A8S54lfo04kySTwK14l7rRDNnmNRflY2MLRxa9',
        'referer':'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306',
        'sec-ch-ua':'"Chromium";v="104", " Not A;Brand";v="99", "Microsoft Edge";v="104"',
        'sec-ch-ua-mobile':'?0',
        'sec-ch-ua-platform':'"Windows"',
        'sec-fetch-dest':'document',
        'sec-fetch-mode':'navigate',
        'sec-fetch-site':'same-origin',
        'sec-fetch-user':'?1',
        'upgrade-insecure-requests':'1',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
    }

    # Download and parse BEFORE opening the output file, so a failed request
    # does not truncate an existing taobao.csv. The timeout keeps the script
    # from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()

    # The listing data is embedded in the page as "g_page_config = {...};".
    matches = re.findall(r'g_page_config = (.*);', response.text)
    if not matches:
        raise RuntimeError(
            'g_page_config was not found in the response '
            '(possibly an expired cookie or a verification page)'
        )
    json_data = json.loads(matches[0])  # JSON text -> Python dict
    # The listings live under mods -> itemlist -> data -> auctions.
    data = json_data['mods']['itemlist']['data']['auctions']

    # utf-8-sig is portable (the 'ANSI' codec name only exists on Windows),
    # can encode every character, and its BOM lets Excel detect the encoding.
    with open('taobao.csv', 'w', encoding='utf-8-sig', newline='') as csv_file:
        # Column headers of the CSV file.
        csvwriter = csv.DictWriter(csv_file, fieldnames=['标题','价格','店铺','购买人数','地点','商品详情页','店铺链接','图片链接'])
        csvwriter.writeheader()

        for item in data:
            # "row" instead of "dict" so the builtin dict is not shadowed.
            row = {
                '标题': item['raw_title'],
                '价格': item['view_price'],
                '店铺': item['nick'],
                '购买人数': item['view_sales'],
                '地点': item['item_loc'],
                # detail_url and pic_url get an explicit scheme prepended.
                '商品详情页': 'https:' + item['detail_url'],
                '店铺链接': item['shopLink'],
                '图片链接': 'https:' + item['pic_url'],
            }
            csvwriter.writerow(row)
            print(row)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47

    三、版本二完整代码(数据保存至sqlite3数据库)

    import re
    import json
    import pprint
    import requests
    import csv # 写入csv文件中
    import sqlite3  #进行SQLite数据库操作
    
    # SQLite database file; getdata() passes it to init_db() and sqlite3.connect().
    dbpath = 'taobao.db'
    def getdata():
        """Scrape one page of Taobao search results into the ``taobao`` table.

        Calls ``init_db(dbpath)``, downloads the search page, extracts the
        ``g_page_config`` JSON embedded in the HTML and inserts one row per
        listing into the SQLite database at the module-level ``dbpath``.

        The rows are inserted with bound parameters, so quotes inside scraped
        text cannot break or alter the SQL statement, and they are committed
        together at the end: either every listing is stored or none is.

        Raises:
            requests.RequestException: The request failed, timed out or
                returned an HTTP error status.
            RuntimeError: The response does not contain ``g_page_config``.
            sqlite3.Error: The database could not be written.
        """
        init_db(dbpath)

        url = 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220811&ie=utf8'
        # Browser-like request headers so the request resembles a normal page visit.
        # NOTE(review): the cookie looks like a real logged-in session
        # credential; it should probably be replaced by the reader's own -- confirm.
        headers = {
            'cookie':'cna=zIsvG8QofGgCAXAc0HQF5jMC; ariaDefaultTheme=undefined; t=9ac1f71719420207d1f87d27eb676a4c; _m_h5_tk=adcc3c021e3b87caf717886de2956b4f_1660197714179; _m_h5_tk_enc=1af4dc9e2bf60884ef3d0e255253f6b2; xlly_s=1; cookie2=16aa0d04efd876db9a0a6ea3a6201798; _tb_token_=e8f30e5eeeaee; _samesite_flag_=true; sgcookie=E100lJaxeK%2FAPyj3QKfLcL9nnFAvbSQ1NVa%2Fj5KnkOmbyuRuRVi5UIhuo%2F950QL5HA5pu7UW1W7o5e1gKyskjeASeiG%2Fu8b%2Bx2w%2BNK1TNfbC3%2BY%3D; unb=3403337303; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoeyDt7VJs5rtg%3D%3D&existShop=false&pas=0&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9ainBZ; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNQ3HL3rNGIh9Q%3D%3D&vt3=F8dCv4G1KArg9Z5EDnI%3D&nk2=py7xJGsI3wn8W4Q%3D; csg=abea7184; lgc=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cancelledSubSites=empty; cookie17=UNQ3HL3rNGIh9Q%3D%3D; dnk=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; skt=7eb00df2545b28f1; existShop=MTY2MDE4Nzg5Mw%3D%3D; uc4=id4=0%40UgP8IaO4dk7rKbnRwpAL1RCASure&nk4=0%40pRj%2BYG91XDR4VZfDtp5sZkTvbfnKjg%3D%3D; tracknick=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=f3f; _nk_=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cookie1=URmvlmqe9vvqj4%2FetXdyS32Np7aof75Ji3WJNOrxmAo%3D; enc=Wc21Ym4ZtT2bAKugjrg4mga24om36KJRqmV58dwu1eCI9NiOMGxoPn%2BuEfXDf82wAhxp6sq2XAkI8TAxsuD0CQ%3D%3D; JSESSIONID=110B64FBCE3C522DA285BDE7FEF11591; tfstk=cun5BPOtj_fSjuRbgz928VtWelqCZadghwVxFImyTdyXp5M5i5ja1Iq4G_qUp-1..; l=eB_Q_LVPLdI5ulzEBOfwnurza77tsIRAguPzaNbMiOCPO-1p5S3FW6YRMrT9CnGVh6kvR3k0hWaBBeYBqIv4n5U62j-lasDmn; isg=BBoasw0KLOE0w6BNINhb8iDla8A8S54lfo04kySTwK14l7rRDNnmNRflY2MLRxa9',
            'referer':'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306',
            'sec-ch-ua':'"Chromium";v="104", " Not A;Brand";v="99", "Microsoft Edge";v="104"',
            'sec-ch-ua-mobile':'?0',
            'sec-ch-ua-platform':'"Windows"',
            'sec-fetch-dest':'document',
            'sec-fetch-mode':'navigate',
            'sec-fetch-site':'same-origin',
            'sec-fetch-user':'?1',
            'upgrade-insecure-requests':'1',
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
        }

        # The timeout keeps the call from hanging forever on a stalled connection.
        response = requests.get(url=url, headers=headers, timeout=10)
        response.raise_for_status()

        # The listing data is embedded in the page as "g_page_config = {...};".
        matches = re.findall(r'g_page_config = (.*);', response.text)
        if not matches:
            raise RuntimeError(
                'g_page_config was not found in the response '
                '(possibly an expired cookie or a verification page)'
            )
        json_data = json.loads(matches[0])  # JSON text -> Python dict
        # The listings live under mods -> itemlist -> data -> auctions.
        data = json_data['mods']['itemlist']['data']['auctions']

        # One tuple per listing, in the column order of the insert statement.
        rows = [
            (
                value['raw_title'],
                value['view_price'],
                value['nick'],
                value['view_sales'],
                value['item_loc'],
                value['detail_url'],
                value['shopLink'],
                value['pic_url'],
            )
            for value in data
        ]

        # "?" placeholders let sqlite3 bind the values itself instead of the
        # values being pasted into the SQL text. "viewprie" (sic) is the
        # column name the statement has always used, so it is kept.
        sql = '''
            insert into taobao(
                rawtitle,viewprie,nick,viewsales,itemloc,detailurl,shoplink,picurl)
                values(?,?,?,?,?,?,?,?)'''

        conn = sqlite3.connect(dbpath)
        try:
            conn.executemany(sql, rows)
            conn.commit()
        finally:
            # Always release the connection, even when an insert fails.
            conn.close()
    
    def init_db(dbpath):
        """Create the ``taobao`` table in the SQLite database at *dbpath*.

        The table is created only if it does not exist yet, so the function
        can be called on every run; a plain ``create table`` would raise
        ``sqlite3.OperationalError`` on the second call.

        Args:
            dbpath: Path of the SQLite database file. ``sqlite3.connect``
                creates the file when it is missing.

        Raises:
            sqlite3.Error: The database could not be opened or written.
        """
        # NOTE: the column name "viewprie" (sic) is kept unchanged because
        # the insert statement in getdata() refers to it by that name.
        sql = '''
           create table if not exists taobao(
               id integer primary key autoincrement,
               rawtitle varchar,
               viewprie numeric,
               nick varchar,
               viewsales varchar,
               itemloc varchar,
               detailurl text,
               shoplink text,
               picurl text
           )
        '''
        conn = sqlite3.connect(dbpath)
        try:
            conn.execute(sql)
            conn.commit()
        finally:
            # Always release the connection, even when the statement fails.
            conn.close()
    
    # Run the scrape-and-store routine, then print a completion message
    # (the Chinese text means "Save complete!").
    getdata()
    print("保存完成!")
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62
    • 63
    • 64
    • 65
    • 66
    • 67
    • 68
    • 69

    在这里插入图片描述

  • 相关阅读:
    “耳聪目明”的智慧工地到底该怎么搭建,最全面的智慧工地解决方案一看就懂
    atcoder ABC 232 B~E题解
    Pytorch——实现Tensor矩阵的任意角度旋转、平移操作
    华为OD真题--字符串摘要--带答案
    【运维心得】ApacheDirectory找不到java路径的解决方案
    hosts文件地址
    【教3妹学编辑-mysql】详解join(内连接、外连接、交叉连接等)
    Apache APISIX在微软云 ARM 和 x86 服务器上的性能测试对比
    【数据结构】纯c语言双向链表
    2024.4.25力扣每日一题——总行驶距离
  • 原文地址:https://blog.csdn.net/qq_45556665/article/details/126282211