一、导入第三方库,确定目标 URL,定义请求头 headers,把爬虫请求伪装成浏览器访问
import requests
# Search-result page to request (the q parameter is a URL-encoded keyword).
url = 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220811&ie=utf8'

# Request headers sent along with the request, so that it carries the same
# cookie, referer and user-agent as the browser session they were copied from.
headers = {
    'cookie': 'cna=zIsvG8QofGgCAXAc0HQF5jMC; ariaDefaultTheme=undefined; t=9ac1f71719420207d1f87d27eb676a4c; _m_h5_tk=adcc3c021e3b87caf717886de2956b4f_1660197714179; _m_h5_tk_enc=1af4dc9e2bf60884ef3d0e255253f6b2; xlly_s=1; cookie2=16aa0d04efd876db9a0a6ea3a6201798; _tb_token_=e8f30e5eeeaee; _samesite_flag_=true; sgcookie=E100lJaxeK%2FAPyj3QKfLcL9nnFAvbSQ1NVa%2Fj5KnkOmbyuRuRVi5UIhuo%2F950QL5HA5pu7UW1W7o5e1gKyskjeASeiG%2Fu8b%2Bx2w%2BNK1TNfbC3%2BY%3D; unb=3403337303; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoeyDt7VJs5rtg%3D%3D&existShop=false&pas=0&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9ainBZ; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNQ3HL3rNGIh9Q%3D%3D&vt3=F8dCv4G1KArg9Z5EDnI%3D&nk2=py7xJGsI3wn8W4Q%3D; csg=abea7184; lgc=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cancelledSubSites=empty; cookie17=UNQ3HL3rNGIh9Q%3D%3D; dnk=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; skt=7eb00df2545b28f1; existShop=MTY2MDE4Nzg5Mw%3D%3D; uc4=id4=0%40UgP8IaO4dk7rKbnRwpAL1RCASure&nk4=0%40pRj%2BYG91XDR4VZfDtp5sZkTvbfnKjg%3D%3D; tracknick=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=f3f; _nk_=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cookie1=URmvlmqe9vvqj4%2FetXdyS32Np7aof75Ji3WJNOrxmAo%3D; enc=Wc21Ym4ZtT2bAKugjrg4mga24om36KJRqmV58dwu1eCI9NiOMGxoPn%2BuEfXDf82wAhxp6sq2XAkI8TAxsuD0CQ%3D%3D; JSESSIONID=110B64FBCE3C522DA285BDE7FEF11591; tfstk=cun5BPOtj_fSjuRbgz928VtWelqCZadghwVxFImyTdyXp5M5i5ja1Iq4G_qUp-1..; l=eB_Q_LVPLdI5ulzEBOfwnurza77tsIRAguPzaNbMiOCPO-1p5S3FW6YRMrT9CnGVh6kvR3k0hWaBBeYBqIv4n5U62j-lasDmn; isg=BBoasw0KLOE0w6BNINhb8iDla8A8S54lfo04kySTwK14l7rRDNnmNRflY2MLRxa9',
    'referer': 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306',
    'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Microsoft Edge";v="104"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
}
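注:从浏览器开发者工具复制出来的请求头是“字段名: 值”形式的纯文本,需要给字段名和值都加上引号,才能作为 Python 字典使用。在编辑器中(如 PyCharm)选中这段文本后,按 Ctrl+R 打开替换功能,启用正则表达式模式,查找:
(.*?):(.*)
全部替换为 '$1':'$2',
这样每个字段名和字段值就都加上了单引号。
替换之后,务必把引号内多余的空格删除(尤其是值开头的空格),否则发送请求时会报错(requests 不接受以空白字符开头的请求头值)。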
二、版本一完整代码(数据保存至CSV文件)
import re
import json
import pprint
import requests
import csv # 写入csv文件中
# Version 1: fetch one Taobao search-result page and save its items to taobao.csv.

url = 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220811&ie=utf8'
# NOTE(review): the cookie below appears to be a logged-in browser session copied
# verbatim; it will expire and should not be committed to version control.
headers = {
    'cookie': 'cna=zIsvG8QofGgCAXAc0HQF5jMC; ariaDefaultTheme=undefined; t=9ac1f71719420207d1f87d27eb676a4c; _m_h5_tk=adcc3c021e3b87caf717886de2956b4f_1660197714179; _m_h5_tk_enc=1af4dc9e2bf60884ef3d0e255253f6b2; xlly_s=1; cookie2=16aa0d04efd876db9a0a6ea3a6201798; _tb_token_=e8f30e5eeeaee; _samesite_flag_=true; sgcookie=E100lJaxeK%2FAPyj3QKfLcL9nnFAvbSQ1NVa%2Fj5KnkOmbyuRuRVi5UIhuo%2F950QL5HA5pu7UW1W7o5e1gKyskjeASeiG%2Fu8b%2Bx2w%2BNK1TNfbC3%2BY%3D; unb=3403337303; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoeyDt7VJs5rtg%3D%3D&existShop=false&pas=0&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9ainBZ; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNQ3HL3rNGIh9Q%3D%3D&vt3=F8dCv4G1KArg9Z5EDnI%3D&nk2=py7xJGsI3wn8W4Q%3D; csg=abea7184; lgc=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cancelledSubSites=empty; cookie17=UNQ3HL3rNGIh9Q%3D%3D; dnk=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; skt=7eb00df2545b28f1; existShop=MTY2MDE4Nzg5Mw%3D%3D; uc4=id4=0%40UgP8IaO4dk7rKbnRwpAL1RCASure&nk4=0%40pRj%2BYG91XDR4VZfDtp5sZkTvbfnKjg%3D%3D; tracknick=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=f3f; _nk_=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cookie1=URmvlmqe9vvqj4%2FetXdyS32Np7aof75Ji3WJNOrxmAo%3D; enc=Wc21Ym4ZtT2bAKugjrg4mga24om36KJRqmV58dwu1eCI9NiOMGxoPn%2BuEfXDf82wAhxp6sq2XAkI8TAxsuD0CQ%3D%3D; JSESSIONID=110B64FBCE3C522DA285BDE7FEF11591; tfstk=cun5BPOtj_fSjuRbgz928VtWelqCZadghwVxFImyTdyXp5M5i5ja1Iq4G_qUp-1..; l=eB_Q_LVPLdI5ulzEBOfwnurza77tsIRAguPzaNbMiOCPO-1p5S3FW6YRMrT9CnGVh6kvR3k0hWaBBeYBqIv4n5U62j-lasDmn; isg=BBoasw0KLOE0w6BNINhb8iDla8A8S54lfo04kySTwK14l7rRDNnmNRflY2MLRxa9',
    'referer': 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306',
    'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Microsoft Edge";v="104"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
}

# Fetch and parse BEFORE opening the CSV, so a failed request does not leave
# behind a truncated file that contains only the header row.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()

# The item data is expected inside a `g_page_config = {...};` assignment in the
# page source; fail with a clear message when it is absent instead of a bare
# IndexError from `[0]`.
matches = re.findall('g_page_config = (.*);', response.text)
if not matches:
    raise RuntimeError(
        'g_page_config not found in the response; the cookie may have expired '
        'or the request was redirected to a login/verification page'
    )
html_data = matches[0]
json_data = json.loads(html_data)  # JSON text -> Python dict

# The per-item records live under mods -> itemlist -> data -> auctions.
data = json_data['mods']['itemlist']['data']['auctions']

fieldnames = ['标题', '价格', '店铺', '购买人数', '地点', '商品详情页', '店铺链接', '图片链接']

# 'utf-8-sig' replaces the original 'ANSI': that name is only a Windows alias
# (LookupError elsewhere) and cannot encode characters outside the local code
# page. The BOM written by 'utf-8-sig' still lets Excel detect the encoding.
with open('taobao.csv', 'w', encoding='utf-8-sig', newline='') as csv_file:
    csvwriter = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csvwriter.writeheader()
    for index in data:
        # Named `row` rather than `dict` so the builtin is not shadowed.
        row = {
            '标题': index['raw_title'],
            '价格': index['view_price'],
            '店铺': index['nick'],
            '购买人数': index['view_sales'],
            '地点': index['item_loc'],
            # detail_url / pic_url get an explicit scheme prepended.
            '商品详情页': 'https:' + index['detail_url'],
            '店铺链接': index['shopLink'],
            '图片链接': 'https:' + index['pic_url'],
        }
        csvwriter.writerow(row)
        print(row)
三、版本二完整代码(数据保存至sqlite3数据库)
import re
import json
import pprint
import requests
import csv # 写入csv文件中
import sqlite3 #进行SQLite数据库操作
dbpath = 'taobao.db'


def getdata():
    """Fetch one Taobao search-result page and store its items in SQLite.

    Calls ``init_db(dbpath)`` first, then downloads the page, extracts the
    JSON assigned to ``g_page_config`` and inserts one row per entry of
    ``mods -> itemlist -> data -> auctions`` into the ``taobao`` table.

    Raises:
        requests.RequestException: the request failed, timed out, or returned
            an HTTP error status.
        RuntimeError: the response does not contain ``g_page_config``.
        KeyError: the JSON does not have the expected structure.
    """
    init_db(dbpath)

    url = 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20220811&ie=utf8'
    # NOTE(review): the cookie below appears to be a logged-in browser session
    # copied verbatim; it will expire and should not be committed.
    headers = {
        'cookie': 'cna=zIsvG8QofGgCAXAc0HQF5jMC; ariaDefaultTheme=undefined; t=9ac1f71719420207d1f87d27eb676a4c; _m_h5_tk=adcc3c021e3b87caf717886de2956b4f_1660197714179; _m_h5_tk_enc=1af4dc9e2bf60884ef3d0e255253f6b2; xlly_s=1; cookie2=16aa0d04efd876db9a0a6ea3a6201798; _tb_token_=e8f30e5eeeaee; _samesite_flag_=true; sgcookie=E100lJaxeK%2FAPyj3QKfLcL9nnFAvbSQ1NVa%2Fj5KnkOmbyuRuRVi5UIhuo%2F950QL5HA5pu7UW1W7o5e1gKyskjeASeiG%2Fu8b%2Bx2w%2BNK1TNfbC3%2BY%3D; unb=3403337303; uc1=cookie15=Vq8l%2BKCLz3%2F65A%3D%3D&cookie14=UoeyDt7VJs5rtg%3D%3D&existShop=false&pas=0&cookie16=W5iHLLyFPlMGbLDwA%2BdvAGZqLg%3D%3D&cookie21=VFC%2FuZ9ainBZ; uc3=lg2=W5iHLLyFOGW7aA%3D%3D&id2=UNQ3HL3rNGIh9Q%3D%3D&vt3=F8dCv4G1KArg9Z5EDnI%3D&nk2=py7xJGsI3wn8W4Q%3D; csg=abea7184; lgc=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cancelledSubSites=empty; cookie17=UNQ3HL3rNGIh9Q%3D%3D; dnk=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; skt=7eb00df2545b28f1; existShop=MTY2MDE4Nzg5Mw%3D%3D; uc4=id4=0%40UgP8IaO4dk7rKbnRwpAL1RCASure&nk4=0%40pRj%2BYG91XDR4VZfDtp5sZkTvbfnKjg%3D%3D; tracknick=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; _cc_=UIHiLt3xSw%3D%3D; _l_g_=Ug%3D%3D; sg=f3f; _nk_=%5Cu9006%5Cu98CE%5Cu8FFD%5Cu98CEzgf; cookie1=URmvlmqe9vvqj4%2FetXdyS32Np7aof75Ji3WJNOrxmAo%3D; enc=Wc21Ym4ZtT2bAKugjrg4mga24om36KJRqmV58dwu1eCI9NiOMGxoPn%2BuEfXDf82wAhxp6sq2XAkI8TAxsuD0CQ%3D%3D; JSESSIONID=110B64FBCE3C522DA285BDE7FEF11591; tfstk=cun5BPOtj_fSjuRbgz928VtWelqCZadghwVxFImyTdyXp5M5i5ja1Iq4G_qUp-1..; l=eB_Q_LVPLdI5ulzEBOfwnurza77tsIRAguPzaNbMiOCPO-1p5S3FW6YRMrT9CnGVh6kvR3k0hWaBBeYBqIv4n5U62j-lasDmn; isg=BBoasw0KLOE0w6BNINhb8iDla8A8S54lfo04kySTwK14l7rRDNnmNRflY2MLRxa9',
        'referer': 'https://s.taobao.com/search?q=%E6%98%BE%E5%8D%A1&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.jianhua.201856-taobao-item.2&ie=utf8&initiative_id=tbindexz_20170306',
        'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Microsoft Edge";v="104"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47',
    }

    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()

    # The item data is expected inside a `g_page_config = {...};` assignment;
    # fail with a clear message when it is absent instead of a bare IndexError.
    matches = re.findall('g_page_config = (.*);', response.text)
    if not matches:
        raise RuntimeError(
            'g_page_config not found in the response; the cookie may have '
            'expired or the request was redirected to a login/verification page'
        )
    json_data = json.loads(matches[0])  # JSON text -> Python dict
    data = json_data['mods']['itemlist']['data']['auctions']

    # Placeholders instead of %-formatting: scraped text (titles, shop names)
    # may contain quotes, which would break or inject into a string-built
    # statement. The column name `viewprie` matches the table schema as created.
    sql = '''
        insert into taobao(
            rawtitle, viewprie, nick, viewsales, itemloc, detailurl, shoplink, picurl)
        values (?, ?, ?, ?, ?, ?, ?, ?)'''
    rows = [
        (
            value['raw_title'],
            value['view_price'],
            value['nick'],
            value['view_sales'],
            value['item_loc'],
            value['detail_url'],
            value['shopLink'],
            value['pic_url'],
        )
        for value in data
    ]

    conn = sqlite3.connect(dbpath)
    try:
        cur = conn.cursor()
        cur.executemany(sql, rows)
        conn.commit()  # one commit for the whole batch
        cur.close()
    finally:
        # Always release the database file, even if an insert fails.
        conn.close()
# Initialise the database: create the table that holds the scraped items.
def init_db(dbpath):
    """Create the ``taobao`` table in the SQLite file at *dbpath* if missing.

    Safe to call repeatedly: ``if not exists`` makes later calls a no-op,
    whereas a plain ``create table`` raises ``sqlite3.OperationalError`` once
    the table already exists (i.e. on every run after the first).

    Args:
        dbpath: Path of the SQLite database file; created if it does not exist.
    """
    # The column name `viewprie` (sic) is kept as-is so the schema stays
    # compatible with existing databases and with the insert statement that
    # targets this table.
    sql = '''
        create table if not exists taobao(
            id integer primary key autoincrement,
            rawtitle varchar,
            viewprie numeric,
            nick varchar,
            viewsales varchar,
            itemloc varchar,
            detailurl text,
            shoplink text,
            picurl text
        )
    '''
    conn = sqlite3.connect(dbpath)
    try:
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()
# Script entry point: run the scrape-and-store routine as soon as the file is executed.
getdata()
# The message printed below means "Save complete!".
print("保存完成!")