import urllib.request
# 1. Define a url
url = "http://www.baidu.com"
# 2. Simulate a browser sending a request to the server
response = urllib.request.urlopen(url)
# 3. Get the page source from the response; read() returns bytes, so it must be decoded
# content = response.read().decode("utf-8")
# Read a single line
# content = response.readline().decode("utf-8")
# Read all lines into a list
# content = response.readlines()
# print(content)
# Return the status code; use it to check whether the request succeeded
print(response.getcode()) #200
# Return the url that was requested
print(response.geturl())
# Get the response headers
print(response.getheaders())
# print(content)
import urllib.request
url_page = "http://www.baidu.com"
# Download a page
# urllib.request.urlretrieve(url_page,r"C:\pycharm\pythonProject\new_project\spider\data\test01.html")
# Download an image
# lisa_url = "https://img2.baidu.com/it/u=3108699108,4205190465&fm=253&fmt=auto&app=138&f=JPEG?w=500&h=625"
# urllib.request.urlretrieve(lisa_url,"lisa.jpg")
# Download a video
url_video = "https://vd3.bdstatic.com/mda-ja3k8u4jhzfw0knk/sc/mda-ja3k8u4jhzfw0knk.mp4?v_from_s=hkapp-haokan-nanjing&auth_key=1658046712-0-0-71d0141bb4ac0387104106f7f9a0d34f&bcevod_channel=searchbox_feed&pd=1&cd=0&pt=3&logid=0112839604&vid=13745365751502067064&abtest=103334_1-103525_1&klogid=0112839604"
urllib.request.urlretrieve(url_video,"lisa.mp4")
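urlretrieve() also takes an optional reporthook callback, which it calls after each block is downloaded with (block_number, block_size, total_size); that is handy for large files like the video above. A small sketch (the progress function here is just an illustrative helper, not part of the original code):

import urllib.request

# Hypothetical helper: crude progress display for urlretrieve
def show_progress(block_num, block_size, total_size):
    downloaded = block_num * block_size
    if total_size > 0:
        percent = min(downloaded * 100 / total_size, 100)
        print(f"\rdownloaded {percent:.1f}%", end="")

# Assumes url_video from the example above is still in scope
urllib.request.urlretrieve(url_video, "lisa.mp4", reporthook=show_progress)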
About the User-Agent (UA):
User Agent, UA for short, is a special request header string that lets the server identify the client's operating system and version, CPU type, browser and version, browser kernel, rendering engine, language, plugins, and so on.
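By default urllib announces itself with a generic Python User-Agent, which is exactly what many sites key on. A quick way to see the default header urllib would send (a small sketch, nothing site-specific):

import urllib.request

# The opener's default headers: the only one is a generic "Python-urllib/3.x"
# User-Agent, which is easy for a server to recognise and block
opener = urllib.request.build_opener()
print(opener.addheaders)  # e.g. [('User-agent', 'Python-urllib/3.10')]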
UA-based anti-scraping:
Wrong approach
import urllib.request
url_page = "https://www.baidu.com"
response = urllib.request.urlopen(url_page)
content = response.read().decode("utf-8")
# Anatomy of the url: protocol (http/https), host (www.baidu.com), port (80/443), path (s), params (wd=电脑), anchor
# http uses port 80 by default, https uses 443
print(content)
# The page source comes back incomplete because an anti-scraping check was triggered:
# without a browser User-Agent, the server only returns a stub such as
# location.replace(location.href.replace("https://","http://"));
Correct approach
import urllib.request
url_page = "https://www.baidu.com"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
# 1. Build a customized Request object to get past the UA check. Note: pass url and headers as keyword arguments, because the data parameter sits between them in the signature
# def __init__(self, url, data=None, headers={},
request = urllib.request.Request(url = url_page,headers=headers)
# 2. Pass the customized request to urlopen, read the response, and print it
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
print(content)
The quote() method percent-encodes Chinese characters (and other non-ASCII text) for use in a url; it suits a single parameter.
Wrong approach
import urllib.request
import urllib.parse
url = "https://www.baidu.com/s?ie=UTF-8&wd=电脑"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
print(url)
# Build the Request object to avoid the UA check
request = urllib.request.Request(url = url,headers=headers)
# Get the response object
response = urllib.request.urlopen(request)
# Decode the response
content = response.read().decode("utf-8")
print(content)
# UnicodeEncodeError: 'ascii' codec can't encode characters in position 19-20: ordinal not in range(128)
Correct approach
import urllib.request
import urllib.parse
url = "https://www.baidu.com/s?ie=UTF-8&wd="
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
# Percent-encode 电脑
name = urllib.parse.quote("电脑")
# Concatenate the two strings into the final url
url = url+name
print(url)
# https://www.baidu.com/s?ie=UTF-8&wd=%E7%94%B5%E8%84%91
# Build the Request object to avoid the UA check
request = urllib.request.Request(url = url,headers=headers)
# Get the response object
response = urllib.request.urlopen(request)
# Decode the response
content = response.read().decode("utf-8")
content = response.read().decode("utf-8")
print(content)
The urlencode() method also percent-encodes Chinese characters, and it handles multiple parameters at once.
Example
import urllib.request
import urllib.parse
url = "https://www.baidu.com/s?"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
data = {
'wd':'周杰伦',
'sex':'男',
'location':'台湾'
}
new_data = urllib.parse.urlencode(data)
print(new_data) #wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7&location=%E5%8F%B0%E6%B9%BE
url = url+new_data
request = urllib.request.Request(url = url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
print(content)
Difference between POST and GET requests: GET parameters are appended to the url query string, while POST parameters go into the request body and, with urllib, must be urlencoded and then encoded to bytes.
Example
import urllib.request
import urllib.parse
import json
url = "https://fanyi.baidu.com/sug"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
data = {
'kw' :'spider'
}
# For a POST request the parameters do not go into the url; they go into the request body
# POST parameters must be urlencoded and then encoded to bytes
new_data = urllib.parse.urlencode(data).encode("utf-8")
# Pass the POST parameters through the data argument
request = urllib.request.Request(url = url,data = new_data,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
print(content) # a json string comes back
json_obj = json.loads(content)
print(json_obj)
Notes:
1. Where does this url come from?
(screenshot omitted: the sug request url as captured in the browser DevTools Network panel)
2. Where does the kw parameter come from?
(screenshot omitted: the form data of the sug request in the DevTools Network panel)
Some endpoints require cookies as an anti-scraping measure; when POSTing to them we need to include the Cookie header so the request is accepted.
Wrong approach
import urllib.request
import urllib.parse
import json
url = "https://fanyi.baidu.com/v2transapi?from=en&to=zh"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
data = {
'from': 'en',
'to': 'zh',
'query': 'spider',
'transtype': 'realtime',
'simple_means_flag': '3',
'sign': '63766.268839',
'domain': 'common'
}
new_data = urllib.parse.urlencode(data).encode("utf-8")
request = urllib.request.Request(url,new_data,headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
print(content)
content_obj = json.loads(content)
print(content_obj)
#{'errno': 998, 'errmsg': '未知错误', 'query': 'spider', 'from': 'en', 'to': 'zh', 'error': 998}
Correct approach
import urllib.request
import urllib.parse
import json
url = "https://fanyi.baidu.com/v2transapi?from=en&to=zh"
headers = {
'Cookie': 'BIDUPSID=67B6062DDE1E084773421458DDB25F84; PSTM=1648366176; BAIDUID=67B6062DDE1E0847CB591CBF62D24FD4:FG=1; BDUSS=Gx0OU1FVzBlRGdTcnJvdHBlYlN4S2QySFZXVmpad2JwZFh4WmtHbjdoUkxMR2xpSVFBQUFBJCQAAAAAAQAAAAEAAABeZiIQ0MezvdG50Me60wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEufQWJLn0FiM;ab_sr=1.0.1_NzQzODVmOThiYWMyN2Y5MDFiMjE1Yjg4MThlODBkNmU0NjU0NzY1YTNiYTgxYjVmYzBiNTIyMGEwY2Q4NmQ2ZDU0MTc5MWI0MmYwYjkwNTY1Zjk4NmI1YjhmYzljOGMzOTYxOWY1YzliMmE5MDU4Zjc3MjU2Y2ZlM2E0YmQ0OWJiNWRmNDZiMTI5ODc4MjdkZTZjZmVmYTVkYWJjZjA5Yzk1MGIwN2JjM2JhYTU0MmNlYjE2NTAxMDM5NDI1MDA4'
}
data = {
'from': 'en',
'to': 'zh',
'query': 'spider',
'transtype': 'realtime',
'simple_means_flag': '3',
'sign': '63766.268839',
'domain': 'common'
}
new_data = urllib.parse.urlencode(data).encode("utf-8")
request = urllib.request.Request(url,new_data,headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
print(content)
content_obj = json.loads(content)
print(content_obj)
{'trans_result': {'data': [{'dst': '蜘蛛', 'prefixWrap': 0, 'result': [[0, '蜘蛛', ['0|6'], [], ['0|6'], ['0|6']]], 'src': 'spider'}], 'from': 'en', 'status': 0, 'to': 'zh', 'type': 2, 'phonetic': [{'src_str': '蜘', 'trg_str': 'zhī'}, {'src_str': '蛛', 'trg_str': 'zhū'}]}, 'dict_result': {'edict': {'item': [{'tr_group': [{'tr': ['predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'they spin silk to make cocoons for eggs or traps for prey'], 'example': [], 'similar_word': []}, {'tr': ['a skillet made of cast iron'], 'example': [], 'similar_word': []}, {'tr': ['a computer program that prowls the internet looking for publicly accessible resources that can be added to a database', 'the database can then be searched with a search engine'], 'example': [], 'similar_word': ['wanderer']}], 'pos': 'noun'}], 'word': 'spider'}, ... 'logid': 1561472443}
![screenshot](https://1000bd.com/contentImg/2022/08/16/094634686.png)
![screenshot](https://1000bd.com/contentImg/2022/08/16/094634902.png)
Example
import urllib.request
import urllib.parse
url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=0&limit=20"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
request = urllib.request.Request(url = url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
# print(content)
# Be sure to specify the encoding here: on Windows open() defaults to gbk, and writing this utf-8 content without it raises an error
fp = open(r"C:\pycharm\pythonProject\new_project\spider\data\douban01.json","w",encoding="utf-8")
fp.write(content)
fp.close()
import urllib.request
import urllib.parse
def getpage(start_page,end_page):
    for i in range(start_page,end_page+1):
        url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start="+str((i-1)*20)+"&limit=20"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
        }
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request)
        content = response.read().decode("utf-8")
        # print(content)
        fp = open(r"C:\pycharm\pythonProject\new_project\spider\data\douban0_10.json", "a", encoding="utf-8")
        fp.write(content+"\n")
        print(url)
        print(content)
        fp.close()
getpage(1,10)
Wrong approach
import urllib.request
import urllib.parse
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
data = {
'cname': '北京',
'pid': '',
'pageIndex': 1,
'pageSize': 10
}
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '53',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'route-cell=ksa; ASP.NET_SessionId=0jnf0d302cprllwokftehbyc; Hm_lvt_1039f1218e57655b6677f30913227148=1658151268; Hm_lpvt_1039f1218e57655b6677f30913227148=1658151268; Hm_lvt_5fd8501a4e4e0eddf0c4596de7bd57ab=1658151268; Hm_lpvt_5fd8501a4e4e0eddf0c4596de7bd57ab=1658151268; SERVERID=d89132ef078a81052172981622f19954|1658151309|1658151267',
'Host': 'www.kfc.com.cn',
'Origin': 'http://www.kfc.com.cn',
'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
data = urllib.parse.urlencode(data).encode("utf-8")
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
html = response.read()
print(html)  # starts with b'\x1f\x8b\x08', i.e. gzip-compressed bytes
content = html.decode("utf-8")
print(content)
#UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
"b’\x1f\x8b\x08"开头的 ,说明它是gzip压缩过的数据,这也是报错的原因,所以我们需要对我们接收的字节码进行一个gzip解码操作。正确示范
import urllib.request
import urllib.parse
from io import BytesIO
import gzip
url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
data = {
'cname': '北京',
'pid': '',
'pageIndex': 1,
'pageSize': 10
}
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Connection': 'keep-alive',
'Content-Length': '53',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie': 'route-cell=ksa; ASP.NET_SessionId=0jnf0d302cprllwokftehbyc; Hm_lvt_1039f1218e57655b6677f30913227148=1658151268; Hm_lpvt_1039f1218e57655b6677f30913227148=1658151268; Hm_lvt_5fd8501a4e4e0eddf0c4596de7bd57ab=1658151268; Hm_lpvt_5fd8501a4e4e0eddf0c4596de7bd57ab=1658151268; SERVERID=d89132ef078a81052172981622f19954|1658151309|1658151267',
'Host': 'www.kfc.com.cn',
'Origin': 'http://www.kfc.com.cn',
'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest'}
data = urllib.parse.urlencode(data).encode("utf-8")
request = urllib.request.Request(url=url, data=data, headers=headers)
response = urllib.request.urlopen(request)
html = response.read()
buff = BytesIO(html)
f = gzip.GzipFile(fileobj=buff)
content = f.read().decode("utf-8")
print(content)
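Equivalently, the BytesIO + GzipFile pair can be replaced by gzip.decompress(), which works directly on the raw bytes (a small alternative sketch, assuming html holds the response body as above):

import gzip

# Decompress the gzip-compressed response body in one call
content = gzip.decompress(html).decode("utf-8")
print(content)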
1. The HTTPError class is a subclass of URLError.
2. Both live in urllib.error: urllib.error.HTTPError and urllib.error.URLError.
3. An HTTP error is the extra error information returned when the browser cannot get the page from the server; it tells the visitor what went wrong with that page.
4. A request sent through urllib may fail. To make your code more robust, wrap it in try/except; the two exception types to catch are URLError and HTTPError.
Wrong approach
import urllib.request
import urllib.parse
# Suppose a character was mistyped, so the address does not exist
url = "http://www.aaaa11111.com"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
print(content)
#urllib.error.URLError:
Correct approach
import urllib.request
import urllib.parse
# Suppose a character was mistyped, so the address does not exist
url = "http://www.aaaa11111.com"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode("utf-8")
    print(content)
except urllib.error.URLError:
    print("The address is probably wrong")
# C:\python\python.exe C:/pycharm/pythonProject/new_project/spider/test11_异常处理.py
# The address is probably wrong
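URLError covers failures such as a host that cannot be reached; when the server is reachable but answers with an error status (404, 500, ...), urlopen raises HTTPError instead. A small sketch of catching both, using a hypothetical path that does not exist on an otherwise valid site:

import urllib.request
import urllib.error

url = "https://www.baidu.com/this-path-does-not-exist"  # hypothetical non-existent path
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
try:
    request = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(request)
    print(response.read().decode("utf-8"))
except urllib.error.HTTPError as e:
    # HTTPError is the subclass, so catch it first
    print("HTTP error, status code:", e.code)
except urllib.error.URLError as e:
    print("URL error, reason:", e.reason)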
import urllib.request
import urllib.parse
import json
url = "https://weibo.com/ajax/statuses/mymblog?uid=6799431174&page=1&feature=0"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36",
'cookie': 'XSRF-TOKEN=OIn3Ns2cVcwI6p28midBKxCk; WBPSESS=2jFn3n4I-3CwFbURoTaQu1rhpdQSQMxfCEVxWgOF9SBbXdIJ1MgnCaEtj3NUzRClSFjW3odMB1HLdC2FFtjAOw0bvkQHL9MwF4_W1-heh7Je-Z8yr8p6nkqXOLKLsghULu3S129LvAk-DbBr7tUzPpdT9esm8jFdkkPW7muQxjo=; PC_TOKEN=a9d0e8d79e; login_sid_t=7aedc03d933e4ab67fd6651a90ee1dfd; cross_origin_proto=SSL; WBStorage=4d96c54e|undefined; _s_tentry=weibo.com; Apache=4140398740794.2466.1658206348654; SINAGLOBAL=4140398740794.2466.1658206348654; ULV=1658206348657:1:1:1:4140398740794.2466.1658206348654:; wb_view_log=1280*7201.5; SUB=_2A25P0kiVDeRhGeBJ4lsV8y_NzDiIHXVspj1drDV8PUNbmtANLRPlkW9NRjjCAUMp0SGd7xz8lt2ZhFCD8zzwujyr; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFVLT-1i-_FzzCauspwFvZK5JpX5KzhUgL.FoqN1K.Xe02pS0B2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMcS0.4ShepeKMX; ALF=1689742405; SSOLoginState=1658206405; wvr=6; wb_view_log_6799431174=1280*7201.5; webim_unReadCount=%7B%22time%22%3A1658206577505%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D',
'referer': 'https://weibo.com/p/1005056799431174/home?from=page_100505&mod=TAB'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
# print(response.read())
content = response.read().decode("utf-8")
print(content)
Why learn about handlers?
urlopen() on its own cannot be customized with things like proxies or cookie handling; building an opener out of handler objects is how those capabilities are added.
Example
import urllib.request
import urllib.parse
import json
url = "http://www.baidu.com"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
request = urllib.request.Request(url=url,headers=headers)
# (1) Get an HTTPHandler object
handler = urllib.request.HTTPHandler()
# (2) Build an opener from the handler, then open the request with it
opener = urllib.request.build_opener(handler)
response = opener.open(request)
print(response.read().decode("utf-8"))
Common uses of a proxy? Mainly hiding the client's real IP and getting around IP-based blocks or rate limits.
Configuring a proxy in code:
Example
import urllib.request
import urllib.parse
import json
url = "http://www.baidu.com"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
request = urllib.request.Request(url=url,headers=headers)
proxies = {'http':'58.220.95.34:10174'}
# Create a ProxyHandler from the proxies dict
handler = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
print(response.read().decode("utf-8"))
Example
import urllib.request
import urllib.parse
import json
import random
url = "https://www.baidu.com/s?wd=ip"
headers = {
"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1",
'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8'
}
request = urllib.request.Request(url=url, headers=headers)
proxies_pool = [
{'http': '58.220.95.34:10174'},
{'http': '112.25.236.167:9091'},
{'http': '183.222.217.168:9091'},
{'http':'111.72.218.180:9091'}
]
proxies = random.choice(proxies_pool)
print(proxies)
# Build the handler and opener with the chosen proxy
handler = urllib.request.ProxyHandler(proxies = proxies)
opener = urllib.request.build_opener(handler)
response = opener.open(request)
content = response.read().decode("utf-8")
print(content)
fp = open(r"C:\pycharm\pythonProject\new_project\spider\data\ip.html","w",encoding="utf-8")
fp.write(content)
Using xpath:
Syntax:
1. Path queries (/ for children, // for any descendant)
2. Predicate queries (e.g. //li[@id='a1'])
3. Attribute queries (e.g. //li/@class)
4. Fuzzy queries (contains(), starts-with())
5. Content queries (text())
6. Logical operators (and, |)
Example (categories 4-6 are exercised in the extra queries right after it):
from lxml import etree
tree = etree.parse(r"C:\pycharm\pythonProject\new_project\spider\data\xpath_test01.html")
li_list = tree.xpath("//ul/li/text()")
print(li_list) #['北京', '上海', '武汉', '深圳']
li_list = tree.xpath("//ul/li[@id='a1']/text()")
print(li_list) #['北京']
li_list = tree.xpath("//ul/li[@id='a1']/@class")
print(li_list) #['b1']
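The fuzzy, content and logical query types from the list above can be tried on the same tree object; a small sketch continuing the example (the output comments are what the test file listed below should produce):

# Fuzzy query: li elements whose id contains "a"
li_list = tree.xpath("//ul/li[contains(@id,'a')]/text()")
print(li_list)  # ['北京']
# Fuzzy query: li elements whose id starts with "a"
li_list = tree.xpath("//ul/li[starts-with(@id,'a')]/text()")
print(li_list)  # ['北京']
# Logical query: both conditions must hold
li_list = tree.xpath("//ul/li[@id='a1' and @class='b1']/text()")
print(li_list)  # ['北京']
# | merges the results of two separate expressions
li_list = tree.xpath("//ul/li/@id | //ul/li/@class")
print(li_list)  # e.g. ['a1', 'b1']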
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8"/>
    <title>Title</title>
</head>
<body>
    <ul>
        <li id="a1" class="b1">北京</li>
        <li>上海</li>
        <li>武汉</li>
        <li>深圳</li>
    </ul>
</body>
</html>
Example
from lxml import etree
import urllib.request
import urllib.parse
url = "https://www.baidu.com/"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
request = urllib.request.Request(url= url,headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
print(content)
tree = etree.HTML(content)
result_list = tree.xpath('//input[@class = "bg s_btn"]/@value')
print(result_list) #['百度一下']
Example:
from lxml import etree
import urllib.request
import urllib.parse
def create_request(page):
    if(page ==1):
        url = "https://sc.chinaz.com/tupian/weimeiyijingtupian.html"
    else:
        url = "https://sc.chinaz.com/tupian/weimeiyijingtupian_"+str(page)+".html"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
    }
    request = urllib.request.Request(url = url,headers=headers)
    return request

def get_content(request):
    response = urllib.request.urlopen(request)
    content = response.read().decode("utf-8")
    return content

def down_load(content):
    tree = etree.HTML(content)
    src_list = tree.xpath('//div[@id = "container"]//img/@src')
    name_list = tree.xpath('//div[@id = "container"]//img/@alt')
    for num in range(len(name_list)):
        # print(name_list[num])
        url = "https:"+src_list[num]
        name = name_list[num]
        urllib.request.urlretrieve(url = url,filename="C:\pycharm\pythonProject\\new_project\spider\data\img\\"+name+".jpg")

if __name__ == '__main__':
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))
    for page in range(start_page,end_page+1):
        # Build the request
        request = create_request(page)
        # Fetch the page source
        content = get_content(request)
        # Download the images on that page
        down_load(content)
jsonpath is for querying json data; it is a bit handier than plain json indexing because one path expression can pull out every matching item regardless of where it sits in the structure.
Note: install it with pip install jsonpath; it works on Python objects that have already been loaded with json.load()/json.loads() (typically from a local file). The examples below query this bookstore document, saved locally as test.json:
{
"store": {
"book": [{
"category": "reference",
"author": "Nigel Rees",
"title": "Sayings of the Century",
"price": 8.95
},
{
"category": "fiction",
"author": "Evelyn Waugh",
"title": "Sword of Honour",
"price": 12.99
},
{
"category": "fiction",
"author": "Herman Melville",
"title": "Moby Dick",
"isbn": "0-553-21311-3",
"price": 8.99
},
{
"category": "fiction",
"author": "J. R. R. Tolkien",
"title": "The Lord of the Rings",
"isbn": "0-395-19395-8",
"price": 22.99
}
],
"bicycle": {
"color": "red",
"price": 19.95
}
}
}
| XPath | JSONPath | Description |
|---|---|---|
| / | $ | the root element |
| . | @ | the current element |
| / | . or [] | child element |
| .. | n/a | parent element |
| // | .. | recursive descent (JSONPath borrowed this from E4X) |
| * | * | wildcard: all elements |
| @ | n/a | attribute access |
| [] | [] | subscript operator |
| \| | [,] | union: in XPath it merges node sets; JSONPath allows alternate names or array indices |
| n/a | [start:end:step] | array slice, borrowed from ES4 |
| [] | ?() | applies a filter expression |
| n/a | () | script expression, evaluated by the underlying engine |
| () | n/a | grouping in XPath |
Example:
import jsonpath
import json
file_name = open(r"C:\pycharm\pythonProject\new_project\spider\data\test.json","r",encoding="utf-8")
json_value= json.loads(file_name.read())
# Get the titles of all books
# book_list = jsonpath.jsonpath(json_value,"$.store.book[*].title")
# print(book_list) #['Sayings of the Century', 'Sword of Honour', 'Moby Dick', 'The Lord of the Rings']
# All elements of store: all the books and the bicycle
# store_element_list = jsonpath.jsonpath(json_value,"$.store.*")
# print(store_element_list)
# The book at index 3 (the fourth one)
# secondary_book = jsonpath.jsonpath(json_value,"$..book[3]")
# print(secondary_book)
# The last book
# last_book = jsonpath.jsonpath(json_value,"$..book[(@.length-1)]")
# print(last_book)
# The first two books
# books = jsonpath.jsonpath(json_value,"$..book[:2]")
# print(books)
# Filter out the books that have an isbn
# books = jsonpath.jsonpath(json_value,"$..book[?(@.isbn)]")
# print(books)
# Filter out the books cheaper than 10
# books = jsonpath.jsonpath(json_value,"$..book[?(@.price<10)]")
# print(books)
# All elements
elements = jsonpath.jsonpath(json_value,"$.*")
print(elements)
| XPath | JSONPath | Result |
|---|---|---|
| /store/book/author | $.store.book[*].author | the authors of all books in the store |
| //author | $..author | all authors |
| /store/* | $.store.* | all elements of store: all the books and the bicycle |
| /store//price | $.store..price | the price of everything in the store |
| //book[3] | $..book[2] | the third book |
| //book[last()] | $..book[(@.length-1)] | the last book |
| //book[position()<3] | $..book[0,1] or $..book[:2] | the first two books |
| //book[isbn] | $..book[?(@.isbn)] | all books that have an isbn |
| //book[price<10] | $..book[?(@.price<10)] | all books cheaper than 10 |
| //* | $..* | all elements |
Example
import urllib.request
import urllib.parse
import json
import jsonpath
url = "https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1658321480664_108&jsoncallback=jsonp109&action=cityAction&n_s=new&event_submit_doGetAllRegion=true"
headers = {
# 'cookie': 't=45a6a81b350f4f87534bdc0e5baabba4; cookie2=19b380cb8e13362d5cec24b9bd45f24e; v=0; _tb_token_=53ebb070703e3; cna=wwDHGrqnP0ECAdy9+WJsBaL5; xlly_s=1; l=eBaA_KPILWVU6o1aBO5Courza77TNIRb4CVzaNbMiInca6sRtFTKeNCHEDOWSdtxgtCfYetPUcfRbdLHR3xg5c0c07kqm05j3xvO.; tfstk=c2ePBeqPjTBPzWNagxMFATP0q5cRaFhn_KoKZNadIEPpkdlo7sVkJmV1Rmo90ecl.; isg=BAwM2IV4goiYkZb07LUnSJqv3Wo-RbDvWVS3OmbNs7da8az7jlTOf0dPkPlJluhH',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
, 'referer': 'https://dianying.taobao.com/'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
json_obj = content.replace("jsonp109(", "").replace(");", "")
print(json_obj)
out_file = open(r"C:\pycharm\pythonProject\new_project\spider\data\json01.json", "w", encoding="utf-8")
out_file.write(json_obj)
in_file = open(
r"C:\pycharm\pythonProject\new_project\spider\data\json01.json", "r", encoding="utf-8"
)
json_obj = json.loads(in_file.read())
print(json_obj)
city_list = jsonpath.jsonpath(json_obj, "$..regionName")
print(city_list)
1. Install: pip install bs4
2. Import: from bs4 import BeautifulSoup
3. Create the object:
from a server response: soup = BeautifulSoup(response.read().decode(), 'lxml')
from a local file: soup = BeautifulSoup(open(file_path, encoding="utf-8"), 'lxml')
Note: on Windows open() defaults to the gbk encoding, so specify the encoding explicitly.
1. Find nodes by tag name
2. Functions
(1) .find() (returns one object)
(2) .find_all() (returns a list)
    find_all('a')  finds all a tags
    find_all(['a', 'span'])  returns all a and span tags
    find_all('a', limit=2)  only the first two a tags
.select() (returns the node objects matching a CSS selector) [recommended]
    1. element
    2. .class
    3. #id
    4. attribute selectors
       [attribute]
       e.g. li = soup.select('li[class]')
       [attribute=value]
       e.g. li = soup.select('li[class="hengheng1"]')
    5. hierarchy selectors
       element element (descendants)
       div p
       element>element (direct children)
       div>p
       element,element (several node types)
       div,p
6. Node information
(1) Getting node text (works for tags that nest other tags):
    obj.string
    obj.get_text()  [recommended]
(2) Node attributes:
    tag.name  gets the tag name
      e.g. tag = find('li')
           print(tag.name)
    tag.attrs  returns the attributes as a dict
(3) Getting a single node attribute:
    obj.attrs.get('title')  [common]
    obj.get('title')
    obj['title']
Test page (bs4_test.html):
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div>
        <ul>
            <li id="a1" class="b1">北京</li>
            <li>上海</li>
            <li id="a4">武汉</li>
            <li>深圳</li>
        </ul>
    </div>
    <a href="" id="a2" class="hh"> 百度</a>
    <div>
        <ul>
            <li> 嘉善</li>
        </ul>
        <span>
            <a href="" class="a3" id="mm"></a>
        </span>
    </div>
</body>
</html>
Code
from bs4 import BeautifulSoup
soup = BeautifulSoup(open(r"C:\pycharm\pythonProject\new_project\spider\data\bs4_test.html",encoding="utf-8"),"lxml")
# Return the first a tag
# result = soup.find("a")
# print(result)
# Return the first a tag whose class attribute is "hh"
# result = soup.find("a",class_ = "hh")
# print(result)
# Return all matching tags
# result = soup.find_all('a')
# Return several kinds of tags
# result = soup.find_all(["a","li"])
# Limit how many are returned
# result = soup.find_all("a",limit=1)
# select: get the matching nodes with a CSS selector
# result = soup.select("a")
# A leading . means class (a class selector): select the tags with class="hh"
# result = soup.select(".hh")
# A leading # means id: select the tag with id="a2"
# result = soup.select("#a2")
# Attribute selector: select the a tags that have a class attribute
# result = soup.select("a[class]")
# Select the a tags whose class attribute equals "a3"
# result = soup.select("a[class = 'a3']")
# Hierarchy selector (a space means descendants): get the li descendants of div
# result = soup.select("div li")
# (> means direct children): get the ul children of every div
# result = soup.select("div>ul")
# Get the div and ul nodes
# result = soup.select("div,ul")
# Get the attributes of the first a tag as a dict
# result = soup.select("a")[0].attrs
# Get the text of the first a tag
# result = soup.select("a")[0].string
# Get the text of the first a tag (recommended)
# result = soup.select("a")[0].get_text()
# Get the tag name
# result = soup.select("a")[0].name
# Get the class attribute of the first a tag
result = soup.select("a")[0].get("class")
print(result)
Example:
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
url = "https://www.starbucks.com.cn/menu/"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode("utf-8")
soup = BeautifulSoup(content, "lxml")
result_list = soup.select("a strong")
# //a/strong/text()
for result in result_list:
    name = result.string
    print(name)
1. What is selenium?
1. Selenium is a tool for testing web applications.
2. Selenium tests run directly in the browser, just as a real user would operate it.
3. It can drive real browsers through various drivers (FirefoxDriver, InternetExplorerDriver, OperaDriver, ChromeDriver).
4. Selenium also supports headless (no-UI) browsers.
2. Why use selenium?
Because it drives a real browser, selenium executes JavaScript and can fetch dynamically rendered content that plain urllib/requests cannot see.
3. How to install selenium?
pip install selenium, then download the chromedriver that matches your Chrome version and note its path.
Steps for using Selenium:
1. (new versions) Import: from selenium import webdriver
2. Create a Chrome browser object: browser = webdriver.Chrome(path_to_chromedriver)
3. Open a url:
url = the address to visit
browser.get(url)
from selenium import webdriver
# Path to the chromedriver executable
path = r"C:\pycharm\pythonProject\new_project\spider\driver\chromedriver.exe"
# Create the Chrome browser object
browser = webdriver.Chrome(path)
# Set the url and open it
url = "https://www.baidu.com"
browser.get(url)
4. Locating elements with selenium
# Locating elements:
# Automation means simulating mouse and keyboard actions on page elements (clicking, typing, and so on).
# Before an element can be operated on it has to be found, and WebDriver provides many ways to locate elements.
# Methods:
1. find_element() / find_elements()
# Get the tag with id="su"
eg:button = browser.find_element(by=By.ID,value="su")
# Get the tag with name="wd"
eg:name = browser.find_element(by=By.NAME,value="wd")
# Get a tag by xpath
# All of selenium's locating methods go through find_element; as the name says, it finds web elements, not bare text nodes
eg:xpath1 = browser.find_element(by=By.XPATH, value=r"//input[@id='su']")
# Get a tag by tag name
eg:names = browser.find_element(by=By.TAG_NAME,value="input")
# Get a tag with a CSS selector
eg:my_input = browser.find_element(by=By.CSS_SELECTOR,value='#kw')
# Get a link by its text
eg:browser.find_element(by=By.LINK_TEXT,value="新闻")
# Example ============================================================================
from selenium import webdriver
from selenium.webdriver.common.by import By
# Path to the chromedriver executable
path = r"C:\pycharm\pythonProject\new_project\spider\driver\chromedriver.exe"
# Create the Chrome browser object
browser = webdriver.Chrome(path)
# Set the url
# Once opened, do not close the browser window until the script finishes
url = "https://www.baidu.com"
browser.get(url)
# Get the page source
# web_page = browser.page_source
# print(web_page)
# Find by id
# result= browser.find_element(by=By.ID,value="su")
# Find the tag with name="wd"
# result = browser.find_element(by=By.NAME,value="wd")
# Find a tag by xpath
# result = browser.find_element(by=By.XPATH, value=r"//input[@id='su']")
# Find a tag by tag name
# result = browser.find_element(by=By.TAG_NAME,value="input")
# Find a tag with a CSS selector
# result = browser.find_element(by=By.CSS_SELECTOR,value='#kw')
# Find a link by its text
# result = browser.find_element(by=By.LINK_TEXT,value="新闻")
# 4-2: Accessing element information
# These only work on elements returned by find_element()/find_elements()
# An xpath expression alone does not give you the text; read it from the returned element
Get an element attribute: .get_attribute('class')
Get the element text: .text
Get the tag name: .tag_name
# Example ----------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.by import By
# Path to the chromedriver executable
path = r"C:\pycharm\pythonProject\new_project\spider\driver\chromedriver.exe"
# Create the Chrome browser object
browser = webdriver.Chrome(path)
# Set the url
# Once opened, do not close the browser window until the script finishes
url = "https://www.baidu.com"
browser.get(url)
# Get the page source
# web_page = browser.page_source
# print(web_page)
windows_List = browser.find_elements(by=By.XPATH,value=r'//div/a[@target="_blank"]')
for window in windows_List:
    print(window.tag_name)
    print(window.text)
    print(window.get_attribute("class"))
    print("--------------------")
# 4-3: Interaction
Click: click()
Type text: send_keys()
Go back: browser.back()
Go forward: browser.forward()
Scroll with JS:
    js='document.documentElement.scrollTop=100000'
    browser.execute_script(js)  runs the js code
Get the page source: page_source
Quit: browser.quit()
Example:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time
# Path to the chromedriver executable
path = r"C:\pycharm\pythonProject\new_project\spider\driver\chromedriver.exe"
# Create the Chrome browser object
browser = webdriver.Chrome(path)
# Set the url
# Once opened, do not close the browser window until the script finishes
url = "https://www.baidu.com"
browser.get(url)
# browser.find_element(by=By.ID)
# Get the search box by its id
input_box = browser.find_element(by= By.ID,value="kw")
# Type the search term into the box
input_box.send_keys("lisa")
# Get the 百度一下 button by its id
button = browser.find_element(by= By.ID,value="su")
# Click the button
button.click()
print(browser.page_source)
time.sleep(2)
# In real projects the page is usually scrolled down with a bit of js like this
js='document.documentElement.scrollTop=100000'
browser.execute_script(js)
# print(browser.page_source)
time.sleep(2)
# Get the next-page button by xpath
next_button = browser.find_element(by=By.XPATH,value='//div/a[@class="n"]')
print(next_button)
# Click next page
next_button.click()
time.sleep(2)
# Go back to the previous page
browser.back()
time.sleep(2)
# Go forward again
browser.forward()
time.sleep(2)
# Close the browser
browser.quit()
Chrome headless mode was added by Google in Chrome 59: it lets you use Chrome without opening a UI window, so the behaviour stays exactly the same as normal Chrome.
1. Requirements:
Chrome
    Unix/Linux: chrome >= 59
    Windows: chrome >= 60
Python >= 3.6
Selenium >= 3.4.*
ChromeDriver >= 2.31
Put chromedriver.exe under C:\python (or whatever path your code points to)
Setup code:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)
browser.get('http://www.baidu.com/')
Wrapping the setup in a helper:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def share_browser():
    options = webdriver.ChromeOptions()
    # Add the headless flag
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    return browser
Calling the helper:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def share_browser():
    options = webdriver.ChromeOptions()
    # Add the headless flag
    options.add_argument('--headless')
    browser = webdriver.Chrome(options=options)
    return browser
browser = share_browser()
browser.get('http://www.baidu.com/')
browser.get_screenshot_as_file("百度.png")
1. Documentation:
Official docs: http://cn.python-requests.org/zh_CN/latest/
Quickstart: http://cn.python-requests.org/zh_CN/latest/user/quickstart.html
2. Install: pip install requests
3. Attributes and type of the response object
Type: requests.models.Response
r.text: page source as text
r.encoding: read or set the encoding
r.url: the url that was requested
r.content: the response body as bytes
r.status_code: the response status code
r.headers: the response headers
requests.get()
eg: import requests
url = 'http://www.baidu.com/s?'
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36' }
data = { 'wd':'北京' }
response = requests.get(url,params=data,headers=headers)
Example:
import requests
url = "https://www.baidu.com/s"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
get_data = {
'wd': '北京'
}
# Send the request; the return type is Response
response = requests.get(url=url, params=get_data, headers=headers)
# Response text
print(response.text)
# Response status code
print(response.status_code)
# The url that was requested
print(response.url) # https://www.baidu.com/s?wd=%E5%8C%97%E4%BA%AC
# The response body as bytes
print(response.content)
# The response headers
print(response.headers)
# The site's encoding
print(response.encoding)
# Extra parameters are passed with params
# They do not need to be urlencoded by hand
# No Request object customization is needed
# The trailing ? on the resource path is optional
requests.post()
Baidu translate:
eg: import requests
post_url = 'http://fanyi.baidu.com/sug'
headers={ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36' }
data = { 'kw': 'eye' }
r = requests.post(url = post_url,headers=headers,data=data)
Example:
import requests
url = "https://fanyi.baidu.com/sug"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
}
post_data = {
'kw': 'spider'
}
response = requests.post(url=url, data=post_data, headers=headers)
print(type(response))
# Response text
print(response.text)
# A json string comes back; parse it with the json module
import json
json_text = json.loads(response.text)
print(json_text)
# Response status code
print(response.status_code)
# The url that was requested
print(response.url)
# The response body as bytes
print(response.content)
# The response headers
print(response.headers)
# The site's encoding
print(response.encoding)
Proxy configuration
Set the proxies parameter on the request; it is a dict.
eg:
import requests
url = 'http://www.baidu.com/s?'
headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36' }
data = { 'wd':'ip' }
proxy = { 'http':'219.149.59.250:9797' }
r = requests.get(url=url,params=data,headers=headers,proxies=proxy)
Example:
import requests
url = "https://www.baidu.com/s"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
get_data = {
'wd': 'ip'
}
# Note: dict keys must be unique, so two 'http' entries would silently collapse into one;
# keep a single proxy per scheme
proxies = {
    'http': '58.220.95.32:10174'
    # 'http': '116.253.208.239:33080'
}
# Send the request; the return type is Response
response = requests.get(url=url, params=get_data, headers=headers,proxies=proxies)
print(response.text)
8. Cookie handling
Use cases:
(1) gushiwen.cn (the login form requires a captcha)
(2) a captcha-recognition platform (云打码)
    user login action
    developer login action
# requests.session() returns a session object that keeps cookies across requests,
# so a login done through the session stays in effect for later requests
session = requests.session()
response_2 = session.get(url)
print(response_2.text)
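A minimal sketch of the usual session-based login flow; the urls, form field names, and credentials below are placeholders for illustration, not the real gushiwen parameters:

import requests

# Hypothetical login flow: all urls and form field names are placeholders
login_url = "https://example.com/login"          # placeholder login endpoint
profile_url = "https://example.com/my/profile"   # placeholder page that needs a login
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
login_data = {
    "email": "user@example.com",   # placeholder credentials
    "password": "secret",
}

session = requests.session()
# The session stores the cookies set by the login response...
session.post(login_url, data=login_data, headers=headers)
# ...and sends them automatically on every later request made through the same session
response_2 = session.get(profile_url, headers=headers)
print(response_2.text)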