目录
from http.cookiejar import MozillaCookieJar
from urllib.request import HTTPCookieProcessor, Request, build_opener

from fake_useragent import UserAgent

url = 'https://www.baidu.com/s?wd=python'
headers = {
    'User-Agent': UserAgent().chrome,
}


def let_cookie_to_file():
    """Request ``url`` and save the cookies received to ``cookie.txt``.

    Prints the HTTP status code of the response.
    """
    # The jar is filled in by HTTPCookieProcessor while the request is handled.
    cookie = MozillaCookieJar()
    req = Request(url=url, headers=headers)
    opener = build_opener(HTTPCookieProcessor(cookiejar=cookie))
    # Use a context manager so the underlying connection is always closed.
    with opener.open(req) as resp:
        status = resp.getcode()
    # ignore_discard / ignore_expires: also save cookies that are marked to be
    # discarded or that have already expired.
    cookie.save('cookie.txt', ignore_discard=True, ignore_expires=True)
    print(status)


def get_cookie_from_file():
    """Load cookies from ``cookie.txt`` and send them along with a request to ``url``.

    Prints the HTTP status code of the response. ``cookie.txt`` must already
    exist (run ``let_cookie_to_file()`` first to create it).
    """
    cookie = MozillaCookieJar()
    cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    req = Request(url=url, headers=headers)
    opener = build_opener(HTTPCookieProcessor(cookiejar=cookie))
    # Use a context manager so the underlying connection is always closed.
    with opener.open(req) as resp:
        print(resp.getcode())


if __name__ == '__main__':
    # let_cookie_to_file()
    get_cookie_from_file()
URLError本质上是一个异常类。产生URLError的原因有:主机没有联网,服务器不存在,找不到服务器(实际上服务器存在)等。
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

from fake_useragent import UserAgent

# Two sample targets; the second assignment overrides the first.
# Swap or remove a line to try the other URL.
url = 'http://gzsx.cooco.net.cn/tweest/'
url = 'http://jccndk.com'
headers = {
    'User-Agent': UserAgent().chrome,
}
req = Request(url=url, headers=headers)
try:
    # The context manager closes the response even if read()/decode() fails.
    with urlopen(req) as resp:
        print(resp.read().decode())
except HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be handled first.
    # Report the real status code instead of assuming it is always 404.
    print(e)
    print('错误{}'.format(e.code))
except URLError as e:
    print(e)
    # e.reason may be another exception (with an errno) or a plain message
    # string (without one), so read errno defensively.
    print(getattr(e.reason, 'errno', None))
由于requests库是第三方库,需要pip安装。
例子1:
import requests
from fake_useragent import UserAgent

headers = {
    'User-Agent': UserAgent().chrome,
}


def no_args():
    """Send a GET request without query parameters or custom headers and print the body."""
    response = requests.get(url='https://hao.360.com/?h_lnk')
    print(response.text)


def has_args(args: dict):
    """Send a GET request with ``args`` as the query parameters and print the body.

    The module-level ``headers`` are sent with the request.
    """
    response = requests.get(
        url='https://www.baidu.com/?',
        params=args,
        headers=headers,
    )
    print(response.text)


if __name__ == '__main__':
    # no_args()
    has_args({'wd': 'python'})
例子1:
import requests
from fake_useragent import UserAgent

headers = {
    'User-Agent': UserAgent().chrome,
}

args = {
    'searchword': '卫生'
}


def get():
    """POST the module-level ``args`` as the request data and print the response body.

    Despite its name, this function issues a POST request.
    """
    endpoint = 'https://www.21wecan.com/rcwjs/searchlist.jsp'
    reply = requests.post(endpoint, data=args, headers=headers)
    print(reply.text)


if __name__ == '__main__':
    get()
例子1:
import requests
from fake_useragent import UserAgent

url = 'http://httpbin.org/get'
headers = {'User-Agent': UserAgent().chrome}
# Proxy table passed to requests via ``proxies=``.
# Entry format: 'type': 'type://username:password@ip:port'
proxy = {'http': 'http://183.239.38.216:9091'}

# Send the request through the proxy and print the response body.
resp = requests.get(url, headers=headers, proxies=proxy)
print(resp.text)
timeout 参数用于设置等待服务器响应的最长时间(单位:秒);超过这个时间仍未收到响应,则会抛出 requests.exceptions.Timeout 异常。
resp = session.get(url=url, headers=headers, params={'wd': 'python'},timeout=5)
session的作用是保持一个持续的会话,在内存当中记录一个网址的cookie,以供后续代码使用。
import requests

# A Session object is reused for the request below.
s = requests.Session()
# Issue a GET request through the session to a URL that sets a cookie,
# then print the response body.
resp = s.get(url='http://httpbin.org/cookies/set/sessioncookie/123456789')
print(resp.text)
import warnings

import requests
from requests.packages import urllib3

# Step 1: silence the warning that an unverified HTTPS request triggers.
# Only InsecureRequestWarning is ignored; a blanket "ignore" filter would
# also hide unrelated warnings (deprecations, resource warnings, ...).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Equivalent filter using the standard library directly.
warnings.filterwarnings(
    "ignore", category=urllib3.exceptions.InsecureRequestWarning
)

# Step 2: skip certificate verification with verify=False.
# NOTE(security): this disables TLS certificate checks and exposes the
# request to man-in-the-middle attacks; use it for demos/testing only.
res = requests.get(url="https://www.12306.cn", verify=False)
print(res.content.decode("utf-8"))


s = requests.Session()
# Issue a GET request through the session to a URL that sets a cookie.
resp = s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')
print(resp.text)
获取并保存cookie信息,需要使用response.cookies属性,它是一个类似字典的RequestsCookieJar对象。