Python urllib self-study notes (3): 1. using urlencode for multi-parameter requests; 2. Baidu Translate POST request (POST data cannot be appended to the URL); 3. fetching the full Baidu Translate response; 4. Douban GET requests


    1. Using urlencode for a request with multiple parameters

    # Using urlencode for a request with multiple parameters
    # Target page: https://www.baidu.com/s?wd=周杰伦&sex=男
    import urllib.request
    import urllib.parse

    base_url = 'https://www.baidu.com/s?'
    data = {
        'wd': '周杰伦',
        'sex': '男',
        'sing': '歌曲'
    }
    # urlencode percent-encodes every key and value and joins the pairs with &
    new_data = urllib.parse.urlencode(data)
    url = base_url + new_data
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36 Edg/116.0.0.0"}
    # Build the request with a User-Agent header to get past basic anti-crawling checks
    request = urllib.request.Request(url=url, headers=headers)
    # Send the request to the server
    response = urllib.request.urlopen(request)
    # Read the response body, i.e. the page source
    content = response.read().decode("utf-8")
    print(content)
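
    For comparison, here is a small sketch (my addition, not from the original post) of the difference between quote and urlencode in urllib.parse: quote percent-encodes a single string, while urlencode encodes a whole dict of parameters and joins the pairs with &.

    # quote: encode one value; urlencode: encode a whole dict of parameters
    import urllib.parse

    print(urllib.parse.quote('周杰伦'))
    # -> %E5%91%A8%E6%9D%B0%E4%BC%A6
    print(urllib.parse.urlencode({'wd': '周杰伦', 'sex': '男'}))
    # -> wd=%E5%91%A8%E6%9D%B0%E4%BC%A6&sex=%E7%94%B7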

    2. Baidu Translate POST request (POST parameters cannot be appended to the URL)

    # With a POST request, the encoded data is passed through the data argument of Request:
    request = urllib.request.Request(url=url, headers=headers, data=data)
    
    # Baidu Translate: POST request
    import urllib.request
    import urllib.parse
    import json

    # Endpoint found in the browser's F12 Network panel
    url = 'https://fanyi.baidu.com/sug'
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36 Edg/116.0.0.0"}
    # POST form data
    data = {
        'kw': 'spider'
    }
    # Encode the form data, then convert the string to UTF-8 bytes
    data = urllib.parse.urlencode(data).encode('utf-8')
    # POST data cannot be appended to the URL, so it goes into the Request via the data argument
    request = urllib.request.Request(url=url, headers=headers, data=data)
    # Send the request to the server
    response = urllib.request.urlopen(request)
    # Read the response body
    content = response.read().decode("utf-8")
    # Parse the JSON string into a Python object
    obj = json.loads(content)
    print(obj)
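
    If the response has the usual shape of the sug endpoint (an errno field plus a data list of k/v pairs; check your own output, since the format is not guaranteed), the suggestions can be read out like this, continuing from obj above:

    # Hedged follow-up: pull the suggestion list out of the sug response,
    # assuming it looks like {'errno': 0, 'data': [{'k': ..., 'v': ...}, ...]}
    for item in obj.get('data', []):
        print(item['k'], '->', item['v'])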

    3. Fetching the full Baidu Translate result

    # Baidu Translate: fetch the full translation data
    import urllib.request
    import urllib.parse
    import json

    # Endpoint that returns the full translation payload
    url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
    # Request headers: for this endpoint the Cookie (copied from the logged-in browser session) is what matters
    headers = {"Cookie": "BIDUPSID=359429789B4E589B318E621011F98A01; PSTM=1642150308; __yjs_duid=1_509dd28c4aec6cb726c25a04881a2a151640083333034; BDUSS=lxa25GVFZQZ0RmYUJHRnp2eERudWJ6eVBiOTE0VmJVVllJdXlKY0QzYkowaDVpRVFBQUFBJCQAAAAAAAAAAAEAAADMN6iOb8rFyKW1xLCuAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMlF92HJRfdhVX; BDUSS_BFESS=lxa25GVFZQZ0RmYUJHRnp2eERudWJ6eVBiOTE0VmJVVllJdXlKY0QzYkowaDVpRVFBQUFBJCQAAAAAAAAAAAEAAADMN6iOb8rFyKW1xLCuAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMlF92HJRfdhVX; REALTIME_TRANS_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; H_WISE_SIDS=219946_234020_131862_216850_213356_214798_219942_213030_110085_243885_244478_244720_240590_245600_248174_247146_256083_254833_256348_256739_254317_257586_255230_257995_258723_258838_258984_258958_230288_256222_259708_258773_234295_234208_257262_259643_255910_254300_260278_256230_260356_260364_253022_255212_258081_260330_260352_251786_260805_260836_259408_259300_259422_259584_260717_261043_261028_261116_258578_261664_261471_261712_261629_261863_262052_262067_259033_262184_262165_262226_262229_261410_262263_260441_259403_236312_262487_262296_262452_261869_262621_262608_262606_262597_249410_259518_259944_262775_262743_262747_262906_263074_256999_263203_262987_262282_253901_263301_263278_243615_261683_261620_259447_263416_245653_263549_257289_8000083_8000126_8000142_8000150_8000156_8000164_8000171_8000177_8000195_8000203; H_WISE_SIDS_BFESS=219946_234020_131862_216850_213356_214798_219942_213030_110085_243885_244478_244720_240590_245600_248174_247146_256083_254833_256348_256739_254317_257586_255230_257995_258723_258838_258984_258958_230288_256222_259708_258773_234295_234208_257262_259643_255910_254300_260278_256230_260356_260364_253022_255212_258081_260330_260352_251786_260805_260836_259408_259300_259422_259584_260717_261043_261028_261116_258578_261664_261471_261712_261629_261863_262052_262067_259033_262184_262165_262226_262229_261410_262263_260441_259403_236312_262487_262296_262452_261869_262621_262608_262606_262597_249410_259518_259944_262775_262743_262747_262906_263074_256999_263203_262987_262282_253901_263301_263278_243615_261683_261620_259447_263416_245653_263549_257289_8000083_8000126_8000142_8000150_8000156_8000164_8000171_8000177_8000195_8000203; MCITY=-53%3A; BAIDUID=4FA510A05410004B33EF51007DA08923:FG=1; BA_HECTOR=01852k8h2704a48h24058g8i1ig851k1p; ZFY=OyVrCDKol7NbNTbKUbw885OfM9tG9YDHAVQiqBjirHg:C; BAIDUID_BFESS=4FA510A05410004B33EF51007DA08923:FG=1; delPer=0; PSINO=1; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[C0sZzZJZb70]=mk3SLVN4HKm; H_PS_PSSID=; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; BCLID=6775486379151272863; BCLID_BFESS=6775486379151272863; BDSFRCVID=S-FOJexroG0ZmSbq3aoeqaaMUuweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLeOTHGCF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; BDSFRCVID_BFESS=S-FOJexroG0ZmSbq3aoeqaaMUuweG7bTDYrEOwXPsp3LGJLVFakFEG0Pts1-dEu-S2OOogKKLeOTHGCF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=tRAOoC_-tDvDqTrP-trf5DCShUFsttLjB2Q-XPoO3KJADfOPKjbHhn_L-fQuLRQf5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPQ9Qgbx5hQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hD89DjKKD6PVKgTa54cbb4o2WbCQL56P8pcN2b5oQT8lhJbab6JKaKTD3RjzQ45beq06-lOUWJDkXpJvQnJjt2JxaqRC3JjOsl5jDh3MKToDb-oteltHB2Oy0hvcBn5cShnjLUjrDRLbXU6BK5vPbNcZ0l8K3l02V-bIe-t2XjQh-p52f6_JtRIf3f; H_BDCLCKID_SF_BFESS=tRAOoC_-tDvDqTrP-trf5DCShUFsttLjB2Q-XPoO3KJADfOPKjbHhn_L-fQuLRQf5mkf3fbgy4op8P3y0bb2DUA1y4vp0toW3eTxoUJ2-KDVeh5Gqq-KXU4ebPRiWPQ9Qgbx5hQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hD89DjKKD6PVKgTa54cbb4o2WbCQL56P8pcN2b5oQT8lhJbab6JKaKTD3RjzQ45beq06-lOUWJDkXpJvQnJjt2JxaqRC3JjOsl5jDh3MKToDb-oteltHB2Oy0hvcBn5cShnjLUjrDRLbXU6BK5vPbNcZ0l8K3l02V-bIe-t2XjQh-p52f6_JtRIf3f; APPGUIDE_10_6_2=1; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1694776408; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1694777058; ab_sr=1.0.1_ZWNmZjBlMjY0OWYyNjA1ZTYxNDRhZTI2NjIyNmJjOTcwZGE5ZjU3OTQ1Yjg3ZDFlMTgyNDM1MDczOTgwMmE4YWIwMGE1NmM5NjliNzAzY2YwYmE1MDkwY2M5YjYzODdiOWY2N2Y1OGRjNmRkODdkOTc5MTVhY2YxNjQxMTA1ZjZlMDNiYjVlMDQxNWNhNzk2OGY0NjM0OGM3YjBiYzc5ODQzZmY1N2IwYTA3MzQ0Njg2ZTYyYWFjY2RkYTNlYTUy"}
    # Form data: copy every field from the Form Data panel of the request in the browser
    data = {
        "from": "en",
        "to": "zh",
        "query": "spider",
        "transtype": "realtime",
        "simple_means_flag": "3",
        "sign": "63766.268839",
        "token": "3dfdea119e17b74fb8fad08c2071a657",
        "domain": "common",
        "ts": "1694777076331"
    }
    data = urllib.parse.urlencode(data).encode('utf-8')
    # POST data cannot be appended to the URL, so it goes into the Request via the data argument
    request = urllib.request.Request(url=url, headers=headers, data=data)
    # Send the request to the server
    response = urllib.request.urlopen(request)
    # Read the response body
    content = response.read().decode("utf-8")
    # Parse the JSON string into a Python object
    obj = json.loads(content)
    print(obj)
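
    Two small notes of my own: json.dumps with ensure_ascii=False prints the Chinese text in the response readably instead of as \uXXXX escapes, and the sign, token and Cookie values above are tied to the author's browser session and to the query word "spider", so for a different word or account they generally have to be captured again from the F12 Network panel.

    # Continuing from the listing above: pretty-print the parsed result
    print(json.dumps(obj, ensure_ascii=False, indent=2))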

    4. Douban GET request: first page

    import urllib.request

    url = 'https://movie.douban.com/chart'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36 Edg/116.0.0.0"}
    # Set a User-Agent to get past basic anti-crawling checks
    request = urllib.request.Request(url=url, headers=headers)
    # Send the request to the server
    response = urllib.request.urlopen(request)
    # Read the response body
    content = response.read().decode("utf-8")
    # Save the page; the with-block closes the file automatically
    with open('douban.json', 'w', encoding='utf-8') as fp:
        fp.write(content)
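
    Note that the /chart page returns HTML even though the example stores it with a .json extension. As an optional addition of mine, the response object can also be checked before trusting the saved file:

    # Optional sanity check on the response
    print(response.getcode())  # HTTP status code, expect 200
    print(response.geturl())   # final URL after any redirects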

    5. Douban GET request: downloading multiple pages

    # URLs of consecutive pages, for reference:
    # https://movie.douban.com/top250?start=0&filter=
    # https://movie.douban.com/top250?start=25&filter=
    # https://movie.douban.com/review/best/?start=0
    # https://movie.douban.com/review/best?start=20
    import urllib.parse
    import urllib.request

    # First function: build the Request for a given page
    def create_request(page):
        # The part of the URL that never changes
        base_url = "https://movie.douban.com/top250?"
        # The query parameters that change from page to page
        data = {
            'start': (page - 1) * 25,  # top250 pages step by 25 entries
            'limit': 25                # entries per page
        }
        # Encode the query parameters
        data = urllib.parse.urlencode(data)
        # Join base_url and the encoded parameters into the full URL
        url = base_url + data
        print(url)
        # Request headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36 Edg/116.0.0.0"}
        # Set a User-Agent to get past basic anti-crawling checks
        request = urllib.request.Request(url=url, headers=headers)
        return request

    # Second function: send the request and return the page source
    def get_content(request):
        response = urllib.request.urlopen(request)
        content = response.read().decode('utf-8')
        return content

    # Third function: save one page to a file
    def down_load(page, content):
        with open('douban_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
            fp.write(content)

    if __name__ == '__main__':
        start_page = int(input('Start page: '))
        end_page = int(input('End page: '))
        # Loop over the requested pages
        for page in range(start_page, end_page + 1):
            request = create_request(page)
            content = get_content(request)
            down_load(page, content)
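
    As a quick check of create_request (my addition), running the script for pages 1 to 3 should print URLs like these, given the (page - 1) * 25 step used above:

    # Expected output of print(url) for pages 1-3:
    # https://movie.douban.com/top250?start=0&limit=25
    # https://movie.douban.com/top250?start=25&limit=25
    # https://movie.douban.com/top250?start=50&limit=25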

    6. KFC store list: downloading multiple pages (same idea as the POST request above, with minor changes)

    # POST request
    # Endpoint and form data as seen in the browser (pageIndex is the field that changes per page):
    # http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname
    # cname: 北京
    # pid:
    # pageIndex: 1
    # pageSize: 10
    import urllib.parse
    import urllib.request

    # First function: build the Request for a given page
    def create_request(page):
        # The URL stays the same; only the POST form data changes
        base_url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname"
        # POST form data
        data = {
            "cname": "北京",
            "pid": '',
            "pageIndex": page,  # the page number
            "pageSize": 10
        }
        # Encode the form data and convert it to UTF-8 bytes
        data = urllib.parse.urlencode(data).encode('utf-8')
        # Request headers
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36 Edg/116.0.0.0"}
        # POST data goes into the data argument of Request, not onto the URL
        request = urllib.request.Request(url=base_url, headers=headers, data=data)
        return request

    # Second function: send the request and return the response body
    def get_content(request):
        response = urllib.request.urlopen(request)
        content = response.read().decode('utf-8')
        return content

    # Third function: save one page to a file
    def down_load(page, content):
        with open('kfc_' + str(page) + '.json', 'w', encoding='utf-8') as fp:
            fp.write(content)

    if __name__ == '__main__':
        start_page = int(input('Start page: '))
        end_page = int(input('End page: '))
        # Loop over the requested pages
        for page in range(start_page, end_page + 1):
            request = create_request(page)
            content = get_content(request)
            down_load(page, content)
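
    A hedged follow-up of mine: on my runs this endpoint returned JSON whose store list sat under a "Table1" key, with fields such as storeName and addressDetail; verify the key names against the files you actually saved, since they are not guaranteed.

    # Read one saved page back and list the stores (key names are assumptions, see above)
    import json

    with open('kfc_1.json', encoding='utf-8') as fp:
        result = json.loads(fp.read())
    for store in result.get('Table1', []):
        print(store.get('storeName'), store.get('addressDetail'))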

     

    7. urllib exception handling

    import urllib.request
    import urllib.error

    # A URL that raises HTTPError (the server answers with an error status):
    # url = "https://blog.csdn.net/leva345/article/details/132907839?spm=1000.2115.3001.6382&utm_medium=distribute.pc_feed_v2.none-task-blog-yuanlijihua_tag_v1-2-132907839-null-null.pc_personrec&depth_1-utm_source=distribute.pc_feed_v2.none-task-blog-yuanlijihua_tag_v1-2-132907839-null-null.pc_personrec"
    # A URL that raises URLError (the host cannot be reached):
    url = "https://www.goudan.com"
    # Request headers
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36 Edg/116.0.0.0"
    }
    # Catch the different failures and report them instead of letting the crawler crash
    try:
        # Build the request
        request = urllib.request.Request(url=url, headers=headers)
        # Send the request
        response = urllib.request.urlopen(request)
        # Read the response body
        content = response.read().decode('utf-8')
        print(content)
    except urllib.error.HTTPError:
        print('System under maintenance')   # the server answered, but with an error status
    except urllib.error.URLError:
        print("I told you, stop clicking")  # the server could not be reached at all
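
    The bare except blocks above throw the error details away. Binding the exception object gives access to the status code (HTTPError) or the underlying reason (URLError); a small variant of the same try/except:

    # Variant that prints the details carried by the exception object
    try:
        response = urllib.request.urlopen(urllib.request.Request(url=url, headers=headers))
    except urllib.error.HTTPError as e:
        print('HTTPError:', e.code, e.reason)  # HTTP status code and reason phrase
    except urllib.error.URLError as e:
        print('URLError:', e.reason)           # usually the underlying socket/DNS error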

     

     

    8. Handlers and proxies: send too many requests from one IP and it gets blocked, so route requests through a proxy

    # handler / opener request
    import urllib.request

    # Target URL
    url = 'https://baike.baidu.com/'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36 Edg/116.0.0.0"
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Instead of calling urllib.request.urlopen(request) directly,
    # build a handler, wrap it in an opener, and open the request through the opener
    handler = urllib.request.HTTPHandler()
    opener = urllib.request.build_opener(handler)
    response = opener.open(request)
    # Read the response body
    content = response.read().decode("utf-8")
    print(content)
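
    The listing above still goes out directly; the proxy mentioned in the section title would be wired in through ProxyHandler instead of HTTPHandler. A minimal sketch of mine, with a made-up proxy address:

    # ProxyHandler variant (the ip:port below is a placeholder, not a working proxy)
    proxies = {
        'http': 'http://118.24.219.151:16817',   # hypothetical proxy
        'https': 'http://118.24.219.151:16817',  # same placeholder for https traffic
    }
    handler = urllib.request.ProxyHandler(proxies=proxies)
    opener = urllib.request.build_opener(handler)
    response = opener.open(request)
    content = response.read().decode('utf-8')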

     

Original article: https://blog.csdn.net/weixin_68266812/article/details/132907346