• [Personal archive] bs4 crawler code


    Only the part that locates elements in the page's front-end markup needs to be changed (a bare-bones version follows).

    from bs4 import BeautifulSoup
    import urllib.request

    def getHtml(url):
        # Fetch the page and decode it as GBK (Dangdang pages use GBK)
        resp = urllib.request.urlopen(url)
        data = resp.read()
        return data.decode("gbk")

    def getOnePage(url):
        nextUrl = ""
        try:
            html = getHtml(url)
            soup = BeautifulSoup(html, "html.parser")
            # Each search result is an <li> inside <div class="con shoplist">
            lis = soup.find("div", attrs={"class": "con shoplist"}).find_all("li")
            for li in lis:
                title = li.find("a")["title"]
                author = li.find("p", attrs={"class": "search_book_author"}).find("a")["title"]
                time = li.find("p", attrs={"class": "search_book_author"}).find_all("span")[1].text
                publisher = li.find("p", attrs={"class": "search_book_author"}).find_all("span")[2].text
                # fields: author, publisher, pubdate, brief, price
                print("书名:", title)
                print("作者:", author)
                print("出版社:", publisher)
                print("出版时间:", time)
                # the brief and the price could be printed here as well
                print(" ")
        except Exception as err:
            print(err)
        return nextUrl

    url = "http://search.dangdang.com/?key=python&act=input&page_index=1"
    nextUrl = getOnePage(url)
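
    getOnePage returns nextUrl but never fills it in; which result page is fetched is controlled by the page_index query parameter in the URL. Below is a minimal sketch of paging through several result pages by formatting that parameter; the helper name crawlPages and the page count of 3 are illustrative assumptions, not part of the original code.

    # Hedged sketch: iterate page_index in the Dangdang search URL.
    # crawlPages and the default of 3 pages are assumptions for illustration.
    def crawlPages(keyword, pages=3):
        for i in range(1, pages + 1):
            url = ("http://search.dangdang.com/?key=" + keyword
                   + "&act=input&page_index=" + str(i))
            print("crawling page", i)
            getOnePage(url)

    # crawlPages("python")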

    Taking Douban Top 250 as an example

    from bs4 import BeautifulSoup
    import urllib.request
    import urllib.error
    import time
    import pandas as pd   # imported in the original but not used below

    '''
    Scrape the required information from a single movie page
    '''

    '''Check whether the page can be fetched'''
    def getHtml(url):
        proxy_ip = '127.0.0.1:8888'
        try:
            req = urllib.request.Request(url)
            # Request header added to get past the site's anti-crawler checks
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0')
            # Route HTTP traffic through a local proxy
            proxy = urllib.request.ProxyHandler({'http': proxy_ip})
            # build_opener returns an OpenerDirector that chains the given handlers in order
            opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
            # Install the OpenerDirector instance as the default global opener
            urllib.request.install_opener(opener)
            # Open the URL (a string or a Request object) and decode the response
            data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
            return data
        except urllib.error.URLError as e:
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
            time.sleep(10)  # back off for 10 s on a URL error
        except Exception as e:
            print('exception:' + str(e))
            time.sleep(1)

    '''Get the content of one movie page: title + synopsis'''
    def getOnePage(url):
        # Initialised so the function still returns something when scraping fails
        movie_title = ""
        movie_content = ""
        try:
            # Fetch the raw HTML
            html = getHtml(url)
            # Parse it with BeautifulSoup
            soup = BeautifulSoup(html, "lxml")
            '''
            1. Scrape the synopsis.
            The HTML is nested as: related-info -> indent -> all hidden.
            Error handling: some synopses are long and collapsed behind an "expand" link,
            a. first try the hidden full text,
            b. if that is empty, fall back to the summary shown on the page.
            '''
            # The selector chain is long, so grab the container first
            movie_content_pre = soup.find("div", attrs={"class": "related-info"}).find("div", attrs={"class": "indent"})
            # Fallback handling for the synopsis
            try:
                movie_content = movie_content_pre.find("span", attrs={"class": "all hidden"}).text
            except Exception:
                movie_content = movie_content_pre.find("span", attrs={"property": "v:summary"}).text
            print("all", movie_content)
            movie_content = movie_content.strip()
            movie_content = movie_content.replace('\u3000', '').replace('\r', '')
            movie_content = movie_content.replace("\n", "").replace(" ", "")
            print(" 1.movie content done")
            '''
            2. Scrape the title (and optionally the genres).
            Both live under the same container, <div id="content">.
            Variable: information -> holds that container.
            '''
            '''
            information = soup.find("div", attrs={"id": "content"})
            # A movie carries several genre tags; movie_type_all collects them all
            movie_type_all = information.find("div", attrs={"id": "info"}).find_all("span", attrs={"property": "v:genre"})
            # List holding the genre strings
            movie_type = []
            # Walk the tags and extract each genre
            for i in range(len(movie_type_all)):
                types = movie_type_all[i].text
                movie_type.append(types)
            # Convert to str so it can be written to a file
            movie_type = str(movie_type)
            print(" 2.movie type done")
            '''
            # Find the movie title
            movie_title = soup.find("div", attrs={"id": "content"}).find("span", attrs={"property": "v:itemreviewed"}).text
            print(" 2.movie title done")
            # print("movie title\n", movie_title)
            # print("movie genres\n", movie_type)
            # print("movie synopsis\n", movie_content)
        # Catch and report any scraping error
        except Exception as err:
            print(err)
        # Return the scraped data
        return movie_title, movie_content

    # Check that the content of a single movie page is scraped correctly
    url = "https://movie.douban.com/subject/1292001/"
    data = getOnePage(url)
    with open("check.txt", "w") as f:
        f.write(str(data))
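
    The file imports pandas but never uses it. One plausible use, sketched here as an assumption rather than as the original author's code, is to collect the (title, synopsis) pairs into a DataFrame and write them to CSV instead of the plain-text check file; the record layout and the movies.csv path are illustrative.

    # Hedged sketch: collect scraped pairs with pandas (layout and path are assumptions).
    import pandas as pd

    records = []
    for link in ["https://movie.douban.com/subject/1292001/"]:
        title, content = getOnePage(link)
        records.append({"title": title, "content": content})

    df = pd.DataFrame(records)
    df.to_csv("movies.csv", index=False, encoding="utf-8")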

    # Ranking crawl: follow the ranking to the next pages
    '''Build the links for every page of the ranking'''
    from one_page_film import a_page_to_all_links

    def get_All_Pages_Links():
        try:
            # Reference: https://zhuanlan.zhihu.com/p/62601606
            # Build the page URLs from a template
            url_style = "https://movie.douban.com/top250?start={index}&filter="
            url_lst = []
            # start offsets 0, 25, ..., 225 in steps of 25 (10 pages)
            for i in range(0, 250, 25):
                url = url_style.format(index=i)
                url_lst.append(url)
        except Exception as err:
            print(err)
        return url_lst

    # Links of all 10 ranking pages
    url_list = get_All_Pages_Links()
    # Nested crawl: from each page link, fetch every movie link on that page
    for i in url_list:
        # flag
        print("No: pages", i)
        a_page_to_all_links(i)
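
    Fetching ten list pages and their 250 detail pages back to back can easily trip Douban's rate limiting, and the code above only sleeps after an error. Below is a small sketch of the same driver loop with a fixed pause between list pages; the 3-second delay is an assumption, not part of the original.

    # Hedged sketch: pause between list pages (the delay value is an assumption).
    import time

    for i in url_list:
        print("No: pages", i)
        a_page_to_all_links(i)
        time.sleep(3)  # wait a few seconds before the next list page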
    # On one ranking page, scrape the link of every movie
    from bs4 import BeautifulSoup
    # Functions from the single-movie spider module
    from one_page_content import getHtml, getOnePage

    '''Scrape all movie links on one ranking page'''
    def getPageLink(url):
        try:
            # Fetch the raw HTML
            html = getHtml(url)
            # Parse it with BeautifulSoup
            soup = BeautifulSoup(html, "lxml")
            # Each movie sits in an <li> inside <ol class="grid_view">
            all_links = soup.find("ol", attrs={"class": "grid_view"}).find_all("li")
            # movie_link_list holds the detail-page link of every movie
            movie_link_list = []
            # Walk the <li> tags and pull out each movie's link
            for i in range(len(all_links)):
                link = all_links[i].find("div", attrs={"class": "pic"}).find("a")['href']
                movie_link_list.append(link)
            # print("links stored in the list:\n", movie_link_list)
        except Exception as err:
            print(err)
        return movie_link_list

    def a_page_to_all_links(url):
        # one_pages_content would hold everything scraped from one page
        one_pages_content = []
        # All movie links of one page
        one_page_link = getPageLink(url)
        # flag
        epoch = 0
        for i in range(len(one_page_link)):
            epoch = epoch + 1
            print(" epoch:", epoch)
            # Scrape one movie's title and synopsis
            one_movie_content = getOnePage(one_page_link[i])
            # print(one_movie_content)
            print("name", one_movie_content[0])
            print("content", one_movie_content[1])
            # Append to the corpus file
            with open("data/all_content_corpus.txt", 'a', encoding="utf-8") as f:
                f.write(one_movie_content[0])
                f.write(",")
                f.write(one_movie_content[1])
                f.write("\n")

    # url = "https://movie.douban.com/top250?start=0&filter="
    # a_page_to_all_links(url)
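
    The corpus file ends up with one movie per line in the form title,content. Below is a minimal sketch of reading it back, assuming the title itself contains no comma so that splitting on the first comma separates the two fields; the variable names are illustrative.

    # Hedged sketch: read the corpus back (assumes no comma inside the title).
    corpus = []
    with open("data/all_content_corpus.txt", encoding="utf-8") as f:
        for line in f:
            if "," not in line:
                continue  # skip malformed lines
            name, content = line.rstrip("\n").split(",", 1)
            corpus.append((name, content))
    print(len(corpus), "movies loaded")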

  • Original source: https://blog.csdn.net/dannnnnnnnnnnn/article/details/126367927