You only need to change the part of the code that locates elements in the page markup (a bare-bones version is shown below).
```python
from bs4 import BeautifulSoup
import urllib.request


def getHtml(url):
    resp = urllib.request.urlopen(url)
    data = resp.read()
    # Dangdang pages are GBK-encoded
    return data.decode("gbk")


def getOnePage(url):
    nextUrl = ""
    try:
        html = getHtml(url)
        soup = BeautifulSoup(html, "html.parser")
        # Every search result sits in an <li> under the "con shoplist" div
        lis = soup.find("div", attrs={"class": "con shoplist"}).find_all("li")

        for li in lis:
            title = li.find("a")["title"]
            author = li.find("p", attrs={"class": "search_book_author"}).find("a")["title"]
            time = li.find("p", attrs={"class": "search_book_author"}).find_all("span")[1].text
            publisher = li.find("p", attrs={"class": "search_book_author"}).find_all("span")[2].text

            # author / publisher / pubdate (description and price could be added the same way)
            print("Title:", title)
            print("Author:", author)
            print("Publisher:", publisher)
            print("Publication date:", time)
            # print("Description")
            # print("Price")
            print(" ")

    except Exception as err:
        print(err)
    return nextUrl


url = "http://search.dangdang.com/?key=python&act=input&page_index=1"
nextUrl = getOnePage(url)
```
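The search URL above already carries a `page_index` query parameter, so paging through results only requires formatting that parameter; the `nextUrl` return value is left empty in this bare-bones version. A minimal sketch along those lines (the page count of 3 is arbitrary, and it assumes every result page keeps the same `con shoplist` layout):

```python
# Hypothetical sketch: iterate Dangdang result pages via the page_index parameter.
base = "http://search.dangdang.com/?key=python&act=input&page_index={}"

for page in range(1, 4):           # first three result pages as an illustration
    getOnePage(base.format(page))  # reuses getHtml/getOnePage defined above
```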
Using the Douban Top 250 as an example. Judging from the imports below, the code is split into three parts: a module that fetches and parses a single movie page (`one_page_content`: `getHtml` + `getOnePage`), a module that collects and scrapes the movie links on one ranking page (`one_page_film`: `getPageLink` + `a_page_to_all_links`), and a driver that builds the URLs of all ten ranking pages and walks through them.
```python
from bs4 import BeautifulSoup
import urllib.request
import time
import pandas as pd

'''
Scrape the information needed for a single movie.
'''


'''Check whether the page can be fetched.'''
def getHtml(url):
    proxy_ip = '127.0.0.1:8888'
    try:
        req = urllib.request.Request(url)
        # Request header added to get past the site's anti-crawler checks
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0')

        # ProxyHandler: route the request through a local proxy
        proxy = urllib.request.ProxyHandler({'http': proxy_ip})
        # build_opener returns an OpenerDirector that chains the given handlers;
        # a handler can be an instance of BaseHandler or one of its subclasses
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # Install the OpenerDirector instance as the default global opener
        urllib.request.install_opener(opener)
        # Open the URL (a string or a Request object) and decode the response
        data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')

        return data
    # A more specific handler that was tried and then commented out:
    # except urllib.error.URLError as e:
    #     if hasattr(e, 'code'):
    #         print(e.code)
    #     if hasattr(e, 'reason'):
    #         print(e.reason)
    #     time.sleep(10)  # back off for 10 s on an error
    except Exception as e:
        print('exception:' + str(e))
        time.sleep(1)


'''Get the content of one movie page (title + synopsis).'''
def getOnePage(url):
    # Initialize the return values so a failed fetch does not raise NameError below
    movie_title = ""
    movie_content = ""
    try:
        # Fetch the page
        html = getHtml(url)

        # Parse it with BeautifulSoup
        soup = BeautifulSoup(html, "lxml")

        '''
        1. Scrape the synopsis.
        The HTML is nested as: related-info -> indent -> "all hidden".
        Error handling: some synopses are long and collapsed behind an
        "expand" link, so
        a. first try the hidden (full) text,
        b. if it is missing, fall back to the text that is visible on the page.
        '''
        # The selector chain is long, so locate the container in two steps
        movie_content_pre = soup.find("div", attrs={"class": "related-info"}).find("div", attrs={"class": "indent"})

        # Fallback handling for the synopsis
        try:
            movie_content = movie_content_pre.find("span", attrs={"class": "all hidden"}).text
        except Exception:
            movie_content = movie_content_pre.find("span", attrs={"property": "v:summary"}).text
            print("all", movie_content)

        movie_content = movie_content.strip()
        movie_content = movie_content.replace('\u3000', '').replace('\r', '')
        movie_content = movie_content.replace("\n", "").replace(" ", "")
        print(" 1.movie content done")

        '''
        2. Scrape the genres (commented out).
        Title and genres live under the same "content" div;
        the variable `information` holds that div.
        '''
        '''
        information = soup.find("div", attrs={"id": "content"})
        # A movie carries several genre tags; movie_type_all holds all of them
        movie_type_all = information.find("div", attrs={"id": "info"}).find_all("span", attrs={"property": "v:genre"})
        # List that collects the genre names
        movie_type = []
        # Walk over the tags one by one
        for i in range(len(movie_type_all)):
            types = movie_type_all[i].text
            movie_type.append(types)

        # Convert to str so it can be written to a file
        movie_type = str(movie_type)

        print(" 2.movie type done")
        '''

        # Movie title
        movie_title = soup.find("div", attrs={"id": "content"}).find("span", attrs={"property": "v:itemreviewed"}).text
        print(" 2.movie title done")

        # print("Movie title\n", movie_title)
        # print("Movie genres\n", movie_type)
        # print("Movie synopsis\n", movie_content)

    # Error handling
    except Exception as err:
        print(err)

    # Return the data
    return movie_title, movie_content


# Check that a single movie page is scraped successfully
url = "https://movie.douban.com/subject/1292001/"
data = getOnePage(url)
with open("check.txt", "w", encoding="utf-8") as f:
    f.write(str(data))
```
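Note that `getHtml` above routes every request through a local HTTP proxy at `127.0.0.1:8888` (typically a debugging proxy such as Fiddler listening on that port); if nothing is listening there, every fetch fails with a connection error. A minimal variant without the proxy, keeping only the User-Agent header, might look like this (the function name is my own, not part of the original code):

```python
import urllib.request

def getHtmlNoProxy(url):
    # Same request as getHtml, but without the local ProxyHandler.
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0')
    return urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
```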
```python
# Walk the ranking: build the URL of the next page and crawl it (page hopping)
'''Build the links to all ranking pages.'''
from one_page_film import a_page_to_all_links

def get_All_Pages_Links():
    url_lst = []
    try:
        # Reference: https://zhuanlan.zhihu.com/p/62601606
        # Build the page URLs from a template
        url_style = "https://movie.douban.com/top250?start={index}&filter="
        # start runs 0, 25, ..., 225 in steps of 25
        for i in range(0, 250, 25):
            url = url_style.format(index=i)
            url_lst.append(url)

    except Exception as err:
        print(err)
    return url_lst

# Links to the ten ranking pages
url_list = get_All_Pages_Links()
# Nested loops: from each page link, collect that page's movie links
for i in url_list:
    # flag
    print("No: pages", i)
    a_page_to_all_links(i)
```
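Formatted this way, `start` takes the values 0, 25, ..., 225, which yields exactly ten page URLs: from `https://movie.douban.com/top250?start=0&filter=` for ranks 1 to 25 up to `https://movie.douban.com/top250?start=225&filter=` for ranks 226 to 250.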
```python
# Collect the link of every movie on one ranking page
from bs4 import BeautifulSoup
# Functions from the single-page spider module
from one_page_content import getHtml, getOnePage

'''Scrape all movie links on one ranking page.'''
def getPageLink(url):
    # List that stores the individual movie links
    movie_link_list = []
    try:
        # Fetch the page
        html = getHtml(url)
        # Parse it with BeautifulSoup
        soup = BeautifulSoup(html, "lxml")

        # Each movie sits in its own li tag
        all_links = soup.find("ol", attrs={"class": "grid_view"}).find_all("li")

        # Walk over the li tags and collect each movie's link
        for i in range(len(all_links)):
            link = all_links[i].find("div", attrs={"class": "pic"}).find("a")['href']
            movie_link_list.append(link)

        # print("Links stored in the list:\n", movie_link_list)

    except Exception as err:
        print(err)

    return movie_link_list


def a_page_to_all_links(url):
    # All movie links on this page
    one_page_link = getPageLink(url)
    # flag
    epoch = 0
    for i in range(len(one_page_link)):
        epoch = epoch + 1
        print(" epoch:", epoch)

        # Scrape one movie's (title, synopsis)
        one_movie_content = getOnePage(one_page_link[i])

        # print(one_movie_content)
        print("name", one_movie_content[0])
        print("content", one_movie_content[1])

        # Append to the corpus file
        with open("data/all_content_corpus.txt", 'a', encoding="utf-8") as f:
            f.write(one_movie_content[0])
            f.write(",")
            f.write(one_movie_content[1])
            f.write("\n")


# url = "https://movie.douban.com/top250?start=0&filter="
# a_page_to_all_links(url)
```
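The corpus file written above stores one movie per line as `title,synopsis`. Since `pandas` is imported in `one_page_content` but never used, a possible follow-up step is to load that file into a DataFrame; because the synopsis itself may contain commas, the sketch below splits only on the first comma instead of calling `read_csv` directly (this loader is my own assumption, not part of the original code):

```python
import pandas as pd

# Hypothetical loader for data/all_content_corpus.txt (one "title,synopsis" per line).
rows = []
with open("data/all_content_corpus.txt", encoding="utf-8") as f:
    for line in f:
        line = line.rstrip("\n")
        if not line:
            continue
        # Split on the first comma only, since the synopsis may contain commas itself.
        title, _, content = line.partition(",")
        rows.append({"title": title, "content": content})

df = pd.DataFrame(rows)
print(df.head())
```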