Scraping a well-known movie site with three different parsers: BeautifulSoup, pyQuery, and XPath (lxml).
The goal is mainly to get a feel for how these three approaches differ when scraping the same page.
I can't articulate the differences in any rigorous way; this is simply a hands-on comparison.
All of the code below was written by me.
As shown in the figure, there are four spots on each movie card to scrape: movie title -> categories -> release date -> score.
Clicking a movie title also leads to that movie's detail page, shown in the figure below, from which the synopsis text is collected as well.
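All three versions below start the same way: fetch the listing page with requests and hand the HTML to a parser. A small shared helper could factor out that step (a minimal sketch; `fetch` and `BASE_URL` are names introduced here for illustration, the functions below simply inline this):

```python
import requests

BASE_URL = 'https://ssr1.scrape.center'  # the movie site scraped throughout this post

def fetch(url):
    """Download a page and return its HTML text, failing loudly on HTTP errors."""
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.text
```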
1. BeautifulSoup

```python
def save_demo_bs4():
    """
    Scrape each movie's title, categories, score, release date and synopsis
    from a single listing page using BeautifulSoup.
    """
    url = 'https://ssr1.scrape.center'
    import re
    import requests
    from bs4 import BeautifulSoup
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'lxml')
    results = []
    ret = soup.find_all(class_=re.compile("el-card item m-t is-hover-shadow"))  # every movie card on the page
    for ret_ in ret:
        dict_ = {}
        title_ = ret_.find_all(class_=re.compile('name'))
        xiangqing_url = url + title_[0].attrs['href']  # detail page URL for this movie
        title_ = [i.text.replace('\n', "") for i in title_][0]
        categories_ = ret_.find_all(class_=re.compile('categories'))
        sy_date_ = ret_.find_all(class_=re.compile('m-v-sm info'))
        pingfen_ = ret_.find_all(class_=re.compile('score m-t-md m-b-n-sm'))
        pingfen_ = [i.string.replace('\n', '').replace(' ', '') for i in pingfen_][0]
        dict_['标题'] = title_
        categories_ = [i.text.replace('\n', ' ') for i in categories_]
        dict_['类别'] = categories_
        sy_date_ = [i.text.replace('\n', '') for i in sy_date_][1]  # the second info row holds the release date
        dict_['上映时间'] = sy_date_
        dict_['评分'] = float(pingfen_)
        xiangqin_html = requests.get(xiangqing_url).text
        soup_xiangqin = BeautifulSoup(xiangqin_html, 'lxml')
        ret__ = soup_xiangqin.find_all(class_=re.compile("drama"))  # the synopsis block on the detail page
        for x in ret__:
            for idx, y in enumerate(x.children):
                if idx == 2:  # the third child node is the <p> with the synopsis text
                    xiangqing = y.text.replace('\n', '').replace(' ', '')
                    dict_['电影详情'] = xiangqing
        results.append(dict_)
    return results
```
A screenshot of the run is shown below:
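Since `save_demo_bs4()` returns the list of dicts, persisting a run is straightforward with the standard json module (`save_json` is just an illustrative helper, not part of the original code):

```python
import json

def save_json(results, path='movies.json'):
    """Dump the scraped records to a UTF-8 JSON file, keeping Chinese text readable."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

save_json(save_demo_bs4())
```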
2. XPath
```python
def save_demo_xpath():
    """
    Scrape each movie's title, categories, score, release date and synopsis
    from a single listing page using lxml's XPath.
    """
    url = 'https://ssr1.scrape.center'
    import requests
    from lxml import etree
    html = requests.get(url).text
    html = etree.HTML(html)
    results = []
    for i in range(10):  # the listing page shows 10 movie cards
        dict_ = {}
        title_xxpath = '//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[2]/a/h2'.format(page=i + 1)
        title = html.xpath(title_xxpath)
        title_ = [i.text for i in title][0]
        category_xxpath = '//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[2]/div[1]'.format(page=i + 1)
        category_ = html.xpath(category_xxpath)
        category_ = [i.xpath('./button/span/text()') for i in category_][0]
        sy_date_xxpath = '//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[2]/div[2]/span[3]'.format(page=i + 1)
        sy_date_ = html.xpath(sy_date_xxpath)
        sy_date_ = [i.text for i in sy_date_][0]
        pingfen_xxpath = '//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[3]/p[1]'.format(page=i + 1)
        pingfen_ = html.xpath(pingfen_xxpath)
        pingfen_ = [i.text.replace("\n", "").replace(" ", "") for i in pingfen_][0]
        dict_['类别'] = category_
        dict_['标题'] = title_
        dict_['上映时间'] = sy_date_
        dict_['评分'] = pingfen_
        # Follow the card's <a href> to the detail page and pull the synopsis paragraph.
        xiangqing_url_ = html.xpath('//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[2]'.format(page=i + 1))
        xiangqing_url_ = url + [i.xpath('./a/@href') for i in xiangqing_url_][0][0]
        html_xiangqin = requests.get(xiangqing_url_).text
        html_xiangqin = etree.HTML(html_xiangqin)
        xiangqing = html_xiangqin.xpath('//*[@id="detail"]/div[1]/div/div/div[1]/div/div[2]/div[4]/p')
        dict_['电影详情'] = [i.text.replace('\n', '').replace(' ', '') for i in xiangqing][0]
        results.append(dict_)
        print(dict_)
    print(results)
```
Output screenshot:
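One caveat about the XPath version: the expressions are absolute paths copied from the browser's dev tools, so any layout change in the page breaks them. The same extraction can instead be anchored on class names, much like the BeautifulSoup and pyQuery versions. A rough sketch of that idea (untested against the live page):

```python
import requests
from lxml import etree

html = etree.HTML(requests.get('https://ssr1.scrape.center').text)
# Select each movie card by class rather than by its position in the DOM tree.
for card in html.xpath('//div[contains(@class, "el-card")]'):
    title = card.xpath('.//a[contains(@class, "name")]/h2/text()')
    score = card.xpath('.//p[contains(@class, "score")]/text()')
    print(title[0].strip() if title else None,
          score[0].strip() if score else None)
```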
3. pyQuery
```python
def save_demo_pq():
    """
    Scrape each movie's title, categories, score, release date and synopsis
    from a single listing page using pyQuery.
    """
    url = 'https://ssr1.scrape.center'
    from pyquery import PyQuery as pq
    import requests

    html = requests.get(url).text
    doc = pq(html)
    items = doc('.el-card .el-card__body .el-row').items()  # one .el-row per movie card
    results = []
    for item in items:
        dict_ = {}
        title = item.find('a > h2').text()
        categories = item.find('.categories .el-button span').items()
        cate_list = [i.text() for i in categories]
        sy_date = item.find('.m-v-sm.info')
        sy_date = [i.text() for i in sy_date('.m-v-sm span').items()][-1]  # the last span holds the release date
        pingfen = item.find('.el-col .score').items()
        pingfen = [i.text() for i in pingfen][0]
        dict_['评分'] = float(pingfen)
        dict_['类别'] = cate_list
        dict_['标题'] = title
        dict_['上映时间'] = str(sy_date).replace("上映", "").replace(" ", "")
        # Follow the card's link to the detail page and grab the synopsis paragraph.
        xiangqing_url_ = url + item.find('a').attr('href')
        html_xiangqing = requests.get(xiangqing_url_).text
        doc_xiangqing = pq(html_xiangqing)
        xiangqing = [i.find('p').text() for i in doc_xiangqing.find('.drama').items()][0]
        dict_['电影详情'] = xiangqing
        results.append(dict_)
        print(dict_)
    print(results)
```
Output screenshot:
Overall, I find pyQuery the most comfortable of the three to work with; the side-by-side sketch below gives a feel for the difference.
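Here is roughly how the same title extraction reads in each of the three styles (a sketch built from the selectors used above, assuming the title link carries the class `name`):

```python
import re
import requests
from bs4 import BeautifulSoup
from lxml import etree
from pyquery import PyQuery as pq

page = requests.get('https://ssr1.scrape.center').text

# BeautifulSoup: find by class, then walk down to the <h2>
bs_title = BeautifulSoup(page, 'lxml').find('a', class_=re.compile('name')).h2.text.strip()

# lxml / XPath: selection and text extraction in one expression
xp_title = etree.HTML(page).xpath('//a[contains(@class, "name")]/h2/text()')[0].strip()

# pyQuery: CSS selector plus a jQuery-style API
pq_title = pq(page)('a.name h2').eq(0).text()

print(bs_title, xp_title, pq_title)
```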
That's all.