• Scraping a movie site's detail pages with three approaches: BeautifulSoup, pyQuery and XPath


    This post uses three parsing libraries, BeautifulSoup, pyQuery and XPath (via lxml), to scrape the same well-known movie demo site.

    The main goal is to get a feel for how the three approaches differ when crawling one and the same page.

    I can't pin the differences down precisely; this is simply a hands-on comparison.

    All the code below was written by me.

    As shown in the figure, four fields are scraped from each entry on the listing page: movie title -> categories -> release date -> rating.

    Clicking a movie title leads to that movie's detail page, shown in the figure below, which is scraped as well.
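
    All three versions start the same way: download the listing-page HTML with requests and hand it to a parser. A minimal fetch helper along the lines below could be shared by the three demos (the helper name, the timeout and the error handling are my additions for illustration, not part of the original code):

    import requests

    BASE_URL = 'https://ssr1.scrape.center'  # the demo movie site scraped below

    def fetch_html(url, timeout=10):
        """Hypothetical shared helper: download a page and return its HTML text."""
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
        return resp.text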

     

    1. BeautifulSoup
    def save_demo_bs4():
        """
        Scrape each movie's title, categories, rating and release date from one
        listing page with BeautifulSoup, plus the synopsis from its detail page.
        """
        url = 'https://ssr1.scrape.center'
        import re
        import requests  # missing in the original snippet but required below
        from bs4 import BeautifulSoup
        html = requests.get(url).text
        soup = BeautifulSoup(html, 'lxml')
        results = []
        # every movie card on the listing page carries this class string
        ret = soup.find_all(class_=re.compile("el-card item m-t is-hover-shadow"))
        for ret_ in ret:
            dict_ = {}
            # the <a class="name"> link holds both the title and the detail-page href
            title_ = ret_.find_all(class_=re.compile('name'))
            xiangqing_url = url + title_[0].attrs['href']  # detail page of this movie
            title_ = [i.text.replace('\n', '') for i in title_][0]
            categories_ = ret_.find_all(class_=re.compile('categories'))
            sy_date_ = ret_.find_all(class_=re.compile('m-v-sm info'))
            pingfen_ = ret_.find_all(class_=re.compile('score m-t-md m-b-n-sm'))
            pingfen_ = [i.string.replace('\n', '').replace(' ', '') for i in pingfen_][0]
            dict_['标题'] = title_  # title
            categories_ = [i.text.replace('\n', ' ') for i in categories_]
            dict_['类别'] = categories_  # categories
            sy_date_ = [i.text.replace('\n', '') for i in sy_date_][1]
            dict_['上映时间'] = sy_date_  # release date
            dict_['评分'] = float(pingfen_)  # rating
            # fetch the detail page and pull out the synopsis paragraph
            xiangqin_html = requests.get(xiangqing_url).text
            soup_xiangqin = BeautifulSoup(xiangqin_html, 'lxml')
            ret__ = soup_xiangqin.find_all(class_=re.compile("drama"))
            for x in ret__:
                for idx, y in enumerate(x.children):
                    if idx == 2:
                        xiangqing = y.text.replace('\n', '').replace(' ', '')
                        dict_['电影详情'] = xiangqing  # synopsis
            results.append(dict_)
        return results

    A screenshot of the code's output is shown below.
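
    As a side note, BeautifulSoup also supports CSS selectors through soup.select(), which can replace the re.compile class matching above. A rough sketch, reusing the class names from the code above (the exact selectors are my assumption about the page structure, not taken from the original post):

    import requests
    from bs4 import BeautifulSoup

    html = requests.get('https://ssr1.scrape.center').text
    soup = BeautifulSoup(html, 'lxml')
    for card in soup.select('.el-card'):           # one card per movie
        name_link = card.select_one('a.name')      # link holding the title and the detail href
        title = name_link.h2.get_text(strip=True)
        score = card.select_one('.score').get_text(strip=True)
        print(title, score, name_link['href'])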

    2. XPath

    def save_demo_xpath():
        """
        Scrape each movie's title, categories, rating and release date from one
        listing page with lxml XPath, plus the synopsis from its detail page.
        """
        url = 'https://ssr1.scrape.center'
        import requests
        from lxml import etree
        html = requests.get(url).text
        html = etree.HTML(html)
        results = []
        for i in range(10):  # the listing page shows 10 movie cards
            dict_ = {}
            title_xxpath = '//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[2]/a/h2'.format(page=i + 1)
            title = html.xpath(title_xxpath)
            title_ = [t.text for t in title][0]
            category_xxpath = '//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[2]/div[1]'.format(page=i + 1)
            category_ = html.xpath(category_xxpath)
            category_ = [c.xpath('./button/span/text()') for c in category_][0]
            sy_date_xxpath = '//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[2]/div[2]/span[3]'.format(page=i + 1)
            sy_date_ = html.xpath(sy_date_xxpath)
            sy_date_ = [s.text for s in sy_date_][0]
            pingfen_xxpath = '//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[3]/p[1]'.format(page=i + 1)
            pingfen_ = html.xpath(pingfen_xxpath)
            pingfen_ = [p.text.replace("\n", "").replace(" ", "") for p in pingfen_][0]
            dict_['类别'] = category_  # categories
            dict_['标题'] = title_  # title
            dict_['上映时间'] = sy_date_  # release date
            dict_['评分'] = pingfen_  # rating
            # follow the card's <a href> to the detail page and grab the synopsis
            xiangqing_url_ = html.xpath('//*[@id="index"]/div[1]/div[1]/div[{page}]/div/div/div[2]'.format(page=i + 1))
            xiangqing_url_ = url + [a.xpath('./a/@href') for a in xiangqing_url_][0][0]
            html_xiangqin = requests.get(xiangqing_url_).text
            html_xiangqin = etree.HTML(html_xiangqin)
            xiangqing = html_xiangqin.xpath('//*[@id="detail"]/div[1]/div/div/div[1]/div/div[2]/div[4]/p')
            dict_['电影详情'] = [x.text.replace('\n', '').replace(' ', '') for x in xiangqing][0]  # synopsis
            results.append(dict_)
            print(dict_)
        print(results)

    A screenshot of the code's output is shown below.
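
    The absolute //*[@id="index"]/div[1]/... paths above are brittle: they break as soon as the page layout shifts. A less fragile variant selects each card first and then uses relative XPath expressions built on the class names the BeautifulSoup version already relies on. A rough sketch (the contains()-based selectors are my assumption, not verified against every page of the site):

    import requests
    from lxml import etree

    html = etree.HTML(requests.get('https://ssr1.scrape.center').text)
    # "is-hover-shadow" is part of the movie-card class string used in the BS4 version
    for card in html.xpath('//div[contains(@class, "is-hover-shadow")]'):
        title = card.xpath('.//a[contains(@class, "name")]/h2/text()')
        href = card.xpath('.//a[contains(@class, "name")]/@href')
        score = card.xpath('.//p[contains(@class, "score")]/text()')
        if title and score:  # skip anything that is not a movie card
            print(title[0].strip(), score[0].strip(), href[0] if href else '')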

     

    3. pyQuery

    def save_demo_pq():
        """
        Scrape each movie's title, categories, rating and release date from one
        listing page with pyQuery, plus the synopsis from its detail page.
        """
        url = 'https://ssr1.scrape.center'
        from pyquery import PyQuery as pq
        import requests
        html = requests.get(url).text
        doc = pq(html)
        # each .el-row inside a card body is one movie entry
        items = doc('.el-card .el-card__body .el-row').items()
        results = []
        for item in items:
            dict_ = {}
            title = item.find('a > h2').text()
            categories = item.find('.categories .el-button span').items()
            cate_list = [c.text() for c in categories]
            sy_date = item.find('.m-v-sm.info')
            sy_date = [s.text() for s in sy_date('.m-v-sm span').items()][-1]
            pingfen = item.find('.el-col .score').items()
            pingfen = [p.text() for p in pingfen][0]
            dict_['评分'] = float(pingfen)  # rating
            dict_['类别'] = cate_list  # categories
            dict_['标题'] = title  # title
            # release date, with the trailing "上映" marker stripped off
            dict_['上映时间'] = str(sy_date).replace("上映", "").replace(" ", "")
            # follow the card's link to the detail page and grab the synopsis
            xiangqing_url_ = url + item.find('a').attr('href')
            html_xiangqing = requests.get(xiangqing_url_).text
            doc_xiangqing = pq(html_xiangqing)
            xiangqing = [d.find('p').text() for d in doc_xiangqing.find('.drama').items()][0]
            dict_['电影详情'] = xiangqing  # synopsis
            results.append(dict_)
            print(dict_)
        print(results)

    A screenshot of the code's output is shown below.
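
    One small pyQuery convenience worth mentioning: instead of concatenating url + href by hand, the document-level PyQuery object can rewrite relative links with make_links_absolute(). A brief sketch on the same listing page (the title/href printing is only my illustration):

    import requests
    from pyquery import PyQuery as pq

    url = 'https://ssr1.scrape.center'
    doc = pq(requests.get(url).text)
    doc.make_links_absolute(base_url=url)  # rewrite relative hrefs in place
    for item in doc('.el-card .el-card__body .el-row').items():
        print(item.find('a > h2').text(), item.find('a').attr('href'))  # href is now absolute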

    Overall, I find pyQuery the most comfortable of the three to work with.

    That's all.

  • Original post: https://blog.csdn.net/linxizi0622/article/details/132779377