BeautifulSoup is a Python library for parsing HTML and XML documents. It provides a flexible and convenient way to navigate, search, and modify the parse tree. BeautifulSoup greatly simplifies web scraping: developers can easily parse page content and extract the data they need.
The first step is installing it, which can be done with pip:
pip install beautifulsoup4
You can then import the BeautifulSoup class and create a BeautifulSoup object to parse an HTML document:
from bs4 import BeautifulSoup

html_doc = """
<html>
<head><title>Test Page</title></head>
<body>
<p class="title"><b>Title</b></p>
<p class="story">This is a story.</p>
</body>
</html>
"""

soup = BeautifulSoup(html_doc, 'html.parser')
BeautifulSoup offers several ways to select HTML nodes. For example, the find() and find_all() methods look up nodes by tag name, attributes, or text content:
# Find the first <p> tag
p_tag = soup.find('p')

# Find all <p> tags
p_tags = soup.find_all('p')

# Find the <p> tag whose class is "title"
title_p = soup.find('p', class_='title')
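find() and find_all() also accept an attrs dictionary and a list of tag names, and find_all() can cap the number of results. A minimal sketch, continuing from the soup object above:

# Equivalent attribute lookup using an attrs dictionary
story_p = soup.find('p', attrs={'class': 'story'})

# Search for several tag names at once
p_and_b_tags = soup.find_all(['p', 'b'])

# Limit how many matches find_all() returns
first_two_p = soup.find_all('p', limit=2)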
You can use a tag object's attributes, such as .string and .children, to access its child nodes:

# Access the text inside the <p class="title"> tag
text = title_p.string

# Access the child nodes of the <p class="title"> tag
children = title_p.children
Relational selection means selecting elements through the relationships between parent, child, and sibling nodes. BeautifulSoup provides attributes such as .parent, .children, .next_sibling, and .previous_sibling to navigate these relationships:
# Get the parent node of the <p class="title"> tag
parent_tag = title_p.parent

# Iterate over all child nodes of the <p class="title"> tag
for child in title_p.children:
    print(child)
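Sibling navigation is not shown in the snippet above. A minimal sketch, continuing from the same soup; note that the whitespace between tags also counts as a sibling node:

# .next_sibling may return a plain newline string first
raw_sibling = title_p.next_sibling

# find_next_sibling() skips over plain strings and returns the next matching tag
story_p = title_p.find_next_sibling('p')
print(story_p.get_text())

# find_previous_sibling() works the same way in the other direction
prev_p = story_p.find_previous_sibling('p')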
BeautifulSoup also provides several ways to filter and select nodes, such as matching on text content, regular expressions, or lambda functions:
import re

# Find the <p> tag containing the exact text
p_with_text = soup.find('p', text='Title')

# Find all <p> tags whose text matches a pattern
p_with_pattern = soup.find_all('p', text=re.compile('story'))
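The lambda style mentioned above is not shown in the snippet; a minimal sketch, again using the same soup:

# A callable receives each tag and returns True for the ones to keep
title_tags = soup.find_all(lambda tag: tag.name == 'p' and tag.get('class') == ['title'])

# Keep only tags that define an id attribute
tags_with_id = soup.find_all(lambda tag: tag.has_attr('id'))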
BeautifulSoup also supports selecting nodes with CSS selector syntax, via select_one() and select():
# Use a CSS selector to pick the <p> tag whose class is "title"
title_p_css = soup.select_one('p.title')

# Select all <p> tags
p_tags_css = soup.select('p')
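select() understands most common CSS patterns; a few further illustrative examples against the same soup (the specific selectors here are assumptions, not taken from the original text):

# Child combinator
body_paragraphs = soup.select('body > p')

# Attribute selectors
tags_with_class = soup.select('p[class]')
story_paragraphs = soup.select('p[class="story"]')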
A complete example that fetches a page and prints its title:

from bs4 import BeautifulSoup
import requests

url = 'http://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

title_tag = soup.find('title')
if title_tag:
    print("Page title:", title_tag.string)
Extracting every link on a page works the same way:

from bs4 import BeautifulSoup
import requests

url = 'http://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

links = soup.find_all('a')        # find all <a> tags (links)
for link in links:
    href = link.get('href')       # get the link's href attribute
    print("Link:", href)
Parsing an HTML table is another common task. Start with a document that contains a simple two-column table:

from bs4 import BeautifulSoup

html_doc = """
<table>
  <tr><th>姓名</th><th>年龄</th></tr>
  <tr><td>张三</td><td>25</td></tr>
  <tr><td>李四</td><td>30</td></tr>
</table>
"""
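The row-handling code for this table is only sketched here, assuming the html_doc just defined and that header cells are <th> and data cells are <td>:

soup = BeautifulSoup(html_doc, 'html.parser')

# Each <tr> is one row of the table
for row in soup.find_all('tr'):
    cells = row.find_all(['th', 'td'])
    print(' | '.join(cell.get_text(strip=True) for cell in cells))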
Below are some additional code examples using BeautifulSoup. They cover different tasks such as extracting information from web pages, modifying HTML content, and working with CSS selectors.
from bs4 import BeautifulSoup
import requests

url = 'https://example.com/somepage'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Use a CSS selector to extract all image links
img_links = soup.select('img[src]')
for img in img_links:
    print(img['src'])

# Use a CSS selector to extract elements with a specific class
elements_with_class = soup.select('.class-name')
for element in elements_with_class:
    print(element.get_text())
Another table-parsing example, this time with English column names:

from bs4 import BeautifulSoup

html_doc = """
<table>
  <tr><th>Name</th><th>Age</th></tr>
  <tr><td>Alice</td><td>28</td></tr>
  <tr><td>Bob</td><td>35</td></tr>
</table>
"""
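The processing code for this table is likewise only sketched here, under the assumption that the first row holds the headers; it collects the rows into a list of dictionaries:

soup = BeautifulSoup(html_doc, 'html.parser')

rows = soup.find_all('tr')
headers = [th.get_text(strip=True) for th in rows[0].find_all('th')]

# Turn each remaining row into a dict keyed by the header names
records = []
for row in rows[1:]:
    values = [td.get_text(strip=True) for td in row.find_all('td')]
    records.append(dict(zip(headers, values)))

print(records)   # [{'Name': 'Alice', 'Age': '28'}, {'Name': 'Bob', 'Age': '35'}]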
BeautifulSoup can also modify a document, not just read it:

from bs4 import BeautifulSoup

html_doc = """
<html>
<head><title>Test Page</title></head>
<body>
<p class="title">Old Title</p>
<p class="story">Old Story</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'html.parser')

# Change the title text
soup.title.string = 'New Title'

# Change the text of the <p> tag whose class is "title"
soup.find('p', class_='title').string = 'New Title Text'

# Append a new <p> tag
new_p = soup.new_tag('p')
new_p.string = 'This is a new paragraph.'
soup.body.append(new_p)

# Print the modified HTML
print(soup.prettify())
Extracting repeated blocks, here <div class="item"> elements, and processing each one:

from bs4 import BeautifulSoup

html_doc = """
<div class="item">
  <h2>Item 1</h2>
  <p>Description for item 1</p>
</div>
<div class="item">
  <h2>Item 2</h2>
  <p>Description for item 2</p>
</div>
"""
soup = BeautifulSoup(html_doc, 'html.parser')

# Extract every div with class "item" and process its contents
items = soup.find_all('div', class_='item')
for item in items:
    header = item.find('h2').text
    description = item.find('p').text
    print(f"Header: {header}")
    print(f"Description: {description}")
    print("-" * 20)
Finally, a longer end-to-end script that crawls the chapter list of a novel on biqukan8.cc and saves each chapter's text to a local file:

#!/usr/bin/env python
from bs4 import BeautifulSoup
import requests
import os
import logging
from fake_useragent import UserAgent

# Tunnel proxy domain:port
# tunnel = "r250.kdltps.com:15818"

# Username/password authentication
# username = "t19754578624032"
# password = "hemrc89p"
# proxies = {
#     "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
#     "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
# }

# Whitelist mode (the whitelist must be configured in advance)
# proxies = {
#     "http": "http://%(proxy)s/" % {"proxy": tunnel},
#     "https": "http://%(proxy)s/" % {"proxy": tunnel}
# }

# Target page for testing the proxy
# target_url = "https://dev.kdlapi.com/testproxy"
# response = requests.get(target_url, proxies=proxies)
# if response.status_code == 200:
#     print(response.text)

# Do not reuse connections with keep-alive (it prevents the tunnel from rotating IPs)

local_save_path = 'C:/Users/EA/Desktop/10-爬虫篇-第十次直播/Code/novel/'

ua = UserAgent()
headers = {"User-Agent": ua.random}

logging.basicConfig(level=logging.INFO, format='%(asctime)s -%(levelname)s: %(message)s')

url = 'https://www.biqukan8.cc/0_790/'
url_list = []
name_list = []
flag_name = ''


def novel_content(url, name):
    # Fetch one chapter page and extract the chapter text
    txt_response = requests.get(url=url, headers=headers)
    txts_soup = BeautifulSoup(str(txt_response.text), "lxml")
    # The chapter body lives in <div id="content" class="showtxt">
    txts = txts_soup.find_all(id='content', class_='showtxt')
    text_soup = BeautifulSoup(str(txts), 'lxml')
    text = text_soup.div.text
    file_write(name, text)


def file_write(name, text):
    # Create one directory per novel and write each chapter as a .txt file
    directory_path = local_save_path + novel_name
    if os.path.exists(directory_path):
        print(f"Directory '{directory_path}' already exists!")
    else:
        os.mkdir(directory_path)
        print(f"Directory '{directory_path}' has been created!")

    write_flag = True
    name_path = os.path.join(directory_path, f"{name}.txt")
    with open(name_path, "a+", encoding='utf-8') as file:
        for each in text:
            # Stop writing once the trailing 'http...' site link begins
            if each == 'h':
                write_flag = False
            if write_flag and each != '':
                file.write(each)
        file.write('\n\n')


# Fetch the novel's index page and isolate the chapter list
response = requests.get(url, headers=headers)
response.encoding = 'gbk'
soup = BeautifulSoup(response.text, "lxml")
chapters = soup.find_all('div', class_='listmain')
download_soup = BeautifulSoup(str(chapters), "lxml")

# Extract the novel title from the first <dt>
novel_name = str(download_soup.dl.dt).split("》")[0][5:]
# Marker that precedes the main chapter list, e.g. "《元尊》正文卷"
flag_name = "《" + novel_name + "》" + "正文卷"

# Collect chapter URLs and names, starting after the marker
begin_flag = False
for child in download_soup.dl.children:
    if child != '\n':
        if child.string == flag_name:
            begin_flag = True
        if begin_flag and child.a is not None:
            download_url = "https://www.biqukan8.cc/" + child.a.get("href")
            download_name = child.a.string
            url_list.append(download_url)
            name_list.append(download_name)

# Pair each chapter URL with its name and download them one by one
combined_list = zip(url_list, name_list)
for item1, item2 in combined_list:
    novel_content(item1, item2)