Python使用Beautiful Soup解析HTML，获取元素并提取内容值
1. 解析并获取标题
2. 根据标签及id获取所有元素
3. 根据标签及class获取所有元素
4. 获取元素下的标签的值
5. 获取元素下的parent及child的元素的值
from bs4 import BeautifulSoup
# Demo script: parse a saved HTML page with Beautiful Soup and print selected
# elements (title, elements by id/class, children, meta tags).
# NOTE(review): the original indentation was lost; the loop nesting below is
# reconstructed from which variables each statement uses -- confirm.
file_html = 'test/demo.html'

# Read raw bytes and decode explicitly; the context manager guarantees the
# handle is closed even if reading or decoding fails.
with open(file_html, "rb") as file:
    html = file.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")

print("获取文章title")
print(bs.title)

# Look up elements by tag name plus id.
id_list = bs.find_all('input', id='mSearchInput')

# Look up elements by tag name plus CSS class, then show each one's text and
# the text of its parent element.
div_class_list = bs.find_all('div', class_='view-num-box')
for i, div in enumerate(div_class_list):
    print(i, div.text, ' parent: ', div.parent.text)
print('-----------------------------------------------------------')

blog_list = bs.find_all('article', class_='blog-list-box')
# Queried once up front instead of re-scanning the whole document on every
# iteration of the loop below.
top_list = bs.find_all('div', class_='blog-list-box-top')
for i, blog in enumerate(blog_list):
    # The two result lists are paired by position; guard against the title
    # list being shorter than the article list.
    title_text = top_list[i].text if i < len(top_list) else ''
    print(i, blog.text, '\ntitle: ', title_text)
    # Attribute access (blog.h4, blog.span, blog.div) returns the first
    # descendant tag with that name.
    print(blog.h4.text)
    print(blog.span.text)
    print(blog.div, blog.div.next)
    # .contents is a list of direct children; .children iterates the same
    # nodes lazily.
    for j, content in enumerate(blog.contents):
        print('contents: ', j, content.text)
    for j, child in enumerate(blog.children):
        print('child: ', j, child.text)

div_list = bs.find_all('div', class_='user-profile-head-address')
# Only index the first match when there is one.
if div_list:
    print('div_list: ', div_list[0].text)

meta_list = bs.find_all('meta')
for j, meta in enumerate(meta_list):
    # Not every <meta> carries a "content" attribute (e.g. <meta charset>),
    # so use .get(), which yields None instead of raising KeyError.
    print(j, meta.text, meta.get('content'))

print("2. NavigableString的例子:获取title的string内容和div的属性")
print(bs.title.string)
print(bs.div.attrs)

print("3. BeautifulSoup的例子:获取整个html文档的name")
print(bs.name)

print("4. Comment的例子:获取a的string")
print(bs.a.string)
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
参考