html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p class="title"name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup= BeautifulSoup(html,'lxml')
print(soup.prettify())#格式化代码,打印结果自动补全缺失的代码
print(soup.title.string)#文章标题
结果:
<html>
<head>
<title>
The Domouse's story
</title>
</head>
<body>
<p class="title" name="dromouse">
<b>
The Dormouse's story
</b>
</p>
<p class="story">
Once upon a time there were little sisters;and their names were
<a class="sister" href="http://example.com/elsie" id="link1">
<!--Elsie-->
</a>
<a class="sister" hred="http://example.com/lacle" id="link2">
Lacle
</a>
and
<a class="sister" hred="http://example.com/tilie" id="link3">
Tillie
</a>
and they lived at bottom of a well.
</p>
<p class="story">
...
</p>
</body>
</html>
The Domouse's story
选择元素
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p class="title"name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.title)
#<title>The Domouse's story</title>
print(type(soup.title))
#<class 'bs4.element.Tag'>
print(soup.head)
#<head><title>The Domouse's story</title></head>
print(soup.p)#当出现多个时,只返回第一个
#<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
获取标签名称:
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p class="title"name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.title.name)
#title
获取属性:
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p class="title"name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.p.attrs['name'])
#dromouse
print(soup.p['name'])
#dromouse
获取标签内容:
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p class="title"name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"><!--Elsie--></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
<p class="story">...</p>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.p.string)
#The Dormouse's story
根据name查找
html = '''
<div class="panel">
<div class="panel-heading"name="elements">
<h4>Hello</h4>
</div>
<div class="panel-body">
<ul class="list"Id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small"Id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
<div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
print(soup.find_all('ul'))#列表类型
print(type(soup.find_all('ul')[0]))
结果:
[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]
<class 'bs4.element.Tag'>