Python BeautifulSoup 对 HTML 进行爬取分类（部分）

作者：ymnets 发布时间：2023-05-06 阅读：0

from bs4 import BeautifulSoup

# Sample document. It has no closing </body> or </html> tags; the
# prettified output shows the parser adding them.
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
The Dormouse's story
Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.
...
'''

soup = BeautifulSoup(html, 'lxml')

# First the re-indented document (missing tags are filled in by the parser),
# then the text of the <title> element, each on its own line.
print(soup.prettify(), soup.title.string, sep='\n')
# Output:
<html>
<head>
<title>
 The Domouse's story
</title>
</head>
<body>

 
 The Dormouse's story
 


 Once upon a time there were little sisters;and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">
 
 </a>
 <a class="sister" hred="http://example.com/lacle" id="link2">
 Lacle
 </a>
 and
 <a class="sister" hred="http://example.com/tilie" id="link3">
 Tillie
 </a>
 and they lived at bottom of a well.


 ...

</body>
</html>
The Domouse's story
选择元素
from bs4 import BeautifulSoup

# Sample document. The two paragraphs are wrapped in <p> tags so that
# soup.p below has something to select; with no <p> element in the
# document, soup.p evaluates to None.
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p name="dromouse">The Dormouse's story</p>
<p>Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
...
'''
soup = BeautifulSoup(html, 'lxml')

# Accessing a tag name as an attribute returns the first matching element.
print(soup.title)
# <title>The Domouse's story</title>
print(type(soup.title))
# <class 'bs4.element.Tag'>
print(soup.head)
# <head><title>The Domouse's story</title></head>
print(soup.p)  # when several <p> elements exist, only the first is returned
# <p name="dromouse">The Dormouse's story</p>

# Getting the tag name:
from bs4 import BeautifulSoup

# Sample document; only its <title> element is used here.
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
The Dormouse's story
Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.
...
'''

soup = BeautifulSoup(html, 'lxml')

# .name holds the tag's own name; find('title') is the explicit form of
# the soup.title shortcut.
print(soup.find('title').name)
# Output: title

# Getting attributes:
from bs4 import BeautifulSoup

# Sample document. The first paragraph carries name="dromouse", which is
# the attribute read below; with no <p> element soup.p would be None and
# the attribute lookups would fail.
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p name="dromouse">The Dormouse's story</p>
<p>Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
...
'''
soup = BeautifulSoup(html, 'lxml')

# .attrs is the tag's attribute dictionary.
print(soup.p.attrs['name'])
# dromouse

# Indexing the tag directly is shorthand for the same lookup.
print(soup.p['name'])
# dromouse
获取标签内容：
from bs4 import BeautifulSoup

# Sample document. The first paragraph is wrapped in <p> so that soup.p
# selects it; with no <p> element soup.p would be None and .string would
# fail.
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p name="dromouse">The Dormouse's story</p>
<p>Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
...
'''
soup = BeautifulSoup(html, 'lxml')

# .string gives the text inside the tag.
print(soup.p.string)
# The Dormouse's story

# Searching by tag name:
from bs4 import BeautifulSoup

# Sample document containing two <ul> lists to search for.
html = '''
<div class="panel">
 <div class="panel-heading"name="elements">
 <h4>Hello</h4>
 </div>
 <div class="panel-body">
 <ul class="list"Id="list-1">
 <li class="element">Foo</li>
 <li class="element">Bar</li>
 <li class="element">Jay</li>
 </ul>
 <ul class="list list-small"Id="list-2">
 <li class="element">Foo</li>
 <li class="element">Bar</li>
 </ul>
 </div>
<div>
'''
soup = BeautifulSoup(html, 'lxml')

# find_all() collects every matching tag into a list-like result; run the
# search once and reuse it for both prints.
ul_tags = soup.find_all('ul')
print(ul_tags)
# Each entry of the result is an individual Tag object.
print(type(ul_tags[0]))
# Output (whitespace inside the lists may differ):
# [<ul class="list" id="list-1">
# <li class="element">Foo</li>
# <li class="element">Bar</li>
# <li class="element">Jay</li>
# </ul>, <ul class="list list-small" id="list-2">
# <li class="element">Foo</li>
# <li class="element">Bar</li>
# </ul>]
# <class 'bs4.element.Tag'>

本文标签：python beautifulsoup html 进行爬取分类部分

版权说明：

python beautifulsoup 对html 进行爬取分类（部分）

你可能感兴趣的