BeautifulSoup库
0、公共部分(所有示例共用的导入与示例文档)
from bs4 import BeautifulSoup
# Sample document shared by the demos below.
# NOTE(review): the markup is incomplete (no closing </p> on the second
# paragraph, no </body></html>) — presumably on purpose, since
# BeautifulSoup's parsers are expected to repair broken fragments.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
1、基本用法
'''
基本用法demo1
'''
def demo01(html_doc):
    """Basic usage: pretty-print the whole document, then show the <title> text."""
    parsed = BeautifulSoup(html_doc, "lxml")
    pretty = parsed.prettify()
    print(pretty)
    print(parsed.title.string)
2、节点选择器
'''
节点选择器demo2
'''
def demo02(html_doc):
    """Node selection: attribute-style lookup returns the FIRST matching tag."""
    parsed = BeautifulSoup(html_doc, "lxml")
    # Each expression is printed in the original order.
    for value in (
        parsed.title,
        type(parsed.title),
        parsed.title.string,
        parsed.head,
        parsed.p,
        type(parsed.p),
        parsed.a,
    ):
        print(value)
3、提取节点信息
'''
提取节点信息demo3
'''
def demo03(html_doc):
    """Extract a node's name, attributes and text from the first <a> tag."""
    first_link = BeautifulSoup(html_doc, "lxml").a
    print(first_link.name)
    print("class值为: ", first_link.attrs["class"])
    print("href值为: ", first_link.attrs["href"])
    print(first_link.string)
4、获取子节点信息
'''
获取子节点信息demo4
'''
def demo04(html_doc):
    """Descend through nested tags: head -> title -> title text."""
    head_node = BeautifulSoup(html_doc, "lxml").head
    print(head_node)
    print(head_node.title)
    print(head_node.title.string)
5、关联选择
1、获取子节点--contents
'''
关联选择demo05--01--下级节点
使用contents属性进行获取--获取子节点
介绍:
在做选择的时候,有时候不能做到一步就获取到我想要的节点元素,需要选取某一个节点元素,
然后以这个节点为基准再选取它的子节点、父节点、兄弟节点等
'''
def demo05():
    """Related selection (children) via the .contents attribute.

    Starting from one node you can then select its children, parent or
    siblings; .contents returns the direct children as a list.  The two
    markups differ only in where the </p> line break falls, which changes
    the whitespace text nodes that appear in the list.
    """
    markup_one = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
<p class="story">...</p>
"""
    markup_two = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b>
</p>
<p class="story">...</p>
"""
    for markup in (markup_one, markup_two):
        print(BeautifulSoup(markup, "lxml").p.contents)
2、获取子节点--children
'''
关联选择demo06--02--下级节点
使用children属性进行获取--获取子节点
'''
def demo06():
    """Related selection (children) via .children — an iterator, unlike .contents."""
    markup = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
    story = BeautifulSoup(markup, "lxml").p
    # First the raw iterator object, then its materialised contents.
    print(story.children)
    print(list(story.children))
    for child in story.children:
        print(child)
3、获取子孙节点--descendants
'''
关联选择demo07--03--下级节点
使用descendants属性进行获取--获取子孙节点(获取:子节点和孙节点的内容)
'''
def demo07():
    """Related selection via .descendants — children AND their children, recursively."""
    markup = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span>Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    story = BeautifulSoup(markup, "lxml").p
    print(story.descendants)
    print(list(story.descendants))
4、获取父节点--parent、祖先节点--parents
'''
关联选择demo08--01--上级节点
使用parent属性进行获取--获取父节点
使用parents属性进行获取--获取祖先节点
'''
def demo08():
    """Related selection upwards: .parent (one level) and .parents (all ancestors)."""
    markup = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<p>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
</p>
</p>
<p class="story">...</p>
"""
    parsed = BeautifulSoup(markup, "lxml")
    print(parsed.p.parent)
    first_link = parsed.a
    print(first_link.parent)
    print("=======================")
    # .parents is a generator; printing it shows the object, the loop walks it.
    print(first_link.parents)
    for index, ancestor in enumerate(first_link.parents):
        print(index, ancestor)
5、获取兄弟节点
'''
关联选择demo09--兄弟节点
# 可以使用的属性有:
1、next_sibling
2、previous_sibling
3、next_siblings
4、previous_siblings
'''
def demo09():
    """Sibling navigation: next_sibling / previous_sibling and their plural forms.

    The singular forms return one adjacent node (often a whitespace/text
    node); the plural forms return generators over all following or
    preceding siblings.
    """
    markup = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
<a href="http://example.com/a" class="sister" id="link3">a</a>
<a href="http://example.com/b" class="sister" id="link3">b</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    first_link = BeautifulSoup(markup, "lxml").a
    print(first_link.next_sibling)
    print(first_link.next_siblings)
    print(first_link.previous_sibling)
    print(first_link.previous_siblings)
6、方法选择器
1、find_all()
'''
方法选择器 -- find_all() -- 以列表形式返回多个元素
find_all(name, attrs={}, recursive=True, string, limit)
# 1、name: 标签的名称--查找标签
# 2、attrs: 属性过滤器字典
# 3、recursive: 递归查找一个元素的子孙元素们,默认为True
# 4、string:查找文本
# 5、limit: 查找结果的个数限制
'''
def demo10():
    """find_all(): return every matching element as a list.

    Signature: find_all(name, attrs={}, recursive=True, string, limit)
      name      -- tag name to match
      attrs     -- attribute filter dict
      recursive -- search all descendants (default True)
      string    -- match node text instead of tags
      limit     -- cap on the number of results
    """
    markup = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    parsed = BeautifulSoup(markup, "lxml")
    print(parsed.find_all("a"))
    print(parsed.find_all(attrs={"class": "sister"}))
    # class_ avoids clashing with the Python keyword `class`.
    print(parsed.find_all(class_="sister"))
    print(parsed.find_all(class_="hi"))
    print(parsed.find_all(string="Elsie"))
2、find()
'''
方法选择器 -- find() -- 返回单个元素【一般是返回第一个元素作为结果】
'''
def demo11():
    """find(): like find_all() but returns only the first matching element."""
    markup = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>,
<a href="http://example.com/lacie" class="sister" id="link2"><span>Lacie</span></a> and
<a href="http://example.com/tillie" class="sister" id="link3"><span>Tillie</span></a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    first_link = BeautifulSoup(markup, "lxml").find("a")
    print(first_link)
3、其他方法选择器
'''
其他方法选择器
find_parents(): 返回所有的祖先节点
find_parent(): 返回当前节点的父节点
find_next_siblings():返回当前节点后面的所有兄弟节点
find_previous_siblings():返回当前节点前面的所有兄弟节点
find_next_sibling():返回当前节点后面的相邻的那个兄弟节点
find_previous_sibling():返回当前节点前面的相邻的那个兄弟节点
'''
7、CSS选择器--select()
'''
CSS选择器 -- select()方法
'''
def demo12():
    """CSS selection via select(): class, descendant and id selectors."""
    markup = """
<div class="panel">
<div class="panel-heading">
<h4>Hello World</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-samll" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
</div>
</div>
</div>
"""
    parsed = BeautifulSoup(markup, "lxml")
    print(parsed.select(".panel-heading"))   # by class
    print(parsed.select("ul li"))            # descendant combinator
    print(parsed.select("#list-2 li"))       # by id, then descendants
    print(parsed.select("ul"))               # by tag name
    print(type(parsed.select("ul")[0]))      # results are Tag objects
说明:
在 select(css)中的 css 有多个节点时,节点元素之间用空格分开,就是查找子孙节点,
例如 soup.select("div p") 是查找所有<div>节点下面的所有子孙<p>节点。
节点元素之间用 ">" 分开,就是只查找直接子节点,
例如 soup.select("div > p") 查找<div>的所有直接子<p>节点。
用 "~" 连接两个节点表示查找前一个节点后面的所有同级别的兄弟节点,
例如 soup.select("div ~ p") 查找<div>后面的所有同级别的<p>兄弟节点。
用 "+" 连接两个节点表示查找前一个节点后面的第一个同级别的兄弟节点,
例如 soup.select("div + p") 查找<div>后面的第一个同级别的<p>兄弟节点。
(组合符 >、~、+ 前后的空格不是必需的,但加上更易读。)
8、嵌套选择--select()
'''
嵌套选择 -- select( )方法
'''
def demo13():
    """Nested selection: call select() again on each node select() returned."""
    markup = """
<div class="panel">
<div class="panel-heading">
<h4>Hello World</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-samll" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
</div>
</div>
</div>
"""
    for bullet_list in BeautifulSoup(markup, "lxml").select("ul"):
        print(bullet_list.select("li"))
9、获取属性
'''
获取属性(两种方法)
'''
def demo14():
    """Two equivalent ways to read a tag attribute: subscript and .attrs."""
    markup = """
<div class="panel">
<div class="panel-heading">
<h4>Hello World</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-samll" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
</div>
</div>
</div>
"""
    for bullet_list in BeautifulSoup(markup, "lxml").select("ul"):
        print(bullet_list['id'])        # subscript access
        print(bullet_list.attrs['id'])  # explicit .attrs dict
10、获取文本
'''
获取文本(两种方式)
'''
def demo15():
    """Two ways to read a tag's text: .string and get_text()."""
    markup = """
<div class="panel">
<div class="panel-heading">
<h4>Hello World</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-samll" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
</div>
</div>
</div>
"""
    for item in BeautifulSoup(markup, "lxml").select("li"):
        print('String:', item.string)
        print('get text:', item.get_text())
参考链接
1、Python爬虫:史上最详细的BeautifulSoup教程