正则表达式
样例网站
用法讲解
BeautifulSoup库详解
灵活方便的网页解析库,高效,支持多种解析器。利用bs不用编写正则表达式即可方便地实现网页信息的提取
安装
- pip install beautifulsoup4
详细用法
解析库
解析器 | 使用方法 | 优势 | 劣势 |
---|---|---|---|
python标准库 | BeautifulSoup(markup, 'html.parser') | python内置标准库,执行速度适中,文档容错能力强 | python2.7.3或3.2.2前的版本中文容错能力差 |
lxml html解析 | BeautifulSoup(markup, 'lxml') | 速度快,文档容错能力强 | 需要安装c语言库 |
lxml xml解析 | BeautifulSoup(markup, 'xml') | 速度快,唯一支持xml的解析器 | 需要安装c语言库 |
html5lib | BeautifulSoup(markup, 'html5lib') | 最好的容错性,以浏览器的方式解析文档、生成HTML5格式的文档 | 速度慢,不依赖外部扩展 |
基本使用
# Basic usage: parse an HTML string and pretty-print the tree.
# (Fixed broken attribute quotes in the sample HTML so lxml parses it as intended.)
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2">Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3">Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story"> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, "lxml")
print(soup.prettify())
# .string is a property, not a method -- calling it raised TypeError.
print(soup.title.string)
标签选择器
选择元素
# Tag selectors: attribute access on the soup returns the FIRST matching tag.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
title_tag = soup.title
print(title_tag)
print(type(title_tag))  # bs4.element.Tag
print(soup.head)
print(soup.p)           # only the first <p> in the document
获取名称
# Getting a tag's name via the .name attribute.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
title_tag = soup.title
print(title_tag.name)  # -> 'title'
获取属性
# Getting attributes: Tag.attrs is a dict; a tag also supports dict-style indexing.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Fixed typo: 'atrs' -> 'attrs' (the original raised AttributeError).
print(soup.p.attrs['name'])
print(soup.p['name'])  # shorthand for the same lookup
获取内容
# Getting a tag's text content via the .string property.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
first_p = soup.p
print(first_p.string)
嵌套选择
# Nested selection: attribute access can be chained to walk down the tree.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Key line: tag attributes chain, so we can descend node by node.
# (Fixed: '//' is not a Python comment marker -- the original was a SyntaxError.)
print(soup.head.title.string)
子节点和子孙节点
# Direct children as a list via the .contents attribute.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
first_p = soup.p
print(first_p.contents)
# Direct children as an iterator via the .children attribute.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.children)  # a list_iterator, not a list
# Fixed: '#' comment instead of '//', and restored the lost loop indentation.
for i, child in enumerate(soup.p.children):  # all direct child nodes
    print(i, child)
# All descendants (children, grandchildren, ...) via the .descendants generator.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.descendants)  # a generator object
# Fixed: '#' comment instead of '//', and restored the lost loop indentation.
for i, child in enumerate(soup.p.descendants):  # all descendant nodes
    print(i, child)
父亲和祖先节点
# Parent node: .parent returns the direct parent.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Fixed: '#' comment instead of '//' (the original was a SyntaxError).
print(soup.a.parent)  # direct parent node
# Ancestors: .parents yields every enclosing node up to the document root.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(list(enumerate(soup.a.parents)))
兄弟节点
# Sibling nodes: generators for the nodes after and before the first <a>.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<p class="title" name="dromouse"> <b> The Dirmouse's story</b></p>
<p class="story"> Onece upon a time there were three little sisters; and their names were
<a href = "http://example.com/elsio" class = "sister" id="link1"></a>
<a href = "http://example.com/elsio" class = "sister" id="link2>Lacle</a>
<a href = "http://example.com/elsio" class = "sister" id="link3>Tittle</a>
and they lived at the bottom of a well.</p>
<p class="story> ...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
first_a = soup.a
print(list(enumerate(first_a.next_siblings)))      # siblings after the tag
print(list(enumerate(first_a.previous_siblings)))  # siblings before the tag
标准选择器
find_all(name, attrs, recursive, text, **kwargs)
可根据标签名、属性、内容查找文档
name
# find_all by tag name. (Fixed '<ui>' -> '<ul>' in the sample: id="list-2"
# clearly belongs to a second <ul>, and the demo expects two lists.)
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find_all('ul'))
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for ul in soup.find_all('ul'):
    # Fixed: searching 'ul' inside a 'ul' always printed [] -- 'li' was intended.
    print(ul.find_all('li'))
attrs
# find_all by attributes: pass a dict via attrs=, or use keyword arguments.
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))  # no matches -> []
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find_all(id='list-1'))
# Fixed: 'class' is a reserved word in Python -- BeautifulSoup uses 'class_'.
print(soup.find_all(class_='element'))
find(name, attrs, recursive, text, **kwargs)
find返回单个元素,find_all返回所有的元素
# find() returns only the first match (or None), unlike find_all().
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ui class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ui>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
print(soup.find('ul'))    # first matching tag
print(soup.find('page'))  # no such tag -> None
-
find_parents() find_parent()
- find_parents() 返回所有祖先节点
- find_parent() 返回直接父亲节点
-
find_next_siblings() find_next_sibling()
- find_next_siblings() 返回后面所有兄弟节点
- find_next_sibling()返回后面第一个兄弟节点
-
find_previous_siblings() find_previous_sibling()
- find_previous_siblings() 返回前面所有兄弟节点
- find_previous_sibling() 返回前面第一个兄弟节点
-
find_all_next() find_next()
- find_all_next() 返回节点后所有符合条件的节点
- find_next()返回第一个符合条件的节点
-
find_all_previous() 和find_previous()
- find_all_previous() 返回节点前所有符合条件的节点
- find_previous() 返回第一个符合条件的节点
CSS选择器
通过select()直接传入css选择器即可完成选择
# CSS selectors with select(): class ('.x'), descendant ('a b') and id ('#x').
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ui class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ui>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')
for selector in ('.panel-body', 'ul li', '#list-2 .element'):
    print(soup.select(selector))
# Nested CSS selection: select() also works on a tag, not only on the soup.
# (Fixed '<ui>' -> '<ul>' so the second list participates in the demo, and
# restored the lost loop-body indentation.)
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))
获取属性
# Getting attributes from selected tags: indexing and .attrs are equivalent.
# (Fixed '<ui>' -> '<ul>' so both ids are printed, as the demo intends.)
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
# Fixed: loop variable was 'ui' but the body referenced 'ul' (NameError).
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])  # same lookup through the attrs dict
获取内容
# Getting text content with get_text().
# (Restored the lost loop-body indentation -- SyntaxError as written.)
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ui class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ui>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for li in soup.select("li"):
    print(li.get_text())
总结
- 推荐使用lxml解析库,必要时使用html.parser
- 标签选择筛选功能弱但是速度快
- 建议使用find(),find_all()查询匹配单个结果或多个结果
- 如果对css选择器熟悉建议使用select()
- 记住常用的获取属性和文本值的方法
PyQuery
强大又灵活的网页解析库,如果熟悉jquery的话,可以很快接入pyquery。不用写麻烦的正则
安装
pip install pyquery
初始化
# Initialize PyQuery from an HTML string and select all <li> elements.
# (Fixed malformed tag '< a href=...' -> '<a href=...' in the sample HTML.)
html="""
<div>
<ul>
<li class="item-0">first item</li>
<li class='item-1'><a href="link2.html">second item</a></li>
<li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li>
<li class='item-1 active'><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
from pyquery import PyQuery as pq
doc = pq(html)
print(doc("li"))
URL初始化
# Initialize PyQuery directly from a URL (performs an HTTP GET).
from pyquery import PyQuery as pq

document = pq(url='http://www.baidu.com')
print(document('head'))
文件初始化
# Initialize PyQuery from a local HTML file.
from pyquery import PyQuery as pq

document = pq(filename="demo.html")
print(document('li'))
基本CSS选择器
# Basic CSS selector: '#id', descendant, and '.class' combined.
# (Fixed malformed tag '< a href=...' in the sample HTML.)
html="""
<div id="container">
<ul>
<li class="item-0">first item</li>
<li class='item-1'><a href="link2.html">second item</a></li>
<li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li>
<li class='item-1 active'><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
"""
# Fixed misspelled import: 'PyQeury' -> 'PyQuery' (ImportError otherwise).
from pyquery import PyQuery as pq
doc = pq(html)
print(doc("#container ul .item-0"))
操作接口
-
find函数 find("li")
-
children()孩子节点
-
parent() 父元素
-
parents() 祖先节点
-
siblings() 所有兄弟元素
-
items() 所有元素
-
attr(name) 属性
-
text() 文本
-
html() 获取html内容
-
addClass(name) 添加css class
-
removeClass(name) 移除css class
-
attr("name", "link") 修改属性值
-
css("font-size", "14px") 设置css值
-
item.remove() 移除元素
官方文档
pyquery.readthedocs.io
Selenium库
自动化测试工具,支持多种浏览器,驱动多种浏览器可以进行一系列的操作。爬虫中主要用来解决JavaScript渲染问题。
安装
pip install selenium
用法
基本使用
# Selenium basic usage: search Baidu and wait for the results container.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
try:
    # Fixed domain typo: '.con' -> '.com'.
    browser.get("https://www.baidu.com")
    # Renamed from 'input' to avoid shadowing the builtin.
    search_box = browser.find_element_by_id("kw")
    search_box.send_keys("Python")
    search_box.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    # Block until the results pane is present (up to 10 s).
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    # Always release the browser, even if the page load fails.
    browser.close()
声明浏览器对象
# Declaring a browser object: each supported driver has its own constructor.
from selenium import webdriver
browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.Edge()
browser = webdriver.PhantomJS()  # fixed typo: 'PhantonJS' (AttributeError)
browser = webdriver.Safari()
访问页面
# Visiting a page and dumping its rendered source.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.taobao.com")
print(driver.page_source)
driver.close()
查找元素
单个元素
# Finding a single element: three equivalent lookups for the search box.
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
input_first = browser.find_element_by_id('q')
input_second = browser.find_element_by_css_selector('#q')
input_third = browser.find_element_by_xpath('//*[@id="q"]')
# Fixed typo: 'inpout_firsta' was an undefined name (NameError).
print(input_first, input_second, input_third)
# find_element(By.ID, ...) is the generic form of find_element_by_id.
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.taobao.com")
search_box = driver.find_element(By.ID, 'q')
print(search_box)
driver.close()
多个元素
find_elements_by_css_selector
find_elements
元素交互
# Element interaction: type into the search box, clear it, retype, and submit.
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# Fixed: 'input' was used without ever being defined (NameError);
# locate the search box first, and avoid shadowing the builtin name.
search_box = browser.find_element_by_id('q')
search_box.send_keys("iPhone")
time.sleep(1)  # brief pause so the intermediate state is visible
search_box.clear()
search_box.send_keys('iPad')
button = browser.find_element_by_class_name('btn-search')
button.click()
更多操作: selenium-python.readthedocs.io/api.html
交互动作
将动作附加到动作链中串行执行
from selenium import webdriver
from selenium.webdriver import ActionChains
browser = webdriver.Chrome()
url= 'http://www.r
browser.switch_to_frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')
target = browser.find_element_by_css_selector('#droppable')