Python Crawler Learning - Day 006


Regular Expressions

Sample website

regex

Usage walkthrough

BeautifulSoup Library in Detail

A flexible and convenient HTML parsing library that is efficient and supports multiple parsers. With BeautifulSoup you can extract information from web pages conveniently without writing regular expressions.

Installation
  • pip install beautifulsoup4
Detailed usage
Parsers

| Parser | Usage | Advantages | Disadvantages |
| --- | --- | --- | --- |
| Python standard library | BeautifulSoup(markup, 'html.parser') | Built into Python, moderate speed, good document error tolerance | Poor error tolerance in versions before Python 2.7.3 / 3.2.2 |
| lxml HTML parser | BeautifulSoup(markup, 'lxml') | Fast, good document error tolerance | Requires the lxml C library |
| lxml XML parser | BeautifulSoup(markup, 'xml') | Fast, the only parser that supports XML | Requires the lxml C library |
| html5lib | BeautifulSoup(markup, 'html5lib') | Best error tolerance, parses the document the way a browser does and generates valid HTML5 | Slow, does not rely on external extensions |
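As a minimal sketch of how the parser argument changes the result, the snippet below feeds the same broken fragment to several parsers (this assumes lxml and html5lib are installed); each parser repairs the markup in its own way.

from bs4 import BeautifulSoup

broken = "<a><p>hello"  # deliberately unclosed tags

# Each parser repairs the broken markup differently; printing the resulting
# trees illustrates the differences in error tolerance.
for parser in ("html.parser", "lxml", "html5lib"):
    soup = BeautifulSoup(broken, parser)
    print(parser, "->", soup)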
Basic usage
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""

from bs4 import BeautifulSoup as bs
soup = bs(html, "lxml")
print(soup.prettify())
print(soup.title.string)  # .string is an attribute, not a callable
Tag selectors
Selecting elements
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.title)
print(type(soup.title))
print(soup.head)
print(soup.p)

Getting the tag name
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.title.name)
Getting attributes
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.attrs['name'])
print(soup.p['name'])
Getting the content
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.string)
Nested selection
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.head.title.string)  # key point: attribute access can be chained to walk down into nested nodes
Children and descendants
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.contents)
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.children)
for i, child in enumerate(soup.p.children):  # all direct child nodes
    print(i, child)
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):  # all descendant nodes
    print(i, child)
Parents and ancestors
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.a.parent)  # the direct parent node
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(list(enumerate(soup.a.parents)))
Siblings
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsio" class="sister" id="link1"></a>
<a href="http://example.com/elsio" class="sister" id="link2">Lacie</a>
<a href="http://example.com/elsio" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(list(enumerate(soup.a.next_siblings)))
print(list(enumerate(soup.a.previous_siblings)))
Standard selectors

find_all(name, attrs, recursive, text, **kwargs)

Searches the document by tag name, attributes, or text content.

name
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find_all('ul'))

html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))  # find_all can be called on a tag to search within it
attrs
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='element'))  # class is a Python keyword, so bs4 uses class_
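find_all can also filter by text content with the text parameter (the newer bs4 releases call the same argument string); it matches and returns the text strings themselves rather than the tags. A minimal, self-contained sketch against the same list markup:

import re
from bs4 import BeautifulSoup as bs

html = """
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
"""
soup = bs(html, 'lxml')
print(soup.find_all(text='Foo'))             # exact text match: ['Foo']
print(soup.find_all(text=re.compile('^B')))  # regular-expression match: ['Bar']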

find(name, attrs, recursive, text, **kwargs)

find() returns the first matching element; find_all() returns all matching elements.

html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.find('ul'))
print(soup.find('page'))
  • find_parents() and find_parent()

    • find_parents() returns all ancestor nodes
    • find_parent() returns the direct parent node
  • find_next_siblings() and find_next_sibling()

    • find_next_siblings() returns all following sibling nodes
    • find_next_sibling() returns the first following sibling node
  • find_previous_siblings() and find_previous_sibling()

    • find_previous_siblings() returns all preceding sibling nodes
    • find_previous_sibling() returns the first preceding sibling node
  • find_all_next() and find_next()

    • find_all_next() returns all matching nodes after the current node
    • find_next() returns the first matching node after it
  • find_all_previous() and find_previous()

    • find_all_previous() returns all matching nodes before the current node
    • find_previous() returns the first matching node before it (a short sketch of several of these methods follows this list)
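A brief, self-contained sketch of a few of these traversal helpers (the markup here is a trimmed version of the earlier sample document):

from bs4 import BeautifulSoup as bs

html = """
<p class="story">Once upon a time there were three little sisters;
<a class="sister" id="link1">Elsie</a>
<a class="sister" id="link2">Lacie</a>
<a class="sister" id="link3">Tillie</a>
</p>
"""
soup = bs(html, 'lxml')
first_a = soup.a                                       # the first <a> tag
print(first_a.find_parent('p'))                        # nearest enclosing <p>
print(first_a.find_next_sibling('a'))                  # the <a> right after it
print([a['id'] for a in first_a.find_all_next('a')])   # every later <a>: link2, link3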

CSS Selectors

Pass a CSS selector directly to select() to make a selection.

html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
print(soup.select('.panel-body'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))
Getting attributes
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
Getting text content
html = """
<html><head><title>The Dommouse's story</title></head>
<body>
	<div class="panel-body">
		<ul class="list" id="list-1">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
			<li class="element">Jay</li>
		</ul>
		<ul class="list list-small" id="list-2">
			<li class="element">Foo</li>
			<li class="element">Bar</li>
		</ul>
	</div>
</body>
</html>
"""
from bs4 import BeautifulSoup as bs
soup = bs(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())
Summary
  • The lxml parser is recommended; fall back to html.parser when necessary
  • Tag selection is fast but its filtering ability is weak
  • Use find() and find_all() to match a single result or multiple results
  • If you are familiar with CSS selectors, use select()
  • Remember the common methods for getting attributes and text values

PyQuery

A powerful and flexible web parsing library. If you are familiar with jQuery, you can get started with pyquery quickly and avoid writing tedious regular expressions.

Installation

pip install pyquery

Initialization
html="""
<div>
	<ul>
		<li class="item-0">first item</li>
		<li class='item-1'><a href="link2.html">second item</a></li>
		<li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li>
		<li class='item-1 active'><a href="link4.html">fourth item</a></li>
		<li class="item-0"><a href="link5.html">fifth item</a></li>
	</ul>
</div>
"""

from pyquery import PyQuery as pq
doc = pq(html)
print(doc("li"))
Initializing from a URL
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
print(doc('head'))
Initializing from a file
from pyquery import PyQuery as pq
doc = pq(filename="demo.html")
print(doc('li'))
Basic CSS selectors
html="""
<div id="container">
	<ul>
		<li class="item-0">first item</li>
		<li class='item-1'><a href="link2.html">second item</a></li>
		<li class='item-0 active'><a href="link3.html"><span class="bold">third item</span></a></li>
		<li class='item-1 active'><a href="link4.html">fourth item</a></li>
		<li class="item-0"><a href="link5.html">fifth item</a></li>
	</ul>
</div>
"""
from pyquery import PyQuery as pq
doc = pq(html)
print(doc("#container ul .item-0"))
Common operations
  • find(): e.g. find("li")

  • children(): child nodes

  • parent(): the parent element

  • parents(): ancestor nodes

  • siblings(): all sibling elements

  • items(): iterate over all matched elements

  • attr(name): get an attribute

  • text(): get the text content

  • html(): get the HTML content

  • addClass(name): add a CSS class

  • removeClass(name): remove a CSS class

  • attr("name", "link"): modify an attribute's value

  • css("font-size", "14px"): set a CSS property

  • item.remove(): remove an element (a short sketch of several of these follows this list)
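A minimal sketch exercising a few of these read-only operations on markup like the sample above (assuming pyquery is installed):

from pyquery import PyQuery as pq

html = """
<div id="container">
	<ul>
		<li class="item-0">first item</li>
		<li class="item-1"><a href="link2.html">second item</a></li>
		<li class="item-0 active"><a href="link3.html">third item</a></li>
	</ul>
</div>
"""
doc = pq(html)
ul = doc('ul')
print(ul.find('li'))            # find(): every <li> under the <ul>
print(ul.children('.item-0'))   # children(): direct children, optionally filtered
li = doc('.item-0.active')
print(li.parent())              # parent(): the enclosing <ul>
print(li.siblings())            # siblings(): the other <li> elements
for item in doc('a').items():   # items(): iterate over matched elements
    print(item.attr('href'), item.text())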

Official documentation

pyquery.readthedocs.io

Selenium Library

An automated testing tool that supports many browsers; it drives a browser to perform a series of operations. In crawling it is mainly used to handle pages rendered with JavaScript.

Installation

pip install selenium

Usage
Basic usage
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


browser = webdriver.Chrome()
try:
    browser.get("https://www.baidu.con")
    input = browser.find_element_by_id("kw")
    input.send_keys("Python")
    input.send_keys(Keys.ENTER)
    wait = WebDriverWait(browser, 10)
    wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
    print(browser.current_url)
    print(browser.get_cookies())
    print(browser.page_source)
finally:
    browser.close()
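Note that the find_element_by_* helpers were deprecated in Selenium 4 and removed in later 4.x releases; on recent versions the same lookup is written with a By locator. A short sketch (assuming a local chromedriver is available):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    # Selenium 4 style: pass a By locator instead of calling find_element_by_id
    box = browser.find_element(By.ID, 'kw')
    box.send_keys('Python')
    box.send_keys(Keys.ENTER)
finally:
    browser.close()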
Declaring a browser object
from selenium import webdriver

browser = webdriver.Chrome()
browser = webdriver.Firefox()
browser = webdriver.Edge()
browser = webdriver.PhantomJS()
browser = webdriver.Safari()
Visiting a page
from selenium import webdriver
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
print(browser.page_source)
browser.close()
Finding elements

A single element

from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
input_first = browser.find_element_by_id('q')
input_second = browser.find_element_by_css_selector('#q')
input_third = browser.find_element_by_xpath('//*[@id="q"]')
print(input_first, input_second, input_third)
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
input_first = browser.find_element(By.ID, 'q')
print(input_first)
browser.close()
Multiple elements

find_elements_by_css_selector

find_elements
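For example, a sketch that grabs a group of list items from the Taobao home page; the .service-bd li selector is only illustrative and may no longer match the live page. On Selenium 3.x both forms below work; on Selenium 4 only the By form remains.

from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# Both calls return a list of WebElement objects (an empty list if nothing matches)
lis = browser.find_elements_by_css_selector('.service-bd li')         # older helper style
lis_again = browser.find_elements(By.CSS_SELECTOR, '.service-bd li')  # generic form
print(lis)
print(lis_again)
browser.close()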

Interacting with elements
from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
input = browser.find_element_by_id('q')  # locate the search box first
input.send_keys("iPhone")
time.sleep(1)
input.clear()
input.send_keys('iPad')
button = browser.find_element_by_class_name('btn-search')
button.click()

More operations: selenium-python.readthedocs.io/api.html

Action chains

Attach actions to an action chain and execute them serially.

from selenium import webdriver
from selenium.webdriver import ActionChains

browser = webdriver.Chrome()
url= 'http://www.r
browser.switch_to_frame('iframeResult')
source = browser.find_element_by_css_selector('#draggable')
target = browser.find_element_by_css_selector('#droppable')
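To complete the drag-and-drop idea, the queued actions are placed on an ActionChains object and executed with perform(). A hedged sketch in the same style as above, with a placeholder URL and element ids (drag-and-drop demo pages vary):

from selenium import webdriver
from selenium.webdriver import ActionChains

browser = webdriver.Chrome()
browser.get('https://example.com/drag-and-drop-demo')  # placeholder URL for a drag-and-drop demo
source = browser.find_element_by_css_selector('#draggable')  # placeholder element ids
target = browser.find_element_by_css_selector('#droppable')
actions = ActionChains(browser)
actions.drag_and_drop(source, target)  # queue the drag action on the chain
actions.perform()                      # execute the queued actions serially
browser.close()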