@[toc]
Splash 是一个 JavaScript 渲染服务,基于 WebKit 开发。以下是详细的使用方法:
1. 安装和启动 Splash
Docker 安装(推荐)
# Pull the Splash image
docker pull scrapinghub/splash
# Start Splash detached under a fixed name, so that the later
# `docker stop/rm/restart splash-container` commands can find the container
docker run -d --name splash-container -p 8050:8050 -p 5023:5023 scrapinghub/splash
本地安装 Docker(Ubuntu)
# Refresh the package index, then install the docker.io package via apt
sudo apt-get update
sudo apt-get install docker.io
# Then run Splash through Docker
检查Splash服务状态
# Check whether a Splash container is running
docker ps | grep splash
# Check whether the Splash service is reachable
curl http://localhost:8050/
# View the Splash logs
docker logs <splash_container_id>
# Check the health of the Splash service
curl http://localhost:8050/_ping
调整Splash容器配置
# Stop and remove the existing container before recreating it
docker stop splash-container
docker rm splash-container
# Start it again with resource limits.
# NOTE: a comment must not follow a line-continuation backslash: "\ # ..."
# escapes the space instead of the newline and cuts the command short.
#   --memory=2g          memory limit
#   --cpus=1.5           CPU limit
#   --max-timeout 3600   option passed to Splash itself
docker run -d -p 8050:8050 -p 5023:5023 \
  --name splash-container \
  --memory=2g \
  --cpus=1.5 \
  scrapinghub/splash \
  --max-timeout 3600
# Restart Splash
docker restart splash-container
查看效果
我们在8050端口上运行了Splash服务,打开 http://localhost:8050/ 即可看到其Web页面(如果使用 Docker Machine,则地址为 http://192.168.99.100:8050/)
2. 基本使用方法
Python 客户端库
pip install scrapy-splash requests
简单示例
import requests
from urllib.parse import quote
def splash_render_basic(url):
    """Render ``url`` through Splash's ``render.html`` endpoint.

    Args:
        url: Address of the page Splash should load.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: On connection failure, client
            timeout, or a non-2xx answer from Splash.
    """
    splash_url = 'http://localhost:8050/render.html'
    params = {
        'url': url,
        'wait': 2,  # seconds to wait
        'timeout': 30,
    }
    # Client-side timeout slightly above the one sent to Splash, so a stuck
    # service cannot block the caller forever.
    response = requests.get(splash_url, params=params, timeout=35)
    # Do not hand an error body back to the caller as if it were page HTML.
    response.raise_for_status()
    return response.text
# 使用示例
html_content = splash_render_basic('https://www.cvma.org.cn/6847/index.html')
print(html_content)
3. 完整功能示例
渲染HTML内容
import requests
import json
def splash_render_html(url, wait_time=2, proxy=None):
    """Render ``url`` via Splash ``render.html`` with images disabled.

    Args:
        url: Address of the page Splash should load.
        wait_time: Value sent as Splash's ``wait`` argument (seconds).
        proxy: Optional proxy URL. It is only sent when given; the original
            always sent the placeholder ``http://your-proxy.com:port``.

    Returns:
        The response body as text, or ``None`` when the request failed.
    """
    splash_url = 'http://localhost:8050/render.html'
    params = {
        'url': url,
        'wait': wait_time,
        'images': 0,  # do not load images (faster)
        'timeout': 60,
    }
    if proxy:
        params['proxy'] = proxy
    try:
        response = requests.get(splash_url, params=params, timeout=90)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return None
    return response.text
# 使用
content = splash_render_html('https://www.cvma.org.cn/6847/index.html', wait_time=3)
渲染JSON数据(获取更多信息)
def splash_render_json(url, wait_time=2):
    """Fetch HTML, screenshot and HAR log for ``url`` from Splash ``render.json``.

    Args:
        url: Address of the page Splash should load.
        wait_time: Value sent as Splash's ``wait`` argument (seconds).

    Returns:
        A dict with the keys ``html``, ``png`` (base64-encoded screenshot) and
        ``har`` (network request log), taken from the JSON reply.

    Raises:
        requests.exceptions.RequestException: On connection failure, client
            timeout, or a non-2xx answer from Splash.
    """
    splash_url = 'http://localhost:8050/render.json'
    params = {
        'url': url,
        'wait': wait_time,
        'html': 1,  # include the HTML
        'png': 1,  # include a screenshot
        'har': 1,  # include the HAR network log
        'images': 0,  # do not load images
    }
    response = requests.get(splash_url, params=params, timeout=90)
    # Fail loudly on an error reply instead of returning a dict of None values.
    response.raise_for_status()
    data = response.json()
    return {key: data.get(key) for key in ('html', 'png', 'har')}
# 使用
result = splash_render_json('https://www.cvma.org.cn/6847/index.html')
print("HTML长度:", len(result['html']))
4. 执行自定义 JavaScript
简单的JS执行
def splash_execute_js(url, wait_time=2):
    """Load ``url`` in Splash, scroll to the bottom of the page, return the result.

    The wait time is passed to the Lua script as a request argument
    (``splash.args.wait``) rather than spliced into the script text with
    ``%`` formatting, which would break if the script ever contained ``%``.

    Args:
        url: Address of the page Splash should load.
        wait_time: Seconds to wait after ``splash:go`` (default 2, the value
            that used to be hard-coded).

    Returns:
        The decoded JSON reply; the script returns a table with ``html`` and
        ``url``.
    """
    splash_url = 'http://localhost:8050/execute'
    lua_script = """
    function main(splash)
        splash:go(splash.args.url)
        splash:wait(splash.args.wait)
        -- run JavaScript: scroll to the bottom of the document
        local scroll_to = splash:jsfunc([[
            function() {
                window.scrollTo(0, document.body.scrollHeight);
            }
        ]])
        scroll_to()
        splash:wait(1)
        return {
            html = splash:html(),
            url = splash:url()
        }
    end
    """
    data = {
        'lua_source': lua_script,
        'url': url,
        'wait': wait_time,
        'timeout': 60,
    }
    response = requests.post(splash_url, json=data)
    return response.json()
# 使用
result = splash_execute_js('https://www.cvma.org.cn/6847/index.html')
复杂交互操作
def splash_complex_interaction(url):
    """Click a ``.load-more`` button if present, then collect page information.

    The click is evaluated with ``splash:evaljs`` on an immediately-invoked
    function. The original used ``splash:runjs`` with top-level ``return``
    statements, which is invalid JavaScript, and ``runjs`` does not hand the
    script's value back to Lua anyway.

    Args:
        url: Address of the page Splash should load.

    Returns:
        The decoded JSON reply; the script returns a table with ``html``,
        ``title``, ``news_count`` and ``url``.
    """
    splash_url = 'http://localhost:8050/execute'
    lua_script = """
    function main(splash)
        splash:go(splash.args.url)
        splash:wait(2)
        -- click the button (if there is one); the IIFE's value says whether it was clicked
        local clicked = splash:evaljs([[
            (function() {
                var button = document.querySelector('.load-more');
                if (button) {
                    button.click();
                    return true;
                }
                return false;
            })()
        ]])
        if clicked then
            splash:wait(3)  -- wait for the extra content to load
        end
        -- gather page information
        local title = splash:evaljs("document.title")
        local news_count = splash:evaljs("document.querySelectorAll('.news-item').length")
        return {
            html = splash:html(),
            title = title,
            news_count = news_count,
            url = splash:url()
        }
    end
    """
    data = {
        'lua_source': lua_script,
        'url': url
    }
    response = requests.post(splash_url, json=data)
    return response.json()
5. 高级功能
设置请求头
def splash_with_headers(url):
    """Post a Lua script to Splash ``execute`` that sets a User-Agent and custom
    headers before loading ``url``; return the response body as text."""
    script = """
function main(splash)
splash:set_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
splash:set_custom_headers({
["Referer"] = "https://www.cvma.org.cn/",
["Accept-Language"] = "zh-CN,zh;q=0.9"
})
splash:go(splash.args.url)
splash:wait(2)
return splash:html()
end
"""
    payload = dict(lua_source=script, url=url)
    reply = requests.post('http://localhost:8050/execute', json=payload)
    return reply.text
处理Cookie和会话
def splash_with_cookies(url):
    """Post a Lua script to Splash ``execute`` that opens the site's home page,
    then ``url``, and returns the decoded JSON reply (``html`` and ``cookies``)."""
    script = """
function main(splash)
-- 首先访问首页获取Cookie
splash:go("https://www.cvma.org.cn/")
splash:wait(1)
-- 然后访问目标页面(携带Cookie)
splash:go(splash.args.url)
splash:wait(2)
-- 获取所有Cookie
local cookies = splash:get_cookies()
return {
html = splash:html(),
cookies = cookies
}
end
"""
    payload = dict(lua_source=script, url=url)
    reply = requests.post('http://localhost:8050/execute', json=payload)
    return reply.json()
6. 与Scrapy集成
settings.py 配置
# Scrapy settings.py
# Base URL of the Splash HTTP service
SPLASH_URL = 'http://localhost:8050'
# Downloader middlewares: the two scrapy-splash ones plus HTTP compression,
# each mapped to its order value
DOWNLOADER_MIDDLEWARES = {
'scrapy_splash.SplashCookiesMiddleware': 723,
'scrapy_splash.SplashMiddleware': 725,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Spider middleware provided by scrapy-splash
SPIDER_MIDDLEWARES = {
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Duplicate filter class provided by scrapy-splash
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
Scrapy Spider 示例
import scrapy
from scrapy_splash import SplashRequest
class CvmaSpider(scrapy.Spider):
    """Spider that requests its start URLs through Splash and yields one dict
    per ``.news-item`` node of the rendered page."""

    name = 'cvma'
    start_urls = ['https://www.cvma.org.cn/6847/index.html']

    def start_requests(self):
        """Wrap every start URL in a ``SplashRequest`` handled by ``parse``."""
        for start_url in self.start_urls:
            # A fresh args dict is built for each request.
            yield SplashRequest(
                start_url,
                self.parse,
                args={'wait': 2, 'timeout': 90},
            )

    def parse(self, response):
        """Extract title, date and link from each ``.news-item`` node."""
        for entry in response.css('.news-item'):
            record = {
                'title': entry.css('h3::text').get(),
                'date': entry.css('.date::text').get(),
                'link': entry.css('a::attr(href)').get(),
            }
            yield record
7. 错误处理和优化
完整的错误处理
import requests
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
def splash_render_robust(url, max_retries=3):
    """Render ``url`` via Splash ``render.html`` with automatic retries.

    Retrying is delegated entirely to the urllib3 ``Retry`` policy mounted on
    the session. The original additionally wrapped the call in a manual retry
    loop, so the two mechanisms multiplied each other (up to
    ``max_retries * (max_retries + 1)`` requests), and the session was never
    closed.

    Args:
        url: Address of the page Splash should load.
        max_retries: Maximum number of retries performed by the adapter.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.RequestException: When the request still fails
            after the retries are exhausted, or Splash answers with an error
            status that is not retried.
    """
    splash_url = 'http://localhost:8050/render.html'
    # Retry policy: back off between attempts and retry on these statuses.
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    params = {
        'url': url,
        'wait': 2,
        'timeout': 60,
        'resource_timeout': 30,
    }
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.mount("http://", adapter)
        session.mount("https://", adapter)
        response = session.get(splash_url, params=params, timeout=90)
        response.raise_for_status()
        return response.text
# 使用
try:
html = splash_render_robust('https://www.cvma.org.cn/6847/index.html')
print("成功获取内容")
except Exception as e:
print(f"最终失败: {e}")
8. 性能优化建议
def splash_optimized(url):
    """Performance-oriented Splash call: no images, short resource timeout, and
    waiting for a specific element instead of a fixed delay.

    The JavaScript passed to ``splash:wait_for_resume`` now calls
    ``splash.resume()`` once ``.content-loaded`` exists, and the wait is capped
    at 20 seconds. The original returned a Promise and never resumed, so the
    Lua script could only end through Splash's own timeout.

    Args:
        url: Address of the page Splash should load.

    Returns:
        The response body as text.
    """
    splash_url = 'http://localhost:8050/execute'
    lua_script = """
    function main(splash)
        -- performance settings
        splash.images_enabled = false
        splash.resource_timeout = 10
        splash:go(splash.args.url)
        -- wait for a specific element instead of a fixed time;
        -- control returns to Lua on splash.resume() or after 20 seconds
        splash:wait_for_resume([[
            function main(splash) {
                function check() {
                    if (document.querySelector('.content-loaded')) {
                        splash.resume();
                    } else {
                        setTimeout(check, 500);
                    }
                }
                check();
            }
        ]], 20)
        return splash:html()
    end
    """
    data = {'lua_source': lua_script, 'url': url}
    response = requests.post(splash_url, json=data, timeout=60)
    return response.text
Splash对象属性
上面示例中main()方法的第一个参数是splash,这个对象非常重要,它类似于Selenium中的WebDriver对象
1. images_enabled
设置图片是否加载,默认情况下是加载的。禁用该属性后,可以节省网络流量并提高网页加载速度
需要注意的是,禁用图片加载可能会影响JavaScript渲染。因为禁用图片之后,它的外层DOM节点的高度会受影响,进而影响DOM节点的位置
因此,如果JavaScript对图片节点有操作的话,其执行就会受到影响
-- Turn image loading off, open the page, and return its HTML in a table.
function main(splash, args)
  splash.images_enabled = false
  splash:go('https://www.baidu.com')
  local page_html = splash:html()
  return {html = page_html}
end
2. plugins_enabled
可以控制浏览器插件(如Flash插件)是否开启
默认情况下,此属性是false,表示不开启
splash.plugins_enabled = true  -- or false (the default) to keep plugins off
3. scroll_position
控制页面上下或左右滚动
splash.scroll_position = {x=100, y=200}
Splash对象的方法
1. go()
该方法用来请求某个链接,而且它可以模拟GET和POST请求,同时支持传入请求头、表单等数据
ok, reason = splash:go{url, baseurl=nil, headers=nil, http_method="GET", body=nil, formdata=nil}
返回结果是结果ok和原因reason
如果ok为空,代表网页加载出现了错误,此时reason变量中包含了错误的原因
| 参数 | 含义 |
|---|---|
| url | 请求的URL |
| baseurl | 可选参数,默认为空,表示资源加载相对路径 |
| headers | 可选参数,默认为空,表示请求头 |
| http_method | 可选参数,默认为GET,同时支持POST |
| body | 可选参数,默认为空,发POST请求时的表单数据,使用的Content-type为application/json |
| formdata | 可选参数,默认为空,POST的时候的表单数据,使用的Content-type为application/x-www-form-urlencoded |
splash:go{"http://www.sxt.cn", http_method="POST", body="name=17703181473"}
2. wait()
控制页面的等待时间
splash:wait{time, cancel_on_redirect=false, cancel_on_error=true}
| 参数 | 含义 |
|---|---|
| time | 等待的秒数 |
| cancel_on_redirect | 可选参数,默认为false,表示如果发生了重定向就停止等待,并返回重定向结果 |
| cancel_on_error | 可选参数,默认为true,表示如果发生了加载错误,就停止等待 |
-- Open the page, pause for two seconds, then return the HTML in a table.
function main(splash)
  splash:go("https://www.taobao.com")
  splash:wait(2)
  local page_html = splash:html()
  return {html = page_html}
end
3. jsfunc()
直接调用JavaScript定义的方法,JavaScript代码以字符串形式传入(多行代码通常用双中括号[[ ]]包围),这相当于实现了JavaScript方法到Lua脚本的转换
-- Wrap window.scrollTo as a Lua-callable function, scroll, and return a screenshot.
function main(splash, args)
  splash:go("http://www.sxt.cn")
  local scroll = splash:jsfunc("window.scrollTo")
  scroll(0, 300)
  local screenshot = splash:png()
  return {png = screenshot}
end
4. evaljs()与 runjs()
- evaljs() 可以执行JavaScript代码并返回最后一条JavaScript语句的返回结果
- runjs() 可以执行JavaScript代码,它与evaljs()的功能类似,但是更偏向于执行某些动作或声明某些方法
-- Declare a JavaScript function with runjs, then return what evaljs gets from calling it.
function main(splash, args)
  splash:go("https://www.baidu.com")
  splash:runjs("foo = function() { return 'sxt' }")
  return splash:evaljs("foo()")
end
5. html()
获取网页的源代码
-- Open the page and return its HTML source.
function main(splash, args)
  splash:go("https://www.bjsxt.com")
  local source = splash:html()
  return source
end
6. png()
获取PNG格式的网页截图
-- Open the page and return a PNG screenshot of it.
function main(splash, args)
  splash:go("https://www.bjsxt.com")
  local screenshot = splash:png()
  return screenshot
end
7. har()
获取页面加载过程描述
-- Open the page and return the HAR description of the page load.
function main(splash, args)
  splash:go("https://www.bjsxt.com")
  local har_log = splash:har()
  return har_log
end
8. url()
获取当前正在访问的URL
-- Open the page and return the URL currently being visited.
function main(splash, args)
  splash:go("https://www.bjsxt.com")
  local current_url = splash:url()
  return current_url
end
9. get_cookies()
获取当前页面的Cookies
-- Open the page and return its cookies.
function main(splash, args)
  splash:go("https://www.bjsxt.com")
  local cookies = splash:get_cookies()
  return cookies
end
10. add_cookie()
为当前页面添加Cookie
cookies = splash:add_cookie{name, value, path=nil, domain=nil, expires=nil, httpOnly=nil, secure=nil}
-- Add a cookie, open the page, and return its HTML.
function main(splash)
  splash:add_cookie{"sessionid", "123456abcdef", "/", domain="http://bjsxt.com"}
  splash:go("http://bjsxt.com/")
  local page_html = splash:html()
  return page_html
end
-- NOTE(review): this snippet repeats the add_cookie example directly before it.
-- Adds a cookie, opens the page, and returns its HTML.
function main(splash)
splash:add_cookie{"sessionid", "123456abcdef", "/", domain="http://bjsxt.com"}
splash:go("http://bjsxt.com/")
return splash:html()
end
11. clear_cookies()
可以清除所有的Cookies
-- Open the page, clear all cookies, and return what get_cookies reports afterwards.
function main(splash)
  splash:go("https://www.bjsxt.com/")
  splash:clear_cookies()
  local remaining = splash:get_cookies()
  return remaining
end
12. set_user_agent()
设置浏览器的User-Agent
-- Set the browser User-Agent, request httpbin, and return the HTML.
function main(splash)
  splash:set_user_agent('Splash')
  splash:go("http://httpbin.org/get")
  local page_html = splash:html()
  return page_html
end
13. set_custom_headers()
设置请求头
-- Set custom request headers, request httpbin, and return the HTML.
function main(splash)
  local headers = {
    ["User-Agent"] = "Splash",
    ["Site"] = "Splash",
  }
  splash:set_custom_headers(headers)
  splash:go("http://httpbin.org/get")
  local page_html = splash:html()
  return page_html
end
14. select()
选中符合条件的第一个节点
如果有多个节点符合条件,则只会返回一个
其参数是CSS选择器
-- Select the first node matching "#kw", wait three seconds, and return a screenshot.
function main(splash)
  splash:go("https://www.baidu.com/")
  -- `local` keeps the element handle out of the global environment
  local input = splash:select("#kw")
  splash:wait(3)
  return splash:png()
end
15. send_text()
填写文本
-- Type text into the node matching "#kw", wait three seconds, and return a screenshot.
function main(splash)
  splash:go("https://www.baidu.com/")
  -- `local` keeps the element handle out of the global environment
  local input = splash:select("#kw")
  input:send_text('Splash')
  splash:wait(3)
  return splash:png()
end
16. mouse_click()
模拟鼠标点击操作
-- Fill the "#kw" input, click the "#su" node, wait, and return a screenshot.
function main(splash)
  splash:go("https://www.baidu.com/")
  -- both handles are `local` so they do not leak into the global environment
  local input = splash:select("#kw")
  input:send_text('Splash')
  local submit = splash:select('#su')
  submit:mouse_click()
  splash:wait(3)
  return splash:png()
end
17. 代理Ip
-- Route every request through a proxy, set the User-Agent, and return the HTML.
function main(splash)
  splash:on_request(function(request)
    -- placeholder credentials (spelling fixed); replace with real values
    request:set_proxy{
      host='61.138.33.20',
      port=808,
      username='username',
      password='password'
    }
  end)
  -- set the User-Agent
  splash:set_user_agent("Mozilla/5.0")
  splash:go("https://httpbin.org/get")
  return splash:html()
end
Splash与Python结合
1. render.html
此接口用于获取JavaScript渲染的页面的HTML代码,接口地址就是Splash的运行地址加此接口名称,例如http://192.168.99.100:8050/render.html
import requests

# Fetch the JavaScript-rendered HTML from Splash's render.html endpoint and print it.
render_url = 'http://192.168.99.100:8050/render.html?url=https://www.bjsxt.com&wait=3'
print(requests.get(render_url).text)
2. render.png
此接口可以获取网页截图
import requests

# Request a screenshot from Splash's render.png endpoint.
url = 'http://192.168.99.100:8050/render.png?url=https://www.jd.com&wait=5&width=1000&height=700'
response = requests.get(url)
# The file name now matches the rendered site; the original saved the jd.com
# capture as taobao.png.
with open('jd.png', 'wb') as f:
    f.write(response.content)
3. execute
最为强大的接口。前面说了很多Splash Lua脚本的操作,用此接口便可实现与Lua脚本的对接
import requests
from urllib.parse import quote

# Lua source sent to Splash's execute endpoint; main() just returns a string.
lua = '''
function main(splash)
return 'hello'
end
'''
# URL-encode the script and pass it as the lua_source query parameter.
endpoint = f'http://192.168.99.100:8050/execute?lua_source={quote(lua)}'
print(requests.get(endpoint).text)