[1340]Splash的使用

60 阅读8分钟

@[toc]

Splash 是一个 JavaScript 渲染服务,基于 WebKit 开发。以下是详细的使用方法:

1. 安装和启动 Splash

Docker 安装(推荐)

# 拉取Splash镜像
docker pull scrapinghub/splash

# 启动Splash服务
docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash

本地安装(Ubuntu)

sudo apt-get update
sudo apt-get install docker.io
# 然后使用Docker运行

检查Splash服务状态

# 检查Splash容器是否运行
docker ps | grep splash

# 检查Splash服务是否可访问
curl http://localhost:8050/

# 查看Splash日志
docker logs <splash_container_id>

# 检查Splash服务健康状态
curl http://localhost:8050/_ping

调整Splash容器配置

# 重启Splash容器并增加资源限制
docker stop splash-container
docker rm splash-container

# Start a fresh container with higher resource limits.
# NOTE: a comment must not follow a trailing backslash. "\  # ..." escapes the
# space instead of the newline, which ends the command early and runs the
# remaining lines as separate commands. The flags are explained here instead:
#   --memory=2g          raise the container memory limit
#   --cpus=1.5           raise the container CPU limit
#   --max-timeout 3600   option passed to Splash itself (after the image name)
docker run -d -p 8050:8050 -p 5023:5023 \
  --name splash-container \
  --memory=2g \
  --cpus=1.5 \
  scrapinghub/splash \
  --max-timeout 3600

# 重启Splash
docker restart splash-container

查看效果

我们在8050端口上运行了Splash服务,打开 http://localhost:8050/ 即可看到其Web页面(如果Docker运行在Docker Machine等虚拟机中,请将localhost替换为虚拟机的IP,例如 http://192.168.99.100:8050/)

2. 基本使用方法

Python 客户端库

pip install scrapy-splash requests

简单示例

import requests
from urllib.parse import quote

def splash_render_basic(url):
    """Return the HTML of ``url`` as rendered by Splash's render.html endpoint.

    The request asks Splash to wait 2 seconds after loading the page and to
    give up rendering after 30 seconds. The body of the HTTP reply is returned
    as text, whatever its status code.
    """
    endpoint = 'http://localhost:8050/render.html'
    # wait: seconds to wait after load; timeout: Splash-side render timeout.
    query = dict(url=url, wait=2, timeout=30)
    return requests.get(endpoint, params=query).text

# 使用示例
html_content = splash_render_basic('https://www.cvma.org.cn/6847/index.html')
print(html_content)

3. 完整功能示例

渲染HTML内容

import requests
import json

def splash_render_html(url, wait_time=2, proxy=None):
    """Render ``url`` through Splash's render.html endpoint with images disabled.

    Args:
        url: Address of the page to render.
        wait_time: Seconds Splash waits after the page loads.
        proxy: Optional proxy URL forwarded to Splash as its ``proxy``
            argument. When omitted, no proxy argument is sent at all; the
            previous version always sent a placeholder proxy address.

    Returns:
        The rendered HTML as text, or ``None`` if the request failed (the
        failure is printed).
    """
    splash_url = 'http://localhost:8050/render.html'

    params = {
        'url': url,
        'wait': wait_time,
        'images': 0,  # do not load images (faster)
        'timeout': 60,
    }
    # Only forward a proxy when the caller actually supplied one.
    if proxy:
        params['proxy'] = proxy

    try:
        # The client timeout is larger than Splash's own 60 s render timeout,
        # so Splash gets the chance to report its timeout first.
        response = requests.get(splash_url, params=params, timeout=90)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return None
    return response.text

# 使用
content = splash_render_html('https://www.cvma.org.cn/6847/index.html', wait_time=3)

渲染JSON数据(获取更多信息)

def splash_render_json(url, wait_time=2):
    """Render ``url`` with Splash's render.json endpoint and return selected fields.

    Args:
        url: Address of the page to render.
        wait_time: Seconds Splash waits after the page loads.

    Returns:
        A dict with the keys ``html``, ``png`` (base64-encoded screenshot) and
        ``har`` (network request log) taken from the JSON reply. A key that is
        missing from the reply maps to ``None``.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or Splash answers with an HTTP error status.
    """
    splash_url = 'http://localhost:8050/render.json'

    params = {
        'url': url,
        'wait': wait_time,
        'html': 1,        # include the rendered HTML
        'png': 1,         # include a screenshot
        'har': 1,         # include the HAR network log
        'images': 0       # do not load images
    }

    # Bound the call so a stuck Splash instance cannot block forever, and turn
    # HTTP error replies into exceptions: without the status check an error
    # reply would be parsed like a success and yield a dict of None values.
    response = requests.get(splash_url, params=params, timeout=90)
    response.raise_for_status()
    data = response.json()

    return {
        'html': data.get('html'),
        'png': data.get('png'),
        'har': data.get('har'),
    }

# 使用
result = splash_render_json('https://www.cvma.org.cn/6847/index.html')
print("HTML长度:", len(result['html']))

4. 执行自定义 JavaScript

简单的JS执行

def splash_execute_js(url, wait_time=2):
    """Run a Lua script on Splash that loads ``url``, scrolls to the bottom and returns the page.

    Args:
        url: Address of the page to load.
        wait_time: Seconds to wait after the initial load, before scrolling.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        The decoded JSON reply of the ``execute`` endpoint. On success the
        script returns a table with ``html`` and ``url`` fields.

    Raises:
        requests.exceptions.RequestException: If the request fails or exceeds
            the client-side timeout.
    """
    splash_url = 'http://localhost:8050/execute'

    # The wait is read from splash.args instead of being %-formatted into the
    # script text, so the Lua source stays a constant and can safely contain
    # literal '%' characters.
    lua_script = """
    function main(splash)
        splash:go(splash.args.url)
        splash:wait(splash.args.wait)

        -- 执行JavaScript
        local scroll_to = splash:jsfunc([[
            function() {
                window.scrollTo(0, document.body.scrollHeight);
            }
        ]])
        scroll_to()

        splash:wait(1)

        return {
            html = splash:html(),
            url = splash:url()
        }
    end
    """

    data = {
        'lua_source': lua_script,
        'url': url,
        'wait': wait_time,
        'timeout': 60
    }

    # Client timeout above Splash's own 60 s limit so the call cannot hang.
    response = requests.post(splash_url, json=data, timeout=90)
    return response.json()

# 使用
result = splash_execute_js('https://www.cvma.org.cn/6847/index.html')

复杂交互操作

def splash_complex_interaction(url):
    """Load ``url`` in Splash, click a ``.load-more`` button if present, and collect page info.

    Args:
        url: Address of the page to load.

    Returns:
        The decoded JSON reply of the ``execute`` endpoint. On success the
        script returns a table with ``html``, ``title``, ``news_count`` and
        ``url`` fields.

    Raises:
        requests.exceptions.RequestException: If the request fails or exceeds
            the client-side timeout.
    """
    splash_url = 'http://localhost:8050/execute'

    # The click is performed with evaljs on an immediately-invoked function:
    # runjs does not hand the script's value back to Lua, and a bare top-level
    # `return` is not valid JavaScript, so the old check never reflected
    # whether the button had actually been clicked.
    lua_script = """
    function main(splash)
        splash:go(splash.args.url)
        splash:wait(2)

        -- 点击按钮(如果有)
        local success = splash:evaljs([[
            (function () {
                var button = document.querySelector('.load-more');
                if (button) {
                    button.click();
                    return true;
                }
                return false;
            })()
        ]])

        if success then
            splash:wait(3)  -- 等待加载完成
        end

        -- 获取页面信息
        local title = splash:evaljs("document.title")
        local news_count = splash:evaljs("document.querySelectorAll('.news-item').length")

        return {
            html = splash:html(),
            title = title,
            news_count = news_count,
            url = splash:url()
        }
    end
    """

    data = {
        'lua_source': lua_script,
        'url': url
    }

    # Bound the call so a stuck Splash instance cannot block the caller forever.
    response = requests.post(splash_url, json=data, timeout=60)
    return response.json()

5. 高级功能

设置请求头

def splash_with_headers(url):
    """Render ``url`` on Splash with a custom User-Agent and extra request headers.

    The Lua script sets the user agent plus ``Referer`` and
    ``Accept-Language`` headers, loads the page, waits 2 seconds and returns
    its HTML. The body of the HTTP reply is returned as text.
    """
    lua_script = """
    function main(splash)
        splash:set_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")
        splash:set_custom_headers({
            ["Referer"] = "https://www.cvma.org.cn/",
            ["Accept-Language"] = "zh-CN,zh;q=0.9"
        })
        
        splash:go(splash.args.url)
        splash:wait(2)
        
        return splash:html()
    end
    """

    # The script goes in first, then the page address it reads via splash.args.
    payload = {}
    payload['lua_source'] = lua_script
    payload['url'] = url

    reply = requests.post('http://localhost:8050/execute', json=payload)
    return reply.text

处理Cookie和会话

def splash_with_cookies(url):
    """Visit the site's home page and then ``url`` in one Splash session, returning HTML and cookies.

    The Lua script opens the home page first so that cookies are set, then
    loads the target page and reads all cookies. The decoded JSON reply of
    the ``execute`` endpoint is returned.
    """
    lua_script = """
    function main(splash)
        -- 首先访问首页获取Cookie
        splash:go("https://www.cvma.org.cn/")
        splash:wait(1)
        
        -- 然后访问目标页面(携带Cookie)
        splash:go(splash.args.url)
        splash:wait(2)
        
        -- 获取所有Cookie
        local cookies = splash:get_cookies()
        
        return {
            html = splash:html(),
            cookies = cookies
        }
    end
    """

    payload = {}
    payload['lua_source'] = lua_script
    payload['url'] = url

    reply = requests.post('http://localhost:8050/execute', json=payload)
    return reply.json()

6. 与Scrapy集成

settings.py 配置

# Scrapy settings.py
# Base address of the Splash instance used by scrapy-splash.
SPLASH_URL = 'http://localhost:8050'

# Downloader middlewares: the two scrapy-splash entries, plus Scrapy's
# HttpCompressionMiddleware registered at priority 810.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Spider middleware shipped with scrapy-splash.
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# Duplicate filter class provided by scrapy-splash.
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

Scrapy Spider 示例

import scrapy
from scrapy_splash import SplashRequest

class CvmaSpider(scrapy.Spider):
    """Spider that fetches the start page through Splash and yields one dict per news item."""

    name = 'cvma'
    start_urls = ['https://www.cvma.org.cn/6847/index.html']

    def start_requests(self):
        """Yield a SplashRequest for every start URL, handled by ``parse``."""
        splash_args = {'wait': 2, 'timeout': 90}
        for start_url in self.start_urls:
            yield SplashRequest(start_url, self.parse, args=splash_args)

    def parse(self, response):
        """Yield title, date and link for each ``.news-item`` node of the rendered page."""
        for node in response.css('.news-item'):
            heading = node.css('h3::text').get()
            published = node.css('.date::text').get()
            href = node.css('a::attr(href)').get()
            yield {'title': heading, 'date': published, 'link': href}

7. 错误处理和优化

完整的错误处理

import requests
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def splash_render_robust(url, max_retries=3):
    """Render ``url`` via Splash's render.html endpoint, retrying transient failures.

    Retries are handled by one mechanism only, the loop below. The previous
    version also mounted a urllib3 ``Retry`` adapter, so every loop iteration
    retried internally as well and the attempt counts multiplied.

    Args:
        url: Address of the page to render.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so the function never returns without trying.

    Returns:
        The rendered HTML as text.

    Raises:
        requests.exceptions.RequestException: When the last attempt fails, or
            immediately when Splash answers with an HTTP status that is not
            worth retrying (anything outside 429/500/502/503/504).
    """
    splash_url = 'http://localhost:8050/render.html'

    params = {
        'url': url,
        'wait': 2,
        'timeout': 60,
        'resource_timeout': 30
    }

    # Same status codes the original retry strategy treated as transient.
    retryable_statuses = {429, 500, 502, 503, 504}
    attempts = max(1, max_retries)

    # The context manager closes the session's connections on every exit path.
    with requests.Session() as session:
        for attempt in range(attempts):
            try:
                response = session.get(splash_url, params=params, timeout=90)
                response.raise_for_status()
                return response.text
            except requests.exceptions.RequestException as e:
                print(f"尝试 {attempt + 1} 失败: {e}")
                # e.response is None for connection errors and timeouts.
                status = getattr(e.response, 'status_code', None)
                permanent = status is not None and status not in retryable_statuses
                if permanent or attempt == attempts - 1:
                    raise
                time.sleep(2 ** attempt)  # exponential backoff

# 使用
try:
    html = splash_render_robust('https://www.cvma.org.cn/6847/index.html')
    print("成功获取内容")
except Exception as e:
    print(f"最终失败: {e}")

8. 性能优化建议

def splash_optimized(url):
    """Render ``url`` on Splash, waiting for a ``.content-loaded`` element instead of a fixed delay.

    Images are disabled and per-resource loading is limited to 10 seconds.
    After navigation the script polls the page every 500 ms until the element
    appears, or until 20 seconds have passed, and then returns the HTML.

    Args:
        url: Address of the page to render.

    Returns:
        The body of the HTTP reply as text.

    Raises:
        requests.exceptions.RequestException: If the request fails or exceeds
            the 60-second client timeout.
    """
    splash_url = 'http://localhost:8050/execute'

    # wait_for_resume only returns to Lua when the page script calls
    # splash.resume() or its timeout expires. The old snippet returned a
    # Promise and never called resume, and had no timeout, so the script could
    # only end by hitting Splash's overall script timeout.
    lua_script = """
    function main(splash)
        -- 性能优化设置
        splash.images_enabled = false
        splash.resource_timeout = 10

        splash:go(splash.args.url)

        -- 等待特定元素出现,而不是固定时间
        splash:wait_for_resume([[
            function main(splash) {
                function check() {
                    if (document.querySelector('.content-loaded')) {
                        splash.resume();
                    } else {
                        setTimeout(check, 500);
                    }
                }
                check();
            }
        ]], 20)

        return splash:html()
    end
    """

    data = {'lua_source': lua_script, 'url': url}
    response = requests.post(splash_url, json=data, timeout=60)
    return response.text

Splash对象属性

Lua脚本中main()方法的第一个参数是splash,这个对象非常重要,它类似于Selenium中的WebDriver对象

1. images_enabled

设置图片是否加载,默认情况下是加载的。禁用该属性后,可以节省网络流量并提高网页加载速度

需要注意的是,禁用图片加载可能会影响JavaScript渲染。因为禁用图片之后,它的外层DOM节点的高度会受影响,进而影响DOM节点的位置

因此,如果JavaScript对图片节点有操作的话,其执行就会受到影响

function main(splash, args)
  -- Switch image loading off before navigating, then return the page HTML.
  splash.images_enabled = false
  splash:go('https://www.baidu.com')
  local rendered = {}
  rendered.html = splash:html()
  return rendered
end

2. plugins_enabled

可以控制浏览器插件(如Flash插件)是否开启

默认情况下,此属性是false,表示不开启

splash.plugins_enabled = true/false

3. scroll_position

控制页面上下或左右滚动

splash.scroll_position = {x=100, y=200}

Splash对象的方法

1. go()

该方法用来请求某个链接,而且它可以模拟GET和POST请求,同时支持传入请求头、表单等数据

ok, reason = splash:go{url, baseurl=nil, headers=nil, http_method="GET", body=nil, formdata=nil}

返回结果是结果ok和原因reason

如果ok为空,代表网页加载出现了错误,此时reason变量中包含了错误的原因

| 参数 | 含义 |
| --- | --- |
| url | 请求的URL |
| baseurl | 可选参数,默认为空,表示资源加载相对路径 |
| headers | 可选参数,默认为空,表示请求头 |
| http_method | 可选参数,默认为GET,同时支持POST |
| body | 可选参数,默认为空,发POST请求时的表单数据,使用的Content-type为application/json |
| formdata | 可选参数,默认为空,POST的时候的表单数据,使用的Content-type为application/x-www-form-urlencoded |

splash:go{"http://www.sxt.cn", http_method="POST", body="name=17703181473"}

2. wait()

控制页面的等待时间

splash:wait{time, cancel_on_redirect=false, cancel_on_error=true}

| 参数 | 含义 |
| --- | --- |
| time | 等待的秒数 |
| cancel_on_redirect | 可选参数,默认为false;设为true时,如果发生了重定向就停止等待,并返回重定向结果 |
| cancel_on_error | 可选参数,默认为true,表示如果发生了加载错误,就停止等待 |

function main(splash)
    -- Open the page and wait two seconds before capturing its HTML.
    splash:go("https://www.taobao.com")
    splash:wait(2)
    local result = {}
    result.html = splash:html()
    return result
end

3. jsfunc()

直接调用JavaScript定义的方法。传入的JavaScript代码是一个字符串,较长的函数定义通常用双中括号([[ ]])包围,这相当于实现了JavaScript方法到Lua脚本的转换

function main(splash, args)
  splash:go("http://www.sxt.cn")
  -- Wrap the page's window.scrollTo so it can be called like a Lua function.
  local scroll_page = splash:jsfunc("window.scrollTo")
  scroll_page(0, 300)
  local shot = {}
  shot.png = splash:png()
  return shot
end

4. evaljs()与 runjs()

  • evaljs() 可以执行JavaScript代码并返回最后一条JavaScript语句的返回结果
  • runjs() 可以执行JavaScript代码,它与evaljs()的功能类似,但是更偏向于执行某些动作或声明某些方法
function main(splash, args)
  splash:go("https://www.baidu.com")
  -- runjs declares a helper in the page; evaljs then calls it and hands the
  -- value of that expression back to Lua.
  splash:runjs("foo = function() { return 'sxt' }")
  local value = splash:evaljs("foo()")
  return value
end

5. html()

获取网页的源代码

function main(splash, args)
  -- Load the page and return its HTML source.
  splash:go("https://www.bjsxt.com")
  local page_source = splash:html()
  return page_source
end

6. png()

获取PNG格式的网页截图

function main(splash, args)
  -- Load the page and return a PNG screenshot of it.
  splash:go("https://www.bjsxt.com")
  local screenshot = splash:png()
  return screenshot
end

7. har()

获取页面加载过程描述

function main(splash, args)
  -- Load the page and return the HAR record of the loading process.
  splash:go("https://www.bjsxt.com")
  local load_record = splash:har()
  return load_record
end

8. url()

获取当前正在访问的URL

function main(splash, args)
  -- Load the page and return the URL Splash is currently on.
  splash:go("https://www.bjsxt.com")
  local current_url = splash:url()
  return current_url
end

9. get_cookies()

获取当前页面的Cookies

function main(splash, args)
  -- Load the page and return the cookies held after the visit.
  splash:go("https://www.bjsxt.com")
  local jar = splash:get_cookies()
  return jar
end

10. add_cookie()

为当前页面添加Cookie

cookies = splash:add_cookie{name, value, path=nil, domain=nil, expires=nil, httpOnly=nil, secure=nil}
 
 
-- Add a "sessionid" cookie before navigating, then return the page HTML.
function main(splash)
    splash:add_cookie{"sessionid", "123456abcdef", "/", domain="http://bjsxt.com"}
    splash:go("http://bjsxt.com/")
    return splash:html()
end
-- NOTE(review): this definition repeats the one directly above verbatim;
-- presumably a copy/paste leftover. Confirm before removing it.
function main(splash)
    splash:add_cookie{"sessionid", "123456abcdef", "/", domain="http://bjsxt.com"}
    splash:go("http://bjsxt.com/")
    return splash:html()
end

11. clear_cookies()

可以清除所有的Cookies

function main(splash)
    -- Visit the page, clear all cookies, then return whatever cookies remain.
    splash:go("https://www.bjsxt.com/")
    splash:clear_cookies()
    local remaining = splash:get_cookies()
    return remaining
end

12. set_user_agent()

设置浏览器的User-Agent

function main(splash)
  -- Set the User-Agent before the request; httpbin echoes the request back.
  splash:set_user_agent('Splash')
  splash:go("http://httpbin.org/get")
  local echoed = splash:html()
  return echoed
end

13. set_custom_headers()

设置请求头

function main(splash)
  -- Build the header table first, then register it before the request.
  local headers = {}
  headers["User-Agent"] = "Splash"
  headers["Site"] = "Splash"
  splash:set_custom_headers(headers)
  splash:go("http://httpbin.org/get")
  local echoed = splash:html()
  return echoed
end

14. select()

选中符合条件的第一个节点

如果有多个节点符合条件,则只会返回一个

其参数是CSS选择器

function main(splash)
  splash:go("https://www.baidu.com/")
  -- select() takes a CSS selector. The handle is declared local so it does
  -- not become a global variable of the script.
  local input = splash:select("#kw")
  splash:wait(3)
  return splash:png()
end

15. send_text()

填写文本

function main(splash)
  splash:go("https://www.baidu.com/")
  -- Select the search box and type into it. The handle is declared local so
  -- it does not become a global variable of the script.
  local input = splash:select("#kw")
  input:send_text('Splash')
  splash:wait(3)
  return splash:png()
end

16. mouse_click()

模拟鼠标点击操作

function main(splash)
  splash:go("https://www.baidu.com/")
  -- Fill in the search box, then click the submit button. Both handles are
  -- declared local so they do not become global variables of the script.
  local input = splash:select("#kw")
  input:send_text('Splash')
  local submit = splash:select('#su')
  submit:mouse_click()
  -- Give the page time to react to the click before taking the screenshot.
  splash:wait(3)
  return splash:png()
end

17. 代理Ip

function main(splash)
    -- Route every outgoing request through the proxy. Host, port and the
    -- credentials below are sample values to be replaced with real ones.
    splash:on_request(function(request)
        request:set_proxy{
            host='61.138.33.20',
            port=808,
            username='uanme',
            password='passwrod'
        }
    end)

    -- Set the User-Agent. The separator line that followed this call held an
    -- invisible zero-width space, which Lua rejects; it is now a plain blank line.
    splash:set_user_agent("Mozilla/5.0")

    splash:go("https://httpbin.org/get")
    return splash:html()
end

Splash与Python结合

1. render.html

此接口用于获取JavaScript渲染的页面的HTML代码,接口地址就是Splash的运行地址加此接口名称,例如http://192.168.99.100:8050/render.html

import requests
# Ask render.html to render the target page, passing wait=3 in the query
# string, then print the body of the reply.
url = 'http://192.168.99.100:8050/render.html?url=https://www.bjsxt.com&wait=3'
response = requests.get(url)
print(response.text)

2. render.png

此接口可以获取网页截图

import requests
 
# Request a 1000x700 screenshot of jd.com from the render.png endpoint.
url = 'http://192.168.99.100:8050/render.png?url=https://www.jd.com&wait=5&width=1000&height=700'
response = requests.get(url)
# Fail loudly instead of saving an error body to disk as if it were an image.
response.raise_for_status()
# Name the file after the site that is actually captured (it used to be
# saved as taobao.png although the URL points at jd.com).
with open('jd.png', 'wb') as f:
    f.write(response.content)

3. execute

最为强大的接口。前面说了很多Splash Lua脚本的操作,用此接口便可实现与Lua脚本的对接

import requests
from urllib.parse import quote
 
# Minimal Lua script: main() just returns a constant string.
lua = '''
function main(splash)
    return 'hello'
end
'''
 
# The script is URL-quoted and sent as the lua_source query parameter of the
# execute endpoint; the body of the reply is printed.
url = 'http://192.168.99.100:8050/execute?lua_source=' + quote(lua)
response = requests.get(url)
print(response.text)