使用
-
文档地址
https://splash.readthedocs.io/en/stable/
-
lua教程
https://www.runoob.com/lua/lua-tutorial.html
-
安装命令
docker pull scrapinghub/splash
-
启动命令
docker run -p 8050:8050 scrapinghub/splash
-
开放端口
8050
代码实操
-
lua代码
--- Open the requested page and report its title.
-- @param splash Splash scripting object passed in by the runtime
-- @param args   request arguments; `args.url` is the page to load
-- @return table whose `title` field holds the result of evaluating `document.title`
function main(splash, args)
  -- Each assert aborts the script when the wrapped call reports failure.
  assert(splash:go(args.url))
  assert(splash:wait(0.5))
  return { title = splash:evaljs('document.title') }
end
-
异步lua代码
-
访问三个网站
--- Visit a list of sites one after another and screenshot each one that loads.
-- @param splash Splash scripting object
-- @param args   request arguments; `args.urls` optionally overrides the built-in list
-- @return table mapping each successfully loaded host to its PNG screenshot
function main(splash, args)
  local default_hosts = {
    'www.taobao.com',
    'www.zhipin.com',
    'www.zhihu.com',
  }
  local hosts = args.urls or default_hosts
  local screenshots = {}
  for _, host in ipairs(hosts) do
    -- Hosts are listed without a scheme, so one is prepended before loading.
    local loaded = splash:go('http://' .. host)
    if loaded then
      splash:wait(2)
      screenshots[host] = splash:png()
    end
  end
  return screenshots
end
-
splash 属性
-
设置超时时间
-
resource_timeout 属性
-
超过0.1秒没有响应直接报错
--- Render a page with a 0.1 second resource timeout and return HTML, PNG and HAR.
-- @param splash Splash scripting object
-- @param args   request arguments; `args.url` is the page to load
-- @return table with `html`, `png` and `har` fields
function main(splash, args)
  -- Must be set before splash:go so the limit applies to this load.
  splash.resource_timeout = 0.1
  assert(splash:go(args.url))
  assert(splash:wait(0.5))
  local html = splash:html()
  local png = splash:png()
  local har = splash:har()
  return { html = html, png = png, har = har }
end
-
报错如下
{ "error": 400, "type": "ScriptError", "description": "Error happened while executing Lua script", "info": { "source": "[string \"function main(splash, args)\r...\"]", "line_number": 3, "error": "network5", "type": "LUA_ERROR", "message": "Lua error: [string \"function main(splash, args)\r...\"]:3: network5" } }
-
-
禁用图片加载
-
images_enabled 属性
--- Render a page with image loading switched off and return HTML, PNG and HAR.
-- @param splash Splash scripting object
-- @param args   request arguments; `args.url` is the page to load
-- @return table with `html`, `png` and `har` fields
function main(splash, args)
  -- Disable images before navigating so the page load skips them.
  splash.images_enabled = false
  assert(splash:go(args.url))
  assert(splash:wait(0.5))
  local page = {}
  page.html = splash:html()
  page.png = splash:png()
  page.har = splash:har()
  return page
end
-
-
控制页面滚动
-
scroll_position 属性
--- Load jd.com, scroll the page and return a screenshot.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return table with a single `png` field
function main(splash, args)
  assert(splash:go('http://www.jd.com'))
  -- Only the vertical offset is given.
  splash.scroll_position = { y = 400 }
  local shot = splash:png()
  return { png = shot }
end
-
splash 方法
-
go方法
ok, reason = splash:go{url, baseurl=nil, headers=nil, http_method='GET', body=nil, formdata=nil}
- 参数说明
- url:请求的url
- baseurl:资源加载的相对路径
- headers:请求头
- http_method:请求方法
- 参数说明
-
wait方法
-
控制页面等待时间
--- Load jd.com, pause, then return a screenshot.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return table with a single `png` field
function main(splash, args)
  assert(splash:go('http://www.jd.com'))
  -- Pause before taking the screenshot.
  splash:wait(2)
  local shot = splash:png()
  return { png = shot }
end
-
-
jsfunc 方法
-
直接调用js方法
--- Count the <div> elements on Baidu's home page using a JavaScript function.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return string of the form "There are N DIVs"
function main(splash, args)
  -- splash:jsfunc turns the JavaScript source into a function callable from Lua.
  local count_divs = splash:jsfunc([[function() { var body = document.body; var divs = body.getElementsByTagName('div'); return divs.length;}]])
  splash:go('https://www.baidu.com')
  return string.format('There are %s DIVs', count_divs())
end
-
-
evaljs方法
-
执行js
--- Demonstrate splash:evaljs by evaluating `document.title` in the loaded page.
-- @param splash Splash scripting object
-- @param args   request arguments; `args.url` is the page to load
-- @return table with a single `title` field
function main(splash, args)
  assert(splash:go(args.url))
  assert(splash:wait(0.5))
  local response = {}
  -- evaljs hands the value of the JavaScript expression back to Lua.
  response.title = splash:evaljs('document.title')
  return response
end
-
-
runjs方法
-
也是用于执行js,但是和evaljs方法相比,更偏向于执行动作或声明方法
--- Define a JavaScript function with runjs, then call it with evaljs.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return the value produced by calling `foo()` in the page
function main(splash, args)
  splash:go('https://www.baidu.com')
  -- runjs is used for its side effect: it declares `foo` in the page.
  splash:runjs('foo = function() {return "bar"}')
  local answer = splash:evaljs("foo()")
  return answer
end
-
-
html 方法
-
返回页面源码
--- Load Baidu's home page and return the page source.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return whatever splash:html() returns for the loaded page
function main(splash, args)
  local target = 'https://www.baidu.com'
  splash:go(target)
  return splash:html()
end
-
-
png方法
-
用于返回png格式图片
--- Load Baidu's home page and return a PNG screenshot.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return whatever splash:png() returns for the loaded page
function main(splash, args)
  local target = 'https://www.baidu.com'
  splash:go(target)
  return splash:png()
end
-
-
jpeg方法
-
用于返回jpeg格式图片
--- Load Baidu's home page and return a JPEG screenshot.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return whatever splash:jpeg() returns for the loaded page
function main(splash, args)
  local target = 'https://www.baidu.com'
  splash:go(target)
  return splash:jpeg()
end
-
-
har方法
-
展示页面加载过程详细信息
--- Load Baidu's home page and return the HAR record of the page load.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return whatever splash:har() returns for the loaded page
function main(splash, args)
  local target = 'https://www.baidu.com'
  splash:go(target)
  return splash:har()
end
-
-
url方法
-
当前访问的url
--- Load Baidu's home page and return the URL Splash is currently on.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return whatever splash:url() returns after the load
function main(splash, args)
  local target = 'https://www.baidu.com'
  splash:go(target)
  return splash:url()
end
-
-
set_user_agent 方法
-
设置 user_agent
--- Send a request with a custom User-Agent and return the response page.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return whatever splash:html() returns for the httpbin response
function main(splash, args)
  -- Set the User-Agent first so the request below carries it.
  splash:set_user_agent('Splash')
  local echo_endpoint = 'https://www.httpbin.org/get'
  splash:go(echo_endpoint)
  return splash:html()
end
-
-
select 方法
-
选中符合条件的第一个节点,css选择器
--- Type text into the element matching `#kw` and return a screenshot.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return whatever splash:png() returns after the text has been entered
function main(splash, args)
  splash:go('https://www.baidu.com')
  -- `local` keeps the element handle out of the global environment; the
  -- assert gives a readable error when the selector matches nothing.
  local search_box = assert(splash:select('#kw'), 'no element matches #kw')
  search_box:send_text('Splash')
  splash:wait(3)
  return splash:png()
end
-
-
mouse_click 方法
-
模拟鼠标点击操作
--- Type text into `#kw`, click `#su`, and return a screenshot of the result.
-- @param splash Splash scripting object
-- @param args   request arguments (unused)
-- @return whatever splash:png() returns after the click and the wait
function main(splash, args)
  splash:go('https://www.baidu.com')
  -- Both handles are `local` so the script does not create globals; the
  -- asserts give a readable error when a selector matches nothing.
  local search_box = assert(splash:select('#kw'), 'no element matches #kw')
  search_box:send_text('Splash')
  local submit_button = assert(splash:select('#su'), 'no element matches #su')
  submit_button:mouse_click()
  -- Pause after the click before taking the screenshot.
  splash:wait(5)
  return splash:png()
end
-
Splash提供的api
-
render.html
-
返回页面的html代码
-
API地址是Splash 运行地址加上此api的名称
http://81.70.250.129:8050/render.html?url=https://www.baidu.com&wait=5
-
用python代码实现如下
# Fetch the rendered HTML of a page through Splash's render.html endpoint.
import requests

# Query arguments are joined with '&', matching the other Splash URLs in these notes.
ret = requests.get(url='http://81.70.250.129:8050/render.html?url=https://www.baidu.com&wait=5')
# The response body is the page's HTML source.
print(ret.text)
-
-
render.png
-
返回页面截图
- 参数
- width 宽
- height 高
- 参数
-
API
http://81.70.250.129:8050/render.png?url=https://www.baidu.com&wait=5&width=1000&height=700
-
Python代码
# Request a screenshot of a page from Splash's render.png endpoint.
import requests

# This section documents render.png, so the request targets render.png rather
# than render.html; width and height set the screenshot size.
url = 'http://81.70.250.129:8050/render.png?url=https://www.baidu.com&wait=5&width=1000&height=700'
ret = requests.get(url=url)
# ret.content holds the raw response bytes; written to a file in binary mode
# they form the image.
print(ret.content)
- 返回的是二进制数据,保存到本地后就是个图片
-
-
render.jpeg
- 同上
-
render.har
-
返回 一个JSON格式,包含加载过程中的HAR数据
-
API
http://81.70.250.129:8050/render.har?url=https://www.baidu.com&wait=5
-
Python代码
# Fetch the HAR data of a page load through Splash's render.har endpoint.
import requests

# Query arguments are joined with '&', matching the other Splash URLs in these notes.
url = 'http://81.70.250.129:8050/render.har?url=https://www.baidu.com&wait=3'
ret = requests.get(url=url)
# The body is JSON describing the page-load process; printed here as raw bytes.
print(ret.content)
-
-
render.json
-
此API包含前面介绍的所有的render相关的api功能,返回值是JSON
-
API
http://81.70.250.129:8050/render.json?url=https://www.baidu.com
-
-
API还可以自己通过传入不同的参数返回结果
- html=1 返回结果增加页面源代码
- har=1 返回结果增加har
- png=1 返回结果增加截图
-
API
http://81.70.250.129:8050/render.json?url=https://www.baidu.com&html=1&har=1&png=1
-
execute
-
可以传递交互
-
实现一个最简单的lua脚本
--- Minimal script: return a one-element list without touching the browser.
-- @param splash Splash scripting object (unused)
-- @param args   request arguments (unused)
-- @return table whose only element is the string 'hello'
function main(splash, args)
  local greeting = {}
  greeting[1] = 'hello'
  return greeting
end
-
将此脚本url编码一下拼接到execute后面
pass
-
-
Python实现
# Run a minimal Lua script through Splash's execute endpoint.
from urllib.parse import quote

import requests

# Lua source sent to Splash.
lua = """ function main(splash) return 'hello' end """
# The script travels in the lua_source query argument, so it is URL-encoded first.
endpoint = 'http://81.70.250.129:8050/execute?lua_source='
url = endpoint + quote(lua)
ret = requests.get(url=url)
print(ret.text)
-
评论区