使用 spider-flow 已有一年多的时间。使用中发现网上教程不多,selenium 节点的变量也不太好用,经常不明所以地采不到数据。
今天分析一下 https://www.zoomlion.com/other/search.html 的反爬,并用 spider-flow 实现翻页采集。
列表页
查看列表页源码,发现新闻列表内容并不在源码中,于是打开 Chrome 开发者工具,查看网络请求。
分析
发现第一个请求最有可能是,但响应里的 %3Cli 是什么东西?一开始不认识,猜可能是 unicode 编码,或是 base64 编码之类的。在网上打开 unicode 解码工具一试,果然能解开(%3C 解码后就是“<”,所以 %3Cli 对应的是 HTML 的 <li)。
就是这个请求,我们深挖下这个请求,应该有关键词,翻页之类的变量。
一看很懵,请求参数应该是做了加密处理。不过我们不怕,spider-flow 有 selenium 节点,支持渲染请求。
处理
输入 url,并把节点变量命名为 resp2(在这里重命名,是为了避免被后面网络请求节点的 resp 变量覆盖)。
变量抽取:首先,resp2 是个 SeleniumResponse 对象,需要转成 SpiderResponse 对象 respele(我发现 SeleniumResponse 抽取时不太好用)。
urls 是抽取的列表页url 数组。
inext 是翻页变量,这里的翻页需要点击页面下方的“下一页”按钮。
函数节点,跟selenium 节点配合使用,在函数节点编写命令操作selenium 页面。
第一个函数表示选择 a.next 这个节点并点击。第二个函数表示休息 10s,让页面完成渲染。在执行条件处写上 ${inext < 10},表示循环 10 次(inext 从 0 开始,每次加 1,取值 0~9 时条件成立)。
详情页
详情页就比较简单了,在源码中可以找到标题、正文、时间等字段,用网络请求节点和变量节点即可完成抽取。
源码
- 最后,附上完整的 xml 配置。
<mxGraphModel>
<root>
<!-- Cell 0: flow-level settings (spider name, "child" submit strategy, threadCount "1"). -->
<mxCell id="0">
<JsonProperty as="data">
{"spiderName":"中联重科","submit-strategy":"child","threadCount":"1"}
</JsonProperty>
</mxCell>
<!-- Cell 1: container cell; every node and edge below declares parent="1". -->
<mxCell id="1" parent="0"/>
<!-- Start node (shape "start"). Its only outgoing edge is cell 35, which targets the Selenium node (cell 34). -->
<mxCell id="2" value="开始" style="start" parent="1" vertex="1">
<mxGeometry x="80" y="80" width="24" height="24" as="geometry"/>
<JsonProperty as="data">
{"shape":"start"}
</JsonProperty>
</mxCell>
<!-- Variable node (label: "extract item name and detail URL"). Defines three variables, in this order:
       respele = result of resp2.html.element(), where resp2 is the Selenium node's variable (cell 34);
       urls    = respele.selectors('ul.search_list li a'), the list-page links;
       inext   = paging counter: 0 when inext is still null, otherwise inext + 1.
     The node is entered from the Selenium node (edge 36) and re-entered from the function node (edge 41),
     so these expressions are evaluated again on every pass of the paging loop. -->
<mxCell id="7" value="提取项目名、详情地址" style="variable" parent="1" vertex="1">
<mxGeometry x="330" y="80" width="24" height="24" as="geometry"/>
<JsonProperty as="data">
{"value":"提取项目名、详情地址","loopVariableName":"","variable-name":["respele","urls","inext"],"variable-description":["","",""],"loopCount":"","variable-value":["${resp2.html.element()}","${respele.selectors('ul.search_list li a')}","${inext==null?0:inext+1}"],"shape":"variable"}
</JsonProperty>
</mxCell>
<!-- Request node (label: "fetch detail page"). Issues a GET once per entry of urls:
     loopCount is urls.size() and the loop variable is "index".
     The request URL is the fixed prefix https://www.zoomlion.com followed by the href attribute of urls[index].
     NOTE(review): the prefix concatenation presumably relies on the hrefs being site-relative paths; confirm against the live page.
     Other settings as configured: sleep "300", timeout "120000", retryCount "3", follow-redirect / tls-validate / cookie-auto-set on. -->
<mxCell id="9" value="抓取详情页" style="request" parent="1" vertex="1">
<mxGeometry x="450.16668701171875" y="80" width="24" height="24" as="geometry"/>
<JsonProperty as="data">
{"value":"抓取详情页","loopVariableName":"index","method":"GET","sleep":"300","timeout":"120000","response-charset":"","retryCount":"3","retryInterval":"","body-type":"none","body-content-type":"text/plain","loopCount":"${urls.size()}","url":"https://www.zoomlion.com${urls[index].attr('href')}","proxy":"","request-body":"","follow-redirect":"1","tls-validate":"1","cookie-auto-set":"1","repeat-enable":"0","shape":"request"}
</JsonProperty>
</mxCell>
<!-- Edge: variable node (cell 7) to detail-page request node (cell 9). The condition is empty. -->
<mxCell id="10" value="" parent="1" source="7" target="9" edge="1">
<mxGeometry relative="1" as="geometry"/>
<JsonProperty as="data">
{"value":"","condition":""}
</JsonProperty>
</mxCell>
<!-- Variable node (label: "extract detail page"). Defines:
       title   = text of the elements matching 'h1,h4' in resp;
       date1   = resp.regx(...) with a pattern that, after JSON unescaping, reads (\d{1,2}\/\d{1,2}\.\d{4}),
                 i.e. 1-2 digits, a slash, 1-2 digits, a dot, then 4 digits;
       content = resp.selector('#main_content,div.details').
     NOTE(review): resp is not named by any node in this file; presumably it is the default
     response variable of the request node (cell 9). Confirm. -->
<mxCell id="12" value="提取详情页" style="variable" parent="1" vertex="1">
<mxGeometry x="550" y="80" width="24" height="24" as="geometry"/>
<JsonProperty as="data">
{"value":"提取详情页","loopVariableName":"","variable-name":["title","date1","content"],"variable-description":["","",""],"loopCount":"","variable-value":["${resp.selector('h1,h4').text()}","${resp.regx('(\\d{1,2}\\/\\d{1,2}\\.\\d{4})')}","${resp.selector('#main_content,div.details')}"],"shape":"variable"}
</JsonProperty>
</mxCell>
<!-- Edge: detail-page request node (cell 9) to detail extraction node (cell 12). Empty condition, transmit-variable "1". -->
<mxCell id="13" value="" style="strokeWidth=2;sharp=1;" parent="1" source="9" target="12" edge="1">
<mxGeometry relative="1" as="geometry"/>
<JsonProperty as="data">
{"value":"","exception-flow":"0","lineWidth":"2","line-style":"sharp","lineColor":"black","condition":"","transmit-variable":"1"}
</JsonProperty>
</mxCell>
<!-- Output node. Maps seven output fields, pairing output-name with output-value by position:
       title=${title}, edittime=${date1}, content1=${content.pichtml()}, url=${resp.url},
       source and channel are fixed literals, nohtml=${content.text()}.
     A datasource id, table name (f_news) and CSV path are configured, but output-all,
     output-database and output-csv are all "0".
     NOTE(review): presumably this means nothing is written to the database or the CSV file as exported;
     switch the relevant flag to "1" if persistence is wanted. Confirm against spider-flow's output node. -->
<mxCell id="14" value="输出" style="output" parent="1" vertex="1">
<mxGeometry x="660.1666870117188" y="80" width="24" height="24" as="geometry"/>
<JsonProperty as="data">
{"value":"输出","loopVariableName":"","datasourceId":"da9568e3380ea467cc18817a76443f61","tableName":"f_news","csvName":"C:\\DataChange\\32.csv","csvEncoding":"UTF-8","output-name":["title","edittime","content1","url","source","channel","nohtml"],"loopCount":"","output-value":["${title}","${date1}","${content.pichtml()}","${resp.url}","中联重科","news","${content.text()}"],"output-all":"0","output-database":"0","output-csv":"0","shape":"output"}
</JsonProperty>
</mxCell>
<!-- Edge: detail extraction node (cell 12) to output node (cell 14). The condition is empty. -->
<mxCell id="15" value="" parent="1" source="12" target="14" edge="1">
<mxGeometry relative="1" as="geometry"/>
<JsonProperty as="data">
{"value":"","condition":""}
</JsonProperty>
</mxCell>
<!-- Selenium node. Opens the search page with driverType "chrome" and exposes the result as
     the node variable resp2 (nodeVariableName), which cells 7 and 37 reference.
     The key parameter uses %uXXXX escapes: %u8A79 %u7EAF %u65B0 are the code points
     U+8A79, U+7EAF, U+65B0 (the three-character search keyword "詹纯新").
     headless is "0"; NOTE(review): presumably a visible browser window is opened. Confirm. -->
<mxCell id="34" value="Selenium" style="selenium" parent="1" vertex="1">
<mxGeometry x="190" y="64" width="32" height="32" as="geometry"/>
<JsonProperty as="data">
{"value":"Selenium","nodeVariableName":"resp2","loopVariableName":"","loopCount":"","pageLoadTimeout":"","implicitlyWaitTimeout":"","driverType":"chrome","window-size":"","user-agent":"","arguments":"","url":"https://www.zoomlion.com/other/search.html?key=%u8A79%u7EAF%u65B0","proxy":"","cookie-auto-set":"1","repeat-enable":"0","headless":"0","javascript-disabled":"0","image-disabled":"0","plugin-disable":"1","java-disable":"1","incognito":"0","sandbox":"0","hide-scrollbar":"0","maximized":"0","shape":"selenium"}
</JsonProperty>
</mxCell>
<!-- Edge: start node (cell 2) to Selenium node (cell 34). Empty condition, transmit-variable "1". -->
<mxCell id="35" value="" style="strokeWidth=2;sharp=1;" parent="1" source="2" target="34" edge="1">
<mxGeometry relative="1" as="geometry"/>
<JsonProperty as="data">
{"value":"","exception-flow":"0","lineWidth":"2","line-style":"sharp","lineColor":"black","condition":"","transmit-variable":"1"}
</JsonProperty>
</mxCell>
<!-- Edge: Selenium node (cell 34) to list-page variable node (cell 7). This is the first entry into the paging loop. -->
<mxCell id="36" value="" style="strokeWidth=2;sharp=1;" parent="1" source="34" target="7" edge="1">
<mxGeometry relative="1" as="geometry"/>
<JsonProperty as="data">
{"value":"","exception-flow":"0","lineWidth":"2","line-style":"sharp","lineColor":"black","condition":"","transmit-variable":"1"}
</JsonProperty>
</mxCell>
<!-- Function node (label: "run functions"). Runs two expressions against the Selenium variable resp2:
       1. resp2.selector('a.next').click(): clicks the element matching a.next (the next-page link);
       2. resp2.sleep(10000): pauses so the page can finish rendering (the accompanying article describes this as 10 s).
     Entered through the conditional edge 45 and left through edge 41, back to cell 7. -->
<mxCell id="37" value="执行函数" style="function" parent="1" vertex="1">
<mxGeometry x="222" y="154" width="32" height="32" as="geometry"/>
<JsonProperty as="data">
{"value":"执行函数","loopVariableName":"","loopCount":"","function":["${resp2.selector('a.next').click()}","${resp2.sleep(10000)}"],"shape":"function"}
</JsonProperty>
</mxCell>
<!-- Edge: function node (cell 37) back to the list-page variable node (cell 7); closes the paging loop. -->
<mxCell id="41" value="" style="strokeWidth=2;sharp=1;" parent="1" source="37" target="7" edge="1">
<mxGeometry relative="1" as="geometry"/>
<JsonProperty as="data">
{"value":"","exception-flow":"0","lineWidth":"2","line-style":"sharp","lineColor":"black","condition":"","transmit-variable":"1"}
</JsonProperty>
</mxCell>
<!-- Conditional edge: list-page variable node (cell 7) to the function node (cell 37).
     Followed only while inext is below 10; inext starts at 0 and grows by 1 on each pass through cell 7,
     so the next-page click in cell 37 runs for inext = 0..9.
     The less-than sign in the condition is written as the entity &lt; because a literal
     less-than character is not allowed in XML character data; it still decodes to the same expression.
     Attribute order matches the other edge cells (parent, source, target, edge). -->
<mxCell id="45" value="" style="strokeWidth=2;strokeColor=blue;sharp=1;" parent="1" source="7" target="37" edge="1">
<mxGeometry relative="1" as="geometry"/>
<JsonProperty as="data">
{"value":"","exception-flow":"0","lineWidth":"2","line-style":"sharp","lineColor":"blue","condition":"${inext&lt;10}","transmit-variable":"1"}
</JsonProperty>
</mxCell>
</root>
</mxGraphModel>