核心
设置UA,优先在lua脚本中使用splash:set_user_agent("{ua}")
设置ip代理,使用SplashRequest的proxy
代码
pip install fake-useragent
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider
from urllib.parse import quote
from scrapy_splash import SplashRequest
from risk_control_info.utils import get_proxy_ip
from fake_useragent import UserAgent
# Single shared UserAgent instance; its Chrome UA string is baked into the
# Lua script below and reused by the spider class.
ua = UserAgent()

# Lua script executed by Splash's /execute endpoint: skip image downloads,
# spoof a Chrome user agent, navigate to args.url, wait args.wait seconds,
# then return the fully rendered HTML.
script = f"""
function main(splash, args)
splash.images_enabled = false
splash:set_user_agent("{ua.chrome}")
assert(splash:go(args.url))
assert(splash:wait(args.wait))
return splash:html()
end"""
class AppQimaiHotSearchSpider(scrapy.Spider):
    """Render pages through Splash with a spoofed Chrome user agent and a
    per-request HTTP proxy.

    The UA is set twice on purpose: inside the Lua ``script`` (applies to the
    Splash-rendered request) and via ``user_agent`` (applies to any plain
    Scrapy requests this spider might issue).
    """

    name = 'app_qimai_hot_search'
    allowed_domains = ['qimai.cn']
    # Chrome UA from the module-level fake_useragent instance.
    user_agent = ua.chrome
    custom_settings = {
        # Middleware wiring required by scrapy-splash (order values are the
        # ones recommended by the scrapy-splash README).
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
    }

    def start_requests(self):
        """Yield one Splash 'execute' request running the module-level Lua
        script through a freshly fetched proxy.

        Note: the proxy must go into Splash ``args`` (Splash makes the
        outbound request), not into the usual Scrapy ``meta['proxy']``.
        """
        url = "http://httpbin.org/get"
        yield SplashRequest(
            url=url,
            callback=self.parse,
            endpoint='execute',
            args={
                'lua_source': script,
                'proxy': "http://" + get_proxy_ip(url),
                'wait': 3,
            })

    def parse(self, response):
        """Log the Splash-rendered HTML.

        Uses ``response.text`` (decodes with the response's declared
        encoding) and the spider logger instead of a bare ``print`` with a
        manual ``.decode()``.
        """
        self.logger.info(response.text)