Using a UA and an IP Proxy with scrapy-splash

Core idea

Set the UA: prefer calling splash:set_user_agent("{ua}") inside the Lua script.

Set the IP proxy: pass it via the proxy argument of SplashRequest.
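
The spider below registers the scrapy-splash middlewares in custom_settings, but scrapy-splash also needs a few project-level settings. A minimal settings.py sketch, assuming Splash listens on localhost:8050 (these three settings come from the scrapy-splash README):

SPLASH_URL = 'http://localhost:8050'
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'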

Code

pip install fake-useragent
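
Besides fake-useragent, the example assumes scrapy-splash is installed and a Splash instance is reachable; typically:

pip install scrapy-splash
docker run -p 8050:8050 scrapinghub/splash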
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest
from fake_useragent import UserAgent

from risk_control_info.utils import get_proxy_ip  # project helper, sketched below

ua = UserAgent()

# Lua script executed by Splash: skip image downloads, set the user agent
# before navigating, then return the rendered HTML.
script = """
function main(splash, args)
  splash.images_enabled = false
  splash:set_user_agent("{ua}")
  assert(splash:go(args.url))
  assert(splash:wait(args.wait))
  return splash:html()
end""".format(ua=ua.chrome)


class AppQimaiHotSearchSpider(scrapy.Spider):
    name = 'app_qimai_hot_search'
    allowed_domains = ['qimai.cn']
    user_agent = ua.chrome
    custom_settings = {
        # Middlewares required by scrapy-splash, at its recommended priorities.
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
    }

    def start_requests(self):
        # httpbin.org/get echoes the request headers and origin IP, which
        # makes it easy to verify that the UA and proxy took effect.
        url = "http://httpbin.org/get"

        yield SplashRequest(url=url,
                            callback=self.parse,
                            endpoint='execute',
                            args={
                                'lua_source': script,
                                # Splash routes its outgoing requests through this proxy.
                                'proxy': "http://" + get_proxy_ip(url),
                                'wait': 3})

    def parse(self, response):
        # The echoed JSON should show the spoofed User-Agent and the proxy IP.
        print(response.body.decode())
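
get_proxy_ip comes from the project's own risk_control_info.utils module and is not shown in the post. A minimal sketch, assuming a hypothetical local proxy-pool service that returns one host:port per request:

import requests

def get_proxy_ip(url=None):
    # Hypothetical stand-in for risk_control_info.utils.get_proxy_ip:
    # ask a local proxy-pool service for one usable proxy.
    # The endpoint and response format are assumptions, not the post's code.
    resp = requests.get("http://127.0.0.1:5010/get", timeout=5)
    return resp.text.strip()  # e.g. "1.2.3.4:8080"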

Result