selenium 使用代理的方法汇总

3,453 阅读2分钟

在 Docker 中启动 Selenium Grid 并加载扩展,配合隧道代理使用。How to set a proxy with authentication in Selenium ChromeDriver (Python).

带账号密码认证的代理(proxy with authentication)不支持 Chrome headless 模式,但在 Docker Selenium 或 Selenium Grid 集群中是支持的。启动 Selenium Docker 容器:

docker run -d -p 4444:4444 --shm-size=2g -m 800M --memory-swap=800M --name=chrome  --restart=always selenium/standalone-chrome

一、selenium使用隧道动态代理(会生成本地zip插件文件)

import os
import time
import zipfile

from selenium import webdriver
from scrapy.selector import Selector

PROXY_HOST = 'http-dyn.abuyun.com'  # rotating proxy or host
PROXY_PORT = 9020  # port
PROXY_USER = ''  # username
PROXY_PASS = ''  # password

REMOTE_SELENIUM = '111.22.111.11:4444'  # host:port of the remote Docker Selenium server

# manifest.json of the generated Chrome extension. It requests the "proxy"
# permission (to set the proxy) and the webRequest permissions (to answer the
# proxy's authentication challenge from background.js).
# NOTE(review): this is a Manifest V2 extension; confirm the target Chrome
# version still loads MV2 extensions.
manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version":"22.0.0"
}
"""

# Template for background.js. The placeholders receive already-encoded
# JavaScript literals (see build_background_js), so they are NOT wrapped in
# quotes here.
BACKGROUND_JS_TEMPLATE = """
var config = {
        mode: "fixed_servers",
        rules: {
        singleProxy: {
            scheme: "http",
            host: %s,
            port: %d
        },
        bypassList: ["localhost"]
        }
    };

chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

function callbackFn(details) {
    return {
        authCredentials: {
            username: %s,
            password: %s
        }
    };
}

chrome.webRequest.onAuthRequired.addListener(
            callbackFn,
            {urls: ["<all_urls>"]},
            ['blocking']
);
"""


def build_background_js(host, port, user, password):
    """Return the background.js source for the proxy-auth extension.

    The string values are encoded with ``json.dumps`` so that quotes,
    backslashes or non-ASCII characters in the host or credentials cannot
    break (or inject into) the generated JavaScript.

    Args:
        host: Proxy host name or IP address.
        port: Proxy port; an int or a numeric string.
        user: Proxy account name.
        password: Proxy account password.

    Raises:
        ValueError: If ``port`` cannot be converted to an integer.
    """
    import json  # local import keeps this snippet self-contained

    return BACKGROUND_JS_TEMPLATE % (
        json.dumps(host),
        int(port),
        json.dumps(user),
        json.dumps(password),
    )


background_js = build_background_js(
    PROXY_HOST, PROXY_PORT, PROXY_USER, PROXY_PASS)


def get_chromedriver(use_proxy=False, user_agent=None, use_docker=True):
    """Create a Chrome WebDriver, optionally with the proxy-auth extension.

    Args:
        use_proxy: When true, write ``proxy_auth_plugin.zip`` (containing
            ``manifest_json`` and ``background_js``) into the current working
            directory and register it as a Chrome extension.
        user_agent: Optional value for Chrome's ``--user-agent`` switch.
        use_docker: When true, connect to the remote Selenium server at
            ``REMOTE_SELENIUM``; otherwise start a local Chrome using the
            chromedriver binary at ``/usr/local/bin/chromedriver``.

    Returns:
        The created WebDriver instance. The caller is responsible for calling
        ``quit()`` on it.
    """
    chrome_options = webdriver.ChromeOptions()

    if use_proxy:
        pluginfile = 'proxy_auth_plugin.zip'
        # Rebuilt on every call so the archive always matches the current
        # proxy settings.
        with zipfile.ZipFile(pluginfile, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        chrome_options.add_extension(pluginfile)

    if user_agent:
        chrome_options.add_argument('--user-agent=%s' % user_agent)

    if use_docker:
        return webdriver.Remote(
            command_executor="http://{}/wd/hub".format(REMOTE_SELENIUM),
            options=chrome_options
        )

    # The driver path is absolute, so it is passed directly. `options=` is
    # used (instead of the deprecated `chrome_options=`) to match the Remote
    # branch above.
    # NOTE(review): recent Selenium releases take the driver path through a
    # Service object instead of `executable_path` -- confirm against the
    # installed Selenium version.
    return webdriver.Chrome(
        executable_path='/usr/local/bin/chromedriver',
        options=chrome_options)


def main():
    """Load https://www.cip.cc through the proxy 11 times and print the result.

    A single remote browser session is reused for every request. The text of
    the first ``<pre>`` element of each page is printed, followed by a
    3-second pause.
    """
    # Use the proxy and the Docker-hosted browser.
    driver = get_chromedriver(use_proxy=True, use_docker=True)
    print(driver)
    try:
        for _ in range(11):
            driver.get('https://www.cip.cc')
            # default='' avoids an AttributeError on .strip() when the page
            # has no <pre> element (e.g. an error page from the proxy).
            ip_text = Selector(text=driver.page_source).xpath(
                '//pre/text()').extract_first(default='').strip()
            print(ip_text)
            # Do not call driver.close() here: it would close the only
            # window and the next driver.get() would fail.
            time.sleep(3)
    finally:
        # Always end the session so the remote browser slot is released.
        driver.quit()


if __name__ == '__main__':
    main()

效果图

二、selenium 使用HTTP代理(常规HOST:PORT代理)

from selenium import webdriver
PROXY = "88.157.149.250:8080"  # IP:PORT or HOST:PORT

# Plain (unauthenticated) HTTP proxy: Chrome's --proxy-server switch is enough.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=%s' % PROXY)

# `options=` replaces the deprecated `chrome_options=` keyword.
chrome = webdriver.Chrome(options=chrome_options)
try:
    chrome.get("http://www.cip.cc")
    print(chrome.page_source)
finally:
    # End the browser session even if the page load fails.
    chrome.quit()

三、远程使用的两种方式

方式一:

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy, ProxyType

# Method 1: pass the proxy to the remote Chrome as a command-line switch.
chrome_options = webdriver.ChromeOptions()
proxy_ip = '110.90.175.186:23730'  # proxy address and port
chrome_options.add_argument('--proxy-server=%s' % proxy_ip)

driver = webdriver.Remote(
    command_executor="http://10.254.217.236:80/wd/hub",
    desired_capabilities=DesiredCapabilities.CHROME,
    options=chrome_options
)
try:
    driver.get("http://www.cip.cc")
    print(driver.page_source)
    # Dump the driver attributes to inspect the negotiated capabilities.
    print(driver.__dict__)
finally:
    # quit() (not close()) ends the remote session, and runs even when the
    # page load raises, so the Selenium server slot is always released.
    driver.quit()

注意到:代理是生效的,但是driver.__dict__中,proxy为空。所以更加推荐方式二。

{'command_executor': <selenium.webdriver.remote.remote_connection.RemoteConnection object at 0x10e6f3e50>, '_is_remote': True, 'session_id': 'f5626c8f3872c6bd095eeb24eccd6e01', 'capabilities': {'acceptInsecureCerts': False, 'browserName': 'chrome', 'browserVersion': '80.0.3987.106', 'chrome': {'chromedriverVersion': '80.0.3987.106 (f68069574609230cf9b635cd784cfb1bf81bb53a-refs/branch-heads/3987@{#882})', 'userDataDir': '/tmp/.com.google.Chrome.hL5Tvb'}, 'goog:chromeOptions': {'debuggerAddress': 'localhost:42468'}, 'networkConnectionEnabled': False, 'pageLoadStrategy': 'normal', 'platformName': 'linux', 'proxy': {}, 'setWindowRect': True, 'strictFileInteractability': False, 'timeouts': {'implicit': 0, 'pageLoad': 300000, 'script': 30000}, 'unhandledPromptBehavior': 'dismiss and notify', 'webdriver.remote.sessionid': 'f5626c8f3872c6bd095eeb24eccd6e01'}, 'error_handler': <selenium.webdriver.remote.errorhandler.ErrorHandler object at 0x10e6f3a00>, 'w3c': True, '_switch_to': <selenium.webdriver.remote.switch_to.SwitchTo object at 0x10e6f3f10>, '_mobile': <selenium.webdriver.remote.mobile.Mobile object at 0x10e6f39d0>, '_file_detector': <selenium.webdriver.remote.file_detector.LocalFileDetector object at 0x10f41bfa0>}

方式二:

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.proxy import Proxy, ProxyType

# Method 2: describe the proxy with a Selenium Proxy object so that it is
# reported in the session capabilities.
host_port = "113.93.226.75:17318"
proxy = Proxy({
    'proxyType': ProxyType.MANUAL,
    'httpProxy': host_port,
    'sslProxy': host_port,
    'ftpProxy': host_port
})

driver = webdriver.Remote(
    command_executor="http://10.254.217.236:80/wd/hub",
    desired_capabilities=DesiredCapabilities.CHROME,
    proxy=proxy
)
try:
    driver.get("http://www.cip.cc")
    driver.maximize_window()
    driver.save_screenshot('bd.png')
    print(driver.page_source)
    # Dump the driver attributes to inspect the negotiated capabilities.
    print(driver.__dict__)
finally:
    # quit() (not close()) ends the remote session, and runs even when an
    # earlier step raises, so the Selenium server slot is always released.
    driver.quit()

更加推荐方式二,因为在 driver 的属性中,proxy 属性不为空字典。你可以把 driver.__dict__ 用日志打印出来,方便排查问题。

{'command_executor': <selenium.webdriver.remote.remote_connection.RemoteConnection object at 0x10d242e50>, '_is_remote': True, 'session_id': '92e65eb5c08a91c861ee7acc7b90a308', 'capabilities': {'acceptInsecureCerts': False, 'browserName': 'chrome', 'browserVersion': '80.0.3987.106', 'chrome': {'chromedriverVersion': '80.0.3987.106 (f68069574609230cf9b635cd784cfb1bf81bb53a-refs/branch-heads/3987@{#882})', 'userDataDir': '/tmp/.com.google.Chrome.6vv8se'}, 'goog:chromeOptions': {'debuggerAddress': 'localhost:40891'}, 'networkConnectionEnabled': False, 'pageLoadStrategy': 'normal', 'platformName': 'linux', 'proxy': {'httpProxy': '113.93.226.75:17318', 'proxyType': 'MANUAL', 'ftpProxy': '113.93.226.75:17318', 'sslProxy': '113.93.226.75:17318'}, 'setWindowRect': True, 'strictFileInteractability': False, 'timeouts': {'implicit': 0, 'pageLoad': 300000, 'script': 30000}, 'unhandledPromptBehavior': 'dismiss and notify', 'webdriver.remote.sessionid': '92e65eb5c08a91c861ee7acc7b90a308'}, 'error_handler': <selenium.webdriver.remote.errorhandler.ErrorHandler object at 0x10d242970>, 'w3c': True, '_switch_to': <selenium.webdriver.remote.switch_to.SwitchTo object at 0x10d242f40>, '_mobile': <selenium.webdriver.remote.mobile.Mobile object at 0x10df67e50>, '_file_detector': <selenium.webdriver.remote.file_detector.LocalFileDetector object at 0x10df67e80>}