Selenium快速使用代理IP

2,677 阅读3分钟

部分网站的反爬策略较多,采集时往往需要对网站的 JS 等多种策略进行分析,导致研发工作量急剧增加。为了快速启动项目、采集数据,可以使用 Selenium 自动化测试工具模拟用户终端,并结合爬虫代理 IP,方便快捷地采集数据。

1. Selenium爬虫采集的优点:

(1)研发投入少,代码易维护

(2)回归测试(regression)方便

(3)可扩展性好

(4)采集效果好

2. Selenium环境搭建

安装

pip3 install -r requirements.txt

安装 Chrome 并下载对应版本的 ChromeDriver

下载chrome www.google.com/chrome/

下载对应版本 driver chromedriver.chromium.org/downloads

基础配置

接口基本配置

# Redis server address
REDIS_HOST = 'localhost'
# Redis server port
REDIS_PORT = 6379
# Redis password; use None when no password is set
REDIS_PASSWORD = None
# Browser used by the generator
BROWSER_TYPE = 'Chrome'
# Generator class per site; register additional sites here
GENERATOR_MAP = dict(weibo='WeiboCookiesGenerator')
# Tester class per site; register additional sites here
TESTER_MAP = dict(weibo='WeiboValidTester')
# URL the tester requests for each site
TEST_URL_MAP = dict(
    weibo='https://m.weibo.cn/api/container/getIndex?uid=1804544030&type=uid&page=1&containerid=1076031804544030',
)
# Cycle period of the generator and the tester
CYCLE = 120
# Host and port the API binds to
API_HOST = '0.0.0.0'
API_PORT = 5000

进程开关

在config.py修改

# Generator switch: simulates login and adds Cookies
GENERATOR_PROCESS: bool = True
# Validator switch: repeatedly checks the Cookies stored in the database
# and deletes the ones that are no longer usable
VALID_PROCESS: bool = False
# API service switch
API_PROCESS: bool = False

导入账号

python3 importer.py
请输入账号密码组, 输入exit退出读入
180000000----16yun
账号 180000000 密码 16yun
录入成功
exit

运行

请先导入一部分账号之后再运行,运行命令:

python3 run.py

运行效果

三个进程全部开启:

API接口开始运行
 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
Cookies生成进程开始运行
Cookies检测进程开始运行
正在生成Cookies 账号 180000000 密码 16yun
正在测试Cookies 用户名 180000000
Cookies有效 180000000

3.下面提供Selenium使用代理IP的demo:

import os
import random
import time
import zipfile

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def _load_user_agents(path='useragents.txt'):
    """Return the non-empty lines of *path* as a list of User-Agent strings.

    Each line is stripped of surrounding whitespace so it can be passed
    straight to Chrome. A missing file yields an empty list, in which case
    the browser keeps its default User-Agent instead of failing at import.
    """
    try:
        with open(path, encoding='utf-8') as handle:
            return [line.strip() for line in handle if line.strip()]
    except FileNotFoundError:
        return []


class GenCookies(object):
    """Log in to a site with Chrome behind an authenticated proxy and collect its cookies."""

    # Pool of User-Agent strings; one is picked at random per instance.
    USER_AGENT = _load_user_agents('useragents.txt')

    # Proxy server (vendor site: www.16yun.cn)
    PROXY_HOST = 't.16yun.cn'  # proxy or host
    PROXY_PORT = 31111  # port
    PROXY_USER = 'USERNAME'  # username
    PROXY_PASS = 'PASSWORD'  # password

    # manifest.json of the generated Chrome extension that applies the proxy.
    _MANIFEST_JSON = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version":"22.0.0"
    }
    """

    # background.js template; filled with (host, port, username, password).
    _BACKGROUND_JS = """
    var config = {
        mode: "fixed_servers",
        rules: {
            singleProxy: {
                scheme: "http",
                host: "%s",
                port: parseInt(%s)
            },
            bypassList: ["localhost"]
        }
    };

    chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

    function callbackFn(details) {
        return {
            authCredentials: {
                username: "%s",
                password: "%s"
            }
        };
    }

    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {urls: ["<all_urls>"]},
        ['blocking']
    );
    """

    @classmethod
    def _write_proxy_plugin(cls, pluginfile):
        """Write the proxy extension (manifest + background script) to *pluginfile*."""
        background_js = cls._BACKGROUND_JS % (
            cls.PROXY_HOST, cls.PROXY_PORT, cls.PROXY_USER, cls.PROXY_PASS)
        with zipfile.ZipFile(pluginfile, 'w') as zp:
            zp.writestr("manifest.json", cls._MANIFEST_JSON)
            zp.writestr("background.js", background_js)

    @classmethod
    def get_chromedriver(cls, use_proxy=False, user_agent=None):
        """Create and return a configured Chrome WebDriver.

        :param use_proxy: when true, build ``proxy_auth_plugin.zip`` in the
            current directory and load it as an extension so traffic goes
            through ``PROXY_HOST:PROXY_PORT`` with the configured credentials.
        :param user_agent: User-Agent string to send; falsy keeps Chrome's default.
        :return: the started Chrome driver instance.
        """
        # The chromedriver binary is expected next to this source file.
        path = os.path.dirname(os.path.abspath(__file__))
        chrome_options = webdriver.ChromeOptions()
        # Turn off some of the flags that mark the browser as automated.
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        if use_proxy:
            pluginfile = 'proxy_auth_plugin.zip'
            cls._write_proxy_plugin(pluginfile)
            chrome_options.add_extension(pluginfile)
        if user_agent:
            chrome_options.add_argument('--user-agent=%s' % user_agent)
        # NOTE(review): recent Selenium 4 releases dropped ``executable_path``
        # in favour of a ``Service`` object - confirm against the installed version.
        driver = webdriver.Chrome(
            executable_path=os.path.join(path, 'chromedriver'),
            options=chrome_options)
        # Override the navigator.webdriver getter on every new document.
        script = '''
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
        '''
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source": script})
        return driver

    def __init__(self, username, password):
        """Start a proxied browser session for the given account.

        :param username: account name typed into the login form.
        :param password: account password typed into the login form.
        """
        # Login page of the example site.
        # NOTE(review): this URL is truncated (trailing ellipsis) and has no
        # scheme; replace it with the complete https:// login URL before running.
        self.url = 'passport.example.cn/signin/logi…'
        # Pick one User-Agent at random; None keeps the browser default.
        user_agent = random.choice(self.USER_AGENT) if self.USER_AGENT else None
        self.browser = self.get_chromedriver(use_proxy=True, user_agent=user_agent)
        self.wait = WebDriverWait(self.browser, 20)
        self.username = username
        self.password = password

    def open(self):
        """
        Open the login page, type the username and password, and click submit.

        Each form element is waited for with the instance's 20-second wait.

        :return: None
        """
        self.browser.delete_all_cookies()
        self.browser.get(self.url)
        username = self.wait.until(EC.presence_of_element_located((By.ID, 'loginName')))
        password = self.wait.until(EC.presence_of_element_located((By.ID, 'loginPassword')))
        submit = self.wait.until(EC.element_to_be_clickable((By.ID, 'loginAction')))
        username.send_keys(self.username)
        password.send_keys(self.password)
        # Short pause between typing and clicking.
        time.sleep(1)
        submit.click()

    def password_error(self):
        """
        Check whether the page reports a wrong username or password.

        :return: truthy if the error text shows up in ``#errorMsg`` within
            5 seconds, otherwise False.
        """
        try:
            return WebDriverWait(self.browser, 5).until(
                EC.text_to_be_present_in_element((By.ID, 'errorMsg'), '用户名或密码错误'))
        except TimeoutException:
            return False

    def get_cookies(self):
        """
        Return the cookies of the current browser session.

        :return: whatever ``browser.get_cookies()`` returns.
        """
        return self.browser.get_cookies()

    def main(self):
        """
        Entry point: log in and return the outcome.

        :return: ``{'status': 2, 'content': <error text>}`` when the
            credentials are rejected, otherwise
            ``{'status': 1, 'content': <cookies>}``.
        """
        try:
            self.open()
            if self.password_error():
                return {
                    'status': 2,
                    'content': '用户名或密码错误'
                }
            return {
                'status': 1,
                'content': self.get_cookies()
            }
        finally:
            # Always shut the browser down so no Chrome/chromedriver
            # process is left running after the run.
            self.browser.quit()

# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    # Sample account; replace with real credentials.
    result = GenCookies(
        username='180000000',
        password='16yun',
    ).main()
    print(result)