selenium使用

一环境搭建

下载selenium

pip install selenium

下载浏览器驱动（以Edge为例）

在设置中找到当前Edge版本号，在[Microsoft Edge WebDriver - Microsoft Edge Developer](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/)中下载对应版本驱动

将下载的压缩包解压后得到驱动的exe文件，将该文件拖到python安装文件夹的script文件夹下

在浏览器中的使用：

示例

from selenium.webdriver import Edge # 引入

web = Edge() # 创建浏览器对象

web.get("https://www.baidu.com") # 执行操作，打开百度

二 selenium的简单使用

引入引擎和需要的类
创建浏览器对象
打开网页，可以选中元素操纵和获取信息

示例

from selenium.webdriver import Edge
from selenium.webdriver.common.keys import Keys
import time

# 创建浏览器对象
web = Edge()

# 打开一个网页
web.get("https://www.lagou.com")

# 可以使用xpath 类名 样式查找element
web.find_element('xpath', '//*[@id="changeCityBox"]/p[1]/a').click()

# 由于加载需要一点时间，所以需要等待
time.sleep(1)

web.find_element('xpath', '//*[@id="search_input"]').send_keys('python', Keys.ENTER)

time.sleep(1)

web.find_element('xpath', '//*[@id="jobList"]/div[1]/div[1]/div[1]/div[1]/div[1]/a').click()

time.sleep(1)

# 切换selenium操作的页面
web.switch_to.window(web.window_handles[-1])

job_detail = web.find_element('xpath', '//*[@id="job_detail"]/dd[2]/div').text

print(job_detail)

# 关闭当前页面
web.close()

# 切回原窗口
web.switch_to.window(web.window_handles[0])

注意iframe的存在：

# 如果源码中有iframe的话，是没有办法直接拿到数据的，必须先切换到iframe中再操作
frame = web.find_element('xpath', '//*[@id="g_iframe"]')
web.switch_to.frame(frame)

注意操作select：

下拉列表应当先引用下拉列表的支持

from selenium.webdriver.support.select import Select

拿到select元素并进行包装

sel_el = web.find_element_by_xpath('//*[@id="OptionDate"]')
# 对元素进行包装, 包装成下拉菜单
sel = Select(sel_el)

使用条件切换选项，拿到每个选项中的不同数据

for i in range(len(sel.options)):  # i就是每一个下拉框选项的索引位置
    sel.select_by_index(i)  # 按照索引进行切换
    time.sleep(2)
    table = web.find_element_by_xpath('//*[@id="TableList"]/table')
    print(table.text)  # 打印所有文本信息
    print("===================================")

三无头浏览器

可以通过参数配置的方式来不打开浏览器也能拿到数据

如果有被iframe包裹的话需要切换到iframe里面才能拿到数据

from selenium.webdriver import Edge
from selenium.webdriver.edge.options import Options	#引入设置项
import time

# 准备好参数配置
opt = Options()
opt.add_argument("--headless")
opt.add_argument("--disable-gpu")

web = Edge(options=opt)  # 把参数配置设置到浏览器中
web.get("https://music.163.com/#/song?id=1430620302")

time.sleep(2)

iframe = web.find_element('xpath', '//*[@id="g_iframe"]')
web.switch_to.frame(iframe)	# 切换到frame中拿取

# 如何拿到页面代码Elements(经过数据加载以及js执行之后的结果的html内容)
print(web.page_source)

Options基础配置

user-agent

代理

不加载图片和css

opt = Options()
# 无头浏览器
opt.add_argument("--headless")
opt.add_argument("--disable-gpu")
# 不加载图片和css
prefs = {"profile.managed_default_content_settings.images": 2,
         'permissions.default.stylesheet': 2}
opt.add_experimental_option('prefs', prefs)
# 设置user-Agent
opt.add_argument('user-agent=' + UserAgent().random)  # 初始化一个别的User-Agent

# IP池
proxy_arr = [
    '--proxy-server=http://111.3.118.247:30001',
    '--proxy-server=http://183.247.211.50:30001',
    '--proxy-server=http://122.9.101.6:8888',
]
proxy = random.choice(proxy_arr)  # 随机选择一个代理
print(proxy)  # 如果某个代理访问失败,可从proxy_arr中去除
opt.add_argument(proxy)  # 添加代理

# 添加配置
driver = Edge(options=opt)

四超级🦅处理验证码

地址：》》超级鹰验证码识别-专业的验证码云端识别服务,让验证码识别更快速、更准确、更强大 (chaojiying.com)

注册后在个人中心生成一个软件id

下载官方demo

官方demo，直接复制，即可使用

#!/usr/bin/env python
# coding:utf-8

import requests
from hashlib import md5

class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password =  password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: 图片字节
        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def PostPic_base64(self, base64_str, codetype):
        """
        im: 图片字节
        codetype: 题目类型 参考 http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
            'file_base64':base64_str
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id:报错题目的图片ID
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


if __name__ == '__main__':
    # 用户中心>>软件ID 生成一个替换 96001
    chaojiying = Chaojiying_Client('超级鹰用户名', '超级鹰用户名的密码', '96001')
    # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
    im = open('a.jpg', 'rb').read()
    # 1902 验证码类型  官方网站>>价格体系 3.4+版 print 后要加()
    # http://www.chaojiying.com/price.html
    print(chaojiying.PostPic(im, 1902))												
    #print chaojiying.PostPic(base64_str, 1902)  #此处为传入 base64代码

使用超级鹰给超级鹰做登录验证

from selenium.webdriver import Edge
from chaojiying import Chaojiying_Client
import time

web = Edge()
web.get('http://www.chaojiying.com/user/login/')
chaojiying = Chaojiying_Client('mutreei01', 'yanlifei12138', '936512')
img = web.find_element('xpath', '/html/body/div[3]/div/div[3]/div[1]/form/div/img').screenshot_as_png
verify_code = chaojiying.PostPic(img, 1902)['pic_str']

web.find_element('xpath', '/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input').send_keys('用户名')
web.find_element('xpath', '/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input').send_keys('密码')
web.find_element('xpath', '/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input').send_keys(verify_code)


time.sleep(5)
web.find_element('xpath', '/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input').click()

time.sleep(10)

五其他实用操作

如果你的程序被识别到了怎么办?

1.chrome的版本号如果小于88 在你启动浏览器的时候(此时没有加载任何网页内容), 向页面嵌入js代码. 去掉webdriver

web = Chrome()

web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
  "source": """
   navigator.webdriver = undefined
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
  """
})
web.get(xxxxxxx)

2.chrome的版本大于等于88

option = Options()
# option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument('--disable-blink-features=AutomationControlled')

web操作：

移动到某一位置点击：

from selenium.webdriver import Edge
from selenium.webdriver.common.action_chains import ActionChains
web = Edge()
verify_img_element = web.find_element_by_xpath('//*[@id="J-loginImg"]')
ActionChains(web).move_to_element_with_offset(verify_img_element, x, y).click().perform()   # perform提交

拖拽：

btn = web.find_element_by_xpath('//*[@id="nc_1_n1z"]')
ActionChains(web).drag_and_drop_by_offset(btn, 300, 0).perform()

最实用的selenium使用指北

selenium使用

一 环境搭建

二 selenium的简单使用

三 无头浏览器

四 超级🦅处理验证码

五 其他实用操作

一环境搭建

三无头浏览器

四超级🦅处理验证码

五其他实用操作