本文已参与「新人创作礼」活动,一起开启掘金创作之路。
用 selenium 模拟浏览器操作,绕过二级网页的验证码登录;打开目标窗口后,逐级定位 xpath 抓取内容并保存到本地。
效果展示:
from bs4 import BeautifulSoup
from selenium import webdriver
import random
from selenium.webdriver.edge.options import Options
from selenium.webdriver import Edge
from selenium import webdriver
from selenium.webdriver.common.by import By
from chaojiying import Chaojiying_Client
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
from yzm111 import deliver
import os # 注意要输入OS模块
print("模拟登录爬取开始--------")
m = int(input("请输入保存图片数:"))
n = int(input("请输入模拟鼠标滑轮速度:"))
url_ = input("请输入目标网址(道客巴巴):")
a = input("确认不保留获取窗口?y/n(y是无头模式,爬取需要更长时间缓冲): ")
# Path to the local msedgedriver binary -- change to your own install
# location (the driver can be downloaded from Microsoft's official site).
s = Service(r"D:\pythonProject\venv\Lib\site-packages\selenium\webdriver\chrome\msedgedriver.exe")
options = Options()
options.add_argument('--headless')  # headless: the browser runs with no visible window


def _login(driver):
    """Log in to doc88.com with the hard-coded account and report the result.

    Fills the login form via XPath and clicks the login button (this path
    bypasses the captcha), then reads the banner text to decide whether the
    credentials were rejected.
    """
    time.sleep(2)
    driver.get('https://www.doc88.com/')
    driver.find_element(By.XPATH, '//*[@id="account-login"]').click()
    driver.find_element(By.XPATH, '//*[@id="loginname1"]').send_keys("自己道客的名称")
    driver.find_element(By.XPATH, '//*[@id="password1"]').send_keys("自己的密码")
    time.sleep(5)
    driver.find_element(By.XPATH, '//*[@id="login-btn"]').click()
    # The error banner's text tells us whether login failed.
    msg = driver.find_element(By.XPATH, '/html/body/div[4]/div/div[1]/div[3]/div[2]/div/div/p').text
    if msg == "账号或密码有误":
        print("登录失败")
    else:
        print("登录成功")
    time.sleep(5)


# Simulated login. `browser2` is the headless Edge driver, `browser` the
# visible one; later code picks whichever matches the user's choice `a`.
if a == 'y':
    browser2 = webdriver.Edge(service=s, options=options)
    _login(browser2)
if a == 'n':
    browser = webdriver.Edge(service=s)
    _login(browser)
# Open the target document page in whichever driver was created above.
# (Removed: an unused `js2 = window.open(...)` string that was never
# executed, and a dead commented-out BeautifulSoup/<object>-parsing
# experiment left over from an earlier approach.)
if a == 'n':
    browser.get(url_)
    time.sleep(5)
else:
    browser2.get(url_)
    time.sleep(5)
# Scroll to the bottom so the lazy-loaded "continue reading" button appears,
# click it if the document is collapsed, then scroll back to the top.
# (Consistency fix: the original headless branch skipped the initial
# scroll-to-bottom that the visible branch performed.)
_driver = browser if a == 'n' else browser2
_driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
time.sleep(2)
button = _driver.find_element(By.XPATH, '//*[@id="continueButton"]')
label = button.text
print(label)
# Robustness fix: the original compared caption LENGTHS
# (len(b_text) == len(t) + 2), which matches any unrelated caption of the
# same length; a substring test expresses the actual intent.
if '继续免费阅读全文' in label:
    button.click()
    print("需要展开")
    time.sleep(3)
else:
    print("无需展开")
time.sleep(2)
_driver.execute_script("window.scrollTo(0, 0)")  # back to the top of the page
time.sleep(2)
# Capture m screenshots of the reader pane, scrolling n pixels between shots
# so successive captures do not show the same page.
# SYNTAX FIX: the original `filename = 'D:\道客巴巴\'` let the trailing
# backslash escape the closing quote, which is a SyntaxError; the backslashes
# must be doubled.
save_dir = 'D:\\道客巴巴\\'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)  # makedirs also creates missing parents, unlike mkdir
_driver = browser if a == 'n' else browser2
# Headless mode needs a longer per-page pause to finish rendering
# (matching the original 1s / 3s sleeps of the two branches).
pause = 1 if a == 'n' else 3
scroll_js = "window.scrollBy(0, {})".format(n)  # scroll down n pixels
for i in range(m):
    _driver.execute_script(scroll_js)
    time.sleep(pause)
    png = _driver.find_element(By.XPATH, '//*[@id="dragdrop"]').screenshot_as_png
    print("正在获取图片({})........".format(i + 1))
    with open(save_dir + '{}'.format(i + 1) + '.png', mode='wb') as f:
        f.write(png)
技术很简单,只是实现起来会有一点小问题,最后成功抓取到了所需信息。有几点想跟大家分享:'D:\pythonProject\venv\Lib\site-packages\selenium\webdriver\chrome\msedgedriver.exe' 是 Edge 的驱动,在官网就能找到并下载。倒也不是非要这个驱动,但是在我的电脑上总是出 bug,所以下载了驱动,之后也很顺利地完成了接下来的爬取。这个项目说大也不大,但是花费了我熬了两个晚上的夜,找了很多资料参考。起因是看到油猴有一个插件可以分析文档并导出为 PDF 或照片压缩包,我觉得很神奇,所以想用 Python 复刻一下。技术原理就是:
1.通过selenium模拟浏览器操作,首先找到道客某巴首页官网,找到xpath发送数据再通过click()点击登录,这样可以绕过验证码
2.至于判断展开和登录成功了没就是获取xpath的文本信息,进行判断就好了
3.无头浏览器就是添加 options.add_argument('--headless') 到 options 里就好了,Service 提供了驱动地址。无头浏览器就是看不见浏览器打开,看起来更像脚本
4.打开需要爬取的页面之后,将文章对应 xpath 元素的 screenshot_as_png 保存到本地就好了,同时要滑动页面,防止重复获取到同一个页面的内容
注:本文只做技术分享讨论,无其他用途