YouTube 频道的邮箱需要触发点击事件后才会展示出来。其他获取方式较为复杂,暂时先用 Selenium 实现。
# 使用selenium库来模拟人的行为
import json
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
# Configure Chrome and start the shared browser session used by the rest of
# the script.
option = webdriver.ChromeOptions()
for _chrome_arg in (
    '--disable-gpu',  # turn off GPU acceleration
    "no-sandbox",  # run Chrome without its sandbox
    "disable-blink-features=AutomationControlled",  # disable the AutomationControlled Blink feature
):
    option.add_argument(_chrome_arg)
# Exclude the "enable-automation" switch from the Chrome command line.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)

# Script registered for every new document: it redefines
# `navigator.webdriver` so that reading it yields `undefined`.
_hide_webdriver_script = """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": _hide_webdriver_script},
)
# Sign in to Google before visiting YouTube.
# NOTE(review): the "/ncr" path presumably avoids the country-specific
# redirect — confirm.
driver.get('https://google.com/ncr')
# Example channel URL:
# url = 'https://www.youtube.com/channel/UCVXCo0W9pk2dDkEBNLhTt7A'

# `find_element_by_xpath` was removed in Selenium 4.3; `find_element` with a
# `By` locator works on both Selenium 3 and 4.
driver.find_element(By.XPATH, '//*[@id="gb"]/div/div[2]/a').click()
# 'Google账号' / 'Google密码' are placeholders for the real account name and
# password; replace them before running.
driver.find_element(By.XPATH, '//*[@id="identifierId"]').send_keys('Google账号')
driver.find_element(By.XPATH, '//*[@id="identifierNext"]/div/button').click()
# Fixed pause so the password page can finish loading.
time.sleep(10)
driver.find_element(
    By.XPATH, '//*[@id="password"]/div[1]/div/div[1]/input'
).send_keys('Google密码')
driver.find_element(By.XPATH, '//*[@id="passwordNext"]/div/button').click()
# Fixed pause so the sign-in can complete before the first channel is opened.
time.sleep(5)
def get_email(url):
    """Open a YouTube channel page and return the text of its e-mail element.

    The function drives the module-level ``driver``: it loads ``url``, clicks
    the "more" icon and the view-e-mail button, ticks the reCAPTCHA checkbox
    inside its iframe, submits, and finally reads the element with id
    ``email``. Fixed ``time.sleep`` pauses separate the steps.

    Args:
        url: Address of the channel page to load.

    Returns:
        The ``.text`` of the ``//*[@id="email"]`` element.

    Raises:
        Whatever Selenium raises when one of the elements cannot be located
        or clicked; nothing is caught here.
    """
    # Start visiting the YouTube page.
    driver.get(url)
    driver.find_element(By.ID, 'more-icon').click()
    time.sleep(5)
    driver.find_element(
        By.XPATH,
        '//*[@id="view-email-button-container"]/yt-button-view-model/button-view-model/button',
    ).click()
    time.sleep(5)

    # The reCAPTCHA checkbox lives inside an iframe, so the driver has to be
    # switched into that frame before the checkbox can be located.
    iframe = driver.find_element(By.XPATH, '//*[@id="recaptcha"]/div/div/iframe')
    # `switch_to_frame` / `switch_to_default_content` no longer exist in
    # Selenium 4; the `switch_to` object is the supported API.
    driver.switch_to.frame(iframe)
    try:
        time.sleep(5)
        driver.find_element(By.XPATH, '//*[@id="recaptcha-anchor"]/div[1]').click()
        time.sleep(5)
    finally:
        # Always return to the main document, otherwise later lookups would
        # search inside the iframe and fail to find the submit button.
        driver.switch_to.default_content()

    driver.find_element(By.XPATH, '//*[@id="submit-btn"]/span').click()
    time.sleep(5)
    email = driver.find_element(By.XPATH, '//*[@id="email"]').text
    print(email)
    return email
# Read the channel list. The file is parsed as JSON and iterated as a sequence
# of objects that each carry a 'url' key.
# A `with` block closes the file handle, which the bare `open()` never did.
with open('source_url.txt', encoding='utf-8') as f:
    lines = json.load(f)

# Collect one {'url', 'email'} record per channel, printing each as it is
# produced and the full list at the end.
item_list = []
for zu_item in lines:
    url = zu_item['url']
    email = get_email(url)
    item = {'url': url, 'email': email}
    print(item)
    item_list.append(item)
print(item_list)
# 根据网站的验证方式,模拟人的操作进行验证
# ...
# 完成验证后,获取验证通过后的响应
# response = driver.page_source
# # 关闭浏览器实例
# driver.quit()
编写时在人机身份验证(reCAPTCHA)这一步卡了一下。原因在于验证框位于 iframe 中:需要先将 driver 切换到该 iframe,才能定位并点击验证复选框;点击提交按钮之前还要记得切换回主页面,否则会报错,提示定位不到对应元素。