# 爬取快代理的所有的ip:port,并验证是否可用 (Scrape every ip:port listed on Kuaidaili and verify whether each proxy is usable)
from selenium import webdriver
import urllib2
import time
# Scrape pages 1-10 of the Kuaidaili free proxy list with Selenium, then try
# each "ip:port" entry by sending one request through it with urllib2 and
# print whether the request succeeded.
base_url = "https://www.kuaidaili.com/free/inha/"

# The ProxyHandler below only registers the proxy for the "http" scheme, so
# the probe URL must be plain http; an https URL would bypass the proxy and
# every entry would be reported as usable.
request = urllib2.Request("http://www.baidu.com")

# One browser session is reused for all pages instead of launching a new
# Chrome per page.
driver = webdriver.Chrome(r"C:\Users\xxx\Desktop\chromedriver.exe")
try:
    for i in range(10):
        # Listing pages are numbered from 1.
        url = base_url + str(i + 1) + "/"
        driver.get(url)
        # Fixed pause after loading; presumably gives the table time to
        # render -- TODO confirm whether an explicit wait is needed.
        time.sleep(1)

        # Each table row holds the IP in the first cell and the port in the
        # second cell.
        data = driver.find_elements_by_xpath("//tbody//tr")
        http_list = []
        for da in data:
            ip = da.find_element_by_xpath("./td[1]").text
            port = da.find_element_by_xpath("./td[2]").text
            http_list.append(ip + ":" + port)

        for http in http_list:
            httpproxy = urllib2.ProxyHandler({"http": http})
            opener = urllib2.build_opener(httpproxy)
            try:
                response = opener.open(request, timeout=10)
            except Exception:
                # Best-effort check: any failure (refused, timeout, bad
                # reply) marks the proxy as unusable. KeyboardInterrupt is
                # deliberately not caught so the run can be aborted.
                print(http + u"无效")
            else:
                response.close()
                print(http + u"可以使用")
finally:
    # quit() ends the chromedriver process as well as the window; close()
    # alone leaves the driver process running.
    driver.quit()
# 解决网站屏蔽无界面浏览器的问题(未调通) (Work around websites that block headless browsers -- not working yet)
import pyvirtualdisplay
import selenium.webdriver

# Some websites refuse headless browsers. This script instead starts a
# regular Chrome inside a virtual display so that the browser window is
# hidden, loads a page and prints its HTML source.
options = selenium.webdriver.ChromeOptions()
options.add_argument("--disable-extensions")  # disable external extensions
options.add_argument("--profile-directory=Default")
options.add_argument("--incognito")
options.add_argument("--disable-plugins-discovery")
options.add_argument("--start-maximized")

# Author's note: the in-class demo of this approach had a problem that was
# not resolved. Starting the display directly like this raises
# easyprocess.EasyProcessError; the suggested way out is to rely on browser
# configuration options instead (search online for a ChromeOptions usage
# guide for details).
# NOTE(review): presumably the error comes from the virtual display backend
# not being available on this machine -- TODO confirm.
display = pyvirtualdisplay.Display()
display.start()
try:
    driver = selenium.webdriver.Chrome(
        r"C:\Users\xxx\Desktop\chromedriver.exe",
        chrome_options=options,
    )
    try:
        driver.delete_all_cookies()
        driver.set_window_size(800, 800)
        driver.set_window_position(0, 0)
        print("OK")
        driver.get("http://www.baidu.com")
        print(driver.page_source)
    finally:
        # quit() also shuts down the chromedriver process, even if loading
        # the page failed.
        driver.quit()
finally:
    # Stop the virtual display that was started above so it does not
    # outlive the script.
    display.stop()
# 提取网页的所有文本 (Extract all the text of a web page)
from selenium import webdriver
import time
# Load one page in Chrome and print the text of every element matched by
# the XPath "/*" (the document's root element).
url = "https://www.51shucheng.net/guanchang/erhaoshouzhang/erhaoshouzhang1/1730.html"
driver = webdriver.Chrome(r"C:\Users\xxxx\Desktop\chromedriver.exe")
try:
    driver.get(url)
    # Fixed pause after loading; presumably gives the page time to render
    # -- TODO confirm whether an explicit wait is needed.
    time.sleep(1)
    data = driver.find_elements_by_xpath("/*")
    for tag in data:
        print(tag.text)
finally:
    # Always shut the browser down, even when loading or lookup fails.
    driver.quit()