Your crawler's IP got banned again? Here's a trick


Scraping proxy IPs with Python and testing whether they work

When crawling, many people look up free proxy IPs on various websites to avoid getting their own IP banned. But not every listed address actually works, and checking them one by one by hand is far too slow. I ran into the same problem, so I wrote a script that scrapes free proxy IPs from a site, tests each one in turn, and returns only the valid ones.

Here I chose the Fanqie proxy site, www.fanqieip.net/, as the source to scrape, …

1. Preparation

Import the packages and set the request headers:

    import requests
    from bs4 import BeautifulSoup

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
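As a quick optional sanity check (httpbin.org is a third-party echo service, not part of the original article), you can confirm the User-Agent header is actually being sent:

    # httpbin echoes back the headers it received
    resp = requests.get('https://httpbin.org/headers', headers=header, timeout=5)
    print(resp.json()['headers'].get('User-Agent'))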

2. Fetching the page source

This function fetches a URL and returns the HTML source of the whole page, or None if the request fails:

    def getHtml(url):
        try:
            response = requests.get(url, headers=header, timeout=10)
            response.raise_for_status()  # raise on 4xx/5xx status codes
            response.encoding = response.apparent_encoding
            return response.text
        except requests.RequestException:
            return None  # let the caller decide what to do on failure

3. Parsing the HTML and extracting the IPs

The function takes the HTML and a list in which to collect the addresses. The parsing assumes the site lists proxies in a table, with the IP in the first cell of each row and the port in the second:

    # Parse the page and extract the proxies
    def getIp(html, ip_list):
        if html is None:
            return
        try:
            soup = BeautifulSoup(html, "html.parser")
            rows = soup.find("tbody").find_all("tr")
            for row in rows:
                cells = row.find_all("td")
                # First cell: the IP address
                ip = cells[0].get_text(strip=True)
                # Second cell: the port number
                port = cells[1].get_text(strip=True)
                # Join IP and port and store them in the target list
                ip_list.append(ip + ":" + port)
        except (AttributeError, IndexError):
            print("Failed to extract IPs")

4. Testing whether a proxy works

The test sends a request to Baidu through each candidate proxy. If the response comes back with status code 200, the proxy is considered valid; any other status code, or any exception (timeout, refused connection, and so on), marks it as invalid:

    # Filter out the proxies that actually work
    def ip_test(ip_list, valid_IP):
        url = "https://www.baidu.com/"
        for ip in ip_list:
            try:
                # requests expects proxy URLs to carry a scheme, e.g. 'http://1.2.3.4:8080'
                proxies = {"http": "http://" + ip, "https": "http://" + ip}
                # timeout=0.5 weeds out dead proxies quickly, but will also drop
                # slow-but-working ones; raise it for fewer false negatives
                rep = requests.get(url, proxies=proxies, headers=header, timeout=0.5)
                if rep.status_code == 200:  # 200 means the request made it through
                    valid_IP.append(ip)
                    print("Valid proxy: " + ip)
                else:
                    print("Invalid proxy: " + ip)
            except requests.RequestException:
                print("Invalid proxy: " + ip)

5. The main function

The main function builds the URL for each page (the page range is configurable), calls the functions above, and prints every valid IP before the program exits:

    if __name__ == '__main__':
        valid_IP = []  # working proxies collected across all pages
        for i in range(1, 90):  # adjust the page range as needed
            ip_list = []  # proxies scraped from the current page
            url = "https://www.fanqieip.net/index_" + str(i) + ".html"
            print(url)
            html = getHtml(url)
            getIp(html, ip_list)
            ip_test(ip_list, valid_IP)
        print("=" * 30)
        print("Testing finished. Valid IPs:")
        print("-" * 30)
        for a in valid_IP:
            print(a)
        print("=" * 30)

That completes the overall structure of the code. Finally, here is everything in one piece.

Complete code

    # -*- coding: utf-8 -*-
    import requests
    from bs4 import BeautifulSoup

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }

    # Fetch a page and return its HTML, or None if the request fails
    def getHtml(url):
        try:
            response = requests.get(url, headers=header, timeout=10)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text
        except requests.RequestException:
            return None

    # Parse the page and extract the proxies
    def getIp(html, ip_list):
        if html is None:
            return
        try:
            soup = BeautifulSoup(html, "html.parser")
            rows = soup.find("tbody").find_all("tr")
            for row in rows:
                cells = row.find_all("td")
                ip = cells[0].get_text(strip=True)    # first cell: IP address
                port = cells[1].get_text(strip=True)  # second cell: port number
                ip_list.append(ip + ":" + port)
        except (AttributeError, IndexError):
            print("Failed to extract IPs")

    # Filter out the proxies that actually work
    def ip_test(ip_list, valid_IP):
        url = "https://www.baidu.com/"
        for ip in ip_list:
            try:
                proxies = {"http": "http://" + ip, "https": "http://" + ip}
                # timeout=0.5 is aggressive; raise it to keep slow but working proxies
                rep = requests.get(url, proxies=proxies, headers=header, timeout=0.5)
                if rep.status_code == 200:
                    valid_IP.append(ip)
                    print("Valid proxy: " + ip)
                else:
                    print("Invalid proxy: " + ip)
            except requests.RequestException:
                print("Invalid proxy: " + ip)

    if __name__ == '__main__':
        valid_IP = []  # working proxies collected across all pages
        for i in range(1, 90):  # adjust the page range as needed
            ip_list = []  # proxies scraped from the current page
            url = "https://www.fanqieip.net/index_" + str(i) + ".html"
            print(url)
            html = getHtml(url)
            getIp(html, ip_list)
            ip_test(ip_list, valid_IP)
        print("=" * 30)
        print("Testing finished. Valid IPs:")
        print("-" * 30)
        for a in valid_IP:
            print(a)
        print("=" * 30)