Here are a few screenshots of the results first; if you are interested, read on.
@TOC
I. Scrape all IPs from the proxy IP sites
So far I have found three proxy IP sites with a fairly high success rate:
- hidemy.name/cn/proxy-li… (may require a VPN)
- www.xiladaili.com/https/ (may require a VPN)
- proxy.mimvp.com/freeopen (the port numbers are rendered as images; handled in subsection 3)
1. Scrape all the IPs from the first site
import requests
from lxml import etree

# Global list used to store every proxy address we collect
list_ips = []

# Fetch proxy addresses from hidemy.name
def get_url(urls='https://hidemy.name/cn/proxy-list/?type=s#list'):
    # Build the request headers
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'cookie': '_ym_uid=1631777163767759683; _ym_d=1631777163; PAPVisitorId=b90c63abf5f4ae440925e5MpEdpDVCg3; PAPVisitorId=b90c63abf5f4ae440925e5MpEdpDVCg3; _ga=GA1.2.2061127712.1631777164; _gid=GA1.2.1247344787.1631777164; _ym_isad=2; _dc_gtm_UA-90263203-1=1; _gat_UA-90263203-1=1; _fbp=fb.1.1631777165623.195031136',
    }
    respons = requests.get(urls, headers=headers)
    # Decode the response body and parse it as HTML
    html_content = etree.HTML(respons.content.decode('utf-8'))
    tbody_list = html_content.xpath("//*[@class='table_block']/table/tbody/tr")
    print(len(tbody_list))
    # If we got table rows, extract them
    if len(tbody_list):
        # Iterate over the rows
        for tbody in tbody_list:
            # IP address
            ip_name = tbody.xpath('./td[1]/text()')[0]
            # Port number
            port_name = tbody.xpath('./td[2]/text()')[0]
            # Join them as ip:port
            ips = ip_name + ':' + port_name
            # Add to the global list
            list_ips.append(ips)
        else:
            # The for/else branch runs once the loop finishes: look for the next page
            next_url_xpath = html_content.xpath("//*[@class='next_array']/a/@href")
            if len(next_url_xpath):
                # Build the next page's URL
                next_url = 'https://hidemy.name' + next_url_xpath[0]
                # Fetch the next page
                get_url(next_url)
            else:
                print('No more pages')
    else:
        print('No data retrieved')
    # Return the collected addresses
    return list_ips

# Run it
if __name__ == '__main__':
    list_ip = get_url()
    print(list_ip)
2. Scrape all the IPs from the second site
# 2. Fetch proxies from http://www.xiladaili.com/https/
# (continues in the same file as above; time is needed for the sleep below)
import time

def get_https_ip(urls='http://www.xiladaili.com/https/'):
    # Build the headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'Cookie': 'Hm_lvt_9bfa8deaeafc6083c5e4683d7892f23d=1631780310; Hm_lpvt_9bfa8deaeafc6083c5e4683d7892f23d=1631780310',
    }
    # Send the request
    respons = requests.get(urls, headers=headers)
    # Pause 2 seconds before parsing the response
    time.sleep(2)
    # Decode the response body and parse it as HTML
    html_content = etree.HTML(respons.content.decode('utf-8'))
    # Read the table rows
    tbody_list = html_content.xpath("//*[@class='fl-table']/tbody/tr")
    if len(tbody_list):
        # Iterate over the rows
        for tbody in tbody_list:
            # The first cell already contains ip:port
            ip_name = tbody.xpath('./td[1]/text()')[0]
            # Add to the global list
            list_ips.append(ip_name)
        else:
            # The for/else branch runs once the loop finishes: look for the next page
            # (the link text 下一页 means "next page")
            next_url_xpath = html_content.xpath("//*[text()='下一页']/@href")
            if len(next_url_xpath):
                # Only take the first 30 pages; the site has over 2000 pages and 30 is enough
                if (next_url_xpath[0] == '/https/30/'):
                    print('Reached page 30')
                    return
                # Build the next page's URL
                next_url = 'http://www.xiladaili.com' + next_url_xpath[0]
                print(next_url)
                # Recurse into the next page
                get_https_ip(next_url)
            else:
                print('No more pages')
    else:
        print('No data on this page')
        # Within the first 30 pages a page is occasionally empty and the data only resumes
        # a page later, so fetch the next-page link again here as well
        next_url_xpath = html_content.xpath("//*[text()='下一页']/@href")
        if len(next_url_xpath):
            if (next_url_xpath[0] == '/https/30/'):
                print('Reached page 30')
                return
            next_url = 'http://www.xiladaili.com' + next_url_xpath[0]
            print(next_url)
            get_https_ip(next_url)
        else:
            print('No more pages')
    # Return the collected addresses
    return list_ips

# Run it
if __name__ == '__main__':
    list_ip = get_https_ip()
    print(list_ip)
3. Scrape all the IPs from the third site
The third site uses anti-scraping measures: copying and right-clicking are disabled on the page, and the port numbers are rendered as images. So we have to download each port image and recognize the digits in it with OCR to recover the port number.
3.1 Download the image by its URL and recognize the digits in it
# These packages need to be installed, and the Tesseract OCR engine itself must also be
# installed on the system; look up the install instructions for your OS
import pytesseract
from PIL import Image

# Download the image at image_url to disk, run OCR on it, and return the recognized text
def getImageWithocr(image_url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'cookie': 'MIMVPSESSID=js2d5ibkumcpibthk64mmf85sa; Hm_lvt_51e3cc975b346e7705d8c255164036b3=1631607506,1631677690,1631849436; Hm_lpvt_51e3cc975b346e7705d8c255164036b3=1631858821',
    }
    responds = requests.get(image_url, headers=headers)
    # Save the image into the local port_image folder (the folder must already exist)
    with open("port_image/background_pic.png", "wb") as f:
        f.write(responds.content)
    text = ''
    try:
        # OCR sometimes reads 8 as B and inserts commas, so both are corrected here
        text = pytesseract.image_to_string(Image.open('port_image/background_pic.png')).replace(',', '').replace('B', '8')
        # print(text)
    except:
        print('OCR failed, image not found')
    # Return the recognized text
    return text
3.2 Scrape the data
# Continues from the code above
# Free accounts can only see the first page of IPs, so there is no pagination to handle
def getProxyip(urls='https://proxy.mimvp.com/freeopen?proxy=in_hp&sort=&page=1'):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36',
        'cookie': 'MIMVPSESSID=2o7m730du3qm487gpjgh7cjmbc; Hm_lvt_51e3cc975b346e7705d8c255164036b3=1631860283; Hm_lpvt_51e3cc975b346e7705d8c255164036b3=1631860283',
    }
    responds = requests.get(urls, headers=headers)
    html_content = etree.HTML(responds.content.decode('utf-8'))
    allips = html_content.xpath("//*[@class='mimvp-tbl free-proxylist-tbl']/tbody/tr")
    if len(allips):
        for ip_name_item in allips:
            # IP address
            ip_name = ip_name_item.xpath('./td[2]/text()')[0]
            # URL of the port-number image
            image_url = 'https://proxy.mimvp.com' + ip_name_item.xpath('./td[3]/img/@src')[0]
            # Recognize the port number with OCR
            port_name = getImageWithocr(image_url)
            # Join them as ip:port, stripping the newline and form feed that OCR appends
            ips = ip_name + ':' + port_name.replace('\n', '').replace('\x0c', '')
            # Add to the global list
            list_ips.append(ips)
    else:
        print('No proxy addresses retrieved')
    return list_ips

# Run it
if __name__ == '__main__':
    list_ip = getProxyip()
    print(list_ip)
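Before moving on to the availability check, here is a minimal sketch (not part of the original scripts) of how the three scrapers could be run together and deduplicated. It assumes all three functions and the shared list_ips live in the same module, as in the test_csd.py file used in the next part.

def collect_all_ips():
    # Run all three scrapers; each one appends into the shared list_ips
    get_url()          # hidemy.name
    get_https_ip()     # www.xiladaili.com
    getProxyip()       # proxy.mimvp.com
    # Remove duplicates while preserving the original order
    return list(dict.fromkeys(list_ips))

if __name__ == '__main__':
    print(collect_all_ips())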
II. Check which proxy IPs are usable
1. How the check works
The idea: request www.baidu.com through the proxy. If the request succeeds, the proxy is usable; otherwise it is not.
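As a minimal illustration of this idea (a sketch, not part of the original scripts; the proxy address 1.2.3.4:8080 is made up), a single proxy could be checked like this:

import requests

def is_proxy_alive(proxy_addr, timeout=10):
    # Route an HTTPS request to Baidu through the proxy; if it succeeds, the proxy is usable
    proxies = {'https': proxy_addr}
    try:
        resp = requests.get('https://www.baidu.com', proxies=proxies, timeout=timeout)
        return resp.status_code == 200
    except requests.exceptions.RequestException:
        return False

print(is_proxy_alive('1.2.3.4:8080'))  # made-up proxy, so this is expected to print False

The full script below applies the same idea, but checks each proxy in its own thread and kicks off a follow-up task for every proxy that passes.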
2. The check code
The code is as follows:
import os
import requests
import threading
# test_csd.py is the file from the previous step that scrapes all the proxies;
# importing it lets us call its functions directly to get the list of proxies
import test_csd

def jc_net(ips):
    '''
    Note the proxy format here: depending on your urllib3 version you may need
    {'https': 'https://' + ips} instead; mine is 1.22
    '''
    proxy = {'https': ips}
    # Use Baidu as the test target
    url = 'https://www.baidu.com'
    # Build the headers
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Connection': 'keep-alive'}
    try:
        # Request Baidu through the proxy with a 20 second timeout
        response = requests.get(url, proxies=proxy, headers=head, timeout=20)
    except (requests.exceptions.ConnectTimeout, requests.exceptions.ProxyError, requests.exceptions.SSLError,
            requests.exceptions.ConnectionError) as e:
        # An exception here means the proxy is not usable
        print("443 failed", ips)
    else:
        # The proxy works; you could collect it into a list here or run any other task.
        # In this script:
        # 1. List all .log files in the current directory
        log_list = file_name('./')
        # 2. Turn the usable proxy into a log file name such as '123_345_793_8080.log'
        str_ip_log = ips.replace('.', '_').replace(':', '_') + '.log'
        # 3. If a log with the same name already exists, this proxy has been used before, so skip it
        if str_ip_log in log_list:
            print('Already used, skipping this IP', str_ip_log)
            return
        # This proxy can be used
        print('Usable IP', ips, response.status_code)
        # Run the task in the background and write its output to a log named after the proxy;
        # csdn_request3.py is the script that generates the blog page views
        ml = 'nohup python3 -u csdn_request3.py %s > %s 2>&1 &' % (ips, str_ip_log)
        os.system(ml)

# List all .log files in the given directory
def file_name(file_dir):
    L = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            if os.path.splitext(file)[1] == '.log':
                L.append(file)
    return L

# Entry point
if __name__ == '__main__':
    # Call any of get_url / get_https_ip / getProxyip to get the list of proxies
    list_ip = test_csd.get_url()
    # Iterate over the proxies
    for ips in list_ip:
        # Check each proxy in its own thread for better throughput
        sub_thread = threading.Thread(target=jc_net, args=(ips,))
        sub_thread.start()
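A note on the design: for every proxy that passes the check, the script launches a separate csdn_request3.py process in the background, and that process's log file is named after the proxy (for example 123_345_793_8080.log). On the next run, any proxy that already has a matching log file is skipped, so the same proxy is never started twice.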
III. Use the proxy IPs to boost blog page views
The csdn_request3.py file:
# -*- encoding: UTF-8 -*-
import requests
import time
import sys
from lxml import etree
import threading

# Round counter
index = 0
# Holds the proxy address passed in on the command line
ip_addr = 0

# Get every article URL under the given blog home page
def getAllUrl(main_url):
    headers = {
        'cookie': 'uuid_tt_dd=10_20960831740-1631525577857-460581; dc_session_id=10_1631525577857.927126; c_first_ref=default; c_first_page=https%3A//blog.csdn.net/wds326598; c_segment=5; c_page_id=default; dc_tos=qzd96y; log_Id_pv=1; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1631525578; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1631525578; Hm_up_6bcd52f51e9b3dce32bec4a3997715ac=%7B%22islogin%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isonline%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isvip%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%7D; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_20960831740-1631525577857-460581; firstDie=1; dc_sid=c8f285999921042d4b05864bfd620df0; log_Id_view=1; __gads=ID=d00ed7cf9e676320-227bc26d41ca00a3:T=1631525579:RT=1631525579:S=ALNI_MZLDz0lBRqXKnIgmf75SqQUL_KObg',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        'sec-ch-ua-platform': 'macOS'
    }
    respons = requests.get(main_url, headers=headers)
    html = etree.HTML(respons.content.decode('utf-8'))
    # Get all article entries on the page
    hrefs_items = html.xpath('//*[@id="articleMeList-blog"]/div[2]/div')
    if len(hrefs_items):
        for href_item in hrefs_items:
            # Article link
            href_str = href_item.xpath('./h4/a/@href')[0]
            # Visit every article link in its own thread
            ips_thread = threading.Thread(target=request_url, args=(href_str,))
            ips_thread.start()
            # print(href_str)
    else:
        # Failed to get the article list; wait 5 seconds and try again
        print('Failed to get article')
        time.sleep(5)
        getAllUrl(main_url)

# Visit one article URL through the proxy to add a page view
def request_url(url):
    dates = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    headers = {
        'cookie': 'BAIDU_SSP_lcr=https://www.baidu.com/link?url=qlVAL9zV4uL3n5tTxTm2qbTnlPMTlUdjgVr_W4svXMFjsbN3ADy1Z50rUXmuliM-gGpyre4deNLw2JJvlgy7JL13zWeS1wqxgu3orbz4vZC&wd=&eqid=9671cf32000310ef00000003613ea007; uuid_tt_dd=10_20960831740-1592037280587-766261; _ga=GA1.2.1049417259.1592301375; UN=wds326598; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_20960831740-1592037280587-766261!5744*1*wds326598; p_uid=U010000; __gads=ID=adb85208beef7041-221af0961acb00c1:T=1628472768:RT=1629764431:S=ALNI_MYFiqh-bQuAcUCJczfdiXp2MH854A; c_segment=5; dc_sid=f4a4fc6c468bb8df7a9239566abe3420; FCCDCF=[null,null,["[[],[],[],[],null,null,true]",1630046245527],null,null]; c_first_ref=www.baidu.com; csrfToken=P9etaQyF8vVaf95fvGvlRyM8; Hm_lvt_e5ef47b9f471504959267fd614d579cd=1629686745,1630373243,1630544882; Hm_lpvt_e5ef47b9f471504959267fd614d579cd=1630544882; dc_session_id=10_1631492414376.899948; firstDie=1; c_utm_medium=distribute.pc_search_result.none-task-blog-2%7Eall%7Efirst_rank_ecpm_v1%7Erank_v29_ecpm-10-112058213.first_rank_v2_pc_rank_v29; c_utm_term=Python%E5%AE%9E%E7%8E%B0%E5%8D%9A%E5%AE%A2%E5%88%B7%E9%87%8F; referrer_search=1631494235031; c_first_page=https%3A//blog.csdn.net/wds326598/article/details/120039413; SESSION=2907da69-08df-471a-8ba4-6f202bb8082a; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1631332169,1631353585,1631494371,1631494437; unlogin_scroll_step=1631494529135; c_pref=https%3A//blog.csdn.net/wds326598/article/details/120039413; c_ref=https%3A//blog.csdn.net/wds326598; acw_sc__v2=613ea229cfd8332006818f4c2931a686f0bcd045; UserName=wds326598; UserInfo=fd133d71f8154027a2ddb28bc2c27626; UserToken=fd133d71f8154027a2ddb28bc2c27626; UserNick=Ares%E7%A7%B0%E9%9B%84; AU=EC1; BT=1631494697658; ssxmod_itna=YqIx0D2GKWqGqBKGHD8ibe0K4+4Rh3uwYMqTxqGNeKoDZDiT4AP35qExwxKwAnzeRO093FqbELBYKoom8FdhD84i7DKqibDCqD1D3qDkbRYxiiTxieDiDYaaDmZP0kkHqD3DR=s=wQKDDLcDBoDGML6DBYD78=D38dDtqGn9SLWQqDfR86QVS3798U0IwRXD7UcBS3DCr+qbgDo0OD5z=oeG75NCBKkcY4=7GDN0hDKA0D1xKwOk9GMFB43mGxxD; ssxmod_itna2=YqIx0D2GKWqGqBKGHD8ibe0K4+4Rh3uwYMqTxA=WY43D/nexFODrK+20PAPksYzeAoqxqz/Y4Cv0s55+ioqOQu0w+jLiN8iIvmlCCMMUxH5Q7Cx+zc0whniw6Q=q2fMGDrFKZYCee6xfLYzRi3KRSXzSwfwST36Yq=VKTXx=4NOGW2jjuoUlMTNHbKiaRIzKZ3RbTIWweasUkasWWYbomcHa8cjWS9jUTf1KWrn0ZUkhltWvmHHhlHHkQu1+9fELnasqRcov=M9n6FjO=RwV960a4Z2+1nkTNc11hdSgRnS16ol56gXjvjBrMnbDkctB9leBl4ITQ5M9KDyyAtF+CB4rPTtcedY5L+xhaEmYA/AYkTV4P3=DPnxTQTaR+uChHKDkD4mjAKIRwC2GmO5ArsDRa4wF3fi1hi33Qo4ohhI/48Ff/jYHKWocYaZeA93rm7fGiK/qYdgLRD3D07q0DiBPdGYYAPxcKL660052cQiB5zDKGfK4gq/ADVrxex072q/GxiDDLxD2nhxKDqexQQ+xfbCPD===; Hm_up_6bcd52f51e9b3dce32bec4a3997715ac=%7B%22uid_%22%3A%7B%22value%22%3A%22wds326598%22%2C%22scope%22%3A1%7D%2C%22islogin%22%3A%7B%22value%22%3A%221%22%2C%22scope%22%3A1%7D%2C%22isonline%22%3A%7B%22value%22%3A%221%22%2C%22scope%22%3A1%7D%2C%22isvip%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%7D; log_Id_click=1213; c_page_id=default; dc_tos=qzclwa; log_Id_pv=2828; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1631495387; log_Id_view=5446',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36'
    }
    # Note: with urllib3 1.26.6 this format works; with 1.25 you may need
    # proxies = {"https": 'https://' + ip_addr}
    proxies = {"https": ip_addr}
    try:
        requests.get(url, headers=headers, proxies=proxies)
    except:
        # If the visit fails, print the time and the proxy that failed
        print("Failed to fetch", dates, ip_addr)
    else:
        # Visit succeeded
        print(url, ip_addr, dates, index)

if __name__ == '__main__':
    # Put the blog home page URLs in a list
    lists_url = ['https://blog.csdn.net/wds326598', 'https://blog.csdn.net/qq_36961698']
    # Check the command-line argument
    if len(sys.argv) == 2:
        ip_addr = sys.argv[1]
        while True:
            index += 1
            for main_url in lists_url:
                # Fetch each blog's article list in its own thread
                first_thread = threading.Thread(target=getAllUrl, args=(main_url,))
                first_thread.start()
            time.sleep(60)
    else:
        # Exactly one proxy IP must be supplied
        print('Must enter a proxy ip')
IV. Summary
The whole workflow needs three files:
- A scraper that collects HTTPS proxies from the sites above
- A checker that filters out the usable HTTPS proxies
- csdn_request3.py, the core file
The overall flow is not complicated. The main difficulty is getting usable proxy IPs: the more proxies you have, the more page views you can generate.
If you find other usable proxy IP sites, leave a comment and I will scrape them and share the results.
One more point (this article uses CSDN as the example):
Only high-quality posts attract more readers; inflating view counts is a cheap trick. Treat this as something to read and play with, nothing more.