记使用爬虫刷博客访问量的经历
1、选择技术
就个人而言,我对python和java都是比较熟悉的,这两种语言也支持写爬虫,所以在这我收集整理并修改了一下刷博客访问量有用且高效的代码。
2、python爬虫刷博客访问量
战前需要: python3、requests、selenium、chromedriver_win32.zip(这里版本要看你自己的关于chrome的版本,我这个是77的)
接下来先不用代理api,自己写一个list,放一些高匿ip进去试试,去高匿ip代理找,这里我使用的是:www.goubanjia.com/,这里每天有20个免费代理,至于西刺代理,真的不好说,高匿ip多,但是很多不能用,接下来 放代码:
viewFlow.py #这个是用来刷博客访问量的
import random
import requests
import time
from selenium import webdriver
import sys
import os
# 随机获取浏览器标识
def get_UA():
UA_list = [
"Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19",
"Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
"Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
]
randnum = random.randint(0, len(UA_list)-1)
h_list = UA_list[randnum]
return h_list
# 获取代理IP
def get_ip():
# # 这里填写大象代理api地址,num参数必须为1,每次只请求一个IP地址
# url = 'http://www.baidu.com'
# response = requests.get(url)
# response.close()
# ip = response.text
# print(ip)
ip_list = ['117.141.155.242:53281','112.250.107.37:53281','223.215.102.59:61234','121.61.89.48:61234','47.111.56.178:80','223.215.102.59:61234','112.250.107.37:53281','123.115.254.208:8118','218.2.226.42:80','222.94.212.39:8118','223.215.102.59:61234','117.141.155.242:53281']
num = random.randint(0, len(ip_list)-1)
ip = ip_list[num]
return ip
if __name__ == '__main__':
url = "https://juntech.top/posts/c2d8ad83.html"
# 无限循环,每次都要打开一个浏览器窗口,不是标签
while 1:
# 调用函数获取浏览器标识, 字符串
headers = get_UA()
# 调用函数获取IP代理地址,这里获取是字符串,而不是像前两个教程获得的是数组
print(get_ip())
proxy = get_ip()
# 使用chrome自定义
chrome_options = webdriver.ChromeOptions()
# 设置代理
chrome_options.add_argument('--proxy-server=http://'+proxy)
# 设置UA
chrome_options.add_argument('--user-agent="'+headers+'"')
# 使用设置初始化webdriver
driver = webdriver.Chrome(chrome_options=chrome_options)
try:
# 访问超时30秒
driver.set_page_load_timeout(30)
# 访问网页
driver.get(url)
# 退出当前浏览器
driver.close()
# 延迟1~3秒继续
time_delay = random.randint(1, 3)
while time_delay > 0:
print(str(time_delay) + " seconds left!!")
time.sleep(1)
time_delay = time_delay - 1
pass
except:
print("timeout")
# 退出浏览器
driver.quit()
time.sleep(1)
# 重启脚本, 之所以选择重启脚本是因为,长时间运行该脚本会出现一些莫名其妙的问题,不如重启解决
python = sys.executable
os.execl(python, python, *sys.argv)
finally:
pass
getIp.py #这个是用来保存西刺代理的可用的高匿ip的:
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 03 19:06:18 2018
@author: Administrator
"""
import urllib3
import re
import requests
import time
from threading import Thread
from threading import Lock
from queue import Queue
#从西刺抓下来的所有代理ip
all_find_list=[]
#将所有抓到的代理压入队列,四个线程可以从队列中获取代理ip
gaoni_queue=Queue()
#能够成功连接的代理ip
success_list=[]
lock=Lock()
def get_proxy(checking_ip):
#根据得到的代理ip,设置proxy的格式
proxy_ip = 'http://' + checking_ip
proxy_ips = 'https://' + checking_ip
proxy = {'https': proxy_ips, 'http': proxy_ip}
return proxy
def checking_ip():
global gaoni_queue
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
while 1:
#若从队列1秒内无法获得代理ip,说明所有代理均已检测完成,抛出Empty异常
try:
checking_ip = gaoni_queue.get(True,1)
except:
gaoni_queue.task_done()
break
proxy=get_proxy(checking_ip)
url = 'https://juntech.top/posts/sds15482.html'
#使用上面的url,测试代理ip是否能够链接
try:
page = requests.get(url, headers=headers, proxies=proxy)
except:
lock.acquire()
print( checking_ip,'失败')
lock.release()
else:
lock.acquire()
print( checking_ip,'成功')
success_list.append(checking_ip)
lock.release()
def get_all():
headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
global all_find_list
for i in range(1,10):
#从xici网站的高匿页面获取ip
url='http://www.xicidaili.com/nn/%d'%i
r = requests.get(url,headers=headers)
data=r.text
#抓取所需数据的正则表达式
p=r'<td>(.*?)</td>\s+<td>(.*?)</td>\s+<td>\s+(.*?)\s+</td>\s+<td class="country">(.*?)</td>'
find_list=re.findall(p,data)
all_find_list+=find_list
#将ip地址与端口组成规定格式
for row in all_find_list:
ip=row[0]+':'+row[1]
gaoni_queue.put(ip)
if __name__=='__main__':
get_all()
print( gaoni_queue.qsize())
thread_1=Thread(target=checking_ip)
thread_2=Thread(target=checking_ip)
thread_3=Thread(target=checking_ip)
thread_4=Thread(target=checking_ip)
thread_1.start()
thread_2.start()
thread_3.start()
thread_4.start()
thread_1.join()
thread_2.join()
thread_3.join()
thread_4.join()
f=open("ip.txt","w")
for row in success_list:
f.write(row+'\n')
f.close()
运行完会生成一个ip.txt,里面保存了你设置的页码中可使用的ip及端口信息
IncreaseFlow.py #这个和getIp.py可以连起来用,和第一个没关系:
import urllib3
import requests
import time
import random
#定义代理列表,以便使用随机代理访问,
user_agent_list=[
'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
]
#url列表,包含要访问的网站
url_list=[
'https://juntech.top/posts/sds15482.html',
'https://juntech.top',
'https://juntech.top/posts/9211474f.html',
'https://juntech.top/posts/95e45310.html',
'https://juntech.top/posts/898da94c.html',
'https://juntech.top/posts/b829b66a.html'
]
#生成ip地址列表,以便使用代理ip访问,这里ip地址从txt文档中读取
def get_proxy_ip_list():
global proxy_ip_list
proxy_list=[]
proxy_ip_list=[]
print("导入proxy_list...")
f=open("ip.txt")
line=f.readline().strip('\n')
while line:
proxy_list.append(line)
line=f.readline().strip('\n')
f.close()
for i in range(len(proxy_list)):
ip = 'http://' + proxy_list[i]
proxy_ip_list.append(ip)
return proxy_ip_list
#另一种生成ip地址列表的方法,这里ip地址直接写在函数中的ip_list中
def get_proxy_ip_list2():
global proxy_ip_list2
proxy_ip_list2=[]
ip_list=['61.135.217.7:80','118.190.95.35:9001','112.115.57.20:3128','124.235.181.175:80']
for i in range(len(ip_list)):
ip = 'http://' + ip_list[i]
proxy_ip_list2.append(ip)
print(proxy_ip_list2)
return proxy_ip_list2
#生成爬虫,爬取网页
def visit(url,headers,proxies):
times=0
while(1):
url=random.choice(url_list)
header={'User_Agent':random.choice(user_agent_list)}
proxy={'http':random.choice(proxies)}
try:
#print '%s%s%s'%(url,header,proxy)
res=requests.get(url,headers=header,proxies=proxy)
print(res.headers)
except:
print('wrong')
time.sleep(0.1)
else:
print('visit %d times'%times)
time.sleep(random.random())
# time.sleep(15)
times+=1
if times%20==0:
time.sleep(15)
if __name__=="__main__":
get_proxy_ip_list()
visit(url_list,user_agent_list,proxy_ip_list)
在这里推荐使用第一个!!!
3、jsoup爬虫刷博客访问量
这个没什么好说的,会java的很容易理解,至于jsoup,查看官网,就可以很轻松的配合httpClient写出一个爬虫,代码如下:
新建一个maven工程:
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>top.juntech</groupId>
<artifactId>viewflow</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<dependency>
<groupId></groupId>
<artifactId></artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.10</version>
</dependency>
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-core</artifactId>
<version>5.1.8.RELEASE</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
</dependencies>
</project>
创建爬虫程序:
package top.juntech;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.util.StringUtils;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
public class Spider {
private static Log logger = LogFactory.getLog(Spider.class);
private static List<String> list = new ArrayList<String>();
/**
* 随机获取一个Ip地址
* @return
*/
public static String[] getRandomIp() {
if (list.size() == 0) {
list.add("47.111.56.178:80");
list.add("121.61.89.48:61234");
list.add("222.94.212.39:8118");
list.add("111.29.3.222:8080");
list.add("111.29.3.188:80");
list.add("123.115.254.208:8118");
list.add("183.247.152.98:53281");
list.add("58.254.220.116:53579");
list.add("27.159.167.176:9999");
list.add("114.248.0.31:8060");
list.add("218.2.226.42:80");
list.add("27.208.28.139:8060");
list.add("58.246.3.178:53281");
list.add("115.29.42.152:80");
list.add("183.234.241.105:8118");
list.add("223.85.196.75:9999");
list.add("223.111.131.101:8080");
list.add("27.203.242.102:8060");
list.add("223.215.102.59:61234");
list.add("47.111.56.178:80");
list.add("121.61.89.48:61234");
list.add("223.215.102.59:61234");
list.add("112.250.107.37:53281");
list.add("117.141.155.242:53281");
}
Random random = new Random();
int n = random.nextInt(list.size());
return list.get(n).split(":");
}
/**
* Jsoup打开连接地址获取Document对象
* @param url
* @return
*/
public static Document getDocument(String url) {
try {
Connection conn = Jsoup.connect(url).ignoreContentType(true).ignoreHttpErrors(true).userAgent("Mozilla");
// 设置代理
String ip[] = Spider.getRandomIp();
Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip[0].trim(), Integer.parseInt(ip[1].trim())));
conn.proxy(proxy);
// 设置超时时间并获取Document对象
Document document = conn.timeout(8000).get();
if (null != document && !StringUtils.isEmpty(document.toString())) {// 表示ip被拦截或者其他情况
System.out.println(proxy.toString());
return document;
}
} catch (Exception e) {
logger.error("抓取失败...");
}
return null;
}
}
创建测试类:
package top.juntech;
import org.jsoup.nodes.Document;
import org.junit.Test;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
public class MyTest {
List<String> list = new ArrayList<String>();
@Test
public void testDetail1() throws InterruptedException {
list.add("https://juntech.top/posts/c2d8ad83.html");
list.add("https://juntech.top/posts/9211474f.html");
list.add("https://juntech.top/posts/95e45310.html");
list.add("https://juntech.top/posts/898da94c.html");
// 随机访问其中一篇博客
for (int i = 0; i <= 1000; i++) {
Random random = new Random();
int n = random.nextInt(list.size());
String url = list.get(n);
Document doc = Spider.getDocument(url);
if (null != doc) {
System.out.println("第" + i + "次抓取,url: " + url);
}
}
}
}
4、注意
爬虫代理ip刷博客访问量是很可耻的,请小心使用,毕竟博客文章的精彩程度会给你带来大量的流量,而不是要靠这种偏门的手段去获取,这只会使自己 更懒惰!