记录爬虫刷博客访问量的可耻经历

20,268 阅读6分钟

记使用爬虫刷博客访问量的经历

1、选择技术

就个人而言,我对python和java都是比较熟悉的,这两种语言也支持写爬虫,所以在这我收集整理并修改了一下刷博客访问量有用且高效的代码。

2、python爬虫刷博客访问量

战前需要: python3、requests、selenium、chromedriver_win32.zip(这里版本要看你自己的关于chrome的版本,我这个是77的)

接下来先不用代理api,自己写一个list,放一些高匿ip进去试试,去高匿ip代理找,这里我使用的是:www.goubanjia.com/,这里每天有20个免费代理,至于西刺代理,真的不好说,高匿ip多,但是很多不能用,接下来 放代码:

viewFlow.py #这个是用来刷博客访问量的

import random
import requests
import time
from selenium import webdriver
import sys
import os

# 随机获取浏览器标识
def get_UA():
    UA_list = [
        "Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19",
        "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30",
        "Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
        "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"
    ]
    randnum = random.randint(0, len(UA_list)-1)
    h_list = UA_list[randnum]
    return h_list

# 获取代理IP
def get_ip():
    # # 这里填写大象代理api地址,num参数必须为1,每次只请求一个IP地址
    # url = 'http://www.baidu.com'
    # response = requests.get(url)
    # response.close()
    # ip = response.text
    # print(ip)
    ip_list = ['117.141.155.242:53281','112.250.107.37:53281','223.215.102.59:61234','121.61.89.48:61234','47.111.56.178:80','223.215.102.59:61234','112.250.107.37:53281','123.115.254.208:8118','218.2.226.42:80','222.94.212.39:8118','223.215.102.59:61234','117.141.155.242:53281']
    num = random.randint(0, len(ip_list)-1)
    ip = ip_list[num]
    return ip

if __name__ == '__main__':
    url = "https://juntech.top/posts/c2d8ad83.html"
    # 无限循环,每次都要打开一个浏览器窗口,不是标签
    while 1:
        # 调用函数获取浏览器标识, 字符串
        headers = get_UA()
        # 调用函数获取IP代理地址,这里获取是字符串,而不是像前两个教程获得的是数组
        print(get_ip())
        proxy = get_ip()
        # 使用chrome自定义
        chrome_options = webdriver.ChromeOptions()
        # 设置代理
        chrome_options.add_argument('--proxy-server=http://'+proxy)
        # 设置UA
        chrome_options.add_argument('--user-agent="'+headers+'"')
        # 使用设置初始化webdriver
        driver = webdriver.Chrome(chrome_options=chrome_options)

        try:
            # 访问超时30秒
            driver.set_page_load_timeout(30)
            # 访问网页
            driver.get(url)
            # 退出当前浏览器
            driver.close()
            # 延迟1~3秒继续
            time_delay = random.randint(1, 3)
            while time_delay > 0:
                print(str(time_delay) + " seconds left!!")
                time.sleep(1)
                time_delay = time_delay - 1
                pass
        except:
            print("timeout")
            # 退出浏览器
            driver.quit()
            time.sleep(1)
            # 重启脚本, 之所以选择重启脚本是因为,长时间运行该脚本会出现一些莫名其妙的问题,不如重启解决
            python = sys.executable
            os.execl(python, python, *sys.argv)
        finally:
            pass

getIp.py #这个是用来保存西刺代理的可用的高匿ip的:

# -*- coding: utf-8 -*-
"""
Created on Sat Mar 03 19:06:18 2018
@author: Administrator
"""
 
import urllib3
import re
import requests
import time
from threading import Thread
from threading import Lock
from queue import Queue
 
#从西刺抓下来的所有代理ip
all_find_list=[]
#将所有抓到的代理压入队列,四个线程可以从队列中获取代理ip
gaoni_queue=Queue()
#能够成功连接的代理ip
success_list=[]
 
lock=Lock()
 
def get_proxy(checking_ip):
    #根据得到的代理ip,设置proxy的格式
    proxy_ip = 'http://' + checking_ip
    proxy_ips = 'https://' + checking_ip
    proxy = {'https': proxy_ips, 'http': proxy_ip}
    return proxy
    
def checking_ip():
    global gaoni_queue
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    
    while 1:
        #若从队列1秒内无法获得代理ip,说明所有代理均已检测完成,抛出Empty异常
        try:
            checking_ip = gaoni_queue.get(True,1)
        except:
            gaoni_queue.task_done()
            break
            
        proxy=get_proxy(checking_ip)
        url = 'https://juntech.top/posts/sds15482.html'
        #使用上面的url,测试代理ip是否能够链接
        try:
            page = requests.get(url, headers=headers, proxies=proxy)
        except:
            lock.acquire()
            print( checking_ip,'失败')
            lock.release()
        else:
            lock.acquire()
            print( checking_ip,'成功')
            success_list.append(checking_ip)
            lock.release()
 
 
def get_all():
    headers = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    global all_find_list
    for i in range(1,10):
        #从xici网站的高匿页面获取ip
        url='http://www.xicidaili.com/nn/%d'%i
        r = requests.get(url,headers=headers)
        data=r.text
        #抓取所需数据的正则表达式
        p=r'<td>(.*?)</td>\s+<td>(.*?)</td>\s+<td>\s+(.*?)\s+</td>\s+<td class="country">(.*?)</td>'
        find_list=re.findall(p,data)
        all_find_list+=find_list
    #将ip地址与端口组成规定格式
    for row in all_find_list:
        ip=row[0]+':'+row[1]
        gaoni_queue.put(ip)
        
 
 
if __name__=='__main__':
    get_all()
    print( gaoni_queue.qsize())
    thread_1=Thread(target=checking_ip)
    thread_2=Thread(target=checking_ip)
    thread_3=Thread(target=checking_ip)
    thread_4=Thread(target=checking_ip)
    thread_1.start()
    thread_2.start()
    thread_3.start()    
    thread_4.start()
    thread_1.join()
    thread_2.join()
    thread_3.join()
    thread_4.join()
    f=open("ip.txt","w")
    for row in success_list:
        f.write(row+'\n')
    f.close()

运行完会生成一个ip.txt,里面保存了你设置的页码中可使用的ip及端口信息

IncreaseFlow.py #这个和getIp.py可以连起来用,和第一个没关系:

import urllib3
import requests
import time
import random
#定义代理列表,以便使用随机代理访问,
user_agent_list=[
            'Mozilla/5.0(compatible;MSIE9.0;WindowsNT6.1;Trident/5.0)',
            'Mozilla/4.0(compatible;MSIE8.0;WindowsNT6.0;Trident/4.0)',
            'Mozilla/4.0(compatible;MSIE7.0;WindowsNT6.0)',
            'Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11',
            'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',  
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36' 
]
#url列表,包含要访问的网站
url_list=[
        'https://juntech.top/posts/sds15482.html',
        'https://juntech.top',
        'https://juntech.top/posts/9211474f.html',
        'https://juntech.top/posts/95e45310.html',
        'https://juntech.top/posts/898da94c.html',
        'https://juntech.top/posts/b829b66a.html'
        
        ]

#生成ip地址列表,以便使用代理ip访问,这里ip地址从txt文档中读取
def get_proxy_ip_list():
    global proxy_ip_list
    proxy_list=[]
    proxy_ip_list=[]
    print("导入proxy_list...")
    f=open("ip.txt")
    line=f.readline().strip('\n')
    while line:
        proxy_list.append(line)
        line=f.readline().strip('\n')
    f.close()
    for i in range(len(proxy_list)):
        ip = 'http://' + proxy_list[i]
        proxy_ip_list.append(ip)
    return proxy_ip_list
#另一种生成ip地址列表的方法,这里ip地址直接写在函数中的ip_list中
def get_proxy_ip_list2():
    global proxy_ip_list2
    proxy_ip_list2=[]
    ip_list=['61.135.217.7:80','118.190.95.35:9001','112.115.57.20:3128','124.235.181.175:80']
    for i in range(len(ip_list)):
        ip = 'http://' + ip_list[i]
        proxy_ip_list2.append(ip)
    print(proxy_ip_list2)
    return proxy_ip_list2  
    
#生成爬虫,爬取网页    
def visit(url,headers,proxies):
    times=0
    while(1):
        url=random.choice(url_list)
        header={'User_Agent':random.choice(user_agent_list)}
        proxy={'http':random.choice(proxies)}
        try:
            #print '%s%s%s'%(url,header,proxy)
            res=requests.get(url,headers=header,proxies=proxy)
            print(res.headers)
        except:
            print('wrong')
            time.sleep(0.1)
        else:
            print('visit %d times'%times)
            time.sleep(random.random())
            # time.sleep(15)
            times+=1 
        if times%20==0:
            time.sleep(15)
        
if __name__=="__main__":
    get_proxy_ip_list()
    visit(url_list,user_agent_list,proxy_ip_list)

在这里推荐使用第一个!!!

3、jsoup爬虫刷博客访问量

这个没什么好说的,会java的很容易理解,至于jsoup,查看官网,就可以很轻松的配合httpClient写出一个爬虫,代码如下:

新建一个maven工程:

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>top.juntech</groupId>
    <artifactId>viewflow</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.12.1</version>
        </dependency>
        <dependency>
            <groupId></groupId>
            <artifactId></artifactId>
        </dependency>
        <!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
            <version>1.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.10</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>5.1.8.RELEASE</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
    </dependencies>

</project>

创建爬虫程序:

package top.juntech;


import org.apache.commons.logging.Log;
        import org.apache.commons.logging.LogFactory;
        import org.jsoup.Connection;
        import org.jsoup.Jsoup;
        import org.jsoup.nodes.Document;
        import org.springframework.util.StringUtils;

        import java.net.InetSocketAddress;
        import java.net.Proxy;
        import java.util.ArrayList;
        import java.util.List;
        import java.util.Random;

public class Spider {

    private static Log logger = LogFactory.getLog(Spider.class);

    private static List<String> list = new ArrayList<String>();

    /**
     * 随机获取一个Ip地址
     * @return
     */
    public static String[] getRandomIp() {
        if (list.size() == 0) {
            list.add("47.111.56.178:80");
            list.add("121.61.89.48:61234");
            list.add("222.94.212.39:8118");
            list.add("111.29.3.222:8080");
            list.add("111.29.3.188:80");
            list.add("123.115.254.208:8118");
            list.add("183.247.152.98:53281");
            list.add("58.254.220.116:53579");
            list.add("27.159.167.176:9999");
            list.add("114.248.0.31:8060");
            list.add("218.2.226.42:80");
            list.add("27.208.28.139:8060");
            list.add("58.246.3.178:53281");
            list.add("115.29.42.152:80");
            list.add("183.234.241.105:8118");
            list.add("223.85.196.75:9999");
            list.add("223.111.131.101:8080");
            list.add("27.203.242.102:8060");
            list.add("223.215.102.59:61234");
            list.add("47.111.56.178:80");
            list.add("121.61.89.48:61234");
            list.add("223.215.102.59:61234");
            list.add("112.250.107.37:53281");
            list.add("117.141.155.242:53281");

        }

        Random random = new Random();
        int n = random.nextInt(list.size());
        return list.get(n).split(":");
    }

    /**
     * Jsoup打开连接地址获取Document对象
     * @param url
     * @return
     */
    public static Document getDocument(String url) {
        try {
            Connection conn = Jsoup.connect(url).ignoreContentType(true).ignoreHttpErrors(true).userAgent("Mozilla");
            // 设置代理
            String ip[] = Spider.getRandomIp();
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(ip[0].trim(), Integer.parseInt(ip[1].trim())));
            conn.proxy(proxy);
            // 设置超时时间并获取Document对象
            Document document = conn.timeout(8000).get();
            if (null != document && !StringUtils.isEmpty(document.toString())) {// 表示ip被拦截或者其他情况
                System.out.println(proxy.toString());
                return document;
            }
        } catch (Exception e) {
            logger.error("抓取失败...");
        }
        return null;
    }

}

创建测试类:

package top.juntech;

import org.jsoup.nodes.Document;
import org.junit.Test;

import java.util.ArrayList;
import java.util.List;
import java.util.Random;

public class MyTest {

    List<String> list = new ArrayList<String>();

    @Test
    public void testDetail1() throws InterruptedException {
        list.add("https://juntech.top/posts/c2d8ad83.html");
        list.add("https://juntech.top/posts/9211474f.html");
        list.add("https://juntech.top/posts/95e45310.html");
        list.add("https://juntech.top/posts/898da94c.html");


        // 随机访问其中一篇博客
        for (int i = 0; i <= 1000; i++) {
            Random random = new Random();
            int n = random.nextInt(list.size());
            String url = list.get(n);
            Document doc = Spider.getDocument(url);
            if (null != doc) {
                System.out.println("第" + i + "次抓取,url: " + url);
            }
        }
    }

}

4、注意

爬虫代理ip刷博客访问量是很可耻的,请小心使用,毕竟博客文章的精彩程度会给你带来大量的流量,而不是要靠这种偏门的手段去获取,这只会使自己 更懒惰!