使用爬虫代理采集网站失败的解决方法

606 阅读2分钟

爬虫程序采集网站必须使用动态代理,才能避免出现网站访问频繁的限制,这是众所周知的。但是在具体采集网站的过程中,即使使用了动态代理依然会出现403、503或429的反爬错误,这是为什么呢?根据以往的经验,一般来说是因为以下几个原因造成的:

1、动态User-Agent的修改

爬虫程序采集网站,正常的HTTP请求都需要进行ua(User-Agent)优化,因为ua是浏览器标识,如果http请求没有ua,甚至有些爬虫程序主动标示为采集,那么目标网站拒绝采集的可能性很高

2、控制单个代理IP的请求频率

虽然爬虫程序使用了动态代理,但是如果程序的多线程控制实现不好,会导致单个代理IP在短时间内发出大量的请求,导致该IP被访问频繁

3、IP有效时间的管理

动态代理IP使用过程中,必须进行存活检查,一旦发现延迟较高、带宽很低的代理IP,应该主动丢弃,避免使用过程中出现超时的情况

如果觉得上面的工作太麻烦,推荐使用自动转发的爬虫代理加强版,这种产品能实现每个http请求自动分配不同的代理IP转发,同时进行IP池的自动多线程管理,确保了请求联通率99%以上同时延迟低于300ms,可以快速上手采集网站,下面是产品demo可以直接复制使用,配置代理参数(proxyHost、proxyPort、proxyUser、proxyPass)和目标网站(targetUrl)就可以Run:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.ProxyAuthenticationStrategy;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.NameValuePair;
import org.apache.http.util.EntityUtils;

public class Demo
{
    // 代理服务器(产品官网 www.16yun.cn)
    final static String proxyHost = "t.16yun.cn";
    final static Integer proxyPort = 31000;

    // 代理验证信息
    final static String proxyUser = "username";
    final static String proxyPass = "password";


    private static PoolingHttpClientConnectionManager cm = null;
    private static HttpRequestRetryHandler httpRequestRetryHandler = null;
    private static HttpHost proxy = null;

    private static CredentialsProvider credsProvider = null;
    private static RequestConfig reqConfig = null;

    static {
        ConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
        LayeredConnectionSocketFactory sslsf = SSLConnectionSocketFactory.getSocketFactory();

        Registry registry = RegistryBuilder.create()
            .register("http", plainsf)
            .register("https", sslsf)
            .build();

        cm = new PoolingHttpClientConnectionManager(registry);
        cm.setMaxTotal(20);
        cm.setDefaultMaxPerRoute(5);

        proxy = new HttpHost(proxyHost, proxyPort, "http");

        credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(AuthScope.ANY, new UsernamePasswordCredentials(proxyUser, proxyPass));

        reqConfig = RequestConfig.custom()
            .setConnectionRequestTimeout(5000)
            .setConnectTimeout(5000)
            .setSocketTimeout(5000)
            .setExpectContinueEnabled(false)
            .setProxy(new HttpHost(proxyHost, proxyPort))
            .build();
    }

    public static void doRequest(HttpRequestBase httpReq) {
        CloseableHttpResponse httpResp = null;

        try {
            setHeaders(httpReq);

            httpReq.setConfig(reqConfig);

            CloseableHttpClient httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .setDefaultCredentialsProvider(credsProvider)
                .build();

            AuthCache authCache = new BasicAuthCache();
            authCache.put(proxy, new BasicScheme());

            HttpClientContext localContext = HttpClientContext.create();
            localContext.setAuthCache(authCache);

            httpResp = httpClient.execute(httpReq, localContext);

            int statusCode = httpResp.getStatusLine().getStatusCode();

            System.out.println(statusCode);

            BufferedReader rd = new BufferedReader(new InputStreamReader(httpResp.getEntity().getContent()));

            String line = "";
            while((line = rd.readLine()) != null) {
                System.out.println(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (httpResp != null) {
                    httpResp.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * 设置请求头
     *
     * @param httpReq
     */
    private static void setHeaders(HttpRequestBase httpReq) {

        // 设置Proxy-Tunnel
        // Random random = new Random();
        // int tunnel = random.nextInt(10000);
        // httpReq.setHeader("Proxy-Tunnel", String.valueOf(tunnel));

        httpReq.setHeader("Accept-Encoding", null);

    }


    public static void doGetRequest() {
        // 要访问的目标页面
        String targetUrl = "https://httpbin.org/ip";


        try {
            HttpGet httpGet = new HttpGet(targetUrl);

            doRequest(httpGet);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        doGetRequest();


    }
}