每月需要数据做统计报告咋办

158 阅读1分钟

有时候公司需要一些行业分析报告,这时候前端需要自己想办法从其他网站采集一些数据进行分析。但项目往往是临时需求,而网站分析和数据采集框架的学习都需要大量时间,这时候建议使用爬虫代理:直接参考demo,通过爬虫代理自动切换IP,规避网站的反爬策略,快速采集数据以满足业务要求。例如HttpClient 4.x的demo如下:

import java.io.BufferedReader;

import java.io.InputStreamReader;

import java.io.IOException;

import java.net.URI;

import java.util.Arrays;

import java.util.ArrayList;

import java.util.HashSet;

import java.util.List;

import java.util.Set;

import org.apache.http.Header;

import org.apache.http.HeaderElement;

import org.apache.http.HttpHost;

import org.apache.http.auth.AuthScope;

import org.apache.http.auth.UsernamePasswordCredentials;

import org.apache.http.client.AuthCache;

import org.apache.http.client.CredentialsProvider;

import org.apache.http.client.HttpRequestRetryHandler;

import org.apache.http.client.config.RequestConfig;

import org.apache.http.client.config.AuthSchemes;

import org.apache.http.client.entity.GzipDecompressingEntity;

import org.apache.http.client.entity.UrlEncodedFormEntity;

import org.apache.http.client.methods.CloseableHttpResponse;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.client.methods.HttpPost;

import org.apache.http.client.methods.HttpRequestBase;

import org.apache.http.client.protocol.HttpClientContext;

import org.apache.http.config.Registry;

import org.apache.http.config.RegistryBuilder;

import org.apache.http.conn.socket.ConnectionSocketFactory;

import org.apache.http.conn.socket.LayeredConnectionSocketFactory;

import org.apache.http.conn.socket.PlainConnectionSocketFactory;

import org.apache.http.conn.ssl.SSLConnectionSocketFactory;

import org.apache.http.impl.auth.BasicScheme;

import org.apache.http.impl.client.BasicAuthCache;

import org.apache.http.impl.client.BasicCredentialsProvider;

import org.apache.http.impl.client.CloseableHttpClient;

import org.apache.http.impl.client.HttpClients;

import org.apache.http.impl.client.ProxyAuthenticationStrategy;

import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;

import org.apache.http.message.BasicHeader;

import org.apache.http.message.BasicNameValuePair;

import org.apache.http.NameValuePair;

import org.apache.http.util.EntityUtils;

/**
 * Minimal Apache HttpClient 4.x example: sends a GET request through an
 * authenticated HTTP proxy and prints the response status code and body.
 *
 * <p>The connection pool, proxy settings, credentials and the HTTP client are
 * created once in the static initializer and reused by every request.
 */
public class Demo {

    // Proxy server (vendor site: www.16yun.cn)
    static final String proxyHost = "t.16yun.cn";
    static final Integer proxyPort = 31000;

    // Proxy credentials (placeholders; replace with real values before running)
    static final String proxyUser = "username";
    static final String proxyPass = "password";

    private static final PoolingHttpClientConnectionManager cm;
    private static final HttpHost proxy;
    private static final CredentialsProvider credsProvider;
    private static final RequestConfig reqConfig;
    private static final CloseableHttpClient httpClient;

    static {
        Registry<ConnectionSocketFactory> registry = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.getSocketFactory())
                .register("https", SSLConnectionSocketFactory.getSocketFactory())
                .build();

        cm = new PoolingHttpClientConnectionManager(registry);
        cm.setMaxTotal(20);
        cm.setDefaultMaxPerRoute(5);

        proxy = new HttpHost(proxyHost, proxyPort, "http");

        // Register the credentials for the proxy only, so they are never
        // offered to a target server that happens to issue an auth challenge.
        credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(
                new AuthScope(proxyHost, proxyPort),
                new UsernamePasswordCredentials(proxyUser, proxyPass));

        // All three timeouts are 5000 ms.
        reqConfig = RequestConfig.custom()
                .setConnectionRequestTimeout(5000)
                .setConnectTimeout(5000)
                .setSocketTimeout(5000)
                .setExpectContinueEnabled(false)
                .setProxy(proxy)
                .build();

        // One client for the whole class instead of a new, never-closed
        // client on every request.
        httpClient = HttpClients.custom()
                .setConnectionManager(cm)
                .setDefaultCredentialsProvider(credsProvider)
                .build();
    }

    /**
     * Executes the given request through the proxy and prints the status code
     * followed by the response body to standard output.
     *
     * <p>Any exception raised while sending the request or reading the
     * response is caught and its stack trace is printed; nothing is rethrown.
     *
     * @param httpReq the request to execute; its headers and request config
     *                are overwritten by this method
     */
    public static void doRequest(HttpRequestBase httpReq) {
        CloseableHttpResponse httpResp = null;

        try {
            setHeaders(httpReq);
            httpReq.setConfig(reqConfig);

            // Seed the auth cache with a Basic scheme for the proxy host.
            AuthCache authCache = new BasicAuthCache();
            authCache.put(proxy, new BasicScheme());

            HttpClientContext localContext = HttpClientContext.create();
            localContext.setAuthCache(authCache);

            httpResp = httpClient.execute(httpReq, localContext);

            System.out.println(httpResp.getStatusLine().getStatusCode());

            // A response may carry no entity; EntityUtils rejects null.
            // UTF-8 is only the fallback when the response declares no charset.
            if (httpResp.getEntity() != null) {
                System.out.println(EntityUtils.toString(httpResp.getEntity(), "UTF-8"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (httpResp != null) {
                    httpResp.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Sets the request headers used by this demo.
     *
     * @param httpReq the request whose headers are modified
     */
    private static void setHeaders(HttpRequestBase httpReq) {
        // Optional Proxy-Tunnel header, left disabled. Enabling it also needs
        // an import of java.util.Random.
        // NOTE(review): presumably the tunnel value controls which exit IP the
        // proxy uses; confirm against the proxy vendor's documentation.
        // Random random = new Random();
        // int tunnel = random.nextInt(10000);
        // httpReq.setHeader("Proxy-Tunnel", String.valueOf(tunnel));

        // NOTE(review): a null value presumably stops HttpClient from adding
        // its own "Accept-Encoding: gzip,deflate" header; confirm.
        httpReq.setHeader("Accept-Encoding", null);
    }

    /**
     * Sends a GET request to the demo target page and prints the result.
     */
    public static void doGetRequest() {
        // Target page to fetch. The URL must include a scheme, otherwise
        // HttpClient cannot determine the target host.
        String targetUrl = "https://httpbin.org/ip";

        try {
            HttpGet httpGet = new HttpGet(targetUrl);
            doRequest(httpGet);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        doGetRequest();
    }
}