计算机网络基础架构与HTTP原理
-
计算机网络是如何工作的
- 在打开网页的时候发生了什么
-
计算机网络的最基础架构
- 很多主机通过ip地址相连接,ip地址相当于一个个门牌号,ipv4,版本4是32位的,有2的32次方大概42亿个地址
- IPv4全称叫Internet Protocol Version 4,意思是IP协议第四版
- IPv6是128位的,可以为地球上的每一粒沙子分配地址
- 简单地说就是:一个计算机网络就是很多台主机,彼此之间用线连接起来,每个主机有个ip地址
-
敲回车的时候发生什么
- 如果本地缓存了DNS记录,就不会再去请求网络上的DNS;本地的hosts文件可以覆盖网络DNS的结果,这样就不用去问那位问路的老大爷了
- DNS解析成ip地址,DNS就是问路无所不知的老大爷
- 端口port(原意港口的意思),https默认端口443,http默认80
-
开发的时候可以把hosts绑定到不同的环境上,本地强制让某个域名跳转到哪里去
-
TCP协议,传输控制协议(TCP,Transmission Control Protocol),三次握手四次挥手,(除此之外还有UDP协议,广播),全双工,像电话一样,双向的高速通道,定义了字节流在网上如何发送和接收
-
TCP协议之上有HTTP协议(HyperText Transfer Protocol,超文本传输协议),可以跑图片,音频等等,丰富多彩的世界,定义了文本之外的东西如何去传输
-
请求通信之前
-
之后得到响应response
-
一个完整的http响应
-
浏览器是如何工作的,数据如何被浏览器处理
- 通过html,来一点就解析一点
爬取豆瓣
package com.github.hcsp;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.io.InputStream;
/**
 * Minimal HTTP GET example: downloads the Douban Top 250 page and prints the
 * status line followed by the response body.
 */
public class Main {
    /**
     * Sends a GET request to the Douban Top 250 page and writes the HTTP status
     * line and the UTF-8 decoded body to standard output.
     *
     * @param args unused
     * @throws IOException if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws IOException {
        HttpGet httpGet = new HttpGet("https://movie.douban.com/top250");
        // try-with-resources closes the response first and then the client, so the
        // underlying connection pool is released even when reading the body fails.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpGet)) {
            // The status line is the first line of the HTTP response.
            System.out.println(response.getStatusLine());
            HttpEntity entity1 = response.getEntity();
            if (entity1 == null) {
                // Responses without a body have no entity; nothing more to print.
                return;
            }
            InputStream is = entity1.getContent();
            System.out.println(IOUtils.toString(is, "UTF-8"));
            // Make sure the body is fully consumed and its stream is closed.
            EntityUtils.consume(entity1);
        }
    }
}
-
使用jsoup解析html
package com.github.hcsp;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
/**
 * Downloads the issue list page of the gradle/gradle repository and uses jsoup
 * to print the text and link of every issue row found in the HTML.
 */
public class Main {
    /**
     * Fetches the issues page, prints the status line and the parsed document,
     * then prints the text and {@code href} attribute of each issue row.
     *
     * @param args unused
     * @throws IOException if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws IOException {
        HttpGet httpGet = new HttpGet("https://github.com/gradle/gradle/issues");
        // Request headers must be set before the request is executed.
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6)");
        httpGet.addHeader("Content-Type", "application/x-www-form-urlencoded");
        // try-with-resources closes the response first and then the client, so the
        // underlying connection pool is released even when parsing fails.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpGet)) {
            // The status line is the first line of the HTTP response.
            System.out.println(response.getStatusLine());
            HttpEntity entity1 = response.getEntity();
            if (entity1 == null) {
                // Responses without a body have no entity; nothing to parse.
                return;
            }
            InputStream is = entity1.getContent();
            String html = IOUtils.toString(is, "UTF-8");
            Document document = Jsoup.parse(html);
            System.out.println(document);
            // Iterate the selection directly instead of binding it to a concrete list type.
            for (Element element : document.select(".js-issue-row")) {
                // NOTE(review): this child path depends on the page's current markup
                // and throws if the row structure differs - confirm against the live page.
                Element link = element.child(0).child(1).child(0);
                System.out.println(link.text());
                System.out.println(link.attr("href"));
            }
        }
    }
}
- issues github api可以拿到结构化数据,有api的通过api去访问,api的好处是稳定的
为什么有些数据拿不到?
-
同步与异步加载
- 服务器端一次返回所有的数据
- 服务器端返回部分数据,使用AJAX异步加载
爬虫铭记的原则
- 只要浏览器能做到,其他代码一定能做到
课后训练题
- 1-1
package com.github.hcsp.http;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Reads pull request information for a GitHub repository through the GitHub
 * REST API and converts the JSON response into {@link GitHubPullRequest} objects.
 */
public class Crawler {
    /** Plain data holder describing one pull request. */
    static class GitHubPullRequest {
        // Number of the pull request
        int number;
        // Title of the pull request
        String title;
        // GitHub login name of the pull request author
        String author;

        GitHubPullRequest(int number, String title, String author) {
            this.number = number;
            this.title = title;
            this.author = author;
        }

        /** Returns a readable representation so printing a list of results is meaningful. */
        @Override
        public String toString() {
            return "GitHubPullRequest{number=" + number
                    + ", title='" + title + '\''
                    + ", author='" + author + '\''
                    + '}';
        }
    }

    /**
     * Returns the pull requests contained in the first page of the repository's
     * issue listing.
     *
     * <p>The request goes to the {@code /issues} endpoint; entries are kept only
     * when they carry a {@code pull_request} key (see {@link #traverse}).
     *
     * @param repo repository in {@code owner/name} form, e.g. {@code "golang/go"}
     * @return the pull requests found in the response, possibly empty
     * @throws IOException if the request fails, the server does not answer with
     *     status 200, or the response has no body
     */
    public static List<GitHubPullRequest> getFirstPageOfPullRequests(String repo) throws IOException {
        String target = "https://api.github.com/repos/" + repo + "/issues";
        System.out.println(target);
        HttpGet httpGet = new HttpGet(target);
        // Request headers must be set before the request is executed.
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6)");
        httpGet.addHeader("Content-Type", "application/x-www-form-urlencoded");
        // try-with-resources closes the response first and then the client, so the
        // underlying connection pool is released even when parsing fails.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpGet)) {
            // Error responses are not a JSON array of issues; fail with a clear
            // message instead of letting the JSON parser fail on them.
            if (response.getStatusLine().getStatusCode() != 200) {
                throw new IOException(
                        "GitHub API request to " + target + " failed: " + response.getStatusLine());
            }
            HttpEntity entity1 = response.getEntity();
            if (entity1 == null) {
                throw new IOException("GitHub API request to " + target + " returned no body");
            }
            InputStream is = entity1.getContent();
            String json = IOUtils.toString(is, "UTF-8");
            JSONArray issues = JSON.parseArray(json);
            return traverse(issues);
        }
    }

    public static void main(String[] args) throws IOException {
        System.out.println(
                getFirstPageOfPullRequests("gradle/gradle")
        );
    }

    /**
     * Converts a JSON array of issue objects into pull requests.
     *
     * <p>Only elements that have a non-null {@code pull_request} value are
     * converted; all other elements are skipped.
     *
     * @param issuesInfoList JSON array of issue objects; {@code null} is treated as empty
     * @return one {@link GitHubPullRequest} per pull request element, in input order
     */
    public static List<GitHubPullRequest> traverse(JSONArray issuesInfoList) {
        List<GitHubPullRequest> pullRequestsList = new ArrayList<GitHubPullRequest>();
        if (issuesInfoList == null) {
            return pullRequestsList;
        }
        for (int i = 0; i < issuesInfoList.size(); i++) {
            JSONObject issue = issuesInfoList.getJSONObject(i);
            if (issue == null || issue.get("pull_request") == null) {
                continue;
            }
            // The author object may be absent; keep the entry with a null author
            // rather than failing the whole conversion.
            JSONObject account = issue.getJSONObject("user");
            String author = account == null ? null : account.getString("login");
            // Typed getters avoid a ClassCastException when the parser yields a
            // numeric type other than Integer.
            int number = issue.getIntValue("number");
            String title = issue.getString("title");
            pullRequestsList.add(new GitHubPullRequest(number, title, author));
        }
        return pullRequestsList;
    }
}