22中级 - Java Web:Web前后端架构分析【爬虫项目实战】

189 阅读3分钟

计算机网络基础架构与HTTP原理

  • 计算机网络是如何工作的

    • 在打开网页的时候发生了什么
  • 计算机网络的最基础架构

    • 很多主机通过ip地址相连接,ip地址相当于一个个门牌号,ipv4,版本4是32位的,有2的32次方大概42亿个地址
    • IPv4全称叫Internet Protocol Version 4,意思是IP协议第四版
    • IPv6是128位的,可以为地球上的每一粒沙子分配地址
    • 简单的说就是:一个计算机网络就是很多台主机,彼此之间用线连接起来,每个主机有个ip地址 1.png
  • 敲回车的时候发生什么

    • 如果本地缓存了dns就不会再去请求网络的dns,本地可以覆盖掉网络dns,host文件,就不用去问开门老大爷
    • DNS解析成ip地址,DNS就是问路无所不知的老大爷
    • 端口port(原意港口的意思),https默认端口443,http默认80
  • 开发的时候可以把hosts绑定到不同的环境上,本地强制让某个域名跳转到哪里去
    2.png

  • TCP协议,传输控制协议(TCP,Transmission Control Protocol),三次握手四次挥手,(除此之外还有UDP协议,广播),全双工,像电话一样,双向的高速通道,定义了字节流在网上如何发送和接受

3.png

  • TCP协议之上有HTTP协议(HyperText Transfer Protocol,超文本传输协议),可以跑图片,音频等等,丰富多彩的世界,定义了文本之外的东西如何去传输

  • 请求通信之前

4.png

  • 之后得到响应response

  • 一个完整的http响应

5.png

  • 浏览器是如何工作的,数据如何被浏览器处理

    • 通过html,来一点就解析一点

爬取豆瓣

package com.github.hcsp;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.io.InputStream;

public class Main {
    /**
     * Fetches the Douban Top 250 movie page and prints the HTTP status line
     * followed by the full response body.
     *
     * @param args unused command-line arguments
     * @throws IOException if the network request or response reading fails
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources guarantees the client, response and stream are all
        // closed even on failure — the original leaked the HttpClient and the
        // InputStream, closing only the response in a manual finally block.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            // Issue an HTTP GET request
            HttpGet httpGet = new HttpGet("https://movie.douban.com/top250");
            try (CloseableHttpResponse response = httpclient.execute(httpGet)) {
                // getStatusLine is the first line of the HTTP response, e.g. "HTTP/1.1 200 OK"
                System.out.println(response.getStatusLine());
                HttpEntity entity = response.getEntity();
                try (InputStream is = entity.getContent()) {
                    System.out.println(IOUtils.toString(is, "UTF-8"));
                }
                // Ensure the entity is fully consumed so the connection can be reused
                EntityUtils.consume(entity);
            }
        }
    }
}
package com.github.hcsp;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;

public class Main {
    /**
     * Crawls the Gradle issues page on GitHub and prints, for each issue row,
     * the issue title and its relative link.
     *
     * @param args unused command-line arguments
     * @throws IOException if the network request or response reading fails
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources closes the client, response and stream — the
        // original never closed the HttpClient or the InputStream.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet("https://github.com/gradle/gradle/issues");
            // Pretend to be a browser: GitHub may reject requests without a User-Agent
            httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6)");
            // Declared request content type
            httpGet.addHeader("Content-Type", "application/x-www-form-urlencoded");

            try (CloseableHttpResponse response = httpclient.execute(httpGet)) {
                // getStatusLine is the first line of the HTTP response
                System.out.println(response.getStatusLine());
                HttpEntity entity = response.getEntity();
                String html;
                try (InputStream is = entity.getContent()) {
                    html = IOUtils.toString(is, "UTF-8");
                }

                Document document = Jsoup.parse(html);
                System.out.println(document);
                // Iterate the selection directly instead of binding it to the
                // concrete ArrayList<Element> type as the original did.
                for (Element element : document.select(".js-issue-row")) {
                    // Assumes GitHub's markup nests the title link at
                    // child(0).child(1).child(0) — TODO: fragile, verify against
                    // the current page structure.
                    Element link = element.child(0).child(1).child(0);
                    System.out.println(link.text());
                    System.out.println(link.attr("href"));
                }
            }
        }
    }
}
  • issues github api可以拿到结构化数据,有api的通过api去访问,api的好处是稳定的

为什么有些数据拿不到?

  • 同步与异步加载

    • 服务器端一次返回所有的数据
    • 服务器端返回部分数据,使用AJAX异步加载

爬虫铭记的原则

  • 只要浏览器能做到,其他代码一定能做到

课后训练题

  • 1-1
package com.github.hcsp.http;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

public class Crawler {
    /** Simple value holder describing one GitHub pull request. */
    static class GitHubPullRequest {
        // Pull request number
        int number;
        // Pull request title
        String title;
        // GitHub login name of the pull request's author
        String author;

        GitHubPullRequest(int number, String title, String author) {
            this.number = number;
            this.title = title;
            this.author = author;
        }
    }

    /**
     * Given a repository name, e.g. "golang/go" or "gradle/gradle", returns the
     * first page of its pull requests fetched from the GitHub REST API.
     *
     * @param repo "owner/name" repository identifier
     * @return pull requests parsed from the first page of the /issues endpoint
     * @throws IOException if the network request or response reading fails
     */
    public static List<GitHubPullRequest> getFirstPageOfPullRequests(String repo) throws IOException {
        // Plain concatenation suffices; the original built a StringBuilder only
        // to convert it straight back into a String.
        String url = "https://api.github.com/repos/" + repo + "/issues";
        HttpGet httpGet = new HttpGet(url);
        // GitHub's API rejects requests that lack a User-Agent header
        httpGet.addHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.6)");
        // Declared request content type
        httpGet.addHeader("Content-Type", "application/x-www-form-urlencoded");

        // try-with-resources closes both the client and the response; the
        // original never closed the HttpClient at all.
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(httpGet)) {
            HttpEntity entity = response.getEntity();
            try (InputStream is = entity.getContent()) {
                String json = IOUtils.toString(is, "UTF-8");
                return traverse(JSON.parseArray(json));
            }
        }
    }

    public static void main(String[] args) throws IOException {
        System.out.println(
                getFirstPageOfPullRequests("gradle/gradle")
        );
    }

    /**
     * Converts the raw JSON array returned by the /issues endpoint into
     * pull-request records. The endpoint returns both issues and pull
     * requests; only entries carrying a "pull_request" field are PRs.
     *
     * @param issuesInfoList JSON array of issue objects
     * @return the pull requests contained in the array, in order
     */
    public static List<GitHubPullRequest> traverse(JSONArray issuesInfoList) {
        List<GitHubPullRequest> pullRequests = new ArrayList<GitHubPullRequest>();
        for (int i = 0; i < issuesInfoList.size(); i++) {
            // Fetch each element once instead of re-reading it per field
            JSONObject issue = issuesInfoList.getJSONObject(i);
            if (issue.get("pull_request") == null) {
                continue; // plain issue, not a pull request
            }
            // Typed fastjson getters replace the original's raw casts; the
            // original also named a local variable "JSONArray", shadowing the type.
            int number = issue.getIntValue("number");
            String title = issue.getString("title");
            String author = issue.getJSONObject("user").getString("login");
            pullRequests.add(new GitHubPullRequest(number, title, author));
        }
        return pullRequests;
    }
}