Java 11 HttpClient 小试牛刀——爬取B站排行榜

587 阅读1分钟

使用jsoup解析HTML

Lombok简化代码

Junit做单元测试

/**
 * Small crawler demo: downloads the bilibili ranking page with the Java 11 {@code HttpClient},
 * walks the returned HTML with jsoup and prints every ranked video it finds.
 */
public class CrawlerBilibili {
    /** Address of the ranking page that is downloaded and parsed. */
    private static final String RANKING_URL = "https://www.bilibili.com/ranking";

    /**
     * Fetches the ranking page, extracts one {@link RankVideo} per {@code rank-item} element and
     * prints each of them to standard output in document order.
     *
     * @throws IOException if sending the request or receiving the response fails
     * @throws InterruptedException if the thread is interrupted while waiting for the response
     * @throws IllegalStateException if the response status is not 200 or the page contains no
     *         element with id {@code app}
     */
    @Test
    public void fun6() throws IOException, InterruptedException {
        String body = fetchRankingPage();

        // getElementById returns null when the id is absent; fail with a readable message
        // instead of a bare NullPointerException further down the chain.
        Element app = Jsoup.parse(body).getElementById("app");
        if (app == null) {
            throw new IllegalStateException("No element with id \"app\" in " + RANKING_URL);
        }

        // Walk down the page structure to the list of ranked entries.
        // NOTE(review): "main-inner b-wrap" passes two space-separated class names to
        // getElementsByClass; kept exactly as in the original - confirm it matches as intended.
        Elements rankItem = app
                .getElementsByClass("b-page-body").get(0)
                .getElementsByClass("main-inner b-wrap").get(0)
                .getElementsByClass("rank-container").get(0)
                .getElementsByClass("rank-body").get(0)
                .getElementsByClass("rank-list").get(0)
                .getElementsByClass("rank-item");

        // Print directly from the stream; no intermediate list is needed.
        rankItem.stream()
                .map(CrawlerBilibili::toRankVideo)
                .forEachOrdered(System.out::println);
    }

    /**
     * Sends a synchronous GET request for {@link #RANKING_URL} and returns the response body.
     *
     * @return the response body decoded as a string
     * @throws IOException if sending the request or receiving the response fails
     * @throws InterruptedException if the thread is interrupted while waiting for the response
     * @throws IllegalStateException if the final response status is not 200
     */
    private static String fetchRankingPage() throws IOException, InterruptedException {
        CookieManager manager = new CookieManager();
        // Builder pattern: headers keep their defaults, only the parts set below are customised.
        HttpRequest httpRequest = HttpRequest.newBuilder()
                // request timeout
                .timeout(Duration.ofSeconds(10))
                // request method (GET is also the default)
                .GET()
                // request address
                .uri(URI.create(RANKING_URL))
                .build();
        HttpResponse.BodyHandler<String> handler = HttpResponse.BodyHandlers.ofString();
        HttpClient httpClient = HttpClient.newBuilder()
                .cookieHandler(manager)
                // protocol version to use
                .version(HttpClient.Version.HTTP_1_1)
                // redirect policy
                .followRedirects(HttpClient.Redirect.ALWAYS)
                .build();
        // Blocking send; sendAsync could be used instead for a callback-style flow.
        HttpResponse<String> send = httpClient.send(httpRequest, handler);
        if (send.statusCode() != 200) {
            throw new IllegalStateException(
                    "Unexpected HTTP status " + send.statusCode() + " for " + RANKING_URL);
        }
        return send.body();
    }

    /**
     * Builds a {@link RankVideo} from a single {@code rank-item} element.
     *
     * @param e the {@code rank-item} element to read
     * @return the video described by that element
     * @throws IndexOutOfBoundsException if an expected child element is missing
     * @throws NumberFormatException if the text of the {@code num} element is not an integer
     */
    private static RankVideo toRankVideo(Element e) {
        Element element = e.getElementsByClass("content").get(0).getElementsByClass("info").get(0);
        // Look the title elements up once; both the text and the link come from them.
        Elements titleElements = element.getElementsByClass("title");
        String title = titleElements.text();
        String href = titleElements.attr("href");
        Integer number = Integer.valueOf(e.getElementsByClass("num").get(0).text());
        Element detail = element.getElementsByClass("detail").get(0);
        // The first two data-box elements are read as play and comment respectively.
        Elements dataBoxes = detail.getElementsByClass("data-box");
        String play = dataBoxes.get(0).text();
        String comment = dataBoxes.get(1).text();
        // The author is read from the data-box nested in the first element carrying an href.
        String author = detail.getElementsByAttribute("href").get(0)
                .getElementsByClass("data-box").get(0).text();
        String score = element.getElementsByClass("pts").get(0).getElementsByTag("div").text();
        return new RankVideo(href, number, title, play, comment, author, score);
    }
}
/**
 * One entry of the ranking list as scraped from the page. All values except {@code num} are kept
 * as the raw text found in the HTML.
 *
 * <p>{@code @Data} already generates getters, setters, {@code toString}, {@code equals} and
 * {@code hashCode}, so no separate {@code @ToString} / {@code @EqualsAndHashCode} is needed.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
class RankVideo {
    /** Value of the {@code href} attribute of the title element. */
    private String href;
    /** Number parsed from the text of the {@code num} element. */
    private Integer num;
    /** Text of the title element. */
    private String title;
    /** Text of the first {@code data-box} in the detail section. */
    private String play;
    /** Text of the second {@code data-box} in the detail section. */
    private String comment;
    /** Text of the {@code data-box} inside the first linked element of the detail section. */
    private String author;
    /** Text of the {@code div} inside the {@code pts} element. */
    private String score;
}

至于如何分析并解析HTML文档的例子，我考完试再来补充。