使用jsoup解析HTML
Lombok简化代码
JUnit做单元测试
/**
 * Demo crawler: downloads the Bilibili ranking page with the JDK HTTP client and turns each
 * ranking entry into a {@link RankVideo} using jsoup.
 */
public class CrawlerBilibili {

    /** Address of the ranking page that is downloaded and parsed. */
    private static final String RANKING_URL = "https://www.bilibili.com/ranking";

    /**
     * Class names followed, one after the other, from the {@code #app} element down to the
     * element that holds the ranking entries.
     *
     * NOTE(review): "main-inner b-wrap" is a single string containing two class names; whether
     * jsoup matches it depends on how the class attribute is written in the page - confirm.
     */
    private static final String[] RANK_LIST_PATH = {
        "b-page-body", "main-inner b-wrap", "rank-container", "rank-body", "rank-list"
    };

    /**
     * Fetches the ranking page, converts every {@code rank-item} element into a
     * {@link RankVideo} and prints each one to standard output.
     *
     * @throws IOException if sending the request fails or the response status is not 200
     * @throws InterruptedException if the thread is interrupted while waiting for the response
     * @throws IllegalStateException if an expected element is missing from the page
     */
    @Test
    public void fun6() throws IOException, InterruptedException {
        String body = fetchRankingPage();

        // Parse the DOM and walk down to the element that contains the ranking entries.
        Element current = Jsoup.parse(body).getElementById("app");
        if (current == null) {
            throw new IllegalStateException("No element with id 'app' found in " + RANKING_URL);
        }
        for (String className : RANK_LIST_PATH) {
            current = firstByClass(current, className);
        }
        Elements rankItems = current.getElementsByClass("rank-item");

        // Print directly; no intermediate list is needed just to iterate once.
        rankItems.stream()
                .map(CrawlerBilibili::toRankVideo)
                .forEachOrdered(System.out::println);
    }

    /**
     * Sends a GET request for {@link #RANKING_URL} and returns the response body as text.
     *
     * @return the body of the response
     * @throws IOException if sending fails or the response status is not 200
     * @throws InterruptedException if the thread is interrupted while waiting for the response
     */
    private static String fetchRankingPage() throws IOException, InterruptedException {
        // Builder pattern: anything not set here (e.g. headers) keeps the builder's defaults.
        HttpRequest request = HttpRequest.newBuilder()
                // Give up if no response arrives within 10 seconds.
                .timeout(Duration.ofSeconds(10))
                .GET()
                .uri(URI.create(RANKING_URL))
                .build();
        HttpClient client = HttpClient.newBuilder()
                .cookieHandler(new CookieManager())
                .version(HttpClient.Version.HTTP_1_1)
                .followRedirects(HttpClient.Redirect.ALWAYS)
                .build();

        // Blocking send; sendAsync could be used instead for a callback style.
        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        if (response.statusCode() != 200) {
            // Fail here with the status instead of later while parsing an error page.
            throw new IOException(
                    "Unexpected HTTP status " + response.statusCode() + " for " + RANKING_URL);
        }
        return response.body();
    }

    /**
     * Builds a {@link RankVideo} from one {@code rank-item} element.
     *
     * @param item the {@code rank-item} element to read
     * @return the extracted entry
     * @throws IllegalStateException if an expected child element is missing
     * @throws NumberFormatException if the text of the {@code num} element is not an integer
     */
    private static RankVideo toRankVideo(Element item) {
        Element info = firstByClass(firstByClass(item, "content"), "info");

        Elements titleElements = info.getElementsByClass("title");
        String title = titleElements.text();
        String href = titleElements.attr("href");

        Integer number = Integer.valueOf(firstByClass(item, "num").text());

        Element detail = firstByClass(info, "detail");
        Elements dataBoxes = detail.getElementsByClass("data-box");
        if (dataBoxes.size() < 2) {
            throw new IllegalStateException(
                    "Expected at least 2 'data-box' elements but found " + dataBoxes.size());
        }
        String play = dataBoxes.get(0).text();
        String comment = dataBoxes.get(1).text();

        // The author text is read from the first element inside 'detail' that has an href.
        Elements links = detail.getElementsByAttribute("href");
        if (links.isEmpty()) {
            throw new IllegalStateException("No element with an 'href' attribute inside 'detail'");
        }
        String author = firstByClass(links.get(0), "data-box").text();

        String score = firstByClass(info, "pts").getElementsByTag("div").text();

        return new RankVideo(href, number, title, play, comment, author, score);
    }

    /**
     * Returns the first element matched by {@code parent.getElementsByClass(className)}.
     *
     * @param parent the element to search in
     * @param className the class name to look for
     * @return the first matching element
     * @throws IllegalStateException if nothing matches
     */
    private static Element firstByClass(Element parent, String className) {
        Elements matches = parent.getElementsByClass(className);
        if (matches.isEmpty()) {
            throw new IllegalStateException("No element with class '" + className + "' found");
        }
        return matches.get(0);
    }
}
/**
 * One entry of the ranking page as extracted by {@code CrawlerBilibili}.
 *
 * <p>{@code @Data} already generates getters, setters, {@code toString}, {@code equals} and
 * {@code hashCode}, so separate {@code @ToString} / {@code @EqualsAndHashCode} annotations are
 * not needed. The two constructor annotations add a no-argument constructor and one taking all
 * fields in declaration order.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
class RankVideo {
    /** Value of the {@code href} attribute read from the entry's {@code title} element. */
    private String href;
    /** Number parsed from the text of the entry's {@code num} element. */
    private Integer num;
    /** Text of the entry's {@code title} element. */
    private String title;
    /** Text of the first {@code data-box} element inside {@code detail}. */
    private String play;
    /** Text of the second {@code data-box} element inside {@code detail}. */
    private String comment;
    /** Text of the {@code data-box} inside the first {@code href}-bearing element of {@code detail}. */
    private String author;
    /** Text of the {@code div} element(s) found under the entry's {@code pts} element. */
    private String score;
}
至于如何分析并解析HTML文档的例子，我考完试再来补充。