一、jsoup是什么
jsoup 是一款 Java 的 HTML 解析器,主要用于解析网页爬虫返回的数据。
[官方文档](https://jsoup.org/)
二、导入依赖
<!-- Maven coordinates for the jsoup library; place inside the <dependencies> section of pom.xml. -->
<!-- NOTE(review): 1.11.3 is an older release; check the official site for a newer version before use. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
三、使用方法
1.加载html的方法
@Slf4j
public class Test {
public static void main(String[] args) throws IOException {
// 从URL加载HTML
Document document = Jsoup.connect("http://www.baidu.com").get()
String title = document.title()
// 获取html中的标题
log.info("title :{}" , title)
// 从字符串加载HTML
String html = "<html><head><title>jsoup</title></head>"
+ "<body><p>初学jsoup</p></body></html>"
Document doc = Jsoup.parse(html)
title = doc.title()
log.info("title :{}" , title)
// 从文件加载HTML
doc = Jsoup.parse(new File("D:\\jsoup\\html\\index.html"), "utf-8")
title = doc.title()
log.info("title :{}" , title)
}
}
2.获取html中的title、head、body
Document document = Jsoup.connect("http://www.baidu.com").get()
String title = document.title()
log.info("title:{}",title)
Element head= document.head()
log.info("head:{}",head)
Element body=document.body()
log.info("body:{}",body)
3.获取html标签内容
/**
 * Fetches a page and prints elements selected by tag name, by id,
 * and by several CSS selectors.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the page cannot be fetched
 */
public static void main(String[] args) throws IOException {
    Document document = Jsoup.connect("http://www.baidu.com").get();

    // All <p> elements
    Elements paragraphs = document.getElementsByTag("p");
    System.out.println("paragraphs: " + paragraphs);

    // The element whose id is "head"; prints "null" when no such element exists
    Element paragraph = document.getElementById("head");
    System.out.println("paragraph: " + paragraph);

    // Every <a> element that carries an href attribute
    Elements linkUrls = document.select("a[href]");
    for (Element link : linkUrls) {
        System.out.println("Href: " + link.attr("href"));
        System.out.println("Text: " + link.text());
    }

    // Every <img> whose src attribute ends with ".png"
    Elements pngs = document.select("img[src$=.png]");
    for (Element png : pngs) {
        System.out.println("Name: " + png.attr("name"));
    }

    // First <div class="header">; first() yields null when nothing matches,
    // so guard before dereferencing to avoid a NullPointerException.
    Element headerDiv = document.select("div.header").first();
    if (headerDiv != null) {
        System.out.println("Id: " + headerDiv.id());
    } else {
        System.out.println("Id: (no div.header element found)");
    }

    // <a> elements that are direct children of an <h3>
    Elements sampleLinks = document.select("h3 > a");
    for (Element link : sampleLinks) {
        System.out.println("Text: " + link.text());
    }
}