jsoup 解析html

500 阅读1分钟

一、jsoup是什么

jsoup 是一款 Java 的 HTML 解析器,主要用于解析网页爬虫抓取回来的 HTML 数据。

[官方文档](https://www.wenjiangs.com/doc/stdtla6y)

二、导入依赖

<!-- Maven dependency that puts the jsoup HTML parser on the classpath. -->
<!-- NOTE(review): 1.11.3 is an older release; consider checking for a newer jsoup version. -->
<dependency>
	<groupId>org.jsoup</groupId>
	   <artifactId>jsoup</artifactId>
	<version>1.11.3</version>
</dependency>

三、使用方法

1.加载html的方法

/**
 * Shows three ways to obtain a jsoup {@code Document}: from a URL, from an
 * in-memory string and from a local file. The page title is logged each time.
 */
@Slf4j
public class Test {

	/** Inline markup used for the parse-from-string example. */
	private static final String SAMPLE_HTML = "<html><head><title>jsoup</title></head>"
			+ "<body><p>初学jsoup</p></body></html>";

	/**
	 * Runs the three loading examples in order: URL, string, file.
	 *
	 * @param args unused
	 * @throws IOException if fetching the URL or reading the file fails
	 */
	public static void main(String[] args) throws IOException {
		// 1) Fetch and parse a remote page.
		logTitle(Jsoup.connect("http://www.baidu.com").get());

		// 2) Parse markup held in a string.
		logTitle(Jsoup.parse(SAMPLE_HTML));

		// 3) Parse a file on disk, passing "utf-8" as the charset name.
		File localPage = new File("D:\\jsoup\\html\\index.html");
		logTitle(Jsoup.parse(localPage, "utf-8"));
	}

	/** Logs the title of the given document. */
	private static void logTitle(Document parsed) {
		log.info("title :{}" , parsed.title());
	}

}

2.获取 html 中的 title、head、body

 // Fetch the page once, then log its title, its <head> element and its <body> element.
 Document page = Jsoup.connect("http://www.baidu.com").get();

 log.info("title:{}", page.title());
 log.info("head:{}", page.head());
 log.info("body:{}", page.body());

3.获取html标签内容

/**
 * Fetches a page and prints several kinds of content selected from it:
 * elements by tag name, an element by id, links, PNG images, a div with
 * class {@code header}, and anchors that are direct children of {@code h3}.
 *
 * @param args unused
 * @throws IOException if the page cannot be fetched
 */
public static void main(String[] args) throws IOException {

	Document document = Jsoup.connect("http://www.baidu.com").get();

	// Select by tag name.
	Elements paragraphs = document.getElementsByTag("p");
	System.out.println("paragraphs: " + paragraphs);

	// Select by id; string concatenation prints "null" if no element has this id.
	Element paragraph = document.getElementById("head");
	System.out.println("paragraph: " + paragraph);

	// Collect every anchor that carries an href attribute.
	Elements linkUrls = document.select("a[href]");
	for (Element link : linkUrls) {
		System.out.println("Href: " + link.attr("href"));
		System.out.println("Text: " + link.text());
	}

	// Collect images whose src attribute ends with ".png".
	Elements pngs = document.select("img[src$=.png]");
	for (Element png : pngs) {
		System.out.println("Name: " + png.attr("name"));
	}

	// First div with class "header". first() yields null when nothing matches,
	// so guard before dereferencing to avoid a NullPointerException.
	Element headerDiv = document.select("div.header").first();
	if (headerDiv != null) {
		System.out.println("Id: " + headerDiv.id());
	} else {
		System.out.println("Id: (no div.header element found)");
	}

	// Anchors that are direct children of an h3 element.
	Elements sampleLinks = document.select("h3 > a");
	for (Element link : sampleLinks) {
		System.out.println("Text: " + link.text());
	}
}