Crawling Blog Titles and URLs with Java


1. Design Approach

Because the blog list is paginated, fetching every post means first working out how many pages there are; once the current page has been crawled, we jump to the next page's URL and crawl its posts.

There are two ways to get the page count:

1. Scrape the page count from the numbers in the pagination widget.

   This is hard to target reliably, though: the class attribute on those elements is not distinctive, and it changes when a page number is in the selected state.

2. Compute pages = total posts ÷ posts per page.

   Here only one tag needs identifying and it is distinctive enough; a regex pulls the digits out of its text to get the total post count. The catch is that the result goes stale if the per-page size changes on CSDN's side: it is currently 40 posts per page, and a switch to 20 would throw the numbers off.

Once the page count is known, we iterate over each page's URL to collect the posts from every page.

Since the paginated URLs differ only by a trailing page number, we can simply loop over the page numbers, concatenate each onto the base address, and fetch that page's posts. A runnable sketch of just this paging logic follows.
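
As a minimal, self-contained sketch of the paging math (the post total below is a hypothetical stand-in for the value parsed from the page, and PAGE_SIZE mirrors CSDN's current 40-per-page setting):

    // Paging sketch only, not the crawler itself: ceiling-divide the post
    // total by the page size, then append each page number to the base URL.
    public class PagingSketch {
        static final String URL = "https://blog.csdn.net/AnNanDu/article/list/";
        static final int PAGE_SIZE = 40; // CSDN's current posts-per-page

        public static void main(String[] args) {
            int totalPosts = 84; // hypothetical; really parsed from the page
            int totalPage = (int) Math.ceil(totalPosts / (double) PAGE_SIZE);
            for (int pageNow = 1; pageNow <= totalPage; pageNow++) {
                System.out.println(URL + pageNow); // .../article/list/1, /2, ...
            }
        }
    }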

The complete code is posted at the end; the snippet below only computes the page count:

    public static List<Article> allArtitcle() throws IOException {
        Connection conn = Jsoup.connect(URL)
                .userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
                .timeout(5000)
                .method(Connection.Method.GET);
        Document doc = conn.get();
        Element body = doc.body();

        // Derive the total page count from the total post count
        Element articleListDiv = body.getElementById("container-header-blog");
        // Text of the span that holds the total post count
        String totalPageStr = articleListDiv.select("span").text();
        // Strip everything but the digits
        String regEx = "[^0-9]";
        Pattern p = Pattern.compile(regEx);
        Matcher m = p.matcher(totalPageStr);
        int totalPage = (int) Math.ceil(Double.valueOf(m.replaceAll("").trim()) / 40.0);
        int pageNow = 1; // starting page

        // Hand each page number to the next method, which builds that page's URL
        List<Article> articleList = new ArrayList<Article>();
        for (pageNow = 1; pageNow <= totalPage; pageNow++) {
            articleList.addAll(getArtitcleByPage(pageNow));
        }
        return articleList;
    }

With the page count handled, it is time to crawl each page's posts. First, look at how the page is laid out:

You can see the posts form a flat list of sibling divs under the class article-list, which itself sits under the element with id articleMeList-blog:

articleMeList-blog -> article-list -> [div,div,div....]

So we fetch the Element and Elements level by level, and while iterating over the divs we grab the a tag under each h4 tag (a more compact selector-based alternative is sketched after the code). The code:

    public static List<Article> getArtitcleByPage(int pageNow) throws IOException {
        // Open an HTTP Connection to the page's URL
        Connection conn = Jsoup.connect(URL + pageNow)	// URL of the blog list page
                .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0")	// browser identity sent with the request
                .timeout(5000)   // connection timeout
                .method(Connection.Method.GET);  // request method: GET (vs. POST, DELETE, etc.)
        // Fetch the page's HTML document
        Document doc = conn.get();
        Element body = doc.body();

        // Wrap each crawled post in an Article and collect them in an ArrayList
        List<Article> resultList = new ArrayList<Article>();

        Element articleListDiv = body.getElementById("articleMeList-blog");
        Elements articleList = articleListDiv.getElementsByClass("article-list");
        Elements articleItem;
        try {
            articleItem = articleList.get(0).getElementsByClass("article-item-box csdn-tracking-statistics");
        } catch (Exception e) {
            // Expected markup missing on this page: return what we have rather than iterate over null below
            return resultList;
        }
        for (Element article : articleItem) {
            Article articleEntity = new Article();
            Element linkNode = article.select("h4 a").get(0); // the a tag under the h4 tag

            articleEntity.setAddress(linkNode.attr("href")); // the href attribute of the a tag
            articleEntity.setTitle(linkNode.text()); // the text inside the a tag

            resultList.add(articleEntity);
        }
        return resultList;
    }
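
As an aside, jsoup's CSS selectors can collapse that level-by-level walk into a single query. A sketch under the same markup assumptions, where doc and resultList are the variables from the method above:

    // Equivalent selection in one query instead of id/class lookups plus get(0)
    Elements links = doc.select("#articleMeList-blog .article-list h4 > a");
    for (Element link : links) {
        Article articleEntity = new Article();
        articleEntity.setAddress(link.attr("href")); // post URL
        articleEntity.setTitle(link.text());         // post title
        resultList.add(articleEntity);
    }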

2. Complete Code, Copy and Run

    package com.jingan.jinganservice.task;

    import com.jingan.jinganpublic.util.HttpUtil;
    import com.jingan.jinganservice.model.Article;
    import org.jsoup.Connection;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;
    import org.jsoup.select.Elements;
    import org.springframework.scheduling.annotation.Scheduled;

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    /**
     * @author 余生大大
     * @title: ImageCrawling
     * @projectName jinganplatform
     * @description: Crawls the titles and URLs of every post on a CSDN blog
     * @date 2021/10/22 16:37
     */
    public class ImageCrawling {

        // Run counter for the scheduled task
        public static int TASKSIZE = 1;

        // Collection of crawled articles
        public static List<Article> staticArticleList;

        // Crawl the articles once, when the class is loaded
        static {
            try {
                staticArticleList = allArtitcle();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        // Request every crawled article URL once a minute
        @Scheduled(fixedRate = 60 * 1000)
        void test() {
            HttpUtil httpUtil = new HttpUtil();
            for (Article article : staticArticleList) {
                httpUtil.sendGet(article.getAddress(), null);
            }
            System.out.println(TASKSIZE++);
        }

        private static final String URL = "https://blog.csdn.net/AnNanDu/article/list/";

        public static void main(String[] args) throws IOException {
            List<Article> articleList = allArtitcle();

            // Print every one of the blogger's articles
            for (Article article : articleList) {
                System.out.println("Article title: " + article.getTitle());
                System.out.println("Article URL: " + article.getAddress());
            }
            System.out.println(articleList.size());
        }

        public static List<Article> allArtitcle() throws IOException {
            Connection conn = Jsoup.connect(URL)
                    .userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
                    .timeout(5000)
                    .method(Connection.Method.GET);
            Document doc = conn.get();
            Element body = doc.body();

            // Derive the total page count from the total post count
            Element articleListDiv = body.getElementById("container-header-blog");
            // Text of the span that holds the total post count
            String totalPageStr = articleListDiv.select("span").text();
            // Strip everything but the digits
            String regEx = "[^0-9]";
            Pattern p = Pattern.compile(regEx);
            Matcher m = p.matcher(totalPageStr);
            int totalPage = (int) Math.ceil(Double.valueOf(m.replaceAll("").trim()) / 40.0);
            int pageNow = 1;

            List<Article> articleList = new ArrayList<Article>();
            for (pageNow = 1; pageNow <= totalPage; pageNow++) {
                articleList.addAll(getArtitcleByPage(pageNow));
            }
            return articleList;
        }

        public static List<Article> getArtitcleByPage(int pageNow) throws IOException {
            // Open an HTTP Connection to the page's URL
            Connection conn = Jsoup.connect(URL + pageNow)	// URL of the blog list page
                    .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0")	// browser identity sent with the request
                    .timeout(5000)   // connection timeout
                    .method(Connection.Method.GET);  // request method: GET (vs. POST, DELETE, etc.)
            // Fetch the page's HTML document
            Document doc = conn.get();
            Element body = doc.body();

            // Wrap each crawled post in an Article and collect them in an ArrayList
            List<Article> resultList = new ArrayList<Article>();

            Element articleListDiv = body.getElementById("articleMeList-blog");
            Elements articleList = articleListDiv.getElementsByClass("article-list");
            Elements articleItem;
            try {
                articleItem = articleList.get(0).getElementsByClass("article-item-box csdn-tracking-statistics");
            } catch (Exception e) {
                // Expected markup missing on this page: return what we have rather than iterate over null below
                return resultList;
            }
            for (Element article : articleItem) {
                Article articleEntity = new Article();
                Element linkNode = article.select("h4 a").get(0);

                articleEntity.setAddress(linkNode.attr("href"));
                articleEntity.setTitle(linkNode.text());

                resultList.add(articleEntity);
            }
            return resultList;
        }
    }
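
Two project-local imports are not shown anywhere in the post: HttpUtil (used here only to send a GET request) and Article. So that a copied version actually compiles, here is a hypothetical minimal Article model inferred from the getters and setters used above; the author's real com.jingan.jinganservice.model.Article may differ:

    package com.jingan.jinganservice.model;

    // Hypothetical stand-in inferred from usage above, not the author's original
    public class Article {

        private String title;   // post title
        private String address; // post URL

        public String getTitle() { return title; }
        public void setTitle(String title) { this.title = title; }
        public String getAddress() { return address; }
        public void setAddress(String address) { this.address = address; }
    }

One more caveat: the @Scheduled method only fires if the class is registered as a Spring bean (for example with @Component) and scheduling is enabled, e.g. via @EnableScheduling on a configuration class; run as a plain class, only main and the static initializer execute.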

3. Results