定时任务爬取数据

277 阅读1分钟

使用定时任务来运行爬虫程序:有些数据需要爬虫定时去爬取,因此我们加入 Spring Boot 的定时任务注解(@Scheduled),使爬虫可以在特定时间执行我们的爬取任务。

package com.jingan.jinganservice.task;

import com.jingan.jinganpublic.util.HttpUtil;
import com.jingan.jinganservice.model.Article;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.FutureTask;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scheduled crawler task for the article list of a CSDN blog.
 *
 * <p>When this class is initialized it crawls every article list page once and caches the
 * result in {@link #staticArticleList}. After that, {@link #test()} is triggered by Spring at a
 * fixed rate and issues one GET request per cached article address.
 *
 * @author Anzepeng
 */
@Component
public class TimingTask {

    /** Run counter: printed and then incremented at the end of every scheduled run. */
    public static int TASKSIZE = 1;

    /** Base URL of the article list; the page number is appended directly to it. */
    private static final String URL = "https://blog.csdn.net/AnNanDu/article/list/";

    /** Number of articles per list page used to derive the page count from the article count. */
    private static final int PAGE_SIZE = 40;

    /** Connect/read timeout passed to Jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Matches every non-digit character; used to strip the article count out of the header text.
     * This is not a compile-time constant, so it must stay declared BEFORE the static
     * initializer below (static initialization runs in textual order and the initializer
     * calls {@link #allArtitcle()}, which uses this pattern).
     */
    private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

    /**
     * Articles crawled once during class initialization.
     * Never {@code null} after initialization: it is an empty list when the crawl failed.
     */
    public static List<Article> staticArticleList;

    // Crawl the articles once when the class is loaded.
    static {
        List<Article> loaded;
        try {
            loaded = allArtitcle();
        } catch (IOException e) {
            e.printStackTrace();
            // Fall back to an empty list so the scheduled task does not hit a null list later.
            loaded = new ArrayList<>();
        }
        staticArticleList = loaded;
    }

    /**
     * Scheduled job, triggered at a fixed rate of {@code 60*1000} (one minute in the
     * milliseconds used by Spring's {@code fixedRate}).
     *
     * <p>Calls {@code HttpUtil.sendGet} with each cached article address and a {@code null}
     * second argument, then prints the run counter and increments it.
     */
    @Scheduled(fixedRate = 60*1000)
    void test(){
        HttpUtil httpUtil = new HttpUtil();
        // Read the shared field once; guard against external code having set it to null.
        List<Article> articles = staticArticleList;
        if (articles != null) {
            for (Article article : articles) {
                httpUtil.sendGet(article.getAddress(), null);
            }
        }
        System.out.println(TASKSIZE++);
    }

    /**
     * Manual entry point: crawls all articles and prints each title and address,
     * followed by the total number of articles found.
     *
     * @param args unused
     * @throws IOException if a list page cannot be fetched or the article count cannot be read
     */
    public static void main(String[] args) throws IOException {
        List<Article> articleList = allArtitcle();

        // Print every article of the blog.
        for (Article article : articleList) {
            System.out.println("文章标题:" + article.getTitle());
            System.out.println("文章绝对路劲地址:" + article.getAddress());
        }
        System.out.println(articleList.size());
    }

    /**
     * Crawls every article list page of the blog.
     *
     * <p>The total article count is read from the digits in the {@code span} text of the
     * {@code container-header-blog} element; the number of pages is that count divided by
     * {@link #PAGE_SIZE}, rounded up. Each page is then fetched via
     * {@link #getArtitcleByPage(int)}.
     *
     * @return all articles found, in page order; empty when the article count is zero
     * @throws IOException if a page cannot be fetched, or if the header element or a numeric
     *                     article count cannot be found in the first page
     */
    public static List<Article> allArtitcle() throws IOException {
        Element body = fetchBody(
                URL, "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0");

        // The header element carries the total number of articles.
        Element header = body.getElementById("container-header-blog");
        if (header == null) {
            throw new IOException("Element 'container-header-blog' not found at " + URL);
        }

        // Keep only the digits of the span text.
        String countText = header.select("span").text();
        String digits = NON_DIGIT.matcher(countText).replaceAll("").trim();
        long articleCount;
        try {
            articleCount = Long.parseLong(digits);
        } catch (NumberFormatException e) {
            throw new IOException(
                    "Could not read an article count from header text \"" + countText + "\"", e);
        }
        int totalPage = (int) Math.ceil((double) articleCount / PAGE_SIZE);

        // Fetch each page in turn; the page number becomes the URL suffix.
        List<Article> articleList = new ArrayList<>();
        for (int pageNow = 1; pageNow <= totalPage; pageNow++) {
            articleList.addAll(getArtitcleByPage(pageNow));
        }
        return articleList;
    }

    /**
     * Crawls a single article list page.
     *
     * @param pageNow page number appended to the list URL
     * @return the articles (title and link {@code href}) found on that page; empty when the page
     *         does not contain the expected containers
     * @throws IOException if the page cannot be fetched
     */
    public static List<Article> getArtitcleByPage(int pageNow) throws IOException {
        Element body = fetchBody(
                URL + pageNow, "Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0");

        List<Article> resultList = new ArrayList<>();

        Element articleListDiv = body.getElementById("articleMeList-blog");
        if (articleListDiv == null) {
            return resultList;
        }
        Elements articleLists = articleListDiv.getElementsByClass("article-list");
        if (articleLists.isEmpty()) {
            return resultList;
        }

        // NOTE(review): the argument holds two class names separated by a space; confirm that
        // getElementsByClass matches it as intended, or switch to a CSS selector.
        Elements articleItems =
                articleLists.get(0).getElementsByClass("article-item-box csdn-tracking-statistics");

        for (Element item : articleItems) {
            // The title link is the <a> inside the item's <h4>; skip items without one.
            Elements links = item.select("h4 a");
            if (links.isEmpty()) {
                continue;
            }
            Element linkNode = links.get(0);

            Article articleEntity = new Article();
            articleEntity.setAddress(linkNode.attr("href"));
            articleEntity.setTitle(linkNode.text());
            resultList.add(articleEntity);
        }
        return resultList;
    }

    /** Fetches {@code url} with a GET request using the given user agent and returns the document body. */
    private static Element fetchBody(String url, String userAgent) throws IOException {
        Connection conn = Jsoup.connect(url)
                .userAgent(userAgent)
                .timeout(TIMEOUT_MILLIS)
                .method(Connection.Method.GET);
        Document doc = conn.get();
        return doc.body();
    }
}