// Scheduled crawler task: some data must be re-crawled periodically, so we use
// Spring Boot's scheduling annotation to let the crawler run at fixed times.
package com.jingan.jinganservice.task;
import com.jingan.jinganpublic.util.HttpUtil;
import com.jingan.jinganservice.model.Article;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.FutureTask;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* @author Anzepeng
* @title: TimingTask
* @projectName jinganplatfrom
* @description: TODO
* @date 2021/1/31 0031下午 16:15
*/
@Component
public class TimingTask {

    /** Number of scheduled runs so far (starts at 1, incremented after each run). */
    public static int TASKSIZE = 1;

    /** Blog list base URL; the 1-based page number is appended per request. */
    private static final String URL = "https://blog.csdn.net/AnNanDu/article/list/";

    /** Matches every non-digit character; compiled once instead of per call. */
    private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

    /** Articles discovered at class-load time; never null (empty on crawl failure). */
    public static List<Article> staticArticleList;

    // Crawl the full article list once when the class is loaded so the scheduled
    // task has addresses to visit. On failure fall back to an empty list so
    // test() cannot NPE on a null field. NOTE(review): network I/O in a static
    // initializer blocks class loading and hides failures — consider moving this
    // to a @PostConstruct method.
    static {
        try {
            staticArticleList = allArtitcle();
        } catch (IOException e) {
            e.printStackTrace();
            staticArticleList = new ArrayList<Article>();
        }
    }

    /**
     * Scheduled job: once per minute, sends a GET request to every crawled
     * article address and prints the run counter.
     */
    @Scheduled(fixedRate = 60 * 1000)
    void test() {
        HttpUtil httpUtil = new HttpUtil();
        for (Article article : staticArticleList) {
            httpUtil.sendGet(article.getAddress(), null);
        }
        System.out.println(TASKSIZE++);
    }

    /** Manual entry point: crawls all articles and prints title/address pairs. */
    public static void main(String[] args) throws IOException {
        List<Article> articleList = allArtitcle();
        for (Article article : articleList) {
            System.out.println("文章标题:" + article.getTitle());
            System.out.println("文章绝对路劲地址:" + article.getAddress());
        }
        System.out.println(articleList.size());
    }

    /**
     * Crawls every list page of the blog and collects all articles found.
     *
     * @return all articles across every list page (possibly empty)
     * @throws IOException if a page cannot be fetched or its structure changed
     */
    public static List<Article> allArtitcle() throws IOException {
        Connection conn = Jsoup.connect(URL)
                .userAgent("Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0")
                .timeout(5000)
                .method(Connection.Method.GET);
        Document doc = conn.get();
        Element body = doc.body();
        // The header block carries the total article count inside a span.
        Element articleListDiv = body.getElementById("container-header-blog");
        if (articleListDiv == null) {
            // Fail loudly instead of NPE-ing: the page layout has changed.
            throw new IOException("Page structure changed: container-header-blog not found");
        }
        String totalPageStr = articleListDiv.select("span").text();
        // Keep only the digits of the article count.
        Matcher m = NON_DIGITS.matcher(totalPageStr);
        String digits = m.replaceAll("").trim();
        if (digits.isEmpty()) {
            throw new IOException("Page structure changed: article count not found");
        }
        // 40 articles per list page — round up to get the page count.
        int totalPage = (int) Math.ceil(Double.parseDouble(digits) / 40L);
        // Fetch each list page in turn and merge the results.
        List<Article> articleList = new ArrayList<Article>();
        for (int pageNow = 1; pageNow <= totalPage; pageNow++) {
            articleList.addAll(getArtitcleByPage(pageNow));
        }
        return articleList;
    }

    /**
     * Crawls one list page and extracts the title and absolute address of each
     * article into {@link Article} entities.
     *
     * @param pageNow 1-based list page number appended to {@link #URL}
     * @return articles on that page; empty when the expected markup is absent
     * @throws IOException if the HTTP request fails
     */
    public static List<Article> getArtitcleByPage(int pageNow) throws IOException {
        // Build the HTTP connection for the page URL.
        Connection conn = Jsoup.connect(URL + pageNow)          // list page url
                .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0")
                .timeout(5000)                                  // connect timeout (ms)
                .method(Connection.Method.GET);                 // plain GET request
        // Fetch and parse the HTML document.
        Document doc = conn.get();
        Element body = doc.body();
        List<Article> resultList = new ArrayList<Article>();
        Element articleListDiv = body.getElementById("articleMeList-blog");
        if (articleListDiv == null) {
            return resultList; // markup changed or page is empty — nothing to parse
        }
        Elements articleList = articleListDiv.getElementsByClass("article-list");
        if (articleList.isEmpty()) {
            // Previously a swallowed exception here left articleItem null and the
            // loop below threw NPE; return an empty result instead.
            return resultList;
        }
        Elements articleItem =
                articleList.get(0).getElementsByClass("article-item-box csdn-tracking-statistics");
        for (Element article : articleItem) {
            Article articleEntity = new Article();
            Element linkNode = article.select("h4 a").get(0);   // the <a> inside <h4>
            articleEntity.setAddress(linkNode.attr("href"));    // absolute article url
            articleEntity.setTitle(linkNode.text());            // link text = title
            resultList.add(articleEntity);
        }
        return resultList;
    }
}