Requirements
A friend wanted to use a crawler to solve a small problem at work.
The requirements:
Find announcement titles containing any of the keywords 【后勤】(logistics), 【物业】(property management), 【保洁】(cleaning), 【食堂】(canteen), 【餐厅】(dining hall), 【绿化】(landscaping) or 【行政办公】(administrative office), and extract the fields listed below.
Collect that data and fill it into an Excel file in the following format.
Implementation
As a Java programmer, naturally I reach for the weapons I know best. Fire up IDEA!
Choosing a framework
I went with Spring Boot + WebMagic, with XPath for parsing the DOM. Picking the tool you're comfortable with is fair enough, right? Why WebMagic? Because I've used it before, and it doesn't feel much different from the Python crawler library I used previously (I'm a casual user of both, and for fetching pages and parsing the DOM they feel about the same).
The WebMagic official site is here; have a look if you're interested. I've only used it superficially.
Add the pom dependencies
<!-- WebMagic crawler -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
Writing the Processor
The crawl runs in four passes: the listing page first, then for each matching project its award result announcement (中标结果公告), then the candidate announcement (中标候选人公示), and finally the original tender announcement (招标公告). process() tells the passes apart by the page's <h1> title.
package com.easyjob.spider;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.easyjob.util.HttpClientUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import static com.easyjob.spider.SpiderService.concurrentHashMap;
/**
* @author yyyyy
* @date 2024/9/18 20:02
* @description
*/
@Component
@Slf4j
public class TenderProcessor implements PageProcessor {
private Site site = Site.me()
.setUserAgent("Chrome/27.0.1453.94")
.setRetryTimes(3).setSleepTime(100);
@Override
public void process(Page page) {
String prefix = "http://www.chnenergybidding.com.cn";
// the listing page has no <h1>, so a missing title means this is the first pass
String titleName = page.getHtml().xpath("//h1").get();
if (null == titleName) {// first pass: the listing page
List<String> fileNameList = page.getHtml().xpath("//div[@class='right-bd']//li//a[2]//text()").all();
List<String> timeList = page.getHtml().xpath("//div[@class='right-bd']//li//span[@class='r']//text()").all();
List<String> origionUrls = page.getHtml().xpath("//div[@class='right-bd']//li/div/a[1]/@href").all();
List<String> zbNumList = page.getHtml().xpath("//div[@class='right-bd']//li//a[1]//span/text()").all();
List<String> targetUrls = new ArrayList<>();
for (int i = 0; i < origionUrls.size(); i++) {
String targetUrl = prefix + origionUrls.get(i);
String projectName = fileNameList.get(i).replaceAll("中标结果公告", "").trim();
if (projectName.contains("后勤")||projectName.contains("物业")||
projectName.contains("保洁")||projectName.contains("食堂")||
projectName.contains("餐厅")||projectName.contains("绿化")||projectName.contains("行政办公")){
targetUrls.add(targetUrl);
concurrentHashMap.put(projectName, new ExcelEntity(projectName, timeList.get(i), "1",zbNumList.get(i)));
page.addTargetRequest(new Request(targetUrl));
}
}
} else if (StringUtils.isNotEmpty(titleName)&&titleName.contains("中标结果公告")) {//第二次
titleName = titleName.replaceAll("<h1 id=\"title\">", "")
.replaceAll("</h1>", "")
.replaceAll("中标结果公告", "");
log.info("中标结果公告: "+titleName);
ExcelEntity excelEntity = concurrentHashMap.get(titleName);
if (excelEntity.getStatus().equals("1")) {
String companyName = page.getHtml().xpath("//table//tr[2]//td[3]").all().get(0)
.replaceAll("<td>", "")
.replaceAll("</td>", "")
.replaceAll("<br>", "");
excelEntity.setCompanyName(companyName);
String[] urls = page.getRequest().getUrl().split("/");
String id = urls[urls.length - 1].replaceAll(".html", "");
String jsUrlPrefix="http://www.chnenergybidding.com.cn/bidcms/services/ShenZhouWebService/getlikeNews?response=application/json&zbnum="+ excelEntity.getZbNum()+
"&infoid="+id;
// URLs of the candidate announcement (中标候选人公示) pages
List<String> redeayList = new ArrayList<>();
// URLs of the tender announcement (招标公告) pages
List<String> resultList = new ArrayList<>();
try {
JSONArray jsonArray = HttpClientUtil.get(jsUrlPrefix);
for(Object info:jsonArray){
JSONObject infoObj=(JSONObject) info;
String infoid = infoObj.getString("infoid");
if (infoObj.getString("title").contains("候选人")){
redeayList.add("http://www.chnenergybidding.com.cn/bidcms/WebbuilderMIS/RedirectPage/RedirectPage.jspx?locationurl=http://www.chnenergybidding.com.cn/bidweb/&&infoid="+infoid);
}else if (infoObj.getString("title").contains("招标公告")){
resultList.add("http://www.chnenergybidding.com.cn/bidcms/WebbuilderMIS/RedirectPage/RedirectPage.jspx?locationurl=http://www.chnenergybidding.com.cn/bidweb/&&infoid="+infoid);
}
}
} catch (IOException e) {
concurrentHashMap.remove(titleName);
}
if (CollectionUtils.isEmpty(redeayList) || CollectionUtils.isEmpty(resultList)) {
concurrentHashMap.remove(titleName);
} else {
excelEntity.setResultList(resultList);
excelEntity.setStatus("2");
concurrentHashMap.put(titleName, excelEntity);
for (String url : redeayList) {
page.addTargetRequest(new Request(url));
}
}
}
} else if (StringUtils.isNotEmpty(titleName)&&titleName.contains("中标候选人公示")) {//第三次 候选人公示
titleName = titleName.replaceAll("<h1 id=\"title\">", "")
.replaceAll("</h1>", "")
.replaceAll("中标候选人公示", "");
ExcelEntity excelEntity = concurrentHashMap.get(titleName);
if (excelEntity.getStatus().equals("2")) {
String firstPrice = page.getHtml().xpath("//table//tr[3]//td[2]").get()
.replaceAll("</td>","")
.replaceAll("<td style=\"display:display\">","");
String secondPrice = page.getHtml().xpath("//table//tr[3]//td[3]").get()
.replaceAll("</td>","")
.replaceAll("<td style=\"display:display\">","");
excelEntity.setFirstPrice(firstPrice);
excelEntity.setSecondPrice(secondPrice);
excelEntity.setStatus("3");
for (String url : excelEntity.getResultList()) {
page.addTargetRequest(new Request(url));
}
}
} else {// final pass: the tender announcement (招标公告) page
try{
titleName = titleName.replaceAll("<h1 id=\"title\">", "")
.replaceAll("</h1>", "")
.replaceAll("项目招标公告", "");
ExcelEntity excelEntity = concurrentHashMap.get(titleName);
List<String> all = page.getHtml().xpath("//div[@class='con']//span/text()").all()
.stream().filter(i->!i.contains("\u00A0")).collect(Collectors.toList());
// tenderer (招标人)
String fullText =
all.stream().filter(i->i.contains("招标人为")).collect(Collectors.toList()).get(0);
int personIndex = fullText.indexOf("招标人为");
int i = fullText.indexOf(",", personIndex);
String person = fullText.substring(personIndex+4, i);
// project unit (项目单位)
String companyNameFull =
all.stream().filter(j->j.contains("项目单位为")).collect(Collectors.toList()).get(0);
int companyIndex = companyNameFull.indexOf("项目单位为:");
int j = companyNameFull.indexOf(",", companyIndex);
String companyName = companyNameFull.substring(companyIndex+6, j);
// service content (collected between the 项目概况 and 投标人资格要求 sections)
StringBuilder stringBuilder = new StringBuilder();
boolean flag=false;
boolean flag2=false;
for (int i1 = 0; i1 < all.size(); i1++) {
if (!flag&&all.get(i1).contains("项目概况") && StringUtils.isNotEmpty(all.get(i1).trim())){
flag=true;
}
if (all.get(i1).contains("投标人资格要求")){
flag=false;
String serviceContent = stringBuilder.toString();
excelEntity.setServiceContent(serviceContent);
stringBuilder.delete(0,stringBuilder.length());
flag2=true;
continue;
}
if (flag){
stringBuilder.append(all.get(i1));
}
if (all.get(i1).contains("招标文件的获取")){
flag2=false;
String quality = stringBuilder.toString();
excelEntity.setQuality(quality);
break;
}
if (flag2){
stringBuilder.append(all.get(i1));
}
}
// qualification requirements (服务资格)
// service period (服务周期)
excelEntity.setPerson(person);
excelEntity.setActualCompany(companyName);
excelEntity.setStatus("4");
log.info("成功数据为:"+titleName+"\n URL:"+page.getRequest().getUrl());
concurrentHashMap.put(titleName, excelEntity);
}catch (Exception e){
concurrentHashMap.remove(titleName);
}
}
}
@Override
public Site getSite() {
return site;
}
public static void main(String[] args) {
Spider spider = Spider.create(new TenderProcessor())
// .setDownloader(new PageDownLoader())
.thread(4);
// assumes the listing pages of this column are numbered 1.html, 2.html, ...; adjust the range to however many pages you want to crawl
for (int i = 1; i <= 10; i++) {
spider.addUrl("http://www.chnenergybidding.com.cn/bidweb/001/001006/001006003/" + i + ".html");
}
spider.run();
}
}
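The class above won't compile on its own: it statically imports concurrentHashMap from SpiderService and fills an ExcelEntity, and neither class appears in the post. Here is a minimal sketch of the two, with field names inferred from the constructor and the getters/setters used above (publishTime in particular is just my label for the second constructor argument):

// SpiderService.java
package com.easyjob.spider;

import java.util.concurrent.ConcurrentHashMap;

public class SpiderService {
    // shared store keyed by project name; the Processor fills it pass by pass
    public static final ConcurrentHashMap<String, ExcelEntity> concurrentHashMap = new ConcurrentHashMap<>();
}

// ExcelEntity.java
package com.easyjob.spider;

import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.List;

@Data
@NoArgsConstructor
public class ExcelEntity {
    private String projectName;
    private String publishTime;       // announcement date from the listing page
    private String status;            // which pass has filled this entry: "1".."4"
    private String zbNum;             // tender number used to query the JSON endpoint
    private String companyName;       // winning bidder, from the award result page
    private List<String> resultList;  // tender announcement URLs, consumed in the final pass
    private String firstPrice;
    private String secondPrice;
    private String serviceContent;
    private String quality;
    private String person;            // tenderer (招标人)
    private String actualCompany;     // project unit (项目单位)

    public ExcelEntity(String projectName, String publishTime, String status, String zbNum) {
        this.projectName = projectName;
        this.publishTime = publishTime;
        this.status = status;
        this.zbNum = zbNum;
    }
}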
Straight to the code! The concepts involved are all covered in the official docs, so I won't repeat them here. If your requirements aren't complicated, getting process() right is all you need. I didn't persist the data to a database; I just kept it in a map. If you do need to store it, look at the Pipeline section of the official docs.
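For reference, here is a minimal Pipeline sketch — my own illustration, not from the original post or the docs. A Pipeline only receives whatever the Processor exposes via page.putField(...), which the code above does not currently do, so you would also add something like page.putField("excelEntity", excelEntity) (the key name is arbitrary) inside process():

package com.easyjob.spider;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

// Receives the fields the Processor stored via page.putField(...) and persists them.
public class ExcelEntityPipeline implements Pipeline {
    @Override
    public void process(ResultItems resultItems, Task task) {
        // "excelEntity" is a made-up key; it must match whatever key the Processor uses
        ExcelEntity entity = resultItems.get("excelEntity");
        if (entity != null) {
            // save to a database or append to the spreadsheet here
        }
    }
}

Register it when building the spider with .addPipeline(new ExcelEntityPipeline()).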
A few things to note:
1. Add the URL of the next request to the queue with page.addTargetRequest(new Request(targetUrl)); process() is invoked repeatedly, so once the current batch finishes, the next URL in the queue gets fetched and parsed.
2. Different kinds of URLs obviously need different parsing logic, so keep them apart (in the code I tell them apart by keywords in the page title).
3. For dynamically loaded pages, I initially used Selenium to drive a browser and parsed the page after it finished loading, but with larger volumes it would hang... So I switched to digging the underlying request out of the page, building the parameters myself and fetching the JSON with HttpClient (HttpClientUtil.get(jsUrlPrefix)), then assembling the data at the end. You need some patience to peel the data out this way; a rough sketch of that helper follows these notes.
- If you do use a custom downloader, make sure the browser driver is configured correctly.
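The HttpClientUtil.get() helper from point 3 isn't shown in the post. A minimal sketch using Apache HttpClient and fastjson2 might look like the following — the class name, package and return type come from the call site in the Processor; everything else is my assumption:

package com.easyjob.util;

import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class HttpClientUtil {

    // Fetches the JSON endpoint discovered in the page and parses the body as a JSON array.
    public static JSONArray get(String url) throws IOException {
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            String body = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
            return JSON.parseArray(body);
        }
    }
}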
Personally, I don't think the tool itself is hard to use; the tricky part is parsing the data out of the DOM, and that takes real care.
Final result
Since there was no rush, I didn't bother optimizing anything. The program ran for a couple of hours, and the results were written into Excel with POI. Done!
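The POI export code isn't included in the post either. A bare-bones version that dumps the crawled map into a workbook could look like this — it assumes poi-ooxml is on the classpath, reuses the ExcelEntity sketch from above, and the column layout is my guess since the real Excel template isn't shown:

package com.easyjob.spider;

import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Map;

public class ExcelWriter {

    // Writes one row per crawled project into an .xlsx file.
    public static void write(Map<String, ExcelEntity> data, String path) throws IOException {
        try (Workbook workbook = new XSSFWorkbook();
             FileOutputStream out = new FileOutputStream(path)) {
            Sheet sheet = workbook.createSheet("tenders");
            int rowNum = 0;
            for (ExcelEntity e : data.values()) {
                Row row = sheet.createRow(rowNum++);
                row.createCell(0).setCellValue(e.getProjectName());
                row.createCell(1).setCellValue(e.getPublishTime());
                row.createCell(2).setCellValue(e.getCompanyName());
                row.createCell(3).setCellValue(e.getFirstPrice());
                row.createCell(4).setCellValue(e.getSecondPrice());
                row.createCell(5).setCellValue(e.getPerson());
                row.createCell(6).setCellValue(e.getActualCompany());
                row.createCell(7).setCellValue(e.getServiceContent());
            }
            workbook.write(out);
        }
    }
}

Calling ExcelWriter.write(SpiderService.concurrentHashMap, "result.xlsx") after the spider finishes would produce the final spreadsheet.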