Implementing a Web Crawler in Java


Requirements

A friend wanted to use a crawler to solve a problem he had run into at work.

The requirements were as follows:

Find announcements whose titles contain any of the keywords 后勤 (logistics), 物业 (property management), 保洁 (cleaning), 食堂 (canteen), 餐厅 (dining hall), 绿化 (landscaping), or 行政办公 (administrative office), and extract the required fields from each.

(Screenshots of the fields to extract are omitted here; they correspond to the fields collected in ExcelEntity below.)

Collect that data and fill it into an Excel file in the required format (the target layout was given as a screenshot, omitted here).


The implementation

As a Java developer, I naturally reached for familiar weapons. Fire up IDEA!

Choosing a framework

I went with Spring Boot + WebMagic, using XPath to parse the DOM. Nothing wrong with picking the tools you already know. Why WebMagic? Because I have used it before, and it feels much like the Python crawler tooling I have used: I have only used both superficially, and in both cases the workflow is basically the same, fetch the page and parse the DOM.

The WebMagic documentation is here if you want to dig deeper; I have only scratched the surface:

webmagic.io/docs/zh/

Adding the pom dependencies

<!-- WebMagic crawler -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
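The Processor below also relies on fastjson2, Lombok, commons-lang3 and, for the Excel output, Apache POI. If they are not already in your project, dependencies along these lines are needed; the versions here are only examples, not the ones from the original project:

<!-- JSON parsing for the dynamically loaded announcement list -->
<dependency>
    <groupId>com.alibaba.fastjson2</groupId>
    <artifactId>fastjson2</artifactId>
    <version>2.0.40</version>
</dependency>
<!-- @Slf4j / @Data annotations -->
<dependency>
    <groupId>org.projectlombok</groupId>
    <artifactId>lombok</artifactId>
    <version>1.18.30</version>
    <scope>provided</scope>
</dependency>
<!-- StringUtils -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.12.0</version>
</dependency>
<!-- writing the results to Excel -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.2.3</version>
</dependency>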

Writing the Processor

package com.easyjob.spider;

import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.easyjob.util.HttpClientUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import static com.easyjob.spider.SpiderService.concurrentHashMap;

/**
 * @author yyyyy
 * @date 2024/9/18 20:02
 * @description
 */
@Component
@Slf4j
public class TenderProcessor implements PageProcessor {
    private Site site = Site.me()
            .setUserAgent("Chrome/27.0.1453.94")
            .setRetryTimes(3).setSleepTime(100);
    @Override
    public void process(Page page) {
        String prefix = "http://www.chnenergybidding.com.cn";
        String titleName = page.getHtml().xpath("//h1").get();
        if (null == titleName) { // pass 1: the listing page (it has no <h1> title)
            List<String> fileNameList = page.getHtml().xpath("//div[@class='right-bd']//li//a[2]//text()").all();
            List<String> timeList = page.getHtml().xpath("//div[@class='right-bd']//li//span[@class='r']//text()").all();
            List<String> origionUrls = page.getHtml().xpath("//div[@class='right-bd']//li/div/a[1]/@href").all();
            List<String> zbNumList = page.getHtml().xpath("//div[@class='right-bd']//li//a[1]//span/text()").all();

            List<String> targetUrls = new ArrayList<>();
            for (int i = 0; i < origionUrls.size(); i++) {
                String targetUrl = prefix + origionUrls.get(i);
                String projectName = fileNameList.get(i).replaceAll("中标结果公告", "").trim();
                if (projectName.contains("后勤")||projectName.contains("物业")||
                        projectName.contains("保洁")||projectName.contains("食堂")||
                        projectName.contains("餐厅")||projectName.contains("绿化")||projectName.contains("行政办公")){
                    targetUrls.add(targetUrl);
                    concurrentHashMap.put(projectName, new ExcelEntity(projectName, timeList.get(i), "1",zbNumList.get(i)));
                    page.addTargetRequest(new Request(targetUrl));
                }
            }
        } else if (StringUtils.isNotEmpty(titleName)&&titleName.contains("中标结果公告")) {//第二次

            titleName = titleName.replaceAll("<h1 id=\"title\">", "")
                    .replaceAll("</h1>", "")
                    .replaceAll("中标结果公告", "");
            log.info("中标结果公告: "+titleName);
            ExcelEntity excelEntity = concurrentHashMap.get(titleName);
            if (excelEntity.getStatus().equals("1")) {
                String companyName = page.getHtml().xpath("//table//tr[2]//td[3]").all().get(0)
                        .replaceAll("<td>", "")
                        .replaceAll("</td>", "")
                        .replaceAll("<br>", "");
                excelEntity.setCompanyName(companyName);

                String[] urls = page.getRequest().getUrl().split("/");
                String id = urls[urls.length - 1].replaceAll(".html", "");
                String jsUrlPrefix="http://www.chnenergybidding.com.cn/bidcms/services/ShenZhouWebService/getlikeNews?response=application/json&zbnum="+ excelEntity.getZbNum()+
                        "&infoid="+id;

                // URLs of the 中标候选人公示 (winning candidate announcement) pages
                List<String> redeayList = new ArrayList<>();
                // URLs of the matching 招标公告 (tender announcement) pages
                List<String> resultList = new ArrayList<>();

                try {
                    JSONArray jsonArray = HttpClientUtil.get(jsUrlPrefix);
                    for(Object info:jsonArray){
                        JSONObject infoObj=(JSONObject) info;
                        String infoid = infoObj.getString("infoid");
                        if (infoObj.getString("title").contains("候选人")){
                            redeayList.add("http://www.chnenergybidding.com.cn/bidcms/WebbuilderMIS/RedirectPage/RedirectPage.jspx?locationurl=http://www.chnenergybidding.com.cn/bidweb/&&infoid="+infoid);
                        }else if (infoObj.getString("title").contains("招标公告")){
                            resultList.add("http://www.chnenergybidding.com.cn/bidcms/WebbuilderMIS/RedirectPage/RedirectPage.jspx?locationurl=http://www.chnenergybidding.com.cn/bidweb/&&infoid="+infoid);
                        }
                    }
                } catch (IOException e) {
                    concurrentHashMap.remove(titleName);
                }

                if (CollectionUtils.isEmpty(redeayList) || CollectionUtils.isEmpty(resultList)) {
                    concurrentHashMap.remove(titleName);
                } else {
                    excelEntity.setResultList(resultList);
                    excelEntity.setStatus("2");
                    concurrentHashMap.put(titleName, excelEntity);
                    for (String url : redeayList) {
                        page.addTargetRequest(new Request(url));
                    }

                }
            }
        } else if (StringUtils.isNotEmpty(titleName)&&titleName.contains("中标候选人公示")) {//第三次 候选人公示
            titleName = titleName.replaceAll("<h1 id=\"title\">", "")
                    .replaceAll("</h1>", "")
                    .replaceAll("中标候选人公示", "");
            ExcelEntity excelEntity = concurrentHashMap.get(titleName);
            if (excelEntity.getStatus().equals("2")) {
                String firstPrice = page.getHtml().xpath("//table//tr[3]//td[2]").get()
                        .replaceAll("</td>","")
                        .replaceAll("<td style=\"display:display\">","");
                String secondPrice = page.getHtml().xpath("//table//tr[3]//td[3]").get()
                        .replaceAll("</td>","")
                        .replaceAll("<td style=\"display:display\">","");
                excelEntity.setFirstPrice(firstPrice);
                excelEntity.setSecondPrice(secondPrice);
                excelEntity.setStatus("3");
                for (String url : excelEntity.getResultList()) {
                    page.addTargetRequest(new Request(url));
                }
            }
        } else { // final pass: the 招标公告 (tender announcement) page
            try{
            titleName = titleName.replaceAll("<h1 id=\"title\">", "")
                    .replaceAll("</h1>", "")
                    .replaceAll("项目招标公告", "");
            ExcelEntity excelEntity = concurrentHashMap.get(titleName);

            List<String> all = page.getHtml().xpath("//div[@class='con']//span/text()").all()
                    .stream().filter(i->!i.contains("\u00A0")).collect(Collectors.toList());
            // tenderer (招标人)
            String fullText =
                    all.stream().filter(i->i.contains("招标人为")).collect(Collectors.toList()).get(0);
            int personIndex = fullText.indexOf("招标人为");
            int i = fullText.indexOf(",", personIndex);
            String person = fullText.substring(personIndex+4, i);
            // project unit (项目单位)
            String companyNameFull =
                    all.stream().filter(j->j.contains("项目单位为")).collect(Collectors.toList()).get(0);
            int companyIndex = companyNameFull.indexOf("项目单位为:");
            int j = companyNameFull.indexOf(",", companyIndex);
            String companyName = companyNameFull.substring(companyIndex+6, j);
            // service content (the 项目概况 section)
            StringBuilder stringBuilder = new StringBuilder();
            // flag: inside the 项目概况 (project overview) block, collected as the service content
            // flag2: inside the 投标人资格要求 (bidder qualification) block, collected as the qualification text
            boolean flag = false;
            boolean flag2 = false;
            for (int i1 = 0; i1 < all.size(); i1++) {

                if (!flag&&all.get(i1).contains("项目概况") && StringUtils.isNotEmpty(all.get(i1).trim())){
                    flag=true;
                }
                if (all.get(i1).contains("投标人资格要求")){
                    flag=false;
                    String serviceContent = stringBuilder.toString();
                    excelEntity.setServiceContent(serviceContent);
                    stringBuilder.delete(0,stringBuilder.length());
                    flag2=true;
                    continue;
                }
                if (flag){
                    stringBuilder.append(all.get(i1));
                }
                if (all.get(i1).contains("招标文件的获取")){
                    flag2=false;
                    String quality = stringBuilder.toString();
                    excelEntity.setQuality(quality);
                    break;
                }
                if (flag2){
                    stringBuilder.append(all.get(i1));
                }

            }
            // service qualification (服务资格)

            // service period (服务周期)
            excelEntity.setPerson(person);
            excelEntity.setActualCompany(companyName);
            excelEntity.setStatus("4");
            log.info("成功数据为:"+titleName+"\n URL:"+page.getRequest().getUrl());
            concurrentHashMap.put(titleName, excelEntity);

            }catch (Exception e){
                concurrentHashMap.remove(titleName);

            }
        }
    }

    @Override
    public Site getSite() {
        return site;
    }
    public static void main(String[] args) {
        Spider spider = Spider.create(new TenderProcessor());
        // the original snippet used an undefined variable i here; loop over the
        // listing pages (1.html, 2.html, ...) for as many pages as you need
        for (int i = 1; i <= 10; i++) {
            spider.addUrl("http://www.chnenergybidding.com.cn/bidweb/001/001006/001006003/" + i + ".html");
        }
//      spider.setDownloader(new PageDownLoader());
        spider.thread(4).run();
    }
}
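The post does not show the ExcelEntity, SpiderService, or HttpClientUtil classes the processor depends on. Below is a minimal sketch of what they could look like, inferred purely from how they are used above; the field names, comments, and the choice of java.net.http for the HTTP call are my assumptions, not the original code.

package com.easyjob.spider;

import lombok.Data;

import java.util.List;

// inferred from usage in TenderProcessor; the real class is not shown in the post
@Data
public class ExcelEntity {
    private String projectName;      // title with the announcement suffix stripped
    private String publishTime;      // publish date from the listing page
    private String status;           // crawl progress: "1" listed, "2" winner parsed, "3" prices parsed, "4" done
    private String zbNum;            // tender number, used to query the JSON endpoint
    private String companyName;      // winning bidder
    private String firstPrice;       // first candidate's price
    private String secondPrice;      // second candidate's price
    private String person;           // tenderer (招标人)
    private String actualCompany;    // project unit (项目单位)
    private String serviceContent;   // 项目概况 section
    private String quality;          // 投标人资格要求 section
    private List<String> resultList; // tender announcement URLs for the final pass

    public ExcelEntity(String projectName, String publishTime, String status, String zbNum) {
        this.projectName = projectName;
        this.publishTime = publishTime;
        this.status = status;
        this.zbNum = zbNum;
    }
}

package com.easyjob.spider;

import java.util.concurrent.ConcurrentHashMap;

// shared scratch storage for the crawl, keyed by project name
public class SpiderService {
    public static final ConcurrentHashMap<String, ExcelEntity> concurrentHashMap = new ConcurrentHashMap<>();
}

package com.easyjob.util;

import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;

import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

// one possible implementation: fetch a URL and parse the body as a JSON array
public class HttpClientUtil {
    private static final HttpClient CLIENT = HttpClient.newHttpClient();

    public static JSONArray get(String url) throws IOException {
        HttpRequest request = HttpRequest.newBuilder(URI.create(url)).GET().build();
        try {
            HttpResponse<String> response = CLIENT.send(request, HttpResponse.BodyHandlers.ofString());
            return JSON.parseArray(response.body());
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IOException(e);
        }
    }
}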

Straight to the code! The underlying concepts are all in the official documentation, so I won't repeat them here. For a simple requirement like this you only need to implement process(). I did not persist the data to a database; it just lives in a map (see the helper-class sketch above). If you need persistence, look at the Pipeline section of the official docs; a minimal Pipeline sketch follows the notes below. Points worth noting:

1. Queue the URLs for the next round with page.addTargetRequest(new Request(targetUrl)). process() is invoked again for every queued URL, so once the current page is done, the newly added URLs get fetched and parsed in the following rounds.

2. Pages from different rounds have different structures, so each round has to be parsed differently; mind the distinction (in the code I tell them apart by keywords in the page title).

3. For dynamically loaded pages I initially drove a browser with Selenium and parsed the page once it had finished loading, but with larger volumes of data it would hang. So instead I inspected the requests the page makes, built the parameters myself, fetched the JSON directly with an HTTP client (HttpClientUtil.get(jsUrlPrefix)), and assembled the data at the end. This part takes patience; the data has to be teased out carefully.

   (If you do stay with the Selenium approach and a custom Downloader, make sure the browser driver is configured correctly.)
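One more note on the Pipeline idea mentioned above: this project never uses one, but if you wanted to persist each result the WebMagic way, the shape would be roughly the sketch below (not part of the original project). Inside process() you would call page.putField("excelEntity", excelEntity) instead of, or in addition to, writing to the static map, and register the pipeline on the Spider; the System.out line is just a placeholder for a real database or Excel write.

package com.easyjob.spider;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

// minimal Pipeline sketch; receives whatever process() put on the page
public class ExcelEntityPipeline implements Pipeline {
    @Override
    public void process(ResultItems resultItems, Task task) {
        ExcelEntity entity = resultItems.get("excelEntity");
        if (entity != null) {
            // replace with a real database insert or Excel write
            System.out.println("collected: " + entity);
        }
    }
}

It would be registered with something like Spider.create(new TenderProcessor()).addPipeline(new ExcelEntityPipeline()) before calling run().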

Personally I don't think the framework itself is hard to use; the tricky part is parsing the data out of the DOM, and that just takes care.

The end result

Since there was no rush I didn't optimize anything: the program ran for about two hours, and the results were written into Excel with POI. Done!
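The Excel export itself is not shown in the post. As a rough illustration, and assuming the hypothetical ExcelEntity / SpiderService sketches above plus the poi-ooxml dependency, the POI write could look something like this; the column layout here is a guess, since the real target format came from the requirement screenshot.

package com.easyjob.spider;

import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.FileOutputStream;
import java.io.IOException;

public class ExcelWriter {

    public static void write(String path) throws IOException {
        try (Workbook workbook = new XSSFWorkbook();
             FileOutputStream out = new FileOutputStream(path)) {
            Sheet sheet = workbook.createSheet("中标结果");
            // header row; add the remaining columns (招标人, 项目单位, 服务内容, ...) the same way
            Row header = sheet.createRow(0);
            header.createCell(0).setCellValue("项目名称");
            header.createCell(1).setCellValue("发布时间");
            header.createCell(2).setCellValue("中标单位");
            header.createCell(3).setCellValue("中标价格");

            int rowIndex = 1;
            for (ExcelEntity e : SpiderService.concurrentHashMap.values()) {
                if (!"4".equals(e.getStatus())) {
                    continue; // only rows the crawler finished all four passes for
                }
                Row row = sheet.createRow(rowIndex++);
                row.createCell(0).setCellValue(e.getProjectName());
                row.createCell(1).setCellValue(e.getPublishTime());
                row.createCell(2).setCellValue(e.getCompanyName());
                row.createCell(3).setCellValue(e.getFirstPrice());
            }
            workbook.write(out);
        }
    }
}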