利用无头浏览器拉取网页视频数据 —— HtmlUnit (Java)

264 阅读4分钟

一、HtmlUnit 简介

HtmlUnit 是面向 Java 程序的无头浏览器。它对 HTML 文档进行建模并提供一套 API,允许开发者在代码中访问页面、填写表单、点击链接等,以模拟真实浏览器的操作。

优点:

  1. 模拟浏览器行为,可以处理JavaScript、CSS等
  2. 可以用于自动化测试和数据抓取
  3. 编码易上手

缺点:

  1. 性能较慢,因为它是在Java虚拟机中运行的
  2. JS支持不完全
  3. 如果网站有复杂的客户端验证或者对浏览器有特殊要求,HtmlUnit 可能无法处理,例如大量依赖 JavaScript 动态加载内容的页面

二、HtmlUnit 使用

1、依赖

<!-- HtmlUnit itself: this artifact provides com.gargoylesoftware.htmlunit.WebClient, BrowserVersion,
     WebRequest, etc. that the sample code imports. The htmlunit-core-js artifact is only the embedded
     JavaScript engine and is pulled in transitively, so it must not be the only HtmlUnit dependency. -->
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.70.0</version>
    <scope>compile</scope>
</dependency>
<dependency>
    <groupId>commons-logging</groupId>
    <artifactId>commons-logging-api</artifactId>
    <version>1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-collections/commons-collections -->
<!-- 3.2.2 is the release that hardens the unsafe-deserialization gadget classes present in 3.2 / 3.2.1. -->
<dependency>
    <groupId>commons-collections</groupId>
    <artifactId>commons-collections</artifactId>
    <version>3.2.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<!-- NOTE(review): HtmlUnit also depends on commons-io; keep this version in line with the one your
     HtmlUnit release expects, otherwise this explicit declaration overrides it. TODO confirm. -->
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version>
</dependency>

<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-exec</artifactId>
    <version>1.3</version>
</dependency>

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>

<!-- The sample code below additionally imports fastjson (com.alibaba.fastjson), Lombok (@Slf4j, which
     needs SLF4J on the classpath), commons-lang3 (StringUtils) and spring-core (CollectionUtils).
     Add them here if your project does not already provide them. -->

2、主要代码 (以 B 站为例;主要流程代码和工具方法放在同一个类中,方便运行测试,可自行拆分)

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.HttpMethod;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebResponse;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.Executor;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.springframework.util.CollectionUtils;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@Slf4j
public class HtmlUnitClass {
    private HtmlUnitClass() {
    }
    
    //模拟缓存,对已经拉取的url,直接返回结果
    private static final Map<String, String> CATCH_RESULT_CACHE = new ConcurrentHashMap<>();
    
    //定义路径
    private static final String DIR = "your dir";
    
    /**
     * @description: 匹配数据 返回相关内容
     **/
    private static String matchAndReturnResult(String regex, String stringContent) {
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(stringContent);
        return matcher.find() ? matcher.group(1) : null;
    }
    
    private static Boolean doDownload(String link, WebClient webClient, String fileDir) throws IOException {
        Page page = webClient.getPage(HtmlunitUtil.getWebRequest(link));
        WebResponse webResponse = page.getWebResponse();
        try (InputStream inputStream = webResponse.getContentAsStream();
             OutputStream outputStream = Files.newOutputStream(Paths.get(fileDir))) {
            int copyNumber = IOUtils.copy(inputStream, outputStream);
            //限制大小
            return copyNumber <= 209715200L;
        }
    }
    
    public static void assembleWebClient(WebClient webClient) {
        webClient.getOptions().setJavaScriptEnabled(false);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        webClient.addRequestHeader("Referer", "https://www.bilibili.com/index.html");
        webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.40");
    }
    
    public static WebRequest getWebRequest(String url) throws MalformedURLException {
        return new WebRequest(new URL(url), HttpMethod.GET);
    }
    
    private static Boolean doDownload(String link, WebClient webClient, String fileDir) throws IOException {
        Page page = webClient.getPage(getWebRequest(link));
        WebResponse webResponse = page.getWebResponse();
        try (InputStream inputStream = webResponse.getContentAsStream();
             OutputStream outputStream = Files.newOutputStream(Paths.get(fileDir))) {
            int copyNumber = IOUtils.copy(inputStream, outputStream);
            //限制拉取大小
            return copyNumber <= 209715200L;
        }
    }
    
    public static String getRedirectUrl(String shortUrl) {
        try {
            return Jsoup.connect(shortUrl).followRedirects(true).execute().url().toString();
        } catch (IOException e) {
            log.error("getRedirectUrl failed : ", e);
            return null;
        }
    }
    
    public static String downloadPic(String originalUrl) {
        try {
            HttpURLConnection connect = getConnectByUrl(originalUrl, "GET", null);
            if (null == connect) {
                log.info("downloadUrl : {},链接异常", originalUrl);
                throw new Exception("获取图片失败...");
            }

            String[] split = originalUrl.split("\\.");
            String yyyyMmDd = getYyyyMmDd();
            String resultPicName = UUID.randomUUID() + "." + split[split.length - 1];
            doDownload(yyyyMmDd, resultPicName, connect);
            return DIR + yyyyMmDd + "/" + resultPicName;
        } catch (Exception e) {
            log.error("downloadPic failed : ", e);
            return null;
        }
    }
    
    private static void doDownload(String yyyyMmDd, String fileName, HttpURLConnection connect) throws IOException {
        String aimFolder = DIR + yyyyMmDd;
        mkDirIfNotExists(aimFolder);
        String filePath = aimFolder + "/" + fileName;
        try (DataInputStream in = new DataInputStream(connect.getInputStream());
             DataOutputStream out = new DataOutputStream(Files.newOutputStream(Paths.get(filePath)))) {
            byte[] buffer = new byte[2048];
            int count;
            while ((count = in.read(buffer)) > 0) {
                out.write(buffer, 0, count);
            }
        }
    }
    
    public static HttpURLConnection getConnectByUrl(String requestUrl, String requestMethod, Map<String, String> headers) throws IOException {
        URL url = getUrl(requestUrl);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod(requestMethod);
        connection.setRequestProperty("Accept", "application/json");
        if (!CollectionUtils.isEmpty(headers)) {
            headers.forEach(connection::setRequestProperty);
        }

        connection.connect();
        int responseCode = connection.getResponseCode();
        if (responseCode != HttpURLConnection.HTTP_OK) {
            log.error("请求链接失败 connectUrlAndReturnBodyString");
            return null;
        }

        return connection;
    }

    public static URL getUrl(String url) throws MalformedURLException {
        return new URL(url);
    }
    
    @SuppressWarnings("unchecked")
    public static <T> T getStringContentByRequest(String url, WebClient webClient) throws IOException {
        // 设置请求参数,建立请求
        WebRequest webRequest = getWebRequest(url);
        Page page = webClient.getPage(webRequest);
        WebResponse webResponse = page.getWebResponse();
        return (T) webResponse.getContentAsString();
    }
    
    public static Boolean mkDirIfNotExists(String folderPath) {
        File folder = new File(folderPath);
        if (!folder.exists()) {
            return folder.mkdir();
        }

        return true;
    }

    public static String getYyyyMmDd() {
        LocalDate currentDate = LocalDate.now();
        DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMdd");
        return currentDate.format(formatter);
    }
    
    private static void deleteFile(String filePath) {
        File file = new File(filePath);
        file.delete();
    }
    
    
    /********************* 以上为工具类代码 以下为主要实现代码 *********************/
    
   
    /**
     * 实现入口
     */
    public static String catchVideoAndPic(String url) {
        String cache = CATCH_RESULT_CACHE.get(url);
        if (StringUtils.isNotBlank(cache)) {
            return cache;
        }

        String redirectUrl = getRedirectUrl(url.trim());
        if (null == redirectUrl) {
            log.error("url 异常");
            return null;
        }

        // 建立无头浏览器
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            assembleWebClient(webClient);

            String result;
            // 获取响应体
            String contentAsString = getStringContentByRequest(url, webClient);
            if (redirectUrl.contains("video")) {
                result = getVideoDownloadUrl(webClient, contentAsString, url);
            } else {
                result = getPicDownloadUrl(contentAsString, url);
            }

            CATCH_RESULT_CACHE.put(url, result);
            return result;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
    
    /**
     * 获取图片下载链接
     */
    private static String getPicDownloadUrl(String contentAsString, String url) throws Exception {
        // 模式匹配找图片
        String s = matchAndReturnResult("<script>window.__INITIAL_STATE__=(.*?);\\(function\\(\\)", contentAsString);
        if (null == s) {
            log.warn("图文链接:{} 未找到图文.", url);
            throw new Exception("未找到图文...");
        }

        JSONObject jsonObject = JSON.parseObject(s);
        JSONObject picContent = jsonObject.getJSONObject("readInfo").getJSONObject("opus").getJSONObject("content");
        if (CollectionUtils.isEmpty(picContent)) {
            log.warn("图文链接: {} 没有图片", url);
            throw new Exception("没有图片...");
        }

        JSONArray paragraphs = picContent.getJSONArray("paragraphs");
        List<String> originalPicUrlList = new ArrayList<>();
        for (Object paragraph : paragraphs) {
            JSONObject jsonObj = JSON.parseObject(JSON.toJSONString(paragraph));
            JSONObject picObj = jsonObj.getJSONObject("pic");
            JSONArray pics;
            if (null == picObj || CollectionUtils.isEmpty(pics = picObj.getJSONArray("pics"))) {
                continue;
            }

            for (Object realPicObj : pics) {
                JSONObject realPicJson = JSON.parseObject(JSON.toJSONString(realPicObj));
                String downloadUrl = realPicJson.getString("url");
                if (StringUtils.isNotBlank(downloadUrl)) {
                    originalPicUrlList.add(downloadUrl);
                }
            }
        }

        return JSONObject.toJSONString(downloadPics(originalPicUrlList));
    }
    
    /**
     * 下载图片
     */
    private static List<String> downloadPics(List<String> originalPicUrlList) {
        List<String> showPicUrlList = new ArrayList<>();
        originalPicUrlList.forEach(originalPicUrl -> showPicUrlList.add(downloadPic(originalPicUrl)));
        return showPicUrlList;
    }
    
    /**
     * 获取视频链接数据
     */
    private static String getVideoDownloadUrl(WebClient webClient, String contentAsString, String url) throws Exception {
        // 模式匹配找视频总数
        String s = matchAndReturnResult("<script>window.__INITIAL_STATE__=(.*?);\\(function\\(\\)", contentAsString);
        if (null == s) {
            log.warn("视频链接:{} 未找到视频 s", url);
            throw new Exception("未找到视频...");
        }

        JSONObject jsonObject = JSON.parseObject(s);
        int videoNum = jsonObject.getJSONObject("videoData").getIntValue("videos");
        log.info("视频链接:{} 视频总数" + videoNum, url);

        List<String> videoDownloadUrlList = new ArrayList<>();
        if (videoNum > 1) {
            for (int i = 1; i <= videoNum; i++) {
                // 获取响应体
                String singleSubUrl = url + "?p=" + i;
                contentAsString = getStringContentByRequest(singleSubUrl, webClient);
                doGetVideoDownloadUrl(contentAsString, webClient, singleSubUrl, i, videoDownloadUrlList);
            }
        } else {
            doGetVideoDownloadUrl(contentAsString, webClient, url, 1, videoDownloadUrlList);
        }


        return JSONObject.toJSONString(videoDownloadUrlList);
    }
    
    /**
     * 下载视频数据
     */
    private static void doGetVideoDownloadUrl(String contentAsString, WebClient webClient, String url, int i, List<String> videoDownloadUrlList) throws Exception {
        // 获取视频链接
        String s2 = matchAndReturnResult("<script>window.__playinfo__=(.*?)</script>", contentAsString);
        if (null == s2) {
            log.warn("视频链接:{} 没有找到视频链接 s2", url);
            throw new Exception("没有视频...");
        }

        String videoLink = JSON.parseObject(s2).getJSONObject("data").getJSONObject("dash").getJSONArray("video").getJSONObject(0).getString("baseUrl");
        String audioLink = JSON.parseObject(s2).getJSONObject("data").getJSONObject("dash").getJSONArray("audio").getJSONObject(0).getString("baseUrl");
        log.info("视频链接:{} 视频下载链接 : " + videoLink, url);
        log.info("视频链接:{} 音频下载链接 : " + audioLink, url);

        //目录名去除./&*这些字符
        String videoName = UUID.randomUUID() + "";
        String dir = DIR;
        String videoFile = dir + videoName + ".mp4";
        String audioFile = dir + videoName + ".mp3";

        // 下载视频
        if (Boolean.TRUE.equals(doDownload(videoLink, webClient, videoFile))) {
            // 下载音频
            doDownload(audioLink, webClient, audioFile);
        } else {
            throw new Exception("视频太大了...");
        }


        //整合视频音频
        videoDownloadUrlList.add(integrateFile(videoFile, audioFile, i, videoName));
    }
    
    /**
     * 整合音频视频文件,利用 ffmpeg 命令
     */
    private static String integrateFile(String videoFile, String audioFile, Integer i, String videoName) throws IOException {
        videoName = i + videoName + ".mp4";
        String yyyyMmDd = getYyyyMmDd();
        try {
            String aimFolder = DIR + yyyyMmDd;
            mkDirIfNotExists(aimFolder);
            String finalDir = aimFolder + "/" + videoName;
            CommandLine commandLine = CommandLine.parse("ffmpeg -i " + videoFile + " -i " + audioFile + " -c copy " + finalDir);
            Executor executor = new DefaultExecutor();
            PumpStreamHandler streamHandler = new PumpStreamHandler(System.out, System.err);
            executor.setStreamHandler(streamHandler);
            // 执行命令
            System.out.println(commandLine);
            executor.execute(commandLine);
        } finally {
            deleteFile(audioFile);
            deleteFile(videoFile);
        }

        return DIR + yyyyMmDd + "/" + videoName;
    }
}