一、HtmlUnit 简介
HtmlUnit 是面向 Java 程序的无头(无界面)浏览器。它对 HTML 文档进行建模并提供一套 API,允许你通过代码访问页面、填写表单、点击链接等,从而模拟真实浏览器的操作。
优点:
- 模拟浏览器行为,可以处理JavaScript、CSS等
- 可以用于自动化测试和数据抓取
- 编码易上手
缺点:
- 性能较慢,因为它是在Java虚拟机中运行的
- JS支持不完全
- 如果网站有复杂的客户端验证或者对浏览器有特殊要求,HtmlUnit可能无法处理。例如动态加载
二、HtmlUnit 使用
1、依赖
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.70.0</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging-api</artifactId>
<version>1.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-collections/commons-collections -->
<dependency>
<groupId>commons-collections</groupId>
<artifactId>commons-collections</artifactId>
<version>3.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-exec</artifactId>
<version>1.3</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
2、主要代码 (以 B 站为例,其中主要流程代码和工具类放在同一个类中,方便运行测试,可自行拆分)
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.HttpMethod;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.WebResponse;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.exec.CommandLine;
import org.apache.commons.exec.DefaultExecutor;
import org.apache.commons.exec.Executor;
import org.apache.commons.exec.PumpStreamHandler;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.springframework.util.CollectionUtils;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@Slf4j
public class HtmlUnitClass {
private HtmlUnitClass() {
}
//模拟缓存,对已经拉取的url,直接返回结果
private static final Map<String, String> CATCH_RESULT_CACHE = new ConcurrentHashMap<>();
//定义路径
private static final String DIR = "your dir";
/**
* @description: 匹配数据 返回相关内容
**/
private static String matchAndReturnResult(String regex, String stringContent) {
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(stringContent);
return matcher.find() ? matcher.group(1) : null;
}
private static Boolean doDownload(String link, WebClient webClient, String fileDir) throws IOException {
Page page = webClient.getPage(HtmlunitUtil.getWebRequest(link));
WebResponse webResponse = page.getWebResponse();
try (InputStream inputStream = webResponse.getContentAsStream();
OutputStream outputStream = Files.newOutputStream(Paths.get(fileDir))) {
int copyNumber = IOUtils.copy(inputStream, outputStream);
//限制大小
return copyNumber <= 209715200L;
}
}
public static void assembleWebClient(WebClient webClient) {
webClient.getOptions().setJavaScriptEnabled(false);
webClient.getOptions().setCssEnabled(false);
webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
webClient.getOptions().setThrowExceptionOnScriptError(false);
webClient.addRequestHeader("Referer", "https://www.bilibili.com/index.html");
webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.40");
}
public static WebRequest getWebRequest(String url) throws MalformedURLException {
return new WebRequest(new URL(url), HttpMethod.GET);
}
private static Boolean doDownload(String link, WebClient webClient, String fileDir) throws IOException {
Page page = webClient.getPage(getWebRequest(link));
WebResponse webResponse = page.getWebResponse();
try (InputStream inputStream = webResponse.getContentAsStream();
OutputStream outputStream = Files.newOutputStream(Paths.get(fileDir))) {
int copyNumber = IOUtils.copy(inputStream, outputStream);
//限制拉取大小
return copyNumber <= 209715200L;
}
}
public static String getRedirectUrl(String shortUrl) {
try {
return Jsoup.connect(shortUrl).followRedirects(true).execute().url().toString();
} catch (IOException e) {
log.error("getRedirectUrl failed : ", e);
return null;
}
}
public static String downloadPic(String originalUrl) {
try {
HttpURLConnection connect = getConnectByUrl(originalUrl, "GET", null);
if (null == connect) {
log.info("downloadUrl : {},链接异常", originalUrl);
throw new Exception("获取图片失败...");
}
String[] split = originalUrl.split("\\.");
String yyyyMmDd = getYyyyMmDd();
String resultPicName = UUID.randomUUID() + "." + split[split.length - 1];
doDownload(yyyyMmDd, resultPicName, connect);
return DIR + yyyyMmDd + "/" + resultPicName;
} catch (Exception e) {
log.error("downloadPic failed : ", e);
return null;
}
}
private static void doDownload(String yyyyMmDd, String fileName, HttpURLConnection connect) throws IOException {
String aimFolder = DIR + yyyyMmDd;
mkDirIfNotExists(aimFolder);
String filePath = aimFolder + "/" + fileName;
try (DataInputStream in = new DataInputStream(connect.getInputStream());
DataOutputStream out = new DataOutputStream(Files.newOutputStream(Paths.get(filePath)))) {
byte[] buffer = new byte[2048];
int count;
while ((count = in.read(buffer)) > 0) {
out.write(buffer, 0, count);
}
}
}
public static HttpURLConnection getConnectByUrl(String requestUrl, String requestMethod, Map<String, String> headers) throws IOException {
URL url = getUrl(requestUrl);
HttpURLConnection connection = (HttpURLConnection) url.openConnection();
connection.setRequestMethod(requestMethod);
connection.setRequestProperty("Accept", "application/json");
if (!CollectionUtils.isEmpty(headers)) {
headers.forEach(connection::setRequestProperty);
}
connection.connect();
int responseCode = connection.getResponseCode();
if (responseCode != HttpURLConnection.HTTP_OK) {
log.error("请求链接失败 connectUrlAndReturnBodyString");
return null;
}
return connection;
}
public static URL getUrl(String url) throws MalformedURLException {
return new URL(url);
}
@SuppressWarnings("unchecked")
public static <T> T getStringContentByRequest(String url, WebClient webClient) throws IOException {
// 设置请求参数,建立请求
WebRequest webRequest = getWebRequest(url);
Page page = webClient.getPage(webRequest);
WebResponse webResponse = page.getWebResponse();
return (T) webResponse.getContentAsString();
}
public static Boolean mkDirIfNotExists(String folderPath) {
File folder = new File(folderPath);
if (!folder.exists()) {
return folder.mkdir();
}
return true;
}
public static String getYyyyMmDd() {
LocalDate currentDate = LocalDate.now();
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyyMMdd");
return currentDate.format(formatter);
}
private static void deleteFile(String filePath) {
File file = new File(filePath);
file.delete();
}
/********************* 以上为工具类代码 以下为主要实现代码 *********************/
/**
* 实现入口
*/
public static String catchVideoAndPic(String url) {
String cache = CATCH_RESULT_CACHE.get(url);
if (StringUtils.isNotBlank(cache)) {
return cache;
}
String redirectUrl = getRedirectUrl(url.trim());
if (null == redirectUrl) {
log.error("url 异常");
return null;
}
// 建立无头浏览器
try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
assembleWebClient(webClient);
String result;
// 获取响应体
String contentAsString = getStringContentByRequest(url, webClient);
if (redirectUrl.contains("video")) {
result = getVideoDownloadUrl(webClient, contentAsString, url);
} else {
result = getPicDownloadUrl(contentAsString, url);
}
CATCH_RESULT_CACHE.put(url, result);
return result;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/**
* 获取图片下载链接
*/
private static String getPicDownloadUrl(String contentAsString, String url) throws Exception {
// 模式匹配找图片
String s = matchAndReturnResult("<script>window.__INITIAL_STATE__=(.*?);\\(function\\(\\)", contentAsString);
if (null == s) {
log.warn("图文链接:{} 未找到图文.", url);
throw new Exception("未找到图文...");
}
JSONObject jsonObject = JSON.parseObject(s);
JSONObject picContent = jsonObject.getJSONObject("readInfo").getJSONObject("opus").getJSONObject("content");
if (CollectionUtils.isEmpty(picContent)) {
log.warn("图文链接: {} 没有图片", url);
throw new Exception("没有图片...");
}
JSONArray paragraphs = picContent.getJSONArray("paragraphs");
List<String> originalPicUrlList = new ArrayList<>();
for (Object paragraph : paragraphs) {
JSONObject jsonObj = JSON.parseObject(JSON.toJSONString(paragraph));
JSONObject picObj = jsonObj.getJSONObject("pic");
JSONArray pics;
if (null == picObj || CollectionUtils.isEmpty(pics = picObj.getJSONArray("pics"))) {
continue;
}
for (Object realPicObj : pics) {
JSONObject realPicJson = JSON.parseObject(JSON.toJSONString(realPicObj));
String downloadUrl = realPicJson.getString("url");
if (StringUtils.isNotBlank(downloadUrl)) {
originalPicUrlList.add(downloadUrl);
}
}
}
return JSONObject.toJSONString(downloadPics(originalPicUrlList));
}
/**
* 下载图片
*/
private static List<String> downloadPics(List<String> originalPicUrlList) {
List<String> showPicUrlList = new ArrayList<>();
originalPicUrlList.forEach(originalPicUrl -> showPicUrlList.add(downloadPic(originalPicUrl)));
return showPicUrlList;
}
/**
* 获取视频链接数据
*/
private static String getVideoDownloadUrl(WebClient webClient, String contentAsString, String url) throws Exception {
// 模式匹配找视频总数
String s = matchAndReturnResult("<script>window.__INITIAL_STATE__=(.*?);\\(function\\(\\)", contentAsString);
if (null == s) {
log.warn("视频链接:{} 未找到视频 s", url);
throw new Exception("未找到视频...");
}
JSONObject jsonObject = JSON.parseObject(s);
int videoNum = jsonObject.getJSONObject("videoData").getIntValue("videos");
log.info("视频链接:{} 视频总数" + videoNum, url);
List<String> videoDownloadUrlList = new ArrayList<>();
if (videoNum > 1) {
for (int i = 1; i <= videoNum; i++) {
// 获取响应体
String singleSubUrl = url + "?p=" + i;
contentAsString = getStringContentByRequest(singleSubUrl, webClient);
doGetVideoDownloadUrl(contentAsString, webClient, singleSubUrl, i, videoDownloadUrlList);
}
} else {
doGetVideoDownloadUrl(contentAsString, webClient, url, 1, videoDownloadUrlList);
}
return JSONObject.toJSONString(videoDownloadUrlList);
}
/**
* 下载视频数据
*/
private static void doGetVideoDownloadUrl(String contentAsString, WebClient webClient, String url, int i, List<String> videoDownloadUrlList) throws Exception {
// 获取视频链接
String s2 = matchAndReturnResult("<script>window.__playinfo__=(.*?)</script>", contentAsString);
if (null == s2) {
log.warn("视频链接:{} 没有找到视频链接 s2", url);
throw new Exception("没有视频...");
}
String videoLink = JSON.parseObject(s2).getJSONObject("data").getJSONObject("dash").getJSONArray("video").getJSONObject(0).getString("baseUrl");
String audioLink = JSON.parseObject(s2).getJSONObject("data").getJSONObject("dash").getJSONArray("audio").getJSONObject(0).getString("baseUrl");
log.info("视频链接:{} 视频下载链接 : " + videoLink, url);
log.info("视频链接:{} 音频下载链接 : " + audioLink, url);
//目录名去除./&*这些字符
String videoName = UUID.randomUUID() + "";
String dir = DIR;
String videoFile = dir + videoName + ".mp4";
String audioFile = dir + videoName + ".mp3";
// 下载视频
if (Boolean.TRUE.equals(doDownload(videoLink, webClient, videoFile))) {
// 下载音频
doDownload(audioLink, webClient, audioFile);
} else {
throw new Exception("视频太大了...");
}
//整合视频音频
videoDownloadUrlList.add(integrateFile(videoFile, audioFile, i, videoName));
}
/**
* 整合音频视频文件,利用 ffmpeg 命令
*/
private static String integrateFile(String videoFile, String audioFile, Integer i, String videoName) throws IOException {
videoName = i + videoName + ".mp4";
String yyyyMmDd = getYyyyMmDd();
try {
String aimFolder = DIR + yyyyMmDd;
mkDirIfNotExists(aimFolder);
String finalDir = aimFolder + "/" + videoName;
CommandLine commandLine = CommandLine.parse("ffmpeg -i " + videoFile + " -i " + audioFile + " -c copy " + finalDir);
Executor executor = new DefaultExecutor();
PumpStreamHandler streamHandler = new PumpStreamHandler(System.out, System.err);
executor.setStreamHandler(streamHandler);
// 执行命令
System.out.println(commandLine);
executor.execute(commandLine);
} finally {
deleteFile(audioFile);
deleteFile(videoFile);
}
return DIR + yyyyMmDd + "/" + videoName;
}
}