第九章:商品数据抓取实战
本章字数:约43000字 阅读时间:约140分钟 难度等级:★★★☆☆
声明:本文中的公司名称、包名、API地址、密钥等均已脱敏处理。文中的"梦想世界"、"dreamworld"等均为虚构名称,与任何真实公司无关。
引言
经过前面八章的铺垫,我们已经拥有了完整的安全调用链和生产级的系统架构。现在,是时候将这些技术应用到实际场景中了。
本章将以商品数据抓取为例,展示如何:
- 分析目标API接口
- 设计数据抓取流程
- 实现数据解析和存储
- 处理分页和增量更新
9.1 目标API分析
9.1.1 接口发现
通过分析APP的网络请求,我们发现了以下商品相关的API:
┌─────────────────────────────────────────────────────────────────┐
│ 商品相关API列表 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 接口 方法 说明 │
│ ───────────────────────────────────────────────────────────── │
│ /api/v1/products/list GET 商品列表 │
│ /api/v1/products/{id} GET 商品详情 │
│ /api/v1/products/categories GET 商品分类 │
│ /api/v1/products/search GET 商品搜索 │
│ /api/v1/products/recommend GET 推荐商品 │
│ /api/v1/products/{id}/reviews GET 商品评价 │
│ │
└─────────────────────────────────────────────────────────────────┘
9.1.2 商品列表接口
URL: GET /api/v1/products/list
请求参数:
| 参数 | 类型 | 必填 | 说明 |
|---|---|---|---|
| page | int | 否 | 页码,默认1 |
| size | int | 否 | 每页数量,默认20,最大100 |
| category | string | 否 | 分类ID |
| sort | string | 否 | 排序方式:price_asc, price_desc, sales, newest |
响应示例:
{
"code": 0,
"success": true,
"msg": "success",
"data": {
"total": 1234,
"page": 1,
"size": 20,
"pages": 62,
"list": [
{
"id": "P001",
"name": "智能车载充电器",
"price": 199.00,
"originalPrice": 299.00,
"sales": 5678,
"rating": 4.8,
"reviewCount": 234,
"images": ["https://..."],
"category": "车载电器",
"tags": ["热销", "新品"],
"stock": 100,
"createTime": "2026-01-01T00:00:00Z"
}
]
}
}
9.1.3 商品详情接口
URL: GET /api/v1/products/{id}
响应示例:
{
"code": 0,
"success": true,
"data": {
"id": "P001",
"name": "智能车载充电器",
"description": "详细描述...",
"price": 199.00,
"originalPrice": 299.00,
"sales": 5678,
"rating": 4.8,
"reviewCount": 234,
"images": ["https://..."],
"detailImages": ["https://..."],
"category": {
"id": "C001",
"name": "车载电器"
},
"specs": [
{"name": "颜色", "values": ["黑色", "白色"]},
{"name": "功率", "values": ["30W", "65W"]}
],
"skus": [
{
"id": "SKU001",
"specs": {"颜色": "黑色", "功率": "30W"},
"price": 199.00,
"stock": 50
}
],
"tags": ["热销", "新品"],
"stock": 100,
"createTime": "2026-01-01T00:00:00Z",
"updateTime": "2026-06-01T00:00:00Z"
}
}
9.2 数据模型设计
9.2.1 商品实体
package com.dreamworld.model;
import java.math.BigDecimal;
import java.time.LocalDateTime;
import java.util.List;
/**
 * Product entity filled from the product list and product detail payloads.
 *
 * <p>A plain mutable bean: every property has a getter and a setter, nothing
 * is validated and collections are stored and returned by reference.
 * Field declaration order is kept stable because reflective JSON
 * serializers emit fields in that order.
 */
public class Product {

    private String id;
    private String name;
    private String description;
    private BigDecimal price;
    private BigDecimal originalPrice;
    private int sales;
    private double rating;
    private int reviewCount;
    private List<String> images;
    private List<String> detailImages;
    private Category category;
    private List<Spec> specs;
    private List<Sku> skus;
    private List<String> tags;
    private int stock;
    private LocalDateTime createTime;
    private LocalDateTime updateTime;

    // Crawl metadata: when and from where this record was fetched.
    private LocalDateTime crawlTime;
    private String crawlSource;

    // ---- Getters ----

    public String getId() {
        return id;
    }

    public String getName() {
        return name;
    }

    public String getDescription() {
        return description;
    }

    public BigDecimal getPrice() {
        return price;
    }

    public BigDecimal getOriginalPrice() {
        return originalPrice;
    }

    public int getSales() {
        return sales;
    }

    public double getRating() {
        return rating;
    }

    public int getReviewCount() {
        return reviewCount;
    }

    public List<String> getImages() {
        return images;
    }

    public List<String> getDetailImages() {
        return detailImages;
    }

    public Category getCategory() {
        return category;
    }

    public List<Spec> getSpecs() {
        return specs;
    }

    public List<Sku> getSkus() {
        return skus;
    }

    public List<String> getTags() {
        return tags;
    }

    public int getStock() {
        return stock;
    }

    public LocalDateTime getCreateTime() {
        return createTime;
    }

    public LocalDateTime getUpdateTime() {
        return updateTime;
    }

    public LocalDateTime getCrawlTime() {
        return crawlTime;
    }

    public String getCrawlSource() {
        return crawlSource;
    }

    // ---- Setters ----

    public void setId(String id) {
        this.id = id;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public void setPrice(BigDecimal price) {
        this.price = price;
    }

    public void setOriginalPrice(BigDecimal originalPrice) {
        this.originalPrice = originalPrice;
    }

    public void setSales(int sales) {
        this.sales = sales;
    }

    public void setRating(double rating) {
        this.rating = rating;
    }

    public void setReviewCount(int reviewCount) {
        this.reviewCount = reviewCount;
    }

    public void setImages(List<String> images) {
        this.images = images;
    }

    public void setDetailImages(List<String> detailImages) {
        this.detailImages = detailImages;
    }

    public void setCategory(Category category) {
        this.category = category;
    }

    public void setSpecs(List<Spec> specs) {
        this.specs = specs;
    }

    public void setSkus(List<Sku> skus) {
        this.skus = skus;
    }

    public void setTags(List<String> tags) {
        this.tags = tags;
    }

    public void setStock(int stock) {
        this.stock = stock;
    }

    public void setCreateTime(LocalDateTime createTime) {
        this.createTime = createTime;
    }

    public void setUpdateTime(LocalDateTime updateTime) {
        this.updateTime = updateTime;
    }

    public void setCrawlTime(LocalDateTime crawlTime) {
        this.crawlTime = crawlTime;
    }

    public void setCrawlSource(String crawlSource) {
        this.crawlSource = crawlSource;
    }

    /**
     * Product category (id plus display name).
     */
    public static class Category {
        private String id;
        private String name;

        public String getId() {
            return id;
        }

        public String getName() {
            return name;
        }

        public void setId(String id) {
            this.id = id;
        }

        public void setName(String name) {
            this.name = name;
        }
    }

    /**
     * A specification dimension and the values it can take.
     */
    public static class Spec {
        private String name;
        private List<String> values;

        public String getName() {
            return name;
        }

        public List<String> getValues() {
            return values;
        }

        public void setName(String name) {
            this.name = name;
        }

        public void setValues(List<String> values) {
            this.values = values;
        }
    }

    /**
     * A sellable SKU: one combination of spec values with its own price and stock.
     */
    public static class Sku {
        private String id;
        private java.util.Map<String, String> specs;
        private BigDecimal price;
        private int stock;

        public String getId() {
            return id;
        }

        public java.util.Map<String, String> getSpecs() {
            return specs;
        }

        public BigDecimal getPrice() {
            return price;
        }

        public int getStock() {
            return stock;
        }

        public void setId(String id) {
            this.id = id;
        }

        public void setSpecs(java.util.Map<String, String> specs) {
            this.specs = specs;
        }

        public void setPrice(BigDecimal price) {
            this.price = price;
        }

        public void setStock(int stock) {
            this.stock = stock;
        }
    }
}
9.2.2 数据库表设计
-- Products table: one row per product id.
-- The column list matches the INSERT issued by MySqlProductRepository,
-- which also writes the specs and skus JSON columns; without them the
-- repository's upsert fails on a table created from this script.
CREATE TABLE products (
    id             VARCHAR(32)   PRIMARY KEY,
    name           VARCHAR(200)  NOT NULL,
    description    TEXT,
    price          DECIMAL(10,2) NOT NULL,
    original_price DECIMAL(10,2),
    sales          INT           DEFAULT 0,
    rating         DECIMAL(2,1)  DEFAULT 0,
    review_count   INT           DEFAULT 0,
    images         JSON,
    detail_images  JSON,
    category_id    VARCHAR(32),
    category_name  VARCHAR(100),
    specs          JSON,
    skus           JSON,
    tags           JSON,
    stock          INT           DEFAULT 0,
    create_time    DATETIME,
    update_time    DATETIME,
    crawl_time     DATETIME      NOT NULL,
    crawl_source   VARCHAR(50),
    INDEX idx_category (category_id),
    INDEX idx_price (price),
    INDEX idx_sales (sales),
    INDEX idx_crawl_time (crawl_time),
    -- Supports the repository query that filters and sorts by update_time.
    INDEX idx_update_time (update_time)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- Product specification table: one row per (product, spec name),
-- with the allowed values stored as a JSON array.
CREATE TABLE product_specs (
    id          BIGINT      AUTO_INCREMENT PRIMARY KEY,
    product_id  VARCHAR(32) NOT NULL,
    spec_name   VARCHAR(50) NOT NULL,
    spec_values JSON,
    INDEX idx_product (product_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- SKU table: one row per SKU, with its spec combination stored as JSON.
CREATE TABLE product_skus (
    id         VARCHAR(32)   PRIMARY KEY,
    product_id VARCHAR(32)   NOT NULL,
    specs      JSON,
    price      DECIMAL(10,2) NOT NULL,
    stock      INT           DEFAULT 0,
    INDEX idx_product (product_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- Crawl record table: one row per crawl run with its counters and outcome.
CREATE TABLE crawl_records (
    id            BIGINT      AUTO_INCREMENT PRIMARY KEY,
    task_type     VARCHAR(50) NOT NULL,
    start_time    DATETIME    NOT NULL,
    end_time      DATETIME,
    status        VARCHAR(20) NOT NULL,
    total_count   INT         DEFAULT 0,
    success_count INT         DEFAULT 0,
    fail_count    INT         DEFAULT 0,
    error_message TEXT,
    INDEX idx_task_type (task_type),
    INDEX idx_start_time (start_time)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
9.3 抓取服务实现
9.3.1 商品API客户端
package com.dreamworld.crawler;
import com.dreamworld.model.Product;
import com.dreamworld.network.BusinessApiClient;
import com.dreamworld.utils.LogUtils;
import com.google.gson.*;
import com.google.gson.reflect.TypeToken;
import java.lang.reflect.Type;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
/**
* 商品API客户端
*/
public class ProductApiClient {
private static final String TAG = "ProductApiClient";
private final BusinessApiClient apiClient;
private final Gson gson;
public ProductApiClient(BusinessApiClient apiClient) {
this.apiClient = apiClient;
this.gson = new GsonBuilder()
.registerTypeAdapter(LocalDateTime.class, new LocalDateTimeAdapter())
.create();
}
/**
 * Fetches one page of the product list.
 *
 * <p>{@code category} and {@code sort} are URL-encoded before being appended
 * to the query string, so values containing reserved characters cannot
 * corrupt the request. A response whose {@code list} is missing or null is
 * treated as an empty page rather than a parse failure.
 *
 * @param page     page number to request
 * @param size     page size to request
 * @param category category filter; omitted when null or empty
 * @param sort     sort order; omitted when null or empty
 * @return the parsed page with crawl metadata stamped on every product,
 *         or {@code null} when the request or the parsing fails
 */
public ProductListResult getProductList(int page, int size, String category, String sort) {
    LogUtils.d(TAG, "获取商品列表: page=" + page + ", size=" + size);

    StringBuilder path = new StringBuilder("/api/v1/products/list?")
        .append("page=").append(page)
        .append("&size=").append(size);
    if (category != null && !category.isEmpty()) {
        path.append("&category=").append(
            java.net.URLEncoder.encode(category, java.nio.charset.StandardCharsets.UTF_8));
    }
    if (sort != null && !sort.isEmpty()) {
        path.append("&sort=").append(
            java.net.URLEncoder.encode(sort, java.nio.charset.StandardCharsets.UTF_8));
    }

    BusinessApiClient.ApiResponse response = apiClient.get(path.toString());
    if (response == null || !response.isSuccess()) {
        LogUtils.e(TAG, "获取商品列表失败: " +
            (response != null ? response.getMsg() : "null"));
        return null;
    }

    try {
        JsonObject json = gson.fromJson(response.getRawResponse(), JsonObject.class);
        JsonObject data = json.getAsJsonObject("data");

        ProductListResult result = new ProductListResult();
        result.setTotal(data.get("total").getAsInt());
        result.setPage(data.get("page").getAsInt());
        result.setSize(data.get("size").getAsInt());
        result.setPages(data.get("pages").getAsInt());

        Type listType = new TypeToken<List<Product>>(){}.getType();
        List<Product> products = gson.fromJson(data.get("list"), listType);
        if (products == null) {
            // Gson yields null for an absent or JSON-null "list".
            products = new ArrayList<>();
        }
        result.setList(products);

        // Stamp every product of this page with the same crawl time.
        LocalDateTime now = LocalDateTime.now();
        for (Product product : products) {
            product.setCrawlTime(now);
            product.setCrawlSource("api");
        }
        return result;
    } catch (Exception e) {
        LogUtils.e(TAG, "解析商品列表失败", e);
        return null;
    }
}
/**
 * Fetches the detail record of a single product.
 *
 * <p>The id is percent-encoded as a path segment so that an id containing
 * {@code /}, {@code ?} or similar characters cannot redirect the request
 * to a different endpoint.
 *
 * @param productId id of the product; must be non-empty
 * @return the parsed product with crawl metadata set, or {@code null} when
 *         the id is empty, the request fails, or the payload has no usable
 *         {@code data} object
 */
public Product getProductDetail(String productId) {
    LogUtils.d(TAG, "获取商品详情: " + productId);
    if (productId == null || productId.isEmpty()) {
        LogUtils.e(TAG, "获取商品详情失败: " + "productId为空");
        return null;
    }

    // URLEncoder targets form encoding (space -> '+'); a path segment needs %20.
    String encodedId = java.net.URLEncoder
        .encode(productId, java.nio.charset.StandardCharsets.UTF_8)
        .replace("+", "%20");
    String path = "/api/v1/products/" + encodedId;

    BusinessApiClient.ApiResponse response = apiClient.get(path);
    if (response == null || !response.isSuccess()) {
        LogUtils.e(TAG, "获取商品详情失败: " +
            (response != null ? response.getMsg() : "null"));
        return null;
    }

    try {
        JsonObject json = gson.fromJson(response.getRawResponse(), JsonObject.class);
        JsonObject data = json.getAsJsonObject("data");
        Product product = gson.fromJson(data, Product.class);
        if (product == null) {
            // "data" was absent or JSON null.
            LogUtils.e(TAG, "获取商品详情失败: " + "data为空");
            return null;
        }
        product.setCrawlTime(LocalDateTime.now());
        product.setCrawlSource("api");
        return product;
    } catch (Exception e) {
        LogUtils.e(TAG, "解析商品详情失败", e);
        return null;
    }
}
/**
 * Fetches all product categories.
 *
 * @return the parsed categories; an empty list (never {@code null}) when the
 *         request fails, the payload cannot be parsed, or {@code data} is
 *         absent
 */
public List<Category> getCategories() {
    LogUtils.d(TAG, "获取商品分类");

    BusinessApiClient.ApiResponse response = apiClient.get("/api/v1/products/categories");
    if (response == null || !response.isSuccess()) {
        LogUtils.e(TAG, "获取商品分类失败: " +
            (response != null ? response.getMsg() : "null"));
        return new ArrayList<>();
    }

    try {
        JsonObject json = gson.fromJson(response.getRawResponse(), JsonObject.class);
        Type listType = new TypeToken<List<Category>>(){}.getType();
        List<Category> categories = gson.fromJson(json.get("data"), listType);
        // Gson returns null for a missing or JSON-null "data"; keep the
        // empty-list contract used by the failure paths.
        return categories != null ? categories : new ArrayList<>();
    } catch (Exception e) {
        LogUtils.e(TAG, "解析分类失败", e);
        return new ArrayList<>();
    }
}
/**
 * One page of the product list together with its paging information.
 */
public static class ProductListResult {
    private int total;
    private int page;
    private int size;
    private int pages;
    private List<Product> list;

    public int getTotal() {
        return total;
    }

    public int getPage() {
        return page;
    }

    public int getSize() {
        return size;
    }

    public int getPages() {
        return pages;
    }

    public List<Product> getList() {
        return list;
    }

    public void setTotal(int total) {
        this.total = total;
    }

    public void setPage(int page) {
        this.page = page;
    }

    public void setSize(int size) {
        this.size = size;
    }

    public void setPages(int pages) {
        this.pages = pages;
    }

    public void setList(List<Product> list) {
        this.list = list;
    }
}
/**
 * Category entry as returned by the categories endpoint.
 * Read-only: it exposes getters only and declares no setters.
 */
public static class Category {
    private String id;
    private String name;
    private int productCount;

    public String getId() {
        return this.id;
    }

    public String getName() {
        return this.name;
    }

    public int getProductCount() {
        return this.productCount;
    }
}
/**
 * Gson deserializer turning ISO-8601 date-time strings into {@link LocalDateTime}.
 *
 * <p>When the text carries an offset or zone (for example the trailing
 * {@code Z} in the API samples) the instant is converted to the JVM default
 * time zone, so the value is comparable with the {@code LocalDateTime.now()}
 * timestamps used for crawl metadata. Text without an offset is taken as-is.
 * Unparseable text yields {@code null} and a warning instead of failing the
 * whole payload.
 */
private static class LocalDateTimeAdapter implements JsonDeserializer<LocalDateTime> {
    private static final DateTimeFormatter FORMATTER =
        DateTimeFormatter.ISO_DATE_TIME;

    @Override
    public LocalDateTime deserialize(JsonElement json, Type typeOfT,
            JsonDeserializationContext context) throws JsonParseException {
        try {
            // Prefer a zoned result; fall back to a plain local date-time
            // when the text has no offset.
            java.time.temporal.TemporalAccessor parsed = FORMATTER.parseBest(
                json.getAsString(), java.time.ZonedDateTime::from, LocalDateTime::from);
            if (parsed instanceof java.time.ZonedDateTime) {
                return ((java.time.ZonedDateTime) parsed)
                    .withZoneSameInstant(java.time.ZoneId.systemDefault())
                    .toLocalDateTime();
            }
            return (LocalDateTime) parsed;
        } catch (Exception e) {
            // Best effort: one bad timestamp must not discard the product.
            LogUtils.w(TAG, "无法解析时间: " + json);
            return null;
        }
    }
}
}
9.3.2 抓取任务
package com.dreamworld.crawler;
import com.dreamworld.model.Product;
import com.dreamworld.storage.ProductRepository;
import com.dreamworld.utils.LogUtils;
import java.time.LocalDateTime;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
/**
* 商品抓取任务
*/
public class ProductCrawlTask {
private static final String TAG = "ProductCrawlTask";
private final ProductApiClient apiClient;
private final ProductRepository repository;
// 配置
private int pageSize = 50;
private int maxRetries = 3;
private long retryDelayMs = 1000;
private long requestDelayMs = 500; // 请求间隔,避免被限流
// 统计
private final AtomicInteger totalCount = new AtomicInteger(0);
private final AtomicInteger successCount = new AtomicInteger(0);
private final AtomicInteger failCount = new AtomicInteger(0);
/**
 * Creates a crawl task.
 *
 * @param apiClient  client used to fetch product data
 * @param repository storage that receives every fetched product
 */
public ProductCrawlTask(ProductApiClient apiClient, ProductRepository repository) {
    this.repository = repository;
    this.apiClient = apiClient;
}
/**
 * Runs a full crawl: fetches page 1 to learn the page count, then walks the
 * remaining pages and stores every product.
 *
 * <p>Pages that still fail after the retries are counted, and the result is
 * marked as failed when any page was lost. The request delay is applied
 * before every request after the first, and the crawl stops early when the
 * thread is interrupted.
 *
 * @return a summary of the run; {@code isSuccess()} is false when the first
 *         page could not be fetched, a later page was lost, the crawl was
 *         interrupted, or an unexpected exception occurred
 */
public CrawlResult crawlAll() {
    LogUtils.separator("开始全量抓取商品数据");
    LocalDateTime startTime = LocalDateTime.now();
    resetCounters();

    try {
        // The first page tells us how many pages there are.
        ProductApiClient.ProductListResult firstPage =
            fetchWithRetry(1, pageSize, null, null);
        if (firstPage == null) {
            return createResult(startTime, "获取第一页失败");
        }

        int totalPages = firstPage.getPages();
        int total = firstPage.getTotal();
        LogUtils.i(TAG, "总商品数: " + total + ", 总页数: " + totalPages);

        if (firstPage.getList() != null) {
            saveProducts(firstPage.getList());
        }

        int failedPages = 0;
        for (int page = 2; page <= totalPages; page++) {
            // Throttle before each request so page 2 is delayed as well.
            sleep(requestDelayMs);
            if (Thread.currentThread().isInterrupted()) {
                return createResult(startTime,
                    "抓取被中断: 已完成 " + (page - 1) + "/" + totalPages + " 页");
            }

            LogUtils.d(TAG, "抓取第 " + page + "/" + totalPages + " 页");
            ProductApiClient.ProductListResult result =
                fetchWithRetry(page, pageSize, null, null);
            if (result != null && result.getList() != null) {
                saveProducts(result.getList());
            } else {
                failedPages++;
                LogUtils.e(TAG, "抓取第 " + page + " 页失败");
            }
        }

        String error = failedPages > 0 ? failedPages + " 页抓取失败" : null;
        return createResult(startTime, error);
    } catch (Exception e) {
        LogUtils.e(TAG, "抓取异常", e);
        // getMessage() may be null, which would be reported as success.
        String message = e.getMessage() != null ? e.getMessage() : e.toString();
        return createResult(startTime, message);
    }
}
/**
 * Runs an incremental crawl: walks the list sorted by {@code newest} and
 * stores products changed after {@code since}, stopping at the first page
 * that contains no such product.
 *
 * <p>A product's change time is its {@code updateTime}, falling back to
 * {@code createTime} when the payload carries no update time (the list
 * sample in this chapter has only {@code createTime}).
 * NOTE(review): this relies on {@code newest} ordering surfacing changed
 * products first; confirm against the API.
 *
 * @param since lower bound (exclusive) for the change time; must not be null
 * @return a summary of the run; {@code isSuccess()} is false when
 *         {@code since} is null, a page could not be fetched, the crawl was
 *         interrupted, or an unexpected exception occurred
 */
public CrawlResult crawlIncremental(LocalDateTime since) {
    LogUtils.separator("开始增量抓取商品数据");
    LogUtils.i(TAG, "增量时间点: " + since);
    LocalDateTime startTime = LocalDateTime.now();
    resetCounters();

    if (since == null) {
        return createResult(startTime, "增量时间点不能为空");
    }

    try {
        int page = 1;
        while (true) {
            ProductApiClient.ProductListResult result =
                fetchWithRetry(page, pageSize, null, "newest");
            if (result == null) {
                // A fetch failure is not the same as "no more data".
                return createResult(startTime, "获取第 " + page + " 页失败");
            }
            if (result.getList() == null || result.getList().isEmpty()) {
                break;
            }

            List<Product> newProducts = result.getList().stream()
                .filter(p -> {
                    LocalDateTime changedAt =
                        p.getUpdateTime() != null ? p.getUpdateTime() : p.getCreateTime();
                    return changedAt != null && changedAt.isAfter(since);
                })
                .collect(java.util.stream.Collectors.toList());
            if (newProducts.isEmpty()) {
                // Nothing newer on this page: the remaining pages are older.
                break;
            }
            saveProducts(newProducts);

            if (page >= result.getPages()) {
                // Last page reached; avoid requesting past the end.
                break;
            }
            page++;

            sleep(requestDelayMs);
            if (Thread.currentThread().isInterrupted()) {
                return createResult(startTime, "抓取被中断");
            }
        }
        return createResult(startTime, null);
    } catch (Exception e) {
        LogUtils.e(TAG, "增量抓取异常", e);
        // getMessage() may be null, which would be reported as success.
        String message = e.getMessage() != null ? e.getMessage() : e.toString();
        return createResult(startTime, message);
    }
}
/**
 * Fetches and stores the detail record of each given product id.
 *
 * <p>The request delay is applied after every id, including ids whose fetch
 * or save threw, and the loop stops when the thread is interrupted.
 *
 * @param productIds ids to fetch; {@code null} or empty yields an empty,
 *                   successful result
 * @return a summary of the run; the total is the number of ids requested
 */
public CrawlResult crawlDetails(List<String> productIds) {
    LogUtils.separator("开始抓取商品详情");
    int pending = productIds == null ? 0 : productIds.size();
    LogUtils.i(TAG, "待抓取数量: " + pending);
    LocalDateTime startTime = LocalDateTime.now();
    resetCounters();
    totalCount.set(pending);
    if (pending == 0) {
        return createResult(startTime, null);
    }

    String error = null;
    for (String productId : productIds) {
        if (Thread.currentThread().isInterrupted()) {
            error = "抓取被中断";
            break;
        }
        try {
            Product product = apiClient.getProductDetail(productId);
            if (product != null) {
                repository.saveProduct(product);
                successCount.incrementAndGet();
            } else {
                failCount.incrementAndGet();
            }
        } catch (Exception e) {
            LogUtils.e(TAG, "抓取详情失败: " + productId, e);
            failCount.incrementAndGet();
        }
        // Throttle on success and on failure alike.
        sleep(requestDelayMs);
    }
    return createResult(startTime, error);
}
/**
 * Fetches one list page, retrying on failure with a linearly growing delay.
 *
 * <p>At least one attempt is always made, even when {@code maxRetries} is
 * configured as zero or negative. No delay follows the final attempt, and
 * retrying stops once the thread is interrupted.
 *
 * @param page     page number to request
 * @param size     page size to request
 * @param category category filter, may be null
 * @param sort     sort order, may be null
 * @return the page, or {@code null} when every attempt failed
 */
private ProductApiClient.ProductListResult fetchWithRetry(
        int page, int size, String category, String sort) {
    int attempts = Math.max(1, maxRetries);
    for (int attempt = 1; attempt <= attempts; attempt++) {
        try {
            ProductApiClient.ProductListResult result =
                apiClient.getProductList(page, size, category, sort);
            if (result != null) {
                return result;
            }
            // A null result is a failed attempt too; make it visible.
            LogUtils.w(TAG, "获取失败,重试 " + attempt + "/" + attempts);
        } catch (Exception e) {
            LogUtils.w(TAG, "获取失败,重试 " + attempt + "/" + attempts + ": " + e);
        }

        if (attempt == attempts || Thread.currentThread().isInterrupted()) {
            break;
        }
        sleep(retryDelayMs * attempt);
    }
    return null;
}
/**
 * Stores each product individually and updates the counters: success or
 * failure per product, then the running total.
 */
private void saveProducts(List<Product> products) {
    products.forEach(item -> {
        try {
            repository.saveProduct(item);
            successCount.incrementAndGet();
        } catch (Exception saveError) {
            LogUtils.e(TAG, "保存商品失败: " + item.getId(), saveError);
            failCount.incrementAndGet();
        }
        totalCount.incrementAndGet();
    });
}
/** Resets the total, success and failure counters to zero. */
private void resetCounters() {
    for (AtomicInteger counter : List.of(totalCount, successCount, failCount)) {
        counter.set(0);
    }
}
/**
 * Pauses the current thread for {@code ms} milliseconds.
 * An interrupt ends the pause early and the interrupt flag is set again so
 * callers can observe it.
 */
private void sleep(long ms) {
    try {
        Thread.sleep(ms);
    } catch (InterruptedException interrupted) {
        Thread currentThread = Thread.currentThread();
        currentThread.interrupt();
    }
}
/**
 * Builds the run summary from the current counters and logs it.
 *
 * @param startTime when the run started
 * @param error     error description, or {@code null}; the result counts as
 *                  successful exactly when this is {@code null}
 * @return the populated summary
 */
private CrawlResult createResult(LocalDateTime startTime, String error) {
    CrawlResult summary = new CrawlResult();
    summary.setStartTime(startTime);
    LocalDateTime finishedAt = LocalDateTime.now();
    summary.setEndTime(finishedAt);
    summary.setTotalCount(totalCount.get());
    summary.setSuccessCount(successCount.get());
    summary.setFailCount(failCount.get());
    summary.setError(error);
    boolean succeeded = (error == null);
    summary.setSuccess(succeeded);

    LogUtils.separator("抓取完成");
    LogUtils.i(TAG, "总数: " + summary.getTotalCount());
    LogUtils.i(TAG, "成功: " + summary.getSuccessCount());
    LogUtils.i(TAG, "失败: " + summary.getFailCount());
    long elapsedSeconds = java.time.Duration.between(startTime, summary.getEndTime()).toSeconds();
    LogUtils.i(TAG, "耗时: " + elapsedSeconds + "秒");
    return summary;
}
/**
 * Summary of one crawl run: time window, counters and outcome.
 */
public static class CrawlResult {
    private LocalDateTime startTime;
    private LocalDateTime endTime;
    private int totalCount;
    private int successCount;
    private int failCount;
    private String error;
    private boolean success;

    public LocalDateTime getStartTime() {
        return startTime;
    }

    public LocalDateTime getEndTime() {
        return endTime;
    }

    public int getTotalCount() {
        return totalCount;
    }

    public int getSuccessCount() {
        return successCount;
    }

    public int getFailCount() {
        return failCount;
    }

    public String getError() {
        return error;
    }

    public boolean isSuccess() {
        return success;
    }

    public void setStartTime(LocalDateTime startTime) {
        this.startTime = startTime;
    }

    public void setEndTime(LocalDateTime endTime) {
        this.endTime = endTime;
    }

    public void setTotalCount(int totalCount) {
        this.totalCount = totalCount;
    }

    public void setSuccessCount(int successCount) {
        this.successCount = successCount;
    }

    public void setFailCount(int failCount) {
        this.failCount = failCount;
    }

    public void setError(String error) {
        this.error = error;
    }

    public void setSuccess(boolean success) {
        this.success = success;
    }
}
// ---- Configuration setters (values are stored as given, without validation) ----

/** Sets the page size requested from the list endpoint. */
public void setPageSize(int pageSize) {
    this.pageSize = pageSize;
}

/** Sets how many attempts a list-page fetch makes. */
public void setMaxRetries(int maxRetries) {
    this.maxRetries = maxRetries;
}

/** Sets the base delay between retries, in milliseconds. */
public void setRetryDelayMs(long retryDelayMs) {
    this.retryDelayMs = retryDelayMs;
}

/** Sets the delay between consecutive requests, in milliseconds. */
public void setRequestDelayMs(long requestDelayMs) {
    this.requestDelayMs = requestDelayMs;
}
}
9.4 数据存储实现
9.4.1 商品仓储接口
package com.dreamworld.storage;
import com.dreamworld.model.Product;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Optional;
/**
 * Storage abstraction for crawled products.
 */
public interface ProductRepository {

    // ---- Writes ----

    /**
     * Saves a single product.
     *
     * @param product the product to store
     */
    void saveProduct(Product product);

    /**
     * Saves several products in one call.
     *
     * @param products the products to store
     */
    void saveProducts(List<Product> products);

    // ---- Reads ----

    /**
     * Looks up a product by its id.
     *
     * @param id product id
     * @return the product, or an empty Optional when none is stored under that id
     */
    Optional<Product> findById(String id);

    /**
     * Returns every stored product.
     */
    List<Product> findAll();

    /**
     * Returns one page of stored products.
     *
     * @param page page number
     * @param size number of products per page
     */
    List<Product> findByPage(int page, int size);

    /**
     * Returns the products belonging to the given category.
     *
     * @param categoryId category id
     */
    List<Product> findByCategory(String categoryId);

    /**
     * Returns the products updated after the given time.
     *
     * @param time lower bound for the update time
     */
    List<Product> findUpdatedAfter(LocalDateTime time);

    /**
     * Returns the number of stored products.
     */
    long count();

    // ---- Deletes ----

    /**
     * Deletes the product with the given id.
     *
     * @param id product id
     */
    void deleteById(String id);

    /**
     * Deletes all stored products.
     */
    void deleteAll();
}
9.4.2 MySQL实现
package com.dreamworld.storage;
import com.dreamworld.model.Product;
import com.dreamworld.utils.LogUtils;
import com.google.gson.Gson;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import java.sql.*;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
/**
* MySQL商品仓储实现
*/
public class MySqlProductRepository implements ProductRepository {
private static final String TAG = "MySqlProductRepository";
private final HikariDataSource dataSource;
private final Gson gson = new Gson();
/**
 * Creates the repository: builds a HikariCP connection pool for the given
 * database and then makes sure the products table exists.
 *
 * @param jdbcUrl  JDBC URL of the MySQL database
 * @param username database user
 * @param password database password
 */
public MySqlProductRepository(String jdbcUrl, String username, String password) {
    HikariConfig poolSettings = new HikariConfig();

    // Connection target and credentials.
    poolSettings.setJdbcUrl(jdbcUrl);
    poolSettings.setUsername(username);
    poolSettings.setPassword(password);

    // Pool sizing.
    poolSettings.setMaximumPoolSize(10);
    poolSettings.setMinimumIdle(2);

    // Timeouts, in milliseconds.
    poolSettings.setConnectionTimeout(30_000);
    poolSettings.setIdleTimeout(600_000);
    poolSettings.setMaxLifetime(1_800_000);

    this.dataSource = new HikariDataSource(poolSettings);

    // Create the table structure if it is missing.
    initTables();
}
/**
 * Creates the products table when it does not exist yet.
 * A failure is logged and not propagated.
 */
private void initTables() {
    final String ddl = """
        CREATE TABLE IF NOT EXISTS products (
        id VARCHAR(32) PRIMARY KEY,
        name VARCHAR(200) NOT NULL,
        description TEXT,
        price DECIMAL(10,2) NOT NULL,
        original_price DECIMAL(10,2),
        sales INT DEFAULT 0,
        rating DECIMAL(2,1) DEFAULT 0,
        review_count INT DEFAULT 0,
        images JSON,
        detail_images JSON,
        category_id VARCHAR(32),
        category_name VARCHAR(100),
        specs JSON,
        skus JSON,
        tags JSON,
        stock INT DEFAULT 0,
        create_time DATETIME,
        update_time DATETIME,
        crawl_time DATETIME NOT NULL,
        crawl_source VARCHAR(50),
        INDEX idx_category (category_id),
        INDEX idx_price (price),
        INDEX idx_sales (sales),
        INDEX idx_crawl_time (crawl_time),
        INDEX idx_update_time (update_time)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
        """;

    try (Connection connection = dataSource.getConnection();
         Statement statement = connection.createStatement()) {
        statement.execute(ddl);
        LogUtils.i(TAG, "数据库表初始化完成");
    } catch (SQLException sqlError) {
        LogUtils.e(TAG, "初始化表失败", sqlError);
    }
}
/**
 * Inserts the product, or refreshes the existing row with the same id.
 * On conflict every column except {@code id} and {@code create_time} is
 * overwritten with the new values.
 *
 * @param product the product to store
 * @throws RuntimeException wrapping the SQLException when the statement fails
 */
@Override
public void saveProduct(Product product) {
    final String upsertSql = """
        INSERT INTO products (
        id, name, description, price, original_price, sales, rating,
        review_count, images, detail_images, category_id, category_name,
        specs, skus, tags, stock, create_time, update_time, crawl_time, crawl_source
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON DUPLICATE KEY UPDATE
        name = VALUES(name),
        description = VALUES(description),
        price = VALUES(price),
        original_price = VALUES(original_price),
        sales = VALUES(sales),
        rating = VALUES(rating),
        review_count = VALUES(review_count),
        images = VALUES(images),
        detail_images = VALUES(detail_images),
        category_id = VALUES(category_id),
        category_name = VALUES(category_name),
        specs = VALUES(specs),
        skus = VALUES(skus),
        tags = VALUES(tags),
        stock = VALUES(stock),
        update_time = VALUES(update_time),
        crawl_time = VALUES(crawl_time),
        crawl_source = VALUES(crawl_source)
        """;

    try (Connection connection = dataSource.getConnection();
         PreparedStatement statement = connection.prepareStatement(upsertSql)) {
        setProductParams(statement, product);
        statement.executeUpdate();
    } catch (SQLException sqlError) {
        LogUtils.e(TAG, "保存商品失败: " + product.getId(), sqlError);
        throw new RuntimeException("保存商品失败", sqlError);
    }
}
/**
 * Inserts the products as one JDBC batch inside a single transaction.
 *
 * <p>When any statement fails the transaction is rolled back explicitly, so
 * no partial batch is committed, and the connection's previous auto-commit
 * mode is restored before it returns to the pool.
 * NOTE(review): on a duplicate id only name, price, sales, stock and
 * crawl_time are refreshed, unlike {@code saveProduct}; confirm this is
 * intended.
 *
 * @param products products to store; {@code null} or empty is a no-op
 * @throws RuntimeException wrapping the SQLException when the batch fails
 */
@Override
public void saveProducts(List<Product> products) {
    if (products == null || products.isEmpty()) {
        return;
    }

    String sql = """
        INSERT INTO products (
        id, name, description, price, original_price, sales, rating,
        review_count, images, detail_images, category_id, category_name,
        specs, skus, tags, stock, create_time, update_time, crawl_time, crawl_source
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON DUPLICATE KEY UPDATE
        name = VALUES(name),
        price = VALUES(price),
        sales = VALUES(sales),
        stock = VALUES(stock),
        crawl_time = VALUES(crawl_time)
        """;

    try (Connection conn = dataSource.getConnection()) {
        boolean previousAutoCommit = conn.getAutoCommit();
        conn.setAutoCommit(false);
        try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
            for (Product product : products) {
                setProductParams(pstmt, product);
                pstmt.addBatch();
            }
            pstmt.executeBatch();
            conn.commit();
        } catch (SQLException | RuntimeException e) {
            // Undo whatever part of the batch was executed.
            try {
                conn.rollback();
            } catch (SQLException rollbackError) {
                e.addSuppressed(rollbackError);
            }
            throw e;
        } finally {
            try {
                conn.setAutoCommit(previousAutoCommit);
            } catch (SQLException restoreError) {
                LogUtils.e(TAG, "恢复自动提交失败", restoreError);
            }
        }
        LogUtils.d(TAG, "批量保存 " + products.size() + " 个商品");
    } catch (SQLException e) {
        LogUtils.e(TAG, "批量保存商品失败", e);
        throw new RuntimeException("批量保存商品失败", e);
    }
}
/**
 * Binds the twenty insert parameters in column order.
 * List-valued properties are bound as their JSON text; a missing category
 * binds SQL NULL for both category columns.
 */
private void setProductParams(PreparedStatement pstmt, Product product) throws SQLException {
    int idx = 0;
    pstmt.setString(++idx, product.getId());
    pstmt.setString(++idx, product.getName());
    pstmt.setString(++idx, product.getDescription());
    pstmt.setBigDecimal(++idx, product.getPrice());
    pstmt.setBigDecimal(++idx, product.getOriginalPrice());
    pstmt.setInt(++idx, product.getSales());
    pstmt.setDouble(++idx, product.getRating());
    pstmt.setInt(++idx, product.getReviewCount());
    pstmt.setString(++idx, gson.toJson(product.getImages()));
    pstmt.setString(++idx, gson.toJson(product.getDetailImages()));

    // Parameters 11 and 12: category id and name.
    if (product.getCategory() == null) {
        pstmt.setNull(++idx, Types.VARCHAR);
        pstmt.setNull(++idx, Types.VARCHAR);
    } else {
        pstmt.setString(++idx, product.getCategory().getId());
        pstmt.setString(++idx, product.getCategory().getName());
    }

    pstmt.setString(++idx, gson.toJson(product.getSpecs()));
    pstmt.setString(++idx, gson.toJson(product.getSkus()));
    pstmt.setString(++idx, gson.toJson(product.getTags()));
    pstmt.setInt(++idx, product.getStock());
    pstmt.setTimestamp(++idx, toTimestamp(product.getCreateTime()));
    pstmt.setTimestamp(++idx, toTimestamp(product.getUpdateTime()));
    pstmt.setTimestamp(++idx, toTimestamp(product.getCrawlTime()));
    pstmt.setString(++idx, product.getCrawlSource());
}
/**
 * Loads the product with the given id.
 *
 * @return the product, or an empty Optional when no row matches or the
 *         query fails (the failure is logged)
 */
@Override
public Optional<Product> findById(String id) {
    Optional<Product> found = Optional.empty();
    final String query = "SELECT * FROM products WHERE id = ?";

    try (Connection connection = dataSource.getConnection();
         PreparedStatement statement = connection.prepareStatement(query)) {
        statement.setString(1, id);
        try (ResultSet rows = statement.executeQuery()) {
            if (rows.next()) {
                found = Optional.of(mapProduct(rows));
            }
        }
    } catch (SQLException sqlError) {
        LogUtils.e(TAG, "查询商品失败: " + id, sqlError);
    }
    return found;
}
/**
 * Returns every stored product by requesting the first page with the
 * largest possible page size.
 */
@Override
public List<Product> findAll() {
    final int firstPage = 1;
    final int unlimited = Integer.MAX_VALUE;
    return findByPage(firstPage, unlimited);
}
/**
 * Returns one page of products, most recently crawled first.
 *
 * <p>Rows are ordered by {@code crawl_time DESC} with {@code id} as a
 * tie-breaker: all products saved from one list page share the same crawl
 * time, so without the tie-breaker consecutive pages could repeat or skip
 * rows. The offset is computed as a {@code long} so large page/size
 * combinations cannot overflow.
 *
 * @param page 1-based page number
 * @param size number of products per page
 * @return the page; empty when {@code page} or {@code size} is below 1 or
 *         the query fails (the failure is logged)
 */
@Override
public List<Product> findByPage(int page, int size) {
    List<Product> products = new ArrayList<>();
    if (page < 1 || size < 1) {
        LogUtils.w(TAG, "分页参数非法: page=" + page + ", size=" + size);
        return products;
    }

    String sql = "SELECT * FROM products ORDER BY crawl_time DESC, id LIMIT ? OFFSET ?";
    long offset = (long) (page - 1) * size;

    try (Connection conn = dataSource.getConnection();
         PreparedStatement pstmt = conn.prepareStatement(sql)) {
        pstmt.setInt(1, size);
        pstmt.setLong(2, offset);
        try (ResultSet rs = pstmt.executeQuery()) {
            while (rs.next()) {
                products.add(mapProduct(rs));
            }
        }
    } catch (SQLException e) {
        LogUtils.e(TAG, "分页查询失败", e);
    }
    return products;
}
/**
 * Returns the products of one category, best-selling first.
 *
 * @return the matching products; empty when none match or the query fails
 *         (the failure is logged)
 */
@Override
public List<Product> findByCategory(String categoryId) {
    final String query = "SELECT * FROM products WHERE category_id = ? ORDER BY sales DESC";
    List<Product> matches = new ArrayList<>();

    try (Connection connection = dataSource.getConnection();
         PreparedStatement statement = connection.prepareStatement(query)) {
        statement.setString(1, categoryId);
        try (ResultSet rows = statement.executeQuery()) {
            while (rows.next()) {
                Product row = mapProduct(rows);
                matches.add(row);
            }
        }
    } catch (SQLException sqlError) {
        LogUtils.e(TAG, "按分类查询失败", sqlError);
    }
    return matches;
}
/**
 * Returns the products whose update time is later than {@code time},
 * most recently updated first.
 *
 * @return the matching products; empty when none match or the query fails
 *         (the failure is logged)
 */
@Override
public List<Product> findUpdatedAfter(LocalDateTime time) {
    final String query = "SELECT * FROM products WHERE update_time > ? ORDER BY update_time DESC";
    List<Product> matches = new ArrayList<>();

    try (Connection connection = dataSource.getConnection();
         PreparedStatement statement = connection.prepareStatement(query)) {
        Timestamp lowerBound = toTimestamp(time);
        statement.setTimestamp(1, lowerBound);
        try (ResultSet rows = statement.executeQuery()) {
            while (rows.next()) {
                Product row = mapProduct(rows);
                matches.add(row);
            }
        }
    } catch (SQLException sqlError) {
        LogUtils.e(TAG, "查询更新商品失败", sqlError);
    }
    return matches;
}
/**
 * Counts the stored products.
 *
 * @return the row count, or 0 when the query fails (the failure is logged)
 */
@Override
public long count() {
    long total = 0;
    final String query = "SELECT COUNT(*) FROM products";

    try (Connection connection = dataSource.getConnection();
         Statement statement = connection.createStatement();
         ResultSet rows = statement.executeQuery(query)) {
        if (rows.next()) {
            total = rows.getLong(1);
        }
    } catch (SQLException sqlError) {
        LogUtils.e(TAG, "统计商品数量失败", sqlError);
    }
    return total;
}
/**
 * Deletes the product with the given id.
 * A failure is logged and not propagated.
 */
@Override
public void deleteById(String id) {
    final String deleteSql = "DELETE FROM products WHERE id = ?";

    try (Connection connection = dataSource.getConnection();
         PreparedStatement statement = connection.prepareStatement(deleteSql)) {
        statement.setString(1, id);
        statement.executeUpdate();
    } catch (SQLException sqlError) {
        LogUtils.e(TAG, "删除商品失败: " + id, sqlError);
    }
}
/**
 * Removes every product by truncating the table.
 * A failure is logged and not propagated.
 */
@Override
public void deleteAll() {
    final String truncateSql = "TRUNCATE TABLE products";

    try (Connection connection = dataSource.getConnection();
         Statement statement = connection.createStatement()) {
        statement.execute(truncateSql);
        LogUtils.i(TAG, "已清空所有商品数据");
    } catch (SQLException sqlError) {
        LogUtils.e(TAG, "清空商品数据失败", sqlError);
    }
}
/**
 * Converts the current row of the result set into a {@link Product}.
 * JSON columns are parsed back into lists; a category is attached only when
 * the row has a category id.
 */
private Product mapProduct(ResultSet rs) throws SQLException {
    // Target types for the JSON columns.
    java.lang.reflect.Type stringList =
        new com.google.gson.reflect.TypeToken<List<String>>(){}.getType();
    java.lang.reflect.Type specList =
        new com.google.gson.reflect.TypeToken<List<Product.Spec>>(){}.getType();
    java.lang.reflect.Type skuList =
        new com.google.gson.reflect.TypeToken<List<Product.Sku>>(){}.getType();

    Product mapped = new Product();
    mapped.setId(rs.getString("id"));
    mapped.setName(rs.getString("name"));
    mapped.setDescription(rs.getString("description"));
    mapped.setPrice(rs.getBigDecimal("price"));
    mapped.setOriginalPrice(rs.getBigDecimal("original_price"));
    mapped.setSales(rs.getInt("sales"));
    mapped.setRating(rs.getDouble("rating"));
    mapped.setReviewCount(rs.getInt("review_count"));

    List<String> images = gson.fromJson(rs.getString("images"), stringList);
    mapped.setImages(images);
    List<String> detailImages = gson.fromJson(rs.getString("detail_images"), stringList);
    mapped.setDetailImages(detailImages);

    String categoryId = rs.getString("category_id");
    String categoryName = rs.getString("category_name");
    if (categoryId != null) {
        Product.Category category = new Product.Category();
        category.setId(categoryId);
        category.setName(categoryName);
        mapped.setCategory(category);
    }

    List<Product.Spec> specs = gson.fromJson(rs.getString("specs"), specList);
    mapped.setSpecs(specs);
    List<Product.Sku> skus = gson.fromJson(rs.getString("skus"), skuList);
    mapped.setSkus(skus);
    List<String> tags = gson.fromJson(rs.getString("tags"), stringList);
    mapped.setTags(tags);

    mapped.setStock(rs.getInt("stock"));
    mapped.setCreateTime(toLocalDateTime(rs.getTimestamp("create_time")));
    mapped.setUpdateTime(toLocalDateTime(rs.getTimestamp("update_time")));
    mapped.setCrawlTime(toLocalDateTime(rs.getTimestamp("crawl_time")));
    mapped.setCrawlSource(rs.getString("crawl_source"));
    return mapped;
}
/** Converts a local date-time to a SQL timestamp; {@code null} stays {@code null}. */
private Timestamp toTimestamp(LocalDateTime time) {
    if (time == null) {
        return null;
    }
    return Timestamp.valueOf(time);
}
/** Converts a SQL timestamp to a local date-time; {@code null} stays {@code null}. */
private LocalDateTime toLocalDateTime(Timestamp timestamp) {
    if (timestamp == null) {
        return null;
    }
    return timestamp.toLocalDateTime();
}
/**
 * Closes the connection pool unless it is absent or already closed.
 */
public void close() {
    if (dataSource == null || dataSource.isClosed()) {
        return;
    }
    dataSource.close();
}
}
9.4.3 JSON文件存储实现
对于轻量级场景或测试环境,我们也提供了基于JSON文件的存储实现:
package com.dreamworld.storage;
import com.dreamworld.model.Product;
import com.dreamworld.utils.LogUtils;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.reflect.TypeToken;
import java.io.*;
import java.lang.reflect.Type;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDateTime;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
/**
* JSON文件商品仓储实现
* 适用于轻量级场景和测试环境
*/
public class JsonFileProductRepository implements ProductRepository {
private static final String TAG = "JsonFileRepository";
private final Path dataDir;
private final Path indexFile;
private final Gson gson;
// 内存索引
private final Map<String, Product> productIndex = new ConcurrentHashMap<>();
public JsonFileProductRepository(String dataPath) {
this.dataDir = Path.of(dataPath);
this.indexFile = dataDir.resolve("products_index.json");
this.gson = new GsonBuilder()
.setPrettyPrinting()
.setDateFormat("yyyy-MM-dd'T'HH:mm:ss")
.create();
initStorage();
}
/**
 * Creates the data directory when it is missing and then loads the index.
 * An I/O failure is logged and not propagated.
 */
private void initStorage() {
    try {
        boolean directoryMissing = !Files.exists(dataDir);
        if (directoryMissing) {
            Files.createDirectories(dataDir);
            LogUtils.i(TAG, "创建数据目录: " + dataDir);
        }

        loadIndex();
    } catch (IOException ioError) {
        LogUtils.e(TAG, "初始化存储失败", ioError);
    }
}
private void loadIndex() {
if (!Files.exists(indexFile)) {
return;
}
try (Reader reader = Files.newBufferedReader(indexFile, StandardCharsets.UTF_8)) {
Type type = new TypeToken<Map<String, Product>>(){}.getType();
Map<String, Product> loaded = gson.fromJson(reader, type);
if (loaded != null) {
productIndex.putAll(loaded);
LogUtils.i(TAG, "加载 " + productIndex.size() + " 个商品索引");
}
} catch (Exception e) {
LogUtils.e(TAG, "加载索引失败", e);
}
}
private void saveIndex() {
try (Writer writer = Files.newBufferedWriter(indexFile, StandardCharsets.UTF_8)) {
gson.toJson(productIndex, writer);
} catch (IOException e) {
LogUtils.e(TAG, "保存索引失败", e);
}
}
@Override
public void saveProduct(Product product) {
productIndex.put(product.getId(), product);
// 保存单个商品文件
Path productFile = dataDir.resolve("product_" + product.getId() + ".json");
try (Writer writer = Files.newBufferedWriter(productFile, StandardCharsets.UTF_8)) {
gson.toJson(product, writer);
} catch (IOException e) {
LogUtils.e(TAG, "保存商品文件失败: " + product.getId(), e);
}
// 定期保存索引
if (productIndex.size() % 100 == 0) {
saveIndex();
}
}
@Override
public void saveProducts(List<Product> products) {
for (Product product : products) {
productIndex.put(product.getId(), product);
}
// 批量保存到单个文件
String timestamp = LocalDateTime.now().toString().replace(":", "-");
Path batchFile = dataDir.resolve("batch_" + timestamp + ".json");
try (Writer writer = Files.newBufferedWriter(batchFile, StandardCharsets.UTF_8)) {
gson.toJson(products, writer);
} catch (IOException e) {
LogUtils.e(TAG, "批量保存失败", e);
}
saveIndex();
}
@Override
public Optional<Product> findById(String id) {
return Optional.ofNullable(productIndex.get(id));
}
@Override
public List<Product> findAll() {
return new ArrayList<>(productIndex.values());
}
@Override
public List<Product> findByPage(int page, int size) {
return productIndex.values().stream()
.sorted((a, b) -> {
if (a.getCrawlTime() == null) return 1;
if (b.getCrawlTime() == null) return -1;
return b.getCrawlTime().compareTo(a.getCrawlTime());
})
.skip((long) (page - 1) * size)
.limit(size)
.collect(Collectors.toList());
}
@Override
public List<Product> findByCategory(String categoryId) {
return productIndex.values().stream()
.filter(p -> p.getCategory() != null &&
categoryId.equals(p.getCategory().getId()))
.collect(Collectors.toList());
}
@Override
public List<Product> findUpdatedAfter(LocalDateTime time) {
return productIndex.values().stream()
.filter(p -> p.getUpdateTime() != null && p.getUpdateTime().isAfter(time))
.collect(Collectors.toList());
}
@Override
public long count() {
return productIndex.size();
}
@Override
public void deleteById(String id) {
productIndex.remove(id);
Path productFile = dataDir.resolve("product_" + id + ".json");
try {
Files.deleteIfExists(productFile);
} catch (IOException e) {
LogUtils.e(TAG, "删除商品文件失败: " + id, e);
}
saveIndex();
}
@Override
public void deleteAll() {
productIndex.clear();
try {
Files.list(dataDir)
.filter(p -> p.toString().endsWith(".json"))
.forEach(p -> {
try {
Files.delete(p);
} catch (IOException e) {
LogUtils.e(TAG, "删除文件失败: " + p, e);
}
});
} catch (IOException e) {
LogUtils.e(TAG, "清空数据失败", e);
}
}
/**
* 导出所有数据到单个文件
*/
public void exportAll(String filename) {
Path exportFile = dataDir.resolve(filename);
try (Writer writer = Files.newBufferedWriter(exportFile, StandardCharsets.UTF_8)) {
gson.toJson(new ArrayList<>(productIndex.values()), writer);
LogUtils.i(TAG, "导出 " + productIndex.size() + " 个商品到 " + filename);
} catch (IOException e) {
LogUtils.e(TAG, "导出失败", e);
}
}
}
9.4.4 存储策略选择
┌─────────────────────────────────────────────────────────────────────────────┐
│ 存储方案对比 │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ 方案 适用场景 优点 缺点 │
│ ───────────────────────────────────────────────────────────────────────── │
│ MySQL 生产环境 事务支持 需要数据库服务 │
│ 大数据量 查询灵活 配置复杂 │
│ 高并发 索引优化 │
│ │
│ JSON文件 开发测试 简单易用 查询性能差 │
│ 小数据量 无需依赖 不支持事务 │
│ 快速原型 便于调试 并发问题 │
│ │
│ Redis 缓存层 高性能 持久化较弱 │
│ 热点数据 支持过期 内存成本 │
│ │
│ Elasticsearch 全文搜索 搜索强大 资源消耗大 │
│ 数据分析 聚合分析 学习成本 │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
推荐架构:
┌─────────────────────────────────────────────────────────────────┐
│ 数据存储架构 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────┐ │
│ │ 抓取服务 │ │
│ └────┬─────┘ │
│ │ │
│ ▼ │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ Redis │────▶│ MySQL │────▶│ ES │ │
│ │ 缓存 │ │ 主存储 │ │ 搜索 │ │
│ └──────────┘ └──────────┘ └──────────┘ │
│ │ │ │ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌──────────────────────────────────────────────┐ │
│ │ 统一查询接口 │ │
│ └──────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
9.5 增量更新策略
9.5.1 增量更新原理
全量抓取虽然简单,但在数据量大时效率低下。增量更新策略可以显著提升效率:
┌─────────────────────────────────────────────────────────────────┐
│ 增量更新流程 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────┐ │
│ │ 获取上次 │ │
│ │ 抓取时间 │ │
│ └────┬─────┘ │
│ │ │
│ ▼ │
│ ┌──────────┐ ┌──────────┐ │
│ │ 按更新时间│────▶│ 检查是否 │ │
│ │ 排序请求 │ │ 有新数据 │ │
│ └──────────┘ └────┬─────┘ │
│ │ │
│ ┌───────────┴───────────┐ │
│ │ │ │
│ ▼ ▼ │
│ ┌────────┐ ┌────────┐ │
│ │ 有新数据│ │ 无新数据│ │
│ │ 继续抓取│ │ 结束任务│ │
│ └────────┘ └────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
9.5.2 增量更新管理器
package com.dreamworld.crawler;
import com.dreamworld.storage.ProductRepository;
import com.dreamworld.utils.LogUtils;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Properties;
/**
 * Decides between a full and an incremental crawl and persists the crawl state
 * (last full crawl time, incremental watermark, consecutive empty runs) in a
 * properties file.
 */
public class IncrementalUpdateManager {
    private static final String TAG = "IncrementalUpdate";
    private static final DateTimeFormatter FORMATTER =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /** A full crawl is repeated once the previous one is older than this many days. */
    private static final int FULL_CRAWL_INTERVAL_DAYS = 7;

    /** This many consecutive empty incremental runs trigger a verifying full crawl. */
    private static final int MAX_CONSECUTIVE_EMPTY_RUNS = 10;

    private final Path stateFile;
    private final ProductApiClient apiClient;
    private final ProductRepository repository;
    private final ProductCrawlTask crawlTask;

    // State. Declared volatile because getStats() may be called from a different
    // thread than the one running smartUpdate() (CrawlScheduler does exactly that).
    private volatile LocalDateTime lastFullCrawlTime;
    private volatile LocalDateTime lastIncrementalTime;
    private volatile int consecutiveEmptyCount = 0;

    /**
     * Creates the manager and loads previously persisted state, if any.
     *
     * @param stateFilePath path of the properties file that stores the crawl state
     * @param apiClient     client handed to the crawl task
     * @param repository    repository handed to the crawl task and used for statistics
     */
    public IncrementalUpdateManager(
            String stateFilePath,
            ProductApiClient apiClient,
            ProductRepository repository) {
        this.stateFile = Path.of(stateFilePath);
        this.apiClient = apiClient;
        this.repository = repository;
        this.crawlTask = new ProductCrawlTask(apiClient, repository);
        loadState();
    }

    /**
     * Runs a full or an incremental crawl, depending on the current state.
     *
     * @return the result reported by the crawl task
     */
    public ProductCrawlTask.CrawlResult smartUpdate() {
        LogUtils.separator("开始智能更新");
        if (shouldFullCrawl()) {
            return runFullCrawl();
        }
        return runIncrementalCrawl();
    }

    /** Runs a full crawl and, on success, resets the state and persists it. */
    private ProductCrawlTask.CrawlResult runFullCrawl() {
        LogUtils.i(TAG, "执行全量抓取");
        // Take the watermark before crawling: items modified while the crawl is
        // running must still be picked up by the next incremental run.
        LocalDateTime startedAt = LocalDateTime.now();
        ProductCrawlTask.CrawlResult result = crawlTask.crawlAll();
        if (result.isSuccess()) {
            lastFullCrawlTime = startedAt;
            lastIncrementalTime = startedAt;
            consecutiveEmptyCount = 0;
            saveState();
        }
        return result;
    }

    /** Runs an incremental crawl from the stored watermark and updates the state. */
    private ProductCrawlTask.CrawlResult runIncrementalCrawl() {
        // Fall back to the last full crawl time when no incremental watermark was
        // persisted (e.g. a partially written state file).
        LocalDateTime since = lastIncrementalTime != null ? lastIncrementalTime : lastFullCrawlTime;
        LogUtils.i(TAG, "执行增量抓取,起始时间: " + since);
        LocalDateTime startedAt = LocalDateTime.now();
        ProductCrawlTask.CrawlResult result = crawlTask.crawlIncremental(since);
        if (result.isSuccess()) {
            if (result.getSuccessCount() == 0) {
                consecutiveEmptyCount++;
                LogUtils.i(TAG, "无新数据,连续空结果次数: " + consecutiveEmptyCount);
            } else {
                consecutiveEmptyCount = 0;
                // Advance only to the crawl start, not its end (see runFullCrawl).
                lastIncrementalTime = startedAt;
            }
            saveState();
        }
        return result;
    }

    /**
     * Returns whether a full crawl is required: never crawled before, last full crawl
     * too old, or too many consecutive empty incremental runs.
     */
    private boolean shouldFullCrawl() {
        LocalDateTime lastFull = lastFullCrawlTime;
        if (lastFull == null) {
            LogUtils.i(TAG, "首次运行,需要全量抓取");
            return true;
        }
        if (lastFull.plusDays(FULL_CRAWL_INTERVAL_DAYS).isBefore(LocalDateTime.now())) {
            LogUtils.i(TAG, "距离上次全量抓取超过7天,需要重新全量");
            return true;
        }
        // Many empty incremental runs in a row may mean updates are being missed.
        if (consecutiveEmptyCount >= MAX_CONSECUTIVE_EMPTY_RUNS) {
            LogUtils.i(TAG, "连续10次增量无数据,执行全量校验");
            return true;
        }
        return false;
    }

    /** Loads the state from the state file; a missing file leaves the defaults. */
    private void loadState() {
        if (!Files.exists(stateFile)) {
            return;
        }
        try (InputStream is = Files.newInputStream(stateFile)) {
            Properties props = new Properties();
            props.load(is);
            String fullTime = props.getProperty("lastFullCrawlTime");
            if (fullTime != null && !fullTime.isEmpty()) {
                lastFullCrawlTime = LocalDateTime.parse(fullTime, FORMATTER);
            }
            String incTime = props.getProperty("lastIncrementalTime");
            if (incTime != null && !incTime.isEmpty()) {
                lastIncrementalTime = LocalDateTime.parse(incTime, FORMATTER);
            }
            String emptyCount = props.getProperty("consecutiveEmptyCount");
            if (emptyCount != null) {
                consecutiveEmptyCount = Integer.parseInt(emptyCount);
            }
            LogUtils.i(TAG, "加载状态: 上次全量=" + lastFullCrawlTime +
                    ", 上次增量=" + lastIncrementalTime);
        } catch (Exception e) {
            LogUtils.e(TAG, "加载状态失败", e);
        }
    }

    /**
     * Writes the state to the state file, creating the parent directory when it does
     * not exist yet. Failures are logged and otherwise ignored.
     */
    private void saveState() {
        try {
            Properties props = new Properties();
            if (lastFullCrawlTime != null) {
                props.setProperty("lastFullCrawlTime", lastFullCrawlTime.format(FORMATTER));
            }
            if (lastIncrementalTime != null) {
                props.setProperty("lastIncrementalTime", lastIncrementalTime.format(FORMATTER));
            }
            props.setProperty("consecutiveEmptyCount", String.valueOf(consecutiveEmptyCount));
            Path parent = stateFile.getParent();
            if (parent != null && !Files.exists(parent)) {
                Files.createDirectories(parent);
            }
            try (OutputStream os = Files.newOutputStream(stateFile)) {
                props.store(os, "Crawler State");
            }
        } catch (Exception e) {
            LogUtils.e(TAG, "保存状态失败", e);
        }
    }

    /**
     * Forces a full crawl by discarding the last full crawl time and delegating to
     * {@link #smartUpdate()}.
     */
    public ProductCrawlTask.CrawlResult forceFullCrawl() {
        lastFullCrawlTime = null;
        return smartUpdate();
    }

    /** Returns a snapshot of the current state plus the repository's product count. */
    public UpdateStats getStats() {
        UpdateStats stats = new UpdateStats();
        stats.lastFullCrawlTime = lastFullCrawlTime;
        stats.lastIncrementalTime = lastIncrementalTime;
        stats.consecutiveEmptyCount = consecutiveEmptyCount;
        stats.totalProducts = repository.count();
        return stats;
    }

    /** Snapshot of the update state. */
    public static class UpdateStats {
        public LocalDateTime lastFullCrawlTime;
        public LocalDateTime lastIncrementalTime;
        public int consecutiveEmptyCount;
        public long totalProducts;

        @Override
        public String toString() {
            return String.format(
                    "UpdateStats{lastFull=%s, lastIncremental=%s, emptyCount=%d, total=%d}",
                    lastFullCrawlTime, lastIncrementalTime, consecutiveEmptyCount, totalProducts
            );
        }
    }
}
9.5.3 变更检测策略
除了基于时间的增量更新,我们还可以实现基于内容的变更检测:
package com.dreamworld.crawler;
import com.dreamworld.model.Product;
import com.dreamworld.storage.ProductRepository;
import com.dreamworld.utils.LogUtils;
import java.math.BigDecimal;
import java.security.MessageDigest;
import java.util.*;
/**
 * Detects differences between freshly crawled products and the versions stored in
 * the repository.
 */
public class ProductChangeDetector {
    private static final String TAG = "ChangeDetector";
    private final ProductRepository repository;

    /** Kind of change that was detected. */
    public enum ChangeType {
        NEW,     // product not stored yet
        PRICE,   // price changed
        STOCK,   // stock changed
        INFO,    // name or description changed
        REMOVED  // product no longer listed
    }

    public ProductChangeDetector(ProductRepository repository) {
        this.repository = repository;
    }

    /**
     * Compares each given product with its stored version.
     *
     * <p>One product can produce several entries (for example both {@code PRICE} and
     * {@code STOCK}). Stock changes are only reported when either side is zero or the
     * difference exceeds 10% of the stored stock.
     *
     * @param newProducts freshly crawled products; {@code null} is treated as empty
     * @return the detected changes, possibly empty
     */
    public List<ProductChange> detectChanges(List<Product> newProducts) {
        List<Product> incoming = newProducts != null ? newProducts : Collections.<Product>emptyList();
        List<ProductChange> changes = new ArrayList<>();
        for (Product newProduct : incoming) {
            Optional<Product> existingOpt = repository.findById(newProduct.getId());
            if (existingOpt.isEmpty()) {
                changes.add(new ProductChange(newProduct, ChangeType.NEW, null, null));
                continue;
            }
            Product existing = existingOpt.get();

            // Price: compared by numeric value so that 199.0 and 199.00 are the same.
            Object oldPrice = existing.getPrice();
            Object newPrice = newProduct.getPrice();
            if (!sameValue(oldPrice, newPrice)) {
                changes.add(new ProductChange(newProduct, ChangeType.PRICE, oldPrice, newPrice));
            }

            // Stock: copied into int locals so the comparison is by value even if the
            // getter returns a boxed Integer.
            // NOTE(review): assumes getStock() yields int/Integer - confirm in Product.
            int oldStock = existing.getStock();
            int newStock = newProduct.getStock();
            if (oldStock != newStock && isSignificantStockChange(oldStock, newStock)) {
                changes.add(new ProductChange(newProduct, ChangeType.STOCK, oldStock, newStock));
            }

            // Descriptive information: name or description.
            if (!Objects.equals(existing.getName(), newProduct.getName()) ||
                    !Objects.equals(existing.getDescription(), newProduct.getDescription())) {
                changes.add(new ProductChange(newProduct, ChangeType.INFO, null, null));
            }
        }
        LogUtils.i(TAG, "检测到 " + changes.size() + " 个变更");
        return changes;
    }

    /** Null-safe equality that compares {@link BigDecimal} values ignoring scale. */
    private static boolean sameValue(Object left, Object right) {
        if (left instanceof BigDecimal && right instanceof BigDecimal) {
            return ((BigDecimal) left).compareTo((BigDecimal) right) == 0;
        }
        return Objects.equals(left, right);
    }

    /** Returns true when either side is zero or the difference exceeds 10% of the old stock. */
    private static boolean isSignificantStockChange(int oldStock, int newStock) {
        return oldStock == 0 || newStock == 0
                || Math.abs(oldStock - newStock) > oldStock * 0.1;
    }

    /**
     * Reports every stored product whose id is not contained in {@code currentIds}.
     *
     * <p>An empty id list marks all stored products as removed, so callers should only
     * pass ids from a crawl that completed successfully.
     *
     * @param currentIds ids seen in the current crawl; must not be {@code null}
     * @return one {@code REMOVED} change per missing product
     */
    public List<ProductChange> detectRemovedProducts(List<String> currentIds) {
        Objects.requireNonNull(currentIds, "currentIds");
        Set<String> currentIdSet = new HashSet<>(currentIds);
        List<ProductChange> changes = new ArrayList<>();
        for (Product product : repository.findAll()) {
            if (!currentIdSet.contains(product.getId())) {
                changes.add(new ProductChange(product, ChangeType.REMOVED, null, null));
            }
        }
        if (!changes.isEmpty()) {
            LogUtils.i(TAG, "检测到 " + changes.size() + " 个下架商品");
        }
        return changes;
    }

    /**
     * Computes an MD5 fingerprint over name, description, price, stock and tags.
     *
     * <p>The fields are concatenated without a separator, so this is a change
     * fingerprint only, not a collision-free or security-relevant digest.
     *
     * @return the lowercase hex digest, or {@code null} if hashing failed
     */
    public String calculateContentHash(Product product) {
        try {
            StringBuilder content = new StringBuilder();
            content.append(product.getName());
            content.append(product.getDescription());
            content.append(product.getPrice());
            content.append(product.getStock());
            content.append(product.getTags());
            MessageDigest md = MessageDigest.getInstance("MD5");
            // Fixed charset so the hash does not depend on the platform default.
            byte[] digest = md.digest(
                    content.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8));
            StringBuilder hash = new StringBuilder(digest.length * 2);
            for (byte b : digest) {
                hash.append(String.format("%02x", b));
            }
            return hash.toString();
        } catch (Exception e) {
            LogUtils.e(TAG, "计算内容哈希失败", e);
            return null;
        }
    }

    /** Immutable record of one detected change. */
    public static class ProductChange {
        private final Product product;
        private final ChangeType type;
        private final Object oldValue;
        private final Object newValue;
        private final long timestamp;

        /**
         * @param product  the affected product
         * @param type     kind of change
         * @param oldValue previous value, or {@code null} when not applicable
         * @param newValue new value, or {@code null} when not applicable
         */
        public ProductChange(Product product, ChangeType type, Object oldValue, Object newValue) {
            this.product = product;
            this.type = type;
            this.oldValue = oldValue;
            this.newValue = newValue;
            // Creation time of this record, in epoch milliseconds.
            this.timestamp = System.currentTimeMillis();
        }

        public Product getProduct() { return product; }
        public ChangeType getType() { return type; }
        public Object getOldValue() { return oldValue; }
        public Object getNewValue() { return newValue; }
        public long getTimestamp() { return timestamp; }

        @Override
        public String toString() {
            return String.format("ProductChange{id=%s, type=%s, old=%s, new=%s}",
                    product.getId(), type, oldValue, newValue);
        }
    }
}
9.5.4 变更通知
package com.dreamworld.crawler;
import com.dreamworld.model.Product;
import com.dreamworld.utils.LogUtils;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
/**
 * Dispatches detected product changes to registered listeners.
 */
public class ChangeNotificationService {
    private static final String TAG = "ChangeNotification";
    private final List<ChangeListener> listeners = new CopyOnWriteArrayList<>();

    /**
     * Registers a listener. A {@code null} listener is ignored with a warning, because
     * it would otherwise fail on every notification.
     */
    public void addListener(ChangeListener listener) {
        if (listener == null) {
            LogUtils.w(TAG, "忽略空监听器");
            return;
        }
        listeners.add(listener);
    }

    /** Removes a previously registered listener. */
    public void removeListener(ChangeListener listener) {
        listeners.remove(listener);
    }

    /**
     * Passes the changes to every listener. An exception thrown by one listener is
     * logged and does not prevent the remaining listeners from being called.
     *
     * @param changes detected changes; {@code null} or empty results in no notification
     */
    public void notifyChanges(List<ProductChangeDetector.ProductChange> changes) {
        if (changes == null || changes.isEmpty()) {
            return;
        }
        LogUtils.i(TAG, "通知 " + listeners.size() + " 个监听器," + changes.size() + " 个变更");
        for (ChangeListener listener : listeners) {
            try {
                listener.onChanges(changes);
            } catch (Exception e) {
                LogUtils.e(TAG, "通知监听器失败", e);
            }
        }
    }

    /** Callback for product changes. */
    public interface ChangeListener {
        void onChanges(List<ProductChangeDetector.ProductChange> changes);
    }

    /** Example listener that logs every price change. */
    public static class PriceAlertListener implements ChangeListener {
        @Override
        public void onChanges(List<ProductChangeDetector.ProductChange> changes) {
            for (ProductChangeDetector.ProductChange change : changes) {
                if (change.getType() != ProductChangeDetector.ChangeType.PRICE) {
                    continue;
                }
                Product product = change.getProduct();
                LogUtils.i("PriceAlert", String.format(
                        "价格变化: %s, %s -> %s",
                        product.getName(),
                        change.getOldValue(),
                        change.getNewValue()
                ));
                // Hook for e-mail, push notifications, etc.
            }
        }
    }

    /** Example listener that warns when the new stock is at or below a threshold. */
    public static class StockAlertListener implements ChangeListener {
        private final int threshold;

        /** @param threshold stock level at or below which a warning is logged */
        public StockAlertListener(int threshold) {
            this.threshold = threshold;
        }

        @Override
        public void onChanges(List<ProductChangeDetector.ProductChange> changes) {
            for (ProductChangeDetector.ProductChange change : changes) {
                if (change.getType() != ProductChangeDetector.ChangeType.STOCK) {
                    continue;
                }
                // The value is declared as Object; skip anything that is not numeric
                // instead of failing with a ClassCastException or NullPointerException.
                Object rawStock = change.getNewValue();
                if (!(rawStock instanceof Number)) {
                    continue;
                }
                int newStock = ((Number) rawStock).intValue();
                if (newStock <= threshold) {
                    Product product = change.getProduct();
                    LogUtils.w("StockAlert", String.format(
                            "库存预警: %s, 当前库存: %d",
                            product.getName(),
                            newStock
                    ));
                }
            }
        }
    }
}
9.6 调度管理
9.6.1 定时调度器
package com.dreamworld.scheduler;
import com.dreamworld.crawler.IncrementalUpdateManager;
import com.dreamworld.crawler.ProductCrawlTask;
import com.dreamworld.utils.LogUtils;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;
/**
 * Runs {@link IncrementalUpdateManager#smartUpdate()} at a fixed interval, with an
 * optional daily quiet window during which scheduled runs are skipped.
 *
 * <p>Scheduling and crawling use two separate single-thread executors; both create
 * daemon threads.
 */
public class CrawlScheduler {
    private static final String TAG = "CrawlScheduler";
    private final IncrementalUpdateManager updateManager;
    private final ScheduledExecutorService scheduler;
    private final ExecutorService taskExecutor;
    private final AtomicBoolean running = new AtomicBoolean(false);

    // Written by start(), read by stop(), possibly on different threads.
    private volatile ScheduledFuture<?> scheduledTask;

    // Configuration. Volatile because the setters are called by application threads
    // while the values are read on the scheduler thread.
    private volatile long intervalMinutes = 30;                     // default: 30 minutes
    private volatile LocalTime quietStartTime = LocalTime.of(2, 0); // start of quiet window
    private volatile LocalTime quietEndTime = LocalTime.of(6, 0);   // end of quiet window
    private volatile boolean enableQuietHours = true;

    /**
     * @param updateManager manager that performs the actual crawl
     */
    public CrawlScheduler(IncrementalUpdateManager updateManager) {
        this.updateManager = updateManager;
        this.scheduler = Executors.newSingleThreadScheduledExecutor(r -> {
            Thread t = new Thread(r, "CrawlScheduler");
            t.setDaemon(true);
            return t;
        });
        this.taskExecutor = Executors.newSingleThreadExecutor(r -> {
            Thread t = new Thread(r, "CrawlTask");
            t.setDaemon(true);
            return t;
        });
    }

    /**
     * Starts scheduling: submits one run immediately and then one every
     * {@code intervalMinutes}. Has no effect when already running.
     */
    public void start() {
        if (!running.compareAndSet(false, true)) {
            return;
        }
        long interval = intervalMinutes;
        LogUtils.separator("启动抓取调度器");
        LogUtils.i(TAG, "调度间隔: " + interval + " 分钟");
        if (enableQuietHours) {
            LogUtils.i(TAG, "静默时段: " + quietStartTime + " - " + quietEndTime);
        }
        // Run once right away.
        executeTask();
        try {
            scheduledTask = scheduler.scheduleAtFixedRate(
                    this::executeTask,
                    interval,
                    interval,
                    TimeUnit.MINUTES
            );
        } catch (RuntimeException e) {
            // Do not report "running" when nothing was actually scheduled.
            running.set(false);
            throw e;
        }
    }

    /** Cancels the periodic schedule; a crawl that is already running is not interrupted. */
    public void stop() {
        if (running.compareAndSet(true, false)) {
            LogUtils.i(TAG, "停止抓取调度器");
            ScheduledFuture<?> task = scheduledTask;
            if (task != null) {
                task.cancel(false);
            }
        }
    }

    /** Submits one crawl to the task executor unless the quiet window is active. */
    private void executeTask() {
        if (enableQuietHours && isQuietHours()) {
            LogUtils.d(TAG, "当前处于静默时段,跳过执行");
            return;
        }
        try {
            taskExecutor.submit(this::runScheduledCrawl);
        } catch (RejectedExecutionException e) {
            // The task executor was shut down; log instead of propagating to the caller.
            LogUtils.e(TAG, "任务执行器已关闭,无法提交抓取任务", e);
        }
    }

    /** Runs one crawl and logs its outcome; exceptions are logged, not propagated. */
    private void runScheduledCrawl() {
        try {
            LogUtils.separator("执行定时抓取任务");
            ProductCrawlTask.CrawlResult result = updateManager.smartUpdate();
            LogUtils.i(TAG, "任务完成: " +
                    (result.isSuccess() ? "成功" : "失败") +
                    ", 抓取: " + result.getSuccessCount() + " 条");
        } catch (Exception e) {
            LogUtils.e(TAG, "执行任务异常", e);
        }
    }

    /** Returns whether the current local time lies inside the quiet window. */
    private boolean isQuietHours() {
        return isWithinWindow(LocalTime.now(), quietStartTime, quietEndTime);
    }

    /**
     * Tests whether {@code now} lies in the half-open window {@code [start, end)}.
     * A window whose start is not before its end is treated as crossing midnight
     * (e.g. 22:00 - 06:00); equal endpoints therefore cover the whole day.
     */
    private static boolean isWithinWindow(LocalTime now, LocalTime start, LocalTime end) {
        if (start.isBefore(end)) {
            return !now.isBefore(start) && now.isBefore(end);
        }
        return !now.isBefore(start) || now.isBefore(end);
    }

    /**
     * Triggers a crawl on the task executor, ignoring the quiet window.
     *
     * @return a future completed with the crawl result
     */
    public CompletableFuture<ProductCrawlTask.CrawlResult> triggerNow() {
        return CompletableFuture.supplyAsync(() -> {
            LogUtils.i(TAG, "手动触发抓取任务");
            return updateManager.smartUpdate();
        }, taskExecutor);
    }

    /**
     * Triggers a forced full crawl on the task executor.
     *
     * @return a future completed with the crawl result
     */
    public CompletableFuture<ProductCrawlTask.CrawlResult> triggerFullCrawl() {
        return CompletableFuture.supplyAsync(() -> {
            LogUtils.i(TAG, "手动触发全量抓取");
            return updateManager.forceFullCrawl();
        }, taskExecutor);
    }

    /** Returns a snapshot of the scheduler configuration and state. */
    public SchedulerStatus getStatus() {
        SchedulerStatus status = new SchedulerStatus();
        status.running = running.get();
        status.intervalMinutes = intervalMinutes;
        status.enableQuietHours = enableQuietHours;
        status.quietStartTime = quietStartTime;
        status.quietEndTime = quietEndTime;
        status.inQuietHours = isQuietHours();
        status.updateStats = updateManager.getStats();
        return status;
    }

    /**
     * Sets the interval between scheduled runs. The value is read by {@link #start()},
     * so changing it while running only takes effect after a stop/start cycle.
     *
     * @param intervalMinutes interval in minutes; must be positive
     * @throws IllegalArgumentException if {@code intervalMinutes} is not positive
     */
    public void setIntervalMinutes(long intervalMinutes) {
        if (intervalMinutes <= 0) {
            throw new IllegalArgumentException(
                    "intervalMinutes must be positive: " + intervalMinutes);
        }
        this.intervalMinutes = intervalMinutes;
    }

    /**
     * Sets the quiet window.
     *
     * @param start start of the window (inclusive); must not be {@code null}
     * @param end   end of the window (exclusive); must not be {@code null}
     */
    public void setQuietHours(LocalTime start, LocalTime end) {
        this.quietStartTime = java.util.Objects.requireNonNull(start, "start");
        this.quietEndTime = java.util.Objects.requireNonNull(end, "end");
    }

    public void setEnableQuietHours(boolean enable) {
        this.enableQuietHours = enable;
    }

    /**
     * Stops scheduling and shuts down both executors, waiting up to 5 seconds for the
     * scheduler and up to 30 seconds for a running crawl before forcing termination.
     */
    public void shutdown() {
        stop();
        scheduler.shutdown();
        taskExecutor.shutdown();
        try {
            if (!scheduler.awaitTermination(5, TimeUnit.SECONDS)) {
                scheduler.shutdownNow();
            }
            if (!taskExecutor.awaitTermination(30, TimeUnit.SECONDS)) {
                taskExecutor.shutdownNow();
            }
        } catch (InterruptedException e) {
            scheduler.shutdownNow();
            taskExecutor.shutdownNow();
            Thread.currentThread().interrupt();
        }
    }

    /** Snapshot of the scheduler state. */
    public static class SchedulerStatus {
        public boolean running;
        public long intervalMinutes;
        public boolean enableQuietHours;
        public LocalTime quietStartTime;
        public LocalTime quietEndTime;
        public boolean inQuietHours;
        public IncrementalUpdateManager.UpdateStats updateStats;

        @Override
        public String toString() {
            return String.format(
                    "SchedulerStatus{running=%s, interval=%dmin, quiet=%s, inQuiet=%s, stats=%s}",
                    running, intervalMinutes, enableQuietHours, inQuietHours, updateStats
            );
        }
    }
}
9.6.2 完整使用示例
package com.dreamworld;
import com.dreamworld.crawler.*;
import com.dreamworld.network.BusinessApiClient;
import com.dreamworld.scheduler.CrawlScheduler;
import com.dreamworld.security.SecurityChain;
import com.dreamworld.storage.*;
import com.dreamworld.utils.LogUtils;
import java.time.LocalTime;
/**
 * Entry point of the product crawling system: wires the security chain, API client,
 * repository, incremental update manager and scheduler together and keeps the
 * process alive until it is terminated.
 */
public class ProductCrawlerMain {
    public static void main(String[] args) {
        LogUtils.separator("商品抓取系统启动");
        try {
            // 1. Initialize the security chain.
            SecurityChain securityChain = new SecurityChain();
            securityChain.initialize();

            // 2. Activate; a null key means activation failed.
            String hacKey = securityChain.activate();
            if (hacKey == null) {
                LogUtils.e("Main", "安全激活失败");
                return;
            }

            // 3. Create the API clients.
            BusinessApiClient apiClient = new BusinessApiClient(securityChain);
            ProductApiClient productApi = new ProductApiClient(apiClient);

            // 4. Create the repository.
            // Development: JSON file storage.
            ProductRepository repository = new JsonFileProductRepository("./data/products");
            // Production: MySQL.
            // ProductRepository repository = new MySqlProductRepository(
            // "jdbc:mysql://localhost:3306/dreamworld",
            // "root",
            // "password"
            // );

            // 5. Create the incremental update manager.
            IncrementalUpdateManager updateManager = new IncrementalUpdateManager(
                    "./data/crawler_state.properties",
                    productApi,
                    repository
            );

            // 6. Create the scheduler: every 30 minutes, quiet between 02:00 and 06:00.
            CrawlScheduler scheduler = new CrawlScheduler(updateManager);
            scheduler.setIntervalMinutes(30);
            scheduler.setQuietHours(LocalTime.of(2, 0), LocalTime.of(6, 0));

            // 7. Register change listeners.
            // NOTE(review): nothing in this method passes notificationService to the
            // crawl pipeline, so these listeners are never invoked from here - confirm
            // where notifyChanges(...) is supposed to be called.
            ChangeNotificationService notificationService = new ChangeNotificationService();
            notificationService.addListener(new ChangeNotificationService.PriceAlertListener());
            notificationService.addListener(new ChangeNotificationService.StockAlertListener(10));

            // 8. Start scheduling.
            scheduler.start();

            // 9. Release resources on shutdown.
            Runtime.getRuntime().addShutdownHook(new Thread(() -> {
                LogUtils.i("Main", "收到关闭信号,正在停止...");
                scheduler.shutdown();
                if (repository instanceof MySqlProductRepository) {
                    ((MySqlProductRepository) repository).close();
                }
            }));

            // 10. Keep the process alive: the executors use daemon threads, so the main
            // thread must not return. Joining the current thread blocks until interrupted.
            LogUtils.i("Main", "系统已启动,按 Ctrl+C 停止");
            Thread.currentThread().join();
        } catch (InterruptedException e) {
            // Restore the interrupt flag and exit normally; this is not a system error.
            Thread.currentThread().interrupt();
            LogUtils.i("Main", "主线程被中断,系统退出");
        } catch (Exception e) {
            LogUtils.e("Main", "系统异常", e);
        }
    }
}
9.6.3 运行效果
════════════════════════════════════════════════════════════════════
商品抓取系统启动
════════════════════════════════════════════════════════════════════
[INFO] SecurityChain: 初始化安全链...
[INFO] SecurityChain: 设置生产环境
[INFO] SecurityChain: 获取设备密钥ID: f1e2d3c4b5a6978869574a3b2c1d0e0f
[INFO] SecurityChain: 安全激活成功
════════════════════════════════════════════════════════════════════
启动抓取调度器
════════════════════════════════════════════════════════════════════
[INFO] CrawlScheduler: 调度间隔: 30 分钟
[INFO] CrawlScheduler: 静默时段: 02:00 - 06:00
════════════════════════════════════════════════════════════════════
开始智能更新
════════════════════════════════════════════════════════════════════
[INFO] IncrementalUpdate: 首次运行,需要全量抓取
[INFO] IncrementalUpdate: 执行全量抓取
════════════════════════════════════════════════════════════════════
开始全量抓取商品数据
════════════════════════════════════════════════════════════════════
[INFO] ProductCrawlTask: 总商品数: 1234, 总页数: 25
[DEBUG] ProductCrawlTask: 抓取第 2/25 页
[DEBUG] ProductCrawlTask: 抓取第 3/25 页
...
[DEBUG] ProductCrawlTask: 抓取第 25/25 页
════════════════════════════════════════════════════════════════════
抓取完成
════════════════════════════════════════════════════════════════════
[INFO] ProductCrawlTask: 总数: 1234
[INFO] ProductCrawlTask: 成功: 1234
[INFO] ProductCrawlTask: 失败: 0
[INFO] ProductCrawlTask: 耗时: 156秒
[INFO] CrawlScheduler: 任务完成: 成功, 抓取: 1234 条
[INFO] Main: 系统已启动,按 Ctrl+C 停止
9.7 本章小结
本章我们完成了从API分析到数据抓取的完整实战,主要内容包括:
9.7.1 技术要点回顾
┌─────────────────────────────────────────────────────────────────┐
│ 本章技术要点 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 1. API分析 │
│ • 接口发现与文档整理 │
│ • 请求参数与响应结构分析 │
│ • 分页与排序机制理解 │
│ │
│ 2. 数据模型设计 │
│ • 商品实体与关联对象 │
│ • 数据库表结构设计 │
│ • 抓取元数据管理 │
│ │
│ 3. 抓取服务实现 │
│ • API客户端封装 │
│ • 重试与容错机制 │
│ • 请求限流控制 │
│ │
│ 4. 数据存储 │
│ • MySQL生产级存储 │
│ • JSON文件轻量存储 │
│ • 存储策略选择 │
│ │
│ 5. 增量更新 │
│ • 基于时间的增量策略 │
│ • 变更检测与通知 │
│ • 智能更新决策 │
│ │
│ 6. 调度管理 │
│ • 定时任务调度 │
│ • 静默时段控制 │
│ • 手动触发支持 │
│ │
└─────────────────────────────────────────────────────────────────┘
9.7.2 架构设计原则
在本章的实现中,我们遵循了以下设计原则:
1. 分层架构
┌─────────────────────────────────────────────────────────────────┐
│ 分层架构 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────────────────────────────────────────────────┐ │
│ │ 调度层 │ │
│ │ CrawlScheduler │ │
│ └──────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────────────────────────────────────────────┐ │
│ │ 业务层 │ │
│ │ IncrementalUpdateManager / ProductCrawlTask │ │
│ └──────────────────────────────────────────────────────┘ │
│ │ │
│ ┌─────────────┴─────────────┐ │
│ ▼ ▼ │
│ ┌─────────────────┐ ┌─────────────────┐ │
│ │ API层 │ │ 存储层 │ │
│ │ ProductApiClient│ │ProductRepository │ │
│ └─────────────────┘ └─────────────────┘ │
│ │ │ │
│ ▼ ▼ │
│ ┌─────────────────┐ ┌─────────────────┐ │
│ │ 安全层 │ │ 数据库/文件 │ │
│ │ SecurityChain │ │ MySQL/JSON │ │
│ └─────────────────┘ └─────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
2. 接口抽象
通过 ProductRepository 接口抽象存储层,使得:
- 可以轻松切换存储实现(MySQL、JSON、Redis等)
- 便于单元测试(使用Mock实现)
- 支持多种存储策略组合
3. 可配置性
所有关键参数都支持配置:
- 抓取间隔、页大小、重试次数
- 静默时段、请求延迟
- 存储路径、数据库连接
4. 可观测性
完善的日志和状态管理:
- 详细的执行日志
- 抓取统计信息
- 调度器状态查询
9.7.3 生产环境建议
将本章代码部署到生产环境时,建议:
1. 安全性
- 使用环境变量管理敏感配置
- 定期轮换HAC_KEY
- 限制数据库访问权限
2. 可靠性
- 部署多实例,使用分布式锁避免重复抓取
- 配置告警,及时发现抓取异常
- 定期备份抓取数据
3. 性能
- 根据目标服务器承受能力调整请求频率
- 使用连接池管理数据库连接
- 考虑使用消息队列解耦抓取和存储
4. 合规性
- 遵守目标网站的robots.txt
- 控制抓取频率,避免对目标服务器造成压力
- 仅抓取公开数据,不尝试绕过访问控制
9.7.4 下一章预告
在下一章《性能优化与监控》中,我们将深入探讨:
- 并发抓取优化
- 内存与CPU优化
- 监控指标设计
- 告警策略配置
- 性能测试方法
本章附录
A. 依赖配置
<!-- pom.xml: dependencies added in this chapter -->
<dependencies>
<!-- JDBC connection pool -->
<dependency>
<groupId>com.zaxxer</groupId>
<artifactId>HikariCP</artifactId>
<version>5.0.1</version>
</dependency>
<!-- MySQL JDBC driver -->
<!-- NOTE(review): newer Connector/J releases are published as com.mysql:mysql-connector-j; verify the coordinates before upgrading the version -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.33</version>
</dependency>
<!-- JSON processing -->
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.10.1</version>
</dependency>
</dependencies>
B. 配置文件示例
# crawler.properties
# API settings
api.base.url=https://api.dreamworld.com
api.timeout.ms=30000
# Crawl settings
crawler.page.size=50
crawler.max.retries=3
crawler.retry.delay.ms=1000
crawler.request.delay.ms=500
# Scheduler settings
scheduler.interval.minutes=30
scheduler.quiet.start=02:00
scheduler.quiet.end=06:00
# Storage settings
storage.type=mysql
storage.mysql.url=jdbc:mysql://localhost:3306/dreamworld
storage.mysql.username=crawler
# NOTE(review): java.util.Properties does not expand ${...} placeholders by itself;
# presumably the configuration loader substitutes the DB_PASSWORD environment variable - confirm.
storage.mysql.password=${DB_PASSWORD}
storage.mysql.pool.size=10
# Alternatively, use JSON file storage
# storage.type=json
# storage.json.path=./data/products
C. 类图
┌─────────────────────────────────────────────────────────────────────────────┐
│ 类关系图 │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────┐ │
│ │ CrawlScheduler │ │
│ └────────┬────────┘ │
│ │ uses │
│ ▼ │
│ ┌─────────────────────────┐ │
│ │IncrementalUpdateManager │ │
│ └────────┬────────────────┘ │
│ │ uses │
│ ▼ │
│ ┌─────────────────┐ ┌─────────────────────┐ │
│ │ProductCrawlTask │────────▶│ ProductRepository │◀─────┐ │
│ └────────┬────────┘ │ <<interface>> │ │ │
│ │ uses └─────────────────────┘ │ │
│ ▼ △ │ │
│ ┌─────────────────┐ │ │ │
│ │ProductApiClient │ ┌────────┴────────┐ │ │
│ └────────┬────────┘ │ │ │ │
│ │ uses ┌──────┴──────┐ ┌───────┴───────┐ │ │
│ ▼ │MySqlProduct │ │JsonFileProduct│ │ │
│ ┌─────────────────┐ │ Repository │ │ Repository │ │ │
│ │BusinessApiClient│ └─────────────┘ └───────────────┘ │ │
│ └────────┬────────┘ │ │
│ │ uses │ │
│ ▼ │ │
│ ┌─────────────────┐ ┌─────────────────────┐ │ │
│ │ SecurityChain │ │ProductChangeDetector│────────────┘ │
│ └─────────────────┘ └─────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
本章完