第十一章:错误处理与容灾
本章字数:约28000字 阅读时间:约95分钟 难度等级:★★★★☆
声明:本文中的公司名称、包名、API地址、密钥等均已脱敏处理。文中的"梦想世界"、"dreamworld"等均为虚构名称,与任何真实公司无关。
引言
在生产环境中,错误是不可避免的。网络抖动、服务器故障、资源耗尽……各种问题随时可能发生。一个健壮的系统必须能够:
- 优雅地处理错误:不因单点故障而崩溃
- 快速恢复:问题解决后自动恢复正常
- 保证数据一致性:即使在异常情况下也不丢失数据
本章将深入探讨错误处理和容灾设计。
11.1 异常分类与处理策略
11.1.1 异常分类体系
┌─────────────────────────────────────────────────────────────────┐
│ 异常分类体系 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ 可恢复异常 │ │
│ │ ───────────────────────────────────────────────────── │ │
│ │ • 网络超时 → 重试 │ │
│ │ • 连接被拒绝 → 等待后重试 │ │
│ │ • 服务暂时不可用 → 退避重试 │ │
│ │ • 限流被拒绝 → 降低速率重试 │ │
│ │ • 临时性服务器错误 → 重试 │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ 不可恢复异常 │ │
│ │ ───────────────────────────────────────────────────── │ │
│ │ • 认证失败 → 重新认证 │ │
│ │ • 参数错误 → 修正参数 │ │
│ │ • 资源不存在 → 跳过或标记 │ │
│ │ • 权限不足 → 人工介入 │ │
│ │ • 数据格式错误 → 记录并跳过 │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ 系统级异常 │ │
│ │ ───────────────────────────────────────────────────── │ │
│ │ • OOM → 紧急告警,重启 │ │
│ │ • 磁盘满 → 清理空间 │ │
│ │ • 数据库连接耗尽 → 扩容连接池 │ │
│ │ • 线程池满 → 拒绝新任务 │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
11.1.2 自定义异常体系
package com.dreamworld.exception;
/**
 * Root of the crawler's unchecked exception hierarchy.
 *
 * <p>Every instance carries an {@link ErrorCode} identifying the failure and a
 * flag telling callers whether repeating the failed operation may succeed.
 */
public abstract class CrawlerException extends RuntimeException {

    private final ErrorCode errorCode;
    private final boolean retryable;

    /**
     * Creates an exception that has no underlying cause.
     *
     * @param errorCode classification of the failure
     * @param message   human-readable detail message
     * @param retryable whether the failed operation may be retried
     */
    public CrawlerException(ErrorCode errorCode, String message, boolean retryable) {
        super(message);
        this.retryable = retryable;
        this.errorCode = errorCode;
    }

    /**
     * Creates an exception that wraps an underlying cause.
     *
     * @param errorCode classification of the failure
     * @param message   human-readable detail message
     * @param cause     the exception that triggered this one
     * @param retryable whether the failed operation may be retried
     */
    public CrawlerException(ErrorCode errorCode, String message, Throwable cause, boolean retryable) {
        super(message, cause);
        this.retryable = retryable;
        this.errorCode = errorCode;
    }

    /** @return the error code supplied at construction time */
    public ErrorCode getErrorCode() {
        return this.errorCode;
    }

    /** @return the retryable flag supplied at construction time */
    public boolean isRetryable() {
        return this.retryable;
    }
}
/**
 * Catalogue of crawler error codes.
 *
 * <p>Codes are grouped by their thousands digit: 1xxx network, 2xxx API,
 * 3xxx security, 4xxx storage, 5xxx system.
 */
public enum ErrorCode {

    // ---- Network errors (1xxx) ----
    NETWORK_TIMEOUT(1001, "网络超时"),
    NETWORK_CONNECTION_REFUSED(1002, "连接被拒绝"),
    NETWORK_UNKNOWN(1099, "未知网络错误"),

    // ---- API errors (2xxx) ----
    API_RATE_LIMITED(2001, "请求被限流"),
    API_UNAUTHORIZED(2002, "认证失败"),
    API_FORBIDDEN(2003, "权限不足"),
    API_NOT_FOUND(2004, "资源不存在"),
    API_SERVER_ERROR(2005, "服务器错误"),
    API_RESPONSE_INVALID(2006, "响应格式错误"),

    // ---- Security errors (3xxx) ----
    SECURITY_KEY_EXPIRED(3001, "密钥过期"),
    SECURITY_SIGN_FAILED(3002, "签名失败"),
    SECURITY_ACTIVATION_FAILED(3003, "激活失败"),

    // ---- Storage errors (4xxx) ----
    STORAGE_CONNECTION_FAILED(4001, "存储连接失败"),
    STORAGE_WRITE_FAILED(4002, "写入失败"),
    STORAGE_READ_FAILED(4003, "读取失败"),

    // ---- System errors (5xxx) ----
    SYSTEM_OOM(5001, "内存不足"),
    SYSTEM_RESOURCE_EXHAUSTED(5002, "资源耗尽"),
    SYSTEM_UNKNOWN(5099, "未知系统错误");

    private final int numericCode;
    private final String description;

    ErrorCode(int numericCode, String description) {
        this.numericCode = numericCode;
        this.description = description;
    }

    /** @return the numeric code of this error */
    public int getCode() {
        return numericCode;
    }

    /** @return the short description attached to this error */
    public String getMessage() {
        return description;
    }
}
/**
 * Failure at the network layer. Instances are always flagged as retryable.
 */
public class NetworkException extends CrawlerException {

    /**
     * @param errorCode classification of the network failure
     * @param message   human-readable detail message
     */
    public NetworkException(ErrorCode errorCode, String message) {
        // Network failures are treated as transient, hence retryable = true.
        super(errorCode, message, true);
    }

    /**
     * @param errorCode classification of the network failure
     * @param message   human-readable detail message
     * @param cause     the underlying exception
     */
    public NetworkException(ErrorCode errorCode, String message, Throwable cause) {
        super(errorCode, message, cause, true);
    }
}
/**
 * Failure reported by the remote API, carrying the HTTP status and the raw
 * response body alongside the error code.
 *
 * <p>Only {@link ErrorCode#API_RATE_LIMITED} and
 * {@link ErrorCode#API_SERVER_ERROR} produce a retryable exception.
 */
public class ApiException extends CrawlerException {

    private final int httpStatus;
    private final String responseBody;

    /**
     * @param errorCode    classification of the API failure
     * @param httpStatus   HTTP status code of the response
     * @param responseBody raw response body, embedded in the detail message
     */
    public ApiException(ErrorCode errorCode, int httpStatus, String responseBody) {
        super(errorCode,
              buildMessage(errorCode, httpStatus, responseBody),
              isRetryable(errorCode));
        this.responseBody = responseBody;
        this.httpStatus = httpStatus;
    }

    /** Formats the detail message as {@code "<description> (HTTP <status>): <body>"}. */
    private static String buildMessage(ErrorCode errorCode, int httpStatus, String body) {
        String description = errorCode.getMessage();
        return String.format("%s (HTTP %d): %s", description, httpStatus, body);
    }

    /** Decides the retryable flag from the error code alone. */
    private static boolean isRetryable(ErrorCode errorCode) {
        boolean rateLimited = errorCode == ErrorCode.API_RATE_LIMITED;
        boolean serverSide = errorCode == ErrorCode.API_SERVER_ERROR;
        return rateLimited || serverSide;
    }

    /** @return the HTTP status code supplied at construction time */
    public int getHttpStatus() {
        return this.httpStatus;
    }

    /** @return the raw response body supplied at construction time */
    public String getResponseBody() {
        return this.responseBody;
    }
}
/**
 * Failure in the security layer (keys, signing, activation). Instances are
 * never flagged as retryable.
 *
 * <p>NOTE(review): the simple name is the same as
 * {@code java.lang.SecurityException}; code outside this package must import
 * this class explicitly to refer to it.
 */
public class SecurityException extends CrawlerException {

    /**
     * @param errorCode classification of the security failure
     * @param message   human-readable detail message
     */
    public SecurityException(ErrorCode errorCode, String message) {
        // Security failures are not expected to fix themselves: retryable = false.
        super(errorCode, message, false);
    }
}
/**
 * Failure in the storage layer. Only
 * {@link ErrorCode#STORAGE_CONNECTION_FAILED} yields a retryable exception.
 */
public class StorageException extends CrawlerException {

    /**
     * @param errorCode classification of the storage failure
     * @param message   human-readable detail message
     * @param cause     the underlying exception
     */
    public StorageException(ErrorCode errorCode, String message, Throwable cause) {
        super(errorCode, message, cause, isRetryable(errorCode));
    }

    /** Decides the retryable flag: true only for a failed storage connection. */
    private static boolean isRetryable(ErrorCode errorCode) {
        return ErrorCode.STORAGE_CONNECTION_FAILED == errorCode;
    }
}
11.1.3 异常处理器
package com.dreamworld.exception;
import com.dreamworld.alert.AlertService;
import com.dreamworld.metrics.MetricsCollector;
import com.dreamworld.utils.LogUtils;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;
/**
* 全局异常处理器
*/
public class GlobalExceptionHandler {
private static final String TAG = "ExceptionHandler";
private final AlertService alertService;
private final MetricsCollector metrics;
private final Map<Class<? extends Exception>, Consumer<Exception>> handlers;
public GlobalExceptionHandler(AlertService alertService, MetricsCollector metrics) {
this.alertService = alertService;
this.metrics = metrics;
this.handlers = new HashMap<>();
registerDefaultHandlers();
}
/**
* 注册默认处理器
*/
private void registerDefaultHandlers() {
// 网络异常
registerHandler(NetworkException.class, e -> {
NetworkException ne = (NetworkException) e;
metrics.incrementCounter("errors.network");
LogUtils.w(TAG, "网络异常: " + ne.getMessage());
});
// API异常
registerHandler(ApiException.class, e -> {
ApiException ae = (ApiException) e;
metrics.incrementCounter("errors.api");
if (ae.getErrorCode() == ErrorCode.API_RATE_LIMITED) {
alertService.warning("限流告警", "API请求被限流");
} else if (ae.getErrorCode() == ErrorCode.API_UNAUTHORIZED) {
alertService.error("认证失败", "API认证失败,需要重新激活");
}
});
// 安全异常
registerHandler(SecurityException.class, e -> {
SecurityException se = (SecurityException) e;
metrics.incrementCounter("errors.security");
alertService.error("安全异常", se.getMessage());
});
// 存储异常
registerHandler(StorageException.class, e -> {
StorageException se = (StorageException) e;
metrics.incrementCounter("errors.storage");
alertService.error("存储异常", se.getMessage());
});
// OOM
registerHandler(OutOfMemoryError.class, e -> {
metrics.incrementCounter("errors.oom");
alertService.critical("内存溢出", "系统发生OOM,需要立即处理!");
});
}
/**
* 注册异常处理器
*/
public <T extends Exception> void registerHandler(Class<T> exceptionClass, Consumer<Exception> handler) {
handlers.put(exceptionClass, handler);
}
/**
* 处理异常
*/
public void handle(Exception e) {
// 查找匹配的处理器
Consumer<Exception> handler = findHandler(e.getClass());
if (handler != null) {
handler.accept(e);
} else {
// 默认处理
metrics.incrementCounter("errors.unknown");
LogUtils.e(TAG, "未处理的异常", e);
}
}
/**
* 查找处理器(支持继承)
*/
private Consumer<Exception> findHandler(Class<?> exceptionClass) {
Consumer<Exception> handler = handlers.get(exceptionClass);
if (handler == null && exceptionClass.getSuperclass() != null) {
return findHandler(exceptionClass.getSuperclass());
}
return handler;
}
/**
* 判断异常是否可重试
*/
public boolean isRetryable(Exception e) {
if (e instanceof CrawlerException) {
return ((CrawlerException) e).isRetryable();
}
// 某些标准异常也可重试
if (e instanceof java.net.SocketTimeoutException ||
e instanceof java.net.ConnectException) {
return true;
}
return false;
}
}
11.2 重试机制设计
11.2.1 重试策略
┌─────────────────────────────────────────────────────────────────┐
│ 重试策略对比 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 策略 特点 适用场景 │
│ ───────────────────────────────────────────────────────────── │
│ 固定间隔 简单易实现 临时性故障 │
│ 间隔固定 负载稳定的服务 │
│ │
│ 指数退避 间隔递增 限流场景 │
│ 避免雪崩 服务恢复中 │
│ │
│ 指数退避+抖动 加入随机性 高并发场景 │
│ 避免同时重试 分布式系统 │
│ │
│ 自适应 根据成功率调整 长期运行服务 │
│ 动态优化 负载变化大 │
│ │
└─────────────────────────────────────────────────────────────────┘
11.2.2 重试器实现
package com.dreamworld.retry;
import com.dreamworld.exception.CrawlerException;
import com.dreamworld.exception.GlobalExceptionHandler;
import com.dreamworld.utils.LogUtils;
import java.util.Random;
import java.util.concurrent.Callable;
import java.util.function.Predicate;
/**
 * Generic retry helper: runs a {@link Callable} up to a maximum number of
 * attempts, sleeping between attempts for a duration chosen by a
 * {@link RetryStrategy}.
 *
 * <p>Instances are created through {@link #builder()}. The configuration is
 * validated when the retryer is built. Previously a non-positive
 * {@code maxAttempts} made {@link #execute} throw a {@code NullPointerException}
 * without ever calling the operation.
 *
 * @param <T> result type of the retried operation
 */
public class Retryer<T> {
    private static final String TAG = "Retryer";

    private final int maxAttempts;
    private final RetryStrategy strategy;
    private final Predicate<Exception> retryPredicate;
    private final GlobalExceptionHandler exceptionHandler;

    private Retryer(Builder<T> builder) {
        if (builder.maxAttempts < 1) {
            throw new IllegalArgumentException(
                    "maxAttempts must be >= 1, got " + builder.maxAttempts);
        }
        if (builder.strategy == null) {
            throw new IllegalArgumentException("strategy must not be null");
        }
        this.maxAttempts = builder.maxAttempts;
        this.strategy = builder.strategy;
        this.retryPredicate = builder.retryPredicate;
        this.exceptionHandler = builder.exceptionHandler;
    }

    /**
     * Runs the operation, retrying on failure.
     *
     * @param callable the operation to run
     * @return the value returned by the first successful attempt
     * @throws Exception the exception of the last attempt when it is not
     *                   retryable or the attempts are exhausted; also an
     *                   {@link InterruptedException} if the thread is
     *                   interrupted while waiting between attempts
     */
    public T execute(Callable<T> callable) throws Exception {
        // The loop ends by returning a result or rethrowing: shouldRetry()
        // returns false once attempt reaches maxAttempts (which is >= 1).
        for (int attempt = 1; ; attempt++) {
            try {
                return callable.call();
            } catch (Exception e) {
                // Report the failure (metrics/alerts) before deciding what to do.
                if (exceptionHandler != null) {
                    exceptionHandler.handle(e);
                }
                if (!shouldRetry(e, attempt)) {
                    throw e;
                }
                // Clamp at zero: Thread.sleep rejects negative durations.
                long waitTime = Math.max(0L, strategy.getWaitTime(attempt));
                LogUtils.w(TAG, String.format(
                        "第 %d/%d 次尝试失败,%dms 后重试: %s",
                        attempt, maxAttempts, waitTime, e.getMessage()
                ));
                Thread.sleep(waitTime);
            }
        }
    }

    /**
     * Decides whether another attempt should follow a failure.
     *
     * <p>Order of precedence: attempt budget, custom predicate, exception
     * handler, then the exception's own retryable flag.
     */
    private boolean shouldRetry(Exception e, int attempt) {
        // Attempt budget exhausted.
        if (attempt >= maxAttempts) {
            return false;
        }
        // A custom predicate, when configured, has the final word.
        if (retryPredicate != null) {
            return retryPredicate.test(e);
        }
        // Otherwise defer to the exception handler.
        if (exceptionHandler != null) {
            return exceptionHandler.isRetryable(e);
        }
        // Default: only crawler exceptions flagged as retryable.
        if (e instanceof CrawlerException) {
            return ((CrawlerException) e).isRetryable();
        }
        return false;
    }

    /**
     * @param <T> result type of the retried operation
     * @return a builder preconfigured with 3 attempts and exponential backoff
     */
    public static <T> Builder<T> builder() {
        return new Builder<>();
    }

    /**
     * Fluent builder for {@link Retryer}.
     *
     * @param <T> result type of the retried operation
     */
    public static class Builder<T> {
        private int maxAttempts = 3;
        private RetryStrategy strategy = new ExponentialBackoffStrategy(1000, 30000, 2.0);
        private Predicate<Exception> retryPredicate;
        private GlobalExceptionHandler exceptionHandler;

        /** Sets the total number of attempts (first call included); must be at least 1. */
        public Builder<T> maxAttempts(int maxAttempts) {
            this.maxAttempts = maxAttempts;
            return this;
        }

        /** Sets the strategy that chooses the wait time between attempts; must not be null. */
        public Builder<T> strategy(RetryStrategy strategy) {
            this.strategy = strategy;
            return this;
        }

        /** Sets a predicate that decides whether a given exception is retried. */
        public Builder<T> retryIf(Predicate<Exception> predicate) {
            this.retryPredicate = predicate;
            return this;
        }

        /** Sets the handler that is notified of every failed attempt. */
        public Builder<T> exceptionHandler(GlobalExceptionHandler handler) {
            this.exceptionHandler = handler;
            return this;
        }

        /**
         * @return the configured retryer
         * @throws IllegalArgumentException if {@code maxAttempts < 1} or the
         *                                  strategy is {@code null}
         */
        public Retryer<T> build() {
            return new Retryer<>(this);
        }
    }
}
11.2.3 重试策略实现
package com.dreamworld.retry;
import java.util.Random;
/**
 * Chooses how long to wait before the next retry.
 */
@FunctionalInterface
public interface RetryStrategy {

    /**
     * Returns the wait time that follows a failed attempt.
     *
     * @param attempt number of the attempt that just failed
     * @return wait time in milliseconds
     */
    long getWaitTime(int attempt);
}
/**
 * Strategy that waits the same amount of time before every retry.
 */
public class FixedIntervalStrategy implements RetryStrategy {

    private final long intervalMs;

    /**
     * @param intervalMs wait time in milliseconds, returned for every attempt
     */
    public FixedIntervalStrategy(long intervalMs) {
        this.intervalMs = intervalMs;
    }

    /** {@inheritDoc} The attempt number is ignored. */
    @Override
    public long getWaitTime(int attempt) {
        return this.intervalMs;
    }
}
/**
 * Strategy whose wait time is {@code initial * multiplier^(attempt - 1)},
 * capped at a maximum interval.
 */
public class ExponentialBackoffStrategy implements RetryStrategy {

    private final long initialIntervalMs;
    private final long maxIntervalMs;
    private final double multiplier;

    /**
     * @param initialIntervalMs wait time for attempt 1, in milliseconds
     * @param maxIntervalMs     upper bound of the returned wait time
     * @param multiplier        growth factor applied per additional attempt
     */
    public ExponentialBackoffStrategy(long initialIntervalMs, long maxIntervalMs, double multiplier) {
        this.initialIntervalMs = initialIntervalMs;
        this.maxIntervalMs = maxIntervalMs;
        this.multiplier = multiplier;
    }

    /** {@inheritDoc} */
    @Override
    public long getWaitTime(int attempt) {
        double growth = Math.pow(multiplier, attempt - 1);
        long uncapped = (long) (initialIntervalMs * growth);
        return uncapped > maxIntervalMs ? maxIntervalMs : uncapped;
    }
}
/**
 * Exponential backoff with random jitter.
 *
 * <p>The capped exponential interval is scaled by a random factor in
 * {@code [1 - jitterFactor, 1 + jitterFactor)}. The jitter is applied after
 * the cap, so the result may exceed {@code maxIntervalMs} by up to the jitter
 * fraction. The result is never negative.
 */
public class ExponentialBackoffWithJitterStrategy implements RetryStrategy {

    private final long initialIntervalMs;
    private final long maxIntervalMs;
    private final double multiplier;
    private final double jitterFactor;
    private final Random random = new Random();

    /**
     * @param initialIntervalMs wait time for attempt 1 before jitter, in milliseconds
     * @param maxIntervalMs     cap applied to the interval before jitter
     * @param multiplier        growth factor applied per additional attempt
     * @param jitterFactor      maximum relative deviation (0.2 means +/-20%)
     */
    public ExponentialBackoffWithJitterStrategy(
            long initialIntervalMs, long maxIntervalMs, double multiplier, double jitterFactor) {
        this.initialIntervalMs = initialIntervalMs;
        this.maxIntervalMs = maxIntervalMs;
        this.multiplier = multiplier;
        this.jitterFactor = jitterFactor;
    }

    /** {@inheritDoc} */
    @Override
    public long getWaitTime(int attempt) {
        long exponential = (long) (initialIntervalMs * Math.pow(multiplier, attempt - 1));
        long capped = Math.min(exponential, maxIntervalMs);

        // Relative deviation drawn uniformly from [-jitterFactor, +jitterFactor).
        double deviation = (random.nextDouble() * 2 - 1) * jitterFactor;
        long jittered = (long) (capped * (1 + deviation));

        return jittered < 0 ? 0L : jittered;
    }
}
/**
 * Strategy whose wait time adapts to recent outcomes: the interval is halved
 * after 5 consecutive successes and doubled after 3 consecutive failures,
 * always staying within {@code [minIntervalMs, maxIntervalMs]}.
 *
 * <p>Callers report outcomes through {@link #recordSuccess()} and
 * {@link #recordFailure()}. Both methods are synchronized because they
 * perform read-modify-write updates on the streak counters and the interval.
 * The original used {@code volatile} fields with {@code ++}, which is not
 * atomic and could lose updates. Reads of the current interval stay lock-free
 * through the volatile field.
 *
 * <p>NOTE(review): with {@code minIntervalMs == 0} the interval starts at 0
 * and doubling keeps it at 0 — confirm callers pass a positive minimum.
 */
public class AdaptiveRetryStrategy implements RetryStrategy {

    private static final int SUCCESSES_BEFORE_SPEEDUP = 5;
    private static final int FAILURES_BEFORE_SLOWDOWN = 3;

    private final long minIntervalMs;
    private final long maxIntervalMs;

    // Written only under the instance lock; volatile so readers need no lock.
    private volatile long currentIntervalMs;

    // Guarded by the instance lock.
    private int consecutiveSuccesses = 0;
    private int consecutiveFailures = 0;

    /**
     * @param minIntervalMs lower bound and initial value of the interval, in milliseconds
     * @param maxIntervalMs upper bound of the interval, in milliseconds
     */
    public AdaptiveRetryStrategy(long minIntervalMs, long maxIntervalMs) {
        this.minIntervalMs = minIntervalMs;
        this.maxIntervalMs = maxIntervalMs;
        this.currentIntervalMs = minIntervalMs;
    }

    /** {@inheritDoc} The attempt number is ignored; the adaptive interval is returned. */
    @Override
    public long getWaitTime(int attempt) {
        return currentIntervalMs;
    }

    /**
     * Records a successful call. Every 5th consecutive success halves the
     * interval (never below the minimum) and restarts the success streak.
     */
    public synchronized void recordSuccess() {
        consecutiveSuccesses++;
        consecutiveFailures = 0;
        if (consecutiveSuccesses >= SUCCESSES_BEFORE_SPEEDUP) {
            currentIntervalMs = Math.max(minIntervalMs, currentIntervalMs / 2);
            consecutiveSuccesses = 0;
        }
    }

    /**
     * Records a failed call. Every 3rd consecutive failure doubles the
     * interval (never above the maximum) and restarts the failure streak.
     */
    public synchronized void recordFailure() {
        consecutiveFailures++;
        consecutiveSuccesses = 0;
        if (consecutiveFailures >= FAILURES_BEFORE_SLOWDOWN) {
            long current = currentIntervalMs;
            // Same result as min(max, current * 2), but cannot overflow long.
            currentIntervalMs = current > maxIntervalMs / 2 ? maxIntervalMs : current * 2;
            consecutiveFailures = 0;
        }
    }

    /** @return the interval currently in effect, in milliseconds */
    public long getCurrentInterval() {
        return currentIntervalMs;
    }
}
11.2.4 使用示例
// Build a retryer: up to 5 attempts, exponential backoff with +/-20% jitter.
// Retry API failures with HTTP 429 or any status >= 500, and every NetworkException.
Retryer<ProductApiClient.ProductListResult> retryer =
        Retryer.<ProductApiClient.ProductListResult>builder()
                .maxAttempts(5)
                .strategy(new ExponentialBackoffWithJitterStrategy(1000, 30000, 2.0, 0.2))
                .retryIf(e -> {
                    if (!(e instanceof ApiException)) {
                        return e instanceof NetworkException;
                    }
                    int status = ((ApiException) e).getHttpStatus();
                    return status >= 500 || status == 429;
                })
                .build();

// Run the request under the retry policy.
try {
    ProductApiClient.ProductListResult result =
            retryer.execute(() -> apiClient.getProductList(page, size, null, null));
    // Process the result here.
} catch (Exception e) {
    // Reached when the failure was not retryable or every attempt failed.
    LogUtils.e(TAG, "请求最终失败", e);
}
11.3 熔断与降级
11.3.1 熔断器原理
┌─────────────────────────────────────────────────────────────────┐
│ 熔断器状态机 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ CLOSED │ │
│ │ (正常) │ │
│ └──────┬──────┘ │
│ │ │
│ 失败率超过阈值 │
│ │ │
│ ▼ │
│ ┌─────────────┐ │
│ │ OPEN │ │
│ │ (熔断) │◀─────────┐ │
│ └──────┬──────┘ │ │
│ │ │ │
│ 等待超时时间 │ │
│ │ 探测失败 │
│ ▼ │ │
│ ┌─────────────┐ │ │
│ │ HALF_OPEN │──────────┘ │
│ │ (半开) │ │
│ └──────┬──────┘ │
│ │ │
│ 探测成功 │
│ │ │
│ ▼ │
│ ┌─────────────┐ │
│ │ CLOSED │ │
│ │ (恢复) │ │
│ └─────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
11.3.2 熔断器实现
package com.dreamworld.circuitbreaker;
import com.dreamworld.utils.LogUtils;
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
/**
 * Circuit breaker with the usual three states.
 *
 * <ul>
 *   <li>{@code CLOSED}: calls pass through. The breaker opens when
 *       {@code failureThreshold} consecutive failures occur, or when at least
 *       10 calls were recorded and the failure rate reaches
 *       {@code failureRateThreshold}.</li>
 *   <li>{@code OPEN}: calls are short-circuited to the fallback until
 *       {@code openDuration} has elapsed.</li>
 *   <li>{@code HALF_OPEN}: calls pass through as probes. One failure re-opens
 *       the breaker; {@code successThreshold} successes close it.</li>
 * </ul>
 *
 * <p>The count rule uses <em>consecutive</em> failures. The original compared
 * the cumulative failure count, which no success ever reset while closed, so
 * the breaker tripped after {@code failureThreshold} failures over its whole
 * lifetime. With the defaults (5 failures, 50% rate, 10 samples) that also made
 * the rate rule unreachable.
 *
 * <p>The failure rate is computed over all calls recorded since the counters
 * were last reset (on entering {@code HALF_OPEN} and on closing).
 */
public class CircuitBreaker {
    private static final String TAG = "CircuitBreaker";

    /** Minimum number of recorded calls before the failure-rate rule applies. */
    private static final int MIN_SAMPLES_FOR_RATE = 10;

    private final String name;
    private final int failureThreshold;          // consecutive failures that trip the breaker
    private final int successThreshold;          // successes needed in HALF_OPEN to close
    private final Duration openDuration;         // how long the breaker stays OPEN
    private final double failureRateThreshold;   // failure rate (0..1) that trips the breaker

    private final AtomicReference<State> state = new AtomicReference<>(State.CLOSED);
    private final AtomicInteger failureCount = new AtomicInteger(0);
    private final AtomicInteger successCount = new AtomicInteger(0);
    private final AtomicInteger totalCount = new AtomicInteger(0);
    private final AtomicInteger consecutiveFailures = new AtomicInteger(0);
    private volatile Instant openTime;

    /**
     * @param name                 label used in log messages and stats
     * @param failureThreshold     consecutive failures that open the breaker
     * @param successThreshold     successes in {@code HALF_OPEN} that close it
     * @param openDuration         time to stay open before probing
     * @param failureRateThreshold failure rate (0..1) that opens the breaker
     */
    public CircuitBreaker(String name, int failureThreshold, int successThreshold,
                          Duration openDuration, double failureRateThreshold) {
        this.name = name;
        this.failureThreshold = failureThreshold;
        this.successThreshold = successThreshold;
        this.openDuration = openDuration;
        this.failureRateThreshold = failureRateThreshold;
    }

    /**
     * Runs the protected operation, or the fallback when the breaker is open.
     *
     * @param callable the protected operation
     * @param fallback the degraded alternative; invoked when the request is
     *                 rejected or when this call's failure leaves the breaker open
     * @param <T>      result type
     * @return the operation's result, or the fallback's result
     * @throws Exception the operation's exception when the breaker is not open
     *                   after the failure, or any exception thrown by the fallback
     */
    public <T> T execute(Callable<T> callable, Callable<T> fallback) throws Exception {
        if (!allowRequest()) {
            LogUtils.w(TAG, String.format("[%s] 熔断器打开,执行降级逻辑", name));
            return fallback.call();
        }
        try {
            T result = callable.call();
            recordSuccess();
            return result;
        } catch (Exception e) {
            recordFailure();
            // If the breaker is open after this failure, degrade instead of failing.
            if (state.get() == State.OPEN) {
                return fallback.call();
            }
            throw e;
        }
    }

    /**
     * Tells whether a request may go through, moving from {@code OPEN} to
     * {@code HALF_OPEN} once the open duration has elapsed.
     *
     * @return {@code true} when the request may proceed
     */
    public boolean allowRequest() {
        State currentState = state.get();
        switch (currentState) {
            case CLOSED:
                return true;
            case OPEN:
                if (shouldAttemptReset()) {
                    // Only the thread that wins the CAS logs and resets counters.
                    if (state.compareAndSet(State.OPEN, State.HALF_OPEN)) {
                        LogUtils.i(TAG, String.format("[%s] 熔断器进入半开状态", name));
                        resetCounters();
                    }
                    return true;
                }
                return false;
            case HALF_OPEN:
                return true;
            default:
                return false;
        }
    }

    /** Records a success; in {@code HALF_OPEN}, closes the breaker once enough probes succeed. */
    private void recordSuccess() {
        consecutiveFailures.set(0);
        successCount.incrementAndGet();
        totalCount.incrementAndGet();
        if (state.get() == State.HALF_OPEN
                && successCount.get() >= successThreshold
                && state.compareAndSet(State.HALF_OPEN, State.CLOSED)) {
            LogUtils.i(TAG, String.format("[%s] 熔断器关闭,恢复正常", name));
            resetCounters();
        }
    }

    /** Records a failure; re-opens from {@code HALF_OPEN}, or trips from {@code CLOSED} when a rule fires. */
    private void recordFailure() {
        consecutiveFailures.incrementAndGet();
        failureCount.incrementAndGet();
        totalCount.incrementAndGet();
        State currentState = state.get();
        if (currentState == State.HALF_OPEN) {
            // A failed probe re-opens the breaker immediately.
            if (state.compareAndSet(State.HALF_OPEN, State.OPEN)) {
                openTime = Instant.now();
                LogUtils.w(TAG, String.format("[%s] 半开状态探测失败,熔断器重新打开", name));
            }
        } else if (currentState == State.CLOSED) {
            if (shouldTrip() && state.compareAndSet(State.CLOSED, State.OPEN)) {
                openTime = Instant.now();
                LogUtils.w(TAG, String.format(
                        "[%s] 熔断器打开,失败次数: %d, 失败率: %.1f%%",
                        name, failureCount.get(), getFailureRate() * 100
                ));
            }
        }
    }

    /** Evaluates the two trip rules: consecutive failures, then failure rate. */
    private boolean shouldTrip() {
        if (consecutiveFailures.get() >= failureThreshold) {
            return true;
        }
        // The rate rule needs enough samples to be meaningful.
        return totalCount.get() >= MIN_SAMPLES_FOR_RATE
                && getFailureRate() >= failureRateThreshold;
    }

    /** Tells whether the breaker has been open for at least {@code openDuration}. */
    private boolean shouldAttemptReset() {
        return openTime != null &&
                Duration.between(openTime, Instant.now()).compareTo(openDuration) >= 0;
    }

    /** Returns failures divided by total recorded calls, or 0 when nothing was recorded. */
    private double getFailureRate() {
        int total = totalCount.get();
        if (total == 0) return 0;
        return (double) failureCount.get() / total;
    }

    /** Zeroes every counter. */
    private void resetCounters() {
        failureCount.set(0);
        successCount.set(0);
        totalCount.set(0);
        consecutiveFailures.set(0);
    }

    /** @return the current state */
    public State getState() {
        return state.get();
    }

    /**
     * @return a snapshot of the counters; the values are read one by one and
     *         are not guaranteed to be mutually consistent under concurrency
     */
    public CircuitBreakerStats getStats() {
        return new CircuitBreakerStats(
                name,
                state.get(),
                failureCount.get(),
                successCount.get(),
                totalCount.get(),
                getFailureRate()
        );
    }

    /** Breaker states. */
    public enum State {
        CLOSED,     // normal operation
        OPEN,       // short-circuiting to the fallback
        HALF_OPEN   // probing for recovery
    }

    /** Immutable snapshot of a breaker's counters. */
    public static class CircuitBreakerStats {
        public final String name;
        public final State state;
        public final int failures;
        public final int successes;
        public final int total;
        public final double failureRate;

        public CircuitBreakerStats(String name, State state, int failures,
                                   int successes, int total, double failureRate) {
            this.name = name;
            this.state = state;
            this.failures = failures;
            this.successes = successes;
            this.total = total;
            this.failureRate = failureRate;
        }

        @Override
        public String toString() {
            return String.format(
                    "CircuitBreaker[%s]{state=%s, failures=%d, successes=%d, rate=%.1f%%}",
                    name, state, failures, successes, failureRate * 100
            );
        }
    }

    /**
     * @param name label of the breaker to build
     * @return a builder with defaults: 5 failures, 3 successes, 30 s open, 50% rate
     */
    public static Builder builder(String name) {
        return new Builder(name);
    }

    /** Fluent builder for {@link CircuitBreaker}. */
    public static class Builder {
        private final String name;
        private int failureThreshold = 5;
        private int successThreshold = 3;
        private Duration openDuration = Duration.ofSeconds(30);
        private double failureRateThreshold = 0.5;

        public Builder(String name) {
            this.name = name;
        }

        /** Sets the number of consecutive failures that open the breaker. */
        public Builder failureThreshold(int threshold) {
            this.failureThreshold = threshold;
            return this;
        }

        /** Sets the number of half-open successes that close the breaker. */
        public Builder successThreshold(int threshold) {
            this.successThreshold = threshold;
            return this;
        }

        /** Sets how long the breaker stays open before probing. */
        public Builder openDuration(Duration duration) {
            this.openDuration = duration;
            return this;
        }

        /** Sets the failure rate (0..1) that opens the breaker. */
        public Builder failureRateThreshold(double threshold) {
            this.failureRateThreshold = threshold;
            return this;
        }

        /** @return the configured breaker */
        public CircuitBreaker build() {
            return new CircuitBreaker(name, failureThreshold, successThreshold,
                    openDuration, failureRateThreshold);
        }
    }
}
11.3.3 降级策略
package com.dreamworld.circuitbreaker;
import com.dreamworld.model.Product;
import com.dreamworld.storage.ProductRepository;
import com.dreamworld.utils.LogUtils;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
/**
 * Supplies degraded responses from the cache repository when the primary
 * product API cannot be used.
 */
public class FallbackService {
    private static final String TAG = "Fallback";

    /** Cache / local storage that backs the degraded responses. */
    private final ProductRepository cacheRepository;

    /**
     * @param cacheRepository repository queried for cached products
     */
    public FallbackService(ProductRepository cacheRepository) {
        this.cacheRepository = cacheRepository;
    }

    /**
     * Degraded product list.
     *
     * @param page page number passed to the cache lookup
     * @param size page size passed to the cache lookup
     * @return the cached page when it is non-empty; otherwise an empty list,
     *         including when the cache lookup throws
     */
    public List<Product> getProductListFallback(int page, int size) {
        LogUtils.w(TAG, "执行商品列表降级,返回缓存数据");
        List<Product> answer = Collections.emptyList();
        try {
            List<Product> fromCache = cacheRepository.findByPage(page, size);
            if (!fromCache.isEmpty()) {
                answer = fromCache;
            }
        } catch (Exception e) {
            LogUtils.e(TAG, "缓存读取失败", e);
        }
        return answer;
    }

    /**
     * Degraded product detail.
     *
     * @param productId id of the requested product
     * @return the cached product when present; otherwise a placeholder
     *         product, including when the cache lookup throws
     */
    public Product getProductDetailFallback(String productId) {
        LogUtils.w(TAG, "执行商品详情降级,返回缓存数据: " + productId);
        Product hit = null;
        try {
            hit = cacheRepository.findById(productId).orElse(null);
        } catch (Exception e) {
            LogUtils.e(TAG, "缓存读取失败", e);
        }
        return hit != null ? hit : createPlaceholderProduct(productId);
    }

    /**
     * Builds the placeholder returned when no cached product is available:
     * the requested id, fixed name and description texts, and a zero price.
     */
    private Product createPlaceholderProduct(String productId) {
        Product stub = new Product();
        stub.setId(productId);
        stub.setName("商品信息暂时不可用");
        stub.setDescription("请稍后重试");
        stub.setPrice(java.math.BigDecimal.ZERO);
        return stub;
    }
}
11.4 数据一致性保障
11.4.1 幂等性设计
package com.dreamworld.idempotent;
import com.dreamworld.utils.LogUtils;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
/**
 * Tracks request ids that were already seen so duplicate requests can be
 * detected within an expiration window.
 *
 * <p>Records are kept in a {@link ConcurrentHashMap}. A daemon thread started
 * by the constructor removes expired records once per minute.
 */
public class IdempotentManager {
    private static final String TAG = "Idempotent";

    /** Seen request ids and their processing records. */
    private final Map<String, ProcessedRequest> processedRequests = new ConcurrentHashMap<>();
    private final long expirationMs;

    /**
     * @param expirationMinutes how long a request id is remembered, in minutes
     */
    public IdempotentManager(long expirationMinutes) {
        this.expirationMs = TimeUnit.MINUTES.toMillis(expirationMinutes);
        startCleanupThread();
    }

    /**
     * Atomically checks whether a request id is new and, if so, registers it.
     *
     * <p>The check and the registration happen inside a single
     * {@code compute} call, so two concurrent callers with the same id cannot
     * both be told the request is new (the original get-then-put could).
     *
     * @param requestId id of the request
     * @return {@code true} for a new (or expired) id, {@code false} for a duplicate
     */
    public boolean checkAndMark(String requestId) {
        ProcessedRequest candidate = new ProcessedRequest(requestId);
        ProcessedRequest winner = processedRequests.compute(requestId,
                (id, existing) -> (existing == null || existing.isExpired()) ? candidate : existing);
        if (winner != candidate) {
            LogUtils.d(TAG, "检测到重复请求: " + requestId);
            return false;
        }
        return true;
    }

    /**
     * Marks a registered request as completed and stores its result.
     * Does nothing when the id is unknown.
     */
    public void markCompleted(String requestId, Object result) {
        ProcessedRequest request = processedRequests.get(requestId);
        if (request != null) {
            request.markCompleted(result);
        }
    }

    /**
     * Marks a registered request as failed and stores the error.
     * Does nothing when the id is unknown.
     */
    public void markFailed(String requestId, Exception error) {
        // Fixed: the original declared this local as "ProcessedRequests",
        // a type that does not exist.
        ProcessedRequest request = processedRequests.get(requestId);
        if (request != null) {
            request.markFailed(error);
        }
    }

    /**
     * @return the stored result of a request, or {@code null} when the id is
     *         unknown or no result was stored
     */
    public Object getResult(String requestId) {
        ProcessedRequest request = processedRequests.get(requestId);
        return request != null ? request.result : null;
    }

    /** Starts the daemon thread that purges expired records once per minute. */
    private void startCleanupThread() {
        Thread cleanupThread = new Thread(() -> {
            while (!Thread.currentThread().isInterrupted()) {
                try {
                    Thread.sleep(60000);
                    cleanup();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }, "IdempotentCleanup");
        cleanupThread.setDaemon(true);
        cleanupThread.start();
    }

    /** Removes every expired record and logs how many were removed. */
    private void cleanup() {
        int removed = 0;
        for (Map.Entry<String, ProcessedRequest> entry : processedRequests.entrySet()) {
            ProcessedRequest record = entry.getValue();
            // remove(key, value) deletes only if the mapping is still this very
            // record, so a record re-registered concurrently is left alone.
            if (record.isExpired() && processedRequests.remove(entry.getKey(), record)) {
                removed++;
            }
        }
        if (removed > 0) {
            LogUtils.d(TAG, "清理 " + removed + " 个过期请求记录");
        }
    }

    /**
     * Processing record of one request. Kept as an inner class because
     * {@link #isExpired()} reads the manager's expiration window.
     */
    private class ProcessedRequest {
        final String requestId;
        final long timestamp;
        volatile boolean completed;
        volatile Object result;
        volatile Exception error;

        ProcessedRequest(String requestId) {
            this.requestId = requestId;
            this.timestamp = System.currentTimeMillis();
        }

        void markCompleted(Object result) {
            this.completed = true;
            this.result = result;
        }

        void markFailed(Exception error) {
            this.completed = true;
            this.error = error;
        }

        boolean isExpired() {
            return System.currentTimeMillis() - timestamp > expirationMs;
        }
    }
}
11.4.2 事务性保存
package com.dreamworld.storage;
import com.dreamworld.model.Product;
import com.dreamworld.utils.LogUtils;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.List;
/**
 * Saves products over JDBC, either in one transaction or with periodic
 * checkpoint commits.
 *
 * <p>In both modes a product whose statement fails is logged, recorded in
 * {@link SaveResult#failedIds}, and skipped; the remaining products are still
 * saved. A failure of the connection or of a commit rolls back the
 * uncommitted work and is reported through {@link SaveResult#error}.
 */
public class TransactionalProductSaver {
    private static final String TAG = "TransactionalSaver";

    private final javax.sql.DataSource dataSource;

    /**
     * @param dataSource source of the JDBC connections
     */
    public TransactionalProductSaver(javax.sql.DataSource dataSource) {
        this.dataSource = dataSource;
    }

    /**
     * Saves all products and commits once at the end.
     *
     * @param products products to save
     * @return counters and failure details; when the transaction is rolled
     *         back, {@code success} is reset to 0 because nothing was persisted
     */
    public SaveResult saveWithTransaction(List<Product> products) {
        SaveResult result = new SaveResult();
        result.total = products.size();
        Connection conn = null;
        try {
            conn = dataSource.getConnection();
            conn.setAutoCommit(false);
            for (Product product : products) {
                try {
                    saveProduct(conn, product);
                    result.success++;
                } catch (SQLException e) {
                    LogUtils.e(TAG, "保存商品失败: " + product.getId(), e);
                    result.failed++;
                    result.failedIds.add(product.getId());
                }
            }
            conn.commit();
            LogUtils.i(TAG, String.format("事务提交成功,保存 %d/%d 个商品",
                    result.success, result.total));
        } catch (SQLException e) {
            rollbackQuietly(conn);
            result.success = 0;
            result.error = e;
        } catch (RuntimeException e) {
            // Roll back before the connection is released: restoring
            // auto-commit on an open transaction would commit partial work.
            rollbackQuietly(conn);
            throw e;
        } finally {
            releaseConnection(conn);
        }
        return result;
    }

    /**
     * Saves products and commits after every {@code checkpointSize}
     * successful saves, plus once at the end for the remainder.
     *
     * <p>When a commit fails, the rows since the last checkpoint are rolled
     * back and removed from {@code success}; rows committed by earlier
     * checkpoints stay persisted and counted.
     *
     * @param products       products to save
     * @param checkpointSize number of successful saves between commits
     * @return counters and failure details
     */
    public SaveResult saveWithCheckpoint(List<Product> products, int checkpointSize) {
        SaveResult result = new SaveResult();
        result.total = products.size();
        Connection conn = null;
        int uncommitted = 0;
        try {
            conn = dataSource.getConnection();
            conn.setAutoCommit(false);
            for (Product product : products) {
                try {
                    saveProduct(conn, product);
                } catch (SQLException e) {
                    LogUtils.e(TAG, "保存商品失败: " + product.getId(), e);
                    result.failed++;
                    result.failedIds.add(product.getId());
                    continue;
                }
                result.success++;
                uncommitted++;
                // The commit is deliberately outside the per-product try: a
                // commit failure is a transaction failure, not a product failure.
                if (uncommitted >= checkpointSize) {
                    conn.commit();
                    LogUtils.d(TAG, "检查点提交: " + result.success + " 条");
                    uncommitted = 0;
                }
            }
            // Commit whatever is left after the last checkpoint.
            if (uncommitted > 0) {
                conn.commit();
                uncommitted = 0;
            }
        } catch (SQLException e) {
            rollbackQuietly(conn);
            result.success -= uncommitted;
            result.error = e;
        } catch (RuntimeException e) {
            rollbackQuietly(conn);
            throw e;
        } finally {
            releaseConnection(conn);
        }
        return result;
    }

    /** Rolls back the current transaction, logging instead of throwing. */
    private void rollbackQuietly(Connection conn) {
        if (conn == null) {
            return;
        }
        try {
            conn.rollback();
            LogUtils.w(TAG, "事务回滚");
        } catch (SQLException re) {
            LogUtils.e(TAG, "回滚失败", re);
        }
    }

    /**
     * Restores auto-commit and closes the connection. The two steps are
     * guarded separately so a failure of the first cannot skip the close.
     */
    private void releaseConnection(Connection conn) {
        if (conn == null) {
            return;
        }
        try {
            conn.setAutoCommit(true);
        } catch (SQLException e) {
            LogUtils.e(TAG, "恢复自动提交失败", e);
        }
        try {
            conn.close();
        } catch (SQLException e) {
            LogUtils.e(TAG, "关闭连接失败", e);
        }
    }

    /** Upserts one product (insert, or update name/price/crawl_time on duplicate key). */
    private void saveProduct(Connection conn, Product product) throws SQLException {
        String sql = "INSERT INTO products (id, name, price, crawl_time) VALUES (?, ?, ?, ?) " +
                "ON DUPLICATE KEY UPDATE name=VALUES(name), price=VALUES(price), crawl_time=VALUES(crawl_time)";
        try (PreparedStatement pstmt = conn.prepareStatement(sql)) {
            pstmt.setString(1, product.getId());
            pstmt.setString(2, product.getName());
            pstmt.setBigDecimal(3, product.getPrice());
            pstmt.setTimestamp(4, java.sql.Timestamp.valueOf(product.getCrawlTime()));
            pstmt.executeUpdate();
        }
    }

    /** Outcome of a save operation. */
    public static class SaveResult {
        public int total;
        public int success;
        public int failed;
        public List<String> failedIds = new java.util.ArrayList<>();
        public Exception error;

        /** @return {@code true} when no error occurred and no product failed */
        public boolean isSuccess() {
            return error == null && failed == 0;
        }
    }
}
11.5 灾难恢复
11.5.1 断点续传
package com.dreamworld.recovery;
import com.dreamworld.utils.LogUtils;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Properties;
/**
 * Persists crawl progress as a properties file so an interrupted task can
 * resume where it stopped.
 *
 * <p>{@link #save()} writes to a sibling temporary file and then moves it
 * over the checkpoint file. The original truncated the live file in place, so
 * a crash in the middle of a save could leave a corrupt checkpoint — the one
 * moment the checkpoint is actually needed.
 */
public class CheckpointManager {
    private static final String TAG = "Checkpoint";

    private final Path checkpointFile;
    private Properties checkpoint;

    /**
     * Opens the checkpoint at the given path, loading it when the file exists.
     *
     * @param checkpointPath path of the checkpoint file
     */
    public CheckpointManager(String checkpointPath) {
        this.checkpointFile = Path.of(checkpointPath);
        this.checkpoint = new Properties();
        load();
    }

    /** Loads the checkpoint file when it exists; a read failure is logged. */
    private void load() {
        if (Files.exists(checkpointFile)) {
            try (InputStream is = Files.newInputStream(checkpointFile)) {
                checkpoint.load(is);
                LogUtils.i(TAG, "加载检查点: " + checkpoint);
            } catch (IOException e) {
                LogUtils.e(TAG, "加载检查点失败", e);
            }
        }
    }

    /**
     * Writes the checkpoint to disk. A failure is logged, not thrown.
     */
    public void save() {
        Path tmp = checkpointFile.resolveSibling(checkpointFile.getFileName() + ".tmp");
        try {
            try (OutputStream os = Files.newOutputStream(tmp)) {
                checkpoint.store(os, "Crawler Checkpoint");
            }
            try {
                Files.move(tmp, checkpointFile,
                        java.nio.file.StandardCopyOption.ATOMIC_MOVE);
            } catch (IOException atomicMoveFailed) {
                // Atomic move is unsupported here or refused to replace the
                // target; fall back to a plain replacing move.
                Files.move(tmp, checkpointFile,
                        java.nio.file.StandardCopyOption.REPLACE_EXISTING);
            }
        } catch (IOException e) {
            LogUtils.e(TAG, "保存检查点失败", e);
        }
    }

    /** Sets a string value (in memory; call {@link #save()} to persist). */
    public void set(String key, String value) {
        checkpoint.setProperty(key, value);
    }

    /** Sets an int value (in memory; call {@link #save()} to persist). */
    public void set(String key, int value) {
        checkpoint.setProperty(key, String.valueOf(value));
    }

    /** Sets a long value (in memory; call {@link #save()} to persist). */
    public void set(String key, long value) {
        checkpoint.setProperty(key, String.valueOf(value));
    }

    /** @return the stored string, or {@code defaultValue} when the key is absent */
    public String get(String key, String defaultValue) {
        return checkpoint.getProperty(key, defaultValue);
    }

    /**
     * @return the stored value parsed as an int, or {@code defaultValue} when
     *         the key is absent or the value is not a valid int
     */
    public int getInt(String key, int defaultValue) {
        String value = checkpoint.getProperty(key);
        if (value == null) {
            return defaultValue;
        }
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException e) {
            LogUtils.w(TAG, "检查点值不是合法整数: " + key + "=" + value);
            return defaultValue;
        }
    }

    /**
     * @return the stored value parsed as a long, or {@code defaultValue} when
     *         the key is absent or the value is not a valid long
     */
    public long getLong(String key, long defaultValue) {
        String value = checkpoint.getProperty(key);
        if (value == null) {
            return defaultValue;
        }
        try {
            return Long.parseLong(value.trim());
        } catch (NumberFormatException e) {
            LogUtils.w(TAG, "检查点值不是合法整数: " + key + "=" + value);
            return defaultValue;
        }
    }

    /** Clears the in-memory values and deletes the checkpoint file. */
    public void clear() {
        checkpoint.clear();
        try {
            Files.deleteIfExists(checkpointFile);
        } catch (IOException e) {
            LogUtils.e(TAG, "删除检查点文件失败", e);
        }
    }

    /**
     * @return {@code true} when a {@code taskId} is recorded and the
     *         {@code status} is anything other than {@code "completed"}
     */
    public boolean hasUnfinishedTask() {
        return checkpoint.containsKey("taskId") &&
                !"completed".equals(checkpoint.getProperty("status"));
    }
}
/**
 * Crawl task that records its progress in a {@link CheckpointManager} after
 * every page, so a later run can resume from the first page not yet saved.
 */
public class RecoverableCrawlTask {
    private static final String TAG = "RecoverableCrawl";

    private final CheckpointManager checkpoint;
    private final ProductApiClient apiClient;
    private final ProductRepository repository;

    /**
     * @param checkpoint progress store
     * @param apiClient  client used to fetch product pages
     * @param repository store the fetched products are saved to
     */
    public RecoverableCrawlTask(CheckpointManager checkpoint,
                                ProductApiClient apiClient,
                                ProductRepository repository) {
        this.checkpoint = checkpoint;
        this.apiClient = apiClient;
        this.repository = repository;
    }

    /**
     * Runs the crawl, resuming an unfinished task when the checkpoint has one.
     *
     * <p>On failure the checkpoint is marked {@code failed} with the error
     * text, saved, and the exception is rethrown.
     */
    public void crawl() {
        String taskId = java.util.UUID.randomUUID().toString();
        if (checkpoint.hasUnfinishedTask()) {
            taskId = checkpoint.get("taskId", taskId);
            LogUtils.i(TAG, "恢复未完成的任务: " + taskId);
            // A resumed task may carry status "failed"; it is running again now.
            checkpoint.set("status", "running");
            checkpoint.save();
        } else {
            // Start from a clean slate so a stale totalPages/error left by a
            // previous completed task is not reused by the new one.
            checkpoint.clear();
            checkpoint.set("taskId", taskId);
            checkpoint.set("status", "running");
            checkpoint.set("startPage", 1);
            checkpoint.save();
        }

        int startPage = checkpoint.getInt("startPage", 1);
        int totalPages = checkpoint.getInt("totalPages", 0);
        try {
            // The total page count is unknown: fetch page 1 to learn it.
            if (totalPages == 0) {
                ProductApiClient.ProductListResult firstPage =
                        apiClient.getProductList(1, 50, null, null);
                if (firstPage == null) {
                    // Without page 1 the page count is unknown; failing here keeps
                    // the task from being marked completed with nothing crawled.
                    throw new IllegalStateException("获取第一页失败: 接口返回空结果");
                }
                totalPages = firstPage.getPages();
                checkpoint.set("totalPages", totalPages);
                if (firstPage.getList() != null) {
                    repository.saveProducts(firstPage.getList());
                }
                startPage = 2;
                checkpoint.set("startPage", startPage);
                checkpoint.save();
            }

            // Continue from the checkpointed page.
            for (int page = startPage; page <= totalPages; page++) {
                LogUtils.d(TAG, String.format("抓取第 %d/%d 页", page, totalPages));
                ProductApiClient.ProductListResult result =
                        apiClient.getProductList(page, 50, null, null);
                if (result != null && result.getList() != null) {
                    repository.saveProducts(result.getList());
                } else {
                    LogUtils.w(TAG, "第 " + page + " 页无数据,已跳过");
                }
                // Record progress only after the page was handled.
                checkpoint.set("startPage", page + 1);
                checkpoint.save();
            }

            checkpoint.set("status", "completed");
            checkpoint.save();
            LogUtils.i(TAG, "任务完成: " + taskId);
        } catch (Exception e) {
            LogUtils.e(TAG, "任务异常,已保存检查点", e);
            checkpoint.set("status", "failed");
            // Properties rejects null values; an exception without a message
            // would otherwise throw here and mask the real failure.
            checkpoint.set("error", e.getMessage() != null ? e.getMessage() : e.toString());
            checkpoint.save();
            throw e;
        }
    }
}
11.5.2 数据备份与恢复
package com.dreamworld.backup;
import com.dreamworld.model.Product;
import com.dreamworld.storage.ProductRepository;
import com.dreamworld.utils.LogUtils;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
/**
 * Backs up every product in a {@link ProductRepository} to gzip-compressed JSON files
 * inside a backup directory, and restores, lists and prunes those files.
 *
 * <p>Backup files are named {@code backup_yyyyMMdd_HHmmss.json.gz} and are always encoded
 * as UTF-8, so a backup written on one machine can be restored on another regardless of
 * the platform default charset.
 */
public class BackupService {
    private static final String TAG = "Backup";
    private static final DateTimeFormatter FORMATTER =
            DateTimeFormatter.ofPattern("yyyyMMdd_HHmmss");
    /** Suffix of in-progress backup files; it keeps them out of {@link #listBackups()}. */
    private static final String TEMP_SUFFIX = ".tmp";

    private final ProductRepository repository;
    private final Path backupDir;
    private final Gson gson;

    /**
     * Creates the service and makes sure the backup directory exists.
     *
     * <p>A failure to create the directory is logged, not thrown; later backup calls will
     * then fail and report the error in their result.
     *
     * @param repository repository to read products from and restore products into
     * @param backupPath directory that holds the backup files
     */
    public BackupService(ProductRepository repository, String backupPath) {
        this.repository = repository;
        this.backupDir = Path.of(backupPath);
        this.gson = new GsonBuilder()
                .setPrettyPrinting()
                .setDateFormat("yyyy-MM-dd'T'HH:mm:ss")
                .create();
        try {
            Files.createDirectories(backupDir);
        } catch (IOException e) {
            LogUtils.e(TAG, "创建备份目录失败", e);
        }
    }

    /**
     * Performs a full backup of all products.
     *
     * <p>The data is first written to a temporary file and renamed to its final name only
     * after the write succeeded, so a failed or interrupted backup never leaves a truncated
     * {@code .json.gz} file that could later be mistaken for a valid backup.
     *
     * @return the outcome; {@code success} is {@code false} and {@code error} is set when
     *         the backup failed. {@code endTime} is set in both cases.
     */
    public BackupResult backup() {
        String timestamp = LocalDateTime.now().format(FORMATTER);
        String filename = "backup_" + timestamp + ".json.gz";
        Path backupFile = backupDir.resolve(filename);
        Path tempFile = backupDir.resolve(filename + TEMP_SUFFIX);
        BackupResult result = new BackupResult();
        result.filename = filename;
        result.startTime = LocalDateTime.now();
        try {
            List<Product> allProducts = repository.findAll();
            result.productCount = allProducts.size();
            // Compress while writing; the charset is explicit so the file is portable.
            try (OutputStream os = Files.newOutputStream(tempFile);
                 GZIPOutputStream gzos = new GZIPOutputStream(os);
                 Writer writer = new OutputStreamWriter(
                         gzos, java.nio.charset.StandardCharsets.UTF_8)) {
                gson.toJson(allProducts, writer);
            }
            // Publish the finished file under its final name.
            Files.move(tempFile, backupFile,
                    java.nio.file.StandardCopyOption.REPLACE_EXISTING);
            result.fileSize = Files.size(backupFile);
            result.success = true;
            LogUtils.i(TAG, String.format("备份完成: %s, %d 条数据, %.2f MB",
                    filename, result.productCount, result.fileSize / 1024.0 / 1024.0));
        } catch (Exception e) {
            result.success = false;
            result.error = e.getMessage();
            LogUtils.e(TAG, "备份失败", e);
            deleteTempFile(tempFile);
        } finally {
            result.endTime = LocalDateTime.now();
        }
        return result;
    }

    /**
     * Restores products from a backup file into the repository.
     *
     * @param filename name of a backup file inside the backup directory
     * @return the outcome; {@code success} is {@code false} and {@code error} is set when
     *         the file does not exist, is empty or unreadable, or saving failed.
     *         {@code endTime} is set in every case.
     */
    public RestoreResult restore(String filename) {
        Path backupFile = backupDir.resolve(filename);
        RestoreResult result = new RestoreResult();
        result.filename = filename;
        result.startTime = LocalDateTime.now();
        if (!Files.exists(backupFile)) {
            result.success = false;
            result.error = "备份文件不存在: " + filename;
            result.endTime = LocalDateTime.now();
            return result;
        }
        try {
            // Read and decompress, using the same charset backup() writes with.
            List<Product> products;
            try (InputStream is = Files.newInputStream(backupFile);
                 GZIPInputStream gzis = new GZIPInputStream(is);
                 Reader reader = new InputStreamReader(
                         gzis, java.nio.charset.StandardCharsets.UTF_8)) {
                java.lang.reflect.Type listType =
                        new com.google.gson.reflect.TypeToken<List<Product>>(){}.getType();
                products = gson.fromJson(reader, listType);
            }
            if (products == null) {
                // Gson returns null for an empty document; report it with a clear message
                // instead of failing on products.size() with a message-less exception.
                throw new IOException("备份文件内容为空: " + filename);
            }
            result.productCount = products.size();
            repository.saveProducts(products);
            result.success = true;
            LogUtils.i(TAG, String.format("恢复完成: %s, %d 条数据",
                    filename, result.productCount));
        } catch (Exception e) {
            result.success = false;
            result.error = e.getMessage();
            LogUtils.e(TAG, "恢复失败", e);
        } finally {
            result.endTime = LocalDateTime.now();
        }
        return result;
    }

    /**
     * Lists the {@code .json.gz} backups in the backup directory, newest first.
     *
     * <p>Entries whose size or modification time could not be read keep a {@code null}
     * {@code createTime} and are placed after all dated entries.
     *
     * @return a mutable list of backups; empty when the directory cannot be listed
     */
    public List<BackupInfo> listBackups() {
        List<BackupInfo> backups = new java.util.ArrayList<>();
        // Files.list holds an open directory handle until the stream is closed.
        try (java.util.stream.Stream<Path> entries = Files.list(backupDir)) {
            entries.filter(p -> p.toString().endsWith(".json.gz"))
                    .forEach(p -> backups.add(describe(p)));
        } catch (IOException | UncheckedIOException e) {
            LogUtils.e(TAG, "列出备份失败", e);
        }
        backups.sort(BackupService::compareNewestFirst);
        return backups;
    }

    /**
     * Deletes the oldest backups, keeping the {@code keepCount} newest ones.
     *
     * @param keepCount number of newest backups to keep; must not be negative
     * @return the number of backup files actually deleted
     * @throws IllegalArgumentException if {@code keepCount} is negative
     */
    public int cleanOldBackups(int keepCount) {
        if (keepCount < 0) {
            throw new IllegalArgumentException("keepCount 不能为负数: " + keepCount);
        }
        List<BackupInfo> backups = listBackups();
        int deleted = 0;
        // The list is sorted newest first, so everything from index keepCount on is surplus.
        for (int i = keepCount; i < backups.size(); i++) {
            String filename = backups.get(i).filename;
            try {
                Files.delete(backupDir.resolve(filename));
                deleted++;
            } catch (IOException e) {
                LogUtils.e(TAG, "删除备份失败: " + filename, e);
            }
        }
        if (deleted > 0) {
            LogUtils.i(TAG, "清理 " + deleted + " 个旧备份");
        }
        return deleted;
    }

    /** Builds the {@link BackupInfo} for one backup file; attribute read failures are logged. */
    private BackupInfo describe(Path file) {
        BackupInfo info = new BackupInfo();
        info.filename = file.getFileName().toString();
        try {
            info.fileSize = Files.size(file);
            info.createTime = Files.getLastModifiedTime(file).toInstant()
                    .atZone(java.time.ZoneId.systemDefault()).toLocalDateTime();
        } catch (IOException e) {
            LogUtils.e(TAG, "读取备份文件属性失败: " + info.filename, e);
        }
        return info;
    }

    /** Orders backups by {@code createTime} descending, with unknown ({@code null}) times last. */
    private static int compareNewestFirst(BackupInfo a, BackupInfo b) {
        if (a.createTime == null) {
            return b.createTime == null ? 0 : 1;
        }
        if (b.createTime == null) {
            return -1;
        }
        return b.createTime.compareTo(a.createTime);
    }

    /** Removes the temporary file of a failed backup; a failure to do so is only logged. */
    private void deleteTempFile(Path tempFile) {
        try {
            Files.deleteIfExists(tempFile);
        } catch (IOException e) {
            LogUtils.e(TAG, "清理临时备份文件失败: " + tempFile.getFileName(), e);
        }
    }

    /**
     * Outcome of a {@link #backup()} call.
     */
    public static class BackupResult {
        public String filename;
        public LocalDateTime startTime;
        public LocalDateTime endTime;
        public int productCount;
        public long fileSize;
        public boolean success;
        public String error;
    }

    /**
     * Outcome of a {@link #restore(String)} call.
     */
    public static class RestoreResult {
        public String filename;
        public LocalDateTime startTime;
        public LocalDateTime endTime;
        public int productCount;
        public boolean success;
        public String error;
    }

    /**
     * Description of one backup file; {@code createTime} is taken from the file's
     * last-modified time and is {@code null} when it could not be read.
     */
    public static class BackupInfo {
        public String filename;
        public long fileSize;
        public LocalDateTime createTime;
    }
}
11.6 本章小结
本章我们深入探讨了错误处理和容灾设计,主要内容包括:
11.6.1 技术要点回顾
┌─────────────────────────────────────────────────────────────────┐
│ 本章技术要点 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 1. 异常分类与处理 │
│ • 可恢复异常 vs 不可恢复异常 │
│ • 自定义异常体系 │
│ • 全局异常处理器 │
│ │
│ 2. 重试机制 │
│ • 固定间隔策略 │
│ • 指数退避策略 │
│ • 指数退避+抖动策略 │
│ • 自适应策略 │
│ │
│ 3. 熔断与降级 │
│ • 熔断器状态机 │
│ • 熔断触发条件 │
│ • 降级策略实现 │
│ │
│ 4. 数据一致性 │
│ • 幂等性设计 │
│ • 事务性保存 │
│ • 检查点机制 │
│ │
│ 5. 灾难恢复 │
│ • 断点续传 │
│ • 数据备份与恢复 │
│ • 备份清理策略 │
│ │
└─────────────────────────────────────────────────────────────────┘
11.6.2 容灾设计原则
1. 快速失败(Fail Fast)
- 尽早发现问题
- 避免错误扩散
- 减少资源浪费
2. 优雅降级(Graceful Degradation)
- 核心功能优先
- 提供备选方案
- 保持用户体验
3. 故障隔离(Fault Isolation)
- 熔断器隔离故障
- 限制影响范围
- 防止级联失败
4. 自动恢复(Auto Recovery)
- 断点续传
- 自动重试
- 状态恢复
11.6.3 最佳实践检查清单
□ 异常处理
□ 定义清晰的异常分类
□ 实现全局异常处理
□ 记录详细的错误日志
□ 重试机制
□ 选择合适的重试策略
□ 设置合理的重试次数
□ 避免重试风暴
□ 熔断降级
□ 为关键服务配置熔断器
□ 实现降级逻辑
□ 监控熔断器状态
□ 数据保护
□ 实现幂等性
□ 使用事务保证一致性
□ 定期备份数据
□ 恢复能力
□ 实现断点续传
□ 测试恢复流程
□ 文档化恢复步骤
11.6.4 下一章预告
在下一章《Android安全机制全景》中,我们将从防御者的视角,全面分析Android平台的安全机制:
- Android安全架构
- 应用签名与验证
- 权限系统
- 沙箱机制
- 安全增强技术
本章完