熔断限流从入门到实战:打造高可用微服务架构
一、为什么需要熔断限流?
在分布式系统中,服务间的依赖关系日益复杂。当一个服务出现故障或响应过慢时,如果不及时隔离,会引发"雪崩效应",导致整个系统瘫痪。
真实案例:某电商系统在大促期间,推荐服务因为数据库慢查询导致响应超时,调用方持续重试,最终使整个订单系统崩溃,造成数百万元损失。
熔断限流正是解决这类问题的核心手段:
- 熔断(Circuit Breaker):当检测到下游服务故障率达到阈值时,自动切断调用,快速失败,保护系统稳定性
- 限流(Rate Limiting):控制请求速率,防止系统过载
二、自定义实现熔断器
让我们从零开始,手动实现一个简单的熔断器,深入理解其核心原理。
2.1 熔断器状态机
熔断器有三种状态:
- 关闭(CLOSED):正常状态,请求正常通过
- 开启(OPEN):熔断状态,直接返回失败,不调用下游服务
- 半开(HALF_OPEN):尝试恢复状态,允许少量请求通过探测服务是否恢复
2.2 核心代码实现
package com.example.circuitbreaker;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.atomic.AtomicLong;
/**
 * A minimal three-state circuit breaker (CLOSED, OPEN, HALF_OPEN).
 *
 * <p>While CLOSED every call is executed. Once {@code failureThreshold} consecutive
 * failures have been recorded the breaker switches to OPEN and short-circuits calls to
 * the fallback. After {@code halfOpenWaitMillis} the next call moves the breaker to
 * HALF_OPEN and is executed as a probe: a success closes the breaker, a failure
 * re-opens it.
 *
 * <p>State is kept in atomics, so instances may be shared between threads. Note that
 * every call arriving while the breaker is HALF_OPEN is let through, not just one probe.
 *
 * @author 架构师
 * @version 1.0.0
 */
public class CircuitBreaker {

    /** Breaker states. */
    public enum State {
        CLOSED,    // normal operation, calls pass through
        OPEN,      // tripped, calls are short-circuited to the fallback
        HALF_OPEN  // probing whether the downstream has recovered
    }

    // Configuration
    private final int failureThreshold;     // consecutive failures that trip the breaker
    private final long timeoutMillis;       // duration above which a failed call is logged as a timeout
    private final long halfOpenWaitMillis;  // how long to stay OPEN before probing

    // Runtime state
    private final AtomicReference<State> state = new AtomicReference<>(State.CLOSED);
    private final AtomicInteger failureCount = new AtomicInteger(0);
    private final AtomicLong nextAttemptTime = new AtomicLong(0);

    /**
     * @param failureThreshold   number of consecutive failures that opens the breaker
     * @param timeoutMillis      failed calls that took longer than this are logged as timeouts;
     *                           slow calls that return normally are NOT counted as failures
     * @param halfOpenWaitMillis time to remain OPEN before a probe call is allowed
     */
    public CircuitBreaker(int failureThreshold, long timeoutMillis, long halfOpenWaitMillis) {
        this.failureThreshold = failureThreshold;
        this.timeoutMillis = timeoutMillis;
        this.halfOpenWaitMillis = halfOpenWaitMillis;
    }

    /**
     * Runs {@code supplier} under breaker protection.
     *
     * @param supplier the protected call; any {@link Exception} it throws counts as a failure
     * @param fallback invoked when the breaker rejects the call or the call fails
     * @param <T>      result type
     * @return the supplier's result, or the fallback's result
     * @throws RuntimeException      whatever unchecked exception the fallback throws
     * @throws IllegalStateException if the fallback throws a checked exception (set as cause)
     */
    public <T> T execute(Supplier<T> supplier, Supplier<T> fallback) {
        if (!allowRequest()) {
            return invokeFallback(fallback);
        }
        long startTime = System.currentTimeMillis();
        T result;
        try {
            result = supplier.get();
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Do not lose the interrupt just because the failure is being absorbed.
                Thread.currentThread().interrupt();
            }
            onFailure(startTime);
            return invokeFallback(fallback);
        }
        onSuccess();
        return result;
    }

    /**
     * Calls the fallback and converts a checked exception into an unchecked one, because
     * {@link #execute} does not declare any checked exception.
     */
    private static <T> T invokeFallback(Supplier<T> fallback) {
        try {
            return fallback.get();
        } catch (RuntimeException e) {
            throw e;
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            throw new IllegalStateException("降级逻辑执行失败", e);
        }
    }

    /** Decides whether the current call may reach the protected supplier. */
    private boolean allowRequest() {
        switch (state.get()) {
            case CLOSED:
            case HALF_OPEN:
                return true;
            case OPEN:
                // Only the thread that wins the OPEN -> HALF_OPEN transition becomes the probe.
                if (System.currentTimeMillis() >= nextAttemptTime.get()
                        && state.compareAndSet(State.OPEN, State.HALF_OPEN)) {
                    System.out.println("熔断器进入半开状态,尝试恢复...");
                    return true;
                }
                return false;
            default:
                return false;
        }
    }

    /** Records a successful call: clears the failure streak and closes a half-open breaker. */
    private void onSuccess() {
        failureCount.set(0);
        if (state.compareAndSet(State.HALF_OPEN, State.CLOSED)) {
            System.out.println("服务已恢复,熔断器关闭");
        }
    }

    /**
     * Records a failed call and trips the breaker when required.
     *
     * @param startTime wall-clock time (ms) at which the failed call started
     */
    private void onFailure(long startTime) {
        int currentFailures = failureCount.incrementAndGet();
        long now = System.currentTimeMillis();

        long duration = now - startTime;
        if (duration > timeoutMillis) {
            System.out.println("请求超时: " + duration + "ms");
        }

        // A failure while HALF_OPEN always re-opens the breaker, even if a late success
        // from an older call has reset the counter in the meantime; otherwise the breaker
        // could stay HALF_OPEN and let all traffic through. While CLOSED the threshold applies.
        boolean tripped = state.compareAndSet(State.HALF_OPEN, State.OPEN)
                || (currentFailures >= failureThreshold
                        && state.compareAndSet(State.CLOSED, State.OPEN));
        if (tripped) {
            long waitTime = halfOpenWaitMillis;
            nextAttemptTime.set(now + waitTime);
            System.out.println("失败次数达到阈值 " + failureThreshold + ",熔断器开启,等待 " +
                    waitTime + "ms 后尝试恢复");
        }
    }

    /** @return the current breaker state */
    public State getState() {
        return state.get();
    }

    /** @return the number of failures recorded since the last success */
    public int getFailureCount() {
        return failureCount.get();
    }

    /**
     * Like {@link java.util.function.Supplier}, but the call may throw a checked exception.
     *
     * @param <T> result type
     */
    @FunctionalInterface
    public interface Supplier<T> {
        T get() throws Exception;
    }
}
2.3 使用示例
package com.example.circuitbreaker;
/**
 * Demo driver for {@code CircuitBreaker}: fires a series of calls against a simulated
 * flaky service and prints the result and breaker state after each one.
 */
public class CircuitBreakerExample {

    private static final int TOTAL_CALLS = 20;
    private static final long PAUSE_MILLIS = 500;

    public static void main(String[] args) {
        // Trip after 3 failures, 2000 ms timeout setting, wait 5000 ms before probing.
        CircuitBreaker breaker = new CircuitBreaker(3, 2000, 5000);

        int attempt = 1;
        while (attempt <= TOTAL_CALLS) {
            String outcome = breaker.execute(
                    CircuitBreakerExample::flakyRemoteCall,
                    () -> "降级处理");

            StringBuilder line = new StringBuilder();
            line.append("第").append(attempt).append("次调用: ").append(outcome)
                    .append(", 熔断器状态: ").append(breaker.getState())
                    .append(", 失败次数: ").append(breaker.getFailureCount());
            System.out.println(line);

            pause();
            attempt++;
        }
    }

    /** Simulated remote call that fails when {@code Math.random()} is below 0.4. */
    private static String flakyRemoteCall() {
        if (Math.random() < 0.4) {
            throw new RuntimeException("服务异常");
        }
        return "调用成功";
    }

    /** Sleeps between calls; an interrupt is restored on the thread and the demo continues. */
    private static void pause() {
        try {
            Thread.sleep(PAUSE_MILLIS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}
三、自定义限流器
3.1 滑动窗口算法
相比固定窗口算法,滑动窗口算法可以避免窗口边界处的流量突刺,更精确地控制请求速率。
package com.example.ratelimiter;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Sliding-window rate limiter: admits at most {@code maxRequests} calls within any
 * trailing window of {@code windowSizeMillis} milliseconds.
 *
 * <p>Both public methods are synchronized on the instance, so the limiter may be shared
 * between threads.
 *
 * @author 架构师
 */
public class SlidingWindowRateLimiter {

    private final int maxRequests;        // maximum admitted requests per window
    private final long windowSizeMillis;  // window length in milliseconds
    // Timestamps of admitted requests, oldest first.
    private final ConcurrentLinkedQueue<Long> requestTimestamps = new ConcurrentLinkedQueue<>();
    private final AtomicInteger currentCount = new AtomicInteger(0);

    /**
     * @param maxRequests      maximum number of requests admitted per window
     * @param windowSizeMillis window length in milliseconds
     */
    public SlidingWindowRateLimiter(int maxRequests, long windowSizeMillis) {
        this.maxRequests = maxRequests;
        this.windowSizeMillis = windowSizeMillis;
    }

    /**
     * Tries to admit one request.
     *
     * @return {@code true} if the request fits in the current window and was recorded,
     *         {@code false} if the window is already full
     */
    public synchronized boolean tryAcquire() {
        long currentTime = System.currentTimeMillis();
        evictExpired(currentTime);
        if (currentCount.get() >= maxRequests) {
            return false;
        }
        requestTimestamps.offer(currentTime);
        currentCount.incrementAndGet();
        return true;
    }

    /**
     * Returns the number of requests admitted within the current window. Expired entries
     * are evicted first, so the value does not go stale while no {@link #tryAcquire()}
     * calls arrive.
     */
    public synchronized int getCurrentCount() {
        evictExpired(System.currentTimeMillis());
        return currentCount.get();
    }

    /** Drops every recorded timestamp that is older than the window ending at {@code now}. */
    private void evictExpired(long now) {
        long windowStart = now - windowSizeMillis;
        Long oldest;
        while ((oldest = requestTimestamps.peek()) != null && oldest < windowStart) {
            requestTimestamps.poll();
            currentCount.decrementAndGet();
        }
    }
}
3.2 令牌桶算法
令牌桶算法可以应对突发流量。
package com.example.ratelimiter;
import java.util.concurrent.atomic.AtomicLong;
/**
 * Token-bucket rate limiter: the bucket starts full with {@code capacity} tokens and is
 * refilled at {@code refillRate} tokens per second, which allows short bursts up to the
 * bucket capacity.
 *
 * <p>All public methods are synchronized on the instance, so refill and consumption
 * never interleave.
 *
 * @author 架构师
 */
public class TokenBucketRateLimiter {

    private final long capacity;              // bucket capacity
    private final long refillRate;            // tokens added per second
    private final AtomicLong tokens;          // tokens currently available
    private final AtomicLong lastRefillTime;  // time (ms) up to which tokens have been credited

    /**
     * @param capacity   maximum number of tokens the bucket can hold; the bucket starts full
     * @param refillRate tokens added per second
     */
    public TokenBucketRateLimiter(long capacity, long refillRate) {
        this.capacity = capacity;
        this.refillRate = refillRate;
        this.tokens = new AtomicLong(capacity);
        this.lastRefillTime = new AtomicLong(System.currentTimeMillis());
    }

    /**
     * Tries to take {@code permits} tokens from the bucket.
     *
     * @param permits number of tokens requested; must not be negative
     * @return {@code true} if the tokens were available and have been consumed
     * @throws IllegalArgumentException if {@code permits} is negative (a negative request
     *         would otherwise add tokens to the bucket)
     */
    public synchronized boolean tryAcquire(int permits) {
        if (permits < 0) {
            throw new IllegalArgumentException("permits must not be negative: " + permits);
        }
        refillTokens();
        if (tokens.get() < permits) {
            return false;
        }
        tokens.addAndGet(-permits);
        return true;
    }

    /** Credits the tokens earned since the last refill. Callers must hold the instance lock. */
    private void refillTokens() {
        long now = System.currentTimeMillis();
        long elapsedMillis = now - lastRefillTime.get();
        if (elapsedMillis <= 0 || refillRate <= 0) {
            return;
        }
        long tokensToAdd = (elapsedMillis * refillRate) / 1000;
        if (tokensToAdd <= 0) {
            return;
        }
        long current = tokens.get();
        if (tokensToAdd >= capacity - current) {
            // Bucket is full: no credit is carried over while it stays full.
            tokens.set(capacity);
            lastRefillTime.set(now);
        } else {
            tokens.set(current + tokensToAdd);
            // Advance only by the time that paid for the whole tokens just added, so the
            // fractional remainder still counts towards the next token. Rounded up, which
            // never exceeds elapsedMillis and never over-credits.
            long consumedMillis = (tokensToAdd * 1000 + refillRate - 1) / refillRate;
            lastRefillTime.addAndGet(consumedMillis);
        }
    }

    /** @return the number of tokens available right now, after crediting any pending refill */
    public synchronized long getAvailableTokens() {
        refillTokens();
        return tokens.get();
    }
}
四、生产级解决方案:Resilience4j
自定义实现适合学习原理,但在生产环境中,我们推荐使用成熟的开源框架。Resilience4j是Java领域最流行的容错库。
4.1 Resilience4j核心模块
Resilience4j提供多个模块,各司其职:
- Circuit Breaker:熔断器
- Rate Limiter:限流器
- Retry:重试机制
- TimeLimiter:超时控制(限制调用的最长执行时间)
- Bulkhead:舱壁隔离(限制并发数)
- Cache:结果缓存
4.2 快速集成
Maven依赖:
<!-- Spring Boot 2 starter for Resilience4j (auto-configuration and annotation support). -->
<!-- NOTE(review): the Resilience4j Spring Boot guide expects spring-boot-starter-aop and -->
<!-- spring-boot-starter-actuator to be provided by the application; confirm both are declared. -->
<dependency>
<groupId>io.github.resilience4j</groupId>
<artifactId>resilience4j-spring-boot2</artifactId>
<version>2.1.0</version>
</dependency>
<!-- Core modules used by this article. -->
<!-- NOTE(review): the starter presumably pulls these in transitively; verify before removing. -->
<dependency>
<groupId>io.github.resilience4j</groupId>
<artifactId>resilience4j-circuitbreaker</artifactId>
<version>2.1.0</version>
</dependency>
<dependency>
<groupId>io.github.resilience4j</groupId>
<artifactId>resilience4j-ratelimiter</artifactId>
<version>2.1.0</version>
</dependency>
<dependency>
<groupId>io.github.resilience4j</groupId>
<artifactId>resilience4j-retry</artifactId>
<version>2.1.0</version>
</dependency>
<dependency>
<groupId>io.github.resilience4j</groupId>
<artifactId>resilience4j-bulkhead</artifactId>
<version>2.1.0</version>
</dependency>
配置文件:
# Resilience4j configuration. Each module has a shared "default" config under `configs`;
# named instances under `instances` inherit from it through `base-config`.
resilience4j:
  circuitbreaker:
    configs:
      default:
        failure-rate-threshold: 50                        # failure-rate threshold: 50%
        wait-duration-in-open-state: 10s                  # time to stay open before probing
        sliding-window-size: 10                           # sliding window size
        minimum-number-of-calls: 5                        # minimum number of calls
        permitted-number-of-calls-in-half-open-state: 3   # calls permitted while half-open
    instances:
      remoteService:
        base-config: default
  ratelimiter:
    configs:
      default:
        limit-refresh-period: 1s        # rate-limit period
        limit-for-period: 100           # requests permitted per period
        register-health-indicator: true
    instances:
      remoteService:
        base-config: default
  retry:
    configs:
      default:
        max-attempts: 3                 # maximum number of attempts
        wait-duration: 500ms            # wait between attempts
        retry-exceptions:
          - java.lang.RuntimeException
  bulkhead:
    configs:
      default:
        max-concurrent-calls: 50        # maximum concurrent calls
        max-wait-duration: 0s           # maximum time to wait for a permit
4.3 代码实现
package com.example.resilience;
import io.github.resilience4j.circuitbreaker.annotation.CircuitBreaker;
import io.github.resilience4j.ratelimiter.annotation.RateLimiter;
import io.github.resilience4j.retry.annotation.Retry;
import io.github.resilience4j.bulkhead.annotation.Bulkhead;
import org.springframework.stereotype.Service;
import java.util.Random;
/**
 * Proxy for a (simulated) remote service, protected by Resilience4j annotations.
 *
 * @author 架构师
 */
@Service
public class RemoteServiceProxy {

    private final Random random = new Random();

    /**
     * Calls the remote service with circuit breaker, rate limiter, retry and bulkhead applied.
     *
     * <p>The bulkhead uses the default semaphore type. The thread-pool type only supports
     * methods returning a {@code CompletionStage}, so it cannot wrap this synchronous
     * {@code String} method, and it does not read the {@code resilience4j.bulkhead}
     * configuration section.
     *
     * <p>NOTE(review): with Resilience4j's default aspect order, Retry wraps CircuitBreaker.
     * Because the fallback is declared on the circuit breaker, failures are presumably
     * converted to the fallback value before Retry sees them, so retries may never fire.
     * Confirm the intended behaviour; moving {@code fallbackMethod} to {@code @Retry} is the
     * usual remedy.
     *
     * @param request payload echoed back in the response
     * @return the service response, or the fallback message when the call is rejected or fails
     */
    @CircuitBreaker(name = "remoteService", fallbackMethod = "fallback")
    @RateLimiter(name = "remoteService")
    @Retry(name = "remoteService")
    @Bulkhead(name = "remoteService")
    public String callRemoteService(String request) {
        simulateRemoteCall();
        return "服务响应: " + request;
    }

    /**
     * Fallback for {@link #callRemoteService(String)}; returns a fixed degraded-mode message.
     *
     * @param request   the original request (unused)
     * @param exception the failure that triggered the fallback (unused)
     */
    private String fallback(String request, Exception exception) {
        return "[降级] 服务暂时不可用,请稍后再试";
    }

    /**
     * Simulates a remote call: fails with a {@link RuntimeException} in roughly 30% of
     * calls, otherwise sleeps 50-149 ms to mimic network latency.
     */
    private void simulateRemoteCall() {
        if (random.nextInt(10) < 3) {
            throw new RuntimeException("服务调用失败");
        }
        try {
            Thread.sleep(50 + random.nextInt(100));
        } catch (InterruptedException e) {
            // Restore the flag so callers can still observe the interrupt.
            Thread.currentThread().interrupt();
        }
    }
}
4.4 控制器层
package com.example.controller;
import com.example.resilience.RemoteServiceProxy;
import io.github.resilience4j.circuitbreaker.CircuitBreakerRegistry;
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*;
/**
 * REST endpoints for calling the protected remote service and for inspecting the state
 * of its circuit breaker.
 *
 * @author 架构师
 */
@RestController
@RequestMapping("/api")
public class ApiController {

    @Autowired
    private RemoteServiceProxy remoteService;

    @Autowired
    private CircuitBreakerRegistry circuitBreakerRegistry;

    /**
     * Forwards the path variable to the remote service proxy.
     *
     * @param request value taken from the URL path
     * @return whatever the proxy returns
     */
    @GetMapping("/call/{request}")
    public String callService(@PathVariable String request) {
        String response = remoteService.callRemoteService(request);
        return response;
    }

    /**
     * Reports the state of the "remoteService" circuit breaker.
     *
     * @return a single-entry map with key {@code "state"} and the state name as its value
     */
    @GetMapping("/circuit-breaker/state")
    public Object getCircuitBreakerState() {
        String stateName = circuitBreakerRegistry
                .circuitBreaker("remoteService")
                .getState()
                .name();
        return java.util.Collections.singletonMap("state", stateName);
    }
}
五、生产环境实战
5.1 系统架构
在高并发系统中,我们采用多层防护策略:
- 网关层限流:基于IP、API路径的全局限流
- 应用层熔断:针对每个下游服务的熔断保护
- 数据库层连接池:限制数据库并发连接数
5.2 典型应用场景
场景1:第三方API调用
/**
 * Scenario 1: calling a third-party payment gateway behind a circuit breaker and a
 * rate limiter, with a local queue as the fallback.
 */
@Service
public class PaymentService {

    /**
     * Charges the request through the third-party payment gateway.
     *
     * @param request the payment to process
     * @return the gateway's result
     */
    @CircuitBreaker(name = "paymentGateway", fallbackMethod = "localPaymentQueue")
    @RateLimiter(name = "paymentGateway")
    public PaymentResult processPayment(PaymentRequest request) {
        PaymentResult charged = paymentGatewayClient.charge(request);
        return charged;
    }

    /**
     * Fallback: stores the request in the local queue for a later retry and reports the
     * payment as pending.
     *
     * @param request the payment that could not be processed
     * @param ex      the failure that triggered the fallback (unused)
     */
    private PaymentResult localPaymentQueue(PaymentRequest request, Exception ex) {
        paymentQueueService.enqueue(request);
        PaymentResult pending = PaymentResult.pending();
        return pending;
    }
}
场景2:数据库查询保护
/**
 * Scenario 2: protecting database queries with a bulkhead that caps concurrent calls.
 */
@Service
public class UserService {

    /**
     * Loads users by id while the "databaseQuery" bulkhead limits concurrent queries.
     *
     * <p>The {@code @Bulkhead} annotation has no attribute for the concurrency limit, so
     * the limit of 10 is set in configuration:
     * {@code resilience4j.bulkhead.instances.databaseQuery.max-concurrent-calls: 10}.
     *
     * @param ids ids of the users to load
     * @return the users found for the given ids
     */
    @Bulkhead(name = "databaseQuery")
    public List<User> getUsersByIds(List<Long> ids) {
        return userRepository.findAllById(ids);
    }
}
场景3:缓存穿透保护
/**
 * Scenario 3: product search behind a circuit breaker, with a cache-backed fallback.
 */
@Service
public class ProductService {

    /**
     * Searches products through the repository; results are cached per keyword.
     *
     * @param keyword search keyword, also used as the cache key
     * @return the matching products
     */
    @CircuitBreaker(name = "productSearch", fallbackMethod = "searchFromCache")
    @Cacheable(value = "products", key = "#keyword")
    public List<Product> searchProducts(String keyword) {
        List<Product> matches = productRepository.search(keyword);
        return matches;
    }

    /**
     * Fallback: answers the search from the product cache service.
     *
     * @param keyword search keyword
     * @param ex      the failure that triggered the fallback (unused)
     */
    private List<Product> searchFromCache(String keyword, Exception ex) {
        List<Product> cached = productCacheService.search(keyword);
        return cached;
    }
}
5.3 分布式限流
在微服务架构中,需要实现分布式限流:
/**
 * Redis-backed fixed-window rate limiter shared by all service instances.
 */
@Service
public class DistributedRateLimiter {

    /**
     * Lua script, executed atomically by Redis.
     * KEYS[1] = counter key, ARGV[1] = window length in seconds, ARGV[2] = limit.
     * Returns the counter value after admitting the request (1..limit), or 0 when rejected.
     */
    private static final String LUA_SCRIPT =
            "local current = redis.call('get', KEYS[1]) " +
            "if current == false then " +
            " redis.call('set', KEYS[1], 1) " +
            " redis.call('expire', KEYS[1], ARGV[1]) " +
            " return 1 " +
            "elseif tonumber(current) < tonumber(ARGV[2]) then " +
            " return redis.call('incr', KEYS[1]) " +
            "else " +
            " return 0 " +
            "end";

    // Built once instead of on every call.
    private static final DefaultRedisScript<Long> RATE_LIMIT_SCRIPT =
            new DefaultRedisScript<>(LUA_SCRIPT, Long.class);

    // NOTE(review): the script applies tonumber() to ARGV, so the template presumably
    // needs a String value serializer (e.g. StringRedisTemplate) - confirm the bean setup.
    @Autowired
    private RedisTemplate<String, String> redisTemplate;

    /**
     * Tries to admit one request for {@code key} in the current window.
     *
     * @param key           identifies what is being limited (appended to "rate:limit:")
     * @param limit         maximum number of requests per window; values <= 0 reject everything
     * @param windowSeconds window length in seconds
     * @return {@code true} if the request is within the limit
     */
    public boolean tryAcquire(String key, int limit, int windowSeconds) {
        if (limit <= 0) {
            // The script always admits the first request of a window, so a non-positive
            // limit has to be rejected here.
            return false;
        }
        String redisKey = "rate:limit:" + key;
        Long result = redisTemplate.execute(RATE_LIMIT_SCRIPT,
                Collections.singletonList(redisKey),
                String.valueOf(windowSeconds),
                String.valueOf(limit));
        // Any positive value is the counter after admission; only 0 means rejected.
        return result != null && result > 0L;
    }
}
六、最佳实践
6.1 配置原则
- 失败率阈值:建议设置为50%,避免过于敏感
- 熔断等待时间:根据服务恢复时间设置,一般10-30秒
- 半开状态请求数:建议3-5个,避免过多请求冲击
6.2 常见陷阱
❌ 错误做法1:所有服务使用统一配置
// Not recommended: one configuration for everything
@CircuitBreaker(name = "default") // every service shares the same configuration
✅ 正确做法:针对不同服务特点配置
// Recommended: a dedicated configuration per service
@CircuitBreaker(name = "criticalService") // core service
@CircuitBreaker(name = "normalService") // ordinary service
❌ 错误做法2:降级逻辑抛异常
// Not recommended: the fallback itself throws, so the caller still receives an exception
private String fallback(String req, Exception ex) {
throw new RuntimeException("降级失败");
}
✅ 正确做法:降级返回安全值
// Recommended: the fallback returns a default or cached value instead of throwing
private String fallback(String req, Exception ex) {
    String cached = getCachedValue(req);
    return cached;
}
七、总结
熔断限流是保障微服务高可用的核心手段。