跳到主要内容

服务熔断与降级详解

在微服务系统中,服务间的依赖关系复杂,单个服务的故障可能会像雪崩一样迅速传播到整个系统。熔断器模式是解决这类问题的有效手段,它通过快速失败和降级策略来保护系统免受级联故障的影响。

核心价值

熔断器 = 快速失败 + 故障隔离 + 自动恢复 + 降级保护 + 系统稳定

1. 熔断器基础概念

1.1 什么是熔断器?

熔断器模式来源于电路中的保险丝概念,当电路过载时,保险丝会熔断以保护整个电路系统。在软件系统中,熔断器同样起到保护作用,当依赖的服务出现故障时,熔断器会快速失败,避免故障传播。

熔断器的核心思想

熔断器核心思想示例
java
1public class CircuitBreakerConcept {
2 /*
3 * 熔断器的核心思想
4 * 1. 监控:持续监控依赖服务的调用结果
5 * 2. 判断:根据失败率、响应时间等指标判断服务状态
6 * 3. 保护:当服务异常时,快速失败,避免资源浪费
7 * 4. 恢复:当服务恢复时,逐步恢复正常调用
8 * 5. 降级:提供降级策略,保证系统可用性
9 */
10
11 // 熔断器状态枚举
12 public enum CircuitBreakerState {
13 CLOSED, // 关闭状态:正常处理请求
14 OPEN, // 开启状态:快速失败
15 HALF_OPEN // 半开状态:允许部分请求通过
16 }
17
18 // 熔断器基本结构
19 public class CircuitBreaker {
20 private CircuitBreakerState state = CircuitBreakerState.CLOSED;
21 private int failureCount = 0;
22 private int successCount = 0;
23 private long lastFailureTime = 0;
24
25 // 配置参数
26 private final int failureThreshold; // 失败阈值
27 private final int successThreshold; // 成功阈值
28 private final long timeout; // 超时时间
29 private final long resetTimeout; // 重置超时时间
30
31 public CircuitBreaker(int failureThreshold, int successThreshold,
32 long timeout, long resetTimeout) {
33 this.failureThreshold = failureThreshold;
34 this.successThreshold = successThreshold;
35 this.timeout = timeout;
36 this.resetTimeout = resetTimeout;
37 }
38
39 // 执行方法
40 public <T> T execute(Supplier<T> supplier) {
41 if (isOpen()) {
42 throw new CircuitBreakerOpenException("Circuit breaker is open");
43 }
44
45 try {
46 T result = supplier.get();
47 onSuccess();
48 return result;
49 } catch (Exception e) {
50 onFailure();
51 throw e;
52 }
53 }
54
55 // 判断是否开启
56 private boolean isOpen() {
57 if (state == CircuitBreakerState.OPEN) {
58 if (System.currentTimeMillis() - lastFailureTime > resetTimeout) {
59 state = CircuitBreakerState.HALF_OPEN;
60 successCount = 0;
61 return false;
62 }
63 return true;
64 }
65 return false;
66 }
67
68 // 成功处理
69 private void onSuccess() {
70 if (state == CircuitBreakerState.HALF_OPEN) {
71 if (++successCount >= successThreshold) {
72 state = CircuitBreakerState.CLOSED;
73 failureCount = 0;
74 }
75 }
76 }
77
78 // 失败处理
79 private void onFailure() {
80 lastFailureTime = System.currentTimeMillis();
81
82 if (state == CircuitBreakerState.CLOSED) {
83 if (++failureCount >= failureThreshold) {
84 state = CircuitBreakerState.OPEN;
85 }
86 } else if (state == CircuitBreakerState.HALF_OPEN) {
87 state = CircuitBreakerState.OPEN;
88 successCount = 0;
89 }
90 }
91 }
92
93 // 熔断器异常
94 public class CircuitBreakerOpenException extends RuntimeException {
95 public CircuitBreakerOpenException(String message) {
96 super(message);
97 }
98 }
99}

1.2 熔断器的三种状态

状态转换图

1熔断器状态转换:
2┌─────────────┐ 失败次数达到阈值 ┌─────────────┐
3│ CLOSED │ ─────────────────────▶ │ OPEN │
4│ (关闭状态) │ │ (开启状态) │
5│ │ ◀───────────────────── │ │
6│ 正常处理请求 │ 超时时间到达 │ 快速失败 │
7└─────────────┘ └─────────────┘
8 │ │
9 │ 成功次数达到阈值 │
10 ▼ │
11┌─────────────┐ │
12│ HALF_OPEN │ ◀─────────────────────────────┘
13│ (半开状态) │
14│ │
15│ 允许部分请求 │
16└─────────────┘

状态详细说明

熔断器状态说明
java
1public class CircuitBreakerStates {
2 /*
3 * 熔断器三种状态详解
4 *
5 * 1. CLOSED(关闭状态)
6 * - 正常状态,所有请求都会正常处理
7 * - 持续监控失败率,当失败次数达到阈值时转换为OPEN状态
8 * - 失败计数器会记录每次失败
9 *
10 * 2. OPEN(开启状态)
11 * - 保护状态,所有请求都会快速失败
12 * - 避免对故障服务的无效调用,节省资源
13 * - 经过一定时间后自动转换为HALF_OPEN状态
14 *
15 * 3. HALF_OPEN(半开状态)
16 * - 试探状态,允许少量请求通过
17 * - 如果这些请求成功,则转换为CLOSED状态
18 * - 如果这些请求失败,则转换回OPEN状态
19 */
20
21 // 关闭状态处理
22 public class ClosedStateHandler {
23 public <T> T handle(Supplier<T> supplier) {
24 try {
25 T result = supplier.get();
26 // 成功,重置失败计数
27 resetFailureCount();
28 return result;
29 } catch (Exception e) {
30 // 失败,增加失败计数
31 incrementFailureCount();
32
33 // 检查是否达到失败阈值
34 if (getFailureCount() >= getFailureThreshold()) {
35 transitionToOpenState();
36 }
37
38 throw e;
39 }
40 }
41 }
42
43 // 开启状态处理
44 public class OpenStateHandler {
45 public <T> T handle(Supplier<T> supplier) {
46 // 直接抛出异常,不调用实际服务
47 throw new CircuitBreakerOpenException("Circuit breaker is open");
48 }
49
50 // 检查是否可以转换为半开状态
51 public boolean canTransitionToHalfOpen() {
52 return System.currentTimeMillis() - getLastFailureTime() > getResetTimeout();
53 }
54 }
55
56 // 半开状态处理
57 public class HalfOpenStateHandler {
58 public <T> T handle(Supplier<T> supplier) {
59 try {
60 T result = supplier.get();
61 // 成功,增加成功计数
62 incrementSuccessCount();
63
64 // 检查是否达到成功阈值
65 if (getSuccessCount() >= getSuccessThreshold()) {
66 transitionToClosedState();
67 }
68
69 return result;
70 } catch (Exception e) {
71 // 失败,立即转换回开启状态
72 transitionToOpenState();
73 throw e;
74 }
75 }
76 }
77}

1.3 熔断器的优势与挑战

核心优势

熔断器优势示例
java
1public class CircuitBreakerAdvantages {
2 /*
3 * 熔断器的核心优势
4 * 1. 快速失败:避免长时间等待故障服务响应
5 * 2. 故障隔离:防止故障在服务间传播
6 * 3. 自动恢复:服务恢复后自动恢复正常调用
7 * 4. 资源保护:避免资源浪费在无效调用上
8 * 5. 系统稳定:提高整体系统的可用性和稳定性
9 */
10
11 // 快速失败示例
12 public class FastFailureExample {
13 private final CircuitBreaker circuitBreaker;
14
15 public FastFailureExample() {
16 this.circuitBreaker = new CircuitBreaker(5, 3, 5000, 30000);
17 }
18
19 public String callService() {
20 return circuitBreaker.execute(() -> {
21 // 模拟调用可能失败的服务
22 if (Math.random() < 0.3) {
23 throw new RuntimeException("Service unavailable");
24 }
25 return "Service response";
26 });
27 }
28
29 // 当服务故障时,熔断器会快速失败,避免长时间等待
30 public void demonstrateFastFailure() {
31 try {
32 String result = callService();
33 System.out.println("Success: " + result);
34 } catch (CircuitBreakerOpenException e) {
35 System.out.println("Fast failure: " + e.getMessage());
36 // 可以立即返回降级结果,而不是等待超时
37 }
38 }
39 }
40
41 // 故障隔离示例
42 public class FaultIsolationExample {
43 private final Map<String, CircuitBreaker> circuitBreakers = new HashMap<>();
44
45 public void callMultipleServices() {
46 // 每个服务都有独立的熔断器
47 String userResult = callServiceWithCircuitBreaker("user-service", this::callUserService);
48 String orderResult = callServiceWithCircuitBreaker("order-service", this::callOrderService);
49 String paymentResult = callServiceWithCircuitBreaker("payment-service", this::callPaymentService);
50
51 // 即使某个服务故障,其他服务仍能正常工作
52 System.out.println("User: " + userResult);
53 System.out.println("Order: " + orderResult);
54 System.out.println("Payment: " + paymentResult);
55 }
56
57 private <T> T callServiceWithCircuitBreaker(String serviceName, Supplier<T> serviceCall) {
58 CircuitBreaker circuitBreaker = circuitBreakers.computeIfAbsent(serviceName,
59 k -> new CircuitBreaker(5, 3, 5000, 30000));
60
61 try {
62 return circuitBreaker.execute(serviceCall);
63 } catch (CircuitBreakerOpenException e) {
64 return getFallbackResult(serviceName);
65 }
66 }
67
68 private String getFallbackResult(String serviceName) {
69 // 返回降级结果
70 return "Fallback result for " + serviceName;
71 }
72 }
73}

主要挑战

熔断器挑战示例
java
1public class CircuitBreakerChallenges {
2 /*
3 * 熔断器面临的主要挑战
4 * 1. 配置复杂:需要合理设置各种阈值参数
5 * 2. 状态管理:需要正确管理熔断器状态转换
6 * 3. 监控困难:需要监控熔断器的状态和性能
7 * 4. 降级策略:需要设计合适的降级策略
8 * 5. 测试困难:难以模拟各种故障场景
9 */
10
11 // 配置复杂性
12 public class ConfigurationComplexity {
13 /*
14 * 熔断器配置参数众多,需要根据实际情况调整
15 * - failureThreshold: 失败阈值,设置过低容易误触发,设置过高响应慢
16 * - successThreshold: 成功阈值,影响恢复速度
17 * - timeout: 超时时间,影响用户体验
18 * - resetTimeout: 重置超时时间,影响故障恢复
19 */
20
21 public CircuitBreaker createOptimizedCircuitBreaker(String serviceName) {
22 // 根据服务特点设置不同的配置
23 switch (serviceName) {
24 case "user-service":
25 // 用户服务,对可用性要求高,设置较低的失败阈值
26 return new CircuitBreaker(3, 2, 3000, 20000);
27 case "order-service":
28 // 订单服务,对一致性要求高,设置较高的失败阈值
29 return new CircuitBreaker(5, 3, 5000, 30000);
30 case "payment-service":
31 // 支付服务,对安全性要求高,设置严格的配置
32 return new CircuitBreaker(2, 1, 2000, 15000);
33 default:
34 return new CircuitBreaker(5, 3, 5000, 30000);
35 }
36 }
37 }
38
39 // 状态管理复杂性
40 public class StateManagementComplexity {
41 /*
42 * 熔断器状态管理需要考虑的问题
43 * 1. 并发安全:多线程环境下的状态转换
44 * 2. 状态持久化:重启后状态恢复
45 * 3. 分布式环境:多实例间的状态同步
46 * 4. 监控告警:状态变化的及时通知
47 */
48
49 // 线程安全的熔断器
50 public class ThreadSafeCircuitBreaker {
51 private volatile CircuitBreakerState state = CircuitBreakerState.CLOSED;
52 private final AtomicInteger failureCount = new AtomicInteger(0);
53 private final AtomicInteger successCount = new AtomicInteger(0);
54 private volatile long lastFailureTime = 0;
55
56 public synchronized void transitionToOpenState() {
57 state = CircuitBreakerState.OPEN;
58 lastFailureTime = System.currentTimeMillis();
59 // 通知监控系统
60 notifyStateChange("OPEN");
61 }
62
63 public synchronized void transitionToClosedState() {
64 state = CircuitBreakerState.CLOSED;
65 failureCount.set(0);
66 // 通知监控系统
67 notifyStateChange("CLOSED");
68 }
69
70 private void notifyStateChange(String newState) {
71 // 发送状态变化通知
72 System.out.println("Circuit breaker state changed to: " + newState);
73 }
74 }
75 }
76}
熔断器设计原则
  1. 快速失败:当依赖服务故障时,应该快速失败而不是等待超时
  2. 自动恢复:当依赖服务恢复时,应该自动恢复正常调用
  3. 降级保护:提供合适的降级策略,保证系统可用性
  4. 监控告警:建立完善的监控体系,及时发现和处理问题
  5. 配置灵活:支持动态配置,适应不同的业务场景

2. Resilience4j 框架详解

2.1 Resilience4j 核心组件

Resilience4j是一个轻量级的容错库,专为Java 8和函数式编程设计。它提供了多种容错模式,包括熔断器、限流器、重试、超时等。

核心组件架构

Resilience4j核心组件
java
1public class Resilience4jComponents {
2 /*
3 * Resilience4j核心组件
4 * 1. CircuitBreaker:熔断器,防止级联故障
5 * 2. RateLimiter:限流器,控制请求速率
6 * 3. Bulkhead:舱壁隔离,限制并发调用
7 * 4. Retry:重试机制,处理临时故障
8 * 5. TimeLimiter:超时控制,避免长时间等待
9 * 6. Cache:缓存机制,提高响应速度
10 */
11
12 // 熔断器配置
13 public class CircuitBreakerConfig {
14 private final int slidingWindowSize; // 滑动窗口大小
15 private final double failureRateThreshold; // 失败率阈值
16 private final Duration waitDurationInOpenState; // 开启状态等待时间
17 private final int permittedNumberOfCallsInHalfOpenState; // 半开状态允许调用数
18 private final boolean automaticTransitionFromOpenToHalfOpenEnabled; // 自动转换
19
20 public CircuitBreakerConfig(int slidingWindowSize, double failureRateThreshold,
21 Duration waitDurationInOpenState,
22 int permittedNumberOfCallsInHalfOpenState) {
23 this.slidingWindowSize = slidingWindowSize;
24 this.failureRateThreshold = failureRateThreshold;
25 this.waitDurationInOpenState = waitDurationInOpenState;
26 this.permittedNumberOfCallsInHalfOpenState = permittedNumberOfCallsInHalfOpenState;
27 this.automaticTransitionFromOpenToHalfOpenEnabled = true;
28 }
29 }
30
31 // 限流器配置
32 public class RateLimiterConfig {
33 private final int limitForPeriod; // 周期内限制
34 private final Duration limitRefreshPeriod; // 限制刷新周期
35 private final Duration timeoutDuration; // 超时时间
36
37 public RateLimiterConfig(int limitForPeriod, Duration limitRefreshPeriod,
38 Duration timeoutDuration) {
39 this.limitForPeriod = limitForPeriod;
40 this.limitRefreshPeriod = limitRefreshPeriod;
41 this.timeoutDuration = timeoutDuration;
42 }
43 }
44
45 // 舱壁隔离配置
46 public class BulkheadConfig {
47 private final int maxConcurrentCalls; // 最大并发调用数
48 private final Duration maxWaitDuration; // 最大等待时间
49
50 public BulkheadConfig(int maxConcurrentCalls, Duration maxWaitDuration) {
51 this.maxConcurrentCalls = maxConcurrentCalls;
52 this.maxWaitDuration = maxWaitDuration;
53 }
54 }
55
56 // 重试配置
57 public class RetryConfig {
58 private final int maxAttempts; // 最大重试次数
59 private final Duration waitDuration; // 等待时间
60 private final List<Class<? extends Throwable>> retryExceptions; // 重试异常类型
61
62 public RetryConfig(int maxAttempts, Duration waitDuration,
63 List<Class<? extends Throwable>> retryExceptions) {
64 this.maxAttempts = maxAttempts;
65 this.waitDuration = waitDuration;
66 this.retryExceptions = retryExceptions;
67 }
68 }
69}

2.2 Resilience4j 使用示例

基础使用示例

Resilience4j基础使用
java
1@Service
2public class ProductService {
3
4 private final CircuitBreaker circuitBreaker;
5 private final RateLimiter rateLimiter;
6 private final Bulkhead bulkhead;
7 private final Retry retry;
8 private final TimeLimiter timeLimiter;
9
10 public ProductService() {
11 // 创建熔断器
12 CircuitBreakerConfig circuitBreakerConfig = CircuitBreakerConfig.custom()
13 .slidingWindowSize(10)
14 .failureRateThreshold(50.0f)
15 .waitDurationInOpenState(Duration.ofSeconds(30))
16 .permittedNumberOfCallsInHalfOpenState(3)
17 .build();
18
19 this.circuitBreaker = CircuitBreaker.of("product-service", circuitBreakerConfig);
20
21 // 创建限流器
22 RateLimiterConfig rateLimiterConfig = RateLimiterConfig.custom()
23 .limitForPeriod(10)
24 .limitRefreshPeriod(Duration.ofSeconds(1))
25 .timeoutDuration(Duration.ofSeconds(5))
26 .build();
27
28 this.rateLimiter = RateLimiter.of("product-service", rateLimiterConfig);
29
30 // 创建舱壁隔离
31 BulkheadConfig bulkheadConfig = BulkheadConfig.custom()
32 .maxConcurrentCalls(20)
33 .maxWaitDuration(Duration.ofSeconds(5))
34 .build();
35
36 this.bulkhead = Bulkhead.of("product-service", bulkheadConfig);
37
38 // 创建重试
39 RetryConfig retryConfig = RetryConfig.custom()
40 .maxAttempts(3)
41 .waitDuration(Duration.ofSeconds(1))
42 .retryExceptions(Arrays.asList(RuntimeException.class))
43 .build();
44
45 this.retry = Retry.of("product-service", retryConfig);
46
47 // 创建超时控制
48 this.timeLimiter = TimeLimiter.of("product-service",
49 TimeLimiterConfig.custom().timeoutDuration(Duration.ofSeconds(5)).build());
50 }
51
52 // 使用熔断器
53 public Product getProduct(String id) {
54 return circuitBreaker.executeSupplier(() -> {
55 // 调用下游服务
56 return callProductService(id);
57 });
58 }
59
60 // 使用限流器
61 public List<Product> searchProducts(String keyword) {
62 return rateLimiter.executeSupplier(() -> {
63 // 搜索商品
64 return searchProductService(keyword);
65 });
66 }
67
68 // 使用舱壁隔离
69 public void updateProduct(String id, Product product) {
70 bulkhead.executeRunnable(() -> {
71 // 更新商品
72 updateProductService(id, product);
73 });
74 }
75
76 // 使用重试
77 public void createProduct(Product product) {
78 retry.executeRunnable(() -> {
79 // 创建商品
80 createProductService(product);
81 });
82 }
83
84 // 使用超时控制
85 public CompletableFuture<Product> getProductAsync(String id) {
86 return timeLimiter.executeCompletionStage(
87 () -> CompletableFuture.supplyAsync(() -> callProductService(id))
88 );
89 }
90
91 // 组合使用多个组件
92 public Product getProductWithAllProtection(String id) {
93 return circuitBreaker.executeSupplier(() ->
94 rateLimiter.executeSupplier(() ->
95 bulkhead.executeSupplier(() ->
96 retry.executeSupplier(() -> callProductService(id))
97 )
98 )
99 );
100 }
101
102 // 模拟服务调用
103 private Product callProductService(String id) {
104 // 模拟网络调用
105 if (Math.random() < 0.3) {
106 throw new RuntimeException("Service unavailable");
107 }
108 return new Product(id, "Product " + id, 100.0);
109 }
110
111 private List<Product> searchProductService(String keyword) {
112 // 模拟搜索服务
113 return Arrays.asList(
114 new Product("1", "Product 1", 100.0),
115 new Product("2", "Product 2", 200.0)
116 );
117 }
118
119 private void updateProductService(String id, Product product) {
120 // 模拟更新服务
121 System.out.println("Updating product: " + id);
122 }
123
124 private void createProductService(Product product) {
125 // 模拟创建服务
126 System.out.println("Creating product: " + product.getName());
127 }
128}

注解方式使用

Resilience4j注解使用
java
1@Service
2public class AnnotatedProductService {
3
4 @CircuitBreaker(name = "product-service", fallbackMethod = "getProductFallback")
5 @RateLimiter(name = "product-service")
6 @Bulkhead(name = "product-service")
7 @Retry(name = "product-service", fallbackMethod = "getProductFallback")
8 @TimeLimiter(name = "product-service")
9 public CompletableFuture<Product> getProduct(String id) {
10 return CompletableFuture.supplyAsync(() -> {
11 // 模拟调用下游服务
12 if (Math.random() < 0.3) {
13 throw new RuntimeException("Service unavailable");
14 }
15 return new Product(id, "Product " + id, 100.0);
16 });
17 }
18
19 // 熔断器降级方法
20 public CompletableFuture<Product> getProductFallback(String id, Throwable ex) {
21 return CompletableFuture.completedFuture(
22 new Product(id, "Fallback Product", 0.0)
23 );
24 }
25
26 // 重试降级方法
27 public Product getProductFallback(String id, RuntimeException ex) {
28 return new Product(id, "Retry Fallback Product", 0.0);
29 }
30
31 // 组合使用多个注解
32 @CircuitBreaker(name = "order-service", fallbackMethod = "createOrderFallback")
33 @RateLimiter(name = "order-service")
34 @Bulkhead(name = "order-service")
35 public Order createOrder(OrderRequest request) {
36 // 创建订单逻辑
37 if (Math.random() < 0.2) {
38 throw new RuntimeException("Order service unavailable");
39 }
40 return new Order(UUID.randomUUID().toString(), request.getProductId(),
41 request.getQuantity(), 100.0);
42 }
43
44 public Order createOrderFallback(OrderRequest request, Throwable ex) {
45 // 降级逻辑:返回默认订单
46 return new Order("fallback-order", request.getProductId(),
47 request.getQuantity(), 0.0);
48 }
49}

2.3 Resilience4j 配置详解

配置文件

application.yml
yaml
1resilience4j:
2 circuitbreaker:
3 instances:
4 product-service:
5 sliding-window-size: 20
6 failure-rate-threshold: 50
7 wait-duration-in-open-state: 30s
8 permitted-number-of-calls-in-half-open-state: 3
9 automatic-transition-from-open-to-half-open-enabled: true
10 record-exceptions:
11 - java.lang.RuntimeException
12 - java.io.IOException
13 ignore-exceptions:
14 - java.lang.IllegalArgumentException
15 sliding-window-type: COUNT_BASED
16 minimum-number-of-calls: 10
17 slow-call-rate-threshold: 100
18 slow-call-duration-threshold: 2s
19
20 order-service:
21 sliding-window-size: 10
22 failure-rate-threshold: 30
23 wait-duration-in-open-state: 60s
24 permitted-number-of-calls-in-half-open-state: 2
25 record-exceptions:
26 - java.lang.RuntimeException
27 sliding-window-type: TIME_BASED
28 minimum-number-of-calls: 5
29 slow-call-rate-threshold: 50
30 slow-call-duration-threshold: 1s
31
32 payment-service:
33 sliding-window-size: 5
34 failure-rate-threshold: 20
35 wait-duration-in-open-state: 120s
36 permitted-number-of-calls-in-half-open-state: 1
37 record-exceptions:
38 - java.lang.RuntimeException
39 sliding-window-type: COUNT_BASED
40 minimum-number-of-calls: 3
41 slow-call-rate-threshold: 30
42 slow-call-duration-threshold: 500ms
43
44 ratelimiter:
45 instances:
46 product-service:
47 limit-for-period: 10
48 limit-refresh-period: 1s
49 timeout-duration: 5s
50 register-health-indicator: true
51
52 order-service:
53 limit-for-period: 5
54 limit-refresh-period: 1s
55 timeout-duration: 10s
56 register-health-indicator: true
57
58 payment-service:
59 limit-for-period: 2
60 limit-refresh-period: 1s
61 timeout-duration: 15s
62 register-health-indicator: true
63
64 bulkhead:
65 instances:
66 product-service:
67 max-concurrent-calls: 20
68 max-wait-duration: 5s
69 register-health-indicator: true
70
71 order-service:
72 max-concurrent-calls: 10
73 max-wait-duration: 10s
74 register-health-indicator: true
75
76 payment-service:
77 max-concurrent-calls: 5
78 max-wait-duration: 15s
79 register-health-indicator: true
80
81 retry:
82 instances:
83 product-service:
84 max-attempts: 3
85 wait-duration: 1s
86 enable-exponential-backoff: true
87 exponential-backoff-multiplier: 2
88 retry-exceptions:
89 - java.lang.RuntimeException
90 - java.io.IOException
91 ignore-exceptions:
92 - java.lang.IllegalArgumentException
93
94 order-service:
95 max-attempts: 2
96 wait-duration: 2s
97 enable-exponential-backoff: true
98 exponential-backoff-multiplier: 1.5
99 retry-exceptions:
100 - java.lang.RuntimeException
101
102 payment-service:
103 max-attempts: 1
104 wait-duration: 5s
105 enable-exponential-backoff: false
106 retry-exceptions:
107 - java.lang.RuntimeException
108
109 timelimiter:
110 instances:
111 product-service:
112 timeout-duration: 5s
113 cancel-running-future: true
114
115 order-service:
116 timeout-duration: 10s
117 cancel-running-future: true
118
119 payment-service:
120 timeout-duration: 15s
121 cancel-running-future: true
122
123# 监控配置
124management:
125 endpoints:
126 web:
127 exposure:
128 include: health,metrics,circuitbreakers,ratelimiters,bulkheads,retries
129 endpoint:
130 health:
131 show-details: always
132 metrics:
133 export:
134 prometheus:
135 enabled: true

配置类

Resilience4j配置类
java
1@Configuration
2public class Resilience4jConfiguration {
3
4 // 熔断器配置
5 @Bean
6 public Customizer<Resilience4JCircuitBreakerFactory> defaultCustomizer() {
7 return factory -> factory.configureDefault(id -> new CircuitBreakerConfig.Builder()
8 .slidingWindowSize(10)
9 .failureRateThreshold(50)
10 .waitDurationInOpenState(Duration.ofSeconds(10))
11 .permittedNumberOfCallsInHalfOpenState(5)
12 .recordExceptions(RuntimeException.class, IOException.class)
13 .ignoreExceptions(IllegalArgumentException.class)
14 .slidingWindowType(CircuitBreakerConfig.SlidingWindowType.COUNT_BASED)
15 .minimumNumberOfCalls(5)
16 .slowCallRateThreshold(100)
17 .slowCallDurationThreshold(Duration.ofSeconds(2))
18 .build());
19 }
20
21 // 限流器配置
22 @Bean
23 public Customizer<Resilience4JRateLimiterFactory> rateLimiterCustomizer() {
24 return factory -> factory.configureDefault(id -> RateLimiterConfig.custom()
25 .limitForPeriod(10)
26 .limitRefreshPeriod(Duration.ofSeconds(1))
27 .timeoutDuration(Duration.ofSeconds(5))
28 .build());
29 }
30
31 // 舱壁隔离配置
32 @Bean
33 public Customizer<Resilience4JBulkheadFactory> bulkheadCustomizer() {
34 return factory -> factory.configureDefault(id -> BulkheadConfig.custom()
35 .maxConcurrentCalls(20)
36 .maxWaitDuration(Duration.ofSeconds(5))
37 .build());
38 }
39
40 // 重试配置
41 @Bean
42 public Customizer<Resilience4JRetryFactory> retryCustomizer() {
43 return factory -> factory.configureDefault(id -> RetryConfig.custom()
44 .maxAttempts(3)
45 .waitDuration(Duration.ofSeconds(1))
46 .enableExponentialBackoff()
47 .exponentialBackoffMultiplier(2)
48 .retryExceptions(RuntimeException.class, IOException.class)
49 .ignoreExceptions(IllegalArgumentException.class)
50 .build());
51 }
52
53 // 超时控制配置
54 @Bean
55 public Customizer<Resilience4JTimeLimiterFactory> timeLimiterCustomizer() {
56 return factory -> factory.configureDefault(id -> TimeLimiterConfig.custom()
57 .timeoutDuration(Duration.ofSeconds(5))
58 .cancelRunningFuture(true)
59 .build());
60 }
61
62 // 事件监听器
63 @Bean
64 public CircuitBreakerRegistryCustomizer circuitBreakerRegistryCustomizer() {
65 return registry -> {
66 registry.getEventPublisher()
67 .onEntryAdded(entryAddedEvent -> {
68 CircuitBreaker addedCircuitBreaker = entryAddedEvent.getAddedEntry();
69 addedCircuitBreaker.getEventPublisher()
70 .onStateTransition(event -> {
71 log.info("Circuit breaker {} state changed from {} to {}",
72 addedCircuitBreaker.getName(),
73 event.getStateTransition().getFromState(),
74 event.getStateTransition().getToState());
75 })
76 .onFailureRateExceeded(event -> {
77 log.warn("Circuit breaker {} failure rate exceeded: {}%",
78 addedCircuitBreaker.getName(),
79 event.getFailureRate());
80 })
81 .onSlowCallRateExceeded(event -> {
82 log.warn("Circuit breaker {} slow call rate exceeded: {}%",
83 addedCircuitBreaker.getName(),
84 event.getSlowCallRate());
85 });
86 });
87 };
88 }
89}
Resilience4j最佳实践
  1. 合理配置:根据服务特点设置合适的阈值和超时时间
  2. 监控告警:建立完善的监控体系,及时发现和处理问题
  3. 降级策略:为每个服务设计合适的降级策略
  4. 测试验证:通过压力测试验证配置的有效性
  5. 文档维护:维护完整的配置文档,便于团队协作

3. 监控观测与最佳实践

3.1 监控观测体系

监控指标收集

监控指标收集示例
java
1public class MonitoringAndObservability {
2
3 // 熔断器监控指标
4 @Component
5 public class CircuitBreakerMetrics {
6
7 private final MeterRegistry meterRegistry;
8 private final Counter totalCalls;
9 private final Counter successfulCalls;
10 private final Counter failedCalls;
11 private final Counter notPermittedCalls;
12 private final Timer callDuration;
13 private final Gauge failureRate;
14 private final Gauge slowCallRate;
15
16 public CircuitBreakerMetrics(MeterRegistry meterRegistry) {
17 this.meterRegistry = meterRegistry;
18 this.totalCalls = Counter.builder("circuitbreaker.calls.total")
19 .description("Total number of calls")
20 .register(meterRegistry);
21 this.successfulCalls = Counter.builder("circuitbreaker.calls.successful")
22 .description("Number of successful calls")
23 .register(meterRegistry);
24 this.failedCalls = Counter.builder("circuitbreaker.calls.failed")
25 .description("Number of failed calls")
26 .register(meterRegistry);
27 this.notPermittedCalls = Counter.builder("circuitbreaker.calls.not_permitted")
28 .description("Number of calls not permitted")
29 .register(meterRegistry);
30 this.callDuration = Timer.builder("circuitbreaker.calls.duration")
31 .description("Call duration")
32 .register(meterRegistry);
33 this.failureRate = Gauge.builder("circuitbreaker.failure_rate")
34 .description("Failure rate percentage")
35 .register(meterRegistry, this, CircuitBreakerMetrics::getFailureRate);
36 this.slowCallRate = Gauge.builder("circuitbreaker.slow_call_rate")
37 .description("Slow call rate percentage")
38 .register(meterRegistry, this, CircuitBreakerMetrics::getSlowCallRate);
39 }
40
41 public void recordCall(String serviceName, boolean success, long duration, boolean permitted) {
42 totalCalls.increment(Tags.of("service", serviceName));
43
44 if (permitted) {
45 if (success) {
46 successfulCalls.increment(Tags.of("service", serviceName));
47 } else {
48 failedCalls.increment(Tags.of("service", serviceName));
49 }
50 callDuration.record(duration, TimeUnit.MILLISECONDS, Tags.of("service", serviceName));
51 } else {
52 notPermittedCalls.increment(Tags.of("service", serviceName));
53 }
54 }
55
56 private double getFailureRate() {
57 // 计算失败率
58 return 0.0; // 实际实现中需要计算
59 }
60
61 private double getSlowCallRate() {
62 // 计算慢调用率
63 return 0.0; // 实际实现中需要计算
64 }
65 }
66
67 // 限流器监控指标
68 @Component
69 public class RateLimiterMetrics {
70
71 private final MeterRegistry meterRegistry;
72 private final Counter permittedCalls;
73 private final Counter refusedCalls;
74 private final Timer waitDuration;
75
76 public RateLimiterMetrics(MeterRegistry meterRegistry) {
77 this.meterRegistry = meterRegistry;
78 this.permittedCalls = Counter.builder("ratelimiter.calls.permitted")
79 .description("Number of permitted calls")
80 .register(meterRegistry);
81 this.refusedCalls = Counter.builder("ratelimiter.calls.refused")
82 .description("Number of refused calls")
83 .register(meterRegistry);
84 this.waitDuration = Timer.builder("ratelimiter.wait.duration")
85 .description("Wait duration for permission")
86 .register(meterRegistry);
87 }
88
89 public void recordCall(String serviceName, boolean permitted, long waitTime) {
90 if (permitted) {
91 permittedCalls.increment(Tags.of("service", serviceName));
92 } else {
93 refusedCalls.increment(Tags.of("service", serviceName));
94 }
95 waitDuration.record(waitTime, TimeUnit.MILLISECONDS, Tags.of("service", serviceName));
96 }
97 }
98
99 // 舱壁隔离监控指标
100 @Component
101 public class BulkheadMetrics {
102
103 private final MeterRegistry meterRegistry;
104 private final Counter permittedCalls;
105 private final Counter rejectedCalls;
106 private final Timer callDuration;
107 private final Gauge availableConcurrentCalls;
108
109 public BulkheadMetrics(MeterRegistry meterRegistry) {
110 this.meterRegistry = meterRegistry;
111 this.permittedCalls = Counter.builder("bulkhead.calls.permitted")
112 .description("Number of permitted calls")
113 .register(meterRegistry);
114 this.rejectedCalls = Counter.builder("bulkhead.calls.rejected")
115 .description("Number of rejected calls")
116 .register(meterRegistry);
117 this.callDuration = Timer.builder("bulkhead.calls.duration")
118 .description("Call duration")
119 .register(meterRegistry);
120 this.availableConcurrentCalls = Gauge.builder("bulkhead.available_concurrent_calls")
121 .description("Available concurrent calls")
122 .register(meterRegistry, this, BulkheadMetrics::getAvailableConcurrentCalls);
123 }
124
125 public void recordCall(String serviceName, boolean permitted, long duration) {
126 if (permitted) {
127 permittedCalls.increment(Tags.of("service", serviceName));
128 callDuration.record(duration, TimeUnit.MILLISECONDS, Tags.of("service", serviceName));
129 } else {
130 rejectedCalls.increment(Tags.of("service", serviceName));
131 }
132 }
133
134 private double getAvailableConcurrentCalls() {
135 // 获取可用并发调用数
136 return 0.0; // 实际实现中需要计算
137 }
138 }
139
140 // 重试监控指标
141 @Component
142 public class RetryMetrics {
143
144 private final MeterRegistry meterRegistry;
145 private final Counter totalCalls;
146 private final Counter successfulCalls;
147 private final Counter failedCalls;
148 private final Counter retryAttempts;
149 private final Timer callDuration;
150
151 public RetryMetrics(MeterRegistry meterRegistry) {
152 this.meterRegistry = meterRegistry;
153 this.totalCalls = Counter.builder("retry.calls.total")
154 .description("Total number of calls")
155 .register(meterRegistry);
156 this.successfulCalls = Counter.builder("retry.calls.successful")
157 .description("Number of successful calls")
158 .register(meterRegistry);
159 this.failedCalls = Counter.builder("retry.calls.failed")
160 .description("Number of failed calls")
161 .register(meterRegistry);
162 this.retryAttempts = Counter.builder("retry.attempts")
163 .description("Number of retry attempts")
164 .register(meterRegistry);
165 this.callDuration = Timer.builder("retry.calls.duration")
166 .description("Call duration")
167 .register(meterRegistry);
168 }
169
170 public void recordCall(String serviceName, boolean success, int attempts, long duration) {
171 totalCalls.increment(Tags.of("service", serviceName));
172
173 if (success) {
174 successfulCalls.increment(Tags.of("service", serviceName));
175 } else {
176 failedCalls.increment(Tags.of("service", serviceName));
177 }
178
179 if (attempts > 1) {
180 retryAttempts.increment(Tags.of("service", serviceName), attempts - 1);
181 }
182
183 callDuration.record(duration, TimeUnit.MILLISECONDS, Tags.of("service", serviceName));
184 }
185 }
186}

Prometheus监控配置

Prometheus监控配置
yaml
1# prometheus.yml
2global:
3 scrape_interval: 15s
4 evaluation_interval: 15s
5
6scrape_configs:
7 - job_name: 'resilience4j-metrics'
8 static_configs:
9 - targets: ['localhost:8080']
10 metrics_path: '/actuator/prometheus'
11 scrape_interval: 5s
12
13# 告警规则
14rule_files:
15 - 'resilience4j-alerts.yml'
resilience4j-alerts.yml
yaml
1groups:
2 - name: resilience4j
3 rules:
4 - alert: CircuitBreakerOpen
5 expr: circuitbreaker_state == 1
6 for: 1m
7 labels:
8 severity: critical
9 annotations:
10 summary: "Circuit breaker is open"
11 description: "Circuit breaker for service {{ $labels.service }} is open"
12
13 - alert: HighFailureRate
14 expr: circuitbreaker_failure_rate > 50
15 for: 2m
16 labels:
17 severity: warning
18 annotations:
19 summary: "High failure rate detected"
20 description: "Failure rate for service {{ $labels.service }} is {{ $value }}%"
21
22 - alert: HighSlowCallRate
23 expr: circuitbreaker_slow_call_rate > 30
24 for: 2m
25 labels:
26 severity: warning
27 annotations:
28 summary: "High slow call rate detected"
29 description: "Slow call rate for service {{ $labels.service }} is {{ $value }}%"
30
31 - alert: RateLimiterRefused
32 expr: rate(ratelimiter_calls_refused_total[5m]) > 10
33 for: 1m
34 labels:
35 severity: warning
36 annotations:
37 summary: "Rate limiter refusing calls"
38 description: "Rate limiter for service {{ $labels.service }} is refusing calls"
39
40 - alert: BulkheadRejected
41 expr: rate(bulkhead_calls_rejected_total[5m]) > 5
42 for: 1m
43 labels:
44 severity: warning
45 annotations:
46 summary: "Bulkhead rejecting calls"
47 description: "Bulkhead for service {{ $labels.service }} is rejecting calls"
48
49 - alert: HighRetryAttempts
50 expr: rate(retry_attempts_total[5m]) > 20
51 for: 2m
52 labels:
53 severity: warning
54 annotations:
55 summary: "High retry attempts detected"
56 description: "Retry attempts for service {{ $labels.service }} is high"

3.2 最佳实践与设计模式

降级策略设计

降级策略设计示例
java
1public class FallbackStrategies {
2
3 // 静态降级:返回预定义的默认值
4 public class StaticFallback {
5
6 @CircuitBreaker(name = "user-service", fallbackMethod = "getUserFallback")
7 public User getUser(String id) {
8 return userService.getUser(id);
9 }
10
11 public User getUserFallback(String id, Throwable ex) {
12 // 返回静态默认用户
13 return new User(id, "Default User", "default@example.com");
14 }
15 }
16
17 // 缓存降级:从缓存中获取数据
18 public class CacheFallback {
19
20 private final Cache<String, User> userCache;
21
22 public CacheFallback() {
23 this.userCache = Caffeine.newBuilder()
24 .maximumSize(1000)
25 .expireAfterWrite(Duration.ofMinutes(30))
26 .build();
27 }
28
29 @CircuitBreaker(name = "user-service", fallbackMethod = "getUserFromCache")
30 public User getUser(String id) {
31 User user = userService.getUser(id);
32 // 更新缓存
33 userCache.put(id, user);
34 return user;
35 }
36
37 public User getUserFromCache(String id, Throwable ex) {
38 // 从缓存中获取用户
39 User cachedUser = userCache.getIfPresent(id);
40 if (cachedUser != null) {
41 return cachedUser;
42 }
43 // 缓存中没有,返回默认用户
44 return new User(id, "Cached User", "cached@example.com");
45 }
46 }
47
48 // 异步降级:异步获取数据
49 public class AsyncFallback {
50
51 @CircuitBreaker(name = "user-service", fallbackMethod = "getUserAsyncFallback")
52 @TimeLimiter(name = "user-service")
53 public CompletableFuture<User> getUserAsync(String id) {
54 return CompletableFuture.supplyAsync(() -> userService.getUser(id));
55 }
56
57 public CompletableFuture<User> getUserAsyncFallback(String id, Throwable ex) {
58 return CompletableFuture.supplyAsync(() -> {
59 // 异步获取备用数据
60 return getBackupUserData(id);
61 });
62 }
63
64 private User getBackupUserData(String id) {
65 // 从备用数据源获取用户信息
66 return new User(id, "Backup User", "backup@example.com");
67 }
68 }
69
70 // 组合降级:多种降级策略组合
71 public class CompositeFallback {
72
73 private final Cache<String, User> userCache;
74 private final UserService backupUserService;
75
76 public CompositeFallback() {
77 this.userCache = Caffeine.newBuilder()
78 .maximumSize(1000)
79 .expireAfterWrite(Duration.ofMinutes(30))
80 .build();
81 this.backupUserService = new BackupUserService();
82 }
83
84 @CircuitBreaker(name = "user-service", fallbackMethod = "getUserCompositeFallback")
85 public User getUser(String id) {
86 User user = userService.getUser(id);
87 userCache.put(id, user);
88 return user;
89 }
90
91 public User getUserCompositeFallback(String id, Throwable ex) {
92 // 策略1:从缓存获取
93 User cachedUser = userCache.getIfPresent(id);
94 if (cachedUser != null) {
95 return cachedUser;
96 }
97
98 // 策略2:从备用服务获取
99 try {
100 User backupUser = backupUserService.getUser(id);
101 if (backupUser != null) {
102 return backupUser;
103 }
104 } catch (Exception e) {
105 log.warn("Backup service also failed for user: {}", id, e);
106 }
107
108 // 策略3:返回静态默认值
109 return new User(id, "Default User", "default@example.com");
110 }
111 }
112
113 // 智能降级:根据异常类型选择不同的降级策略
114 public class IntelligentFallback {
115
116 @CircuitBreaker(name = "user-service", fallbackMethod = "getUserIntelligentFallback")
117 public User getUser(String id) {
118 return userService.getUser(id);
119 }
120
121 public User getUserIntelligentFallback(String id, Throwable ex) {
122 if (ex instanceof TimeoutException) {
123 // 超时异常,从缓存获取
124 return getUserFromCache(id);
125 } else if (ex instanceof ServiceUnavailableException) {
126 // 服务不可用,从备用服务获取
127 return getUserFromBackup(id);
128 } else if (ex instanceof ValidationException) {
129 // 验证异常,返回默认值
130 return getDefaultUser(id);
131 } else {
132 // 其他异常,返回静态默认值
133 return new User(id, "Fallback User", "fallback@example.com");
134 }
135 }
136
137 private User getUserFromCache(String id) {
138 // 从缓存获取用户
139 return new User(id, "Cached User", "cached@example.com");
140 }
141
142 private User getUserFromBackup(String id) {
143 // 从备用服务获取用户
144 return new User(id, "Backup User", "backup@example.com");
145 }
146
147 private User getDefaultUser(String id) {
148 // 返回默认用户
149 return new User(id, "Default User", "default@example.com");
150 }
151 }
152}

配置最佳实践

配置最佳实践示例
java
1public class ConfigurationBestPractices {
2
3 // 服务级别配置
4 public class ServiceLevelConfiguration {
5
6 // 用户服务配置:对可用性要求高
7 public CircuitBreakerConfig getUserServiceConfig() {
8 return CircuitBreakerConfig.custom()
9 .slidingWindowSize(20) // 较大的窗口,更稳定
10 .failureRateThreshold(30.0f) // 较低的失败率阈值
11 .waitDurationInOpenState(Duration.ofSeconds(30)) // 较短的恢复时间
12 .permittedNumberOfCallsInHalfOpenState(5) // 较多的试探调用
13 .recordExceptions(RuntimeException.class, IOException.class)
14 .ignoreExceptions(IllegalArgumentException.class)
15 .slidingWindowType(CircuitBreakerConfig.SlidingWindowType.COUNT_BASED)
16 .minimumNumberOfCalls(10) // 最小调用数
17 .slowCallRateThreshold(50) // 慢调用率阈值
18 .slowCallDurationThreshold(Duration.ofSeconds(1)) // 慢调用时间阈值
19 .build();
20 }
21
22 // 订单服务配置:对一致性要求高
23 public CircuitBreakerConfig getOrderServiceConfig() {
24 return CircuitBreakerConfig.custom()
25 .slidingWindowSize(10) // 较小的窗口,更敏感
26 .failureRateThreshold(20.0f) // 更低的失败率阈值
27 .waitDurationInOpenState(Duration.ofSeconds(60)) // 较长的恢复时间
28 .permittedNumberOfCallsInHalfOpenState(2) // 较少的试探调用
29 .recordExceptions(RuntimeException.class)
30 .slidingWindowType(CircuitBreakerConfig.SlidingWindowType.TIME_BASED)
31 .minimumNumberOfCalls(5) // 最小调用数
32 .slowCallRateThreshold(30) // 慢调用率阈值
33 .slowCallDurationThreshold(Duration.ofSeconds(2)) // 慢调用时间阈值
34 .build();
35 }
36
37 // 支付服务配置:对安全性要求高
38 public CircuitBreakerConfig getPaymentServiceConfig() {
39 return CircuitBreakerConfig.custom()
40 .slidingWindowSize(5) // 最小的窗口,最敏感
41 .failureRateThreshold(10.0f) // 最低的失败率阈值
42 .waitDurationInOpenState(Duration.ofSeconds(120)) // 最长的恢复时间
43 .permittedNumberOfCallsInHalfOpenState(1) // 最少的试探调用
44 .recordExceptions(RuntimeException.class)
45 .slidingWindowType(CircuitBreakerConfig.SlidingWindowType.COUNT_BASED)
46 .minimumNumberOfCalls(3) // 最小调用数
47 .slowCallRateThreshold(20) // 慢调用率阈值
48 .slowCallDurationThreshold(Duration.ofSeconds(500, ChronoUnit.MILLIS)) // 慢调用时间阈值
49 .build();
50 }
51 }
52
53 // 环境级别配置
54 public class EnvironmentLevelConfiguration {
55
56 // 开发环境配置:宽松的配置
57 public CircuitBreakerConfig getDevelopmentConfig() {
58 return CircuitBreakerConfig.custom()
59 .slidingWindowSize(5)
60 .failureRateThreshold(80.0f) // 较高的失败率阈值
61 .waitDurationInOpenState(Duration.ofSeconds(10)) // 较短的恢复时间
62 .permittedNumberOfCallsInHalfOpenState(3)
63 .recordExceptions(RuntimeException.class)
64 .slidingWindowType(CircuitBreakerConfig.SlidingWindowType.COUNT_BASED)
65 .minimumNumberOfCalls(3)
66 .build();
67 }
68
69 // 测试环境配置:中等严格度
70 public CircuitBreakerConfig getTestConfig() {
71 return CircuitBreakerConfig.custom()
72 .slidingWindowSize(10)
73 .failureRateThreshold(50.0f) // 中等的失败率阈值
74 .waitDurationInOpenState(Duration.ofSeconds(30)) // 中等的恢复时间
75 .permittedNumberOfCallsInHalfOpenState(3)
76 .recordExceptions(RuntimeException.class)
77 .slidingWindowType(CircuitBreakerConfig.SlidingWindowType.COUNT_BASED)
78 .minimumNumberOfCalls(5)
79 .build();
80 }
81
82 // 生产环境配置:严格的配置
83 public CircuitBreakerConfig getProductionConfig() {
84 return CircuitBreakerConfig.custom()
85 .slidingWindowSize(20)
86 .failureRateThreshold(30.0f) // 较低的失败率阈值
87 .waitDurationInOpenState(Duration.ofSeconds(60)) // 较长的恢复时间
88 .permittedNumberOfCallsInHalfOpenState(2)
89 .recordExceptions(RuntimeException.class, IOException.class)
90 .slidingWindowType(CircuitBreakerConfig.SlidingWindowType.TIME_BASED)
91 .minimumNumberOfCalls(10)
92 .slowCallRateThreshold(50)
93 .slowCallDurationThreshold(Duration.ofSeconds(1))
94 .build();
95 }
96 }
97
98 // 动态配置
99 public class DynamicConfiguration {
100
101 private final CircuitBreakerRegistry circuitBreakerRegistry;
102
103 public DynamicConfiguration(CircuitBreakerRegistry circuitBreakerRegistry) {
104 this.circuitBreakerRegistry = circuitBreakerRegistry;
105 }
106
107 // 动态更新熔断器配置
108 public void updateCircuitBreakerConfig(String name, CircuitBreakerConfig config) {
109 CircuitBreaker circuitBreaker = circuitBreakerRegistry.circuitBreaker(name);
110 circuitBreaker.updateConfiguration(config);
111 }
112
113 // 根据负载动态调整配置
114 public void adjustConfigByLoad(String serviceName, double currentLoad) {
115 CircuitBreaker circuitBreaker = circuitBreakerRegistry.circuitBreaker(serviceName);
116 CircuitBreakerConfig currentConfig = circuitBreaker.getCircuitBreakerConfig();
117
118 CircuitBreakerConfig newConfig;
119 if (currentLoad > 0.8) {
120 // 高负载,使用更严格的配置
121 newConfig = CircuitBreakerConfig.custom()
122 .slidingWindowSize(currentConfig.getSlidingWindowSize())
123 .failureRateThreshold(currentConfig.getFailureRateThreshold() * 0.8) // 降低失败率阈值
124 .waitDurationInOpenState(currentConfig.getWaitDurationInOpenState().multipliedBy(2)) // 增加恢复时间
125 .permittedNumberOfCallsInHalfOpenState(Math.max(1, currentConfig.getPermittedNumberOfCallsInHalfOpenState() - 1)) // 减少试探调用
126 .build();
127 } else if (currentLoad < 0.3) {
128 // 低负载,使用更宽松的配置
129 newConfig = CircuitBreakerConfig.custom()
130 .slidingWindowSize(currentConfig.getSlidingWindowSize())
131 .failureRateThreshold(currentConfig.getFailureRateThreshold() * 1.2) // 提高失败率阈值
132 .waitDurationInOpenState(currentConfig.getWaitDurationInOpenState().dividedBy(2)) // 减少恢复时间
133 .permittedNumberOfCallsInHalfOpenState(currentConfig.getPermittedNumberOfCallsInHalfOpenState() + 1) // 增加试探调用
134 .build();
135 } else {
136 // 正常负载,保持当前配置
137 return;
138 }
139
140 updateCircuitBreakerConfig(serviceName, newConfig);
141 }
142 }
143}

3.3 常见问题与解决方案

问题诊断与解决

问题诊断与解决方案
java
1public class TroubleshootingGuide {
2
3 // 问题1:熔断器频繁开启
4 public class FrequentCircuitBreakerOpen {
5
6 /*
7 * 问题描述:熔断器频繁在开启和关闭状态之间切换
8 * 可能原因:
9 * 1. 失败率阈值设置过低
10 * 2. 滑动窗口大小设置过小
11 * 3. 最小调用数设置过小
12 * 4. 下游服务确实存在问题
13 */
14
15 public CircuitBreakerConfig fixFrequentOpen() {
16 return CircuitBreakerConfig.custom()
17 .slidingWindowSize(50) // 增加滑动窗口大小
18 .failureRateThreshold(70.0f) // 提高失败率阈值
19 .waitDurationInOpenState(Duration.ofSeconds(120)) // 增加恢复时间
20 .permittedNumberOfCallsInHalfOpenState(5) // 增加试探调用数
21 .minimumNumberOfCalls(20) // 增加最小调用数
22 .slidingWindowType(CircuitBreakerConfig.SlidingWindowType.TIME_BASED) // 使用时间窗口
23 .build();
24 }
25 }
26
27 // 问题2:熔断器恢复过慢
28 public class SlowCircuitBreakerRecovery {
29
30 /*
31 * 问题描述:熔断器开启后恢复时间过长
32 * 可能原因:
33 * 1. 恢复时间设置过长
34 * 2. 试探调用数设置过少
35 * 3. 下游服务恢复较慢
36 */
37
38 public CircuitBreakerConfig fixSlowRecovery() {
39 return CircuitBreakerConfig.custom()
40 .slidingWindowSize(20)
41 .failureRateThreshold(50.0f)
42 .waitDurationInOpenState(Duration.ofSeconds(30)) // 减少恢复时间
43 .permittedNumberOfCallsInHalfOpenState(10) // 增加试探调用数
44 .automaticTransitionFromOpenToHalfOpenEnabled(true) // 启用自动转换
45 .build();
46 }
47 }
48
49 // 问题3:误触发熔断器
50 public class FalsePositiveCircuitBreaker {
51
52 /*
53 * 问题描述:熔断器在服务正常时被误触发
54 * 可能原因:
55 * 1. 异常类型配置不当
56 * 2. 慢调用阈值设置过低
57 * 3. 网络抖动导致临时故障
58 */
59
60 public CircuitBreakerConfig fixFalsePositive() {
61 return CircuitBreakerConfig.custom()
62 .slidingWindowSize(30)
63 .failureRateThreshold(50.0f)
64 .waitDurationInOpenState(Duration.ofSeconds(60))
65 .permittedNumberOfCallsInHalfOpenState(5)
66 .recordExceptions(RuntimeException.class) // 只记录运行时异常
67 .ignoreExceptions(IllegalArgumentException.class, // 忽略参数异常
68 ValidationException.class) // 忽略验证异常
69 .slowCallRateThreshold(100) // 提高慢调用率阈值
70 .slowCallDurationThreshold(Duration.ofSeconds(3)) // 提高慢调用时间阈值
71 .build();
72 }
73 }
74
75 // 问题4:降级策略不当
76 public class InappropriateFallbackStrategy {
77
78 /*
79 * 问题描述:降级策略导致用户体验差或业务逻辑错误
80 * 可能原因:
81 * 1. 降级数据不准确
82 * 2. 降级逻辑过于简单
83 * 3. 没有考虑业务场景
84 */
85
86 public class ImprovedFallbackStrategy {
87
88 @CircuitBreaker(name = "user-service", fallbackMethod = "getUserWithContext")
89 public User getUser(String id, String context) {
90 return userService.getUser(id);
91 }
92
93 public User getUserWithContext(String id, String context, Throwable ex) {
94 // 根据上下文选择不同的降级策略
95 if ("critical".equals(context)) {
96 // 关键业务,尝试从备用服务获取
97 return getUserFromBackupService(id);
98 } else if ("normal".equals(context)) {
99 // 普通业务,从缓存获取
100 return getUserFromCache(id);
101 } else {
102 // 非关键业务,返回默认值
103 return getDefaultUser(id);
104 }
105 }
106
107 private User getUserFromBackupService(String id) {
108 try {
109 return backupUserService.getUser(id);
110 } catch (Exception e) {
111 log.warn("Backup service also failed for user: {}", id, e);
112 return getDefaultUser(id);
113 }
114 }
115
116 private User getUserFromCache(String id) {
117 User cachedUser = userCache.getIfPresent(id);
118 return cachedUser != null ? cachedUser : getDefaultUser(id);
119 }
120
121 private User getDefaultUser(String id) {
122 return new User(id, "Default User", "default@example.com");
123 }
124 }
125 }
126}
熔断器使用注意事项
  1. 合理配置:根据服务特点和业务需求设置合适的参数
  2. 监控告警:建立完善的监控体系,及时发现和处理问题
  3. 降级策略:设计合适的降级策略,保证系统可用性
  4. 测试验证:通过压力测试验证配置的有效性
  5. 文档维护:维护完整的配置文档,便于团队协作
  6. 定期评估:定期评估和调整配置参数

4. 面试题精选

4.1 基础概念题

Q1: 什么是熔断器模式?它的核心思想是什么?

: 熔断器模式是一种容错设计模式,用于防止级联故障。其核心思想包括:

  1. 监控状态:持续监控依赖服务的调用结果
  2. 快速失败:当服务异常时,快速失败而不是等待超时
  3. 自动恢复:当服务恢复时,自动恢复正常调用
  4. 降级保护:提供降级策略,保证系统可用性

核心组件

  • CLOSED状态:正常状态,所有请求都会正常处理
  • OPEN状态:保护状态,所有请求都会快速失败
  • HALF_OPEN状态:试探状态,允许少量请求通过

Q2: 熔断器的三种状态是如何转换的?

: 熔断器状态转换规则如下:

1CLOSED → OPEN: 失败次数达到阈值
2OPEN → HALF_OPEN: 经过恢复时间后自动转换
3HALF_OPEN → CLOSED: 成功次数达到阈值
4HALF_OPEN → OPEN: 任何一次失败

详细转换条件

  • CLOSED → OPEN:在滑动窗口内,失败率超过阈值
  • OPEN → HALF_OPEN:经过waitDurationInOpenState时间后
  • HALF_OPEN → CLOSED:在permittedNumberOfCallsInHalfOpenState次调用中,成功次数达到阈值
  • HALF_OPEN → OPEN:在试探期间任何一次调用失败

Q3: Resilience4j与Hystrix的区别是什么?

: 主要区别如下:

Resilience4j

  • 基于Java 8和函数式编程设计
  • 轻量级,无外部依赖
  • 支持多种容错模式(熔断器、限流器、舱壁隔离、重试、超时)
  • 提供丰富的监控指标
  • 活跃维护,持续更新

Hystrix

  • 基于RxJava设计
  • 较重,有外部依赖
  • 主要关注熔断器模式
  • 监控功能相对简单
  • 已进入维护模式,不再积极开发

4.2 实践题

Q4: 如何设计一个合适的降级策略?

: 降级策略设计需要考虑以下几个方面:

  1. 静态降级:返回预定义的默认值
java
1public User getUserFallback(String id, Throwable ex) {
2 return new User(id, "Default User", "default@example.com");
3}
  1. 缓存降级:从缓存中获取数据
java
1public User getUserFromCache(String id, Throwable ex) {
2 User cachedUser = userCache.getIfPresent(id);
3 return cachedUser != null ? cachedUser : getDefaultUser(id);
4}
  1. 备用服务降级:从备用服务获取数据
java
1public User getUserFromBackup(String id, Throwable ex) {
2 try {
3 return backupUserService.getUser(id);
4 } catch (Exception e) {
5 return getDefaultUser(id);
6 }
7}
  1. 智能降级:根据异常类型选择不同策略
java
1public User getUserIntelligentFallback(String id, Throwable ex) {
2 if (ex instanceof TimeoutException) {
3 return getUserFromCache(id);
4 } else if (ex instanceof ServiceUnavailableException) {
5 return getUserFromBackup(id);
6 } else {
7 return getDefaultUser(id);
8 }
9}

Q5: 如何配置熔断器参数?

: 熔断器参数配置需要考虑服务特点:

java
1// 用户服务:对可用性要求高
2CircuitBreakerConfig userServiceConfig = CircuitBreakerConfig.custom()
3 .slidingWindowSize(20) // 较大的窗口,更稳定
4 .failureRateThreshold(30.0f) // 较低的失败率阈值
5 .waitDurationInOpenState(Duration.ofSeconds(30)) // 较短的恢复时间
6 .permittedNumberOfCallsInHalfOpenState(5) // 较多的试探调用
7 .build();
8
9// 支付服务:对安全性要求高
10CircuitBreakerConfig paymentServiceConfig = CircuitBreakerConfig.custom()
11 .slidingWindowSize(5) // 较小的窗口,更敏感
12 .failureRateThreshold(10.0f) // 较低的失败率阈值
13 .waitDurationInOpenState(Duration.ofSeconds(120)) // 较长的恢复时间
14 .permittedNumberOfCallsInHalfOpenState(1) // 较少的试探调用
15 .build();

4.3 性能优化题

Q6: 如何优化熔断器的性能?

: 熔断器性能优化可以从以下几个方面考虑:

  1. 合理配置参数
java
1CircuitBreakerConfig optimizedConfig = CircuitBreakerConfig.custom()
2 .slidingWindowSize(50) // 增加窗口大小,减少状态切换
3 .failureRateThreshold(50.0f) // 设置合理的失败率阈值
4 .waitDurationInOpenState(Duration.ofSeconds(60)) // 设置合理的恢复时间
5 .permittedNumberOfCallsInHalfOpenState(3) // 设置合理的试探调用数
6 .slidingWindowType(CircuitBreakerConfig.SlidingWindowType.TIME_BASED) // 使用时间窗口
7 .build();
  1. 使用缓存降级
java
1@CircuitBreaker(name = "user-service", fallbackMethod = "getUserFromCache")
2public User getUser(String id) {
3 User user = userService.getUser(id);
4 userCache.put(id, user); // 更新缓存
5 return user;
6}
7
8public User getUserFromCache(String id, Throwable ex) {
9 return userCache.getIfPresent(id);
10}
  1. 异步处理
java
1@CircuitBreaker(name = "user-service", fallbackMethod = "getUserAsyncFallback")
2@TimeLimiter(name = "user-service")
3public CompletableFuture<User> getUserAsync(String id) {
4 return CompletableFuture.supplyAsync(() -> userService.getUser(id));
5}

Q7: 如何处理熔断器的误触发问题?

: 误触发问题可以通过以下方式解决:

  1. 调整配置参数
java
1CircuitBreakerConfig antiFalsePositiveConfig = CircuitBreakerConfig.custom()
2 .slidingWindowSize(30) // 增加窗口大小
3 .failureRateThreshold(70.0f) // 提高失败率阈值
4 .minimumNumberOfCalls(20) // 增加最小调用数
5 .recordExceptions(RuntimeException.class) // 只记录特定异常
6 .ignoreExceptions(IllegalArgumentException.class) // 忽略参数异常
7 .slowCallRateThreshold(100) // 提高慢调用率阈值
8 .slowCallDurationThreshold(Duration.ofSeconds(3)) // 提高慢调用时间阈值
9 .build();
  1. 使用重试机制
java
1@Retry(name = "user-service", fallbackMethod = "getUserFallback")
2@CircuitBreaker(name = "user-service", fallbackMethod = "getUserFallback")
3public User getUser(String id) {
4 return userService.getUser(id);
5}
  1. 监控和告警
java
1@Bean
2public CircuitBreakerRegistryCustomizer circuitBreakerRegistryCustomizer() {
3 return registry -> {
4 registry.getEventPublisher()
5 .onEntryAdded(entryAddedEvent -> {
6 CircuitBreaker circuitBreaker = entryAddedEvent.getAddedEntry();
7 circuitBreaker.getEventPublisher()
8 .onStateTransition(event -> {
9 log.warn("Circuit breaker {} state changed from {} to {}",
10 circuitBreaker.getName(),
11 event.getStateTransition().getFromState(),
12 event.getStateTransition().getToState());
13 });
14 });
15 };
16}

4.4 架构设计题

Q8: 如何设计一个高可用的熔断器系统?

: 高可用熔断器系统设计包括以下几个方面:

  1. 多级熔断器
java
1public class MultiLevelCircuitBreaker {
2
3 @CircuitBreaker(name = "user-service", fallbackMethod = "getUserFromCache")
4 public User getUser(String id) {
5 return userService.getUser(id);
6 }
7
8 @CircuitBreaker(name = "cache-service", fallbackMethod = "getUserFromBackup")
9 public User getUserFromCache(String id, Throwable ex) {
10 return cacheService.getUser(id);
11 }
12
13 @CircuitBreaker(name = "backup-service", fallbackMethod = "getDefaultUser")
14 public User getUserFromBackup(String id, Throwable ex) {
15 return backupService.getUser(id);
16 }
17
18 public User getDefaultUser(String id, Throwable ex) {
19 return new User(id, "Default User", "default@example.com");
20 }
21}
  1. 分布式熔断器
java
1@Component
2public class DistributedCircuitBreaker {
3
4 private final RedisTemplate<String, String> redisTemplate;
5
6 public boolean isCircuitBreakerOpen(String serviceName) {
7 String key = "circuitbreaker:" + serviceName + ":state";
8 String state = redisTemplate.opsForValue().get(key);
9 return "OPEN".equals(state);
10 }
11
12 public void setCircuitBreakerState(String serviceName, String state) {
13 String key = "circuitbreaker:" + serviceName + ":state";
14 redisTemplate.opsForValue().set(key, state, Duration.ofMinutes(10));
15 }
16}
  1. 动态配置
java
1@Component
2public class DynamicCircuitBreakerConfig {
3
4 private final CircuitBreakerRegistry circuitBreakerRegistry;
5
6 public void updateConfig(String serviceName, CircuitBreakerConfig config) {
7 CircuitBreaker circuitBreaker = circuitBreakerRegistry.circuitBreaker(serviceName);
8 circuitBreaker.updateConfiguration(config);
9 }
10
11 public void adjustConfigByLoad(String serviceName, double currentLoad) {
12 // 根据负载动态调整配置
13 if (currentLoad > 0.8) {
14 // 高负载,使用更严格的配置
15 updateConfig(serviceName, getStrictConfig());
16 } else if (currentLoad < 0.3) {
17 // 低负载,使用更宽松的配置
18 updateConfig(serviceName, getLooseConfig());
19 }
20 }
21}

Q9: 熔断器与限流器的区别是什么?

: 主要区别如下:

熔断器(Circuit Breaker)

  • 目的:防止级联故障,保护系统免受故障服务影响
  • 触发条件:基于失败率、响应时间等指标
  • 状态:有CLOSED、OPEN、HALF_OPEN三种状态
  • 恢复机制:自动恢复,支持试探性调用
  • 适用场景:保护系统免受故障服务影响

限流器(Rate Limiter)

  • 目的:控制请求速率,防止系统过载
  • 触发条件:基于请求数量、时间窗口
  • 状态:只有允许/拒绝两种状态
  • 恢复机制:基于时间窗口自动恢复
  • 适用场景:防止系统过载,保护下游服务

组合使用

java
1@CircuitBreaker(name = "user-service", fallbackMethod = "getUserFallback")
2@RateLimiter(name = "user-service")
3@Bulkhead(name = "user-service")
4public User getUser(String id) {
5 return userService.getUser(id);
6}

4.5 监控运维题

Q10: 如何监控熔断器的健康状况?

: 熔断器监控可以从以下几个方面进行:

  1. 关键指标监控
java
1@Component
2public class CircuitBreakerHealthCheck {
3
4 private final CircuitBreakerRegistry circuitBreakerRegistry;
5 private final MeterRegistry meterRegistry;
6
7 public Health checkCircuitBreakerHealth() {
8 Map<String, Object> details = new HashMap<>();
9
10 for (CircuitBreaker circuitBreaker : circuitBreakerRegistry.getAllCircuitBreakers()) {
11 CircuitBreaker.Metrics metrics = circuitBreaker.getMetrics();
12 details.put(circuitBreaker.getName() + ".state", circuitBreaker.getState());
13 details.put(circuitBreaker.getName() + ".failureRate", metrics.getFailureRate());
14 details.put(circuitBreaker.getName() + ".slowCallRate", metrics.getSlowCallRate());
15 }
16
17 boolean isHealthy = details.values().stream()
18 .noneMatch(value -> "OPEN".equals(value) ||
19 (value instanceof Number && ((Number) value).doubleValue() > 50));
20
21 return isHealthy ? Health.up().withDetails(details).build()
22 : Health.down().withDetails(details).build();
23 }
24}
  1. 告警规则配置
yaml
1# prometheus告警规则
2groups:
3 - name: circuitbreaker
4 rules:
5 - alert: CircuitBreakerOpen
6 expr: circuitbreaker_state == 1
7 for: 1m
8 labels:
9 severity: critical
10 annotations:
11 summary: "Circuit breaker is open"
12 description: "Circuit breaker for service {{ $labels.service }} is open"
13
14 - alert: HighFailureRate
15 expr: circuitbreaker_failure_rate > 50
16 for: 2m
17 labels:
18 severity: warning
19 annotations:
20 summary: "High failure rate detected"
21 description: "Failure rate for service {{ $labels.service }} is {{ $value }}%"
  1. 日志监控
java
1@Component
2public class CircuitBreakerLoggingFilter {
3
4 private static final Logger logger = LoggerFactory.getLogger(CircuitBreakerLoggingFilter.class);
5
6 @EventListener
7 public void onCircuitBreakerEvent(CircuitBreakerOnStateTransitionEvent event) {
8 logger.warn("Circuit breaker {} state changed from {} to {}",
9 event.getCircuitBreakerName(),
10 event.getStateTransition().getFromState(),
11 event.getStateTransition().getToState());
12 }
13
14 @EventListener
15 public void onFailureRateExceeded(CircuitBreakerOnFailureRateExceededEvent event) {
16 logger.error("Circuit breaker {} failure rate exceeded: {}%",
17 event.getCircuitBreakerName(),
18 event.getFailureRate());
19 }
20}
熔断器学习要点
  1. 理解原理:掌握熔断器的核心思想和状态转换机制
  2. 掌握配置:熟悉各种配置参数的含义和影响
  3. 实践应用:通过实际项目练习熔断器的使用
  4. 监控运维:学会熔断器的监控和运维管理
  5. 性能优化:了解熔断器性能优化的方法
  6. 最佳实践:掌握熔断器的最佳实践和设计模式

通过本章的学习,你应该已经深入理解了熔断器模式的核心概念、实现方案和最佳实践。熔断器是微服务架构中的重要容错机制,合理使用熔断器可以显著提高系统的可用性和稳定性。在实际项目中,要根据业务需求选择合适的熔断器方案,并注重监控运维和性能优化。

评论