Kotlin 重试(Retry)机制详解
重试机制是处理临时性失败(如网络波动、服务暂时不可用)的重要策略。Kotlin 标准库本身没有内置通用的重试工具,但借助协程(kotlinx.coroutines)可以很方便地手写重试逻辑,Flow 还直接提供了 retry / retryWhen 操作符。
一、基础重试实现
1. 基本的 try-catch 重试
/**
 * Runs [block] up to [maxRetries] times, waiting a fixed one second between attempts.
 *
 * Coroutine cancellation is never treated as a retryable failure: a
 * `CancellationException` raised by [block] is rethrown immediately. This includes
 * timeouts raised by `withTimeout` inside [block]; use `withTimeoutOrNull` there if
 * a timeout should count as an ordinary failure.
 *
 * @param maxRetries total number of attempts (not additional retries); must be at least 1.
 * @param block the suspending operation to attempt.
 * @return the value produced by the first successful attempt.
 * @throws IllegalArgumentException if [maxRetries] is less than 1.
 * @throws Exception the exception of the last attempt once every attempt has failed.
 */
suspend fun <T> simpleRetry(
    maxRetries: Int = 3,
    block: suspend () -> T
): T {
    // Without this check a non-positive value would skip the loop entirely and
    // surface as a generic "retry failed" error although nothing was ever attempted.
    require(maxRetries >= 1) { "maxRetries must be at least 1, was $maxRetries" }

    var lastException: Throwable? = null
    for (attempt in 1..maxRetries) {
        try {
            return block()
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation is a control signal, not a failure of the operation.
            throw e
        } catch (e: Exception) {
            lastException = e
            println("尝试 $attempt 失败: ${e.message}")
            if (attempt < maxRetries) {
                delay(1000) // simple fixed delay; skipped after the final attempt
            }
        }
    }
    // Only reachable after the last attempt failed, so lastException is set.
    throw lastException ?: RuntimeException("重试失败")
}
// Usage example
/** Demonstrates [simpleRetry] with an operation whose outcome is decided by a coin flip. */
suspend fun fetchData(): String =
    simpleRetry(maxRetries = 3) {
        // Simulate an operation that may fail.
        val failed = Random.nextBoolean()
        if (failed) throw IOException("网络错误")
        "数据内容"
    }
2. 带指数退避的重试
/**
 * Runs [block] up to [maxRetries] times with exponential backoff between attempts.
 *
 * The wait before retry `n` is `initialDelayMillis * factor^(n-1)`, never exceeding
 * [maxDelayMillis]. No wait happens after the final attempt.
 *
 * Coroutine cancellation is never treated as a retryable failure: a
 * `CancellationException` raised by [block] is rethrown immediately.
 *
 * @param maxRetries total number of attempts (not additional retries); must be at least 1.
 * @param initialDelayMillis wait before the first retry.
 * @param maxDelayMillis upper bound applied to every wait, including the first one.
 * @param factor multiplier applied to the wait after each failed attempt.
 * @param block the suspending operation to attempt.
 * @return the value produced by the first successful attempt.
 * @throws IllegalArgumentException if [maxRetries] is less than 1.
 * @throws Exception the exception of the last attempt once every attempt has failed.
 */
suspend fun <T> retryWithBackoff(
    maxRetries: Int = 3,
    initialDelayMillis: Long = 100,
    maxDelayMillis: Long = 10000,
    factor: Double = 2.0,
    block: suspend () -> T
): T {
    require(maxRetries >= 1) { "maxRetries must be at least 1, was $maxRetries" }

    // Cap the very first wait too; previously only subsequent waits were capped.
    var currentDelay = initialDelayMillis.coerceAtMost(maxDelayMillis)
    var lastException: Throwable? = null
    for (attempt in 1..maxRetries) {
        try {
            return block()
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation is a control signal, not a failure of the operation.
            throw e
        } catch (e: Exception) {
            lastException = e
            println("第 $attempt 次尝试失败: ${e.message}")
            if (attempt < maxRetries) {
                // Exponential backoff: wait, then grow the next wait up to the cap.
                delay(currentDelay)
                currentDelay = (currentDelay * factor).toLong()
                    .coerceAtMost(maxDelayMillis)
            }
        }
    }
    // Only reachable after the last attempt failed, so lastException is set.
    throw lastException ?: RuntimeException("重试失败")
}
二、Flow 中的重试
1. Flow 的 retry 操作符
import kotlinx.coroutines.flow.*
/**
 * Cold flow that emits up to five chunks, pausing 500 ms before each one.
 * Each step fails with an [IOException] with 30% probability.
 */
fun fetchDataFlow(): Flow<String> = flow {
    // Simulate an unstable data source.
    (1..5).forEach { chunk ->
        delay(500)
        val failed = Random.nextDouble() < 0.3 // 30% failure rate
        if (failed) throw IOException("第 $chunk 次获取失败")
        emit("数据块 $chunk")
    }
}
/**
 * Demo entry point showing the built-in Flow retry operators:
 * first `retry` with a predicate, then `retryWhen` with a linear backoff.
 */
fun main() = runBlocking {
    // Basic retry: at most 3 retries, and only for IOException.
    fetchDataFlow()
        .retry(3) { cause ->
            println("重试原因: ${cause.message}")
            cause is IOException // retry only on IOException
        }
        .catch { e -> emit("最终失败: ${e.message}") }
        .collect { println(it) }

    println("\n=== 带延迟的重试 ===")

    // retryWhen gives finer control over when and how to retry.
    fetchDataFlow()
        .retryWhen { cause, attempt ->
            if (cause is IOException && attempt < 3) {
                // `attempt` is zero-based (0 for the first retry), so shift it by one;
                // otherwise the first retry would wait 0 ms and be logged as retry 0.
                val retryNumber = attempt + 1
                delay(1000 * retryNumber) // linear backoff: 1s, 2s, 3s
                println("第 $retryNumber 次重试")
                true
            } else {
                false
            }
        }
        .catch { e -> emit("失败: ${e.message}") }
        .collect { println(it) }
}
2. 带退避策略的 Flow 重试
import kotlinx.coroutines.flow.*
/**
 * Adds exponential-backoff retry to a Flow.
 *
 * When the upstream flow fails, it is re-collected from the start after waiting
 * `initialDelayMillis * factor^n` milliseconds (n = 0 for the first retry),
 * capped at [maxDelayMillis].
 *
 * @param maxRetries maximum number of retries after the initial collection.
 * @param initialDelayMillis wait before the first retry.
 * @param maxDelayMillis upper bound for any single wait.
 * @param factor multiplier applied to the wait for each further retry.
 * @param shouldRetry predicate deciding whether a given failure is retryable.
 * @return a flow that retries the receiver according to the policy above.
 */
fun <T> Flow<T>.retryExponentialBackoff(
    maxRetries: Int,
    initialDelayMillis: Long = 1000,
    maxDelayMillis: Long = 60000,
    factor: Double = 2.0,
    shouldRetry: (Throwable) -> Boolean = { true }
): Flow<T> = retryWhen { cause, attempt ->
    // retryWhen passes a zero-based attempt index. Comparing with `<=` and using
    // `attempt - 1` as the exponent allowed maxRetries + 1 retries and made the
    // first wait initialDelayMillis / factor.
    if (attempt < maxRetries && shouldRetry(cause)) {
        val retryNumber = attempt + 1
        // Exponential backoff: initial, initial * factor, initial * factor^2, ...
        val delayMillis = (initialDelayMillis * Math.pow(factor, attempt.toDouble()))
            .toLong()
            .coerceAtMost(maxDelayMillis)
        println("第 $retryNumber 次重试,等待 ${delayMillis}ms")
        delay(delayMillis)
        true
    } else {
        false
    }
}
// Usage example
/**
 * Emits 1, 2, ... (up to 10) with a 100 ms pause after each value and throws a
 * [RuntimeException] as soon as the counter reaches a multiple of 3.
 */
fun unstableFlow(): Flow<Int> = flow {
    var value = 1
    while (value <= 10) {
        val isFailingStep = value % 3 == 0 // every third value fails
        if (isFailingStep) throw RuntimeException("第 $value 次失败")
        emit(value)
        delay(100)
        value++
    }
}
/** Demo entry point: collects [unstableFlow] with exponential-backoff retry and prints every value. */
fun main() = runBlocking {
    val resilientFlow = unstableFlow().retryExponentialBackoff(
        maxRetries = 3,
        initialDelayMillis = 500,
        maxDelayMillis = 5000
    )
    resilientFlow
        .catch { error -> println("最终失败: ${error.message}") }
        .collect { value -> println("收到: $value") }
}
三、实际应用场景
1. 网络请求重试
import retrofit2.HttpException
import java.io.IOException
import java.net.SocketTimeoutException
/**
 * Repository that wraps calls on [apiService] in a retry loop with
 * exponential backoff and random jitter.
 */
class NetworkRepository(
    private val apiService: ApiService
) {
    /**
     * Runs [block] up to [maxRetries] times, retrying only failures accepted by [retryOn].
     *
     * @param maxRetries total number of attempts; must be at least 1.
     * @param retryOn predicate deciding whether a failure may be retried.
     * @param block the suspending network call.
     * @return the value of the first successful attempt.
     * @throws IllegalArgumentException if [maxRetries] is less than 1.
     * @throws Exception the last failure when attempts are exhausted or the failure is not retryable.
     */
    suspend fun <T> executeWithRetry(
        maxRetries: Int = 3,
        retryOn: (Throwable) -> Boolean = { it.shouldRetry() },
        block: suspend () -> T
    ): T {
        require(maxRetries >= 1) { "maxRetries must be at least 1, was $maxRetries" }

        var lastException: Throwable? = null
        for (attempt in 1..maxRetries) {
            try {
                return block()
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Never hand cancellation to a caller-supplied predicate that might retry it.
                throw e
            } catch (e: Exception) {
                lastException = e
                if (attempt < maxRetries && retryOn(e)) {
                    // Compute the backoff delay for this attempt.
                    val delayMillis = calculateBackoffDelay(attempt)
                    println("网络请求失败,${delayMillis}ms后重试 (尝试 $attempt)")
                    delay(delayMillis)
                } else {
                    break
                }
            }
        }
        throw lastException ?: RuntimeException("请求失败")
    }

    /**
     * Exponential backoff (1s, 2s, 4s, ...) plus up to one second of random jitter,
     * capped at 30 seconds.
     */
    private fun calculateBackoffDelay(attempt: Int): Long {
        // Shift a Long and clamp the exponent: `1 shl (attempt - 1)` on Int wraps for
        // attempt >= 32 and produced negative delays. Anything beyond 2^20 seconds is
        // far above the 30s cap anyway.
        val exponent = (attempt - 1).coerceIn(0, 20)
        val baseDelay = 1000L shl exponent
        val jitter = (Math.random() * 1000).toLong() // 0-1s jitter to avoid thundering herd
        return (baseDelay + jitter).coerceAtMost(30000L) // at most 30s
    }

    /** Fetches user data, retrying failures accepted by [shouldRetry] up to three attempts. */
    suspend fun getUserData(userId: String): UserData {
        return executeWithRetry(
            maxRetries = 3,
            retryOn = { it.shouldRetry() }
        ) {
            apiService.getUser(userId)
        }
    }

    /** Uploads [file], attempting up to [maxRetries] times. */
    suspend fun uploadFile(file: File, maxRetries: Int = 5): UploadResult {
        return executeWithRetry(maxRetries = maxRetries) {
            apiService.uploadFile(file)
        }
    }
}
// Exception extension: decides whether a failure is worth retrying
/**
 * Returns `true` for failures that are typically transient: any [IOException]
 * (which includes socket timeouts) and HTTP errors with status
 * 408, 429, 500, 502, 503 or 504.
 */
fun Throwable.shouldRetry(): Boolean {
    return when (this) {
        // SocketTimeoutException is a subclass of IOException, so a separate branch
        // for it after this one could never be reached; this branch covers timeouts.
        is IOException -> true
        is HttpException -> this.code() in setOf(408, 429, 500, 502, 503, 504) // selected HTTP status codes
        else -> false
    }
}
2. 数据库操作重试
import java.sql.SQLException
import java.sql.SQLTransientException
/**
 * Repository that runs work inside a database transaction and retries it when
 * the failure looks transient (see [shouldRetryDatabase]).
 */
class DatabaseRepository(
    private val dataSource: DataSource
) {
    /**
     * Runs [block] inside a transaction, attempting up to [maxRetries] times.
     *
     * Each attempt obtains a fresh connection, disables auto-commit, runs [block],
     * and commits; on failure the transaction is rolled back before the retry decision.
     *
     * NOTE(review): [block] is not handed the connection, so it is not visible here
     * how its statements take part in this transaction — confirm against callers.
     *
     * @param maxRetries total number of attempts.
     * @param block the transactional work.
     * @return the value returned by [block] on the first attempt that commits.
     * @throws Exception the last failure when attempts are exhausted or the failure is not retryable.
     */
    suspend fun <T> executeTransactionWithRetry(
        maxRetries: Int = 3,
        block: suspend () -> T
    ): T {
        var lastException: Throwable? = null
        for (attempt in 1..maxRetries) {
            try {
                return dataSource.connection.use { connection ->
                    connection.autoCommit = false
                    try {
                        val result = block()
                        connection.commit()
                        result
                    } catch (e: Exception) {
                        // A failing rollback (e.g. on a broken connection) must not hide the
                        // original error, which is what drives the retry decision below.
                        try {
                            connection.rollback()
                        } catch (rollbackError: Exception) {
                            e.addSuppressed(rollbackError)
                        }
                        throw e
                    }
                }
            } catch (e: Exception) {
                lastException = e
                if (attempt < maxRetries && e.shouldRetryDatabase()) {
                    // Database-specific backoff policy.
                    val delayMillis = calculateDatabaseRetryDelay(attempt, e)
                    println("数据库操作失败,${delayMillis}ms后重试")
                    delay(delayMillis)
                    // Refresh the connection pool if the connection itself was the problem.
                    if (e.isConnectionError()) {
                        dataSource.refreshConnection()
                    }
                } else {
                    break
                }
            }
        }
        throw lastException ?: RuntimeException("数据库操作失败")
    }

    /**
     * Picks the wait before the next attempt based on the kind of failure,
     * capped at 10 seconds.
     */
    private fun calculateDatabaseRetryDelay(attempt: Int, exception: Exception): Long {
        return when {
            exception.isDeadlock() -> 100L // deadlock: retry quickly
            exception.isTimeout() -> 1000L * attempt // timeout: linear backoff
            // Other errors: exponential backoff. The exponent is clamped and the shift is
            // done on a Long because `1 shl (attempt - 1)` on Int wraps for attempt >= 32.
            else -> 500L shl (attempt - 1).coerceIn(0, 20)
        }.coerceAtMost(10000L)
    }

    /** Inserts [entities] inside one retried transaction (the insert itself is a placeholder). */
    suspend fun batchInsert(entities: List<Entity>) {
        executeTransactionWithRetry {
            entities.forEach { entity ->
                // insert logic goes here
            }
        }
    }
}
// Database exception classification helpers
/**
 * Returns `true` when this exception is a [SQLTransientException], or a
 * [SQLException] whose SQLState is one of the retryable codes listed below.
 */
fun Exception.shouldRetryDatabase(): Boolean {
    if (this is SQLTransientException) return true // transient SQL error
    if (this !is SQLException) return false
    return when (sqlState) {
        "40001" -> true // deadlock
        "08003", "08006", "08007" -> true // connection error
        "57014" -> true // query timeout
        else -> false
    }
}
/** True when this is a [SQLException] whose SQLState is 08003, 08006 or 08007. */
fun Exception.isConnectionError(): Boolean = when (this) {
    is SQLException -> sqlState in setOf("08003", "08006", "08007")
    else -> false
}
/** True when this is a [SQLException] with SQLState 40001. */
fun Exception.isDeadlock(): Boolean =
    (this as? SQLException)?.sqlState == "40001"
/** True when this is a [SQLException] with SQLState 57014. */
fun Exception.isTimeout(): Boolean =
    (this as? SQLException)?.sqlState == "57014"
3. 文件操作重试
import java.io.IOException
import java.nio.file.AccessDeniedException
import java.nio.file.FileSystemException
/**
 * File helpers that retry operations failing with (presumably transient) I/O errors.
 * Permission problems are never retried.
 */
class FileOperationManager {
    /**
     * Runs [block] up to [maxRetries] times with a linearly growing wait
     * (`retryDelayMillis * attempt`) between attempts.
     *
     * @param maxRetries total number of attempts.
     * @param retryDelayMillis base wait; multiplied by the attempt number.
     * @param block the file operation to attempt.
     * @return the value of the first successful attempt.
     * @throws Exception the last failure when attempts are exhausted or the failure is not retryable.
     */
    suspend fun <T> retryFileOperation(
        maxRetries: Int = 3,
        retryDelayMillis: Long = 1000,
        block: suspend () -> T
    ): T {
        var lastException: Throwable? = null
        for (attempt in 1..maxRetries) {
            try {
                return block()
            } catch (e: Exception) {
                lastException = e
                if (attempt < maxRetries && e.shouldRetryFileOperation()) {
                    println("文件操作失败 (尝试 $attempt): ${e.message}")
                    delay(retryDelayMillis * attempt) // linear backoff
                    // Try to release file locks before the next attempt.
                    if (e.isFileLocked()) {
                        releaseFileLocks()
                    }
                } else {
                    break
                }
            }
        }
        throw lastException ?: RuntimeException("文件操作失败")
    }

    /** Reads the whole file as text, attempting up to five times. */
    suspend fun readFileWithRetry(file: File): String {
        return retryFileOperation(maxRetries = 5) {
            file.readText()
        }
    }

    /** Writes [content] to [file], using the default retry settings. */
    suspend fun writeFileWithRetry(file: File, content: String) {
        retryFileOperation {
            file.writeText(content)
        }
    }

    /**
     * Copies [source] to [target] in chunks of [chunkSize] bytes, attempting up to
     * three times with a linearly growing wait between attempts.
     *
     * @param chunkSize buffer size in bytes; must be positive.
     * @throws IllegalArgumentException if [chunkSize] is not positive.
     * @throws Exception the failure of the last attempt, or the first non-retryable failure.
     */
    suspend fun copyLargeFileWithRetry(
        source: File,
        target: File,
        chunkSize: Int = 8192
    ) {
        // A zero-length buffer makes read() return 0 forever; a negative one cannot be allocated.
        require(chunkSize > 0) { "chunkSize must be positive, was $chunkSize" }

        val maxRetries = 3
        // Computed as Long: `chunkSize * 100` on Int can overflow for large chunk sizes.
        val flushIntervalBytes = chunkSize.toLong() * 100
        var attempt = 0
        while (attempt < maxRetries) {
            // Tracks whether this attempt actually opened (and thereby truncated) the target.
            var targetOpened = false
            try {
                source.inputStream().use { input ->
                    target.outputStream().use { output ->
                        targetOpened = true
                        val buffer = ByteArray(chunkSize)
                        var bytesRead: Int
                        var bytesSinceFlush = 0L
                        while (input.read(buffer).also { bytesRead = it } != -1) {
                            output.write(buffer, 0, bytesRead)
                            bytesSinceFlush += bytesRead
                            // Flush periodically. Counting bytes since the last flush works
                            // with short reads, unlike testing the running total for an
                            // exact multiple of the interval.
                            if (bytesSinceFlush >= flushIntervalBytes) {
                                output.flush()
                                bytesSinceFlush = 0L
                            }
                        }
                    }
                }
                return // success
            } catch (e: Exception) {
                attempt++
                if (attempt >= maxRetries || !e.shouldRetryFileOperation()) {
                    throw e
                }
                println("文件复制失败,第 $attempt 次重试")
                // 1000L keeps the product a Long, which is what delay() requires.
                delay(1000L * attempt)
                // Remove the possibly corrupted target, but only if this attempt opened it;
                // otherwise a failure to read the source would delete a pre-existing target.
                if (targetOpened && target.exists()) {
                    target.delete()
                }
            }
        }
    }

    /** Permission problems are not retried; every other [IOException] is. */
    private fun Exception.shouldRetryFileOperation(): Boolean {
        return when (this) {
            is AccessDeniedException -> false // permission problem, retrying is pointless
            is FileSystemException -> this.reason != "Permission denied"
            is IOException -> true // other I/O errors
            else -> false
        }
    }

    /** Heuristic: an [IOException] whose message mentions "locked" (case-insensitive). */
    private fun Exception.isFileLocked(): Boolean {
        return this is IOException &&
            this.message?.contains("locked", ignoreCase = true) == true
    }

    /** Placeholder: currently does nothing. */
    private fun releaseFileLocks() {
        // Releasing file locks is platform dependent.
        // Windows: use handle.exe or PowerShell
        // Linux/Mac: use lsof + fuser
    }
}
四、高级重试策略
1. 基于响应内容的重试
/**
 * Retry helper that can retry not only on exceptions but also when a call
 * returns normally with a response that the caller considers unacceptable.
 */
class SmartRetryStrategy {
    /**
     * Snapshot handed to the retry predicate.
     *
     * @property attempt 1-based number of the attempt that just finished.
     * @property lastException the failure of that attempt, or `null` if it returned normally.
     * @property lastResponse the value returned by that attempt, or `null` if it threw.
     */
    data class RetryContext(
        val attempt: Int,
        val lastException: Throwable?,
        val lastResponse: Any?
    )

    /**
     * Runs [block] up to [maxRetries] times, consulting [shouldRetry] after every attempt.
     *
     * If the final attempt returns a response that [shouldRetry] still rejects, that
     * response is returned as is.
     *
     * @param maxRetries total number of attempts.
     * @param shouldRetry predicate receiving the outcome of the attempt that just finished.
     * @param block the operation to attempt.
     * @return the first accepted result, or the result of the final attempt.
     * @throws Exception the last failure when attempts are exhausted or the failure is not retryable.
     */
    suspend fun <T> retryWithResponseInspection(
        maxRetries: Int = 3,
        shouldRetry: suspend (RetryContext) -> Boolean,
        block: suspend () -> T
    ): T {
        var lastException: Throwable? = null
        for (attempt in 1..maxRetries) {
            try {
                val result = block()
                // Check whether the result is acceptable even though nothing was thrown.
                if (shouldRetry(RetryContext(attempt, null, result))) {
                    // Call succeeded but the result is not acceptable: retry if attempts remain.
                    if (attempt < maxRetries) {
                        delay(calculateDelay(attempt))
                        continue
                    }
                }
                return result
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation is not an operation failure; do not offer it to the predicate.
                throw e
            } catch (e: Exception) {
                lastException = e
                if (attempt < maxRetries &&
                    shouldRetry(RetryContext(attempt, e, null))) {
                    delay(calculateDelay(attempt))
                } else {
                    break
                }
            }
        }
        throw lastException ?: RuntimeException("操作失败")
    }

    /**
     * Example API call: retries retryable exceptions and responses whose
     * [ApiResponse.shouldRetry] is true.
     *
     * NOTE(review): `apiService` is not declared in this class in the visible
     * source — confirm where it is supposed to come from.
     */
    suspend fun callApiWithValidation(): ApiResponse {
        return retryWithResponseInspection(
            maxRetries = 3,
            shouldRetry = { context ->
                val failure = context.lastException
                // Safe cast: a response of an unexpected type is simply not retried
                // instead of failing with a ClassCastException inside the retry loop.
                val response = context.lastResponse as? ApiResponse
                when {
                    // Exception case
                    failure != null -> failure.shouldRetry()
                    // Response content check
                    response != null -> response.shouldRetry()
                    else -> false
                }
            }
        ) {
            apiService.getData()
        }
    }

    /** Linear backoff: one second per attempt, capped at ten seconds. */
    private fun calculateDelay(attempt: Int): Long {
        return (1000L * attempt).coerceAtMost(10000L)
    }
}
/**
 * Envelope returned by the example API.
 *
 * @property success whether the call succeeded.
 * @property data response payload, if any.
 * @property errorCode error code, if any; drives [shouldRetry].
 * @property message message accompanying the response, if any.
 */
data class ApiResponse(
    val success: Boolean,
    val data: Any?,
    val errorCode: String?,
    val message: String?
) {
    /** True when [errorCode] is one of the three codes treated as retryable. */
    fun shouldRetry(): Boolean = when (errorCode) {
        "RATE_LIMITED", // rate limited
        "TEMPORARY_ERROR", // temporary error
        "MAINTENANCE" -> true // under maintenance
        else -> false
    }
}
2. 并发重试策略
import kotlinx.coroutines.*
import java.util.concurrent.ConcurrentHashMap
/**
 * Retry helpers for concurrent workloads: per-key de-duplication, batch retry
 * of only the failed items, and priority-ordered retry.
 */
class ConcurrentRetryManager {
    // One gate per key that currently has a retry loop in progress.
    private val activeRetries = ConcurrentHashMap<String, Job>()

    /**
     * Concurrency-safe de-duplicated retry: for a given [key] only one caller at a
     * time runs the retry loop.
     *
     * The first caller runs [block] through [retryWithBackoff] and returns that result.
     * Callers arriving while it is in progress wait for it to finish and then run
     * their own [block] once, without retry.
     *
     * @param key identifies operations that must not be retried concurrently.
     * @param maxRetries total number of attempts for the caller that owns the retry loop.
     * @param block the operation to run.
     * @return the result of [block].
     */
    suspend fun <T> retryDeduplicated(
        key: String,
        maxRetries: Int = 3,
        block: suspend () -> T
    ): T {
        // Register atomically. A separate "look up, then store" sequence lets two callers
        // both become the owner.
        val gate = Job()
        val existing = activeRetries.putIfAbsent(key, gate)
        if (existing != null) {
            // Another caller is retrying this key: wait for it, then execute once.
            existing.join()
            return block()
        }

        try {
            // Return the value produced by the retry loop itself rather than discarding it
            // and invoking block one more time without any retry protection.
            return retryWithBackoff(maxRetries = maxRetries, block = block)
        } finally {
            activeRetries.remove(key, gate)
            gate.complete() // releases the waiters, whether we succeeded or failed
        }
    }

    /**
     * Batch processing with targeted retry: after a first pass over all [items],
     * only the items that failed are retried, for up to [maxRetriesPerItem] rounds.
     *
     * @param items inputs to process.
     * @param concurrency maximum number of [transform] calls in flight at once
     *   (values below 1 are treated as 1).
     * @param maxRetriesPerItem number of retry rounds after the first pass.
     * @param transform the operation applied to each item.
     * @return results in the same order as [items].
     * @throws Exception the failure of the first item (in input order) that still fails
     *   after all rounds.
     */
    suspend fun <T, R> batchRetry(
        items: List<T>,
        concurrency: Int = 5,
        maxRetriesPerItem: Int = 3,
        transform: suspend (T) -> R
    ): List<R> = coroutineScope {
        // Enforce the concurrency limit; previously the parameter was accepted but unused.
        val limiter = kotlinx.coroutines.sync.Semaphore(concurrency.coerceAtLeast(1))

        // Runs one attempt under the limiter and captures its outcome.
        suspend fun attempt(operation: suspend () -> R): Result<R> {
            limiter.acquire()
            return try {
                Result.success(operation())
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                throw e // cancellation must not be recorded as an item failure
            } catch (e: Exception) {
                Result.failure(e)
            } finally {
                limiter.release()
            }
        }

        // First pass: process every item. Each coroutine returns its own outcome, so no
        // shared mutable state (and no locking) is needed.
        val results = items
            .map { item -> async { attempt { transform(item) } } }
            .awaitAll()
            .toMutableList()

        // Retry rounds for the items that are still failing.
        for (retryAttempt in 1..maxRetriesPerItem) {
            val failedIndices = results.indices.filter { results[it].isFailure }
            if (failedIndices.isEmpty()) break
            println("第 $retryAttempt 轮重试,失败项: ${failedIndices.size}")

            val retried = failedIndices
                .map { index ->
                    async {
                        attempt {
                            retryWithBackoff(
                                maxRetries = 1, // one attempt per round; the outer loop controls retries
                                block = { transform(items[index]) }
                            )
                        }
                    }
                }
                .awaitAll()
            failedIndices.forEachIndexed { position, index -> results[index] = retried[position] }

            // Pause between rounds, but not after the final one. 1000L keeps the product a
            // Long, which is what delay() requires.
            if (retryAttempt < maxRetriesPerItem && results.any { it.isFailure }) {
                delay(1000L * retryAttempt)
            }
        }

        // Collect the final results. Unwrapping the Result directly keeps a legitimate
        // null success value from being mistaken for a missing entry.
        results.map { it.getOrThrow() }
    }

    /**
     * Priority retry: runs the tasks one by one, highest priority first, each
     * through [retryWithBackoff].
     *
     * A task that still fails after its retries is logged and skipped, so the
     * returned list may be shorter than [tasks] and is in priority order.
     *
     * @param tasks pairs of (priority, task); a larger number means higher priority.
     * @param maxRetries total number of attempts per task.
     * @return results of the tasks that succeeded.
     */
    suspend fun <T> priorityRetry(
        tasks: List<Pair<Int, suspend () -> T>>, // (priority, task)
        maxRetries: Int = 3
    ): List<T> {
        val sortedTasks = tasks.sortedByDescending { it.first } // highest priority first
        val results = mutableListOf<T>()
        for ((priority, task) in sortedTasks) {
            try {
                val result = retryWithBackoff(maxRetries) {
                    println("执行优先级 $priority 的任务")
                    task()
                }
                results.add(result)
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation must stop the whole run, not be skipped like a task failure.
                throw e
            } catch (e: Exception) {
                println("优先级 $priority 的任务失败: ${e.message}")
                // A failed task does not prevent the remaining tasks from running.
            }
        }
        return results
    }
}
3. 熔断器模式配合重试
import kotlinx.coroutines.sync.Mutex
import kotlinx.coroutines.sync.withLock
/**
 * Circuit breaker with three states: CLOSED (calls pass through), OPEN (calls are
 * rejected) and HALF_OPEN (calls are let through to probe for recovery).
 *
 * @param failureThreshold consecutive failures in CLOSED state that open the breaker.
 * @param resetTimeoutMillis time after the last failure before OPEN becomes HALF_OPEN.
 * @param halfOpenMaxAttempts failed probes in HALF_OPEN state that re-open the breaker.
 */
class CircuitBreaker(
    private val failureThreshold: Int = 5,
    private val resetTimeoutMillis: Long = 60000,
    private val halfOpenMaxAttempts: Int = 3
) {
    private var state: State = State.CLOSED
    private var failureCount = 0
    private var lastFailureTime = 0L
    private var halfOpenAttempts = 0
    private val mutex = Mutex()

    /** Breaker states. `toString` is overridden so [getStatus] prints the state name. */
    sealed class State {
        object CLOSED : State() { // normal operation
            override fun toString(): String = "CLOSED"
        }
        object OPEN : State() { // tripped: calls are rejected
            override fun toString(): String = "OPEN"
        }
        object HALF_OPEN : State() { // probing for recovery
            override fun toString(): String = "HALF_OPEN"
        }
    }

    /**
     * Runs [block] if the breaker allows it.
     *
     * @return the value returned by [block].
     * @throws CircuitBreakerOpenException if the breaker is OPEN.
     * @throws Exception whatever [block] throws.
     */
    suspend fun <T> execute(block: suspend () -> T): T {
        // Read the state inside the same critical section that updates it; reading the
        // field after releasing the lock races with concurrent transitions.
        val observedState = mutex.withLock {
            updateState()
            state
        }
        return when (observedState) {
            State.OPEN -> throw CircuitBreakerOpenException("熔断器开启")
            State.HALF_OPEN -> executeHalfOpen(block)
            State.CLOSED -> executeClosed(block)
        }
    }

    /** CLOSED path: success resets the failure count; enough failures open the breaker. */
    private suspend fun <T> executeClosed(block: suspend () -> T): T {
        try {
            val result = block()
            mutex.withLock {
                // Reset the failure count on success.
                failureCount = 0
            }
            return result
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // A cancelled caller says nothing about the health of the protected service.
            throw e
        } catch (e: Exception) {
            mutex.withLock {
                failureCount++
                lastFailureTime = System.currentTimeMillis()
                if (failureCount >= failureThreshold) {
                    state = State.OPEN
                }
            }
            throw e
        }
    }

    /** HALF_OPEN path: one success closes the breaker; enough failed probes re-open it. */
    private suspend fun <T> executeHalfOpen(block: suspend () -> T): T {
        try {
            val result = block()
            mutex.withLock {
                // A successful probe closes the breaker.
                state = State.CLOSED
                failureCount = 0
                halfOpenAttempts = 0
            }
            return result
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // A cancelled probe is not evidence of failure.
            throw e
        } catch (e: Exception) {
            mutex.withLock {
                halfOpenAttempts++
                if (halfOpenAttempts >= halfOpenMaxAttempts) {
                    state = State.OPEN
                    lastFailureTime = System.currentTimeMillis()
                }
            }
            throw e
        }
    }

    /** Moves OPEN to HALF_OPEN once the reset timeout has elapsed. Call with [mutex] held. */
    private fun updateState() {
        when (state) {
            State.OPEN -> {
                val timeSinceFailure = System.currentTimeMillis() - lastFailureTime
                if (timeSinceFailure >= resetTimeoutMillis) {
                    state = State.HALF_OPEN
                    halfOpenAttempts = 0
                }
            }
            else -> Unit
        }
    }

    /** Human-readable snapshot of the breaker; read without taking the lock. */
    fun getStatus(): String {
        return "State: $state, Failures: $failureCount, LastFailure: $lastFailureTime"
    }
}
/** Signals that a call was rejected because the circuit breaker is open. */
class CircuitBreakerOpenException(
    message: String
) : Exception(message)
// Retry strategy that goes through a circuit breaker
/**
 * Combines retry with a [CircuitBreaker]: every attempt is executed through the
 * breaker, and an open breaker ends the retry loop immediately.
 */
class ResilientService(
    private val circuitBreaker: CircuitBreaker
) {
    /**
     * Runs [block] through the circuit breaker, attempting up to [maxRetries] times.
     *
     * @param maxRetries total number of attempts.
     * @param block the service call.
     * @return the value of the first successful attempt.
     * @throws CircuitBreakerOpenException if the breaker is open; this is never retried.
     * @throws Exception the last failure when attempts are exhausted or the failure is not retryable.
     */
    suspend fun <T> executeWithResilience(
        maxRetries: Int = 3,
        block: suspend () -> T
    ): T {
        var lastException: Throwable? = null
        for (attempt in 1..maxRetries) {
            try {
                // Execute through the circuit breaker.
                return circuitBreaker.execute(block)
            } catch (e: CircuitBreakerOpenException) {
                // Breaker is open: fail fast.
                throw e
            } catch (e: Exception) {
                lastException = e
                if (attempt < maxRetries && e.shouldRetry()) {
                    // Named delayMillis so the local does not shadow the delay() function.
                    val delayMillis = calculateDelay(attempt)
                    println("服务调用失败,${delayMillis}ms后重试")
                    delay(delayMillis)
                } else {
                    break
                }
            }
        }
        throw lastException ?: RuntimeException("服务调用失败")
    }

    /** Example call: a simulated service that fails 40% of the time. */
    suspend fun callService(): String {
        return executeWithResilience {
            // Simulated service call.
            if (Random.nextDouble() < 0.4) {
                throw IOException("服务暂时不可用")
            }
            "服务响应"
        }
    }

    /**
     * Linear backoff: one second per attempt, capped at ten seconds.
     * Added because the class called `calculateDelay` without defining it.
     */
    private fun calculateDelay(attempt: Int): Long {
        return (1000L * attempt).coerceAtMost(10000L)
    }
}
五、测试重试逻辑
1. 单元测试重试
import kotlinx.coroutines.test.runTest
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertFailsWith
/** Unit tests for [retryWithBackoff]. All tests run on `runTest` virtual time. */
class RetryTest {
    /** Two failures followed by a success: the result is returned after three attempts. */
    @Test
    fun testRetrySuccess() = runTest {
        var attempts = 0
        val result = retryWithBackoff(maxRetries = 3) {
            attempts++
            if (attempts < 3) {
                throw IOException("模拟失败")
            }
            "成功"
        }
        assertEquals("成功", result)
        assertEquals(3, attempts)
    }

    /** Every attempt fails: the last exception surfaces after exactly `maxRetries` attempts. */
    @Test
    fun testRetryFailure() = runTest {
        var attempts = 0
        assertFailsWith<IOException> {
            retryWithBackoff(maxRetries = 3) {
                attempts++
                throw IOException("总是失败")
            }
        }
        assertEquals(3, attempts)
    }

    /**
     * Conditional retry: only [IOException] is retried.
     *
     * `retryWithBackoff` has no predicate parameter, so the condition is applied inside
     * the block: retryable failures are thrown (and therefore retried), while
     * non-retryable ones are returned as a failed [Result], which ends the loop.
     */
    @Test
    fun testRetryWithCondition() = runTest {
        var attempts = 0
        val isRetryable: (Throwable) -> Boolean = { it is IOException } // retry only IOException
        val outcome: Result<String> = retryWithBackoff(maxRetries = 5) {
            attempts++
            try {
                when (attempts) {
                    1 -> throw IOException("可重试错误")
                    2 -> throw IllegalArgumentException("不可重试错误")
                    else -> Result.success("成功")
                }
            } catch (e: Exception) {
                if (isRetryable(e)) throw e
                Result.failure<String>(e)
            }
        }
        // Must fail, because the second attempt raised a non-retryable error.
        assertFailsWith<IllegalArgumentException> { outcome.getOrThrow() }
        assertEquals(2, attempts)
    }

    /** The gaps between attempts follow the exponential schedule 100 ms, 200 ms, 400 ms. */
    @OptIn(kotlinx.coroutines.ExperimentalCoroutinesApi::class)
    @Test
    fun testExponentialBackoff() = runTest {
        // runTest skips delays on a virtual clock, so wall-clock time would show gaps of
        // roughly zero. Read the scheduler's virtual time instead.
        val attemptTimes = mutableListOf<Long>()
        try {
            retryWithBackoff(
                maxRetries = 4,
                initialDelayMillis = 100,
                factor = 2.0
            ) {
                attemptTimes.add(testScheduler.currentTime)
                throw IOException("失败")
            }
        } catch (e: IOException) {
            // expected failure
        }
        // Four attempts at virtual times 0, 100, 300 and 700 ms, i.e. three waits.
        assertEquals(3, attemptTimes.size - 1) // the first attempt runs immediately
        for (i in 1 until attemptTimes.size) {
            val actualDelay = attemptTimes[i] - attemptTimes[i - 1]
            val expectedDelay = 100L * (1 shl (i - 1))
            // Virtual time is exact, so no tolerance is needed.
            assertEquals(expectedDelay, actualDelay, "延迟 $actualDelay 不在预期范围 $expectedDelay")
        }
    }
}
2. 模拟不稳定服务进行测试
/**
 * Test double whose calls fail or succeed according to a scripted pattern.
 *
 * @param failurePattern outcome per call, in order: `true` = fail, `false` = succeed.
 *   Calls beyond the end of the pattern always succeed.
 */
class UnstableServiceSimulator(
    private val failurePattern: List<Boolean> // true = fail, false = succeed
) {
    private var callCount = 0

    /**
     * Performs the next scripted call.
     *
     * @return a success message carrying the 1-based call number, or a fixed message
     *   once the pattern is exhausted.
     * @throws IOException if the pattern marks this call as a failure.
     */
    suspend fun call(): String {
        // Past the end of the script: behave as a stable service and stop counting.
        val shouldFail = failurePattern.getOrNull(callCount) ?: return "稳定服务"
        callCount += 1
        return if (shouldFail) {
            throw IOException("服务调用 $callCount 失败")
        } else {
            "服务调用 $callCount 成功"
        }
    }

    /** Restarts the script from its first entry. */
    fun reset() {
        callCount = 0
    }
}
/** Integration-style tests combining the retry helpers with the simulator and the circuit breaker. */
class RetryIntegrationTest {
    /** Exercises three scripted failure patterns against [retryWithBackoff]. */
    @Test
    fun testRetryPatterns() = runTest {
        // Case 1: fail, then succeed.
        val simulator1 = UnstableServiceSimulator(listOf(true, false))
        val result1 = retryWithBackoff(maxRetries = 3) {
            simulator1.call()
        }
        assertEquals("服务调用 2 成功", result1)

        // Case 2: several consecutive failures, success on the last allowed attempt.
        simulator1.reset()
        val simulator2 = UnstableServiceSimulator(listOf(true, true, true, false))
        val result2 = retryWithBackoff(maxRetries = 4) {
            simulator2.call()
        }
        assertEquals("服务调用 4 成功", result2)

        // Case 3: more failures than allowed attempts.
        simulator1.reset()
        val simulator3 = UnstableServiceSimulator(listOf(true, true, true, true, false))
        assertFailsWith<IOException> {
            retryWithBackoff(maxRetries = 3) {
                simulator3.call()
            }
        }
    }

    /** Two failures open the breaker; after the reset timeout calls reach the service again. */
    @Test
    fun testCircuitBreakerWithRetry() = runTest {
        val circuitBreaker = CircuitBreaker(
            failureThreshold = 2,
            resetTimeoutMillis = 1000
        )
        val service = ResilientService(circuitBreaker)

        // A service that always fails, to trip the breaker quickly.
        val failingService = object {
            var callCount = 0
            suspend fun call(): String {
                callCount++
                throw IOException("服务失败")
            }
        }

        // Both attempts fail, which reaches the failure threshold.
        assertFailsWith<IOException> {
            service.executeWithResilience(maxRetries = 2) {
                failingService.call()
            }
        }

        // The breaker is now open: the next call is rejected without reaching the service.
        assertFailsWith<CircuitBreakerOpenException> {
            service.executeWithResilience(maxRetries = 1) {
                failingService.call()
            }
        }

        // Wait for the breaker's reset timeout. The breaker measures elapsed time with the
        // wall clock, which a virtual-time delay() inside runTest does not advance, so the
        // wait runs on a real dispatcher.
        withContext(Dispatchers.Default) {
            delay(1500)
        }

        // The breaker should now let a probe through, so the service's own error surfaces.
        failingService.callCount = 0
        assertFailsWith<IOException> {
            service.executeWithResilience(maxRetries = 1) {
                failingService.call()
            }
        }
    }
}
六、最佳实践
1. 重试策略选择指南
/**
 * Catalogue of retry strategies per operation type, plus simple in-memory
 * retry metrics.
 */
object RetryStrategyGuide {
    /**
     * Picks the retry strategy for [operationType].
     *
     * The type parameter and [context] are currently unused.
     * NOTE(review): `RetryContext` is not declared inside this object; the only
     * declaration in the visible source is `SmartRetryStrategy.RetryContext` —
     * confirm which type is meant.
     */
    fun <T> chooseStrategy(
        operationType: OperationType,
        context: RetryContext
    ): RetryStrategy {
        return when (operationType) {
            OperationType.NETWORK_REQUEST -> NetworkRetryStrategy()
            OperationType.DATABASE_OPERATION -> DatabaseRetryStrategy()
            OperationType.FILE_OPERATION -> FileRetryStrategy()
            OperationType.EXTERNAL_API_CALL -> ExternalApiRetryStrategy()
        }
    }

    /**
     * Retry strategy for network requests.
     */
    class NetworkRetryStrategy : RetryStrategy {
        override val maxRetries = 3
        override val initialDelay = 1000L
        override val maxDelay = 10000L
        override val factor = 2.0

        override fun shouldRetry(exception: Throwable): Boolean {
            return when (exception) {
                // SocketTimeoutException is a subclass of IOException, so this branch
                // covers timeouts; a separate branch after it would be unreachable.
                is IOException -> true
                is HttpException -> exception.code() in setOf(408, 429, 500, 502, 503, 504)
                else -> false
            }
        }
    }

    /**
     * Retry strategy for database operations.
     */
    class DatabaseRetryStrategy : RetryStrategy {
        override val maxRetries = 5
        override val initialDelay = 100L
        override val maxDelay = 2000L
        override val factor = 1.5

        override fun shouldRetry(exception: Throwable): Boolean {
            // Delegates to the file's shared classifier (SQLTransientException or a
            // retryable SQLState); the previously called `SQLException.isTransient()`
            // is not defined anywhere in the visible source.
            return exception is Exception && exception.shouldRetryDatabase()
        }
    }

    /**
     * Retry strategy for file operations. Added because [chooseStrategy] referenced it
     * without a definition; the classification mirrors `FileOperationManager`, and the
     * numeric settings are illustrative defaults.
     */
    class FileRetryStrategy : RetryStrategy {
        override val maxRetries = 3
        override val initialDelay = 1000L
        override val maxDelay = 5000L
        override val factor = 2.0

        override fun shouldRetry(exception: Throwable): Boolean {
            return when (exception) {
                is AccessDeniedException -> false // permission problem, retrying is pointless
                is FileSystemException -> exception.reason != "Permission denied"
                is IOException -> true
                else -> false
            }
        }
    }

    /**
     * Retry strategy for external API calls. Added because [chooseStrategy] referenced it
     * without a definition; it retries rate limiting (429) and service unavailable (503),
     * and the numeric settings are illustrative defaults.
     */
    class ExternalApiRetryStrategy : RetryStrategy {
        override val maxRetries = 3
        override val initialDelay = 1000L
        override val maxDelay = 30000L
        override val factor = 2.0

        override fun shouldRetry(exception: Throwable): Boolean {
            return exception is HttpException && exception.code() in setOf(429, 503)
        }
    }

    /**
     * Retry monitoring and metrics. Backed by a plain mutable map, so it is not
     * safe for concurrent use as written.
     */
    class RetryMonitor {
        private val metrics = mutableMapOf<String, RetryMetrics>()

        /** Records one attempt of [operation]; attempts numbered above 1 count as retries. */
        fun recordAttempt(operation: String, attempt: Int, success: Boolean) {
            val metric = metrics.getOrPut(operation) { RetryMetrics() }
            metric.totalAttempts++
            if (attempt > 1) metric.retryAttempts++
            if (success) metric.successfulOperations++ else metric.failedOperations++
        }

        /** Returns the metrics recorded for [operation], or `null` if none were recorded. */
        fun getMetrics(operation: String): RetryMetrics? {
            return metrics[operation]
        }

        /** Prints one summary block per recorded operation. */
        fun printReport() {
            metrics.forEach { (operation, metric) ->
                println("""
                    |Operation: $operation
                    | Total Attempts: ${metric.totalAttempts}
                    | Retry Attempts: ${metric.retryAttempts}
                    | Success Rate: ${metric.successRate}%
                    | Retry Rate: ${metric.retryRate}%
                """.trimMargin())
            }
        }
    }

    /** Mutable counters for one operation; both rates are percentages of [totalAttempts]. */
    data class RetryMetrics(
        var totalAttempts: Long = 0,
        var retryAttempts: Long = 0,
        var successfulOperations: Long = 0,
        var failedOperations: Long = 0
    ) {
        val successRate: Double
            get() = if (totalAttempts > 0) successfulOperations.toDouble() / totalAttempts * 100 else 0.0
        val retryRate: Double
            get() = if (totalAttempts > 0) retryAttempts.toDouble() / totalAttempts * 100 else 0.0
    }

    /** Kinds of operation that [chooseStrategy] distinguishes. */
    enum class OperationType {
        NETWORK_REQUEST,
        DATABASE_OPERATION,
        FILE_OPERATION,
        EXTERNAL_API_CALL
    }

    /** Parameters and retry predicate describing one retry policy. */
    interface RetryStrategy {
        val maxRetries: Int
        val initialDelay: Long
        val maxDelay: Long
        val factor: Double
        fun shouldRetry(exception: Throwable): Boolean
    }
}
2. 重试配置化
/**
 * Declarative retry settings for one operation.
 *
 * @property maxRetries total number of attempts.
 * @property initialDelayMillis wait before the first retry.
 * @property maxDelayMillis upper bound for the un-jittered wait.
 * @property factor exponential growth factor between retries.
 * @property jitter whether random jitter is applied to each wait.
 * @property jitterFactor jitter amplitude as a fraction of the wait.
 * @property retryableExceptions class names (or fragments) of exceptions that may be retried.
 */
data class RetryConfig(
    val maxRetries: Int = 3,
    val initialDelayMillis: Long = 1_000,
    val maxDelayMillis: Long = 10_000,
    val factor: Double = 2.0,
    val jitter: Boolean = true,
    val jitterFactor: Double = 0.1,
    val retryableExceptions: Set<String> = setOf(
        "java.io.IOException",
        "java.net.SocketTimeoutException",
        "retrofit2.HttpException"
    )
)
/**
 * Retry executor whose policy is looked up per operation name from a
 * [RetryConfigProvider].
 */
class ConfigurableRetryManager(
    private val configProvider: RetryConfigProvider
) {
    /**
     * Runs [block] according to the [RetryConfig] registered for [operationName].
     *
     * @param operationName key used to look up the configuration; also used in log output.
     * @param block the operation to attempt.
     * @return the value of the first successful attempt.
     * @throws Exception the last failure when attempts are exhausted or the failure is not retryable.
     */
    suspend fun <T> executeWithRetry(
        operationName: String,
        block: suspend () -> T
    ): T {
        val config = configProvider.getConfig(operationName)
        var lastException: Throwable? = null
        for (attempt in 1..config.maxRetries) {
            try {
                return block()
            } catch (e: Exception) {
                lastException = e
                if (attempt < config.maxRetries && shouldRetry(e, config)) {
                    // Named delayMillis so the local does not shadow the delay() function.
                    val delayMillis = calculateDelay(attempt, config)
                    println("$operationName 第 $attempt 次重试,等待 ${delayMillis}ms")
                    delay(delayMillis)
                } else {
                    break
                }
            }
        }
        throw lastException ?: RuntimeException("$operationName 失败")
    }

    /**
     * Decides retryability by class name. The exception's class and all of its
     * superclasses are checked, so configuring `java.io.IOException` also covers
     * subclasses such as `FileNotFoundException`.
     */
    private fun shouldRetry(exception: Throwable, config: RetryConfig): Boolean {
        val hierarchy = generateSequence<Class<*>>(exception.javaClass) { it.superclass }
        return hierarchy.any { type ->
            // Accept both the Kotlin-qualified and the JVM name (they differ for nested classes).
            val names = listOfNotNull(type.kotlin.qualifiedName, type.name)
            config.retryableExceptions.any { className ->
                names.any { it.contains(className) }
            }
        }
    }

    /** Exponential backoff capped at `maxDelayMillis`, with optional symmetric jitter. */
    private fun calculateDelay(attempt: Int, config: RetryConfig): Long {
        val baseDelay = (config.initialDelayMillis * Math.pow(config.factor, (attempt - 1).toDouble()))
            .toLong()
            .coerceAtMost(config.maxDelayMillis)
        val jitterRange = (baseDelay * config.jitterFactor).toLong()
        // Random.nextLong(from, until) throws when the range is empty, which happened
        // whenever the jitter amplitude rounded down to 0 (e.g. a base delay under 10 ms
        // with the default factor of 0.1).
        if (!config.jitter || jitterRange <= 0L) {
            return baseDelay
        }
        return (baseDelay + Random.nextLong(-jitterRange, jitterRange)).coerceAtLeast(0L)
    }
}
/** Source of per-operation retry settings. */
interface RetryConfigProvider {
    /** Returns the retry configuration to use for [operationName]. */
    fun getConfig(
        operationName: String
    ): RetryConfig
}
/**
 * [RetryConfigProvider] backed by a fixed in-memory table. Despite the name, no
 * JSON is read here: the three configurations are hard-coded.
 */
class JsonRetryConfigProvider : RetryConfigProvider {
    private val userApiConfig = RetryConfig(
        maxRetries = 3,
        initialDelayMillis = 1000,
        retryableExceptions = setOf("IOException", "HttpException")
    )

    private val paymentApiConfig = RetryConfig(
        maxRetries = 5,
        initialDelayMillis = 2000,
        factor = 1.5,
        jitter = true
    )

    private val fileUploadConfig = RetryConfig(
        maxRetries = 10,
        initialDelayMillis = 500,
        maxDelayMillis = 30000,
        factor = 3.0
    )

    private val configs = mapOf(
        "userApi" to userApiConfig,
        "paymentApi" to paymentApiConfig,
        "fileUpload" to fileUploadConfig
    )

    /** Returns the configuration registered for [operationName], or the defaults if there is none. */
    override fun getConfig(operationName: String): RetryConfig =
        configs.getOrElse(operationName) { RetryConfig() }
}
总结:
Kotlin 重试机制的最佳实践:
核心策略:
- 指数退避:避免雪崩效应
- 随机抖动:防止惊群效应
- 熔断器模式:保护下游服务
- 条件重试:只重试可恢复的错误
实现要点:
- 网络请求:重试超时、连接错误、特定HTTP状态码
- 数据库操作:重试死锁、连接超时、临时错误
- 文件操作:重试文件锁、临时IO错误
- 外部API:重试限流、服务不可用
监控和调优:
- 记录重试指标:成功率、重试率、延迟分布
- 动态调整参数:基于历史表现调整重试策略
- A/B测试:比较不同策略的效果
注意事项:
- 幂等性:确保重试操作是幂等的
- 资源消耗:避免无限重试消耗资源
- 用户体验:及时反馈失败,而不是无休止重试
- 依赖服务:考虑下游服务的承受能力
选择合适重试策略时,需要权衡:
- 成功率的提升 vs 延迟的增加
- 用户体验 vs 系统资源
- 业务重要性 vs 实现复杂度