Kotlin Retry 笔记

4 阅读8分钟

Kotlin 重试(Retry)机制详解

重试机制是处理临时性失败(如网络波动、服务暂时不可用)的重要策略。在 Kotlin 中,既可以基于协程手写重试循环,也可以使用 kotlinx.coroutines 的 Flow 提供的 retry / retryWhen 操作符来实现重试。

一、基础重试实现

1. 基本的 try-catch 重试

/**
 * Runs [block] up to [maxRetries] times, pausing a fixed one second between attempts.
 *
 * @param maxRetries total number of attempts (the first call included). With a value below 1,
 *   [block] is never invoked and a [RuntimeException] is thrown.
 * @param block the suspending operation to attempt.
 * @return the result of the first successful attempt.
 * @throws Exception the failure of the final attempt once every attempt has failed.
 * @throws kotlin.coroutines.cancellation.CancellationException immediately and without retrying
 *   when [block] or the pause is cancelled (this includes timeouts raised by `withTimeout`).
 */
suspend fun <T> simpleRetry(
    maxRetries: Int = 3,
    block: suspend () -> T
): T {
    var lastException: Throwable? = null

    for (attempt in 1..maxRetries) {
        try {
            return block()
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation is a request to stop, not a transient failure; retrying would ignore it.
            throw e
        } catch (e: Exception) {
            lastException = e
            println("尝试 $attempt 失败: ${e.message}")

            // Skip the pause after the last attempt so the failure is reported right away.
            if (attempt < maxRetries) {
                delay(1000) // simple fixed delay
            }
        }
    }

    throw lastException ?: RuntimeException("重试失败")
}

// 使用示例
/**
 * Usage example: produces a string through [simpleRetry] while failing at random,
 * standing in for an unreliable data source.
 */
suspend fun fetchData(): String = simpleRetry(maxRetries = 3) {
    // A coin flip decides whether this particular attempt fails.
    val attemptFails = Random.nextBoolean()
    if (attemptFails) throw IOException("网络错误")
    "数据内容"
}

2. 带指数退避的重试

/**
 * Runs [block] up to [maxRetries] times with exponential backoff between attempts.
 *
 * The pause starts at [initialDelayMillis], is multiplied by [factor] after every failed
 * attempt and never exceeds [maxDelayMillis].
 *
 * @param maxRetries total number of attempts (the first call included). With a value below 1,
 *   [block] is never invoked and a [RuntimeException] is thrown.
 * @param initialDelayMillis pause before the second attempt, in milliseconds.
 * @param maxDelayMillis upper bound for any single pause, in milliseconds.
 * @param factor multiplier applied to the pause after each failed attempt.
 * @param block the suspending operation to attempt.
 * @return the result of the first successful attempt.
 * @throws Exception the failure of the final attempt once every attempt has failed.
 * @throws kotlin.coroutines.cancellation.CancellationException immediately and without retrying
 *   when [block] or the pause is cancelled (this includes timeouts raised by `withTimeout`).
 */
suspend fun <T> retryWithBackoff(
    maxRetries: Int = 3,
    initialDelayMillis: Long = 100,
    maxDelayMillis: Long = 10000,
    factor: Double = 2.0,
    block: suspend () -> T
): T {
    var currentDelay = initialDelayMillis
    var lastException: Throwable? = null

    for (attempt in 1..maxRetries) {
        try {
            return block()
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation is a request to stop, not a transient failure; retrying would ignore it.
            throw e
        } catch (e: Exception) {
            lastException = e
            println("第 $attempt 次尝试失败: ${e.message}")

            // No pause after the final attempt; the failure is rethrown below.
            if (attempt < maxRetries) {
                delay(currentDelay)
                // Grow the pause for the next round, capped at maxDelayMillis.
                currentDelay = (currentDelay * factor).toLong()
                    .coerceAtMost(maxDelayMillis)
            }
        }
    }

    throw lastException ?: RuntimeException("重试失败")
}

二、Flow 中的重试

1. Flow 的 retry 操作符

import kotlinx.coroutines.flow.*

/**
 * Simulates an unstable source: emits five chunks, 500 ms apart, and each chunk has a 30 %
 * chance of failing with an [IOException] instead of being emitted.
 */
fun fetchDataFlow(): Flow<String> = flow {
    repeat(5) { offset ->
        val chunkNumber = offset + 1
        delay(500)
        val chunkFails = Random.nextDouble() < 0.3 // 30 % failure rate
        if (chunkFails) throw IOException("第 $chunkNumber 次获取失败")
        emit("数据块 $chunkNumber")
    }
}

/**
 * Demonstrates the two built-in Flow retry operators on [fetchDataFlow]:
 * `retry` with a predicate, then `retryWhen` with a linear backoff.
 */
fun main() = runBlocking {
    // Basic retry: up to 3 retries, and only when the failure is an IOException.
    fetchDataFlow()
        .retry(3) { cause ->
            println("重试原因: ${cause.message}")
            cause is IOException
        }
        .catch { e -> emit("最终失败: ${e.message}") }
        .collect { println(it) }

    println("\n=== 带延迟的重试 ===")

    // retryWhen gives full control over whether and when to retry.
    fetchDataFlow()
        .retryWhen { cause, attempt ->
            if (cause is IOException && attempt < 3) {
                // `attempt` is zero-based (0 on the first failure), so shift it by one;
                // otherwise the first retry would wait 0 ms and be logged as retry number 0.
                val retryNumber = attempt + 1
                delay(1000 * retryNumber) // linear backoff: 1s, 2s, 3s
                println("第 $retryNumber 次重试")
                true
            } else {
                false
            }
        }
        .catch { e -> emit("失败: ${e.message}") }
        .collect { println(it) }
}

2. 带退避策略的 Flow 重试

import kotlinx.coroutines.flow.*

/**
 * Adds retry with exponential backoff to this flow.
 *
 * The n-th retry (1-based) waits `initialDelayMillis * factor^(n-1)` milliseconds, capped at
 * [maxDelayMillis], before the upstream flow is collected again.
 *
 * @param maxRetries maximum number of retries after the initial collection.
 * @param initialDelayMillis wait before the first retry, in milliseconds.
 * @param maxDelayMillis upper bound for any single wait, in milliseconds.
 * @param factor multiplier applied to the wait for each further retry.
 * @param shouldRetry decides whether a given failure is worth retrying; defaults to always.
 * @return a flow that re-collects the upstream on retryable failures.
 */
fun <T> Flow<T>.retryExponentialBackoff(
    maxRetries: Int,
    initialDelayMillis: Long = 1000,
    maxDelayMillis: Long = 60000,
    factor: Double = 2.0,
    shouldRetry: (Throwable) -> Boolean = { true }
): Flow<T> = retryWhen { cause, attempt ->
    // retryWhen passes a zero-based attempt index: 0 on the first failure. Using `<` keeps the
    // number of retries at exactly maxRetries, and using the index as exponent makes the first
    // wait equal to initialDelayMillis rather than initialDelayMillis / factor.
    if (attempt < maxRetries && shouldRetry(cause)) {
        val retryNumber = attempt + 1
        val delayMillis = (initialDelayMillis * Math.pow(factor, attempt.toDouble()))
            .toLong()
            .coerceAtMost(maxDelayMillis)

        println("第 $retryNumber 次重试,等待 ${delayMillis}ms")
        delay(delayMillis)
        true
    } else {
        false
    }
}

// 使用示例
/**
 * Demo source that counts from 1 to 10 with a 100 ms pause after each emission and throws a
 * [RuntimeException] whenever the counter reaches a multiple of three.
 */
fun unstableFlow(): Flow<Int> = flow {
    (1..10).forEach { value ->
        val isFailurePoint = value % 3 == 0 // every third value fails
        if (isFailurePoint) throw RuntimeException("第 $value 次失败")
        emit(value)
        delay(100)
    }
}

/** Collects [unstableFlow] with exponential-backoff retries and prints every value or the final failure. */
fun main() = runBlocking {
    val resilientFlow = unstableFlow().retryExponentialBackoff(
        maxRetries = 3,
        initialDelayMillis = 500,
        maxDelayMillis = 5000
    )

    resilientFlow
        .catch { failure -> println("最终失败: ${failure.message}") }
        .collect { value -> println("收到: $value") }
}

三、实际应用场景

1. 网络请求重试

import retrofit2.HttpException
import java.io.IOException
import java.net.SocketTimeoutException

/**
 * Repository that wraps calls to [ApiService] in a retry loop with exponential backoff and jitter.
 */
class NetworkRepository(
    private val apiService: ApiService
) {
    /**
     * Runs [block] up to [maxRetries] times, retrying only failures accepted by [retryOn].
     *
     * @param maxRetries total number of attempts (the first call included).
     * @param retryOn predicate deciding whether a failure is retryable.
     * @param block the network operation to attempt.
     * @return the result of the first successful attempt.
     * @throws Exception the last failure when it is not retryable or attempts are exhausted.
     * @throws kotlin.coroutines.cancellation.CancellationException immediately, without
     *   retrying, when the operation or the backoff wait is cancelled.
     */
    suspend fun <T> executeWithRetry(
        maxRetries: Int = 3,
        retryOn: (Throwable) -> Boolean = { it.shouldRetry() },
        block: suspend () -> T
    ): T {
        var lastException: Throwable? = null

        for (attempt in 1..maxRetries) {
            try {
                return block()
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Never treat cancellation as a retryable network failure.
                throw e
            } catch (e: Exception) {
                lastException = e

                if (attempt < maxRetries && retryOn(e)) {
                    val delayMillis = calculateBackoffDelay(attempt)
                    println("网络请求失败,${delayMillis}ms后重试 (尝试 $attempt)")
                    delay(delayMillis)
                } else {
                    // Non-retryable failure or last attempt: stop and rethrow below.
                    break
                }
            }
        }

        throw lastException ?: RuntimeException("请求失败")
    }

    /**
     * Exponential backoff (1s, 2s, 4s, ...) plus up to one second of random jitter, capped at
     * 30 seconds. Jitter spreads out clients that failed at the same moment.
     */
    private fun calculateBackoffDelay(attempt: Int): Long {
        // Bound the exponent: 2^5 s already exceeds the 30 s cap, and an unbounded Int shift
        // would wrap around to zero or negative values for large attempt numbers.
        val exponent = (attempt - 1).coerceIn(0, 5)
        val baseDelay = 1000L shl exponent
        val jitter = (Math.random() * 1000).toLong() // 0-1s of random jitter
        return (baseDelay + jitter).coerceAtMost(30000L)
    }

    /** Fetches the user identified by [userId], with up to three attempts. */
    suspend fun getUserData(userId: String): UserData {
        return executeWithRetry(
            maxRetries = 3,
            retryOn = { it.shouldRetry() }
        ) {
            apiService.getUser(userId)
        }
    }

    /** Uploads [file], with up to [maxRetries] attempts. */
    suspend fun uploadFile(file: File, maxRetries: Int = 5): UploadResult {
        return executeWithRetry(maxRetries = maxRetries) {
            apiService.uploadFile(file)
        }
    }
}

// 异常扩展:判断是否应该重试
/**
 * Tells whether this failure looks transient and is therefore worth retrying.
 *
 * Retryable: any [IOException] (this already covers `SocketTimeoutException`, which is an
 * `IOException` subclass, so no separate branch is needed) and an [HttpException] whose
 * status code is one of 408, 429, 500, 502, 503 or 504.
 */
fun Throwable.shouldRetry(): Boolean {
    return when (this) {
        is IOException -> true // network I/O errors, including socket timeouts
        is HttpException -> this.code() in setOf(408, 429, 500, 502, 503, 504) // selected HTTP status codes
        else -> false
    }
}

2. 数据库操作重试

import java.sql.SQLException
import java.sql.SQLTransientException

/**
 * Repository that runs work inside a database transaction and retries transient failures.
 */
class DatabaseRepository(
    private val dataSource: DataSource
) {
    /**
     * Runs [block] inside a transaction, committing on success and rolling back on failure,
     * and retries up to [maxRetries] attempts when the failure is a retryable database error.
     *
     * NOTE(review): [block] receives no reference to the connection opened here, so it is
     * unclear how its statements join this transaction — confirm against the callers.
     *
     * @param maxRetries total number of attempts (the first call included).
     * @param block the work to run between `autoCommit = false` and `commit()`.
     * @return the value produced by [block] on the successful attempt.
     * @throws Exception the last failure when it is not retryable or attempts are exhausted.
     * @throws kotlin.coroutines.cancellation.CancellationException immediately (after the
     *   rollback) when the work or the backoff wait is cancelled.
     */
    suspend fun <T> executeTransactionWithRetry(
        maxRetries: Int = 3,
        block: suspend () -> T
    ): T {
        var lastException: Throwable? = null

        for (attempt in 1..maxRetries) {
            try {
                return dataSource.connection.use { connection ->
                    connection.autoCommit = false
                    try {
                        val result = block()
                        connection.commit()
                        result
                    } catch (e: Exception) {
                        // A failing rollback must not hide the error that caused it.
                        try {
                            connection.rollback()
                        } catch (rollbackFailure: Exception) {
                            e.addSuppressed(rollbackFailure)
                        }
                        throw e
                    }
                }
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation is not a database error; stop instead of retrying.
                throw e
            } catch (e: Exception) {
                lastException = e

                if (attempt < maxRetries && e.shouldRetryDatabase()) {
                    val delayMillis = calculateDatabaseRetryDelay(attempt, e)
                    println("数据库操作失败,${delayMillis}ms后重试")
                    delay(delayMillis)

                    // Refresh the pool when the connection itself was the problem.
                    // NOTE(review): refreshConnection() is not part of javax.sql.DataSource;
                    // presumably a project-specific DataSource type — confirm.
                    if (e.isConnectionError()) {
                        dataSource.refreshConnection()
                    }
                } else {
                    break
                }
            }
        }

        throw lastException ?: RuntimeException("数据库操作失败")
    }

    /**
     * Picks the wait before the next attempt based on the kind of failure, capped at 10 seconds:
     * a fixed 100 ms for deadlocks, linear for timeouts, exponential (500 ms base) otherwise.
     */
    private fun calculateDatabaseRetryDelay(attempt: Int, exception: Exception): Long {
        return when {
            exception.isDeadlock() -> 100L
            exception.isTimeout() -> 1000L * attempt
            // Bound the exponent: 500 ms * 2^5 already exceeds the cap, and an unbounded Int
            // shift would wrap around for large attempt numbers.
            else -> 500L shl (attempt - 1).coerceIn(0, 5)
        }.coerceAtMost(10000L)
    }

    /** Inserts [entities] in a single retried transaction. */
    suspend fun batchInsert(entities: List<Entity>) {
        executeTransactionWithRetry {
            entities.forEach { entity ->
                // insert logic goes here
            }
        }
    }
}

// 数据库异常判断扩展
/**
 * Tells whether this exception is a database failure worth retrying.
 *
 * True for any [SQLTransientException], and for an [SQLException] whose SQLSTATE is
 * 40001 (treated as deadlock), 08003 / 08006 / 08007 (treated as connection errors)
 * or 57014 (treated as query timeout). Everything else is not retried.
 */
fun Exception.shouldRetryDatabase(): Boolean {
    if (this is SQLTransientException) return true
    if (this !is SQLException) return false

    return when (sqlState) {
        "40001", "08003", "08006", "08007", "57014" -> true
        else -> false
    }
}

/** True when this is an [SQLException] whose SQLSTATE is 08003, 08006 or 08007. */
fun Exception.isConnectionError(): Boolean {
    val state = (this as? SQLException)?.sqlState ?: return false
    return state == "08003" || state == "08006" || state == "08007"
}

/** True when this is an [SQLException] whose SQLSTATE is 40001. */
fun Exception.isDeadlock(): Boolean =
    (this as? SQLException)?.sqlState == "40001"

/** True when this is an [SQLException] whose SQLSTATE is 57014. */
fun Exception.isTimeout(): Boolean = when (this) {
    is SQLException -> sqlState == "57014"
    else -> false
}

3. 文件操作重试

import java.io.IOException
import java.nio.file.AccessDeniedException
import java.nio.file.FileSystemException

/**
 * File helpers that retry operations failing with presumably transient I/O errors.
 */
class FileOperationManager {
    /**
     * Runs [block] up to [maxRetries] times with a linearly growing wait
     * (`retryDelayMillis * attempt`) between attempts.
     *
     * @param maxRetries total number of attempts (the first call included).
     * @param retryDelayMillis base wait in milliseconds; multiplied by the attempt number.
     * @param block the file operation to attempt.
     * @return the result of the first successful attempt.
     * @throws Exception the last failure when it is not retryable or attempts are exhausted.
     * @throws kotlin.coroutines.cancellation.CancellationException immediately, without
     *   retrying, when the operation or the wait is cancelled.
     */
    suspend fun <T> retryFileOperation(
        maxRetries: Int = 3,
        retryDelayMillis: Long = 1000,
        block: suspend () -> T
    ): T {
        var lastException: Throwable? = null

        for (attempt in 1..maxRetries) {
            try {
                return block()
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation is a request to stop, not an I/O failure to retry.
                throw e
            } catch (e: Exception) {
                lastException = e

                if (attempt < maxRetries && e.shouldRetryFileOperation()) {
                    println("文件操作失败 (尝试 $attempt): ${e.message}")
                    delay(retryDelayMillis * attempt) // linear backoff

                    // Try to release locks before the next attempt.
                    if (e.isFileLocked()) {
                        releaseFileLocks()
                    }
                } else {
                    break
                }
            }
        }

        throw lastException ?: RuntimeException("文件操作失败")
    }

    /** Reads the whole text of [file], with up to five attempts. */
    suspend fun readFileWithRetry(file: File): String {
        return retryFileOperation(maxRetries = 5) {
            file.readText()
        }
    }

    /** Writes [content] to [file], with the default number of attempts. */
    suspend fun writeFileWithRetry(file: File, content: String) {
        retryFileOperation {
            file.writeText(content)
        }
    }

    /**
     * Copies [source] to [target] in chunks of [chunkSize] bytes, making up to three attempts.
     * A partially written [target] is deleted before the next attempt.
     *
     * @throws IllegalArgumentException when [chunkSize] is not positive.
     * @throws Exception the copy failure when it is not retryable or attempts are exhausted.
     */
    suspend fun copyLargeFileWithRetry(
        source: File,
        target: File,
        chunkSize: Int = 8192
    ) {
        require(chunkSize > 0) { "chunkSize must be positive, was $chunkSize" }

        // Flush roughly every 100 chunks. Computed as Long so a large chunkSize cannot overflow.
        val flushThreshold = chunkSize.toLong() * 100
        var attempt = 0
        val maxRetries = 3

        while (attempt < maxRetries) {
            try {
                source.inputStream().use { input ->
                    target.outputStream().use { output ->
                        val buffer = ByteArray(chunkSize)
                        // Count bytes since the last flush instead of testing the running total
                        // for an exact multiple: reads may return fewer bytes than the buffer
                        // holds, so an exact multiple might never be hit.
                        var bytesSinceFlush = 0L

                        while (true) {
                            val bytesRead = input.read(buffer)
                            if (bytesRead == -1) break

                            output.write(buffer, 0, bytesRead)
                            bytesSinceFlush += bytesRead

                            if (bytesSinceFlush >= flushThreshold) {
                                output.flush()
                                bytesSinceFlush = 0L
                            }
                        }
                    }
                }
                return // copied successfully
            } catch (e: Exception) {
                attempt++
                if (attempt >= maxRetries || !e.shouldRetryFileOperation()) {
                    throw e
                }

                println("文件复制失败,第 $attempt 次重试")
                delay(1000L * attempt) // delay() takes a Long, so the multiplication must be Long

                // Remove a possibly corrupt partial copy before trying again.
                if (target.exists()) {
                    target.delete()
                }
            }
        }
    }

    /** Retry only I/O errors that are not permission problems; retrying those cannot help. */
    private fun Exception.shouldRetryFileOperation(): Boolean {
        return when (this) {
            is AccessDeniedException -> false
            is FileSystemException -> this.reason != "Permission denied"
            is IOException -> true
            else -> false
        }
    }

    /** Heuristic: an [IOException] whose message mentions "locked" (case-insensitive). */
    private fun Exception.isFileLocked(): Boolean {
        return this is IOException &&
               this.message?.contains("locked", ignoreCase = true) == true
    }

    private fun releaseFileLocks() {
        // Placeholder: releasing file locks is platform specific.
        // Windows: handle.exe or PowerShell
        // Linux/Mac: lsof + fuser
    }
}

四、高级重试策略

1. 基于响应内容的重试

/**
 * Retry helper that can also retry *successful* calls whose response content asks for it.
 */
class SmartRetryStrategy {
    /**
     * Snapshot handed to the retry predicate after each attempt.
     *
     * @property attempt 1-based number of the attempt that just finished.
     * @property lastException the failure of that attempt, or null when it returned normally.
     * @property lastResponse the value returned by that attempt, or null when it threw.
     */
    data class RetryContext(
        val attempt: Int,
        val lastException: Throwable?,
        val lastResponse: Any?
    )

    /**
     * Runs [block] up to [maxRetries] times, asking [shouldRetry] after every attempt whether
     * the outcome (exception or returned value) warrants another try.
     *
     * When the final attempt returns a value that [shouldRetry] still rejects, that value is
     * returned as is.
     *
     * @param maxRetries total number of attempts (the first call included).
     * @param shouldRetry predicate evaluated with the outcome of each attempt.
     * @param block the operation to attempt.
     * @return the value of the first accepted attempt, or of the final attempt.
     * @throws Exception the last failure when it is not retryable or attempts are exhausted.
     * @throws kotlin.coroutines.cancellation.CancellationException immediately, without
     *   retrying, when the operation or the wait is cancelled.
     */
    suspend fun <T> retryWithResponseInspection(
        maxRetries: Int = 3,
        shouldRetry: suspend (RetryContext) -> Boolean,
        block: suspend () -> T
    ): T {
        var lastException: Throwable? = null

        for (attempt in 1..maxRetries) {
            try {
                val result = block()

                // A call can succeed technically and still carry a "please retry" payload.
                val rejected = shouldRetry(RetryContext(attempt, null, result))
                if (rejected && attempt < maxRetries) {
                    delay(calculateDelay(attempt))
                    continue
                }

                return result
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation is a request to stop, never a reason to retry.
                throw e
            } catch (e: Exception) {
                lastException = e

                if (attempt < maxRetries &&
                    shouldRetry(RetryContext(attempt, e, null))) {
                    delay(calculateDelay(attempt))
                } else {
                    break
                }
            }
        }

        throw lastException ?: RuntimeException("操作失败")
    }

    /**
     * Example: calls the API and retries on retryable exceptions or retryable error codes.
     *
     * NOTE(review): `apiService` is not declared anywhere in this class; it presumably has to
     * be injected (e.g. through the constructor) — confirm before using this snippet.
     */
    suspend fun callApiWithValidation(): ApiResponse {
        return retryWithResponseInspection(
            maxRetries = 3,
            shouldRetry = { context ->
                val failure = context.lastException
                // Safe cast: a response of an unexpected type is simply not retried.
                val response = context.lastResponse as? ApiResponse
                when {
                    failure != null -> failure.shouldRetry()
                    response != null -> response.shouldRetry()
                    else -> false
                }
            }
        ) {
            apiService.getData()
        }
    }

    /** Linear wait of one second per attempt, capped at ten seconds. */
    private fun calculateDelay(attempt: Int): Long {
        return (1000L * attempt).coerceAtMost(10000L)
    }
}

/**
 * Envelope returned by the API.
 *
 * @property success whether the call succeeded.
 * @property data payload of the response, if any.
 * @property errorCode machine-readable error code, if any.
 * @property message human-readable message, if any.
 */
data class ApiResponse(
    val success: Boolean,
    val data: Any?,
    val errorCode: String?,
    val message: String?
) {
    /**
     * True when [errorCode] is one of the codes considered temporary:
     * `RATE_LIMITED`, `TEMPORARY_ERROR` or `MAINTENANCE`.
     */
    fun shouldRetry(): Boolean =
        errorCode == "RATE_LIMITED" ||
            errorCode == "TEMPORARY_ERROR" ||
            errorCode == "MAINTENANCE"
}

2. 并发重试策略

import kotlinx.coroutines.*
import java.util.concurrent.ConcurrentHashMap

/**
 * Retry helpers for concurrent workloads: de-duplicated retries per key, batch retries that
 * re-run only the failed items, and priority-ordered retries.
 */
class ConcurrentRetryManager {
    // Retry currently in flight for each key; an entry is removed when its retry finishes.
    private val activeRetries = ConcurrentHashMap<String, Job>()

    /**
     * Runs [block] with retries while making sure only one retry loop per [key] is in flight.
     *
     * The first caller for a key owns the retry loop and returns its result. A caller that
     * arrives while that loop is running waits for it to finish and then runs [block] once
     * itself, without a retry loop of its own.
     *
     * @param key identifies operations that must not be retried concurrently.
     * @param maxRetries total number of attempts made by the owning retry loop.
     * @param block the operation to run.
     * @return the result of [block].
     */
    suspend fun <T> retryDeduplicated(
        key: String,
        maxRetries: Int = 3,
        block: suspend () -> T
    ): T = coroutineScope {
        // Created lazily so that nothing runs unless this caller wins the registration below.
        val ownRetry = async(start = CoroutineStart.LAZY) {
            retryWithBackoff(maxRetries = maxRetries, block = block)
        }

        // putIfAbsent registers atomically, so two callers can never both become the owner.
        val inFlight = activeRetries.putIfAbsent(key, ownRetry)
        if (inFlight != null) {
            // Someone else owns the retry. Cancel the unused lazy coroutine, otherwise this
            // scope would wait for it forever.
            ownRetry.cancel()
            inFlight.join()
            block()
        } else {
            try {
                // await() starts the lazy coroutine and returns its result directly, so the
                // operation is not executed a second time after a successful retry.
                ownRetry.await()
            } finally {
                activeRetries.remove(key, ownRetry)
            }
        }
    }

    /**
     * Transforms every item, then re-runs only the items that failed, for up to
     * [maxRetriesPerItem] additional rounds with a growing pause between rounds.
     *
     * @param items the inputs to transform.
     * @param concurrency maximum number of items processed at the same time; values below 1
     *   are treated as 1. Items are processed in windows of this size.
     * @param maxRetriesPerItem number of retry rounds after the initial round.
     * @param transform the operation applied to each item.
     * @return the results in the order of [items].
     * @throws Exception the failure of the first item (in input order) that still fails
     *   after all rounds.
     */
    suspend fun <T, R> batchRetry(
        items: List<T>,
        concurrency: Int = 5,
        maxRetriesPerItem: Int = 3,
        transform: suspend (T) -> R
    ): List<R> {
        val window = concurrency.coerceAtLeast(1)
        val outcomes = HashMap<Int, Result<R>>()

        // Initial round over every item.
        var pending = runRound(items, items.indices.toList(), window, transform, outcomes)

        // Retry rounds over whatever is still failing.
        for (retryAttempt in 1..maxRetriesPerItem) {
            if (pending.isEmpty()) break

            println("第 $retryAttempt 轮重试,失败项: ${pending.size}")
            pending = runRound(items, pending, window, transform, outcomes)

            // Pause only when another round will actually follow.
            if (pending.isNotEmpty() && retryAttempt < maxRetriesPerItem) {
                delay(1000L * retryAttempt)
            }
        }

        return items.indices.map { index ->
            // Look the entry up first: a successful null result must not be mistaken for a
            // missing one.
            val outcome = outcomes[index] ?: throw IllegalStateException("Missing result")
            outcome.getOrThrow()
        }
    }

    /**
     * Runs [transform] once for every index in [indices], at most [window] at a time, stores
     * each outcome in [sink] and returns the indices that failed.
     */
    private suspend fun <T, R> runRound(
        items: List<T>,
        indices: List<Int>,
        window: Int,
        transform: suspend (T) -> R,
        sink: MutableMap<Int, Result<R>>
    ): List<Int> = coroutineScope {
        val failed = mutableListOf<Int>()

        for (batch in indices.chunked(window)) {
            val batchOutcomes = batch.map { index ->
                async {
                    try {
                        Result.success(transform(items[index]))
                    } catch (e: CancellationException) {
                        // Let cancellation propagate instead of recording it as an item failure.
                        throw e
                    } catch (e: Exception) {
                        Result.failure<R>(e)
                    }
                }
            }.awaitAll()

            // Outcomes are recorded here, after awaitAll, so no locking is needed.
            batch.forEachIndexed { position, index ->
                val outcome = batchOutcomes[position]
                sink[index] = outcome
                if (outcome.isFailure) failed += index
            }
        }

        failed
    }

    /**
     * Runs the tasks one after another, highest priority first, each with its own retry loop.
     * A task that still fails after its retries is logged and skipped, so the returned list
     * holds only the successful results, in priority order.
     *
     * @param tasks pairs of (priority, task); a larger number means higher priority.
     * @param maxRetries total number of attempts per task.
     * @return results of the tasks that succeeded.
     */
    suspend fun <T> priorityRetry(
        tasks: List<Pair<Int, suspend () -> T>>,
        maxRetries: Int = 3
    ): List<T> {
        val sortedTasks = tasks.sortedByDescending { it.first }
        val results = mutableListOf<T>()

        for ((priority, task) in sortedTasks) {
            try {
                val result = retryWithBackoff(maxRetries) {
                    println("执行优先级 $priority 的任务")
                    task()
                }
                results.add(result)
            } catch (e: CancellationException) {
                // Stop the whole run when cancelled rather than moving on to the next task.
                throw e
            } catch (e: Exception) {
                // A failed task does not prevent the remaining tasks from running.
                println("优先级 $priority 的任务失败: ${e.message}")
            }
        }

        return results
    }
}

3. 熔断器模式配合重试

import kotlinx.coroutines.sync.Mutex
import kotlinx.coroutines.sync.withLock

/**
 * Circuit breaker with three states (closed, open, half-open).
 *
 * While closed, calls pass through and failures are counted; after [failureThreshold] failures
 * without an intervening success the breaker opens and rejects calls. Once
 * [resetTimeoutMillis] has passed since the last recorded failure it becomes half-open and
 * lets calls through again: one success closes it, [halfOpenMaxAttempts] failures reopen it.
 *
 * @param failureThreshold failures (since the last success) that open the breaker.
 * @param resetTimeoutMillis time the breaker stays open before probing, in milliseconds.
 * @param halfOpenMaxAttempts failed probes tolerated in the half-open state.
 */
class CircuitBreaker(
    private val failureThreshold: Int = 5,
    private val resetTimeoutMillis: Long = 60000,
    private val halfOpenMaxAttempts: Int = 3
) {
    // Mutable state below is read and written under `mutex`, except in getStatus().
    private var state: State = State.CLOSED
    private var failureCount = 0
    private var lastFailureTime = 0L
    private var halfOpenAttempts = 0
    private val mutex = Mutex()

    sealed class State {
        object CLOSED : State()     // normal operation
        object OPEN : State()       // calls are rejected
        object HALF_OPEN : State()  // probing whether the service recovered
    }

    /**
     * Runs [block] if the breaker allows it.
     *
     * @return the result of [block].
     * @throws CircuitBreakerOpenException when the breaker is open.
     * @throws Exception whatever [block] throws.
     */
    suspend fun <T> execute(block: suspend () -> T): T {
        // Take the state snapshot inside the lock so the decision below is based on a value
        // that was consistent with the update that has just been applied.
        val observedState = mutex.withLock {
            updateState()
            state
        }

        return when (observedState) {
            State.OPEN -> throw CircuitBreakerOpenException("熔断器开启")
            State.HALF_OPEN -> executeHalfOpen(block)
            State.CLOSED -> executeClosed(block)
        }
    }

    /** Closed state: a success resets the failure count, a failure may open the breaker. */
    private suspend fun <T> executeClosed(block: suspend () -> T): T {
        try {
            val result = block()
            mutex.withLock {
                failureCount = 0
            }
            return result
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // The caller gave up; that says nothing about the health of the service.
            throw e
        } catch (e: Exception) {
            mutex.withLock {
                failureCount++
                lastFailureTime = System.currentTimeMillis()
                if (failureCount >= failureThreshold) {
                    state = State.OPEN
                }
            }
            throw e
        }
    }

    /** Half-open state: a success closes the breaker, repeated failures reopen it. */
    private suspend fun <T> executeHalfOpen(block: suspend () -> T): T {
        try {
            val result = block()
            mutex.withLock {
                state = State.CLOSED
                failureCount = 0
                halfOpenAttempts = 0
            }
            return result
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Cancellation is not a failed probe.
            throw e
        } catch (e: Exception) {
            mutex.withLock {
                halfOpenAttempts++
                if (halfOpenAttempts >= halfOpenMaxAttempts) {
                    state = State.OPEN
                    lastFailureTime = System.currentTimeMillis()
                }
            }
            throw e
        }
    }

    /** Moves OPEN to HALF_OPEN once the reset timeout has elapsed. Call with `mutex` held. */
    private fun updateState() {
        when (state) {
            State.OPEN -> {
                val timeSinceFailure = System.currentTimeMillis() - lastFailureTime
                if (timeSinceFailure >= resetTimeoutMillis) {
                    state = State.HALF_OPEN
                    halfOpenAttempts = 0
                }
            }
            else -> Unit
        }
    }

    /** Diagnostic summary. Reads the fields without taking the lock, so it may be slightly stale. */
    fun getStatus(): String {
        return "State: $state, Failures: $failureCount, LastFailure: $lastFailureTime"
    }
}

/** Thrown by [CircuitBreaker.execute] when the breaker is open and the call is rejected without being run. */
class CircuitBreakerOpenException(message: String) : Exception(message)

// 使用熔断器的重试策略
/**
 * Combines a retry loop with a [CircuitBreaker]: every attempt goes through the breaker, and
 * an open breaker ends the loop immediately.
 */
class ResilientService(
    private val circuitBreaker: CircuitBreaker
) {
    /**
     * Runs [block] through the circuit breaker, retrying retryable failures.
     *
     * @param maxRetries total number of attempts (the first call included).
     * @param block the service call to protect.
     * @return the result of the first successful attempt.
     * @throws CircuitBreakerOpenException as soon as the breaker rejects a call.
     * @throws Exception the last failure when it is not retryable or attempts are exhausted.
     * @throws kotlin.coroutines.cancellation.CancellationException immediately, without
     *   retrying, when the call or the wait is cancelled.
     */
    suspend fun <T> executeWithResilience(
        maxRetries: Int = 3,
        block: suspend () -> T
    ): T {
        var lastException: Throwable? = null

        for (attempt in 1..maxRetries) {
            try {
                return circuitBreaker.execute(block)
            } catch (e: CircuitBreakerOpenException) {
                // Retrying against an open breaker is pointless; fail fast.
                throw e
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation is a request to stop, never a reason to retry.
                throw e
            } catch (e: Exception) {
                lastException = e

                if (attempt < maxRetries && e.shouldRetry()) {
                    // Named waitMillis so it does not shadow the delay() function called next.
                    val waitMillis = calculateDelay(attempt)
                    println("服务调用失败,${waitMillis}ms后重试")
                    delay(waitMillis)
                } else {
                    break
                }
            }
        }

        throw lastException ?: RuntimeException("服务调用失败")
    }

    /** Linear wait of one second per attempt, capped at ten seconds. */
    private fun calculateDelay(attempt: Int): Long {
        return (1000L * attempt).coerceAtMost(10000L)
    }

    /** Example call that fails about 40 % of the time to simulate an unreliable service. */
    suspend fun callService(): String {
        return executeWithResilience {
            if (Random.nextDouble() < 0.4) {
                throw IOException("服务暂时不可用")
            }
            "服务响应"
        }
    }
}

五、测试重试逻辑

1. 单元测试重试

import kotlinx.coroutines.test.runTest
import kotlin.test.Test
import kotlin.test.assertEquals
import kotlin.test.assertFailsWith

/**
 * Unit tests for the retry helpers. All tests run under `runTest`, so `delay` calls are
 * skipped and only advance the scheduler's virtual clock.
 */
class RetryTest {
    /** Two failures followed by a success: the result is returned after three attempts. */
    @Test
    fun testRetrySuccess() = runTest {
        var attempts = 0

        val result = retryWithBackoff(maxRetries = 3) {
            attempts++
            if (attempts < 3) {
                throw IOException("模拟失败")
            }
            "成功"
        }

        assertEquals("成功", result)
        assertEquals(3, attempts)
    }

    /** A block that always fails is attempted exactly maxRetries times, then rethrown. */
    @Test
    fun testRetryFailure() = runTest {
        var attempts = 0

        assertFailsWith<IOException> {
            retryWithBackoff(maxRetries = 3) {
                attempts++
                throw IOException("总是失败")
            }
        }

        assertEquals(3, attempts)
    }

    /**
     * Conditional retry: only IOException is retryable, so the IllegalArgumentException thrown
     * on the second collection must surface without further retries.
     *
     * retryWithBackoff has no predicate parameter, so the condition is exercised through
     * Flow.retryExponentialBackoff, whose `shouldRetry` argument provides it.
     */
    @Test
    fun testRetryWithCondition() = runTest {
        var attempts = 0

        val source = kotlinx.coroutines.flow.flow<String> {
            attempts++
            when (attempts) {
                1 -> throw IOException("可重试错误")
                2 -> throw IllegalArgumentException("不可重试错误")
                else -> emit("成功")
            }
        }

        assertFailsWith<IllegalArgumentException> {
            source
                .retryExponentialBackoff(maxRetries = 5, initialDelayMillis = 10) { e ->
                    e is IOException // only IOException is retried
                }
                .collect { }
        }

        // One retry after the IOException, then the non-retryable failure stops the flow.
        assertEquals(2, attempts)
    }

    /**
     * Verifies the backoff schedule. Timestamps come from the test scheduler's virtual clock:
     * under runTest the delays are skipped, so wall-clock time would show ~0 ms between attempts.
     */
    @OptIn(kotlinx.coroutines.ExperimentalCoroutinesApi::class)
    @Test
    fun testExponentialBackoff() = runTest {
        val attemptTimes = mutableListOf<Long>()

        try {
            retryWithBackoff(
                maxRetries = 4,
                initialDelayMillis = 100,
                factor = 2.0
            ) {
                attemptTimes.add(testScheduler.currentTime)
                throw IOException("失败")
            }
        } catch (e: IOException) {
            // expected: every attempt fails
        }

        // Waits of 100, 200 and 400 ms put the four attempts at 0, 100, 300 and 700 ms.
        assertEquals(listOf(0L, 100L, 300L, 700L), attemptTimes)
    }
}

2. 模拟不稳定服务进行测试

/**
 * Test double for a flaky service driven by a scripted outcome list.
 *
 * @param failurePattern outcome of each successive call: `true` fails, `false` succeeds.
 *   Calls beyond the end of the pattern always succeed.
 */
class UnstableServiceSimulator(
    private val failurePattern: List<Boolean>
) {
    private var callCount = 0

    /**
     * Plays the next scripted outcome.
     *
     * @return a success message carrying the 1-based call number, or "稳定服务" once the
     *   pattern is exhausted.
     * @throws IOException when the scripted outcome for this call is a failure.
     */
    suspend fun call(): String {
        // Past the end of the script the service behaves as stable and stops counting calls.
        val shouldFail = failurePattern.getOrNull(callCount) ?: return "稳定服务"
        callCount += 1

        return if (shouldFail) {
            throw IOException("服务调用 $callCount 失败")
        } else {
            "服务调用 $callCount 成功"
        }
    }

    /** Rewinds the script to its first entry. */
    fun reset() {
        callCount = 0
    }
}

/**
 * Integration-style tests combining the retry helpers with the simulator and circuit breaker.
 */
class RetryIntegrationTest {
    /** Exercises retryWithBackoff against three scripted failure patterns. */
    @Test
    fun testRetryPatterns() = runTest {
        // Case 1: one failure, then success.
        val simulator1 = UnstableServiceSimulator(listOf(true, false))
        val result1 = retryWithBackoff(maxRetries = 3) {
            simulator1.call()
        }
        assertEquals("服务调用 2 成功", result1)

        // Case 2: three consecutive failures, success on the fourth attempt.
        val simulator2 = UnstableServiceSimulator(listOf(true, true, true, false))
        val result2 = retryWithBackoff(maxRetries = 4) {
            simulator2.call()
        }
        assertEquals("服务调用 4 成功", result2)

        // Case 3: more failures than allowed attempts.
        val simulator3 = UnstableServiceSimulator(listOf(true, true, true, true, false))
        assertFailsWith<IOException> {
            retryWithBackoff(maxRetries = 3) {
                simulator3.call()
            }
        }
    }

    /** The breaker opens after repeated failures and lets calls through again after its timeout. */
    @Test
    fun testCircuitBreakerWithRetry() = runTest {
        val circuitBreaker = CircuitBreaker(
            failureThreshold = 2,
            resetTimeoutMillis = 200
        )
        val service = ResilientService(circuitBreaker)

        // A service that always fails, to trip the breaker quickly.
        val failingService = object {
            var callCount = 0
            suspend fun call(): String {
                callCount++
                throw IOException("服务失败")
            }
        }

        // Two failed attempts reach the failure threshold and open the breaker.
        assertFailsWith<IOException> {
            service.executeWithResilience(maxRetries = 2) {
                failingService.call()
            }
        }
        assertEquals(2, failingService.callCount)

        // While open, calls are rejected without reaching the service.
        assertFailsWith<CircuitBreakerOpenException> {
            service.executeWithResilience(maxRetries = 1) {
                failingService.call()
            }
        }
        assertEquals(2, failingService.callCount)

        // Wait for the reset timeout in REAL time: the breaker reads System.currentTimeMillis(),
        // which runTest's virtual-time delay() does not advance.
        kotlinx.coroutines.withContext(kotlinx.coroutines.Dispatchers.Default) {
            kotlinx.coroutines.delay(300)
        }

        // Half-open now: the call reaches the service again and fails with its own error.
        failingService.callCount = 0
        assertFailsWith<IOException> {
            service.executeWithResilience(maxRetries = 1) {
                failingService.call()
            }
        }
        assertEquals(1, failingService.callCount)
    }
}

六、最佳实践

1. 重试策略选择指南

/**
 * Catalogue of retry strategies per kind of operation, plus simple retry metrics.
 */
object RetryStrategyGuide {
    /**
     * Picks the retry strategy matching [operationType].
     *
     * NOTE(review): the type parameter `T` and [context] are not used by the selection, and
     * `RetryContext` is not declared inside this object — confirm which type is intended.
     */
    fun <T> chooseStrategy(
        operationType: OperationType,
        context: RetryContext
    ): RetryStrategy {
        return when (operationType) {
            OperationType.NETWORK_REQUEST -> NetworkRetryStrategy()
            OperationType.DATABASE_OPERATION -> DatabaseRetryStrategy()
            OperationType.FILE_OPERATION -> FileRetryStrategy()
            OperationType.EXTERNAL_API_CALL -> ExternalApiRetryStrategy()
        }
    }

    /** Strategy for network requests. */
    class NetworkRetryStrategy : RetryStrategy {
        override val maxRetries = 3
        override val initialDelay = 1000L
        override val maxDelay = 10000L
        override val factor = 2.0

        override fun shouldRetry(exception: Throwable): Boolean {
            return when (exception) {
                // Covers SocketTimeoutException as well, which is an IOException subclass.
                is IOException -> true
                is HttpException -> exception.code() in setOf(408, 429, 500, 502, 503, 504)
                else -> false
            }
        }
    }

    /** Strategy for database operations: more attempts with short, slowly growing waits. */
    class DatabaseRetryStrategy : RetryStrategy {
        override val maxRetries = 5
        override val initialDelay = 100L
        override val maxDelay = 2000L
        override val factor = 1.5

        override fun shouldRetry(exception: Throwable): Boolean {
            // java.sql.SQLException has no isTransient(); reuse the file's shouldRetryDatabase()
            // classification, which accepts SQLTransientException and selected SQLSTATE codes.
            return exception is SQLException && exception.shouldRetryDatabase()
        }
    }

    /**
     * Strategy for file operations. The values mirror the defaults of
     * `FileOperationManager.retryFileOperation` (three attempts, one-second base wait);
     * NOTE(review): placeholder tuning — adjust to the real workload.
     */
    class FileRetryStrategy : RetryStrategy {
        override val maxRetries = 3
        override val initialDelay = 1000L
        override val maxDelay = 10000L
        override val factor = 1.0

        override fun shouldRetry(exception: Throwable): Boolean {
            // Permission problems do not go away by retrying.
            return exception is IOException &&
                   exception !is java.nio.file.AccessDeniedException
        }
    }

    /**
     * Strategy for calls to external APIs. Uses the same error classification as
     * [NetworkRetryStrategy]; NOTE(review): placeholder tuning — adjust to the provider's
     * rate limits.
     */
    class ExternalApiRetryStrategy : RetryStrategy {
        override val maxRetries = 3
        override val initialDelay = 1000L
        override val maxDelay = 10000L
        override val factor = 2.0

        override fun shouldRetry(exception: Throwable): Boolean {
            return when (exception) {
                is IOException -> true
                is HttpException -> exception.code() in setOf(408, 429, 500, 502, 503, 504)
                else -> false
            }
        }
    }

    /**
     * Collects retry metrics per operation name.
     * Backed by a plain mutable map with no synchronization.
     */
    class RetryMonitor {
        private val metrics = mutableMapOf<String, RetryMetrics>()

        /**
         * Records the outcome of one attempt.
         *
         * @param operation name of the operation the attempt belongs to.
         * @param attempt 1-based attempt number; anything above 1 counts as a retry.
         * @param success whether this attempt succeeded.
         */
        fun recordAttempt(operation: String, attempt: Int, success: Boolean) {
            val metric = metrics.getOrPut(operation) { RetryMetrics() }
            metric.totalAttempts++
            if (attempt > 1) metric.retryAttempts++
            if (success) metric.successfulOperations++ else metric.failedOperations++
        }

        /** Returns the metrics recorded for [operation], or null when none were recorded. */
        fun getMetrics(operation: String): RetryMetrics? {
            return metrics[operation]
        }

        /** Prints one summary block per recorded operation. */
        fun printReport() {
            metrics.forEach { (operation, metric) ->
                println("""
                    |Operation: $operation
                    |  Total Attempts: ${metric.totalAttempts}
                    |  Retry Attempts: ${metric.retryAttempts}
                    |  Success Rate: ${metric.successRate}%
                    |  Retry Rate: ${metric.retryRate}%
                """.trimMargin())
            }
        }
    }

    /**
     * Counters for one operation. Both rates are percentages of [totalAttempts]
     * and are 0.0 while nothing has been recorded.
     */
    data class RetryMetrics(
        var totalAttempts: Long = 0,
        var retryAttempts: Long = 0,
        var successfulOperations: Long = 0,
        var failedOperations: Long = 0
    ) {
        val successRate: Double
            get() = if (totalAttempts > 0) successfulOperations.toDouble() / totalAttempts * 100 else 0.0

        val retryRate: Double
            get() = if (totalAttempts > 0) retryAttempts.toDouble() / totalAttempts * 100 else 0.0
    }

    /** Kinds of operation a strategy can be chosen for. */
    enum class OperationType {
        NETWORK_REQUEST,
        DATABASE_OPERATION,
        FILE_OPERATION,
        EXTERNAL_API_CALL
    }

    /** Parameters and failure classification of a retry policy. */
    interface RetryStrategy {
        val maxRetries: Int
        val initialDelay: Long
        val maxDelay: Long
        val factor: Double
        fun shouldRetry(exception: Throwable): Boolean
    }
}

2. 重试配置化

/**
 * Retry settings for one operation, as consumed by `ConfigurableRetryManager`.
 *
 * @property maxRetries upper bound of the attempt loop (`1..maxRetries`), i.e. total attempts.
 * @property initialDelayMillis base wait in milliseconds, used for the first retry.
 * @property maxDelayMillis cap applied to the computed wait before jitter is added.
 * @property factor multiplier applied to the wait for each further attempt.
 * @property jitter whether a random offset is added to the computed wait.
 * @property jitterFactor fraction of the computed wait used as the jitter range.
 * @property retryableExceptions class names matched against the thrown exception's
 *   qualified name to decide whether it is retried.
 */
data class RetryConfig(
    val maxRetries: Int = 3,
    val initialDelayMillis: Long = 1000,
    val maxDelayMillis: Long = 10000,
    val factor: Double = 2.0,
    val jitter: Boolean = true,
    val jitterFactor: Double = 0.1,
    val retryableExceptions: Set<String> = setOf(
        "java.io.IOException",
        "java.net.SocketTimeoutException",
        "retrofit2.HttpException"
    )
)

/**
 * Retry executor whose behaviour is driven by a per-operation [RetryConfig].
 */
class ConfigurableRetryManager(
    private val configProvider: RetryConfigProvider
) {
    /**
     * Runs [block] using the retry settings registered for [operationName].
     *
     * @param operationName key used to look up the configuration; also used in log output.
     * @param block the operation to attempt.
     * @return the result of the first successful attempt.
     * @throws Exception the last failure when it is not retryable or attempts are exhausted.
     * @throws kotlin.coroutines.cancellation.CancellationException immediately, without
     *   retrying, when the operation or the wait is cancelled.
     */
    suspend fun <T> executeWithRetry(
        operationName: String,
        block: suspend () -> T
    ): T {
        val config = configProvider.getConfig(operationName)
        var lastException: Throwable? = null

        for (attempt in 1..config.maxRetries) {
            try {
                return block()
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation is a request to stop, never a reason to retry.
                throw e
            } catch (e: Exception) {
                lastException = e

                if (attempt < config.maxRetries && shouldRetry(e, config)) {
                    // Named delayMillis so it does not shadow the delay() function called next.
                    val delayMillis = calculateDelay(attempt, config)
                    println("$operationName 第 $attempt 次重试,等待 ${delayMillis}ms")
                    delay(delayMillis)
                } else {
                    break
                }
            }
        }

        throw lastException ?: RuntimeException("$operationName 失败")
    }

    /**
     * True when the exception's class, or any of its superclasses, has a name containing one
     * of the configured names. Walking the hierarchy lets a configured base class such as
     * `java.io.IOException` also cover its subclasses; substring matching keeps short names
     * such as `IOException` working.
     */
    private fun shouldRetry(exception: Throwable, config: RetryConfig): Boolean {
        val classNames = generateSequence<Class<*>>(exception.javaClass) { it.superclass }
            .flatMap { type -> sequenceOf(type.name, type.kotlin.qualifiedName) }
            .filterNotNull()

        return classNames.any { actualName ->
            config.retryableExceptions.any { configuredName ->
                actualName.contains(configuredName)
            }
        }
    }

    /**
     * Exponential wait for the given attempt, capped at `maxDelayMillis`, with an optional
     * random offset of up to `jitterFactor` of the wait in either direction.
     */
    private fun calculateDelay(attempt: Int, config: RetryConfig): Long {
        val baseDelay = (config.initialDelayMillis * Math.pow(config.factor, (attempt - 1).toDouble()))
            .toLong()
            .coerceAtMost(config.maxDelayMillis)

        if (!config.jitter) return baseDelay

        val jitterRange = (baseDelay * config.jitterFactor).toLong()
        // Random.nextLong(from, until) requires until > from, so a zero (or negative) range
        // must bypass it; this happens for small waits such as 5 ms * 0.1.
        if (jitterRange <= 0) return baseDelay

        return (baseDelay + Random.nextLong(-jitterRange, jitterRange)).coerceAtLeast(0L)
    }
}

/** Source of [RetryConfig] instances, looked up by operation name. */
interface RetryConfigProvider {
    /** Returns the retry configuration to use for [operationName]. */
    fun getConfig(operationName: String): RetryConfig
}

/**
 * [RetryConfigProvider] backed by a fixed in-memory table of named configurations.
 * The table is hard-coded here; nothing in this class parses JSON.
 */
class JsonRetryConfigProvider : RetryConfigProvider {
    private val configs: Map<String, RetryConfig> = run {
        val userApi = RetryConfig(
            maxRetries = 3,
            initialDelayMillis = 1000,
            retryableExceptions = setOf("IOException", "HttpException")
        )
        val paymentApi = RetryConfig(
            maxRetries = 5,
            initialDelayMillis = 2000,
            factor = 1.5,
            jitter = true
        )
        val fileUpload = RetryConfig(
            maxRetries = 10,
            initialDelayMillis = 500,
            maxDelayMillis = 30000,
            factor = 3.0
        )

        mapOf(
            "userApi" to userApi,
            "paymentApi" to paymentApi,
            "fileUpload" to fileUpload
        )
    }

    /** Returns the configuration registered for [operationName], or the defaults when unknown. */
    override fun getConfig(operationName: String): RetryConfig =
        configs.getOrElse(operationName) { RetryConfig() }
}

总结

Kotlin 重试机制的最佳实践:

核心策略:

  1. 指数退避:避免雪崩效应
  2. 随机抖动:防止惊群效应
  3. 熔断器模式:保护下游服务
  4. 条件重试:只重试可恢复的错误

实现要点:

  1. 网络请求:重试超时、连接错误、特定HTTP状态码
  2. 数据库操作:重试死锁、连接超时、临时错误
  3. 文件操作:重试文件锁、临时IO错误
  4. 外部API:重试限流、服务不可用

监控和调优:

  1. 记录重试指标:成功率、重试率、延迟分布
  2. 动态调整参数:基于历史表现调整重试策略
  3. A/B测试:比较不同策略的效果

注意事项:

  1. 幂等性:确保重试操作是幂等的
  2. 资源消耗:避免无限重试消耗资源
  3. 用户体验:及时反馈失败,而不是无休止重试
  4. 依赖服务:考虑下游服务的承受能力

选择合适的重试策略时,需要权衡:

  • 成功率的提升 vs 延迟的增加
  • 用户体验 vs 系统资源
  • 业务重要性 vs 实现复杂度