7.1 重大突破!权限系统高可用方案原来是这样设计的?

2 阅读9分钟

重大突破!权限系统高可用方案原来是这样设计的?

在构建高可用的权限系统时,我们需要考虑多个层面的容错和故障恢复机制。权限系统作为整个业务系统的核心组件,一旦出现故障可能导致整个系统瘫痪,因此设计一个高可用的权限系统至关重要。

1. 权限系统高可用架构设计

权限系统的高可用设计需要从多个维度考虑,包括数据存储、服务部署、缓存策略、故障恢复等。

1.1 数据存储高可用

权限数据的存储是整个权限系统的核心,我们需要保证数据的高可用性和一致性。

// PermissionDatabase describes how to reach the permission store: one primary
// DSN, any number of replica DSNs, and the pool limits that NewPermissionDB
// applies to every pool it opens.
type PermissionDatabase struct {
    // PrimaryDSN is the data source name of the primary database.
    PrimaryDSN string
    // ReplicaDSNs lists the data source names of the replicas; it may be empty.
    ReplicaDSNs []string
    // MaxIdleConns is passed to (*sql.DB).SetMaxIdleConns on each pool.
    MaxIdleConns int
    // MaxOpenConns is passed to (*sql.DB).SetMaxOpenConns on each pool.
    MaxOpenConns int
    // ConnMaxLifetime is passed to (*sql.DB).SetConnMaxLifetime on each pool.
    ConnMaxLifetime time.Duration
}

// PermissionDB bundles the primary connection pool with the replica pools.
type PermissionDB struct {
    // master is the pool opened from PermissionDatabase.PrimaryDSN.
    master *sql.DB
    // replicas holds one pool per configured replica DSN; it may be empty.
    replicas []*sql.DB
    // mutex is read-locked by GetReplica while it inspects replicas.
    mutex sync.RWMutex
}

// NewPermissionDB opens and pings the primary pool and one pool per replica
// DSN, applying the pool limits from config to each of them.
//
// It returns an error when config is nil or when any pool cannot be opened or
// pinged. On failure every pool that was already opened is closed again, so a
// failed call does not leak connections.
func NewPermissionDB(config *PermissionDatabase) (*PermissionDB, error) {
    if config == nil {
        return nil, fmt.Errorf("permission database config is nil")
    }

    master, err := openPermissionPool(config, config.PrimaryDSN, "master")
    if err != nil {
        return nil, err
    }

    db := &PermissionDB{master: master}
    if len(config.ReplicaDSNs) > 0 {
        db.replicas = make([]*sql.DB, 0, len(config.ReplicaDSNs))
    }

    for _, dsn := range config.ReplicaDSNs {
        replica, err := openPermissionPool(config, dsn, "replica")
        if err != nil {
            // Best-effort cleanup: the open/ping error is the one worth
            // reporting, so Close errors are deliberately dropped.
            for _, opened := range db.replicas {
                _ = opened.Close()
            }
            _ = master.Close()
            return nil, err
        }
        db.replicas = append(db.replicas, replica)
    }

    return db, nil
}

// openPermissionPool opens one "mysql" pool for dsn, applies the pool limits
// from config and verifies connectivity with Ping. role ("master" or
// "replica") only appears in error messages. The pool is closed before
// returning when the ping fails.
func openPermissionPool(config *PermissionDatabase, dsn, role string) (*sql.DB, error) {
    pool, err := sql.Open("mysql", dsn)
    if err != nil {
        return nil, fmt.Errorf("failed to connect to %s database: %w", role, err)
    }

    pool.SetMaxIdleConns(config.MaxIdleConns)
    pool.SetMaxOpenConns(config.MaxOpenConns)
    pool.SetConnMaxLifetime(config.ConnMaxLifetime)

    if err := pool.Ping(); err != nil {
        _ = pool.Close() // do not leak the pool that failed its health check
        return nil, fmt.Errorf("failed to ping %s database: %w", role, err)
    }
    return pool, nil
}

// GetMaster returns the connection pool that was opened for the primary
// database. It performs no locking and no health check.
func (p *PermissionDB) GetMaster() *sql.DB {
    return p.master
}

// GetReplica returns a replica pool chosen uniformly at random, or the primary
// pool when no replicas are configured.
func (p *PermissionDB) GetReplica() *sql.DB {
    p.mutex.RLock()
    defer p.mutex.RUnlock()

    if len(p.replicas) == 0 {
        return p.master
    }

    // Random selection (not round-robin) spreads reads across the replicas.
    // The shared generator is intentionally not reseeded here: reseeding on
    // every call is wasted work on the hot path, and rand.Seed is deprecated
    // since Go 1.20, where the generator is seeded randomly at startup.
    return p.replicas[rand.Intn(len(p.replicas))]
}

1.2 缓存高可用设计

缓存是提升权限系统性能的关键,同时也需要保证其高可用性。

// CacheCluster carries the settings used to build the Redis cluster client
// behind PermissionCache.
type CacheCluster struct {
    // RedisAddrs lists the cluster node addresses.
    RedisAddrs []string
    // Password is the credential handed to the cluster client.
    Password string
    // DB is a database index.
    // NOTE(review): NewPermissionCache never passes this field to the client;
    // confirm whether it is still needed.
    DB int
    // PoolSize is forwarded as the client's connection pool size.
    PoolSize int
    // Timeout is used as the dial, read and write timeout of the client.
    Timeout time.Duration
}

// PermissionCache is a thin wrapper around a Redis cluster client that stores
// permission decisions as strings.
type PermissionCache struct {
    // client executes the Get/Set/Del commands.
    client *redis.ClusterClient
    // timeout is copied from CacheCluster.Timeout by NewPermissionCache; none
    // of the methods in this file read it.
    timeout time.Duration
}

// NewPermissionCache builds a PermissionCache backed by a Redis cluster client
// configured from config. config.Timeout is used for the dial, read and write
// timeouts alike. No connection is verified here.
func NewPermissionCache(config *CacheCluster) *PermissionCache {
    options := &redis.ClusterOptions{
        Addrs:        config.RedisAddrs,
        Password:     config.Password,
        PoolSize:     config.PoolSize,
        DialTimeout:  config.Timeout,
        ReadTimeout:  config.Timeout,
        WriteTimeout: config.Timeout,
    }

    cache := &PermissionCache{timeout: config.Timeout}
    cache.client = redis.NewClusterClient(options)
    return cache
}

// GetPermission looks key up in the cache.
//
// A missing key is not an error: it yields ("", nil), so callers cannot tell a
// miss apart from a stored empty string. Any other client error is wrapped
// and returned.
func (pc *PermissionCache) GetPermission(ctx context.Context, key string) (string, error) {
    value, err := pc.client.Get(ctx, key).Result()
    switch {
    case err == redis.Nil:
        // Cache miss.
        return "", nil
    case err != nil:
        return "", fmt.Errorf("failed to get permission from cache: %w", err)
    default:
        return value, nil
    }
}

// SetPermission stores value under key with the given expiration and wraps
// any error reported by the client.
func (pc *PermissionCache) SetPermission(ctx context.Context, key, value string, expiration time.Duration) error {
    if err := pc.client.Set(ctx, key, value, expiration).Err(); err != nil {
        return fmt.Errorf("failed to set permission to cache: %w", err)
    }
    return nil
}

// DeletePermission removes key from the cache and wraps any error reported by
// the client.
func (pc *PermissionCache) DeletePermission(ctx context.Context, key string) error {
    if err := pc.client.Del(ctx, key).Err(); err != nil {
        return fmt.Errorf("failed to delete permission from cache: %w", err)
    }
    return nil
}

2. 服务容错与降级策略

在权限系统出现故障时,我们需要有相应的容错和降级策略来保证系统的可用性。

2.1 熔断机制

// CircuitBreakerConfig holds the tuning parameters of a CircuitBreaker.
type CircuitBreakerConfig struct {
    FailureThreshold int           // consecutive failures in the closed state that open the breaker
    SuccessThreshold int           // successful half-open probes required to close the breaker again
    Timeout          time.Duration // how long the breaker stays open before a probe is admitted
    HalfOpenMaxCalls int           // upper bound on probes admitted while half-open
}

// CircuitBreakerState enumerates the states of a CircuitBreaker.
type CircuitBreakerState int

const (
    // StateClosed lets every call through and counts consecutive failures.
    StateClosed CircuitBreakerState = iota
    // StateOpen rejects calls until Timeout has elapsed since the last failure.
    StateOpen
    // StateHalfOpen admits a limited number of probe calls.
    StateHalfOpen
)

// CircuitBreaker guards an operation with closed/open/half-open state.
//
// All fields are protected by mutex. The mutex is never held while the
// protected operation itself runs.
type CircuitBreaker struct {
    config      CircuitBreakerConfig
    state       CircuitBreakerState
    failures    int
    successes   int
    inFlight    int    // half-open probes that were admitted and have not finished yet
    generation  uint64 // bumped on every state change; used to discard stale results
    lastFailure time.Time
    mutex       sync.Mutex
}

// NewCircuitBreaker returns a breaker in the closed state using config.
func NewCircuitBreaker(config CircuitBreakerConfig) *CircuitBreaker {
    return &CircuitBreaker{
        config: config,
        state:  StateClosed,
    }
}

// Execute runs fn under the protection of the breaker and returns fn's error.
//
// When the breaker is open, or half-open with all probe slots taken, fn is not
// called and a descriptive error is returned instead. The lock is released
// while fn runs, so slow calls do not serialize other callers and fn may
// itself call Execute. A panic in fn is recorded as a failure and then
// propagates to the caller.
func (cb *CircuitBreaker) Execute(fn func() error) error {
    generation, err := cb.admit()
    if err != nil {
        return err
    }

    finished := false
    defer func() {
        if !finished {
            // fn panicked: release its probe slot and count the failure.
            cb.record(generation, errors.New("protected call panicked"))
        }
    }()

    err = fn()
    finished = true
    cb.record(generation, err)
    return err
}

// admit decides whether a call may proceed. On success it returns the current
// generation, which the caller hands back to record.
func (cb *CircuitBreaker) admit() (uint64, error) {
    cb.mutex.Lock()
    defer cb.mutex.Unlock()

    switch cb.state {
    case StateOpen:
        if time.Since(cb.lastFailure) < cb.config.Timeout {
            return 0, errors.New("circuit breaker is open")
        }
        // The cool-down has elapsed: this call becomes the first probe.
        cb.transition(StateHalfOpen)
        cb.inFlight = 1
    case StateHalfOpen:
        // Count running probes as well as finished ones, otherwise concurrent
        // callers could exceed the probe limit.
        if cb.successes+cb.failures+cb.inFlight >= cb.config.HalfOpenMaxCalls {
            return 0, errors.New("circuit breaker is half-open, max calls reached")
        }
        cb.inFlight++
    }
    return cb.generation, nil
}

// record applies the outcome of a call admitted in the given generation.
// Results from an earlier generation are dropped: the breaker has changed
// state since the call started, so the result says nothing about the current
// state.
func (cb *CircuitBreaker) record(generation uint64, err error) {
    cb.mutex.Lock()
    defer cb.mutex.Unlock()

    if generation != cb.generation {
        return
    }
    if cb.state == StateHalfOpen {
        cb.inFlight--
    }
    if err != nil {
        cb.onFailure()
        return
    }
    cb.onSuccess()
}

// transition switches to next, starts a new generation and clears the
// per-state counters. The caller must hold mutex.
func (cb *CircuitBreaker) transition(next CircuitBreakerState) {
    cb.state = next
    cb.generation++
    cb.failures = 0
    cb.successes = 0
    cb.inFlight = 0
}

// closeThreshold returns the number of successful probes needed to close the
// breaker. It is capped by the number of probes that can ever be admitted (at
// least one, because the probe that triggers half-open is always admitted);
// without the cap a SuccessThreshold above HalfOpenMaxCalls would leave the
// breaker stuck in half-open forever.
func (cb *CircuitBreaker) closeThreshold() int {
    limit := cb.config.HalfOpenMaxCalls
    if limit < 1 {
        limit = 1
    }
    if cb.config.SuccessThreshold < limit {
        return cb.config.SuccessThreshold
    }
    return limit
}

// onFailure records a failed call. Any failure while half-open, or reaching
// FailureThreshold while closed, opens the breaker. The caller must hold mutex.
func (cb *CircuitBreaker) onFailure() {
    cb.failures++
    cb.lastFailure = time.Now()

    if cb.state == StateHalfOpen || cb.failures >= cb.config.FailureThreshold {
        cb.transition(StateOpen)
    }
}

// onSuccess records a successful call. While half-open it counts towards
// closing the breaker; while closed it resets the consecutive-failure count.
// The caller must hold mutex.
func (cb *CircuitBreaker) onSuccess() {
    switch cb.state {
    case StateHalfOpen:
        cb.successes++
        if cb.successes >= cb.closeThreshold() {
            cb.transition(StateClosed)
            cb.lastFailure = time.Time{}
        }
    case StateClosed:
        cb.failures = 0
    }
}

2.2 降级策略

// DegradationStrategy selects how PermissionService.CheckPermission answers
// requests while the service is degraded.
type DegradationStrategy int

const (
    // StrategyNone is the zero value and means "not degraded".
    // CheckPermission has no special case for it, so requests take the normal
    // circuit-breaker-protected path. Without it a zero-valued
    // PermissionService denied every request, and no defined strategy could
    // reach the normal path.
    //
    // NOTE(review): inserting this constant shifts the numeric values of the
    // strategies below by one; confirm that no caller persists or transmits
    // the raw integers.
    StrategyNone DegradationStrategy = iota
    // StrategyDenyAll rejects every request.
    StrategyDenyAll
    // StrategyAllowSuperUsers only lets super users through.
    StrategyAllowSuperUsers
    // StrategyAllowCached answers from the cache only.
    StrategyAllowCached
    // StrategyAllowAll lets every request through without a permission check.
    StrategyAllowAll
)

// PermissionService answers permission checks using the database, the cache
// and a circuit breaker, subject to the current degradation strategy.
type PermissionService struct {
    // db supplies the replica pool queried on a cache miss.
    db *PermissionDB
    // cache stores "allow"/"deny" decisions per user, resource and action.
    cache *PermissionCache
    // cb wraps the regular (non-degraded) permission check.
    cb *CircuitBreaker
    // strategy is the active degradation strategy; guarded by mutex.
    strategy DegradationStrategy
    // mutex protects strategy.
    mutex sync.RWMutex
}

// CheckPermission reports whether userID may perform action on resource.
//
// The active degradation strategy is consulted first. The four strategies
// handled below each return directly; only a strategy value matching none of
// them reaches the regular check, which runs inside the circuit breaker.
// Errors from the regular check are wrapped with "permission check failed".
func (ps *PermissionService) CheckPermission(ctx context.Context, userID, resource, action string) (bool, error) {
    // Snapshot the strategy so the lock is not held during the check itself.
    ps.mutex.RLock()
    active := ps.strategy
    ps.mutex.RUnlock()

    if active == StrategyDenyAll {
        return false, errors.New("service is degraded, all requests denied")
    }
    if active == StrategyAllowSuperUsers {
        if !ps.isSuperUser(userID) {
            return false, errors.New("service is degraded, only super users allowed")
        }
        return true, nil
    }
    if active == StrategyAllowCached {
        // Cache-only answer; the database is not touched.
        return ps.checkPermissionFromCache(ctx, userID, resource, action)
    }
    if active == StrategyAllowAll {
        // Every request passes without any permission check.
        return true, nil
    }

    // Regular path, protected by the circuit breaker.
    var granted bool
    guarded := func() error {
        result, checkErr := ps.checkPermissionInternal(ctx, userID, resource, action)
        granted = result
        return checkErr
    }
    if err := ps.cb.Execute(guarded); err != nil {
        return false, fmt.Errorf("permission check failed: %w", err)
    }
    return granted, nil
}

// checkPermissionInternal performs the regular permission check: cache first,
// then a replica database query, then a best-effort cache refresh.
//
// A cache read error is treated like a miss and falls through to the
// database. A database error is wrapped and returned.
func (ps *PermissionService) checkPermissionInternal(ctx context.Context, userID, resource, action string) (bool, error) {
    // NOTE(review): the key joins the parts with ':' without escaping, so
    // values that themselves contain ':' could map to the same key; confirm
    // the identifiers cannot contain it.
    cacheKey := "permission:" + userID + ":" + resource + ":" + action

    cached, cacheErr := ps.cache.GetPermission(ctx, cacheKey)
    if cacheErr == nil && cached != "" {
        return cached == "allow", nil
    }

    // Miss (or cache failure): ask a replica.
    query := `
        SELECT COUNT(*) > 0 
        FROM user_permissions up
        JOIN permissions p ON up.permission_id = p.id
        WHERE up.user_id = ? AND p.resource = ? AND p.action = ?
    `

    var granted bool
    row := ps.db.GetReplica().QueryRowContext(ctx, query, userID, resource, action)
    if err := row.Scan(&granted); err != nil {
        return false, fmt.Errorf("failed to query permission from database: %w", err)
    }

    // Remember the decision for five minutes. The write is best-effort: a
    // cache failure must not fail a check the database already answered.
    verdict := "deny"
    if granted {
        verdict = "allow"
    }
    _ = ps.cache.SetPermission(ctx, cacheKey, verdict, 5*time.Minute)

    return granted, nil
}

// checkPermissionFromCache answers a permission check from the cache alone.
//
// It returns an error when the cache lookup fails or when no decision is
// cached for the key; otherwise it reports whether the cached decision is
// "allow".
func (ps *PermissionService) checkPermissionFromCache(ctx context.Context, userID, resource, action string) (bool, error) {
    cacheKey := "permission:" + userID + ":" + resource + ":" + action

    decision, err := ps.cache.GetPermission(ctx, cacheKey)
    switch {
    case err != nil:
        return false, fmt.Errorf("failed to get permission from cache: %w", err)
    case decision == "":
        return false, errors.New("permission not found in cache, service degraded")
    }
    return decision == "allow", nil
}

// isSuperUser reports whether userID is one of the hard-coded super users
// ("admin" or "root"). A real deployment would presumably load this list from
// the database or configuration.
func (ps *PermissionService) isSuperUser(userID string) bool {
    switch userID {
    case "admin", "root":
        return true
    }
    return false
}

// SetDegradationStrategy replaces the active degradation strategy under the
// write lock.
func (ps *PermissionService) SetDegradationStrategy(strategy DegradationStrategy) {
    ps.mutex.Lock()
    ps.strategy = strategy
    ps.mutex.Unlock()
}

3. 多活部署与灾备方案

为了进一步提升权限系统的可用性,我们需要设计多活部署和灾备方案。

3.1 多活部署架构

// MultiSiteConfig describes the sites taking part in a multi-site deployment.
type MultiSiteConfig struct {
    // CurrentSite is the key in Sites of the site that is tried first.
    CurrentSite string
    // Sites holds the configuration of every known site.
    Sites map[string]*SiteConfig
}

// SiteConfig describes a single site.
type SiteConfig struct {
    // Name identifies the site; it is compared against
    // MultiSiteConfig.CurrentSite and appears in log messages.
    Name string
    // Endpoint is the base URL to which "/permission/check" is appended.
    Endpoint string
    // Priority orders fallback sites; lower values are tried first.
    Priority int
    // Enabled controls whether the site is queried at all.
    Enabled bool
}

// MultiSiteManager performs permission checks against the configured sites
// over HTTP.
type MultiSiteManager struct {
    // config lists the sites and names the current one.
    config *MultiSiteConfig
    // client issues the HTTP requests to the sites.
    client *http.Client
    // mutex is read-locked while CheckPermission reads config.
    mutex sync.RWMutex
}

// NewMultiSiteManager returns a manager for config whose HTTP client gives up
// on a request after five seconds.
func NewMultiSiteManager(config *MultiSiteConfig) *MultiSiteManager {
    httpClient := &http.Client{Timeout: 5 * time.Second}
    return &MultiSiteManager{config: config, client: httpClient}
}

// CheckPermission asks the sites whether userID may perform action on
// resource and returns the first answer obtained.
//
// The current site is tried first when it exists and is enabled. The other
// enabled sites follow in ascending Priority order. A site that fails is
// logged and skipped. When no site answers, an error is returned.
func (msm *MultiSiteManager) CheckPermission(ctx context.Context, userID, resource, action string) (bool, error) {
    // Copy everything needed out of the shared config while the read lock is
    // held. Nothing below touches msm.config again, so a concurrent config
    // update cannot race with the HTTP calls.
    msm.mutex.RLock()
    currentSite := msm.config.CurrentSite
    var current *SiteConfig
    if site, ok := msm.config.Sites[currentSite]; ok && site != nil && site.Enabled {
        snapshot := *site
        current = &snapshot
    }
    fallbacks := make([]SiteConfig, 0, len(msm.config.Sites))
    for key, site := range msm.config.Sites {
        if site == nil || !site.Enabled {
            continue // nil entries would panic; disabled sites are never queried
        }
        if key == currentSite || site.Name == currentSite {
            continue // the current site is handled separately and never retried
        }
        fallbacks = append(fallbacks, *site)
    }
    msm.mutex.RUnlock()

    // Lower Priority values are tried first.
    sort.Slice(fallbacks, func(i, j int) bool {
        return fallbacks[i].Priority < fallbacks[j].Priority
    })

    if current != nil {
        allowed, err := msm.checkPermissionAtSite(ctx, current, userID, resource, action)
        if err == nil {
            return allowed, nil
        }
        log.Printf("Failed to check permission at current site %s: %v", currentSite, err)
    }

    for i := range fallbacks {
        site := &fallbacks[i]
        allowed, err := msm.checkPermissionAtSite(ctx, site, userID, resource, action)
        if err == nil {
            return allowed, nil
        }
        log.Printf("Failed to check permission at site %s: %v", site.Name, err)
    }

    return false, errors.New("failed to check permission at all sites")
}

// checkPermissionAtSite sends GET {site.Endpoint}/permission/check with the
// user_id, resource and action query parameters and decodes the JSON reply
// {"allowed": bool}.
//
// It returns an error when the request cannot be built or sent, when the
// status is not 200 OK, or when the body cannot be decoded.
func (msm *MultiSiteManager) checkPermissionAtSite(ctx context.Context, site *SiteConfig, userID, resource, action string) (bool, error) {
    req, err := http.NewRequestWithContext(ctx, http.MethodGet, site.Endpoint+"/permission/check", nil)
    if err != nil {
        return false, fmt.Errorf("failed to create request: %w", err)
    }

    // Set the parameters through url.Values so they are percent-encoded.
    // Pasting raw values into the URL would let characters such as '&', '#'
    // or '=' alter or inject query parameters of a permission check.
    query := req.URL.Query()
    query.Set("user_id", userID)
    query.Set("resource", resource)
    query.Set("action", action)
    req.URL.RawQuery = query.Encode()

    resp, err := msm.client.Do(req)
    if err != nil {
        return false, fmt.Errorf("failed to send request: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return false, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
    }

    var result struct {
        Allowed bool `json:"allowed"`
    }
    if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
        return false, fmt.Errorf("failed to decode response: %w", err)
    }

    return result.Allowed, nil
}

3.2 数据同步机制

// DataSyncConfig controls the periodic replication of permission data to
// other sites.
type DataSyncConfig struct {
    // SyncInterval is the ticker period; it is also used as the look-back
    // window when changed rows are selected.
    SyncInterval time.Duration
    // BatchSize caps the number of rows fetched per run.
    BatchSize int
    // TargetSites lists the base URLs to which "/permission/sync" is appended.
    TargetSites []string
}

// DataSyncManager periodically pushes changed permission rows to the
// configured target sites.
type DataSyncManager struct {
    // config holds the interval, batch size and target sites.
    config *DataSyncConfig
    // db provides the primary pool from which changed rows are read.
    db *PermissionDB
    // client posts the batches to the target sites.
    client *http.Client
    // ticker drives the sync loop; nil while the manager is not running.
    ticker *time.Ticker
    // stopChan is closed by Stop to end the sync loop.
    stopChan chan struct{}
    // mutex serializes Start and Stop.
    mutex sync.Mutex
}

// NewDataSyncManager returns a stopped manager for config and db. Its HTTP
// client gives up on a request after thirty seconds.
func NewDataSyncManager(config *DataSyncConfig, db *PermissionDB) *DataSyncManager {
    manager := &DataSyncManager{
        config:   config,
        db:       db,
        stopChan: make(chan struct{}),
    }
    manager.client = &http.Client{Timeout: 30 * time.Second}
    return manager
}

// Start launches the background sync loop, which calls syncData once per
// SyncInterval until Stop is called. Calling Start on a running manager is a
// no-op. A non-positive SyncInterval is logged and the loop is not started,
// because time.NewTicker panics on such a value.
func (dsm *DataSyncManager) Start() {
    dsm.mutex.Lock()
    defer dsm.mutex.Unlock()

    if dsm.ticker != nil {
        return // already running
    }
    if dsm.config.SyncInterval <= 0 {
        log.Printf("Data sync not started: non-positive sync interval %v", dsm.config.SyncInterval)
        return
    }

    // The goroutine uses these locals rather than the struct fields. Stop
    // sets dsm.ticker to nil, and re-reading the field from the loop would
    // race with that write and dereference nil. A fresh stop channel per
    // start also makes the manager restartable after Stop.
    ticker := time.NewTicker(dsm.config.SyncInterval)
    stop := make(chan struct{})
    dsm.ticker = ticker
    dsm.stopChan = stop

    go func() {
        for {
            select {
            case <-ticker.C:
                dsm.syncData()
            case <-stop:
                return
            }
        }
    }()
}

// Stop halts the ticker and signals the sync loop to exit. It does not wait
// for a sync run that is already in progress.
//
// Stop is idempotent: it does nothing when the manager is not running. This
// avoids the panic from closing stopChan twice, and avoids closing it before
// Start has been called.
func (dsm *DataSyncManager) Stop() {
    dsm.mutex.Lock()
    defer dsm.mutex.Unlock()

    if dsm.ticker == nil {
        return // not running
    }

    dsm.ticker.Stop()
    dsm.ticker = nil
    close(dsm.stopChan)
}

// syncData runs one sync pass: it loads the changed rows and posts them to
// every target site. Failures are logged, never returned; a failing site does
// not prevent the remaining sites from being tried.
func (dsm *DataSyncManager) syncData() {
    ctx := context.Background()

    changes, err := dsm.getChangedData(ctx)
    switch {
    case err != nil:
        log.Printf("Failed to get changed data: %v", err)
        return
    case len(changes) == 0:
        // Nothing changed, nothing to send.
        return
    }

    targets := dsm.config.TargetSites
    for i := range targets {
        if syncErr := dsm.syncToSite(ctx, targets[i], changes); syncErr != nil {
            log.Printf("Failed to sync data to site %s: %v", targets[i], syncErr)
        }
    }
}

// getChangedData reads from the primary database up to BatchSize rows of
// user_permissions whose updated_at lies within the last SyncInterval, oldest
// first, and returns each row as a map keyed by column name.
//
// It returns an error when the query fails, when a row cannot be scanned, or
// when row iteration ends early because of an error.
//
// NOTE(review): the window is recomputed from time.Now() on every call, so
// rows cut off by LIMIT or changed during a delayed run look like they are
// never picked up later; a persisted watermark may be needed. Confirm.
// NOTE(review): checkPermissionInternal reads resource/action from the
// permissions table via a join, while this query selects them directly from
// user_permissions. Confirm the schema.
func (dsm *DataSyncManager) getChangedData(ctx context.Context) ([]map[string]interface{}, error) {
    query := `
        SELECT id, user_id, resource, action, created_at, updated_at
        FROM user_permissions 
        WHERE updated_at > ? 
        ORDER BY updated_at 
        LIMIT ?
    `

    master := dsm.db.GetMaster()
    rows, err := master.QueryContext(ctx, query, time.Now().Add(-dsm.config.SyncInterval), dsm.config.BatchSize)
    if err != nil {
        return nil, fmt.Errorf("failed to query changed data: %w", err)
    }
    defer rows.Close()

    var data []map[string]interface{}
    for rows.Next() {
        var id, userID, resource, action string
        var createdAt, updatedAt time.Time

        if err := rows.Scan(&id, &userID, &resource, &action, &createdAt, &updatedAt); err != nil {
            return nil, fmt.Errorf("failed to scan row: %w", err)
        }

        data = append(data, map[string]interface{}{
            "id":         id,
            "user_id":    userID,
            "resource":   resource,
            "action":     action,
            "created_at": createdAt,
            "updated_at": updatedAt,
        })
    }

    // rows.Next also returns false when iteration fails midway (for example
    // on a dropped connection). Without this check a truncated batch would be
    // synced as if it were complete.
    if err := rows.Err(); err != nil {
        return nil, fmt.Errorf("failed to iterate changed data: %w", err)
    }

    return data, nil
}

// syncToSite posts data as a JSON array to {site}/permission/sync.
//
// It returns an error when the payload cannot be marshalled, when the request
// cannot be built or sent, or when the response status is anything other than
// 200 OK.
func (dsm *DataSyncManager) syncToSite(ctx context.Context, site string, data []map[string]interface{}) error {
    body, err := json.Marshal(data)
    if err != nil {
        return fmt.Errorf("failed to marshal data: %w", err)
    }

    endpoint := site + "/permission/sync"
    req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewBuffer(body))
    if err != nil {
        return fmt.Errorf("failed to create request: %w", err)
    }
    req.Header.Set("Content-Type", "application/json")

    resp, err := dsm.client.Do(req)
    if err != nil {
        return fmt.Errorf("failed to send request: %w", err)
    }
    defer resp.Body.Close()

    if status := resp.StatusCode; status != http.StatusOK {
        return fmt.Errorf("unexpected status code: %d", status)
    }
    return nil
}

4. 可用性监控与告警

高可用系统需要完善的监控和告警机制,以便及时发现问题并进行处理。

// MetricsCollector groups the Prometheus metrics exported by the permission
// system.
type MetricsCollector struct {
    // permissionChecks counts checks, labelled by result and site.
    permissionChecks *prometheus.CounterVec
    // permissionCheckDuration observes check latency in seconds, labelled by site.
    permissionCheckDuration *prometheus.HistogramVec
    // errors counts failures, labelled by type and site.
    errors *prometheus.CounterVec
    // circuitBreakerState exposes the breaker state, labelled by service and site.
    circuitBreakerState *prometheus.GaugeVec
}

// NewMetricsCollector creates the four metric vectors and registers them with
// the default Prometheus registry. MustRegister panics when registration
// fails, for instance when a metric of the same name is already registered.
func NewMetricsCollector() *MetricsCollector {
    checks := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "permission_checks_total",
            Help: "Total number of permission checks",
        },
        []string{"result", "site"},
    )
    durations := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "permission_check_duration_seconds",
            Help:    "Permission check duration in seconds",
            Buckets: prometheus.DefBuckets,
        },
        []string{"site"},
    )
    failures := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "permission_errors_total",
            Help: "Total number of permission errors",
        },
        []string{"type", "site"},
    )
    breakerState := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "circuit_breaker_state",
            Help: "Circuit breaker state (0=closed, 1=open, 2=half-open)",
        },
        []string{"service", "site"},
    )

    // Registered in the same order as the fields are declared.
    prometheus.MustRegister(checks, durations, failures, breakerState)

    return &MetricsCollector{
        permissionChecks:        checks,
        permissionCheckDuration: durations,
        errors:                  failures,
        circuitBreakerState:     breakerState,
    }
}

// RecordPermissionCheck records one permission check for site: it bumps the
// check counter under the result label "error", "allowed" or "denied" (an
// error wins over the allowed flag), observes the duration in seconds, and
// additionally bumps the "check_failure" error counter when err is non-nil.
func (mc *MetricsCollector) RecordPermissionCheck(site string, duration time.Duration, allowed bool, err error) {
    var outcome string
    switch {
    case err != nil:
        outcome = "error"
    case allowed:
        outcome = "allowed"
    default:
        outcome = "denied"
    }

    mc.permissionChecks.WithLabelValues(outcome, site).Inc()
    mc.permissionCheckDuration.WithLabelValues(site).Observe(duration.Seconds())

    if err == nil {
        return
    }
    mc.errors.WithLabelValues("check_failure", site).Inc()
}

// RecordCircuitBreakerState sets the circuit_breaker_state gauge for the given
// service and site to the numeric value of state.
func (mc *MetricsCollector) RecordCircuitBreakerState(service, site string, state CircuitBreakerState) {
    mc.circuitBreakerState.WithLabelValues(service, site).Set(float64(state))
}

5. 总结

权限系统的高可用设计需要从多个方面综合考虑:

  1. 数据存储高可用:通过主从复制、读写分离、多实例部署等方式保证数据的高可用性
  2. 缓存高可用:使用Redis集群等方案保证缓存的高可用性
  3. 服务容错:通过熔断器、降级策略等机制提高服务的容错能力
  4. 多活部署:在多个数据中心部署服务实例,实现跨地域的高可用
  5. 数据同步:通过定时增量同步,保证多活部署环境下各站点数据的最终一致性
  6. 监控告警:建立完善的监控体系,及时发现和处理问题

通过以上这些设计和实现,我们可以构建一个高可用的权限系统,即使在部分组件出现故障的情况下,也能保证核心功能的正常运行,从而保障整个业务系统的稳定性和可用性。

在实际项目中,还需要根据具体的业务场景和技术栈选择合适的实现方案,并持续优化和改进系统的高可用性设计。