权限系统高可用方案的设计与实现
在构建高可用的权限系统时,我们需要考虑多个层面的容错和故障恢复机制。权限系统作为整个业务系统的核心组件,一旦出现故障可能导致整个系统瘫痪,因此设计一个高可用的权限系统至关重要。
1. 权限系统高可用架构设计
权限系统的高可用设计需要从多个维度考虑,包括数据存储、服务部署、缓存策略、故障恢复等。
1.1 数据存储高可用
权限数据的存储是整个权限系统的核心,我们需要保证数据的高可用性和一致性。
// PermissionDatabase holds the connection settings for the permission database.
// NewPermissionDB applies the same pool limits to the master and to every replica.
type PermissionDatabase struct {
// PrimaryDSN is the data source name used to open the master connection.
PrimaryDSN string
// ReplicaDSNs holds one data source name per replica; an empty list is allowed.
ReplicaDSNs []string
// MaxIdleConns is passed to (*sql.DB).SetMaxIdleConns for each pool.
MaxIdleConns int
// MaxOpenConns is passed to (*sql.DB).SetMaxOpenConns for each pool.
MaxOpenConns int
// ConnMaxLifetime is passed to (*sql.DB).SetConnMaxLifetime for each pool.
ConnMaxLifetime time.Duration
}
// PermissionDB groups the master connection pool and the replica connection pools.
type PermissionDB struct {
// master is the pool opened from PermissionDatabase.PrimaryDSN.
master *sql.DB
// replicas holds one pool per configured replica DSN; it may be empty.
replicas []*sql.DB
// mutex is read-locked by GetReplica while it inspects replicas.
mutex sync.RWMutex
}
// NewPermissionDB opens and pings the master pool and every replica pool
// described by config, applying the configured pool limits to each of them.
//
// It returns an error as soon as any pool cannot be opened or pinged. In that
// case every pool that was already opened is closed again, so a failed call
// does not leak connections.
func NewPermissionDB(config *PermissionDatabase) (*PermissionDB, error) {
	master, err := openPermissionPool(config, config.PrimaryDSN, "master")
	if err != nil {
		return nil, err
	}
	db := &PermissionDB{master: master}

	for _, dsn := range config.ReplicaDSNs {
		replica, err := openPermissionPool(config, dsn, "replica")
		if err != nil {
			// Best-effort cleanup: the open/ping error is the one worth
			// reporting, so Close errors are intentionally dropped.
			for _, opened := range db.replicas {
				_ = opened.Close()
			}
			_ = master.Close()
			return nil, err
		}
		db.replicas = append(db.replicas, replica)
	}
	return db, nil
}

// openPermissionPool opens one "mysql" pool for dsn, applies the limits from
// config and verifies the connection with Ping. role ("master" or "replica")
// is only used in error messages. The pool is closed again if Ping fails.
func openPermissionPool(config *PermissionDatabase, dsn, role string) (*sql.DB, error) {
	pool, err := sql.Open("mysql", dsn)
	if err != nil {
		return nil, fmt.Errorf("failed to connect to %s database: %w", role, err)
	}
	pool.SetMaxIdleConns(config.MaxIdleConns)
	pool.SetMaxOpenConns(config.MaxOpenConns)
	pool.SetConnMaxLifetime(config.ConnMaxLifetime)

	if err := pool.Ping(); err != nil {
		_ = pool.Close() // do not leak the pool that failed its health check
		return nil, fmt.Errorf("failed to ping %s database: %w", role, err)
	}
	return pool, nil
}
// GetMaster returns the connection pool of the master database.
func (p *PermissionDB) GetMaster() *sql.DB {
	return p.master
}
// GetReplica returns a connection pool for read traffic.
//
// One of the replica pools is chosen uniformly at random. When no replica is
// configured the master pool is returned instead.
func (p *PermissionDB) GetReplica() *sql.DB {
	p.mutex.RLock()
	defer p.mutex.RUnlock()

	if len(p.replicas) == 0 {
		return p.master
	}
	// Random selection (not round-robin). The shared math/rand source is used
	// as-is: reseeding it on every call is deprecated, costs a lock plus a
	// full generator re-initialisation, and makes concurrent callers that
	// share a timestamp pick the same replica.
	return p.replicas[rand.Intn(len(p.replicas))]
}
1.2 缓存高可用设计
缓存是提升权限系统性能的关键,同时也需要保证其高可用性。
// CacheCluster holds the settings used to build the Redis cluster client.
type CacheCluster struct {
// RedisAddrs is the list of cluster node addresses handed to the client.
RedisAddrs []string
// Password is the password handed to the client.
Password string
// DB is not read by NewPermissionCache.
// NOTE(review): presumably a leftover from a single-node setup — confirm whether it can be removed.
DB int
// PoolSize is the connection pool size handed to the client.
PoolSize int
// Timeout is used as the dial, read and write timeout of the client.
Timeout time.Duration
}
// PermissionCache wraps a Redis cluster client used to cache permission decisions.
type PermissionCache struct {
// client is the cluster client every cache operation goes through.
client *redis.ClusterClient
// timeout is copied from CacheCluster.Timeout by NewPermissionCache.
timeout time.Duration
}
// NewPermissionCache builds a PermissionCache backed by a Redis cluster client.
// config.Timeout is applied as the dial, read and write timeout of the client
// and is also stored on the returned cache. config.DB is not used.
func NewPermissionCache(config *CacheCluster) *PermissionCache {
	opts := redis.ClusterOptions{
		Addrs:        config.RedisAddrs,
		Password:     config.Password,
		PoolSize:     config.PoolSize,
		DialTimeout:  config.Timeout,
		ReadTimeout:  config.Timeout,
		WriteTimeout: config.Timeout,
	}

	cache := new(PermissionCache)
	cache.client = redis.NewClusterClient(&opts)
	cache.timeout = config.Timeout
	return cache
}
// GetPermission reads the cached value stored under key.
// A cache miss (redis.Nil) is reported as an empty string with a nil error;
// any other client error is wrapped and returned.
func (pc *PermissionCache) GetPermission(ctx context.Context, key string) (string, error) {
	cached, err := pc.client.Get(ctx, key).Result()
	switch {
	case err == redis.Nil:
		// Miss: callers distinguish it from a hit by the empty value.
		return "", nil
	case err != nil:
		return "", fmt.Errorf("failed to get permission from cache: %w", err)
	default:
		return cached, nil
	}
}
// SetPermission stores value under key with the given expiration.
// A client error is wrapped and returned.
func (pc *PermissionCache) SetPermission(ctx context.Context, key, value string, expiration time.Duration) error {
	if err := pc.client.Set(ctx, key, value, expiration).Err(); err != nil {
		return fmt.Errorf("failed to set permission to cache: %w", err)
	}
	return nil
}
// DeletePermission removes the entry stored under key.
// A client error is wrapped and returned.
func (pc *PermissionCache) DeletePermission(ctx context.Context, key string) error {
	if err := pc.client.Del(ctx, key).Err(); err != nil {
		return fmt.Errorf("failed to delete permission from cache: %w", err)
	}
	return nil
}
2. 服务容错与降级策略
在权限系统出现故障时,我们需要有相应的容错和降级策略来保证系统的可用性。
2.1 熔断机制
// CircuitBreakerConfig holds the tuning parameters of a CircuitBreaker.
type CircuitBreakerConfig struct {
	FailureThreshold int           // consecutive failures in the closed state that open the breaker
	SuccessThreshold int           // successful probe calls in the half-open state that close the breaker
	Timeout          time.Duration // time the breaker stays open before probe calls are admitted
	HalfOpenMaxCalls int           // maximum number of probe calls admitted while half-open
}

// CircuitBreakerState is the state of a CircuitBreaker.
type CircuitBreakerState int

const (
	StateClosed   CircuitBreakerState = iota // calls are executed normally
	StateOpen                                // calls are rejected without being executed
	StateHalfOpen                            // a limited number of probe calls is executed
)

// CircuitBreaker protects an operation by rejecting calls after repeated
// failures and probing for recovery after a timeout.
//
// All fields below config are guarded by mutex. The mutex is never held while
// the protected operation runs.
type CircuitBreaker struct {
	config      CircuitBreakerConfig
	state       CircuitBreakerState
	failures    int       // failures counted in the current state
	successes   int       // successful probe calls counted in the half-open state
	inFlight    int       // probe calls admitted in the half-open state that have not finished
	generation  uint64    // incremented on every state change; identifies stale results
	lastFailure time.Time // time of the most recent counted failure
	mutex       sync.Mutex
}

// NewCircuitBreaker returns a breaker in the closed state.
func NewCircuitBreaker(config CircuitBreakerConfig) *CircuitBreaker {
	return &CircuitBreaker{
		config: config,
		state:  StateClosed,
	}
}

// Execute runs fn under the protection of the breaker and returns fn's error.
//
// While the breaker is open, or while the half-open probe budget is used up,
// fn is not called and an error is returned immediately. The breaker's lock is
// only held for bookkeeping, never while fn runs, so slow calls neither
// serialize each other nor delay the fast-fail path.
//
// If fn panics, the panic propagates to the caller; the call is not counted as
// a success or a failure, but its half-open slot is released.
func (cb *CircuitBreaker) Execute(fn func() error) error {
	gen, err := cb.admit()
	if err != nil {
		return err
	}

	finished := false
	defer func() {
		if !finished {
			// fn panicked: free the slot without recording a result.
			cb.finish(gen, false, nil)
		}
	}()

	err = fn()
	finished = true
	cb.finish(gen, true, err)
	return err
}

// admit decides whether a call may run. On success it returns the generation
// the call belongs to, which must be passed to finish.
func (cb *CircuitBreaker) admit() (uint64, error) {
	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	if cb.state == StateOpen {
		if time.Since(cb.lastFailure) < cb.config.Timeout {
			return 0, errors.New("circuit breaker is open")
		}
		// The open period is over: start probing.
		cb.setState(StateHalfOpen)
	}
	if cb.state == StateHalfOpen {
		// Calls still running count against the budget too, otherwise
		// concurrent callers could exceed it.
		if cb.successes+cb.failures+cb.inFlight >= cb.halfOpenLimit() {
			return 0, errors.New("circuit breaker is half-open, max calls reached")
		}
		cb.inFlight++
	}
	return cb.generation, nil
}

// finish records the outcome of a call admitted in generation gen. completed
// is false when the call panicked, in which case only its slot is released.
func (cb *CircuitBreaker) finish(gen uint64, completed bool, err error) {
	cb.mutex.Lock()
	defer cb.mutex.Unlock()

	if gen != cb.generation {
		// The breaker changed state while the call was running; its result
		// says nothing about the current state.
		return
	}
	if cb.state == StateHalfOpen {
		cb.inFlight--
	}
	if !completed {
		return
	}
	if err != nil {
		cb.onFailure()
	} else {
		cb.onSuccess()
	}
}

// halfOpenLimit returns the number of probe calls admitted per half-open
// period. It is never lower than SuccessThreshold (or 1); with a lower limit
// the breaker could never collect enough successes to close and would reject
// every call forever.
func (cb *CircuitBreaker) halfOpenLimit() int {
	limit := cb.config.HalfOpenMaxCalls
	if limit < cb.config.SuccessThreshold {
		limit = cb.config.SuccessThreshold
	}
	if limit < 1 {
		limit = 1
	}
	return limit
}

// setState switches to next, starts a new generation and clears the per-state
// counters. The caller must hold cb.mutex.
func (cb *CircuitBreaker) setState(next CircuitBreakerState) {
	cb.state = next
	cb.generation++
	cb.failures = 0
	cb.successes = 0
	cb.inFlight = 0
}

// onFailure records a failed call. Any failure while half-open, or reaching
// FailureThreshold while closed, opens the breaker. The caller must hold
// cb.mutex.
func (cb *CircuitBreaker) onFailure() {
	cb.failures++
	cb.lastFailure = time.Now()
	if cb.state == StateHalfOpen || cb.failures >= cb.config.FailureThreshold {
		cb.setState(StateOpen)
	}
}

// onSuccess records a successful call. While half-open, reaching
// SuccessThreshold closes the breaker; while closed, a success resets the
// consecutive-failure count. The caller must hold cb.mutex.
func (cb *CircuitBreaker) onSuccess() {
	switch cb.state {
	case StateHalfOpen:
		cb.successes++
		if cb.successes >= cb.config.SuccessThreshold {
			cb.setState(StateClosed)
			cb.lastFailure = time.Time{}
		}
	case StateClosed:
		cb.failures = 0
	}
}
2.2 降级策略
// DegradationStrategy selects how permission checks behave while the service
// is degraded.
type DegradationStrategy int

const (
	// StrategyNone means the service is not degraded. It is deliberately the
	// zero value: a service whose strategy was never set performs real
	// permission checks instead of denying every request.
	StrategyNone DegradationStrategy = iota
	// StrategyDenyAll rejects every request.
	StrategyDenyAll
	// StrategyAllowSuperUsers lets only super users through.
	StrategyAllowSuperUsers
	// StrategyAllowCached answers only from the cache.
	StrategyAllowCached
	// StrategyAllowAll lets every request through without any permission check.
	StrategyAllowAll
)
// PermissionService answers permission checks using the database, the cache,
// a circuit breaker and the currently selected degradation strategy.
type PermissionService struct {
// db provides the replica pool queried on a cache miss.
db *PermissionDB
// cache stores "allow"/"deny" decisions per user, resource and action.
cache *PermissionCache
// cb wraps the normal (non-degraded) check path.
cb *CircuitBreaker
// strategy is the active degradation strategy; guarded by mutex.
strategy DegradationStrategy
// mutex guards strategy.
mutex sync.RWMutex
}
// CheckPermission reports whether userID may perform action on resource.
//
// The active degradation strategy is read first. StrategyDenyAll,
// StrategyAllowSuperUsers, StrategyAllowCached and StrategyAllowAll are
// answered without touching the database; any other strategy value runs the
// normal cache-then-database check through the circuit breaker.
// A denial caused by degradation is returned as (false, non-nil error).
func (ps *PermissionService) CheckPermission(ctx context.Context, userID, resource, action string) (bool, error) {
	ps.mutex.RLock()
	current := ps.strategy
	ps.mutex.RUnlock()

	if current == StrategyDenyAll {
		return false, errors.New("service is degraded, all requests denied")
	}
	if current == StrategyAllowSuperUsers {
		if !ps.isSuperUser(userID) {
			return false, errors.New("service is degraded, only super users allowed")
		}
		return true, nil
	}
	if current == StrategyAllowCached {
		return ps.checkPermissionFromCache(ctx, userID, resource, action)
	}
	if current == StrategyAllowAll {
		// No permission check at all in this mode.
		return true, nil
	}

	// Normal path, guarded by the circuit breaker.
	granted := false
	guarded := func() error {
		result, innerErr := ps.checkPermissionInternal(ctx, userID, resource, action)
		granted = result
		return innerErr
	}
	if cbErr := ps.cb.Execute(guarded); cbErr != nil {
		return false, fmt.Errorf("permission check failed: %w", cbErr)
	}
	return granted, nil
}
// checkPermissionInternal is the normal check path: cache first, then a
// replica query, then a cache refresh.
//
// A cache read error or an empty cached value is treated like a miss. The
// decision read from the database is cached as "allow" or "deny" for five
// minutes; a failure to write the cache entry is ignored.
func (ps *PermissionService) checkPermissionInternal(ctx context.Context, userID, resource, action string) (bool, error) {
	key := fmt.Sprintf("permission:%s:%s:%s", userID, resource, action)

	hit, cacheErr := ps.cache.GetPermission(ctx, key)
	if cacheErr == nil && hit != "" {
		return hit == "allow", nil
	}

	// Miss (or cache unavailable): ask a replica.
	const lookup = `
SELECT COUNT(*) > 0
FROM user_permissions up
JOIN permissions p ON up.permission_id = p.id
WHERE up.user_id = ? AND p.resource = ? AND p.action = ?
`
	var granted bool
	row := ps.db.GetReplica().QueryRowContext(ctx, lookup, userID, resource, action)
	if err := row.Scan(&granted); err != nil {
		return false, fmt.Errorf("failed to query permission from database: %w", err)
	}

	verdict := "deny"
	if granted {
		verdict = "allow"
	}
	// Best effort: the decision is already known, so a cache write failure
	// must not fail the check.
	_ = ps.cache.SetPermission(ctx, key, verdict, 5*time.Minute)
	return granted, nil
}
// checkPermissionFromCache answers a check from the cache only.
// A cache error or a missing entry yields (false, error); otherwise the result
// is true exactly when the cached value is "allow".
func (ps *PermissionService) checkPermissionFromCache(ctx context.Context, userID, resource, action string) (bool, error) {
	entry, err := ps.cache.GetPermission(ctx, fmt.Sprintf("permission:%s:%s:%s", userID, resource, action))
	switch {
	case err != nil:
		return false, fmt.Errorf("failed to get permission from cache: %w", err)
	case entry == "":
		return false, errors.New("permission not found in cache, service degraded")
	}
	return entry == "allow", nil
}
// isSuperUser reports whether userID is one of the hard-coded super users
// ("admin" or "root"). A real implementation would probably read this from
// the database or from configuration.
func (ps *PermissionService) isSuperUser(userID string) bool {
	switch userID {
	case "admin", "root":
		return true
	default:
		return false
	}
}
// SetDegradationStrategy replaces the active degradation strategy.
// The write happens under the service's write lock.
func (ps *PermissionService) SetDegradationStrategy(strategy DegradationStrategy) {
	ps.mutex.Lock()
	ps.strategy = strategy
	ps.mutex.Unlock()
}
3. 多活部署与灾备方案
为了进一步提升权限系统的可用性,我们需要设计多活部署和灾备方案。
3.1 多活部署架构
// MultiSiteConfig describes the sites taking part in a multi-site deployment.
type MultiSiteConfig struct {
// CurrentSite is the key in Sites of the site that is tried first.
CurrentSite string
// Sites maps a site key to its configuration.
Sites map[string]*SiteConfig
}
// SiteConfig describes a single site.
type SiteConfig struct {
// Name identifies the site in log messages and when skipping the current site.
Name string
// Endpoint is the base URL; "/permission/check" is appended to it.
Endpoint string
// Priority orders fallback sites; lower values are tried first.
Priority int
// Enabled excludes the site from permission checks when false.
Enabled bool
}
// MultiSiteManager performs permission checks against the configured sites over HTTP.
type MultiSiteManager struct {
// config lists the sites and names the current one.
config *MultiSiteConfig
// client is the HTTP client used for every remote check.
client *http.Client
// mutex is read-locked while CheckPermission reads config.
mutex sync.RWMutex
}
// NewMultiSiteManager returns a manager for config whose HTTP client gives up
// on a request after five seconds.
func NewMultiSiteManager(config *MultiSiteConfig) *MultiSiteManager {
	httpClient := &http.Client{Timeout: 5 * time.Second}
	return &MultiSiteManager{config: config, client: httpClient}
}
// CheckPermission asks the configured sites whether userID may perform action
// on resource.
//
// The current site is asked first when it is enabled. If that request fails,
// the remaining enabled sites are asked in ascending Priority order (ties are
// broken by Name so the order is deterministic). The first site that answers
// without an error decides the result, including a negative answer. An error
// is returned only when no site could be reached successfully.
func (msm *MultiSiteManager) CheckPermission(ctx context.Context, userID, resource, action string) (bool, error) {
	// Take a private snapshot of everything needed from the configuration
	// while the read lock is held; nothing below touches msm.config again.
	msm.mutex.RLock()
	currentSite := msm.config.CurrentSite
	var current *SiteConfig
	fallbacks := make([]*SiteConfig, 0, len(msm.config.Sites))
	for key, site := range msm.config.Sites {
		if site == nil || !site.Enabled {
			continue
		}
		snapshot := *site
		if key == currentSite {
			// Identify the current site by its map key in one place only, so
			// it can never be queried twice or skipped by a Name mismatch.
			current = &snapshot
			continue
		}
		fallbacks = append(fallbacks, &snapshot)
	}
	msm.mutex.RUnlock()

	sort.Slice(fallbacks, func(i, j int) bool {
		if fallbacks[i].Priority != fallbacks[j].Priority {
			return fallbacks[i].Priority < fallbacks[j].Priority
		}
		return fallbacks[i].Name < fallbacks[j].Name
	})

	if current != nil {
		allowed, err := msm.checkPermissionAtSite(ctx, current, userID, resource, action)
		if err == nil {
			return allowed, nil
		}
		log.Printf("Failed to check permission at current site %s: %v", currentSite, err)
	}

	for _, site := range fallbacks {
		allowed, err := msm.checkPermissionAtSite(ctx, site, userID, resource, action)
		if err == nil {
			return allowed, nil
		}
		log.Printf("Failed to check permission at site %s: %v", site.Name, err)
	}

	return false, errors.New("failed to check permission at all sites")
}
// checkPermissionAtSite sends a GET request to
// <site.Endpoint>/permission/check with user_id, resource and action as query
// parameters and returns the "allowed" field of the JSON response.
//
// An error is returned when the request cannot be built or sent, when the
// status code is not 200, or when the body cannot be decoded.
func (msm *MultiSiteManager) checkPermissionAtSite(ctx context.Context, site *SiteConfig, userID, resource, action string) (bool, error) {
	req, err := http.NewRequestWithContext(ctx, "GET", site.Endpoint+"/permission/check", nil)
	if err != nil {
		return false, fmt.Errorf("failed to create request: %w", err)
	}

	// Encode the parameters instead of pasting them into the URL: a value
	// containing '&', '=', '#' or a space would otherwise corrupt the query
	// or inject additional parameters into a permission check.
	query := req.URL.Query()
	query.Set("user_id", userID)
	query.Set("resource", resource)
	query.Set("action", action)
	req.URL.RawQuery = query.Encode()

	resp, err := msm.client.Do(req)
	if err != nil {
		return false, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return false, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	var result struct {
		Allowed bool `json:"allowed"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return false, fmt.Errorf("failed to decode response: %w", err)
	}
	return result.Allowed, nil
}
3.2 数据同步机制
// DataSyncConfig holds the settings of the periodic data synchronisation.
type DataSyncConfig struct {
// SyncInterval is the ticker period and also the look-back window used when querying changed rows.
SyncInterval time.Duration
// BatchSize is the maximum number of rows fetched per synchronisation run.
BatchSize int
// TargetSites lists the base URLs to push to; "/permission/sync" is appended to each.
TargetSites []string
}
// DataSyncManager periodically pushes changed permission rows to other sites.
type DataSyncManager struct {
// config holds the interval, batch size and target sites.
config *DataSyncConfig
// db provides the master pool the changed rows are read from.
db *PermissionDB
// client is the HTTP client used to push data.
client *http.Client
// ticker drives the synchronisation loop; nil while the manager is not running.
ticker *time.Ticker
// stopChan is closed by Stop to end the synchronisation loop.
stopChan chan struct{}
// mutex is held by Start and Stop.
mutex sync.Mutex
}
// NewDataSyncManager returns a stopped manager that reads from db and pushes
// with an HTTP client whose requests time out after thirty seconds.
func NewDataSyncManager(config *DataSyncConfig, db *PermissionDB) *DataSyncManager {
	manager := new(DataSyncManager)
	manager.config = config
	manager.db = db
	manager.client = &http.Client{Timeout: 30 * time.Second}
	manager.stopChan = make(chan struct{})
	return manager
}
// Start launches the background synchronisation loop, which calls syncData
// once per config.SyncInterval until Stop is called. Calling Start while the
// loop is already running does nothing. Start may be called again after Stop.
//
// time.NewTicker panics when config.SyncInterval is not positive.
func (dsm *DataSyncManager) Start() {
	dsm.mutex.Lock()
	defer dsm.mutex.Unlock()

	if dsm.ticker != nil {
		return // already running
	}

	// A previous Stop leaves the stop channel closed; a loop started on it
	// would exit immediately, so give this run a fresh channel.
	if dsm.stopChan == nil {
		dsm.stopChan = make(chan struct{})
	} else {
		select {
		case <-dsm.stopChan:
			dsm.stopChan = make(chan struct{})
		default:
		}
	}

	// The loop works on its own copies. Reading dsm.ticker or dsm.stopChan
	// from the goroutine would race with Stop and dereference a nil ticker
	// once Stop has cleared the field.
	ticker := time.NewTicker(dsm.config.SyncInterval)
	stop := dsm.stopChan
	dsm.ticker = ticker

	go func() {
		for {
			select {
			case <-ticker.C:
				dsm.syncData()
			case <-stop:
				return
			}
		}
	}()
}
// Stop ends the background synchronisation loop by stopping the ticker and
// closing the stop channel. Calling Stop when the loop is not running (never
// started, or already stopped) does nothing, so repeated calls are safe and
// cannot close the channel twice.
func (dsm *DataSyncManager) Stop() {
	dsm.mutex.Lock()
	defer dsm.mutex.Unlock()

	if dsm.ticker == nil {
		return // not running
	}
	dsm.ticker.Stop()
	dsm.ticker = nil
	close(dsm.stopChan)
}
// syncData performs one synchronisation run: it loads the changed rows and
// pushes them to every target site. Failures are logged; a failing site does
// not prevent the remaining sites from being tried.
func (dsm *DataSyncManager) syncData() {
	ctx := context.Background()

	changes, err := dsm.getChangedData(ctx)
	switch {
	case err != nil:
		log.Printf("Failed to get changed data: %v", err)
		return
	case len(changes) == 0:
		return // nothing to push
	}

	for i := range dsm.config.TargetSites {
		target := dsm.config.TargetSites[i]
		if pushErr := dsm.syncToSite(ctx, target, changes); pushErr != nil {
			log.Printf("Failed to sync data to site %s: %v", target, pushErr)
		}
	}
}
// getChangedData reads from the master the rows of user_permissions whose
// updated_at lies within the last config.SyncInterval, ordered by updated_at
// and limited to config.BatchSize rows. Each row is returned as a map keyed by
// column name.
//
// An error is returned when the query fails, a row cannot be scanned, or the
// iteration itself ends with an error.
//
// NOTE(review): the window is relative to time.Now() and capped by BatchSize,
// so rows beyond the cap or outside the window are not picked up by a later
// run — a persisted timestamp/version checkpoint is probably needed; confirm.
func (dsm *DataSyncManager) getChangedData(ctx context.Context) ([]map[string]interface{}, error) {
	query := `
SELECT id, user_id, resource, action, created_at, updated_at
FROM user_permissions
WHERE updated_at > ?
ORDER BY updated_at
LIMIT ?
`
	master := dsm.db.GetMaster()
	rows, err := master.QueryContext(ctx, query, time.Now().Add(-dsm.config.SyncInterval), dsm.config.BatchSize)
	if err != nil {
		return nil, fmt.Errorf("failed to query changed data: %w", err)
	}
	defer rows.Close()

	var data []map[string]interface{}
	for rows.Next() {
		var id, userID, resource, action string
		var createdAt, updatedAt time.Time
		if err := rows.Scan(&id, &userID, &resource, &action, &createdAt, &updatedAt); err != nil {
			return nil, fmt.Errorf("failed to scan row: %w", err)
		}
		data = append(data, map[string]interface{}{
			"id":         id,
			"user_id":    userID,
			"resource":   resource,
			"action":     action,
			"created_at": createdAt,
			"updated_at": updatedAt,
		})
	}
	// rows.Next also returns false when iteration fails (lost connection,
	// cancelled context); without this check a truncated result would be
	// synchronised as if it were complete.
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("failed to iterate changed data: %w", err)
	}
	return data, nil
}
// syncToSite posts data as a JSON array to <site>/permission/sync.
// It returns an error when the payload cannot be encoded, the request cannot
// be built or sent, or the response status is not 200.
func (dsm *DataSyncManager) syncToSite(ctx context.Context, site string, data []map[string]interface{}) error {
	body, err := json.Marshal(data)
	if err != nil {
		return fmt.Errorf("failed to marshal data: %w", err)
	}

	endpoint := fmt.Sprintf("%s/permission/sync", site)
	req, err := http.NewRequestWithContext(ctx, "POST", endpoint, bytes.NewBuffer(body))
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := dsm.client.Do(req)
	if err != nil {
		return fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code != http.StatusOK {
		return fmt.Errorf("unexpected status code: %d", code)
	}
	return nil
}
4. 可用性监控与告警
高可用系统需要完善的监控和告警机制,以便及时发现问题并进行处理。
// MetricsCollector groups the Prometheus metrics of the permission system.
type MetricsCollector struct {
// permissionChecks counts permission checks, labelled by result and site.
permissionChecks *prometheus.CounterVec
// permissionCheckDuration observes permission check latency in seconds, labelled by site.
permissionCheckDuration *prometheus.HistogramVec
// errors counts errors, labelled by type and site.
errors *prometheus.CounterVec
// circuitBreakerState exposes the circuit breaker state, labelled by service and site.
circuitBreakerState *prometheus.GaugeVec
}
// NewMetricsCollector creates the four permission metrics and registers them
// with the default Prometheus registry via MustRegister, which panics if a
// registration fails.
func NewMetricsCollector() *MetricsCollector {
	checks := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "permission_checks_total",
			Help: "Total number of permission checks",
		},
		[]string{"result", "site"},
	)
	latency := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "permission_check_duration_seconds",
			Help:    "Permission check duration in seconds",
			Buckets: prometheus.DefBuckets,
		},
		[]string{"site"},
	)
	failures := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "permission_errors_total",
			Help: "Total number of permission errors",
		},
		[]string{"type", "site"},
	)
	breaker := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "circuit_breaker_state",
			Help: "Circuit breaker state (0=closed, 1=open, 2=half-open)",
		},
		[]string{"service", "site"},
	)

	// Register in the same order the fields are declared.
	for _, metric := range []prometheus.Collector{checks, latency, failures, breaker} {
		prometheus.MustRegister(metric)
	}

	return &MetricsCollector{
		permissionChecks:        checks,
		permissionCheckDuration: latency,
		errors:                  failures,
		circuitBreakerState:     breaker,
	}
}
// RecordPermissionCheck records one permission check for site.
// The check is counted under the result "error" when err is non-nil, otherwise
// under "allowed" or "denied"; its duration is observed in seconds, and a
// non-nil err additionally increments the "check_failure" error counter.
func (mc *MetricsCollector) RecordPermissionCheck(site string, duration time.Duration, allowed bool, err error) {
	var outcome string
	switch {
	case err != nil:
		outcome = "error"
	case allowed:
		outcome = "allowed"
	default:
		outcome = "denied"
	}

	mc.permissionChecks.WithLabelValues(outcome, site).Inc()
	mc.permissionCheckDuration.WithLabelValues(site).Observe(duration.Seconds())

	if err != nil {
		mc.errors.WithLabelValues("check_failure", site).Inc()
	}
}
// RecordCircuitBreakerState sets the circuit breaker gauge for service and
// site to the numeric value of state.
func (mc *MetricsCollector) RecordCircuitBreakerState(service, site string, state CircuitBreakerState) {
	gauge := mc.circuitBreakerState.WithLabelValues(service, site)
	gauge.Set(float64(state))
}
5. 总结
权限系统的高可用设计需要从多个方面综合考虑:
- 数据存储高可用:通过主从复制、读写分离、多实例部署等方式保证数据的高可用性
- 缓存高可用:使用Redis集群等方案保证缓存的高可用性
- 服务容错:通过熔断器、降级策略等机制提高服务的容错能力
- 多活部署:在多个数据中心部署服务实例,实现跨地域的高可用
- 数据同步:通过定时增量同步,保证多活部署环境下各站点数据的最终一致性
- 监控告警:建立完善的监控体系,及时发现和处理问题
通过以上这些设计和实现,我们可以构建一个高可用的权限系统,即使在部分组件出现故障的情况下,也能保证核心功能的正常运行,从而保障整个业务系统的稳定性和可用性。
在实际项目中,还需要根据具体的业务场景和技术栈选择合适的实现方案,并持续优化和改进系统的高可用性设计。