2.4 可观测性设计:如何监控和追踪每一条通知的生命周期?

0 阅读8分钟

2.4 可观测性设计:如何监控和追踪每一条通知的生命周期?

引言

在复杂的分布式系统中,可观测性是保障系统稳定运行和快速定位问题的关键能力。对于通知平台这类高并发、多渠道的服务,我们需要能够监控和追踪每一条通知的完整生命周期,从创建、调度、发送到最终结果,确保能够及时发现和解决问题。

本节我们将深入探讨通知平台的可观测性设计,包括指标监控、日志记录、链路追踪等关键技术,构建一个完整的可观测性体系。

可观测性的核心要素

可观测性主要包括以下三个核心要素:

  1. Metrics(指标):系统运行状态的量化数据,如QPS、响应时间、错误率等
  2. Logs(日志):系统运行过程中的详细记录,用于问题排查和审计
  3. Traces(链路追踪):请求在分布式系统中的完整调用链路

```mermaid
graph TB
    A[可观测性] --> B[Metrics 指标]
    A --> C[Logs 日志]
    A --> D[Traces 链路追踪]

    B --> B1[系统指标]
    B --> B2[业务指标]
    B --> B3[性能指标]

    C --> C1[访问日志]
    C --> C2[错误日志]
    C --> C3[审计日志]

    D --> D1[请求链路]
    D --> D2[服务调用]
    D --> D3[性能分析]
```

指标监控系统

指标监控是可观测性的基础,我们需要收集和展示各种关键指标。

指标管理器

``go
// MetricsManager bundles the three metric groups, the Prometheus registry
// they are registered in, and the HTTP server that exposes that registry.
type MetricsManager struct {
    // systemMetrics holds the system performance metrics.
    systemMetrics *SystemMetrics

    // businessMetrics holds the business-level metrics.
    businessMetrics *BusinessMetrics

    // channelMetrics holds the delivery-channel metrics.
    channelMetrics *ChannelMetrics

    // registry is the Prometheus registry all of the above are registered in.
    registry *prometheus.Registry

    // httpServer is the metrics HTTP server. NewMetricsManager leaves it nil;
    // StartHTTPServer assigns it.
    httpServer *http.Server
}

// SystemMetrics 系统指标 type SystemMetrics struct { // 请求相关指标 totalRequests *prometheus.CounterVec requestDuration *prometheus.HistogramVec requestErrors *prometheus.CounterVec

// 资源相关指标
cpuUsage           prometheus.Gauge
memoryUsage        prometheus.Gauge
goroutineCount     prometheus.Gauge

// 队列相关指标
queueLength        prometheus.Gauge
queueWaitTime      *prometheus.HistogramVec

}

// BusinessMetrics 业务指标 type BusinessMetrics struct { // 业务方相关指标 businessRequests *prometheus.CounterVec businessSuccess *prometheus.CounterVec businessFailures *prometheus.CounterVec

// 消息相关指标
messageCreated     *prometheus.CounterVec
messageSent        *prometheus.CounterVec
messageFailed      *prometheus.CounterVec
messageRetry       *prometheus.CounterVec

}

// ChannelMetrics 渠道指标 type ChannelMetrics struct { // 渠道相关指标 channelRequests *prometheus.CounterVec channelSuccess *prometheus.CounterVec channelFailures *prometheus.CounterVec channelLatency *prometheus.HistogramVec }

// NewMetricsManager 创建指标管理器 func NewMetricsManager() *MetricsManager { registry := prometheus.NewRegistry()

// 系统指标
systemMetrics := &SystemMetrics{
    totalRequests: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_system_requests_total",
            Help: "Total number of system requests",
        },
        []string{"method", "endpoint"},
    ),
    requestDuration: prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "notification_request_duration_seconds",
            Help: "Request duration in seconds",
            Buckets: prometheus.DefBuckets,
        },
        []string{"method", "endpoint"},
    ),
    requestErrors: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_request_errors_total",
            Help: "Total number of request errors",
        },
        []string{"method", "endpoint", "error"},
    ),
    cpuUsage: prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "notification_cpu_usage_percent",
            Help: "CPU usage percentage",
        },
    ),
    memoryUsage: prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "notification_memory_usage_bytes",
            Help: "Memory usage in bytes",
        },
    ),
    goroutineCount: prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "notification_goroutines_total",
            Help: "Total number of goroutines",
        },
    ),
    queueLength: prometheus.NewGauge(
        prometheus.GaugeOpts{
            Name: "notification_queue_length",
            Help: "Current queue length",
        },
    ),
    queueWaitTime: prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "notification_queue_wait_time_seconds",
            Help: "Queue wait time in seconds",
            Buckets: []float64{0.1, 0.5, 1, 2, 5, 10, 30},
        },
        []string{"queue_type"},
    ),
}

// 业务指标
businessMetrics := &BusinessMetrics{
    businessRequests: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_business_requests_total",
            Help: "Total number of business requests",
        },
        []string{"business_id", "channel"},
    ),
    businessSuccess: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_business_success_total",
            Help: "Total number of successful business requests",
        },
        []string{"business_id", "channel"},
    ),
    businessFailures: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_business_failures_total",
            Help: "Total number of failed business requests",
        },
        []string{"business_id", "channel", "error"},
    ),
    messageCreated: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_messages_created_total",
            Help: "Total number of messages created",
        },
        []string{"business_id", "channel", "priority"},
    ),
    messageSent: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_messages_sent_total",
            Help: "Total number of messages sent",
        },
        []string{"business_id", "channel", "priority"},
    ),
    messageFailed: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_messages_failed_total",
            Help: "Total number of messages failed",
        },
        []string{"business_id", "channel", "priority", "error"},
    ),
    messageRetry: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_messages_retry_total",
            Help: "Total number of message retries",
        },
        []string{"business_id", "channel", "priority"},
    ),
}

// 渠道指标
channelMetrics := &ChannelMetrics{
    channelRequests: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_channel_requests_total",
            Help: "Total number of channel requests",
        },
        []string{"channel", "operation"},
    ),
    channelSuccess: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_channel_success_total",
            Help: "Total number of successful channel operations",
        },
        []string{"channel", "operation"},
    ),
    channelFailures: prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "notification_channel_failures_total",
            Help: "Total number of failed channel operations",
        },
        []string{"channel", "operation", "error"},
    ),
    channelLatency: prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "notification_channel_latency_seconds",
            Help: "Channel operation latency in seconds",
            Buckets: prometheus.DefBuckets,
        },
        []string{"channel", "operation"},
    ),
}

// 注册指标
registry.MustRegister(
    systemMetrics.totalRequests,
    systemMetrics.requestDuration,
    systemMetrics.requestErrors,
    systemMetrics.cpuUsage,
    systemMetrics.memoryUsage,
    systemMetrics.goroutineCount,
    systemMetrics.queueLength,
    systemMetrics.queueWaitTime,
    
    businessMetrics.businessRequests,
    businessMetrics.businessSuccess,
    businessMetrics.businessFailures,
    businessMetrics.messageCreated,
    businessMetrics.messageSent,
    businessMetrics.messageFailed,
    businessMetrics.messageRetry,
    
    channelMetrics.channelRequests,
    channelMetrics.channelSuccess,
    channelMetrics.channelFailures,
    channelMetrics.channelLatency,
)

return &MetricsManager{
    systemMetrics:   systemMetrics,
    businessMetrics: businessMetrics,
    channelMetrics:  channelMetrics,
    registry:        registry,
}

}

// StartHTTPServer 启动HTTP服务器 func (mm *MetricsManager) StartHTTPServer(addr string) error { mux := http.NewServeMux() mux.Handle("/metrics", promhttp.HandlerFor(mm.registry, promhttp.HandlerOpts{}))

mm.httpServer = &http.Server{
    Addr:    addr,
    Handler: mux,
}

return mm.httpServer.ListenAndServe()

}


### 系统指标收集

``go
// SystemMetricsCollector periodically samples CPU, memory and goroutine
// statistics and writes them into a SystemMetrics set.
type SystemMetricsCollector struct {
    // metrics receives the sampled values.
    metrics *SystemMetrics
    // stopCh makes Start return once a receive on it succeeds.
    stopCh  chan struct{}
}

// Start runs the collection loop in the calling goroutine.
//
// It triggers collectSystemMetrics on every tick of a 10-second ticker and
// returns as soon as a receive on stopCh succeeds. The ticker is stopped on
// return.
func (sc *SystemMetricsCollector) Start() {
    const collectInterval = 10 * time.Second

    tick := time.NewTicker(collectInterval)
    defer tick.Stop()

    for {
        select {
        case <-sc.stopCh:
            return
        case <-tick.C:
            sc.collectSystemMetrics()
        }
    }
}

// collectSystemMetrics takes one sample of each resource statistic and
// stores it in the matching gauge. A probe that returns an error (or, for
// CPU, an empty result) is skipped and its gauge keeps the previous value.
func (sc *SystemMetricsCollector) collectSystemMetrics() {
    gauges := sc.metrics

    // CPU usage: first element of the result measured over a one-second interval.
    if usage, err := cpu.Percent(time.Second, false); err == nil && len(usage) > 0 {
        gauges.cpuUsage.Set(usage[0])
    }

    // Memory: the Used figure from the virtual-memory statistics.
    if vm, err := mem.VirtualMemory(); err == nil {
        gauges.memoryUsage.Set(float64(vm.Used))
    }

    // Goroutines currently alive in this process.
    gauges.goroutineCount.Set(float64(runtime.NumGoroutine()))
}

日志系统

完善的日志系统是问题排查和系统审计的基础。

结构化日志

``go
// Logger is a structured logger that wraps a zap logger.
type Logger struct {
    logger *zap.Logger
}

// LogEntry describes the JSON shape of a single log record. The trace,
// business and free-form fields are omitted from the JSON when empty.
type LogEntry struct {
    Timestamp  time.Time              `json:"timestamp"`
    Level      string                 `json:"level"`
    Message    string                 `json:"message"`
    Service    string                 `json:"service"`
    TraceID    string                 `json:"trace_id,omitempty"`
    SpanID     string                 `json:"span_id,omitempty"`
    BusinessID string                 `json:"business_id,omitempty"`
    MessageID  string                 `json:"message_id,omitempty"`
    Channel    string                 `json:"channel,omitempty"`
    Fields     map[string]interface{} `json:"fields,omitempty"`
}

// NewLogger 创建日志记录器 func NewLogger(serviceName string) (*Logger, error) { config := zap.NewProductionConfig() config.EncoderConfig.TimeKey = "timestamp" config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder

logger, err := config.Build()
if err != nil {
    return nil, fmt.Errorf("failed to create logger: %v", err)
}

return &Logger{
    logger: logger.With(zap.String("service", serviceName)),
}, nil

}

// Info 记录信息日志 func (l *Logger) Info(message string, fields ...zap.Field) { l.logger.Info(message, fields...) }

// Error 记录错误日志 func (l *Logger) Error(message string, fields ...zap.Field) { l.logger.Error(message, fields...) }

// WithTraceID 添加追踪ID func (l *Logger) WithTraceID(traceID, spanID string) *Logger { return &Logger{ logger: l.logger.With( zap.String("trace_id", traceID), zap.String("span_id", spanID), ), } }

// WithBusinessContext 添加业务上下文 func (l *Logger) WithBusinessContext(businessID, messageID, channel string) *Logger { return &Logger{ logger: l.logger.With( zap.String("business_id", businessID), zap.String("message_id", messageID), zap.String("channel", channel), ), } }


### 访问日志

``go
// AccessLogger writes HTTP access records through a structured Logger.
type AccessLogger struct {
    // logger is the sink LogAccess writes to.
    logger *Logger
}

// AccessLog is one HTTP access record. Latency is serialized as
// "latency_ms"; BusinessID and TraceID are omitted from JSON when empty.
type AccessLog struct {
    Timestamp    time.Time `json:"timestamp"`
    Method       string    `json:"method"`
    Path         string    `json:"path"`
    ClientIP     string    `json:"client_ip"`
    UserAgent    string    `json:"user_agent"`
    StatusCode   int       `json:"status_code"`
    Latency      int64     `json:"latency_ms"`
    RequestSize  int       `json:"request_size"`
    ResponseSize int       `json:"response_size"`
    BusinessID   string    `json:"business_id,omitempty"`
    TraceID      string    `json:"trace_id,omitempty"`
}

// LogAccess emits one info-level "HTTP access" record for log.
//
// Every field below is always attached, including business_id and trace_id
// when they are empty. log.Timestamp is not written by this method.
func (al *AccessLogger) LogAccess(log *AccessLog) {
    fields := []zap.Field{
        zap.String("method", log.Method),
        zap.String("path", log.Path),
        zap.String("client_ip", log.ClientIP),
        zap.String("user_agent", log.UserAgent),
        zap.Int("status_code", log.StatusCode),
        zap.Int64("latency_ms", log.Latency),
        zap.Int("request_size", log.RequestSize),
        zap.Int("response_size", log.ResponseSize),
        zap.String("business_id", log.BusinessID),
        zap.String("trace_id", log.TraceID),
    }

    al.logger.Info("HTTP access", fields...)
}

链路追踪系统

链路追踪能够帮助我们理解请求在分布式系统中的完整调用链路。

追踪管理器

``go
// Tracer wraps an OpenTracing tracer together with the closer returned when
// the tracer was created.
type Tracer struct {
    tracer opentracing.Tracer
    closer io.Closer
}

// Span 追踪跨度 type Span struct { span opentracing.Span }

// NewTracer 创建追踪器 func NewTracer(serviceName, agentHostPort string) (*Tracer, error) { cfg := jaegercfg.Configuration{ ServiceName: serviceName, Sampler: &jaegercfg.SamplerConfig{ Type: jaeger.SamplerTypeConst, Param: 1, }, Reporter: &jaegercfg.ReporterConfig{ LogSpans: true, BufferFlushInterval: 1 * time.Second, LocalAgentHostPort: agentHostPort, }, }

tracer, closer, err := cfg.NewTracer()
if err != nil {
    return nil, fmt.Errorf("failed to create tracer: %v", err)
}

opentracing.SetGlobalTracer(tracer)

return &Tracer{
    tracer: tracer,
    closer: closer,
}, nil

}

// StartSpan 开始追踪跨度 func (t *Tracer) StartSpan(operationName string, opts ...opentracing.StartSpanOption) *Span { span := t.tracer.StartSpan(operationName, opts...) return &Span{span: span} }

// InjectSpan 注入追踪上下文 func (t *Tracer) InjectSpan(span *Span, carrier opentracing.HTTPHeadersCarrier) error { return t.tracer.Inject(span.span.Context(), opentracing.HTTPHeaders, carrier) }

// ExtractSpan 提取追踪上下文 func (t *Tracer) ExtractSpan(carrier opentracing.HTTPHeadersCarrier) (*Span, error) { spanContext, err := t.tracer.Extract(opentracing.HTTPHeaders, carrier) if err != nil { return nil, err }

span := t.tracer.StartSpan("extracted_span", opentracing.ChildOf(spanContext))
return &Span{span: span}, nil

}

// FinishSpan 结束追踪跨度 func (s *Span) Finish() { s.span.Finish() }

// SetTag 设置标签 func (s *Span) SetTag(key string, value interface{}) { s.span.SetTag(key, value) }

// LogKV 记录键值对日志 func (s *Span) LogKV(keyValues ...interface{}) { s.span.LogKV(keyValues...) }


### 业务追踪

``go
// NotificationTracer starts spans that are tagged with notification data.
type NotificationTracer struct {
    // tracer is used to start every span.
    tracer *Tracer
    // logger is held alongside the tracer; the methods visible in this
    // section do not use it.
    logger *Logger
}

// NotificationTraceContext carries the trace identifiers together with the
// business identifiers of one notification.
type NotificationTraceContext struct {
    TraceID   string
    SpanID    string
    BusinessID string
    MessageID string
    Channel   string
}

// StartNotificationTrace starts a span for operation and returns it together
// with a context that carries it.
//
// If ctx already holds a span, the new span is created as its child;
// otherwise it is started without a parent. The string values stored in ctx
// under "business_id", "message_id" and "channel" are copied onto the span
// as tags of the same name; keys that are absent or not strings are skipped.
func (nt *NotificationTracer) StartNotificationTrace(ctx context.Context, operation string) (context.Context, *Span) {
    // Link to the parent span only when the context carries one.
    var startOpts []opentracing.StartSpanOption
    if parent := opentracing.SpanFromContext(ctx); parent != nil {
        startOpts = append(startOpts, opentracing.ChildOf(parent.Context()))
    }
    span := nt.tracer.StartSpan(operation, startOpts...)

    // Copy the business identifiers, in a fixed order, from context to tags.
    for _, key := range []string{"business_id", "message_id", "channel"} {
        if value, ok := ctx.Value(key).(string); ok {
            span.SetTag(key, value)
        }
    }

    return opentracing.ContextWithSpan(ctx, span.span), span
}

// TraceMessageLifecycle starts a "message_lifecycle" span and tags it with
// the message's ID, business ID, channel and number of receivers. The caller
// owns the returned span.
func (nt *NotificationTracer) TraceMessageLifecycle(message *Message) *Span {
    lifecycle := nt.tracer.StartSpan("message_lifecycle")

    // A slice (not a map) keeps the tags applied in a fixed order.
    tags := []struct {
        key   string
        value interface{}
    }{
        {"message_id", message.ID},
        {"business_id", message.BusinessID},
        {"channel", message.Channel},
        {"receivers_count", len(message.Receivers)},
    }
    for _, tag := range tags {
        lifecycle.SetTag(tag.key, tag.value)
    }

    return lifecycle
}

// TraceChannelOperation starts a span named "channel_<operation>" and tags
// it with the channel and the operation. The caller owns the returned span.
func (nt *NotificationTracer) TraceChannelOperation(channel, operation string) *Span {
    spanName := fmt.Sprintf("channel_%s", operation)
    opSpan := nt.tracer.StartSpan(spanName)

    for _, tag := range [][2]string{
        {"channel", channel},
        {"operation", operation},
    } {
        opSpan.SetTag(tag[0], tag[1])
    }

    return opSpan
}

监控告警系统

基于收集的指标和日志,我们需要实现监控告警系统。

告警规则

``go
// AlertManager evaluates alert rules and dispatches the resulting alerts.
type AlertManager struct {
    // rules is the set of alert rules to evaluate.
    rules []*AlertRule

    // notifier sends the alerts that fire.
    notifier *AlertNotifier

    // metricsManager is the metrics manager associated with this alert manager.
    metricsManager *MetricsManager

    // alertHistory caches the alerts that were sent, keyed by alert ID.
    alertHistory *cache.Cache
}

// AlertRule describes one threshold rule over a named metric.
type AlertRule struct {
    ID          string        `json:"id"`
    Name        string        `json:"name"`
    Description string        `json:"description"`
    Metric      string        `json:"metric"`
    Condition   string        `json:"condition"` // one of >, <, ==, !=, >=, <=
    Threshold   float64       `json:"threshold"`
    Duration    time.Duration `json:"duration"`
    Severity    int           `json:"severity"` // 1 = low, 2 = medium, 3 = high
    Enabled     bool          `json:"enabled"`
    LastTrigger time.Time     `json:"last_trigger"`
}

// Alert is one fired instance of an AlertRule.
type Alert struct {
    ID          string             `json:"id"`
    RuleID      string             `json:"rule_id"`
    Name        string             `json:"name"`
    Severity    int                `json:"severity"`
    Message     string             `json:"message"`
    Values      map[string]float64 `json:"values"`
    TriggeredAt time.Time          `json:"triggered_at"`
}

// EvaluateRules 评估告警规则 func (am *AlertManager) EvaluateRules() { for _, rule := range am.rules { if !rule.Enabled { continue }

    // 评估规则
    triggered, values := am.evaluateRule(rule)
    if triggered {
        // 检查是否在静默期内
        if am.isInSilencePeriod(rule) {
            continue
        }
        
        // 创建告警
        alert := &Alert{
            ID:          generateAlertID(),
            RuleID:      rule.ID,
            Name:        rule.Name,
            Severity:    rule.Severity,
            Message:     fmt.Sprintf("Alert rule '%s' triggered", rule.Name),
            Values:      values,
            TriggeredAt: time.Now(),
        }
        
        // 发送告警
        am.notifier.SendAlert(alert)
        
        // 记录告警历史
        am.alertHistory.Set(alert.ID, alert, cache.DefaultExpiration)
        
        // 更新规则触发时间
        rule.LastTrigger = time.Now()
    }
}

}

// evaluateRule 评估单个规则 func (am *AlertManager) evaluateRule(rule *AlertRule) (bool, map[string]float64) { // 这里简化实现,实际应用中需要根据具体指标进行评估 switch rule.Metric { case "cpu_usage": // 获取当前CPU使用率 cpuUsage := am.getCurrentCPUUsage() return am.evaluateCondition(cpuUsage, rule.Condition, rule.Threshold), map[string]float64{"cpu_usage": cpuUsage}

case "memory_usage":
    // 获取当前内存使用率
    memoryUsage := am.getCurrentMemoryUsage()
    return am.evaluateCondition(memoryUsage, rule.Condition, rule.Threshold), 
           map[string]float64{"memory_usage": memoryUsage}
    
case "error_rate":
    // 获取当前错误率
    errorRate := am.getCurrentErrorRate()
    return am.evaluateCondition(errorRate, rule.Condition, rule.Threshold), 
           map[string]float64{"error_rate": errorRate}
    
default:
    return false, nil
}

}

// evaluateCondition 评估条件 func (am *AlertManager) evaluateCondition(value float64, condition string, threshold float64) bool { switch condition { case ">": return value > threshold case "<": return value < threshold case "==": return value == threshold case "!=": return value != threshold case ">=": return value >= threshold case "<=": return value <= threshold default: return false } }


### 告警通知

``go
// AlertNotifier holds the notification channels that alerts are sent through.
type AlertNotifier struct {
    // channels is the list of configured notification channels.
    channels []NotificationChannel
}

// NotificationChannel is the interface every alert delivery channel implements.
type NotificationChannel interface {
    // Send delivers alert through the channel and reports any failure.
    Send(alert *Alert) error
    // Type returns a string for the channel; presumably its kind
    // (e.g. email, webhook) — confirm against the implementations.
    Type() string
}

// EmailNotificationChannel delivers alerts by e-mail.
type EmailNotificationChannel struct {
    // config is the e-mail configuration for this channel.
    config EmailConfig
}

// Send formats alert as a plain-text e-mail and sends it.
//
// The subject is "[Alert <severity>] <name>". The body lists the alert's
// name, severity, message, trigger time (formatted as "2006-01-02 15:04:05")
// and formatted values. The error from sendEmail is returned as is.
func (e *EmailNotificationChannel) Send(alert *Alert) error {
    // The template text is part of the message and must stay byte-for-byte,
    // including the leading newline and the trailing indentation.
    const bodyTemplate = `
Alert Details:
- Name: %s
- Severity: %s
- Message: %s
- Triggered At: %s

Values:
%s
    `

    subject := fmt.Sprintf("[Alert %s] %s", getSeverityString(alert.Severity), alert.Name)

    bodyArgs := []interface{}{
        alert.Name,
        getSeverityString(alert.Severity),
        alert.Message,
        alert.TriggeredAt.Format("2006-01-02 15:04:05"),
        formatAlertValues(alert.Values),
    }

    return e.sendEmail(subject, fmt.Sprintf(bodyTemplate, bodyArgs...))
}

// WebhookNotificationChannel delivers alerts by HTTP POST to a webhook URL.
type WebhookNotificationChannel struct {
    // config supplies the webhook URL (config.URL).
    config WebhookConfig
    // client performs the HTTP request.
    client *http.Client
}

// Send posts alert as a JSON document to the configured webhook URL.
//
// The request carries Content-Type "application/json" and an
// X-Alert-Severity header holding the numeric severity. A response status
// of 400 or above is reported as an error. Marshal, request-construction
// and transport errors are wrapped with %w, so the underlying cause stays
// reachable through errors.Is / errors.As. When no client was configured,
// http.DefaultClient is used instead of dereferencing a nil client.
func (w *WebhookNotificationChannel) Send(alert *Alert) error {
    payload := map[string]interface{}{
        "alert_id":     alert.ID,
        "rule_id":      alert.RuleID,
        "name":         alert.Name,
        "severity":     alert.Severity,
        "message":      alert.Message,
        "values":       alert.Values,
        "triggered_at": alert.TriggeredAt,
    }

    body, err := json.Marshal(payload)
    if err != nil {
        return fmt.Errorf("failed to marshal alert data: %w", err)
    }

    req, err := http.NewRequest(http.MethodPost, w.config.URL, bytes.NewReader(body))
    if err != nil {
        return fmt.Errorf("failed to create webhook request: %w", err)
    }

    req.Header.Set("Content-Type", "application/json")
    req.Header.Set("X-Alert-Severity", strconv.Itoa(alert.Severity))

    // A zero-value channel has no client; fall back rather than panic.
    client := w.client
    if client == nil {
        client = http.DefaultClient
    }

    resp, err := client.Do(req)
    if err != nil {
        return fmt.Errorf("failed to send webhook request: %w", err)
    }
    defer resp.Body.Close()

    if resp.StatusCode >= 400 {
        return fmt.Errorf("webhook request failed with status: %d", resp.StatusCode)
    }

    return nil
}

总结

通过本节的学习,我们了解了如何构建一个完整的可观测性系统:

  1. 指标监控:通过Prometheus等工具收集和展示系统关键指标
  2. 日志系统:实现结构化日志记录,便于问题排查和审计
  3. 链路追踪:通过OpenTracing等标准追踪请求在分布式系统中的完整调用链路
  4. 监控告警:基于收集的指标和日志实现自动监控告警

这套可观测性系统能够帮助我们全面监控和追踪每一条通知的生命周期,及时发现和解决问题,保障系统的稳定运行。在实际应用中,我们可以根据具体业务需求和系统特点对这套系统进行调整和优化。

在下一节中,我们将探讨服务治理方案,包括熔断、限流、降级等技术如何保障系统稳定。