前端监控平台/监控SDK的架构设计和难点亮点

107 阅读4分钟

前端监控平台/监控SDK的架构设计和难点亮点---youkeit.xyz/14907/

前端监控SDK:从指标采集到智能决策的全链路实践

现代前端监控体系全景图

现代前端监控已从简单的错误收集发展为覆盖"采集-传输-存储-分析-决策"全链路的智能体系。一个完整的监控SDK需要解决以下核心问题:

  1. 全维度指标采集:用户体验指标、性能指标、业务指标、错误指标
  2. 高效可靠传输:应对网络不稳定、数据量大等场景
  3. 智能数据分析:异常检测、根因分析、趋势预测
  4. 决策支撑:报警熔断、自动修复、体验优化建议

(图:前端监控体系架构图。原图转存失败,此处未能显示。)

核心代码实现

1. 多维度指标采集器

// metrics-collector.ts
/** A single data point buffered for upload. */
interface Metric {
  /** Metric identifier, e.g. `js_error` or `LCP`. */
  name: string;
  /** Numeric measurement reported for the metric. */
  value: number;
  /** String dimensions attached to the data point. */
  tags: Record<string, string>;
  /** `Date.now()` at the moment the metric was recorded. */
  timestamp: number;
}

/** Tag values accepted by `MetricsCollector.push`; they are normalized to strings. */
type MetricTagValue = string | number | boolean | null | undefined;

/**
 * Collects performance, error and custom business metrics in memory and
 * uploads them in batches to `/api/metrics`.
 *
 * A batch is uploaded every `FLUSH_INTERVAL` ms, whenever the buffer reaches
 * `MAX_METRICS` entries, and when the page is being hidden or unloaded.
 */
class MetricsCollector {
  private metrics: Metric[] = [];
  private readonly MAX_METRICS = 100;
  private readonly FLUSH_INTERVAL = 10000; // 10 seconds
  /** Upper bound for the buffer while uploads keep failing. */
  private readonly MAX_BUFFERED = 1000;
  private readonly ENDPOINT = '/api/metrics';

  constructor() {
    this.setupAutoFlush();
    this.setupPerformanceObserver();
    this.setupErrorTracking();
  }

  /** Schedules periodic uploads and a final upload when the page goes away. */
  private setupAutoFlush() {
    setInterval(() => void this.flush(), this.FLUSH_INTERVAL);
    window.addEventListener('beforeunload', () => void this.flush());
    // `beforeunload` is not reliably fired (notably on mobile), so also flush
    // on `pagehide` and when the document becomes hidden. A second flush on an
    // empty buffer is a no-op.
    window.addEventListener('pagehide', () => void this.flush());
    if (typeof document !== 'undefined') {
      document.addEventListener('visibilitychange', () => {
        if (document.visibilityState === 'hidden') void this.flush();
      });
    }
  }

  /** Subscribes to long tasks, resource timings and Web Vitals. */
  private setupPerformanceObserver() {
    if (!window.PerformanceObserver) return;

    // Long tasks. `containerId` lives on the attribution records of the entry,
    // not on the entry itself.
    this.observeEntries('longtask', entry => {
      const attribution = (entry as PerformanceEntry & {
        attribution?: ReadonlyArray<{ containerId?: string }>;
      }).attribution;
      this.push('long_task', entry.duration, {
        containerId: attribution?.[0]?.containerId || ''
      });
    });

    // Resource loads. The collector's own uploads are skipped, otherwise every
    // flush would produce a new metric and the buffer would never drain.
    this.observeEntries('resource', entry => {
      const resource = entry as PerformanceResourceTiming;
      if (this.isOwnRequest(resource.name)) return;
      this.push('resource_load', resource.duration, {
        name: resource.name,
        type: resource.initiatorType,
        size: resource.transferSize
      });
    });

    // Web Vitals (LCP, FID, CLS) via a lazily loaded `web-vitals` module.
    import('web-vitals')
      .then(webVitals => {
        // Kept for compatibility with code that reads the global.
        (window as any).webVitals = webVitals;

        const api = webVitals as Record<string, unknown>;
        for (const metric of ['LCP', 'FID', 'CLS']) {
          // The library exports `onLCP`-style functions (older releases use
          // `getLCP`); a metric missing from the installed version is skipped.
          const subscribe = api[`on${metric}`] ?? api[`get${metric}`];
          if (typeof subscribe !== 'function') continue;
          subscribe((data: { value: number; id?: string; rating?: string }) => {
            this.push(metric, data.value, {
              id: data.id,
              rating: data.rating
            });
          });
        }
      })
      .catch(err => {
        console.warn('web-vitals could not be loaded:', err);
      });
  }

  /** Observes one performance entry type without letting an unsupported type break setup. */
  private observeEntries(type: string, handle: (entry: PerformanceEntry) => void) {
    try {
      new PerformanceObserver(list => {
        list.getEntries().forEach(entry => handle(entry));
      }).observe({ type, buffered: true });
    } catch (err) {
      // observe() may throw for entry types or options the engine does not support.
      console.warn(`PerformanceObserver type "${type}" is unavailable:`, err);
    }
  }

  /** True when `url` points at this collector's own upload endpoint. */
  private isOwnRequest(url: string): boolean {
    try {
      return new URL(url, window.location.href).pathname === this.ENDPOINT;
    } catch {
      return false;
    }
  }

  /** Registers listeners for script errors, unhandled rejections and failed resource loads. */
  private setupErrorTracking() {
    // Script errors
    window.addEventListener('error', event => {
      this.push('js_error', 1, {
        message: event.message,
        filename: event.filename,
        lineno: event.lineno,
        colno: event.colno,
        stack: event.error?.stack
      });
    });

    // Unhandled promise rejections
    window.addEventListener('unhandledrejection', event => {
      this.push('promise_error', 1, {
        reason: event.reason?.toString()
      });
    });

    // Failed resource loads: registered in the capture phase; only events whose
    // target has a tagName (i.e. an element) are recorded.
    window.addEventListener('error', event => {
      const target = event.target as HTMLElement;
      if (target && target.tagName) {
        this.push('resource_error', 1, {
          tag: target.tagName,
          src: (target as any).src || (target as any).href,
          type: target.getAttribute('type')
        });
      }
    }, true);
  }

  /**
   * Records a metric and uploads the buffer once it holds `MAX_METRICS` entries.
   *
   * @param name  Metric identifier.
   * @param value Numeric measurement.
   * @param tags  Extra dimensions. Values are converted with `String()`;
   *              `null`/`undefined` values are omitted. `page` is always added
   *              and `env` is added when `process.env.NODE_ENV` is available.
   */
  push(name: string, value: number, tags: Record<string, MetricTagValue> = {}) {
    const normalized: Record<string, string> = {};
    for (const [key, raw] of Object.entries(tags)) {
      if (raw != null) normalized[key] = String(raw);
    }
    normalized.page = window.location.pathname;

    // `process` does not exist in a browser unless a bundler substitutes the
    // expression; try/catch keeps that static substitution working.
    let env: string | undefined;
    try {
      env = process.env.NODE_ENV;
    } catch {
      env = undefined;
    }
    if (env !== undefined) normalized.env = env;

    this.metrics.push({
      name,
      value,
      tags: normalized,
      timestamp: Date.now()
    });

    if (this.metrics.length >= this.MAX_METRICS) {
      void this.flush();
    }
  }

  /**
   * Uploads all buffered metrics, using `navigator.sendBeacon` first and
   * falling back to `fetch` when the beacon is refused or unavailable.
   *
   * On failure (network error or non-2xx response) the batch is put back at
   * the front of the buffer; the buffer is trimmed to the newest
   * `MAX_BUFFERED` entries so a long outage cannot grow it without bound.
   */
  async flush(): Promise<void> {
    if (this.metrics.length === 0) return;

    // Detach the batch so metrics pushed while the upload is in flight go to
    // a fresh buffer.
    const batch = this.metrics;
    this.metrics = [];

    try {
      const payload = JSON.stringify(batch);
      const beaconAccepted =
        typeof navigator.sendBeacon === 'function' &&
        navigator.sendBeacon(
          this.ENDPOINT,
          new Blob([payload], { type: 'application/json' })
        );

      if (!beaconAccepted) {
        const response = await fetch(this.ENDPOINT, {
          method: 'POST',
          body: payload,
          headers: { 'Content-Type': 'application/json' },
          keepalive: true
        });
        // fetch only rejects on network errors; treat HTTP errors as failures too.
        if (!response.ok) throw new Error(`HTTP ${response.status}`);
      }
    } catch (err) {
      console.error('上报指标失败:', err);
      // The detached batch and the live buffer never share entries, so simply
      // prepend the batch (older data first) and cap the total size.
      this.metrics = [...batch, ...this.metrics].slice(-this.MAX_BUFFERED);
    }
  }
}

// Usage example: constructing the collector installs its timers, observers
// and error listeners.
const collector = new MetricsCollector();

// Custom business metrics: name, numeric value, optional tags
collector.push('checkout_step', 1, { step: 'cart_view' });
collector.push('api_response_time', 235, { endpoint: '/user/info' });

2. 智能数据传输控制器

// data-transmitter.ts
/** One payload waiting in the transmitter queue. */
interface QueuedItem {
  /** Random identifier assigned when the item is enqueued. */
  id: string;
  /** Caller-supplied payload; serialized with `JSON.stringify` when sent. */
  data: unknown;
  /** Number of failed send attempts so far. */
  retries: number;
  /** `Date.now()` at enqueue time. */
  timestamp: number;
  /** True for items enqueued with `urgent = true`; such items are sent first. */
  urgent?: boolean;
}

/** Runtime check for entries restored from localStorage. */
function isQueuedItem(candidate: unknown): candidate is QueuedItem {
  if (typeof candidate !== 'object' || candidate === null) return false;
  const item = candidate as Record<string, unknown>;
  return (
    typeof item.id === 'string' &&
    typeof item.retries === 'number' &&
    typeof item.timestamp === 'number' &&
    'data' in item
  );
}

/**
 * Queues payloads and posts them to `/api/batch` in batches of `BATCH_SIZE`.
 *
 * The queue is bounded by `MAX_QUEUE_SIZE`, mirrored to localStorage so it
 * survives reloads, paused while the browser reports being offline, and
 * failed batches are retried up to `MAX_RETRIES` times per item.
 */
class DataTransmitter {
  private queue: QueuedItem[] = [];
  private readonly MAX_RETRIES = 3;
  private readonly MAX_QUEUE_SIZE = 500;
  private readonly BATCH_SIZE = 20;
  private readonly RETRY_DELAY = [1000, 5000, 10000]; // delay before retry 1, 2, 3 (ms)
  private readonly STORAGE_KEY = 'monitoring_queue';
  private isOnline = navigator.onLine;
  private isSending = false;
  private retryTimer: ReturnType<typeof setTimeout> | undefined;

  constructor() {
    this.setupConnectivityListener();
    this.setupVisibilityListener();
    this.setupStorage();
  }

  /** Tracks online/offline state and resumes sending when connectivity returns. */
  private setupConnectivityListener() {
    window.addEventListener('online', () => {
      this.isOnline = true;
      void this.processQueue();
    });

    window.addEventListener('offline', () => {
      this.isOnline = false;
    });
  }

  /** Resumes sending when the page becomes visible; saves the queue when it is hidden. */
  private setupVisibilityListener() {
    document.addEventListener('visibilitychange', () => {
      if (document.visibilityState === 'visible') {
        void this.processQueue();
      } else {
        // The page may be discarded while hidden, so save what is pending now.
        this.persistQueue();
      }
    });
  }

  /** Restores a previously persisted queue and starts periodic persistence. */
  private setupStorage() {
    try {
      const saved = localStorage.getItem(this.STORAGE_KEY);
      if (saved) {
        const parsed: unknown = JSON.parse(saved);
        // Stored data is untrusted: keep only well-formed items, bounded in number.
        if (Array.isArray(parsed)) {
          this.queue = parsed.filter(isQueuedItem).slice(-this.MAX_QUEUE_SIZE);
        }
      }
    } catch (err) {
      console.error('恢复队列失败:', err);
    }

    // Persist periodically
    setInterval(() => this.persistQueue(), 5000);

    // Send whatever was restored instead of waiting for the next trigger.
    void this.processQueue();
  }

  /** Mirrors the queue to localStorage; the key is removed when the queue is empty. */
  private persistQueue() {
    try {
      if (this.queue.length > 0) {
        localStorage.setItem(this.STORAGE_KEY, JSON.stringify(this.queue));
      } else {
        localStorage.removeItem(this.STORAGE_KEY);
      }
    } catch (err) {
      // Storage can be disabled or over quota; losing persistence must not stop sending.
      console.error('持久化队列失败:', err);
    }
  }

  /**
   * Enqueues `data` and attempts to send the queue.
   *
   * @param data   Payload to transmit (must be JSON-serializable).
   * @param urgent When true the item is sent ahead of non-urgent items.
   * @returns A promise that settles once the send attempt triggered by this
   *          call has finished (immediately if a send is already running or
   *          the browser is offline).
   */
  async send(data: unknown, urgent = false) {
    const item: QueuedItem = {
      id: Math.random().toString(36).slice(2),
      data,
      retries: 0,
      timestamp: Date.now(),
      urgent
    };

    if (urgent) {
      this.queue.unshift(item);
    } else {
      this.queue.push(item);
    }
    this.evictOverflow();

    await this.processQueue();
  }

  /** Drops the oldest non-urgent items (or the head, if all are urgent) until the queue fits. */
  private evictOverflow() {
    while (this.queue.length > this.MAX_QUEUE_SIZE) {
      let victim = -1;
      this.queue.forEach((item, index) => {
        if (item.urgent) return;
        const current = victim === -1 ? undefined : this.queue[victim];
        if (current === undefined || item.timestamp < current.timestamp) {
          victim = index;
        }
      });
      this.queue.splice(victim === -1 ? 0 : victim, 1);
    }
  }

  /** Sort order: urgent first, then most-retried, then oldest. */
  private static byPriority(a: QueuedItem, b: QueuedItem): number {
    return (
      Number(Boolean(b.urgent)) - Number(Boolean(a.urgent)) ||
      b.retries - a.retries ||
      a.timestamp - b.timestamp
    );
  }

  /** Sends batches until the queue is empty, a batch fails, or the browser goes offline. */
  private async processQueue() {
    if (!this.isOnline || this.isSending || this.queue.length === 0) {
      return;
    }

    this.isSending = true;

    try {
      while (this.isOnline && this.queue.length > 0) {
        this.queue.sort(DataTransmitter.byPriority);

        const batch = this.queue.slice(0, this.BATCH_SIZE);
        const success = await this.sendBatch(batch);
        if (!success) break;

        // Remove exactly the items that were sent: the queue may have received
        // new items while the request was in flight.
        const sent = new Set(batch);
        this.queue = this.queue.filter(item => !sent.has(item));
        this.persistQueue();
      }
    } catch (err) {
      console.error('处理队列失败:', err);
    } finally {
      this.isSending = false;
    }
  }

  /**
   * Posts one batch.
   *
   * @returns `true` on a 2xx response. On failure the retry count of every
   *          item in the batch is incremented, items beyond `MAX_RETRIES` are
   *          dropped, a single retry is scheduled, and `false` is returned.
   */
  private async sendBatch(batch: QueuedItem[]): Promise<boolean> {
    try {
      const response = await fetch('/api/batch', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(batch.map(item => item.data)),
        keepalive: true
      });

      if (!response.ok) throw new Error(`HTTP ${response.status}`);

      return true;
    } catch (err) {
      console.error('批量发送失败:', err);

      for (const item of batch) item.retries++;

      // Drop items that have exhausted their retries
      this.queue = this.queue.filter(item => item.retries <= this.MAX_RETRIES);
      this.persistQueue();

      if (this.queue.length > 0) {
        const attempts = batch
          .filter(item => item.retries <= this.MAX_RETRIES)
          .map(item => item.retries);
        const attempt = attempts.length > 0 ? Math.min(...attempts) : 1;
        const delay =
          this.RETRY_DELAY[Math.min(attempt, this.RETRY_DELAY.length) - 1] ?? 1000;
        this.scheduleRetry(delay);
      }

      return false;
    }
  }

  /** Schedules one deferred `processQueue` call; does nothing if one is already pending. */
  private scheduleRetry(delay: number) {
    if (this.retryTimer !== undefined) return;
    this.retryTimer = setTimeout(() => {
      this.retryTimer = undefined;
      void this.processQueue();
    }, delay);
  }

  /**
   * Placeholder for payload compression; currently returns plain JSON and is
   * not called by the class. A real project could use a library such as
   * lz-string here.
   */
  private compress(data: any): string {
    return JSON.stringify(data);
  }
}

3. 智能决策引擎

// decision-engine.ts
/** A rule evaluated against every incoming value of one metric. */
type AlertRule = {
  /** Rule category; included in log and alert output. */
  type: 'threshold' | 'trend' | 'anomaly';
  /** Name of the metric the rule applies to. */
  metric: string;
  /**
   * Returns true when the rule should fire.
   * `value` is the incoming sample; `history` holds the previously recorded
   * samples of the same metric (oldest first, excluding `value`).
   */
  condition: (value: number, history: number[]) => boolean;
  /** Severity label used in log and alert output. */
  severity: 'warning' | 'error' | 'critical';
  /** What to do when the rule fires. */
  action: 'alert' | 'rollback' | 'throttle';
};

/**
 * Keeps a bounded history per metric, evaluates alert rules on every new
 * sample and dispatches the configured action when a rule fires.
 */
class DecisionEngine {
  private rules: AlertRule[] = [];
  // A Map avoids collisions with Object.prototype keys such as "constructor".
  private metricHistory = new Map<string, number[]>();
  private readonly HISTORY_SIZE = 100;

  constructor() {
    this.setupDefaultRules();
  }

  /** Registers the built-in threshold, trend and anomaly rules. */
  private setupDefaultRules() {
    // Threshold rule: fires when a single reported js_error value exceeds 10.
    this.addRule({
      type: 'threshold',
      metric: 'js_error',
      condition: (value) => value > 10,
      severity: 'error',
      action: 'alert'
    });

    // Trend rule: value is more than twice the average of at least 5 prior samples.
    this.addRule({
      type: 'trend',
      metric: 'api_response_time',
      condition: (value, history) => {
        if (history.length < 5) return false;
        const avg = history.reduce((sum, v) => sum + v, 0) / history.length;
        return value > avg * 2;
      },
      severity: 'warning',
      action: 'alert'
    });

    // Anomaly rule (3-sigma): value is more than three standard deviations
    // above the mean of at least 30 prior samples.
    this.addRule({
      type: 'anomaly',
      metric: 'LCP',
      condition: (value, history) => {
        if (history.length < 30) return false;

        const mean = history.reduce((sum, v) => sum + v, 0) / history.length;
        const variance =
          history.reduce((sum, v) => sum + Math.pow(v - mean, 2), 0) / history.length;
        const stdDev = Math.sqrt(variance);

        return value > mean + 3 * stdDev;
      },
      severity: 'critical',
      action: 'throttle'
    });
  }

  /** Adds a rule; it applies to samples processed from now on. */
  addRule(rule: AlertRule) {
    this.rules.push(rule);
  }

  /**
   * Records `value` for `metric` and evaluates every rule registered for it.
   *
   * Rules are given the history as it was before this sample, so a spike is
   * compared against the baseline rather than against a baseline that already
   * contains the spike. Non-finite values are ignored because a single NaN
   * would invalidate every average computed from the history.
   */
  processMetric(metric: string, value: number) {
    if (!Number.isFinite(value)) return;

    let history = this.metricHistory.get(metric);
    if (!history) {
      history = [];
      this.metricHistory.set(metric, history);
    }

    // Snapshot of the prior samples, taken before the new one is stored.
    const prior = [...history];

    history.push(value);
    if (history.length > this.HISTORY_SIZE) {
      history.shift();
    }

    for (const rule of this.rules) {
      if (rule.metric === metric && rule.condition(value, prior)) {
        this.triggerAction(rule, value);
      }
    }
  }

  /** Logs the fired rule and dispatches its action. */
  private triggerAction(rule: AlertRule, value: number) {
    console.log(`[${rule.severity.toUpperCase()}] ${rule.metric}触发规则:`, value);

    switch (rule.action) {
      case 'alert':
        this.sendAlert(rule, value);
        break;
      case 'rollback':
        this.triggerRollback(rule);
        break;
      case 'throttle':
        this.throttleSystem(rule);
        break;
      default: {
        // Compile-time exhaustiveness check; untyped callers get a warning.
        const unhandled: never = rule.action;
        console.warn('Unknown rule action:', unhandled);
      }
    }
  }

  /** Posts an alert to `/api/alert`; delivery failures are logged, never thrown. */
  private sendAlert(rule: AlertRule, value: number) {
    const message = `[前端监控] ${rule.metric}异常: ${value} (${rule.type})`;

    // A real project could forward this to WeCom, DingTalk, Slack, etc.
    void fetch('/api/alert', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        title: `${rule.severity.toUpperCase()}告警`,
        message,
        metric: rule.metric,
        value,
        type: rule.type
      })
    })
      .then(response => {
        if (!response.ok) throw new Error(`HTTP ${response.status}`);
      })
      .catch(err => {
        console.error('Failed to deliver alert:', err);
      });
  }

  /** Placeholder: only logs. A real project could trigger a CI/CD rollback here. */
  private triggerRollback(rule: AlertRule) {
    console.log('执行回滚操作...');
  }

  /** Placeholder: only logs. E.g. disable non-core features or reduce sampling. */
  private throttleSystem(rule: AlertRule) {
    console.log('执行降级操作...');
  }

  /**
   * Fits a least-squares line through the recorded samples of `metric`
   * (x = sample index) and extrapolates one step ahead.
   *
   * @returns `current` (latest sample), `predicted` (value of the fitted line
   *          at the next index) and `trend`, which is 'up'/'down' when the
   *          slope is above 0.1 / below -0.1 and 'stable' otherwise. With
   *          fewer than 10 samples the result is
   *          `{ current: 0, predicted: 0, trend: 'stable' }`.
   */
  predictTrend(metric: string): { current: number; predicted: number; trend: 'up' | 'down' | 'stable' } {
    const history = this.metricHistory.get(metric) ?? [];
    if (history.length < 10) {
      return { current: 0, predicted: 0, trend: 'stable' };
    }

    // Simple linear regression
    const n = history.length;
    let xSum = 0;
    let ySum = 0;
    let xySum = 0;
    let xxSum = 0;
    history.forEach((y, x) => {
      xSum += x;
      ySum += y;
      xySum += x * y;
      xxSum += x * x;
    });

    const slope = (n * xySum - xSum * ySum) / (n * xxSum - xSum * xSum);
    const intercept = (ySum - slope * xSum) / n;

    const current = history[n - 1] ?? 0;
    // Indices run 0..n-1, so the next sample sits at x = n.
    const predicted = slope * n + intercept;

    return {
      current,
      predicted,
      trend: slope > 0.1 ? 'up' : slope < -0.1 ? 'down' : 'stable'
    };
  }
}

// Usage example
const engine = new DecisionEngine();

// Custom business rule
engine.addRule({
  type: 'threshold',
  metric: 'checkout_abandon_rate',
  condition: (value) => value > 0.7, // checkout abandonment rate above 70%
  severity: 'error',
  action: 'alert'
});

// Process a metric. The collector and the engine are not wired together in
// this snippet, so the same value is handed to each of them explicitly.
collector.push('checkout_abandon_rate', 0.75);
engine.processMetric('checkout_abandon_rate', 0.75);

// Get a prediction. With fewer than 10 recorded LCP samples the result is a
// 'stable' placeholder rather than a real forecast.
const lcpTrend = engine.predictTrend('LCP');
console.log('LCP趋势预测:', lcpTrend);

全链路亮点设计

1. 智能采样与降级策略

// adaptive-sampler.js
class AdaptiveSampler {
  constructor() {
    this.sampleRates = {
      error: 1.0,    // 错误全采集
      performance: 0.2,
      business: 0.1,
      log: 0.01
    };
    
    this.systemLoad = 0;
    this.monitorSystemLoad();
  }

  monitorSystemLoad() {
    // 基于内存和CPU使用率计算系统负载
    setInterval(() => {
      if (window.performance && performance.memory) {
        const memoryUsage = performance.memory.usedJSHeapSize / 
                          performance.memory.jsHeapSizeLimit;
        
        // 简化版CPU负载估算(实际项目可使用Web Worker)
        const now = performance.now();
        let sum = 0;
        for (let i = 0; i < 1000000; i++) {
          sum += Math.random();
        }
        const cpuUsage = (performance.now() - now) / 10;
        
        this.systemLoad = Math.max(memoryUsage, cpuUsage);
        
        // 动态调整采样率
        this.adjustSampleRates();
      }
    }, 5000);
  }

  adjustSampleRates() {
    if (this.systemLoad > 0.8) {
      // 高负载时降级采样
      this.sampleRates.performance = Math.max(0.05, this.sampleRates.performance * 0.5);
      this.sampleRates.business = Math.max(0.02, this.sampleRates.business * 0.5);
      this.sampleRates.log = 0;
    } else if (this.systemLoad < 0.3) {
      // 低负载时恢复采样
      this.sampleRates.performance = Math.min(0.3, this.sampleRates.performance * 1.5);
      this.sampleRates.business = Math.min(0.2, this.sampleRates.business * 1.5);
      this.sampleRates.log = 0.01;
    }
  }

  shouldSample(type) {
    if (!this.sampleRates[type]) return true;
    
    // 重要用户全采样(可根据userId哈希决定)
    if (this.isImportantUser()) return true;
    
    return Math.random() < this.sampleRates[type];