概述

本文主要深入分析 Grafana Ngalert 中告警规则的调度和执行过程，从源码级别进行探讨，以了解其优秀的设计理念和待改进的地方。

调度器

// schedule holds the state of the alert rule scheduler. Run starts a ticker
// that fires every baseInterval, and processTick decides on each tick which
// alert rules have to be evaluated.
type schedule struct {
    // Tick interval of the scheduler; 10s by default.
    baseInterval time.Duration

    // Registry of rule routines: every alert rule gets its own channel and goroutine.
    registry ruleRegistry

    // Logged at startup as "maxAttempts".
    // NOTE(review): presumably the retry limit handed to each rule routine — confirm.
    maxAttempts int64

    clock clock.Clock

    log log.Logger

    evaluatorFactory eval.EvaluatorFactory

    ruleStore RulesStore
    // State manager.
    stateManager *state.Manager

    jitterEvaluations JitterStrategy

    // Scheduler metrics. Run, schedulePeriodic and processTick read
    // sch.metrics (Ticker, BehindSeconds, SchedulePeriodicDuration,
    // EvaluationMissed), so the field has to be part of the struct.
    metrics *metrics.Scheduler

    alertsSender AlertsSender
    // Lower bound for a rule's evaluation interval; processTick raises
    // shorter intervals to this value.
    minRuleInterval time.Duration

    // schedulableAlertRules contains the alert rules that are considered for
    // evaluation in the current tick. The evaluation of an alert rule in the
    // current tick depends on its evaluation interval and when it was
    // last evaluated.
    schedulableAlertRules alertRulesRegistry
}

启动

// Run starts the scheduler: it creates a ticker firing every baseInterval and
// drives the evaluation loop until ctx is cancelled. A failure of the loop is
// only logged; Run itself always returns nil.
func (sch *schedule) Run(ctx context.Context) error {
    sch.log.Info("Starting scheduler", "tickInterval", sch.baseInterval, "maxAttempts", sch.maxAttempts)

    evalTicker := ticker.New(sch.clock, sch.baseInterval, sch.metrics.Ticker)
    defer evalTicker.Stop()

    loopErr := sch.schedulePeriodic(ctx, evalTicker)
    if loopErr != nil {
        sch.log.Error("Failure while running the rule evaluation loop", "error", loopErr)
    }
    return nil
}

// schedulePeriodic is the scheduler's main loop. Every tick received from t is
// handed to processTick; once ctx is cancelled it waits for all rule
// evaluation routines started through the errgroup and returns their error.
func (sch *schedule) schedulePeriodic(ctx context.Context, t *ticker.T) error {
    routines, ctx := errgroup.WithContext(ctx)
    for {
        select {
        case <-ctx.Done():
            // Block until every rule evaluation routine has stopped.
            return routines.Wait()
        case tickTime := <-t.C:
            // Round(0) strips the monotonic clock reading from the start time.
            // Ticks coming from the ticker and time.Now() may both carry a
            // monotonic reading, and subtracting those would not yield the
            // wall-clock delta we want to report.
            startedAt := time.Now().Round(0)
            lag := startedAt.Sub(tickTime)
            sch.metrics.BehindSeconds.Set(lag.Seconds())

            sch.processTick(ctx, routines, tickTime)

            elapsed := time.Since(startedAt)
            sch.metrics.SchedulePeriodicDuration.Observe(elapsed.Seconds())
        }
    }
}

每隔baseInterval调度一次，默认 10s。

执行

// processTick handles a single scheduler tick:
//  1. refreshes the local set of schedulable rules and learns which rules were updated,
//  2. makes sure every rule has a rule routine (starting a goroutine for new ones),
//  3. picks the rules whose interval (plus jitter offset) matches this tick,
//  4. dispatches their evaluations, staggered across one baseInterval,
//  5. stops and unregisters routines of rules that no longer exist.
//
// It returns the items scheduled on this tick, the keys of the rules that were
// registered before but are absent now (the deleted ones), and the rules that
// were only notified about an update.
//
// NOTE(review): ruleFactory and folderTitle are used below but are not declared
// anywhere in this excerpt (folderTitles is declared but never read) —
// presumably elided from the original source; confirm before compiling.
func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.Group, tick time.Time) ([]readyToRunItem, map[ngmodels.AlertRuleKey]struct{}, []ngmodels.AlertRuleKeyWithVersion) {
    // Index of the current time slice: tick time in seconds divided by the base interval in seconds.
    tickNum := tick.Unix() / int64(sch.baseInterval.Seconds())

    // update the local registry. If there was a difference between the previous state and the current new state, rulesDiff will contains keys of rules that were updated.
    rulesDiff, err := sch.updateSchedulableAlertRules(ctx)
    updated := rulesDiff.updated
    if updated == nil { // make sure map is not nil
        updated = map[ngmodels.AlertRuleKey]struct{}{}
    }
    if err != nil {
       // The error is only logged; the tick continues with whatever schedulableAlertRules holds.
       sch.log.Error("Failed to update alert rules", "error", err)
    }

    // this is the new current state. rulesDiff contains the previously existing rules that were different between this state and the previous state.
    alertRules, folderTitles := sch.schedulableAlertRules.all()

    // Starts out with the keys of all registered rule routines; every rule seen
    // in the loop below is removed, so what remains are the deleted rules.
    registeredDefinitions := sch.registry.keyMap()

    readyToRun := make([]readyToRunItem, 0)
    updatedRules := make([]ngmodels.AlertRuleKeyWithVersion, 0, len(updated)) // this is needed for tests only


    for _, item := range alertRules {
       ruleRoutine, newRoutine := sch.registry.getOrCreate(ctx, item, ruleFactory)
       key := item.GetKey()
       logger := sch.log.FromContext(ctx).New(key.LogContext()...)

       // enforce minimum evaluation interval
       if item.IntervalSeconds < int64(sch.minRuleInterval.Seconds()) {
          logger.Debug("Interval adjusted", "originalInterval", item.IntervalSeconds, "adjustedInterval", sch.minRuleInterval.Seconds())
          item.IntervalSeconds = int64(sch.minRuleInterval.Seconds())
       }

       // The rule interval must be an exact multiple of the scheduler interval.
       invalidInterval := item.IntervalSeconds%int64(sch.baseInterval.Seconds()) != 0

       if newRoutine && !invalidInterval {
          // Run blocks for the lifetime of the rule; schedulePeriodic waits on this group at shutdown.
          dispatcherGroup.Go(func() error {
             return ruleRoutine.Run(key)
          })
       }

       if invalidInterval {
          // this is expected to be always false
          // given that we validate interval during alert rule updates
          logger.Warn("Rule has an invalid interval and will be ignored. Interval should be divided exactly by scheduler interval", "ruleInterval", time.Duration(item.IntervalSeconds)*time.Second, "schedulerInterval", sch.baseInterval)
          continue
       }

       // Number of ticks between two evaluations of this rule.
       itemFrequency := item.IntervalSeconds / int64(sch.baseInterval.Seconds())
       offset := jitterOffsetInTicks(item, sch.baseInterval, sch.jitterEvaluations)
       // The IntervalSeconds != 0 check comes first so that the modulo is never evaluated with itemFrequency == 0.
       isReadyToRun := item.IntervalSeconds != 0 && (tickNum%itemFrequency)-offset == 0

       if isReadyToRun {
          logger.Debug("Rule is ready to run on the current tick", "tick", tickNum, "frequency", itemFrequency, "offset", offset)
          readyToRun = append(readyToRun, readyToRunItem{ruleRoutine: ruleRoutine, Evaluation: Evaluation{
             scheduledAt: tick,
             rule:        item,
             folderTitle: folderTitle,
          }})
       }
       if _, isUpdated := updated[key]; isUpdated && !isReadyToRun {
          // if we do not need to eval the rule, check the whether rule was just updated and if it was, notify evaluation routine about that
          logger.Debug("Rule has been updated. Notifying evaluation routine")
          // Update is called from its own goroutine so that it cannot block this loop.
          go func(routine Rule, rule *ngmodels.AlertRule) {
             routine.Update(RuleVersionAndPauseStatus{
                Fingerprint: ruleWithFolder{rule: rule, folderTitle: folderTitle}.Fingerprint(),
                IsPaused:    rule.IsPaused,
             })
          }(ruleRoutine, item)
          updatedRules = append(updatedRules, ngmodels.AlertRuleKeyWithVersion{
             Version:      item.Version,
             AlertRuleKey: item.GetKey(),
          })
       }

       // remove the alert rule from the registered alert rules
       delete(registeredDefinitions, key)
    }


    // Delay between two consecutive evaluations so that all of them are spread evenly over one baseInterval.
    var step int64 = 0
    if len(readyToRun) > 0 {
       step = sch.baseInterval.Nanoseconds() / int64(len(readyToRun))
    }
    // Sort by rule UID, mainly so that in HA mode every instance runs the rules in the same order.
    slices.SortFunc(readyToRun, func(a, b readyToRunItem) int {
       return strings.Compare(a.rule.UID, b.rule.UID)
    })
    for i := range readyToRun {
       item := readyToRun[i]

       // The i-th item is signalled i*step after the tick.
       time.AfterFunc(time.Duration(int64(i)*step), func() {
          key := item.rule.GetKey()
          success, dropped := item.ruleRoutine.Eval(&item.Evaluation)
          if !success {
             sch.log.Debug("Scheduled evaluation was canceled because evaluation routine was stopped", append(key.LogContext(), "time", tick)...)
             return
          }
          if dropped != nil {
             sch.log.Warn("Tick dropped because alert rule evaluation is too slow", append(key.LogContext(), "time", tick)...)
             orgID := fmt.Sprint(key.OrgID)
             sch.metrics.EvaluationMissed.WithLabelValues(orgID, item.rule.Title).Inc()
          }
       })
    }

    // unregister and stop routines of the deleted alert rules
    toDelete := make([]ngmodels.AlertRuleKey, 0, len(registeredDefinitions))
    for key := range registeredDefinitions {
       toDelete = append(toDelete, key)
    }
    sch.deleteAlertRule(toDelete...)
    return readyToRun, registeredDefinitions, updatedRules
}

tickNum：当前调度时间的 Unix 秒数（tick.Unix()）除以调度器执行间隔（baseInterval）的秒数，得出当前调度所处的时间片序号。

rulesDiff：updateSchedulableAlertRules 函数会从数据库中加载所有的告警规则，并将其缓存到内存中。然后，返回已更新的告警规则的ruleKey。

可以在updateSchedulableAlertRules中增加告警规则的分片加载逻辑。

// getDiff compares the given rules with the ones currently held by the
// registry and reports the keys of rules that already existed but now carry a
// different Version. Rules that are new to the registry are not reported.
func (r *alertRulesRegistry) getDiff(rules map[models.AlertRuleKey]*models.AlertRule) diff {
    changed := map[models.AlertRuleKey]struct{}{}
    for ruleKey, incoming := range rules {
       existing, known := r.rules[ruleKey]
       // Only a known rule whose version moved counts as updated.
       if known && incoming.Version != existing.Version {
          changed[ruleKey] = struct{}{}
       }
    }
    return diff{updated: changed}
}

registeredDefinitions ：用于存储需要删除的（无效的）告警规则的ruleKey。初始时，它被分配为所有告警规则的ruleKey。在当前调度周期中，每个有效的告警规则都会从该 Map 中移除。因此，最终剩余在registeredDefinitions中的ruleKey就是需要被删除的那些。
遍历所有的告警规则，根据 AlertRuleKey 创建或获取其对应的 ruleRoutine。ruleRoutine 这个名称很有意思，它有点类似于 Go 语言中的 goroutine，它是对告警规则的封装。每个 ruleRoutine 都会启动一个 goroutine 来对其进行评估。
检查告警规则的执行间隔是否是调度器执行间隔的整数倍。若不是，则该告警规则无效，将不会被执行。
如果 ruleRoutine 是新建的（只有首次出现的告警规则才会新建 ruleRoutine）且其执行间隔有效，则通过 dispatcherGroup.Go 立即启动一个 goroutine 运行 ruleRoutine.Run；该 goroutine 会一直阻塞，等待后续的评估信号和更新信号。
itemFrequency：告警规则的执行间隔除以调度器执行间隔（baseInterval）的秒数，表示该规则每隔多少个 tick 执行一次。
isReadyToRun：(tickNum%itemFrequency)-offset == 0 意味着到了执行该规则的时间点，其中 offset 是下文介绍的抖动偏移量（以 tick 为单位）。
下面这段逻辑将本次 tick 中待执行的规则在一个 baseInterval（默认 10s）内按固定步长 step = baseInterval / len(readyToRun) 均匀错开（注意并非随机）：排序后的第 i 条规则延迟 i*step 后才发送评估信号，以避免在同一时刻执行太多告警规则，从而导致瞬时负载过高。

        // step is the delay between two consecutive evaluations: one
        // baseInterval divided evenly among all rules ready on this tick.
        var step int64 = 0
        if len(readyToRun) > 0 {
                step = sch.baseInterval.Nanoseconds() / int64(len(readyToRun))
        }

        for i := range readyToRun {
                item := readyToRun[i]
                // The i-th rule is triggered i*step after the tick.
                time.AfterFunc(time.Duration(int64(i)*step), func() {
                    // ...  
                })
        }

在本次调度前按照 UID 对告警规则进行排序。这是因为 Grafana 的规则调度不支持分片，在 HA 模式下，每个节点都会执行所有的告警规则。因此，为了保证每个节点在同一时刻执行相同的告警规则，需要按照告警规则 UID 进行排序。
offset：一个根据ruleGroup或者rule哈希值计算的偏移量，旨在与上述打散策略配合使用，以进一步将告警规则的评估均匀分布到每个时间窗口内，从而降低处理峰值和网络流量峰值。

告警规则

Rule：一个接口，它将调度程序和规则实现完全分离。

// Rule represents a single piece of work that is executed periodically by the ruler.
// It decouples the scheduler from the concrete rule implementation.
type Rule interface {
    // Run creates the resources that will perform the rule's work, and starts it. It blocks indefinitely, until Stop is called or another signal is sent.
    Run(key ngmodels.AlertRuleKey) error
    // Stop shuts down the rule's execution with an optional reason. It has no effect if the rule has not yet been Run.
    Stop(reason error)
    // Eval sends a signal to execute the work represented by the rule, exactly one time.
    // It has no effect if the rule has not yet been Run, or if the rule is Stopped.
    // In the alertRule implementation the bool reports whether the signal was delivered,
    // and the returned *Evaluation is a pending evaluation that was discarded, if any.
    Eval(eval *Evaluation) (bool, *Evaluation)
    // Update sends a signal to change the definition of the rule.
    Update(lastVersion RuleVersionAndPauseStatus) bool
}

Run：创建执行规则工作的资源，并启动这些资源。该进程将无限期阻塞，直到调用 Stop 方法或接收到其他信号。
Stop：停止规则的执行，可以附带一个可选的原因（reason）。若规则尚未 Run，则该方法无效。
Eval：发送一个信号，让规则执行一次任务。若规则尚未运行或已被停止，则该方法无效。
Update：发送一个信号，改变规则的定义。

alertRule：

 // alertRule is the Rule implementation for a single alert rule. Its Run loop
 // reacts to evaluation signals, update signals and cancellation of ctx.
 type alertRule struct {
     // Channel carrying evaluation signals sent by Eval and consumed by Run.
     evalCh   chan *Evaluation
     // Channel carrying update signals sent by Update and consumed by Run.
     updateCh chan RuleVersionAndPauseStatus
     // Context of the rule routine; Stop cancels it through stopFn with a reason.
     ctx      context.Context
     stopFn   util.CancelCauseFunc

     // Upper bound on evaluation attempts for one evaluation signal.
     maxAttempts          int64

     clock        clock.Clock
     sender       AlertsSender
     // State manager.
     stateManager *state.Manager
     // Evaluator factory.
     evalFactory  eval.EvaluatorFactory
     ruleProvider ruleProvider

     metrics *metrics.Scheduler
     logger  log.Logger
 }

evalCh：处理评估信号的 channel。
updateCh：处理更新信号的 channel。

ctx 和 stopFn：带取消原因（reason）的 context 及其取消函数。这里现在应该可以用标准库的 context.WithCancelCause() 代替；需要注意的是，标准库中取消原因要通过 context.Cause(ctx) 获取，ctx.Err() 仍然返回 context.Canceled。

 ctx, stop := util.WithCancelCause(parent)
 return &alertRule{
     evalCh:               make(chan *Evaluation),
     updateCh:             make(chan RuleVersionAndPauseStatus),
     stopFn:               stop,」
 }

Evaluation：评估信号

 // Evaluation is the signal the scheduler sends to a rule routine to request
 // one evaluation of an alert rule.
 type Evaluation struct {
     // Tick time for which this evaluation was scheduled.
     scheduledAt time.Time
     // Alert rule model to evaluate.
     rule        *models.AlertRule
     // Folder title of the rule. processTick sets this field when it builds
     // an Evaluation, so it has to be declared here.
     folderTitle string
 }

ruleFactory：用于创建 rule，使用闭包实现的工厂模式

  // ruleFactory creates the Rule that will run the given alert rule.
  type ruleFactory interface {
      new(context.Context, *models.AlertRule) Rule
  }

  // ruleFactoryFunc adapts a plain function (typically a closure) to the ruleFactory interface.
  type ruleFactoryFunc func(context.Context, *ngmodels.AlertRule) Rule

  // new implements ruleFactory by calling the function itself.
  func (f ruleFactoryFunc) new(ctx context.Context, rule *ngmodels.AlertRule) Rule {
      return f(ctx, rule)
  }

Run

// Run is the rule routine's main loop. It blocks until the evaluation channel
// is closed or the rule's context is cancelled, and handles three signals:
//   - updateCh: the rule was updated; reset its state if the fingerprint changed,
//   - evalCh: evaluate the rule, retrying up to maxAttempts times,
//   - context done: stop, deleting the rule's state if the rule was deleted.
//
// It always returns nil.
//
// NOTE(review): tracingCtx, span and retryDelay are used below but are not
// declared anywhere in this excerpt — presumably elided from the original
// source; confirm before compiling.
func (a *alertRule) Run(key ngmodels.AlertRuleKey) error {
    grafanaCtx := ngmodels.WithRuleKey(a.ctx, key)
    logger := a.logger.FromContext(grafanaCtx)
    logger.Debug("Alert rule routine started")

    // Fingerprint of the rule version handled last; zero until the first signal is processed.
    var currentFingerprint fingerprint
    defer a.stopApplied(key)
    for {
       select {
       // used by external services (API) to notify that rule is updated.
       // Note: ctx here is a RuleVersionAndPauseStatus, not a context.Context.
       case ctx := <-a.updateCh:
          if currentFingerprint == ctx.Fingerprint {
             logger.Info("Rule's fingerprint has not changed. Skip resetting the state", "currentFingerprint", currentFingerprint)
             continue
          }

          logger.Info("Clearing the state of the rule because it was updated", "isPaused", ctx.IsPaused, "fingerprint", ctx.Fingerprint)
          // clear the state. So the next evaluation will start from the scratch.
          a.resetState(grafanaCtx, key, ctx.IsPaused)
          currentFingerprint = ctx.Fingerprint
         // evalCh - used by the scheduler to signal that evaluation is needed.
       // Note: ctx here is the *Evaluation received from the scheduler.
       case ctx, ok := <-a.evalCh:
          if !ok {
             logger.Debug("Evaluation channel has been closed. Exiting")
             return nil
          }

          // Wrapped in a closure so that the deferred duration metric fires once per evaluation signal.
          func() {
             orgID := fmt.Sprint(key.OrgID)
             evalDuration := a.metrics.EvalDuration.WithLabelValues(orgID)
             evalTotal := a.metrics.EvalTotal.WithLabelValues(orgID)

             evalStart := a.clock.Now()
             defer func() {
                evalDuration.Observe(a.clock.Now().Sub(evalStart).Seconds())
             }()

             for attempt := int64(1); attempt <= a.maxAttempts; attempt++ {
                isPaused := ctx.rule.IsPaused
                f := ctx.Fingerprint()
                // Do not clean up state if the eval loop has just started.
                var needReset bool
                if currentFingerprint != 0 && currentFingerprint != f {
                   logger.Debug("Got a new version of alert rule. Clear up the state", "fingerprint", f)
                   needReset = true
                }

                // Also reset when the loop has just started and the rule is paused.
                needReset = needReset || (currentFingerprint == 0 && isPaused)
                if needReset {
                   a.resetState(grafanaCtx, key, isPaused)
                }
                currentFingerprint = f
                if isPaused {
                   logger.Debug("Skip rule evaluation because it is paused")
                   return
                }

                // Only increment evaluation counter once, not per-retry.
                if attempt == 1 {
                   evalTotal.Inc()
                }

                // retry tells evaluate whether another attempt remains after this one.
                retry := attempt < a.maxAttempts
                err := a.evaluate(tracingCtx, key, f, attempt, ctx, span, retry)
                if err == nil {
                   return
                }

                logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
                // Back off before the next attempt, unless the context is cancelled first.
                select {
                case <-tracingCtx.Done():
                   logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
                   return
                case <-time.After(retryDelay):
                   continue
                }
             }
          }()

       case <-grafanaCtx.Done():
             // clean up the state only if the reason for stopping the evaluation loop is that the rule was deleted
             if errors.Is(grafanaCtx.Err(), errRuleDeleted) {
             // We do not want a context to be unbounded which could potentially cause a go routine running
             // indefinitely. 1 minute is an almost randomly chosen timeout, big enough to cover the majority of the
             // cases.
             ctx, cancelFunc := context.WithTimeout(context.Background(), time.Minute)
             defer cancelFunc()
             states := a.stateManager.DeleteStateByRuleUID(ngmodels.WithRuleKey(ctx, key), key, ngmodels.StateReasonRuleDeleted)
             // NOTE(review): grafanaCtx is already done at this point, yet it is the context passed to notify — confirm this is intended.
             a.notify(grafanaCtx, key, states)
          }
          logger.Debug("Stopping alert rule routine")
          return nil
       }
    }
}

处理评估信号： ctx, ok := <-a.evalCh:
- currentFingerprint：根据告警规则的UID+Org+Version+Data（规则详情）计算出的规则的指纹。currentFingerprint 指上个周期评估的指纹。
- isPaused：判断当前规则是否被暂停。
- f：计算告警规则的指纹。
- resetState：如果本次计算出的指纹与上次评估时的指纹不一致（说明规则已被更新），或者评估循环刚启动（currentFingerprint 为 0）且规则处于暂停状态，则重置该告警规则所有标签集的状态。
- a.evaluate()：进行告警规则的评估。
处理更新信号：ctx := <-a.updateCh:
- 如果Fingerprint没有变化，那么不需要处理。
- resetState：重置该告警规则所有标签集的状态，并将 currentFingerprint 更新为新的指纹。

处理 ctx done 信号：<-grafanaCtx.Done():

当告警规则被删除时，deleteAlertRule 会调用 ruleRoutine.Stop(errRuleDeleted)，从而触发 done 信号：

  // deleteAlertRule removes the given rules from the scheduler: each key is
  // dropped from the set of schedulable rules and from the routine registry,
  // and a routine that was running is stopped with errRuleDeleted.
  func (sch *schedule) deleteAlertRule(keys ...ngmodels.AlertRuleKey) {
      for _, ruleKey := range keys {
         _, wasScheduled := sch.schedulableAlertRules.del(ruleKey)
         if !wasScheduled {
            sch.log.Info("Alert rule cannot be removed from the scheduler as it is not scheduled", ruleKey.LogContext()...)
         }

         // Drop the rule routine from the registry; stop it if there was one.
         routine, wasRunning := sch.registry.del(ruleKey)
         if wasRunning {
            routine.Stop(errRuleDeleted)
            continue
         }
         sch.log.Info("Alert rule cannot be stopped as it is not running", ruleKey.LogContext()...)
      }
  }

resetState：重置告警规则评估的所有标签集的状态。

Eval

// Eval asks the rule routine to perform one evaluation. The bool reports
// whether the signal was handed over (false if the rule's context is done
// first). The second result is a pending evaluation that was taken out of the
// channel and discarded to make room for eval, or nil if there was none.
func (a *alertRule) Eval(eval *Evaluation) (bool, *Evaluation) {
    // Non-blocking receive: take out an evaluation that is still waiting to be
    // consumed, so that no other send is in flight when we send ours.
    var dropped *Evaluation
    select {
    case pending := <-a.evalCh:
       dropped = pending
    default:
    }

    // Hand over the new evaluation, or give up once the rule is stopped.
    select {
    case <-a.ctx.Done():
       return false, dropped
    case a.evalCh <- eval:
       return true, dropped
    }
}

在向 channel 中发送评估信号之前，先执行一次非阻塞读取，以确保没有其他并发的发送操作：evalCh 是无缓冲 channel，如果上一个评估信号因规则评估过慢而仍阻塞在发送上、尚未被 Run 接收，就将其取出并 drop 掉（作为第二个返回值返回，调度器据此记录 EvaluationMissed 指标），然后再向 channel 中写入新的评估信号。

Update

// Update notifies the rule routine that the rule definition changed. It
// returns true once the notification was handed over and false if the rule's
// context is done.
func (a *alertRule) Update(lastVersion RuleVersionAndPauseStatus) bool {
    stopped := a.ctx.Done()

    // Non-blocking step: discard an update that has not been consumed yet, or
    // bail out right away if the rule is already stopped.
    select {
    case <-stopped:
       return false
    case <-a.updateCh:
    default:
    }

    // Deliver the latest version, unless the rule stops first.
    select {
    case <-stopped:
       return false
    case a.updateCh <- lastVersion:
       return true
    }
}

在发送更新信号之前，同样先执行非阻塞读取，确保没有并发更新，然后，将更新信号写入channel。

Stop

// Stop cancels the rule's context with the given reason by calling stopFn.
// It does nothing when no stop function is set.
func (a *alertRule) Stop(reason error) {
    if a.stopFn == nil {
       return
    }
    a.stopFn(reason)
}

深入理解Grafana告警规则的调度和执行

概述

调度器

启动

执行

告警规则

Run

Eval

Update

Stop