13.1 分布式任务调度架构竟然可以这样做?

阅读时长:约 11 分钟

震撼!分布式任务调度架构竟然可以这样做?

分布式任务调度系统是现代企业级应用的重要组成部分,负责在分布式环境中高效、可靠地执行各种定时和异步任务。一个设计良好的分布式任务调度系统不仅要支持高并发和高可用,还要具备灵活的任务编排、复杂的调度策略和完善的监控能力。本章将深入探讨分布式任务调度系统的架构设计和核心技术。

1. 分布式任务调度概述

在深入具体实现之前,我们需要了解分布式任务调度系统的基本概念和核心挑战。

1.1 核心概念

// DistributedTaskSchedulingConcepts lists the core concepts of distributed
// task scheduling. Each concept is represented by a bool field.
type DistributedTaskSchedulingConcepts struct {
    // Job: a unit of work that needs to be executed.
    Job bool
    
    // Scheduler: decides when a job should run.
    Scheduler bool
    
    // Executor: actually runs the job.
    Executor bool
    
    // Trigger: defines the timing rule for a job.
    Trigger bool
    
    // Task queue: holds jobs that are waiting to be executed.
    TaskQueue bool
}

1.2 核心挑战

// DistributedTaskSchedulingChallenges lists the main challenges of distributed
// task scheduling. Each challenge is represented by a bool field.
type DistributedTaskSchedulingChallenges struct {
    // Data consistency.
    DataConsistency bool
    
    // High availability.
    HighAvailability bool
    
    // Fault recovery.
    FaultRecovery bool
    
    // Load balancing.
    LoadBalancing bool
    
    // Task sharding.
    TaskSharding bool
    
    // Concurrency control.
    ConcurrencyControl bool
    
    // Monitoring and alerting.
    MonitoringAndAlerting bool
}

2. 系统架构设计

分布式任务调度系统的架构设计需要考虑多个组件的协调工作,形成一个完整的工作流。

2.1 架构概览

graph TB
    %% Architecture overview: producers, the scheduler cluster, the executor
    %% cluster and the admin console all connect to the job store; schedulers
    %% and executors also connect to the service registry.
    A[任务生产者] --> B[任务存储]
    C[调度器集群] --> B
    D[执行器集群] --> B
    E[管理控制台] --> B
    F[监控系统] --> C
    F --> D
    G[通知服务] --> A
    B --> H[持久化存储]
    C --> I[注册中心]
    D --> I

2.2 核心组件设计

// TaskSchedulerSystem groups the components of the task scheduling system:
// configuration, job store, scheduler, executor, service registry, system
// monitor, notification service and scheduler metrics.
type TaskSchedulerSystem struct {
    config          *SchedulerConfig
    jobStore        JobStore
    scheduler       *Scheduler
    executor        *Executor
    registry        *ServiceRegistry
    monitor         *SystemMonitor
    notification    *NotificationService
    metrics         *SchedulerMetrics
}

// SchedulerConfig holds the scheduler configuration.
type SchedulerConfig struct {
    // Scheduling interval; Scheduler.scheduleLoop uses it as its ticker period.
    ScheduleInterval time.Duration `json:"schedule_interval"`
    
    // Execution timeout.
    ExecutionTimeout time.Duration `json:"execution_timeout"`
    
    // Maximum number of concurrent tasks; NewExecutor passes it to
    // NewWorkerPool as the worker count.
    MaxConcurrentTasks int `json:"max_concurrent_tasks"`
    
    // Number of retry attempts for a task.
    MaxRetryAttempts int `json:"max_retry_attempts"`
    
    // Load balancing strategy.
    LoadBalancingStrategy string `json:"load_balancing_strategy"`
    
    // Failover strategy.
    FailoverStrategy string `json:"failover_strategy"`
}

// Job is the definition of a schedulable task.
type Job struct {
    ID          string            `json:"id"`
    Name        string            `json:"name"`
    Description string            `json:"description"`
    CronExpr    string            `json:"cron_expr"`    // Cron expression
    TaskType    string            `json:"task_type"`    // Task type
    Parameters  map[string]string `json:"parameters"`   // Task parameters
    Priority    int               `json:"priority"`     // Task priority
    Timeout     time.Duration     `json:"timeout"`      // Execution timeout
    RetryCount  int               `json:"retry_count"`  // Retry count
    MaxRetry    int               `json:"max_retry"`    // Maximum number of retries
    Status      JobStatus         `json:"status"`       // Job status
    CreatedAt   time.Time         `json:"created_at"`
    UpdatedAt   time.Time         `json:"updated_at"`
}

// JobStatus is the status of a job, stored as a string.
type JobStatus string

// Job status values.
const (
    JobStatusPending   JobStatus = "pending"   // Waiting to be executed
    JobStatusRunning   JobStatus = "running"   // Currently executing
    JobStatusSuccess   JobStatus = "success"   // Executed successfully
    JobStatusFailed    JobStatus = "failed"    // Execution failed
    JobStatusCancelled JobStatus = "cancelled" // Cancelled
)

// JobExecution is the record of a single execution of a job.
type JobExecution struct {
    ID          string            `json:"id"`
    JobID       string            `json:"job_id"`
    ExecutorID  string            `json:"executor_id"`
    StartTime   time.Time         `json:"start_time"`
    EndTime     time.Time         `json:"end_time"`
    Status      ExecutionStatus   `json:"status"`
    Result      string            `json:"result"`
    Error       string            `json:"error"`
    RetryCount  int               `json:"retry_count"`
}

// ExecutionStatus is the status of a job execution, stored as a string.
type ExecutionStatus string

// Execution status values.
const (
    ExecutionStatusStarted  ExecutionStatus = "started"
    ExecutionStatusFinished ExecutionStatus = "finished"
    ExecutionStatusFailed   ExecutionStatus = "failed"
)

// NewTaskSchedulerSystem builds a TaskSchedulerSystem from config, creating an
// in-memory job store, a scheduler, an executor, a service registry, a system
// monitor, a notification service and scheduler metrics.
//
// The executor is wired to the same job store the system holds.
// Executor.SubmitJob persists an execution record through its jobStore field,
// which NewExecutor leaves unset; without this wiring the first submission
// would call a method on a nil interface and panic.
func NewTaskSchedulerSystem(config *SchedulerConfig) *TaskSchedulerSystem {
    store := NewInMemoryJobStore()
    scheduler := NewScheduler(config)

    executor := NewExecutor(config)
    executor.jobStore = store

    return &TaskSchedulerSystem{
        config:       config,
        jobStore:     store,
        scheduler:    scheduler,
        executor:     executor,
        registry:     NewServiceRegistry(),
        monitor:      NewSystemMonitor(),
        notification: NewNotificationService(),
        metrics:      NewSchedulerMetrics(),
    }
}

3. 任务存储实现

任务存储是分布式任务调度系统的核心组件,负责存储任务定义、执行记录等关键数据。

3.1 任务存储接口

// JobStore is the storage interface for jobs and their execution records.
type JobStore interface {
    // CreateJob creates a job.
    CreateJob(ctx context.Context, job *Job) error
    
    // UpdateJob updates a job.
    UpdateJob(ctx context.Context, job *Job) error
    
    // DeleteJob deletes a job.
    DeleteJob(ctx context.Context, jobID string) error
    
    // GetJob returns a single job.
    GetJob(ctx context.Context, jobID string) (*Job, error)
    
    // ListJobs lists all jobs.
    ListJobs(ctx context.Context) ([]*Job, error)
    
    // GetJobsByStatus returns the jobs that have the given status.
    GetJobsByStatus(ctx context.Context, status JobStatus) ([]*Job, error)
    
    // CreateExecution creates a job execution record.
    CreateExecution(ctx context.Context, execution *JobExecution) error
    
    // UpdateExecution updates a job execution record.
    UpdateExecution(ctx context.Context, execution *JobExecution) error
    
    // GetExecutions returns the execution records of a job.
    GetExecutions(ctx context.Context, jobID string) ([]*JobExecution, error)
    
    // GetPendingJobs returns jobs waiting to be executed, at most limit of them.
    GetPendingJobs(ctx context.Context, limit int) ([]*Job, error)
}

// InMemoryJobStore is an in-memory implementation of job storage.
// Jobs are keyed by job ID; execution records are kept as one slice per job ID.
type InMemoryJobStore struct {
    jobs       sync.Map // map[string]*Job
    executions sync.Map // map[string][]*JobExecution
    mutex      sync.RWMutex
}

// NewInMemoryJobStore returns an empty InMemoryJobStore; the zero values of
// its fields are used as-is.
func NewInMemoryJobStore() *InMemoryJobStore {
    return &InMemoryJobStore{}
}

// CreateJob stores a new job.
//
// An empty job.ID is replaced with a freshly generated UUID. CreatedAt and
// UpdatedAt are set from a single clock reading, so a new job reports
// identical timestamps. If a job with the same ID is already stored, an error
// is returned instead of silently overwriting it (and losing its original
// CreatedAt); use UpdateJob to modify an existing job.
//
// The existence check and the store run under the same mutex UpdateJob takes,
// so the two cannot interleave. ctx is accepted for interface compatibility
// and is not consulted.
func (ims *InMemoryJobStore) CreateJob(ctx context.Context, job *Job) error {
    ims.mutex.Lock()
    defer ims.mutex.Unlock()

    if job.ID == "" {
        job.ID = uuid.New().String()
    } else if _, exists := ims.jobs.Load(job.ID); exists {
        return fmt.Errorf("job %s already exists", job.ID)
    }

    now := time.Now()
    job.CreatedAt = now
    job.UpdatedAt = now

    ims.jobs.Store(job.ID, job)
    return nil
}

// UpdateJob replaces the stored job that has job.ID with job.
//
// The stored CreatedAt is carried over onto job and UpdatedAt is set to the
// current time before the replacement. An error is returned when no job with
// that ID is stored. ctx is not consulted.
func (ims *InMemoryJobStore) UpdateJob(ctx context.Context, job *Job) error {
    ims.mutex.Lock()
    defer ims.mutex.Unlock()

    stored, found := ims.jobs.Load(job.ID)
    if !found {
        return fmt.Errorf("job %s not found", job.ID)
    }

    // Keep the original creation time; only the modification time moves.
    job.CreatedAt, job.UpdatedAt = stored.(*Job).CreatedAt, time.Now()

    ims.jobs.Store(job.ID, job)
    return nil
}

// DeleteJob removes the job with the given ID together with its execution
// records. It always returns nil, including when the ID is unknown. ctx is not
// consulted.
func (ims *InMemoryJobStore) DeleteJob(ctx context.Context, jobID string) error {
    ims.jobs.Delete(jobID)
    ims.executions.Delete(jobID)
    return nil
}

// GetJob returns the stored job with the given ID, or an error when no such
// job exists. The returned pointer is the stored value itself, not a copy.
// ctx is not consulted.
func (ims *InMemoryJobStore) GetJob(ctx context.Context, jobID string) (*Job, error) {
    if stored, found := ims.jobs.Load(jobID); found {
        return stored.(*Job), nil
    }

    return nil, fmt.Errorf("job %s not found", jobID)
}

// ListJobs returns every stored job. The result is nil when the store is
// empty; the error is always nil. ctx is not consulted.
func (ims *InMemoryJobStore) ListJobs(ctx context.Context) ([]*Job, error) {
    var all []*Job

    collect := func(_, stored interface{}) bool {
        all = append(all, stored.(*Job))
        return true // keep iterating over the whole map
    }
    ims.jobs.Range(collect)

    return all, nil
}

// GetJobsByStatus returns the stored jobs whose Status equals status. The
// result is nil when nothing matches; the error is always nil. ctx is not
// consulted.
func (ims *InMemoryJobStore) GetJobsByStatus(ctx context.Context, status JobStatus) ([]*Job, error) {
    var matching []*Job

    ims.jobs.Range(func(_, stored interface{}) bool {
        candidate := stored.(*Job)
        if candidate.Status != status {
            return true // not a match; move on to the next entry
        }
        matching = append(matching, candidate)
        return true
    })

    return matching, nil
}

// CreateExecution appends execution to the list of records kept for
// execution.JobID. An empty execution.ID is replaced with a freshly generated
// UUID. The error is always nil. ctx is not consulted.
func (ims *InMemoryJobStore) CreateExecution(ctx context.Context, execution *JobExecution) error {
    ims.mutex.Lock()
    defer ims.mutex.Unlock()

    if execution.ID == "" {
        execution.ID = uuid.New().String()
    }

    // Start from the records already kept for this job, if any.
    var records []*JobExecution
    if stored, found := ims.executions.Load(execution.JobID); found {
        records = stored.([]*JobExecution)
    }

    ims.executions.Store(execution.JobID, append(records, execution))
    return nil
}

// UpdateExecution replaces the stored execution record that has the same ID as
// execution within the list kept for execution.JobID.
//
// It previously delegated to CreateExecution, which appended a second entry
// for the same execution on every update. The record is now replaced in place
// by ID, and an error is returned when no such record exists, mirroring
// UpdateJob.
//
// The list is copied before it is modified because GetExecutions hands the
// stored slice to callers without taking the mutex. ctx is not consulted.
func (ims *InMemoryJobStore) UpdateExecution(ctx context.Context, execution *JobExecution) error {
    ims.mutex.Lock()
    defer ims.mutex.Unlock()

    if stored, found := ims.executions.Load(execution.JobID); found {
        current := stored.([]*JobExecution)
        for i, record := range current {
            if record.ID != execution.ID {
                continue
            }

            updated := make([]*JobExecution, len(current))
            copy(updated, current)
            updated[i] = execution
            ims.executions.Store(execution.JobID, updated)
            return nil
        }
    }

    return fmt.Errorf("execution %s not found for job %s", execution.ID, execution.JobID)
}

// GetExecutions returns the execution records kept for jobID, or an empty
// non-nil slice when there are none. The returned slice is the stored one, not
// a copy. The error is always nil. ctx is not consulted.
func (ims *InMemoryJobStore) GetExecutions(ctx context.Context, jobID string) ([]*JobExecution, error) {
    if stored, found := ims.executions.Load(jobID); found {
        return stored.([]*JobExecution), nil
    }

    return []*JobExecution{}, nil
}

// GetPendingJobs returns up to limit stored jobs whose Status is
// JobStatusPending. A limit of zero or less yields no jobs. The error is
// always nil. ctx is not consulted.
func (ims *InMemoryJobStore) GetPendingJobs(ctx context.Context, limit int) ([]*Job, error) {
    var pending []*Job

    ims.jobs.Range(func(_, stored interface{}) bool {
        // Stop iterating as soon as enough jobs have been collected.
        if len(pending) >= limit {
            return false
        }

        if candidate := stored.(*Job); candidate.Status == JobStatusPending {
            pending = append(pending, candidate)
        }
        return true
    })

    return pending, nil
}

4. 调度器实现

调度器是任务调度系统的核心组件,负责根据任务的调度规则触发任务执行。

4.1 调度器设计

// Scheduler periodically looks up pending jobs and submits the due ones to an
// executor. jobStore and executor are assigned by Start; ticker is created by
// scheduleLoop; stopChan and wg are used by Stop to end the loop and wait for it.
type Scheduler struct {
    config       *SchedulerConfig
    jobStore     JobStore
    executor     *Executor
    cronParser   *cron.Parser
    ticker       *time.Ticker
    stopChan     chan struct{}
    wg           sync.WaitGroup
    metrics      *SchedulerMetrics
}

// NewScheduler creates a Scheduler with the given config, a six-field cron
// parser (second, minute, hour, day-of-month, month, day-of-week), a stop
// channel and its own metrics. The job store and executor are supplied later
// through Start.
//
// cron.NewParser returns a Parser value while the cronParser field is a
// *cron.Parser, so the parser is held in a local and its address is stored.
func NewScheduler(config *SchedulerConfig) *Scheduler {
    parser := cron.NewParser(cron.Second | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)

    return &Scheduler{
        config:     config,
        cronParser: &parser,
        stopChan:   make(chan struct{}),
        metrics:    NewSchedulerMetrics(),
    }
}

// Start records the job store and executor to use and launches the scheduling
// loop in a background goroutine. Call Stop to end the loop.
//
// The inputs are validated before the goroutine is started. A non-positive
// ScheduleInterval would make time.NewTicker panic inside the loop goroutine,
// and a nil job store or executor would panic on the first tick. These cases
// are reported as errors instead. ctx is not consulted.
func (s *Scheduler) Start(ctx context.Context, jobStore JobStore, executor *Executor) error {
    if s.config == nil || s.config.ScheduleInterval <= 0 {
        return errors.New("scheduler schedule interval must be positive")
    }
    if jobStore == nil {
        return errors.New("scheduler job store is nil")
    }
    if executor == nil {
        return errors.New("scheduler executor is nil")
    }

    s.jobStore = jobStore
    s.executor = executor

    // Start the scheduling loop.
    s.wg.Add(1)
    go s.scheduleLoop()

    return nil
}

// scheduleLoop runs scheduleJobs once per ScheduleInterval until stopChan is
// closed. It creates the ticker, stores it in s.ticker, and stops it on exit.
func (s *Scheduler) scheduleLoop() {
    defer s.wg.Done()

    tick := time.NewTicker(s.config.ScheduleInterval)
    s.ticker = tick
    defer tick.Stop()

    for {
        select {
        case <-tick.C:
            s.scheduleJobs()
        case <-s.stopChan:
            return
        }
    }
}

// scheduleJobs runs one scheduling pass: it fetches up to 100 pending jobs,
// parses each job's cron expression, and submits the jobs that are due to the
// executor.
//
// A job is due when its schedule has an activation inside the window
// (now-ScheduleInterval, now], i.e. since the previous tick. The earlier check
// compared schedule.Next(time.Now()) with now; Next returns a time strictly
// after its argument, so that condition could never hold and nothing was ever
// submitted. A schedule that fires more than once inside a window still leads
// to a single submission for that tick.
//
// Failures are logged and the pass continues with the next job; submission
// failures are also counted in SchedulingErrors.
func (s *Scheduler) scheduleJobs() {
    const batchSize = 100 // at most this many jobs are handled per pass

    ctx := context.Background()

    // Fetch the jobs waiting to be executed.
    jobs, err := s.jobStore.GetPendingJobs(ctx, batchSize)
    if err != nil {
        log.Printf("Failed to get pending jobs: %v", err)
        return
    }

    now := time.Now()
    windowStart := now.Add(-s.config.ScheduleInterval)

    for _, job := range jobs {
        schedule, err := s.cronParser.Parse(job.CronExpr)
        if err != nil {
            log.Printf("Failed to parse cron expression for job %s: %v", job.ID, err)
            continue
        }

        // Skip jobs whose first activation after the window start is still in the future.
        if schedule.Next(windowStart).After(now) {
            continue
        }

        if err := s.executor.SubmitJob(ctx, job); err != nil {
            log.Printf("Failed to submit job %s for execution: %v", job.ID, err)
            s.metrics.SchedulingErrors.WithLabelValues("submission_failed").Inc()
            continue
        }

        s.metrics.JobsScheduled.WithLabelValues(job.TaskType).Inc()
    }
}

// Stop closes stopChan to end the scheduling loop and blocks until the loop
// goroutine has returned. Calling Stop a second time panics, because the
// channel is already closed.
func (s *Scheduler) Stop() {
    close(s.stopChan)
    s.wg.Wait()
}

4.2 Cron表达式解析

// CronExpressionParser wraps a cron parser for parsing and validating cron
// expressions and computing their next execution time.
type CronExpressionParser struct {
    parser *cron.Parser
}

// NewCronExpressionParser creates a CronExpressionParser that accepts
// six-field expressions (second, minute, hour, day-of-month, month,
// day-of-week).
//
// cron.NewParser returns a Parser value while the parser field is a
// *cron.Parser, so the parser is held in a local and its address is stored.
func NewCronExpressionParser() *CronExpressionParser {
    parser := cron.NewParser(cron.Second | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)

    return &CronExpressionParser{
        parser: &parser,
    }
}

// Parse parses a cron expression with the wrapped parser and returns the
// resulting schedule, or the parser's error.
func (cep *CronExpressionParser) Parse(expression string) (cron.Schedule, error) {
    return cep.parser.Parse(expression)
}

// GetNextExecutionTime parses expression and returns the schedule's next
// activation time after fromTime. When the expression cannot be parsed it
// returns the zero time and the parse error.
func (cep *CronExpressionParser) GetNextExecutionTime(expression string, fromTime time.Time) (time.Time, error) {
    schedule, err := cep.Parse(expression)
    if err == nil {
        return schedule.Next(fromTime), nil
    }

    return time.Time{}, err
}

// Validate reports whether expression can be parsed: it returns nil for a
// parsable expression and the parse error otherwise.
func (cep *CronExpressionParser) Validate(expression string) error {
    if _, err := cep.Parse(expression); err != nil {
        return err
    }
    return nil
}

5. 执行器实现

执行器负责实际执行任务,是任务调度系统的执行单元。

5.1 执行器设计

// Executor accepts jobs for execution: it records an execution through
// jobStore and hands it to workerPool. NewExecutor does not set jobStore.
type Executor struct {
    config        *SchedulerConfig
    jobStore      JobStore
    taskRegistry  *TaskRegistry
    workerPool    *WorkerPool
    metrics       *ExecutorMetrics
}

// TaskRegistry maps task type names to their handlers.
type TaskRegistry struct {
    tasks sync.Map // map[string]TaskHandler
}

// TaskHandler is the interface implemented by task handlers. Execute runs the
// given job and returns a result string and an error.
type TaskHandler interface {
    Execute(ctx context.Context, job *Job) (string, error)
}

// WorkerPool is a pool of worker goroutines that consume job executions from
// jobQueue. quit signals the workers to exit and wg tracks them.
type WorkerPool struct {
    workers    int
    jobQueue   chan *JobExecution
    quit       chan struct{}
    wg         sync.WaitGroup
    metrics    *WorkerPoolMetrics
}

// WorkerPoolMetrics holds the Prometheus metrics of a worker pool.
type WorkerPoolMetrics struct {
    ActiveWorkers *prometheus.GaugeVec
    QueuedJobs    *prometheus.GaugeVec
    ProcessedJobs *prometheus.CounterVec
}

// NewExecutor creates an Executor with an empty task registry, a worker pool
// of config.MaxConcurrentTasks workers, and fresh executor metrics. The
// jobStore field is left unset.
func NewExecutor(config *SchedulerConfig) *Executor {
    exec := &Executor{config: config}

    exec.taskRegistry = NewTaskRegistry()
    exec.workerPool = NewWorkerPool(config.MaxConcurrentTasks)
    exec.metrics = NewExecutorMetrics()

    return exec
}

// NewTaskRegistry returns an empty TaskRegistry.
func NewTaskRegistry() *TaskRegistry {
    return &TaskRegistry{}
}

// NewWorkerPool creates a WorkerPool with a job queue buffered to 1000 entries
// and immediately starts `workers` worker goroutines. With workers <= 0 no
// goroutine is started.
func NewWorkerPool(workers int) *WorkerPool {
    activeWorkers := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "executor_active_workers",
            Help: "Number of active workers",
        },
        []string{"executor_type"},
    )
    queuedJobs := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "executor_queued_jobs",
            Help: "Number of queued jobs",
        },
        []string{"job_type"},
    )
    processedJobs := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "executor_processed_jobs_total",
            Help: "Total number of processed jobs",
        },
        []string{"job_type", "result"},
    )

    pool := &WorkerPool{
        workers:  workers,
        jobQueue: make(chan *JobExecution, 1000),
        quit:     make(chan struct{}),
        metrics: &WorkerPoolMetrics{
            ActiveWorkers: activeWorkers,
            QueuedJobs:    queuedJobs,
            ProcessedJobs: processedJobs,
        },
    }

    // Launch the worker goroutines; each one is tracked by the pool's WaitGroup.
    for id := 0; id < workers; id++ {
        pool.wg.Add(1)
        go pool.worker(id)
    }

    return pool
}

// RegisterTask registers handler for taskType, replacing any handler already
// stored under that type.
func (tr *TaskRegistry) RegisterTask(taskType string, handler TaskHandler) {
    tr.tasks.Store(taskType, handler)
}

// GetTask returns the handler registered for taskType and true, or nil and
// false when no handler is registered.
func (tr *TaskRegistry) GetTask(taskType string) (TaskHandler, bool) {
    if registered, found := tr.tasks.Load(taskType); found {
        return registered.(TaskHandler), true
    }

    return nil, false
}

// SubmitJob creates an execution record for job, persists it through the job
// store, and enqueues it on the worker pool without blocking.
//
// It returns an error when no job store is configured, when the record cannot
// be persisted, or when the worker queue is full. In the queue-full case the
// execution record has already been stored and is left as it is.
//
// A nil jobStore used to panic here; NewExecutor does not set the field, so it
// is now reported as an error. The QueuedJobs gauge is incremented for every
// enqueued execution, matching the decrement each worker performs after
// processing one; previously it was only ever decremented.
func (e *Executor) SubmitJob(ctx context.Context, job *Job) error {
    if e.jobStore == nil {
        return errors.New("executor job store is not configured")
    }

    // Create the execution record.
    execution := &JobExecution{
        ID:         uuid.New().String(),
        JobID:      job.ID,
        StartTime:  time.Now(),
        Status:     ExecutionStatusStarted,
        RetryCount: 0,
    }

    // Persist the execution record.
    if err := e.jobStore.CreateExecution(ctx, execution); err != nil {
        return fmt.Errorf("failed to create execution record: %w", err)
    }

    // Count the job as queued before sending so a fast worker's decrement
    // cannot run ahead of the increment.
    queued := e.workerPool.metrics.QueuedJobs.WithLabelValues("task")
    queued.Inc()

    // Hand the execution to the worker queue without blocking.
    select {
    case e.workerPool.jobQueue <- execution:
        e.metrics.JobsSubmitted.WithLabelValues(job.TaskType).Inc()
        return nil
    default:
        queued.Dec() // nothing was enqueued after all
        return errors.New("executor job queue is full")
    }
}

// worker is the body of one pool goroutine. It takes executions off jobQueue
// and runs them until quit is closed. ActiveWorkers is raised while an
// execution is running, and QueuedJobs is lowered once it has finished. id is
// not used.
func (wp *WorkerPool) worker(id int) {
    defer wp.wg.Done()

    process := func(task *JobExecution) {
        wp.metrics.ActiveWorkers.WithLabelValues("task_executor").Inc()

        wp.executeJob(task)

        wp.metrics.ActiveWorkers.WithLabelValues("task_executor").Dec()
        wp.metrics.QueuedJobs.WithLabelValues("task").Dec()
    }

    for {
        select {
        case <-wp.quit:
            return
        case task := <-wp.jobQueue:
            process(task)
        }
    }
}

// executeJob is meant to run a single execution. It is currently a no-op.
func (wp *WorkerPool) executeJob(execution *JobExecution) {
    // The actual executor instance should be obtained from the global executor here.
    // Simplified for illustration; this is only an example.
}

// Stop closes quit to tell the workers to exit and blocks until all of them
// have returned. Executions still buffered in jobQueue may be left
// unprocessed. Calling Stop a second time panics, because the channel is
// already closed.
func (wp *WorkerPool) Stop() {
    close(wp.quit)
    wp.wg.Wait()
}

6. 服务注册与发现

在分布式环境中,服务注册与发现是实现高可用和负载均衡的关键。

6.1 服务注册中心

// ServiceRegistry keeps the registered service instances, keyed by instance
// ID. ttl is the maximum heartbeat age before checkHealth marks an instance
// as down.
type ServiceRegistry struct {
    services sync.Map // map[string]*ServiceInstance
    mutex    sync.RWMutex
    ttl      time.Duration
}

// ServiceInstance describes one registered service instance. Heartbeat holds
// the time of the last registration or heartbeat.
type ServiceInstance struct {
    ID        string            `json:"id"`
    Name      string            `json:"name"`
    Address   string            `json:"address"`
    Port      int               `json:"port"`
    Tags      []string          `json:"tags"`
    Metadata  map[string]string `json:"metadata"`
    Heartbeat time.Time         `json:"heartbeat"`
    Status    ServiceStatus     `json:"status"`
}

// ServiceStatus is the status of a service instance, stored as a string.
type ServiceStatus string

// Service status values.
const (
    ServiceStatusUp   ServiceStatus = "up"
    ServiceStatusDown ServiceStatus = "down"
)

// NewServiceRegistry creates a ServiceRegistry with a 30-second heartbeat TTL
// and starts its health-check goroutine. Nothing in the registry stops that
// goroutine, so it runs for the lifetime of the process.
func NewServiceRegistry() *ServiceRegistry {
    registry := new(ServiceRegistry)
    registry.ttl = 30 * time.Second // default heartbeat timeout

    // Periodically mark instances with stale heartbeats as down.
    go registry.startHealthCheck()

    return registry
}

// RegisterService stores instance under its ID and marks it as up with a
// fresh heartbeat. An empty ID is replaced with a generated UUID and a nil
// Metadata map with an empty one. The error is always nil.
func (sr *ServiceRegistry) RegisterService(instance *ServiceInstance) error {
    sr.mutex.Lock()
    defer sr.mutex.Unlock()

    // Fill in the fields the caller may have left empty.
    if instance.ID == "" {
        instance.ID = uuid.New().String()
    }
    if instance.Metadata == nil {
        instance.Metadata = make(map[string]string)
    }

    // A newly registered instance counts as alive as of now.
    instance.Status = ServiceStatusUp
    instance.Heartbeat = time.Now()

    sr.services.Store(instance.ID, instance)
    return nil
}

// UnregisterService removes the instance with the given ID. It always returns
// nil, including when the ID is unknown.
func (sr *ServiceRegistry) UnregisterService(instanceID string) error {
    sr.mutex.Lock()
    defer sr.mutex.Unlock()
    
    sr.services.Delete(instanceID)
    return nil
}

// Heartbeat records a heartbeat for the instance with the given ID: its
// Heartbeat time is set to now and its status to up. An error is returned when
// the instance is not registered.
func (sr *ServiceRegistry) Heartbeat(instanceID string) error {
    sr.mutex.Lock()
    defer sr.mutex.Unlock()

    if stored, found := sr.services.Load(instanceID); found {
        target := stored.(*ServiceInstance)
        target.Heartbeat = time.Now()
        target.Status = ServiceStatusUp
        return nil
    }

    return fmt.Errorf("service instance %s not found", instanceID)
}

// GetServiceInstances returns the registered instances named serviceName
// whose status is up. The result is nil when nothing matches; the error is
// always nil.
func (sr *ServiceRegistry) GetServiceInstances(serviceName string) ([]*ServiceInstance, error) {
    sr.mutex.RLock()
    defer sr.mutex.RUnlock()

    var healthy []*ServiceInstance

    sr.services.Range(func(_, stored interface{}) bool {
        candidate := stored.(*ServiceInstance)
        if candidate.Name != serviceName || candidate.Status != ServiceStatusUp {
            return true // wrong service or not up; keep looking
        }
        healthy = append(healthy, candidate)
        return true
    })

    return healthy, nil
}

// GetServiceInstance returns the registered instance with the given ID,
// whatever its status, or an error when no such instance is registered. The
// returned pointer is the stored value itself, not a copy.
func (sr *ServiceRegistry) GetServiceInstance(instanceID string) (*ServiceInstance, error) {
    sr.mutex.RLock()
    defer sr.mutex.RUnlock()

    if stored, found := sr.services.Load(instanceID); found {
        return stored.(*ServiceInstance), nil
    }

    return nil, fmt.Errorf("service instance %s not found", instanceID)
}

// startHealthCheck runs checkHealth every 10 seconds. The loop has no exit
// condition, so the function never returns.
func (sr *ServiceRegistry) startHealthCheck() {
    interval := time.NewTicker(10 * time.Second)
    defer interval.Stop()

    for {
        <-interval.C
        sr.checkHealth()
    }
}

// checkHealth marks every instance whose last heartbeat is older than the
// registry TTL as down. Instances are not removed, and a later Heartbeat call
// marks them up again.
func (sr *ServiceRegistry) checkHealth() {
    sr.mutex.Lock()
    defer sr.mutex.Unlock()

    checkedAt := time.Now()

    sr.services.Range(func(_, stored interface{}) bool {
        inst := stored.(*ServiceInstance)
        if checkedAt.Sub(inst.Heartbeat) <= sr.ttl {
            return true // heartbeat is recent enough
        }
        inst.Status = ServiceStatusDown
        return true
    })
}

7. 监控与指标

完善的监控体系是保障分布式任务调度系统稳定运行的重要手段。

7.1 监控指标

// SchedulerMetrics holds the Prometheus metrics of the scheduler.
type SchedulerMetrics struct {
    JobsScheduled    *prometheus.CounterVec
    SchedulingErrors *prometheus.CounterVec
    ScheduleLatency  *prometheus.HistogramVec
}

// ExecutorMetrics holds the Prometheus metrics of the executor.
type ExecutorMetrics struct {
    JobsSubmitted    *prometheus.CounterVec
    JobsCompleted    *prometheus.CounterVec
    ExecutionErrors  *prometheus.CounterVec
    ExecutionLatency *prometheus.HistogramVec
}

// NewSchedulerMetrics creates the scheduler's metric vectors: scheduled jobs
// by job type, scheduling errors by error type, and scheduling latency by job
// type. The vectors are only constructed here, not registered.
func NewSchedulerMetrics() *SchedulerMetrics {
    jobsScheduled := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "scheduler_jobs_scheduled_total",
            Help: "Total number of jobs scheduled",
        },
        []string{"job_type"},
    )

    schedulingErrors := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "scheduler_errors_total",
            Help: "Total number of scheduling errors",
        },
        []string{"error_type"},
    )

    scheduleLatency := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "scheduler_schedule_latency_seconds",
            Help:    "Scheduling latency in seconds",
            Buckets: prometheus.DefBuckets,
        },
        []string{"job_type"},
    )

    return &SchedulerMetrics{
        JobsScheduled:    jobsScheduled,
        SchedulingErrors: schedulingErrors,
        ScheduleLatency:  scheduleLatency,
    }
}

// NewExecutorMetrics creates the executor's metric vectors: submitted jobs by
// job type, completed jobs by job type and result, execution errors by error
// type, and execution latency by job type. The vectors are only constructed
// here, not registered.
func NewExecutorMetrics() *ExecutorMetrics {
    // counter builds a counter vector with the given name, help text and labels.
    counter := func(name, help string, labels ...string) *prometheus.CounterVec {
        return prometheus.NewCounterVec(
            prometheus.CounterOpts{Name: name, Help: help},
            labels,
        )
    }

    latency := prometheus.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "executor_execution_latency_seconds",
            Help:    "Execution latency in seconds",
            Buckets: prometheus.DefBuckets,
        },
        []string{"job_type"},
    )

    return &ExecutorMetrics{
        JobsSubmitted:    counter("executor_jobs_submitted_total", "Total number of jobs submitted", "job_type"),
        JobsCompleted:    counter("executor_jobs_completed_total", "Total number of jobs completed", "job_type", "result"),
        ExecutionErrors:  counter("executor_errors_total", "Total number of execution errors", "error_type"),
        ExecutionLatency: latency,
    }
}

// SystemMonitor periodically collects system metrics. ticker is created by
// Start; quit and wg are used by Stop to end the collection goroutine and wait
// for it.
type SystemMonitor struct {
    metrics *SystemMetrics
    ticker  *time.Ticker
    quit    chan struct{}
    wg      sync.WaitGroup
}

// SystemMetrics holds the Prometheus gauges for system-level metrics.
type SystemMetrics struct {
    // System metrics.
    CPUUsage    *prometheus.GaugeVec
    MemoryUsage *prometheus.GaugeVec
    Goroutines  *prometheus.GaugeVec
    
    // Job metrics.
    TotalJobs      *prometheus.GaugeVec
    RunningJobs    *prometheus.GaugeVec
    FailedJobs     *prometheus.GaugeVec
    SuccessRate    *prometheus.GaugeVec
    
    // Executor metrics.
    ActiveExecutors *prometheus.GaugeVec
    ExecutorQueue   *prometheus.GaugeVec
}

// NewSystemMonitor creates a SystemMonitor with all of its gauge vectors and a
// quit channel. Collection does not begin until Start is called. The gauges
// are only constructed here, not registered.
func NewSystemMonitor() *SystemMonitor {
    // gauge builds a gauge vector that carries a single label.
    gauge := func(name, help, label string) *prometheus.GaugeVec {
        return prometheus.NewGaugeVec(
            prometheus.GaugeOpts{Name: name, Help: help},
            []string{label},
        )
    }

    collected := &SystemMetrics{
        // System metrics.
        CPUUsage:    gauge("system_cpu_usage_percent", "System CPU usage percentage", "component"),
        MemoryUsage: gauge("system_memory_usage_bytes", "System memory usage in bytes", "component"),
        Goroutines:  gauge("system_goroutines", "Number of goroutines", "component"),

        // Job metrics.
        TotalJobs:   gauge("system_total_jobs", "Total number of jobs", "job_status"),
        RunningJobs: gauge("system_running_jobs", "Number of running jobs", "job_type"),
        FailedJobs:  gauge("system_failed_jobs", "Number of failed jobs", "job_type"),
        SuccessRate: gauge("system_success_rate", "Job success rate", "job_type"),

        // Executor metrics.
        ActiveExecutors: gauge("system_active_executors", "Number of active executors", "executor_type"),
        ExecutorQueue:   gauge("system_executor_queue_size", "Executor queue size", "queue_type"),
    }

    return &SystemMonitor{
        metrics: collected,
        quit:    make(chan struct{}),
    }
}

// Start begins periodic metric collection: it creates a ticker with the given
// interval and launches the collection goroutine. time.NewTicker panics when
// interval is not positive.
func (sm *SystemMonitor) Start(interval time.Duration) {
    sm.ticker = time.NewTicker(interval)
    
    sm.wg.Add(1)
    go sm.collectMetrics()
}

// collectMetrics is the collection goroutine: it calls collectSystemMetrics on
// every ticker tick and returns once quit is closed.
func (sm *SystemMonitor) collectMetrics() {
    defer sm.wg.Done()

    for {
        select {
        case <-sm.quit:
            return
        case <-sm.ticker.C:
            sm.collectSystemMetrics()
        }
    }
}

// collectSystemMetrics takes one sample of the system gauges under the
// "scheduler" component label: CPU usage (a fixed 0.0 placeholder), allocated
// heap bytes from runtime.MemStats, and the current goroutine count.
func (sm *SystemMonitor) collectSystemMetrics() {
    const component = "scheduler"

    // CPU usage is a placeholder; a real implementation needs system calls
    // to obtain the actual figure.
    placeholderCPU := 0.0
    sm.metrics.CPUUsage.WithLabelValues(component).Set(placeholderCPU)

    // Memory currently allocated according to the Go runtime.
    var stats runtime.MemStats
    runtime.ReadMemStats(&stats)
    sm.metrics.MemoryUsage.WithLabelValues(component).Set(float64(stats.Alloc))

    // Number of goroutines.
    goroutines := runtime.NumGoroutine()
    sm.metrics.Goroutines.WithLabelValues(component).Set(float64(goroutines))
}

// Stop closes quit to end the collection goroutine, stops the ticker if Start
// created one, and waits for the goroutine to return. Calling Stop a second
// time panics, because the channel is already closed.
func (sm *SystemMonitor) Stop() {
    close(sm.quit)
    if sm.ticker != nil {
        sm.ticker.Stop()
    }
    sm.wg.Wait()
}

8. 总结

分布式任务调度系统的架构设计是一个复杂的工程问题,需要考虑多个方面的因素:

  1. 系统架构:合理的架构设计包括任务存储、调度器、执行器、服务注册中心等核心组件
  2. 数据存储:可靠的任务和执行记录存储机制是系统稳定运行的基础
  3. 调度机制:高效的调度算法和Cron表达式解析能力确保任务按时执行
  4. 执行管理:通过工作协程池和任务注册机制实现任务的并发执行
  5. 服务发现:在分布式环境中实现服务的自动注册和发现
  6. 监控告警:完善的监控体系帮助及时发现和解决问题

在实际应用中,需要根据具体的业务场景和性能要求选择合适的技术方案,并持续优化以达到最佳效果。通过合理的设计和实现,我们可以构建出高性能、高可用、易扩展的分布式任务调度系统,为企业的各种定时和异步任务提供可靠的执行保障。

下一章我们将深入探讨分布式任务调度系统中的复杂调度策略和任务编排机制。