13.2 复杂调度策略竟然还能这样设计?

预计阅读时间:约 14 分钟

太震撼了!复杂调度策略竟然还能这样设计?

在分布式任务调度系统中,简单的定时调度往往无法满足复杂的业务需求。企业级应用通常需要支持基于优先级、资源感知、依赖关系、动态调整等多种复杂调度策略。本章将深入探讨如何设计和实现这些高级调度策略,以满足各种复杂的业务场景。

1. 复杂调度策略概述

复杂调度策略是指超越基本时间触发的调度机制,它们考虑更多维度的因素来决定任务的执行时机和执行方式。

1.1 调度策略分类

// ComplexSchedulingStrategies enumerates the categories of complex scheduling
// strategies covered in this chapter. Each field is a plain boolean flag for
// one category.
type ComplexSchedulingStrategies struct {
    // Priority-based scheduling.
    PriorityBasedScheduling bool
    
    // Resource-aware scheduling.
    ResourceAwareScheduling bool
    
    // Dependency-based scheduling.
    DependencyBasedScheduling bool
    
    // Dynamic scheduling.
    DynamicScheduling bool
    
    // Load-balanced scheduling.
    LoadBalancedScheduling bool
    
    // Region-aware scheduling.
    RegionAwareScheduling bool
}

1.2 调度策略挑战

// SchedulingStrategyChallenges enumerates the challenges that complex
// scheduling strategies have to address. Each field is a plain boolean flag
// for one challenge.
type SchedulingStrategyChallenges struct {
    // Resolving conflicts between strategies.
    StrategyConflictResolution bool
    
    // Real-time requirements.
    RealTimeRequirements bool
    
    // Scalability.
    Scalability bool
    
    // Fairness guarantees.
    FairnessGuarantee bool
    
    // Performance optimization.
    PerformanceOptimization bool
}

2. 优先级调度策略

优先级调度是根据任务的重要性和紧急程度来决定执行顺序的策略。

2.1 优先级调度器设计

// PriorityBasedScheduler schedules jobs by priority: it validates a job's
// priority against its configuration and places the job on a priority queue.
type PriorityBasedScheduler struct {
    // config holds the priority bounds and the default priority.
    config      *PrioritySchedulerConfig
    // jobStore is not populated by NewPriorityBasedScheduler.
    jobStore    JobStore
    // priorityQueue holds the jobs waiting to be dequeued.
    priorityQueue *PriorityQueue
    // metrics holds the Prometheus collectors for this scheduler.
    metrics     *PrioritySchedulerMetrics
}

// PrioritySchedulerConfig is the configuration of the priority scheduler and
// its priority queue.
type PrioritySchedulerConfig struct {
    // DefaultPriority is assigned to a job whose priority falls outside
    // [MinPriority, MaxPriority].
    DefaultPriority int `json:"default_priority"`
    
    // MaxPriority is the highest valid priority.
    MaxPriority int `json:"max_priority"`
    
    // MinPriority is the lowest valid priority.
    MinPriority int `json:"min_priority"`
    
    // PriorityWeights maps a priority level to a weight.
    // NOTE(review): no code in this section reads it; confirm intended use.
    PriorityWeights map[int]float64 `json:"priority_weights"`
    
    // StarvationPrevention enables the starvation handling in the priority queue.
    StarvationPrevention bool `json:"starvation_prevention"`
    
    // StarvationTimeout is how long a priority level may go unserved before
    // it is considered starved.
    StarvationTimeout time.Duration `json:"starvation_timeout"`
}

// PriorityQueue is a multi-level queue: one JobQueue per priority level.
type PriorityQueue struct {
    // queues maps a priority level to its job queue; entries are created
    // lazily on the first Enqueue for that level.
    queues    map[int]*JobQueue
    // mutex guards queues and lastServed.
    mutex     sync.RWMutex
    // config supplies the priority bounds and the starvation settings.
    config    *PrioritySchedulerConfig
    // lastServed records, per priority level, when a job was last dequeued.
    lastServed map[int]time.Time
}

// JobQueue is a bounded first-in, first-out queue of jobs.
type JobQueue struct {
    // jobs holds the queued jobs, oldest first.
    jobs    []*Job
    // mutex guards jobs.
    mutex   sync.Mutex
    // maxSize is the maximum number of queued jobs; Enqueue fails beyond it.
    maxSize int
}

// PrioritySchedulerMetrics groups the Prometheus collectors of the priority
// scheduler. All three are labelled by "priority_level".
type PrioritySchedulerMetrics struct {
    // JobsScheduledByPriority counts jobs scheduled per priority level.
    JobsScheduledByPriority *prometheus.CounterVec
    // PriorityStarvationCases counts starvation cases per priority level.
    PriorityStarvationCases *prometheus.CounterVec
    // AverageWaitTimeByPriority is the average wait time in seconds per priority level.
    AverageWaitTimeByPriority *prometheus.GaugeVec
}

// NewPriorityBasedScheduler builds a priority scheduler from config, together
// with a fresh priority queue and a fresh set of metrics collectors.
// The jobStore field is left unset.
func NewPriorityBasedScheduler(config *PrioritySchedulerConfig) *PriorityBasedScheduler {
    scheduler := new(PriorityBasedScheduler)
    scheduler.config = config
    scheduler.priorityQueue = NewPriorityQueue(config)
    scheduler.metrics = NewPrioritySchedulerMetrics()
    return scheduler
}

// NewPriorityQueue returns an empty priority queue that reads its priority
// bounds and starvation settings from config.
func NewPriorityQueue(config *PrioritySchedulerConfig) *PriorityQueue {
    pq := new(PriorityQueue)
    pq.queues = map[int]*JobQueue{}
    pq.config = config
    pq.lastServed = map[int]time.Time{}
    return pq
}

// NewJobQueue returns an empty job queue that accepts at most maxSize jobs.
func NewJobQueue(maxSize int) *JobQueue {
    queue := new(JobQueue)
    queue.jobs = []*Job{}
    queue.maxSize = maxSize
    return queue
}

// NewPrioritySchedulerMetrics creates the collectors of the priority
// scheduler. The collectors are only created here, not registered.
func NewPrioritySchedulerMetrics() *PrioritySchedulerMetrics {
    scheduled := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "priority_scheduler_jobs_scheduled_by_priority_total",
            Help: "Total number of jobs scheduled by priority",
        },
        []string{"priority_level"},
    )

    starved := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "priority_scheduler_starvation_cases_total",
            Help: "Total number of priority starvation cases",
        },
        []string{"priority_level"},
    )

    waitTime := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "priority_scheduler_average_wait_time_by_priority_seconds",
            Help: "Average wait time by priority in seconds",
        },
        []string{"priority_level"},
    )

    m := new(PrioritySchedulerMetrics)
    m.JobsScheduledByPriority = scheduled
    m.PriorityStarvationCases = starved
    m.AverageWaitTimeByPriority = waitTime
    return m
}

// ScheduleJob places job on the priority queue.
//
// A priority outside [MinPriority, MaxPriority] is replaced, on the job
// itself, by DefaultPriority. On success the per-priority scheduled counter
// is incremented. An enqueue failure is returned wrapped. ctx is not
// consulted.
func (pbs *PriorityBasedScheduler) ScheduleJob(ctx context.Context, job *Job) error {
    lowest, highest := pbs.config.MinPriority, pbs.config.MaxPriority
    if outOfRange := job.Priority < lowest || job.Priority > highest; outOfRange {
        job.Priority = pbs.config.DefaultPriority
    }

    err := pbs.priorityQueue.Enqueue(job)
    if err != nil {
        return fmt.Errorf("failed to enqueue job: %w", err)
    }

    level := fmt.Sprintf("%d", job.Priority)
    pbs.metrics.JobsScheduledByPriority.WithLabelValues(level).Inc()
    return nil
}

// Enqueue appends job to the queue of its priority level, creating that queue
// (capacity 1000) on first use. It returns the error of the underlying
// JobQueue.Enqueue, e.g. when that queue is full.
func (pq *PriorityQueue) Enqueue(job *Job) error {
    pq.mutex.Lock()
    defer pq.mutex.Unlock()

    level := job.Priority
    if _, known := pq.queues[level]; !known {
        // Default capacity for a lazily created per-priority queue.
        pq.queues[level] = NewJobQueue(1000)
    }

    return pq.queues[level].Enqueue(job)
}

// Dequeue removes and returns the next job to run, or an error when every
// queue is empty.
//
// Normally the queues are scanned from MaxPriority down to MinPriority and
// the first job found is returned.
//
// When StarvationPrevention is enabled and StarvationTimeout is positive, a
// starvation pass runs first over the priorities below MaxPriority, lowest
// first. A level that has gone longer than StarvationTimeout without being
// served yields one job ahead of the higher priorities. A level that turns
// out to be empty, or that is seen for the first time, has its clock
// (re)started instead, so only levels with waiting work are boosted.
//
// Starvation events are not counted as a metric here: PriorityQueue holds no
// metrics handle.
func (pq *PriorityQueue) Dequeue() (*Job, error) {
    pq.mutex.Lock()
    defer pq.mutex.Unlock()

    minPriority, maxPriority := pq.config.MinPriority, pq.config.MaxPriority

    // Starvation pass. A non-positive timeout would mark every level as
    // starved and invert the priority order, so it disables the pass.
    if timeout := pq.config.StarvationTimeout; pq.config.StarvationPrevention && timeout > 0 {
        now := time.Now()
        for priority := minPriority; priority < maxPriority; priority++ {
            queue, exists := pq.queues[priority]
            if !exists {
                continue
            }

            last, tracked := pq.lastServed[priority]
            if !tracked {
                // First observation of this level: start its clock.
                pq.lastServed[priority] = now
                continue
            }
            if now.Sub(last) <= timeout {
                continue
            }

            job, err := queue.Dequeue()
            // Either the level was just served or it is empty and therefore
            // not starving; in both cases the clock restarts.
            pq.lastServed[priority] = now
            if err == nil {
                return job, nil
            }
        }
    }

    // Regular pass: strict priority order, highest first.
    for priority := maxPriority; priority >= minPriority; priority-- {
        queue, exists := pq.queues[priority]
        if !exists {
            continue
        }

        if job, err := queue.Dequeue(); err == nil {
            pq.lastServed[priority] = time.Now()
            return job, nil
        }
    }

    return nil, errors.New("no jobs available")
}

// Enqueue appends job to the tail of the queue. It returns an error when the
// queue already holds maxSize jobs.
func (jq *JobQueue) Enqueue(job *Job) error {
    jq.mutex.Lock()
    defer jq.mutex.Unlock()

    if full := len(jq.jobs) >= jq.maxSize; full {
        return errors.New("job queue is full")
    }

    jq.jobs = append(jq.jobs, job)
    return nil
}

// Dequeue removes and returns the job at the head of the queue. It returns an
// error when the queue is empty.
func (jq *JobQueue) Dequeue() (*Job, error) {
    jq.mutex.Lock()
    defer jq.mutex.Unlock()

    if len(jq.jobs) == 0 {
        return nil, errors.New("job queue is empty")
    }

    head := jq.jobs[0]
    // Clear the slot before re-slicing: the backing array would otherwise
    // keep the dequeued job reachable after the caller is done with it.
    jq.jobs[0] = nil
    jq.jobs = jq.jobs[1:]
    return head, nil
}

3. 资源感知调度策略

资源感知调度是根据系统资源使用情况来决定任务执行的策略,确保系统资源的合理利用。

3.1 资源感知调度器设计

// ResourceAwareScheduler decides whether a job may be scheduled by comparing
// current system resource usage against configured thresholds.
type ResourceAwareScheduler struct {
    // config holds the per-resource thresholds.
    config        *ResourceSchedulerConfig
    // resourceMonitor supplies the current resource usage snapshot.
    resourceMonitor *ResourceMonitor
    // jobStore is not populated by NewResourceAwareScheduler.
    jobStore      JobStore
    // metrics holds the Prometheus collectors for this scheduler.
    metrics       *ResourceSchedulerMetrics
}

// ResourceSchedulerConfig is the configuration of the resource-aware
// scheduler and its resource monitor.
// NOTE(review): the unit of the thresholds (fraction vs. percent) is not
// fixed by this section; it must match what the monitor reports.
type ResourceSchedulerConfig struct {
    // CPUThreshold is the CPU usage above which jobs are not scheduled.
    CPUThreshold float64 `json:"cpu_threshold"`
    
    // MemoryThreshold is the memory usage above which jobs are not scheduled.
    MemoryThreshold float64 `json:"memory_threshold"`
    
    // DiskThreshold is the disk usage above which jobs are not scheduled.
    DiskThreshold float64 `json:"disk_threshold"`
    
    // NetworkThreshold is the network usage above which jobs are not scheduled.
    NetworkThreshold float64 `json:"network_threshold"`
    
    // ResourceCheckInterval is the period of the monitor's sampling ticker.
    ResourceCheckInterval time.Duration `json:"resource_check_interval"`
    
    // ResourceReservationRatio is the share of resources to keep in reserve.
    // NOTE(review): no code in this section reads it; confirm intended use.
    ResourceReservationRatio float64 `json:"resource_reservation_ratio"`
}

// ResourceMonitor periodically samples system resource usage in a background
// goroutine and exposes the latest sample.
type ResourceMonitor struct {
    // config supplies the sampling interval.
    config    *ResourceSchedulerConfig
    // ticker drives the periodic sampling.
    ticker    *time.Ticker
    // quit is closed by Stop to end the sampling goroutine.
    quit      chan struct{}
    // wg tracks the sampling goroutine so Stop can wait for it.
    wg        sync.WaitGroup
    // resources is the most recent sample.
    resources *SystemResources
    // mutex guards resources.
    mutex     sync.RWMutex
}

// SystemResources is one sample of system resource usage.
type SystemResources struct {
    // CPUUsage is the sampled CPU usage.
    CPUUsage    float64 `json:"cpu_usage"`
    // MemoryUsage is the sampled memory usage.
    MemoryUsage float64 `json:"memory_usage"`
    // DiskUsage is the sampled disk usage.
    DiskUsage   float64 `json:"disk_usage"`
    // NetworkUsage is the sampled network usage.
    NetworkUsage float64 `json:"network_usage"`
    // LastUpdated is when the sample was taken.
    LastUpdated time.Time `json:"last_updated"`
}

// ResourceSchedulerMetrics groups the Prometheus collectors of the
// resource-aware scheduler.
type ResourceSchedulerMetrics struct {
    // ResourceBasedSchedulingDecisions counts decisions, labelled by "decision_type".
    ResourceBasedSchedulingDecisions *prometheus.CounterVec
    // ResourceThresholdExceeded counts threshold violations, labelled by "resource_type".
    ResourceThresholdExceeded *prometheus.CounterVec
    // AverageResourceUsage is the average usage, labelled by "resource_type".
    AverageResourceUsage *prometheus.GaugeVec
}

// NewResourceAwareScheduler builds a resource-aware scheduler from config.
// It also creates a resource monitor, which starts sampling immediately, and
// a fresh set of metrics collectors. The jobStore field is left unset.
func NewResourceAwareScheduler(config *ResourceSchedulerConfig) *ResourceAwareScheduler {
    scheduler := new(ResourceAwareScheduler)
    scheduler.config = config
    scheduler.resourceMonitor = NewResourceMonitor(config)
    scheduler.metrics = NewResourceSchedulerMetrics()
    return scheduler
}

// NewResourceMonitor creates a resource monitor and starts its background
// sampling right away. Call Stop to end the sampling.
func NewResourceMonitor(config *ResourceSchedulerConfig) *ResourceMonitor {
    monitor := new(ResourceMonitor)
    monitor.config = config
    monitor.quit = make(chan struct{})
    monitor.resources = new(SystemResources)

    // Sampling begins as part of construction.
    monitor.startMonitoring()
    return monitor
}

// NewResourceSchedulerMetrics creates the collectors of the resource-aware
// scheduler. The collectors are only created here, not registered.
func NewResourceSchedulerMetrics() *ResourceSchedulerMetrics {
    decisions := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "resource_scheduler_decisions_total",
            Help: "Total number of resource-based scheduling decisions",
        },
        []string{"decision_type"},
    )

    exceeded := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "resource_scheduler_threshold_exceeded_total",
            Help: "Total number of times resource threshold exceeded",
        },
        []string{"resource_type"},
    )

    usage := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "resource_scheduler_average_usage_percent",
            Help: "Average resource usage percentage",
        },
        []string{"resource_type"},
    )

    m := new(ResourceSchedulerMetrics)
    m.ResourceBasedSchedulingDecisions = decisions
    m.ResourceThresholdExceeded = exceeded
    m.AverageResourceUsage = usage
    return m
}

// startMonitoring starts the background goroutine that samples resources on
// every tick until quit is closed.
//
// time.NewTicker panics on a non-positive duration, so an unset or negative
// ResourceCheckInterval falls back to a default period instead of crashing
// the constructor.
func (rm *ResourceMonitor) startMonitoring() {
    // fallbackInterval is used when the configured interval is not positive.
    const fallbackInterval = 10 * time.Second

    interval := rm.config.ResourceCheckInterval
    if interval <= 0 {
        interval = fallbackInterval
    }
    rm.ticker = time.NewTicker(interval)

    rm.wg.Add(1)
    go func() {
        defer rm.wg.Done()

        for {
            select {
            case <-rm.ticker.C:
                rm.collectResources()
            case <-rm.quit:
                return
            }
        }
    }()
}

// collectResources takes one sample of CPU, memory, disk and network usage
// and stores it, with the current time, as the latest sample. The write lock
// is held for the whole sampling.
func (rm *ResourceMonitor) collectResources() {
    rm.mutex.Lock()
    defer rm.mutex.Unlock()

    sample := rm.resources
    sample.CPUUsage = rm.getCPUUsage()
    sample.MemoryUsage = rm.getMemoryUsage()
    sample.DiskUsage = rm.getDiskUsage()
    sample.NetworkUsage = rm.getNetworkUsage()
    sample.LastUpdated = time.Now()
}

// getCPUUsage returns the current CPU usage.
func (rm *ResourceMonitor) getCPUUsage() float64 {
    // A real implementation would query a system API for the actual CPU usage.
    // Simplified here: a simulated constant value is returned.
    return 0.0
}

// getMemoryUsage returns the current memory usage.
func (rm *ResourceMonitor) getMemoryUsage() float64 {
    // A real implementation would query a system API for the actual memory usage.
    // Simplified here: a simulated constant value is returned.
    return 0.0
}

// getDiskUsage returns the current disk usage.
func (rm *ResourceMonitor) getDiskUsage() float64 {
    // A real implementation would query a system API for the actual disk usage.
    // Simplified here: a simulated constant value is returned.
    return 0.0
}

// getNetworkUsage returns the current network usage.
func (rm *ResourceMonitor) getNetworkUsage() float64 {
    // A real implementation would query a system API for the actual network usage.
    // Simplified here: a simulated constant value is returned.
    return 0.0
}

// GetCurrentResources returns a copy of the latest resource sample, so the
// caller may keep or modify the result without affecting the monitor.
func (rm *ResourceMonitor) GetCurrentResources() *SystemResources {
    rm.mutex.RLock()
    defer rm.mutex.RUnlock()

    // SystemResources contains only value fields, so a struct copy is a full copy.
    snapshot := *rm.resources
    return &snapshot
}

// ShouldScheduleJob reports whether current resource usage allows scheduling.
//
// CPU, memory, disk and network usage are checked in that order against their
// thresholds. The first resource above its threshold increments the
// threshold-exceeded counter for that resource and yields false. If none is
// exceeded, the "scheduled" decision counter is incremented and the result is
// true. The job itself is not inspected.
func (ras *ResourceAwareScheduler) ShouldScheduleJob(job *Job) bool {
    current := ras.resourceMonitor.GetCurrentResources()

    checks := []struct {
        resource string
        usage    float64
        limit    float64
    }{
        {"cpu", current.CPUUsage, ras.config.CPUThreshold},
        {"memory", current.MemoryUsage, ras.config.MemoryThreshold},
        {"disk", current.DiskUsage, ras.config.DiskThreshold},
        {"network", current.NetworkUsage, ras.config.NetworkThreshold},
    }

    for _, check := range checks {
        if check.usage > check.limit {
            ras.metrics.ResourceThresholdExceeded.WithLabelValues(check.resource).Inc()
            return false
        }
    }

    ras.metrics.ResourceBasedSchedulingDecisions.WithLabelValues("scheduled").Inc()
    return true
}

// Stop ends the background sampling and waits for the sampling goroutine to
// exit. It is safe to call Stop more than once and from several goroutines:
// the quit channel is closed only on the first call.
func (rm *ResourceMonitor) Stop() {
    // The check-and-close runs under the mutex so that concurrent callers
    // cannot both reach close, which would panic.
    rm.mutex.Lock()
    select {
    case <-rm.quit:
        // Already stopped.
    default:
        close(rm.quit)
    }
    rm.mutex.Unlock()

    if rm.ticker != nil {
        rm.ticker.Stop()
    }
    rm.wg.Wait()
}

4. 依赖调度策略

依赖调度是根据任务之间的依赖关系来决定执行顺序的策略,确保依赖任务按正确顺序执行。

4.1 依赖调度器设计

// DependencyBasedScheduler schedules jobs according to the dependencies
// recorded in its dependency graph.
type DependencyBasedScheduler struct {
    // config holds the dependency scheduler settings.
    config       *DependencySchedulerConfig
    // jobStore is not populated by NewDependencyBasedScheduler.
    jobStore     JobStore
    // dependencyGraph records the jobs and the dependencies between them.
    dependencyGraph *DependencyGraph
    // metrics holds the Prometheus collectors for this scheduler.
    metrics      *DependencySchedulerMetrics
}

// DependencySchedulerConfig is the configuration of the dependency scheduler.
type DependencySchedulerConfig struct {
    // MaxDependencyDepth is the maximum depth of a dependency chain.
    MaxDependencyDepth int `json:"max_dependency_depth"`
    
    // CircularDependencyDetection enables circular-dependency detection.
    CircularDependencyDetection bool `json:"circular_dependency_detection"`
    
    // MaxParallelExecutions limits how many jobs may execute in parallel.
    MaxParallelExecutions int `json:"max_parallel_executions"`
    
    // DependencyTimeout is how long to wait for a dependency.
    DependencyTimeout time.Duration `json:"dependency_timeout"`
}

// DependencyGraph records jobs and the dependency relations between them.
type DependencyGraph struct {
    // nodes maps a job ID to its node.
    nodes map[string]*DependencyNode
    // edges maps a job ID to the set of job IDs that depend on it:
    // edges[a][b] == true means job b depends on job a.
    edges map[string]map[string]bool
    // mutex guards nodes and edges.
    mutex sync.RWMutex
}

// DependencyNode is one job in the dependency graph.
type DependencyNode struct {
    // JobID is the ID of the job.
    JobID     string    `json:"job_id"`
    // Job is the job itself.
    Job       *Job      `json:"job"`
    // Status is the job's current status in the graph.
    Status    JobStatus `json:"status"`
    // DependsOn lists the IDs of the jobs this job depends on.
    DependsOn []string  `json:"depends_on"`
    // Dependent lists the IDs of the jobs that depend on this job.
    Dependent []string  `json:"dependent"`
    // CreatedAt is when the node was added to the graph.
    CreatedAt time.Time `json:"created_at"`
}

// DependencySchedulerMetrics groups the Prometheus collectors of the
// dependency scheduler.
type DependencySchedulerMetrics struct {
    // DependencyResolutions counts resolutions, labelled by "resolution_type".
    DependencyResolutions *prometheus.CounterVec
    // CircularDependencies counts detected cycles, labelled by "dependency_chain".
    CircularDependencies  *prometheus.CounterVec
    // DependencyTimeouts counts dependency timeouts, labelled by "job_id".
    DependencyTimeouts    *prometheus.CounterVec
    // ParallelExecutions is the number of parallel executions, labelled by "execution_type".
    ParallelExecutions    *prometheus.GaugeVec
}

// NewDependencyBasedScheduler builds a dependency scheduler from config with
// an empty dependency graph and a fresh set of metrics collectors.
// The jobStore field is left unset.
func NewDependencyBasedScheduler(config *DependencySchedulerConfig) *DependencyBasedScheduler {
    scheduler := new(DependencyBasedScheduler)
    scheduler.config = config
    scheduler.dependencyGraph = NewDependencyGraph()
    scheduler.metrics = NewDependencySchedulerMetrics()
    return scheduler
}

// NewDependencyGraph returns an empty dependency graph.
func NewDependencyGraph() *DependencyGraph {
    graph := new(DependencyGraph)
    graph.nodes = map[string]*DependencyNode{}
    graph.edges = map[string]map[string]bool{}
    return graph
}

// NewDependencySchedulerMetrics creates the collectors of the dependency
// scheduler. The collectors are only created here, not registered.
// NOTE(review): "job_id" and "dependency_chain" look like unbounded label
// values; confirm the resulting series cardinality is acceptable.
func NewDependencySchedulerMetrics() *DependencySchedulerMetrics {
    resolutions := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "dependency_scheduler_resolutions_total",
            Help: "Total number of dependency resolutions",
        },
        []string{"resolution_type"},
    )

    cycles := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "dependency_scheduler_circular_dependencies_total",
            Help: "Total number of circular dependencies detected",
        },
        []string{"dependency_chain"},
    )

    timeouts := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "dependency_scheduler_timeouts_total",
            Help: "Total number of dependency timeouts",
        },
        []string{"job_id"},
    )

    parallel := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "dependency_scheduler_parallel_executions",
            Help: "Number of parallel executions",
        },
        []string{"execution_type"},
    )

    m := new(DependencySchedulerMetrics)
    m.DependencyResolutions = resolutions
    m.CircularDependencies = cycles
    m.DependencyTimeouts = timeouts
    m.ParallelExecutions = parallel
    return m
}

// AddJob registers job in the graph as depending on every job ID listed in
// dependencies. The dependencies need not have been added yet.
//
// The circular-dependency check always runs: DependencyGraph carries no
// configuration of its own, and an undetected cycle would leave the affected
// jobs permanently unable to execute.
//
// It returns an error when job is nil or when adding it would create a
// circular dependency; in both cases the graph is left unchanged.
func (dg *DependencyGraph) AddJob(job *Job, dependencies []string) error {
    if job == nil {
        return errors.New("job must not be nil")
    }

    dg.mutex.Lock()
    defer dg.mutex.Unlock()

    if err := dg.detectCircularDependency(job.ID, dependencies); err != nil {
        return fmt.Errorf("circular dependency detected: %w", err)
    }

    node := &DependencyNode{
        JobID:     job.ID,
        Job:       job,
        Status:    JobStatusPending,
        DependsOn: dependencies,
        CreatedAt: time.Now(),
    }

    // Jobs added earlier may already have declared a dependency on this job;
    // carry those over so the node's Dependent list matches the edges.
    for dependentID := range dg.edges[job.ID] {
        node.Dependent = append(node.Dependent, dependentID)
    }

    dg.nodes[job.ID] = node

    for _, depID := range dependencies {
        dependents := dg.edges[depID]
        if dependents == nil {
            dependents = make(map[string]bool)
            dg.edges[depID] = dependents
        }
        if dependents[job.ID] {
            // Edge already recorded (repeated ID in dependencies, or the job
            // is being re-added): do not append a duplicate Dependent entry.
            continue
        }
        dependents[job.ID] = true

        // Keep the dependency's reverse list in step when its node exists.
        if depNode, exists := dg.nodes[depID]; exists {
            depNode.Dependent = append(depNode.Dependent, job.ID)
        }
    }

    return nil
}

// detectCircularDependency reports whether making jobID depend on the given
// dependencies would create a cycle, judged against the edges already in the
// graph. The caller must hold dg.mutex.
//
// edges[a] is the set of jobs that depend on a, so following edges from jobID
// yields every job that depends on jobID, directly or transitively. If one of
// the requested dependencies is in that set, or is jobID itself, the new
// dependency would close a loop.
func (dg *DependencyGraph) detectCircularDependency(jobID string, dependencies []string) error {
    if len(dependencies) == 0 {
        return nil
    }

    // Iterative traversal, so deep dependency chains cannot exhaust the stack.
    dependsOnJob := map[string]bool{jobID: true}
    pending := []string{jobID}
    for len(pending) > 0 {
        current := pending[len(pending)-1]
        pending = pending[:len(pending)-1]

        for dependentID := range dg.edges[current] {
            if !dependsOnJob[dependentID] {
                dependsOnJob[dependentID] = true
                pending = append(pending, dependentID)
            }
        }
    }

    for _, depID := range dependencies {
        if depID == jobID {
            return fmt.Errorf("circular dependency detected involving job %s", jobID)
        }
        if dependsOnJob[depID] {
            return fmt.Errorf("circular dependency detected involving dependency %s", depID)
        }
    }

    return nil
}

// hasCycle performs a depth-first search from nodeID along edges (towards the
// jobs that depend on nodeID) and reports whether it runs into a node that is
// still on the current search path.
//
// visited marks nodes already expanded; recursionStack marks nodes on the
// current path. Both maps are updated in place. The caller must hold dg.mutex.
func (dg *DependencyGraph) hasCycle(nodeID string, visited, recursionStack map[string]bool) bool {
    if visited[nodeID] {
        // Already expanded: nothing to explore from here.
        recursionStack[nodeID] = false
        return false
    }

    visited[nodeID] = true
    recursionStack[nodeID] = true

    // Ranging over a missing (nil) edge set simply does nothing.
    for next := range dg.edges[nodeID] {
        if visited[next] {
            // Reaching a node that is still on the path closes a loop.
            if recursionStack[next] {
                return true
            }
            continue
        }
        if dg.hasCycle(next, visited, recursionStack) {
            return true
        }
    }

    recursionStack[nodeID] = false
    return false
}

// CanExecute reports whether every dependency of jobID has status
// JobStatusSuccess.
//
// It returns an error when jobID, or one of its dependencies, is not present
// in the graph. A dependency that exists but has not succeeded yields
// (false, nil).
func (dg *DependencyGraph) CanExecute(jobID string) (bool, error) {
    dg.mutex.RLock()
    defer dg.mutex.RUnlock()

    target, found := dg.nodes[jobID]
    if !found {
        return false, fmt.Errorf("job %s not found in dependency graph", jobID)
    }

    for i := 0; i < len(target.DependsOn); i++ {
        depID := target.DependsOn[i]
        switch prerequisite, known := dg.nodes[depID]; {
        case !known:
            return false, fmt.Errorf("dependency %s not found", depID)
        case prerequisite.Status != JobStatusSuccess:
            // Dependency has not completed successfully yet.
            return false, nil
        }
    }

    return true, nil
}

// MarkCompleted sets the status of jobID to JobStatusSuccess. It returns an
// error when the job is not present in the graph.
//
// Dependent jobs are not notified; they see the new status the next time
// their readiness is evaluated.
func (dg *DependencyGraph) MarkCompleted(jobID string) error {
    dg.mutex.Lock()
    defer dg.mutex.Unlock()

    completed, found := dg.nodes[jobID]
    if !found {
        return fmt.Errorf("job %s not found in dependency graph", jobID)
    }

    completed.Status = JobStatusSuccess

    // Extension point: the jobs in dg.edges[jobID] depend on this job and
    // could be re-evaluated here.
    return nil
}

// GetReadyJobs returns up to limit pending jobs whose dependencies have all
// succeeded. The order of the result is unspecified (map iteration order).
// A non-positive limit yields no jobs. The returned error is always nil.
//
// The whole scan runs under a single read lock on the graph, so node
// statuses are not read while MarkCompleted is writing them. The readiness
// check is done inline rather than via CanExecute because CanExecute takes
// the same read lock, and sync.RWMutex prohibits recursive read locking.
//
// A pending job that names a dependency missing from the graph is logged and
// skipped.
func (dbs *DependencyBasedScheduler) GetReadyJobs(limit int) ([]*Job, error) {
    graph := dbs.dependencyGraph

    graph.mutex.RLock()
    defer graph.mutex.RUnlock()

    var readyJobs []*Job
    for _, node := range graph.nodes {
        if len(readyJobs) >= limit {
            break
        }
        if node.Status != JobStatusPending {
            continue
        }

        ready := true
        for _, depID := range node.DependsOn {
            depNode, exists := graph.nodes[depID]
            if !exists {
                log.Printf("Error checking if job %s can execute: dependency %s not found", node.JobID, depID)
                ready = false
                break
            }
            if depNode.Status != JobStatusSuccess {
                ready = false
                break
            }
        }

        if ready {
            readyJobs = append(readyJobs, node.Job)
        }
    }

    return readyJobs, nil
}

// GetAllNodes returns the nodes of the graph in unspecified order. The slice
// is new, but its elements are the graph's own node pointers. An empty graph
// yields a nil slice.
func (dg *DependencyGraph) GetAllNodes() []*DependencyNode {
    dg.mutex.RLock()
    defer dg.mutex.RUnlock()

    var all []*DependencyNode
    for id := range dg.nodes {
        all = append(all, dg.nodes[id])
    }
    return all
}

5. 动态调度策略

动态调度是根据系统运行时状态和任务特征动态调整调度策略的机制。

5.1 动态调度器设计

// DynamicScheduler picks a scheduling strategy at run time through its
// strategy selector.
type DynamicScheduler struct {
    // config holds the dynamic scheduler settings.
    config           *DynamicSchedulerConfig
    // jobStore is not populated by NewDynamicScheduler.
    jobStore         JobStore
    // strategySelector chooses among the registered strategies.
    strategySelector *StrategySelector
    // metrics holds the Prometheus collectors for this scheduler.
    metrics          *DynamicSchedulerMetrics
}

// DynamicSchedulerConfig is the configuration of the dynamic scheduler.
type DynamicSchedulerConfig struct {
    // StrategyEvaluationInterval is how often strategies are re-evaluated.
    // NOTE(review): no code in this section reads it; confirm intended use.
    StrategyEvaluationInterval time.Duration `json:"strategy_evaluation_interval"`
    
    // LearningWindow is the maximum number of scheduling records kept as history.
    LearningWindow int `json:"learning_window"`
    
    // AdaptiveThresholds maps a name to an adaptive threshold value.
    // NOTE(review): no code in this section reads it; confirm intended use.
    AdaptiveThresholds map[string]float64 `json:"adaptive_thresholds"`
    
    // StrategyWeights maps a strategy name to the weight applied to its
    // evaluation score during strategy selection.
    StrategyWeights map[string]float64 `json:"strategy_weights"`
}

// StrategySelector keeps the registered scheduling strategies and selects one
// of them per job.
type StrategySelector struct {
    // config supplies the strategy weights.
    config        *DynamicSchedulerConfig
    // history is the scheduling history, sized by config.LearningWindow.
    history       *SchedulingHistory
    // strategies maps a strategy name to the registered strategy.
    strategies    map[string]SchedulingStrategy
    // currentStrategy is the name of the most recently selected strategy.
    currentStrategy string
    // mutex guards strategies and currentStrategy.
    mutex         sync.RWMutex
}

// SchedulingHistory is a bounded window of the most recent scheduling records.
type SchedulingHistory struct {
    // records holds the retained records, oldest first.
    records []SchedulingRecord
    // maxSize is the maximum number of records retained.
    maxSize int
    // mutex guards records.
    mutex   sync.RWMutex
}

// SchedulingRecord describes the outcome of scheduling one job.
type SchedulingRecord struct {
    // JobID is the ID of the scheduled job.
    JobID        string        `json:"job_id"`
    // Strategy is the name of the strategy that scheduled the job.
    Strategy     string        `json:"strategy"`
    // ExecutionTime is how long the job ran.
    ExecutionTime time.Duration `json:"execution_time"`
    // ResourceUsage is the resource usage attributed to the job.
    ResourceUsage float64      `json:"resource_usage"`
    // Success reports whether the job succeeded.
    Success      bool          `json:"success"`
    // Timestamp is when the record was made.
    Timestamp    time.Time     `json:"timestamp"`
}

// SchedulingStrategy is the interface every scheduling strategy implements.
type SchedulingStrategy interface {
    // Name returns the strategy's name, which also keys its registration and weight.
    Name() string
    // Evaluate returns a suitability score for scheduling job in the given
    // context; selection prefers higher scores.
    Evaluate(job *Job, context *SchedulingContext) float64
    // Schedule schedules job using this strategy.
    Schedule(job *Job) error
}

// SchedulingContext is the system state handed to strategies for evaluation.
type SchedulingContext struct {
    // SystemLoad is the current system load.
    // NOTE(review): strategies compute 1.0 - SystemLoad, which suggests a 0..1 range; confirm.
    SystemLoad      float64           `json:"system_load"`
    // ResourceUsage is the current resource usage sample.
    ResourceUsage   *SystemResources  `json:"resource_usage"`
    // QueueLengths maps a queue name to its current length.
    QueueLengths    map[string]int    `json:"queue_lengths"`
    // HistoricalData is the scheduling history.
    HistoricalData  *SchedulingHistory `json:"historical_data"`
}

// DynamicSchedulerMetrics groups the Prometheus collectors of the dynamic
// scheduler.
type DynamicSchedulerMetrics struct {
    // StrategySwitches counts strategy switches, labelled by "from_strategy" and "to_strategy".
    StrategySwitches *prometheus.CounterVec
    // StrategyEffectiveness is the effectiveness per strategy, labelled by "strategy".
    StrategyEffectiveness *prometheus.GaugeVec
    // AdaptiveAdjustments counts adaptive adjustments, labelled by "adjustment_type".
    AdaptiveAdjustments *prometheus.CounterVec
}

// NewDynamicScheduler builds a dynamic scheduler from config with a new
// strategy selector and a fresh set of metrics collectors.
// The jobStore field is left unset.
func NewDynamicScheduler(config *DynamicSchedulerConfig) *DynamicScheduler {
    scheduler := new(DynamicScheduler)
    scheduler.config = config
    scheduler.strategySelector = NewStrategySelector(config)
    scheduler.metrics = NewDynamicSchedulerMetrics()
    return scheduler
}

// NewStrategySelector returns a selector with no registered strategies and a
// scheduling history sized by config.LearningWindow.
func NewStrategySelector(config *DynamicSchedulerConfig) *StrategySelector {
    selector := new(StrategySelector)
    selector.config = config
    selector.history = NewSchedulingHistory(config.LearningWindow)
    selector.strategies = map[string]SchedulingStrategy{}
    return selector
}

// NewSchedulingHistory returns an empty history that retains at most maxSize
// records.
//
// A negative maxSize is treated as zero. Passed through unchanged, it would
// panic in make here and again in AddRecord when it trims the window.
func NewSchedulingHistory(maxSize int) *SchedulingHistory {
    if maxSize < 0 {
        maxSize = 0
    }

    return &SchedulingHistory{
        records: make([]SchedulingRecord, 0, maxSize),
        maxSize: maxSize,
    }
}

// NewDynamicSchedulerMetrics creates the collectors of the dynamic scheduler.
// The collectors are only created here, not registered.
func NewDynamicSchedulerMetrics() *DynamicSchedulerMetrics {
    switches := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "dynamic_scheduler_strategy_switches_total",
            Help: "Total number of strategy switches",
        },
        []string{"from_strategy", "to_strategy"},
    )

    effectiveness := prometheus.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "dynamic_scheduler_strategy_effectiveness",
            Help: "Effectiveness of scheduling strategies",
        },
        []string{"strategy"},
    )

    adjustments := prometheus.NewCounterVec(
        prometheus.CounterOpts{
            Name: "dynamic_scheduler_adaptive_adjustments_total",
            Help: "Total number of adaptive adjustments",
        },
        []string{"adjustment_type"},
    )

    m := new(DynamicSchedulerMetrics)
    m.StrategySwitches = switches
    m.StrategyEffectiveness = effectiveness
    m.AdaptiveAdjustments = adjustments
    return m
}

// RegisterStrategy adds strategy to the selector under its Name(), replacing
// any strategy previously registered under the same name.
func (ss *StrategySelector) RegisterStrategy(strategy SchedulingStrategy) {
    ss.mutex.Lock()
    defer ss.mutex.Unlock()

    key := strategy.Name()
    ss.strategies[key] = strategy
}

// SelectStrategy returns the registered strategy best suited to job in the
// given context, or nil when no strategy is registered.
//
// With a single registered strategy, that strategy is returned without being
// evaluated. Otherwise each strategy's Evaluate score is multiplied by its
// entry in config.StrategyWeights and the highest weighted score wins.
//
//   - A strategy without a configured weight uses the neutral weight 1.0, so
//     an absent or partial weight map cannot make it unselectable.
//   - The best candidate is returned even when every weighted score is zero
//     or negative.
//   - Equal scores are resolved by strategy name, so the result does not
//     depend on map iteration order.
//
// The name of the selected strategy is remembered in currentStrategy. Because
// that is a write, the method takes the exclusive lock. Strategy switches are
// not counted as a metric here: StrategySelector holds no metrics handle.
func (ss *StrategySelector) SelectStrategy(job *Job, context *SchedulingContext) SchedulingStrategy {
    ss.mutex.Lock()
    defer ss.mutex.Unlock()

    if len(ss.strategies) == 0 {
        return nil
    }

    var (
        bestStrategy SchedulingStrategy
        bestName     string
        bestScore    float64
    )

    if len(ss.strategies) == 1 {
        // Nothing to compare: take the only strategy as is.
        for _, strategy := range ss.strategies {
            bestStrategy = strategy
            bestName = strategy.Name()
        }
    } else {
        for _, strategy := range ss.strategies {
            name := strategy.Name()

            weight, configured := ss.config.StrategyWeights[name]
            if !configured {
                weight = 1.0
            }
            weightedScore := strategy.Evaluate(job, context) * weight

            better := bestStrategy == nil ||
                weightedScore > bestScore ||
                (weightedScore == bestScore && name < bestName)
            if better {
                bestStrategy = strategy
                bestName = name
                bestScore = weightedScore
            }
        }
    }

    if bestStrategy != nil {
        ss.currentStrategy = bestName
    }

    return bestStrategy
}

// AddRecord appends record to the history and then drops the oldest records
// so that at most maxSize remain.
func (sh *SchedulingHistory) AddRecord(record SchedulingRecord) {
    sh.mutex.Lock()
    defer sh.mutex.Unlock()

    sh.records = append(sh.records, record)

    // Trim from the front: the oldest records leave the window first.
    if excess := len(sh.records) - sh.maxSize; excess > 0 {
        sh.records = sh.records[excess:]
    }
}

// GetRecentRecords returns the most recent records, oldest first, limited to
// count entries. If fewer records exist, all of them are returned. A
// non-positive count or an empty history yields an empty, non-nil slice.
//
// The result is a copy: the caller can keep and modify it without racing with
// AddRecord or altering the history.
func (sh *SchedulingHistory) GetRecentRecords(count int) []SchedulingRecord {
    sh.mutex.RLock()
    defer sh.mutex.RUnlock()

    // A negative count would otherwise produce a start index past the end of
    // the slice and panic.
    if count <= 0 || len(sh.records) == 0 {
        return []SchedulingRecord{}
    }
    if count > len(sh.records) {
        count = len(sh.records)
    }

    recent := make([]SchedulingRecord, count)
    copy(recent, sh.records[len(sh.records)-count:])
    return recent
}

// GetAverageExecutionTime returns the mean ExecutionTime of the retained
// records produced by the named strategy, or 0 when there are none. The
// division is an integer division on the nanosecond count.
func (sh *SchedulingHistory) GetAverageExecutionTime(strategy string) time.Duration {
    sh.mutex.RLock()
    defer sh.mutex.RUnlock()

    var (
        sum     time.Duration
        matches int64
    )
    for i := range sh.records {
        if sh.records[i].Strategy != strategy {
            continue
        }
        sum += sh.records[i].ExecutionTime
        matches++
    }

    if matches == 0 {
        return 0
    }
    return sum / time.Duration(matches)
}

// GetSuccessRate returns the fraction (0..1) of the retained records produced
// by the named strategy that succeeded, or 0 when there are none.
func (sh *SchedulingHistory) GetSuccessRate(strategy string) float64 {
    sh.mutex.RLock()
    defer sh.mutex.RUnlock()

    matched, succeeded := 0, 0
    for i := range sh.records {
        rec := &sh.records[i]
        if rec.Strategy != strategy {
            continue
        }
        matched++
        if rec.Success {
            succeeded++
        }
    }

    if matched == 0 {
        return 0
    }
    return float64(succeeded) / float64(matched)
}

// AdaptivePriorityStrategy is a scheduling strategy that scores jobs from its
// own scheduling history, the job priority and the system load.
type AdaptivePriorityStrategy struct {
    // name is the strategy name returned by Name.
    name     string
    // history supplies the success rate and average execution time.
    history  *SchedulingHistory
    // config is the dynamic scheduler configuration.
    config   *DynamicSchedulerConfig
}

// NewAdaptivePriorityStrategy returns the strategy named "adaptive_priority"
// backed by the given history and configuration.
func NewAdaptivePriorityStrategy(history *SchedulingHistory, config *DynamicSchedulerConfig) *AdaptivePriorityStrategy {
    strategy := new(AdaptivePriorityStrategy)
    strategy.name = "adaptive_priority"
    strategy.history = history
    strategy.config = config
    return strategy
}

// Name returns the name of the strategy.
func (aps *AdaptivePriorityStrategy) Name() string {
    return aps.name
}

// Evaluate scores how suitable this strategy is for job in the given context.
//
// The score is a weighted sum of four components, each limited to [0, 1], so
// the result is also within [0, 1]:
//
//   - 0.4 × historical success rate of this strategy
//   - 0.3 × speed: 1 minus the average execution time as a fraction of 60 s
//   - 0.2 × job priority divided by 10 (priorities are assumed to be 1-10)
//   - 0.1 × spare capacity: 1 minus the system load
//
// Without the limits, a long average execution time or a load above 1 would
// drive the score arbitrarily negative and swamp the other components.
// A nil context contributes no spare-capacity score.
func (aps *AdaptivePriorityStrategy) Evaluate(job *Job, context *SchedulingContext) float64 {
    clampUnit := func(v float64) float64 {
        if v < 0 {
            return 0
        }
        if v > 1 {
            return 1
        }
        return v
    }

    successRate := aps.history.GetSuccessRate(aps.name)
    avgExecutionTime := aps.history.GetAverageExecutionTime(aps.name)

    speedScore := clampUnit(1.0 - avgExecutionTime.Seconds()/60.0)
    priorityScore := clampUnit(float64(job.Priority) / 10.0)

    loadScore := 0.0
    if context != nil {
        loadScore = clampUnit(1.0 - context.SystemLoad)
    }

    return successRate*0.4 + speedScore*0.3 + priorityScore*0.2 + loadScore*0.1
}

// Schedule schedules job with the adaptive priority strategy. In this
// simplified version it only logs the job ID and always returns nil.
func (aps *AdaptivePriorityStrategy) Schedule(job *Job) error {
    // Placeholder for the concrete scheduling logic.
    log.Printf("Scheduling job %s with adaptive priority strategy", job.ID)
    return nil
}

// ResourceAwareStrategy is a scheduling strategy that scores jobs from the
// current resource usage and queue lengths.
type ResourceAwareStrategy struct {
    // name is the strategy name returned by Name.
    name   string
    // config is the dynamic scheduler configuration.
    config *DynamicSchedulerConfig
}

// NewResourceAwareStrategy returns the strategy named "resource_aware" using
// the given configuration.
func NewResourceAwareStrategy(config *DynamicSchedulerConfig) *ResourceAwareStrategy {
    strategy := new(ResourceAwareStrategy)
    strategy.name = "resource_aware"
    strategy.config = config
    return strategy
}

// Name returns the name of the strategy.
func (ras *ResourceAwareStrategy) Name() string {
    return ras.name
}

// Evaluate scores how suitable this strategy is for job in the given context.
//
// The score is 0.6 × the mean of CPU and memory usage (the busier the system,
// the more this strategy applies) plus 0.4 × a queue score that falls from 1
// to 0 as the total queue length rises from 0 to 100.
//
// The queue score does not go below 0, so a long backlog cannot push the
// total arbitrarily negative. A nil context, or a context without a resource
// sample, is treated as zero usage and empty queues instead of panicking.
//
// NOTE(review): the resource score is only within 0..1 if usage is reported
// as a fraction; confirm the unit the monitor uses.
func (ras *ResourceAwareStrategy) Evaluate(job *Job, context *SchedulingContext) float64 {
    var (
        cpuUsage, memoryUsage float64
        totalQueueLength      int
    )

    if context != nil {
        if usage := context.ResourceUsage; usage != nil {
            cpuUsage = usage.CPUUsage
            memoryUsage = usage.MemoryUsage
        }
        for _, length := range context.QueueLengths {
            totalQueueLength += length
        }
    }

    resourceScore := (cpuUsage + memoryUsage) / 2.0

    // 100 is taken as the queue length at which the queue score reaches zero.
    queueScore := 1.0 - float64(totalQueueLength)/100.0
    if queueScore < 0 {
        queueScore = 0
    }

    return resourceScore*0.6 + queueScore*0.4
}

// Schedule schedules job with the resource aware strategy. In this simplified
// version it only logs the job ID and always returns nil.
func (ras *ResourceAwareStrategy) Schedule(job *Job) error {
    // Placeholder for the concrete scheduling logic.
    log.Printf("Scheduling job %s with resource aware strategy", job.ID)
    return nil
}

6. 调度策略组合与优化

在实际应用中,往往需要组合多种调度策略来满足复杂的业务需求。

6.1 策略组合器

// StrategyCombinator selects one strategy out of several according to a
// combination rule.
type StrategyCombinator struct {
    // strategies are the candidate strategies.
    strategies []SchedulingStrategy
    // combinatorType selects the combination rule used by Combine.
    combinatorType CombinatorType
    // weights holds one weight per strategy, matched by index.
    weights    []float64
}

// CombinatorType names a rule for combining strategy evaluations.
type CombinatorType string

// Combination rules understood by StrategyCombinator.Combine.
const (
    CombinatorTypeWeightedAverage CombinatorType = "weighted_average"
    CombinatorTypeBestScore       CombinatorType = "best_score"
    CombinatorTypeVoting          CombinatorType = "voting"
)

// NewStrategyCombinator returns a combinator over strategies that applies the
// given combination rule.
//
// weights are matched to strategies by index. When weights is nil or empty,
// every strategy gets the uniform weight 1/len(strategies). Otherwise the
// weights are copied into a slice of exactly len(strategies) entries: surplus
// weights are ignored and missing ones are 0. This keeps index-based lookups
// in range and decouples the combinator from later changes to the caller's
// slice.
func NewStrategyCombinator(strategies []SchedulingStrategy, combinatorType CombinatorType, weights []float64) *StrategyCombinator {
    normalized := make([]float64, len(strategies))
    if len(weights) == 0 {
        for i := range normalized {
            normalized[i] = 1.0 / float64(len(strategies))
        }
    } else {
        copy(normalized, weights)
    }

    return &StrategyCombinator{
        strategies:     strategies,
        combinatorType: combinatorType,
        weights:        normalized,
    }
}

// Combine selects a strategy for job according to the combinator's rule.
// The best-score rule is also the fallback for unrecognised rule names.
func (sc *StrategyCombinator) Combine(job *Job, context *SchedulingContext) SchedulingStrategy {
    switch sc.combinatorType {
    case CombinatorTypeWeightedAverage:
        return sc.weightedAverageSelection(job, context)
    case CombinatorTypeVoting:
        return sc.votingSelection(job, context)
    }

    // CombinatorTypeBestScore and anything unrecognised.
    return sc.bestScoreSelection(job, context)
}

// weightedAverageSelection returns the strategy with the highest weighted
// score (Evaluate score × the weight at the same index), or nil when there
// are no strategies. Earlier strategies win ties.
//
// A strategy without a matching weight entry is weighted 0 rather than
// causing an out-of-range panic. The best candidate is returned even when
// every weighted score is zero or negative.
func (sc *StrategyCombinator) weightedAverageSelection(job *Job, context *SchedulingContext) SchedulingStrategy {
    var (
        bestStrategy SchedulingStrategy
        bestScore    float64
        found        bool
    )

    for i, strategy := range sc.strategies {
        weight := 0.0
        if i < len(sc.weights) {
            weight = sc.weights[i]
        }
        weightedScore := strategy.Evaluate(job, context) * weight

        if !found || weightedScore > bestScore {
            bestStrategy = strategy
            bestScore = weightedScore
            found = true
        }
    }

    return bestStrategy
}

// bestScoreSelection returns the strategy with the highest Evaluate score, or
// nil when there are no strategies. Earlier strategies win ties.
//
// The best candidate is returned even when every score is zero or negative.
func (sc *StrategyCombinator) bestScoreSelection(job *Job, context *SchedulingContext) SchedulingStrategy {
    var (
        bestStrategy SchedulingStrategy
        bestScore    float64
        found        bool
    )

    for _, strategy := range sc.strategies {
        score := strategy.Evaluate(job, context)
        if !found || score > bestScore {
            bestStrategy = strategy
            bestScore = score
            found = true
        }
    }

    return bestStrategy
}

// votingSelection returns the strategy whose name collected the most votes,
// or nil when no strategy received a vote.
//
// In this simplified scheme every strategy is evaluated once and casts one
// vote for its own name when its score exceeds 0.5. Ties are resolved in
// favour of the strategy that comes first in sc.strategies, so the outcome is
// deterministic rather than dependent on map iteration order.
func (sc *StrategyCombinator) votingSelection(job *Job, context *SchedulingContext) SchedulingStrategy {
    // Scores above this threshold count as a vote.
    const voteThreshold = 0.5

    votes := make(map[string]int)
    for _, strategy := range sc.strategies {
        if strategy.Evaluate(job, context) > voteThreshold {
            votes[strategy.Name()]++
        }
    }

    // Walk the strategies in their declared order so that equal vote counts
    // always resolve the same way.
    var winner SchedulingStrategy
    maxVotes := 0
    for _, strategy := range sc.strategies {
        if count := votes[strategy.Name()]; count > maxVotes {
            maxVotes = count
            winner = strategy
        }
    }

    return winner
}

7. 总结

复杂调度策略的设计和实现是分布式任务调度系统的核心能力之一,它能够显著提升系统的智能化水平和资源利用效率:

  1. 优先级调度:通过多级优先级队列确保重要任务优先执行,同时避免低优先级任务饥饿
  2. 资源感知调度:根据系统资源使用情况动态调整任务执行策略,防止系统过载
  3. 依赖调度:通过构建依赖图确保任务按正确的依赖关系执行,支持复杂的任务编排
  4. 动态调度:基于历史数据和实时状态自适应调整调度策略,持续优化调度效果
  5. 策略组合:通过组合多种调度策略形成更强大的综合调度能力

在实际应用中,需要根据具体的业务场景和性能要求选择合适的调度策略,并持续监控和优化以达到最佳效果。通过合理运用这些复杂调度策略,我们可以构建出更加智能、高效、可靠的分布式任务调度系统,为企业级应用提供强有力的调度保障。

下一章我们将深入探讨分布式任务调度系统中的任务编排与规则引擎设计。