分布式任务调度系统的架构设计与核心实现
分布式任务调度系统是现代企业级应用的重要组成部分,负责在分布式环境中高效、可靠地执行各种定时和异步任务。一个设计良好的分布式任务调度系统不仅要支持高并发和高可用,还要具备灵活的任务编排、复杂的调度策略和完善的监控能力。本章将深入探讨分布式任务调度系统的架构设计和核心技术。
1. 分布式任务调度概述
在深入具体实现之前,我们需要了解分布式任务调度系统的基本概念和核心挑战。
1.1 核心概念
// DistributedTaskSchedulingConcepts lists the core concepts of distributed
// task scheduling, one boolean field per concept.
type DistributedTaskSchedulingConcepts struct {
	Job       bool // Job: a unit of work that needs to be executed.
	Scheduler bool // Scheduler: decides when a job is executed.
	Executor  bool // Executor: actually executes the job.
	Trigger   bool // Trigger: defines the time rule for executing a job.
	TaskQueue bool // TaskQueue: queue that stores jobs waiting to be executed.
}
1.2 核心挑战
// DistributedTaskSchedulingChallenges lists the core challenges of
// distributed task scheduling, one boolean field per challenge.
type DistributedTaskSchedulingChallenges struct {
	DataConsistency       bool // Data consistency.
	HighAvailability      bool // High availability.
	FaultRecovery         bool // Fault recovery.
	LoadBalancing         bool // Load balancing.
	TaskSharding          bool // Task sharding.
	ConcurrencyControl    bool // Concurrency control.
	MonitoringAndAlerting bool // Monitoring and alerting.
}
2. 系统架构设计
分布式任务调度系统的架构设计需要考虑多个组件的协调工作,形成一个完整的工作流。
2.1 架构概览
graph TB
A[任务生产者] --> B[任务存储]
C[调度器集群] --> B
D[执行器集群] --> B
E[管理控制台] --> B
F[监控系统] --> C
F --> D
G[通知服务] --> A
B --> H[持久化存储]
C --> I[注册中心]
D --> I
2.2 核心组件设计
// TaskSchedulerSystem is the task scheduling system. It holds the scheduler
// configuration together with one instance of each component.
type TaskSchedulerSystem struct {
	config       *SchedulerConfig     // scheduler configuration
	jobStore     JobStore             // storage for jobs and execution records
	scheduler    *Scheduler           // scheduler component
	executor     *Executor            // executor component
	registry     *ServiceRegistry     // service registry
	monitor      *SystemMonitor       // system monitor
	notification *NotificationService // notification service
	metrics      *SchedulerMetrics    // scheduler metrics
}
// SchedulerConfig is the scheduler configuration.
type SchedulerConfig struct {
	// ScheduleInterval is the scheduling interval.
	ScheduleInterval time.Duration `json:"schedule_interval"`

	// ExecutionTimeout is the execution timeout.
	ExecutionTimeout time.Duration `json:"execution_timeout"`

	// MaxConcurrentTasks is the maximum number of concurrent tasks.
	MaxConcurrentTasks int `json:"max_concurrent_tasks"`

	// MaxRetryAttempts is the number of task retries.
	MaxRetryAttempts int `json:"max_retry_attempts"`

	// LoadBalancingStrategy names the load balancing strategy.
	LoadBalancingStrategy string `json:"load_balancing_strategy"`

	// FailoverStrategy names the failover strategy.
	FailoverStrategy string `json:"failover_strategy"`
}
// Job is the definition of a task.
type Job struct {
	ID          string `json:"id"`
	Name        string `json:"name"`
	Description string `json:"description"`

	// CronExpr is the cron expression of the job.
	CronExpr string `json:"cron_expr"`
	// TaskType is the task type.
	TaskType string `json:"task_type"`
	// Parameters holds the task parameters.
	Parameters map[string]string `json:"parameters"`
	// Priority is the task priority.
	Priority int `json:"priority"`
	// Timeout is the execution timeout.
	Timeout time.Duration `json:"timeout"`
	// RetryCount is the number of retries.
	RetryCount int `json:"retry_count"`
	// MaxRetry is the maximum number of retries.
	MaxRetry int `json:"max_retry"`
	// Status is the task status.
	Status JobStatus `json:"status"`

	CreatedAt time.Time `json:"created_at"`
	UpdatedAt time.Time `json:"updated_at"`
}
// JobStatus is the status of a job.
type JobStatus string

// Job status values.
const (
	// JobStatusPending marks a job waiting to be executed.
	JobStatusPending JobStatus = "pending"
	// JobStatusRunning marks a job that is being executed.
	JobStatusRunning JobStatus = "running"
	// JobStatusSuccess marks a job that was executed successfully.
	JobStatusSuccess JobStatus = "success"
	// JobStatusFailed marks a job whose execution failed.
	JobStatusFailed JobStatus = "failed"
	// JobStatusCancelled marks a job that was cancelled.
	JobStatusCancelled JobStatus = "cancelled"
)
// JobExecution is the record of one execution of a job.
type JobExecution struct {
	// Identity of the record, of the job, and of the executor.
	ID         string `json:"id"`
	JobID      string `json:"job_id"`
	ExecutorID string `json:"executor_id"`

	// Start and end time of the execution.
	StartTime time.Time `json:"start_time"`
	EndTime   time.Time `json:"end_time"`

	// Status, result, error text and retry count.
	Status     ExecutionStatus `json:"status"`
	Result     string          `json:"result"`
	Error      string          `json:"error"`
	RetryCount int             `json:"retry_count"`
}
// ExecutionStatus is the status of a job execution.
type ExecutionStatus string

// Execution status values.
const (
	// ExecutionStatusStarted is the "started" status.
	ExecutionStatusStarted ExecutionStatus = "started"
	// ExecutionStatusFinished is the "finished" status.
	ExecutionStatusFinished ExecutionStatus = "finished"
	// ExecutionStatusFailed is the "failed" status.
	ExecutionStatusFailed ExecutionStatus = "failed"
)
// NewTaskSchedulerSystem creates a task scheduling system from config.
//
// The in-memory job store created here is also handed to the executor.
// NewExecutor takes no store argument, so without this wiring the executor's
// jobStore field stays nil and Executor.SubmitJob would panic when it tries
// to record an execution.
func NewTaskSchedulerSystem(config *SchedulerConfig) *TaskSchedulerSystem {
	store := NewInMemoryJobStore()
	scheduler := NewScheduler(config)

	executor := NewExecutor(config)
	executor.jobStore = store

	return &TaskSchedulerSystem{
		config:       config,
		jobStore:     store,
		scheduler:    scheduler,
		executor:     executor,
		registry:     NewServiceRegistry(),
		monitor:      NewSystemMonitor(),
		notification: NewNotificationService(),
		metrics:      NewSchedulerMetrics(),
	}
}
3. 任务存储实现
任务存储是分布式任务调度系统的核心组件,负责存储任务定义、执行记录等关键数据。
3.1 任务存储接口
// JobStore is the storage interface for jobs and their execution records.
type JobStore interface {
	// CreateJob creates a job.
	CreateJob(ctx context.Context, job *Job) error

	// UpdateJob updates a job.
	UpdateJob(ctx context.Context, job *Job) error

	// DeleteJob deletes a job.
	DeleteJob(ctx context.Context, jobID string) error

	// GetJob fetches a job.
	GetJob(ctx context.Context, jobID string) (*Job, error)

	// ListJobs lists all jobs.
	ListJobs(ctx context.Context) ([]*Job, error)

	// GetJobsByStatus fetches jobs by status.
	GetJobsByStatus(ctx context.Context, status JobStatus) ([]*Job, error)

	// CreateExecution creates a job execution record.
	CreateExecution(ctx context.Context, execution *JobExecution) error

	// UpdateExecution updates a job execution record.
	UpdateExecution(ctx context.Context, execution *JobExecution) error

	// GetExecutions fetches the execution records of a job.
	GetExecutions(ctx context.Context, jobID string) ([]*JobExecution, error)

	// GetPendingJobs fetches jobs waiting to be executed.
	GetPendingJobs(ctx context.Context, limit int) ([]*Job, error)
}
// InMemoryJobStore is the in-memory implementation of job storage.
type InMemoryJobStore struct {
	// jobs is used as a map[string]*Job.
	jobs sync.Map
	// executions is used as a map[string][]*JobExecution.
	executions sync.Map

	mutex sync.RWMutex
}
// NewInMemoryJobStore creates an empty in-memory job store.
func NewInMemoryJobStore() *InMemoryJobStore {
	return new(InMemoryJobStore)
}
// CreateJob stores job in memory.
//
// When job.ID is empty a new UUID is assigned. CreatedAt and UpdatedAt are
// both set to the same instant. A job already stored under the same ID is
// overwritten. A nil job is rejected with an error instead of panicking.
func (ims *InMemoryJobStore) CreateJob(ctx context.Context, job *Job) error {
	if job == nil {
		return errors.New("job is nil")
	}
	if job.ID == "" {
		job.ID = uuid.New().String()
	}

	// Read the clock once so a freshly created job has identical
	// creation and update timestamps.
	now := time.Now()
	job.CreatedAt = now
	job.UpdatedAt = now

	ims.jobs.Store(job.ID, job)
	return nil
}
// UpdateJob replaces the stored job that has the same ID as job.
// It returns an error when no job with that ID is stored.
func (ims *InMemoryJobStore) UpdateJob(ctx context.Context, job *Job) error {
	ims.mutex.Lock()
	defer ims.mutex.Unlock()

	stored, found := ims.jobs.Load(job.ID)
	if !found {
		return fmt.Errorf("job %s not found", job.ID)
	}

	// Keep the original creation time; only the update time is refreshed.
	job.CreatedAt = stored.(*Job).CreatedAt
	job.UpdatedAt = time.Now()
	ims.jobs.Store(job.ID, job)
	return nil
}
// DeleteJob removes the job stored under jobID together with its execution
// records. It always returns nil, also when nothing is stored under jobID.
func (ims *InMemoryJobStore) DeleteJob(ctx context.Context, jobID string) error {
	// Drop the job definition first, then its execution history.
	ims.jobs.Delete(jobID)
	ims.executions.Delete(jobID)

	return nil
}
// GetJob returns the job stored under jobID, or an error when there is none.
func (ims *InMemoryJobStore) GetJob(ctx context.Context, jobID string) (*Job, error) {
	if stored, found := ims.jobs.Load(jobID); found {
		return stored.(*Job), nil
	}
	return nil, fmt.Errorf("job %s not found", jobID)
}
// ListJobs returns every stored job. The result is nil when no job is stored.
func (ims *InMemoryJobStore) ListJobs(ctx context.Context) ([]*Job, error) {
	var all []*Job
	collect := func(_, v interface{}) bool {
		all = append(all, v.(*Job))
		return true // keep iterating over every entry
	}
	ims.jobs.Range(collect)
	return all, nil
}
// GetJobsByStatus returns the stored jobs whose Status equals status.
// The result is nil when no job matches.
func (ims *InMemoryJobStore) GetJobsByStatus(ctx context.Context, status JobStatus) ([]*Job, error) {
	var matched []*Job
	ims.jobs.Range(func(_, v interface{}) bool {
		if candidate := v.(*Job); candidate.Status == status {
			matched = append(matched, candidate)
		}
		return true
	})
	return matched, nil
}
// CreateExecution appends execution to the records kept for its JobID.
// When execution.ID is empty a new UUID is assigned first.
func (ims *InMemoryJobStore) CreateExecution(ctx context.Context, execution *JobExecution) error {
	ims.mutex.Lock()
	defer ims.mutex.Unlock()

	if execution.ID == "" {
		execution.ID = uuid.New().String()
	}

	// Start from the records already stored for this job, if any.
	history := make([]*JobExecution, 0)
	if stored, ok := ims.executions.Load(execution.JobID); ok {
		history = stored.([]*JobExecution)
	}

	// Append the new record and store the resulting list.
	ims.executions.Store(execution.JobID, append(history, execution))
	return nil
}
// UpdateExecution replaces the stored execution record that has the same ID
// as execution within the records of execution.JobID.
//
// Delegating to CreateExecution, as before, appended a second entry for the
// same execution on every update. The record is now replaced in place.
//
// It returns an error when execution is nil or when no record with that ID is
// stored for the job, matching UpdateJob, which also reports unknown entries.
func (ims *InMemoryJobStore) UpdateExecution(ctx context.Context, execution *JobExecution) error {
	if execution == nil {
		return errors.New("execution is nil")
	}

	ims.mutex.Lock()
	defer ims.mutex.Unlock()

	stored, ok := ims.executions.Load(execution.JobID)
	if !ok {
		return fmt.Errorf("execution %s not found", execution.ID)
	}

	current := stored.([]*JobExecution)
	for i, record := range current {
		if record.ID != execution.ID {
			continue
		}
		// Replace in a copy: GetExecutions hands the stored slice to callers
		// without locking, so the slice they already hold must not change.
		updated := make([]*JobExecution, len(current))
		copy(updated, current)
		updated[i] = execution
		ims.executions.Store(execution.JobID, updated)
		return nil
	}
	return fmt.Errorf("execution %s not found", execution.ID)
}
// GetExecutions returns the execution records stored for jobID.
// It returns an empty, non-nil slice when nothing is stored for the job.
func (ims *InMemoryJobStore) GetExecutions(ctx context.Context, jobID string) ([]*JobExecution, error) {
	if stored, ok := ims.executions.Load(jobID); ok {
		return stored.([]*JobExecution), nil
	}
	return []*JobExecution{}, nil
}
// GetPendingJobs returns up to limit jobs whose status is JobStatusPending.
// The result is nil when no pending job is collected.
func (ims *InMemoryJobStore) GetPendingJobs(ctx context.Context, limit int) ([]*Job, error) {
	var pending []*Job
	ims.jobs.Range(func(_, v interface{}) bool {
		// Stop iterating once enough jobs were collected.
		if len(pending) >= limit {
			return false
		}
		if candidate := v.(*Job); candidate.Status == JobStatusPending {
			pending = append(pending, candidate)
		}
		return true
	})
	return pending, nil
}
4. 调度器实现
调度器是任务调度系统的核心组件,负责根据任务的调度规则触发任务执行。
4.1 调度器设计
// Scheduler is the scheduler component.
type Scheduler struct {
	config *SchedulerConfig

	// Collaborators assigned by Start.
	jobStore JobStore
	executor *Executor

	cronParser *cron.Parser

	// Scheduling loop state: the ticker driving the loop, the channel closed
	// by Stop, and the wait group tracking the loop goroutine.
	ticker   *time.Ticker
	stopChan chan struct{}
	wg       sync.WaitGroup

	metrics *SchedulerMetrics
}
// NewScheduler creates a scheduler that uses config and a six-field cron
// parser (second, minute, hour, day of month, month, day of week).
//
// NOTE(review): this assumes cron is github.com/robfig/cron, where NewParser
// returns a Parser value. The cronParser field is a *cron.Parser, so the
// value cannot be assigned directly; it is kept in a local and its address is
// stored instead.
func NewScheduler(config *SchedulerConfig) *Scheduler {
	parser := cron.NewParser(cron.Second | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
	return &Scheduler{
		config:     config,
		cronParser: &parser,
		stopChan:   make(chan struct{}),
		metrics:    NewSchedulerMetrics(),
	}
}
// Start wires jobStore and executor into the scheduler and launches the
// scheduling loop in a background goroutine.
//
// It returns an error, without starting anything, when the configuration is
// missing or its ScheduleInterval is not positive, or when jobStore or
// executor is nil. Previously these cases surfaced as a panic inside the
// background goroutine (time.NewTicker panics on a non-positive interval).
//
// ctx is accepted for interface stability but is not observed by the loop;
// use Stop to terminate it.
func (s *Scheduler) Start(ctx context.Context, jobStore JobStore, executor *Executor) error {
	if s.config == nil || s.config.ScheduleInterval <= 0 {
		return errors.New("scheduler: schedule interval must be positive")
	}
	if jobStore == nil || executor == nil {
		return errors.New("scheduler: job store and executor are required")
	}

	s.jobStore = jobStore
	s.executor = executor

	// Launch the scheduling loop.
	s.wg.Add(1)
	go s.scheduleLoop()
	return nil
}
// scheduleLoop is the scheduling loop. It calls scheduleJobs on every tick of
// a ticker created with the configured ScheduleInterval, and returns once
// stopChan is closed.
func (s *Scheduler) scheduleLoop() {
	defer s.wg.Done()

	tick := time.NewTicker(s.config.ScheduleInterval)
	s.ticker = tick
	defer s.ticker.Stop()

	for {
		select {
		case <-s.ticker.C:
			s.scheduleJobs()
		case <-s.stopChan:
			return
		}
	}
}
// scheduleJobs runs one scheduling pass: it loads up to 100 pending jobs,
// parses each job's cron expression, and submits every job that has an
// activation time inside the window (now-ScheduleInterval, now].
//
// The previous check compared schedule.Next(time.Now()) against an earlier
// now. Next returns an activation strictly after its argument, so that
// condition was never true and no job was ever submitted. Asking for the
// first activation after the start of the current tick window, and testing
// that it is not in the future, fixes this.
//
// NOTE(review): the window is derived from the configured interval rather
// than the real time of the previous pass, so a late tick can miss an
// activation. Tracking the last pass time would need a new Scheduler field.
func (s *Scheduler) scheduleJobs() {
	ctx := context.Background()

	// Fetch pending jobs; at most 100 are processed per pass.
	jobs, err := s.jobStore.GetPendingJobs(ctx, 100)
	if err != nil {
		log.Printf("Failed to get pending jobs: %v", err)
		return
	}

	now := time.Now()
	windowStart := now.Add(-s.config.ScheduleInterval)

	for _, job := range jobs {
		schedule, err := s.cronParser.Parse(job.CronExpr)
		if err != nil {
			log.Printf("Failed to parse cron expression for job %s: %v", job.ID, err)
			continue
		}

		// Not due: the first activation after the window start is still ahead.
		if schedule.Next(windowStart).After(now) {
			continue
		}

		if err := s.executor.SubmitJob(ctx, job); err != nil {
			log.Printf("Failed to submit job %s for execution: %v", job.ID, err)
			s.metrics.SchedulingErrors.WithLabelValues("submission_failed").Inc()
			continue
		}
		s.metrics.JobsScheduled.WithLabelValues(job.TaskType).Inc()
	}
}
// Stop signals the scheduling loop to exit and waits for it to finish.
//
// Calling Stop again after it has returned is a no-op; previously the second
// call panicked with "close of closed channel". Stop is not safe to call from
// several goroutines at the same time.
func (s *Scheduler) Stop() {
	select {
	case <-s.stopChan:
		// Already closed by an earlier Stop.
	default:
		close(s.stopChan)
	}
	s.wg.Wait()
}
4.2 Cron表达式解析
// CronExpressionParser parses cron expressions.
type CronExpressionParser struct {
	// parser is the underlying cron parser that Parse delegates to.
	parser *cron.Parser
}
// NewCronExpressionParser creates a parser for six-field cron expressions
// (second, minute, hour, day of month, month, day of week).
//
// NOTE(review): this assumes cron is github.com/robfig/cron, where NewParser
// returns a Parser value. The parser field is a *cron.Parser, so the value is
// kept in a local and its address is stored instead of assigning it directly.
func NewCronExpressionParser() *CronExpressionParser {
	parser := cron.NewParser(cron.Second | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow)
	return &CronExpressionParser{
		parser: &parser,
	}
}
// Parse parses a cron expression with the underlying parser and returns the
// resulting schedule together with any parse error.
func (cep *CronExpressionParser) Parse(expression string) (cron.Schedule, error) {
	schedule, err := cep.parser.Parse(expression)
	return schedule, err
}
// GetNextExecutionTime returns the next execution time of expression as seen
// from fromTime. When expression cannot be parsed it returns the zero time
// and the parse error.
func (cep *CronExpressionParser) GetNextExecutionTime(expression string, fromTime time.Time) (time.Time, error) {
	schedule, err := cep.Parse(expression)
	if err == nil {
		return schedule.Next(fromTime), nil
	}
	return time.Time{}, err
}
// Validate reports whether expression can be parsed: it returns the parse
// error, or nil when parsing succeeds.
func (cep *CronExpressionParser) Validate(expression string) error {
	if _, err := cep.Parse(expression); err != nil {
		return err
	}
	return nil
}
5. 执行器实现
执行器负责实际执行任务,是任务调度系统的执行单元。
5.1 执行器设计
// Executor is the executor component.
type Executor struct {
	config       *SchedulerConfig // scheduler configuration
	jobStore     JobStore         // store used for execution records
	taskRegistry *TaskRegistry    // registered task handlers
	workerPool   *WorkerPool      // pool of worker goroutines
	metrics      *ExecutorMetrics // executor metrics
}
// TaskRegistry is the registry of task handlers.
type TaskRegistry struct {
	// tasks is used as a map[string]TaskHandler.
	tasks sync.Map
}
// TaskHandler is the interface implemented by task handlers.
type TaskHandler interface {
	// Execute runs job and returns a result string and an error.
	Execute(ctx context.Context, job *Job) (string, error)
}
// WorkerPool is a pool of worker goroutines.
type WorkerPool struct {
	workers  int                // number of workers
	jobQueue chan *JobExecution // executions waiting for a worker
	quit     chan struct{}      // closed to make the workers exit
	wg       sync.WaitGroup     // tracks the worker goroutines
	metrics  *WorkerPoolMetrics // worker pool metrics
}
// WorkerPoolMetrics holds the metrics of a worker pool.
type WorkerPoolMetrics struct {
	ActiveWorkers *prometheus.GaugeVec   // active workers
	QueuedJobs    *prometheus.GaugeVec   // queued jobs
	ProcessedJobs *prometheus.CounterVec // processed jobs
}
// NewExecutor creates an executor with a fresh task registry, a worker pool
// sized by config.MaxConcurrentTasks, and fresh executor metrics.
// The jobStore field is not set here.
func NewExecutor(config *SchedulerConfig) *Executor {
	registry := NewTaskRegistry()
	pool := NewWorkerPool(config.MaxConcurrentTasks)
	executorMetrics := NewExecutorMetrics()

	return &Executor{
		config:       config,
		taskRegistry: registry,
		workerPool:   pool,
		metrics:      executorMetrics,
	}
}
// NewTaskRegistry creates an empty task registry.
func NewTaskRegistry() *TaskRegistry {
	return new(TaskRegistry)
}
// NewWorkerPool creates a worker pool, builds its metrics, and starts the
// worker goroutines. The job queue is buffered with a capacity of 1000.
//
// A workers value below 1 is raised to 1. With zero workers (for example from
// a zero-value SchedulerConfig) nothing would ever read from the job queue,
// so submitted jobs would be accepted and then never executed.
func NewWorkerPool(workers int) *WorkerPool {
	if workers < 1 {
		workers = 1
	}

	wp := &WorkerPool{
		workers:  workers,
		jobQueue: make(chan *JobExecution, 1000),
		quit:     make(chan struct{}),
		metrics: &WorkerPoolMetrics{
			ActiveWorkers: prometheus.NewGaugeVec(
				prometheus.GaugeOpts{
					Name: "executor_active_workers",
					Help: "Number of active workers",
				},
				[]string{"executor_type"},
			),
			QueuedJobs: prometheus.NewGaugeVec(
				prometheus.GaugeOpts{
					Name: "executor_queued_jobs",
					Help: "Number of queued jobs",
				},
				[]string{"job_type"},
			),
			ProcessedJobs: prometheus.NewCounterVec(
				prometheus.CounterOpts{
					Name: "executor_processed_jobs_total",
					Help: "Total number of processed jobs",
				},
				[]string{"job_type", "result"},
			),
		},
	}

	// Start the worker goroutines.
	for i := 0; i < workers; i++ {
		wp.wg.Add(1)
		go wp.worker(i)
	}
	return wp
}
// RegisterTask stores handler under taskType, replacing any handler that was
// stored under the same type.
func (tr *TaskRegistry) RegisterTask(taskType string, handler TaskHandler) {
	tr.tasks.Store(taskType, handler)
}
// GetTask returns the handler stored under taskType. The boolean is false,
// and the handler nil, when nothing is registered for that type.
func (tr *TaskRegistry) GetTask(taskType string) (TaskHandler, bool) {
	if stored, ok := tr.tasks.Load(taskType); ok {
		return stored.(TaskHandler), true
	}
	return nil, false
}
// SubmitJob records a new execution for job and enqueues it on the worker
// pool without blocking.
//
// It returns an error when job is nil, when the executor has no job store,
// when the execution record cannot be created, or when the job queue is full.
//
// NOTE(review): when the queue is full the execution record has already been
// created with status "started" and is left as is.
func (e *Executor) SubmitJob(ctx context.Context, job *Job) error {
	if job == nil {
		return errors.New("job is nil")
	}
	// NewExecutor does not set jobStore; fail cleanly instead of panicking on
	// a nil interface call.
	if e.jobStore == nil {
		return errors.New("executor job store is not configured")
	}

	// Build the execution record.
	execution := &JobExecution{
		ID:         uuid.New().String(),
		JobID:      job.ID,
		StartTime:  time.Now(),
		Status:     ExecutionStatusStarted,
		RetryCount: 0,
	}

	// Persist the execution record.
	if err := e.jobStore.CreateExecution(ctx, execution); err != nil {
		return fmt.Errorf("failed to create execution record: %w", err)
	}

	// Hand the execution to the worker queue.
	select {
	case e.workerPool.jobQueue <- execution:
		// Workers decrement QueuedJobs{"task"} after each job, so the enqueue
		// side has to increment it; otherwise the gauge only ever goes down.
		e.workerPool.metrics.QueuedJobs.WithLabelValues("task").Inc()
		e.metrics.JobsSubmitted.WithLabelValues(job.TaskType).Inc()
		return nil
	default:
		return errors.New("executor job queue is full")
	}
}
// worker is the body of one worker goroutine. It takes executions from the
// job queue and runs them until the quit channel is closed.
func (wp *WorkerPool) worker(id int) {
	defer wp.wg.Done()

	for {
		select {
		case <-wp.quit:
			return
		case execution := <-wp.jobQueue:
			active := wp.metrics.ActiveWorkers.WithLabelValues("task_executor")

			// Count this worker as active while the job runs.
			active.Inc()
			wp.executeJob(execution)
			active.Dec()

			wp.metrics.QueuedJobs.WithLabelValues("task").Dec()
		}
	}
}
// executeJob is meant to execute the task described by execution.
// It is currently an empty placeholder and performs no work.
func (wp *WorkerPool) executeJob(execution *JobExecution) {
	// The actual executor instance should be obtained from the global
	// executor here.
	// Simplified for illustration; this is only an example.
}
// Stop signals all workers to exit and waits until they have returned.
//
// Calling Stop again after it has returned is a no-op; previously the second
// call panicked with "close of closed channel". Stop is not safe to call from
// several goroutines at the same time.
func (wp *WorkerPool) Stop() {
	select {
	case <-wp.quit:
		// Already closed by an earlier Stop.
	default:
		close(wp.quit)
	}
	wp.wg.Wait()
}
6. 服务注册与发现
在分布式环境中,服务注册与发现是实现高可用和负载均衡的关键。
6.1 服务注册中心
// ServiceRegistry is the service registry.
type ServiceRegistry struct {
	// services is used as a map[string]*ServiceInstance.
	services sync.Map

	mutex sync.RWMutex

	// ttl is the heartbeat age after which checkHealth marks an instance down.
	ttl time.Duration
}
// ServiceInstance describes one registered service instance.
type ServiceInstance struct {
	// Identity and location of the instance.
	ID      string `json:"id"`
	Name    string `json:"name"`
	Address string `json:"address"`
	Port    int    `json:"port"`

	// Free-form tags and metadata.
	Tags     []string          `json:"tags"`
	Metadata map[string]string `json:"metadata"`

	// Time of the last heartbeat and current status.
	Heartbeat time.Time     `json:"heartbeat"`
	Status    ServiceStatus `json:"status"`
}
// ServiceStatus is the status of a service instance.
type ServiceStatus string

// Service status values.
const (
	// ServiceStatusUp is the "up" status.
	ServiceStatusUp ServiceStatus = "up"
	// ServiceStatusDown is the "down" status.
	ServiceStatusDown ServiceStatus = "down"
)
// NewServiceRegistry creates a service registry with a default TTL of 30
// seconds and starts its health check in a background goroutine.
func NewServiceRegistry() *ServiceRegistry {
	registry := new(ServiceRegistry)
	registry.ttl = 30 * time.Second // default timeout: 30 seconds

	// Start the health check.
	go registry.startHealthCheck()
	return registry
}
// RegisterService stores instance in the registry under its ID.
//
// An empty ID is replaced with a new UUID, a nil Metadata map is replaced
// with an empty one, the heartbeat is set to the current time and the status
// to ServiceStatusUp. It always returns nil.
func (sr *ServiceRegistry) RegisterService(instance *ServiceInstance) error {
	sr.mutex.Lock()
	defer sr.mutex.Unlock()

	if instance.Metadata == nil {
		instance.Metadata = make(map[string]string)
	}
	if instance.ID == "" {
		instance.ID = uuid.New().String()
	}

	// A freshly registered instance counts as alive.
	instance.Status = ServiceStatusUp
	instance.Heartbeat = time.Now()

	sr.services.Store(instance.ID, instance)
	return nil
}
// UnregisterService removes the instance stored under instanceID.
// It always returns nil, also when no such instance is registered.
func (sr *ServiceRegistry) UnregisterService(instanceID string) error {
	sr.mutex.Lock()
	defer sr.mutex.Unlock()

	sr.services.Delete(instanceID)

	return nil
}
// Heartbeat records a heartbeat for the instance stored under instanceID: its
// heartbeat time is set to now and its status to ServiceStatusUp.
// It returns an error when no such instance is registered.
func (sr *ServiceRegistry) Heartbeat(instanceID string) error {
	sr.mutex.Lock()
	defer sr.mutex.Unlock()

	stored, found := sr.services.Load(instanceID)
	if !found {
		return fmt.Errorf("service instance %s not found", instanceID)
	}

	target := stored.(*ServiceInstance)
	target.Heartbeat, target.Status = time.Now(), ServiceStatusUp
	return nil
}
// GetServiceInstances returns the registered instances named serviceName
// whose status is ServiceStatusUp. The result is nil when none match.
func (sr *ServiceRegistry) GetServiceInstances(serviceName string) ([]*ServiceInstance, error) {
	sr.mutex.RLock()
	defer sr.mutex.RUnlock()

	var alive []*ServiceInstance
	sr.services.Range(func(_, v interface{}) bool {
		candidate := v.(*ServiceInstance)
		if candidate.Name != serviceName || candidate.Status != ServiceStatusUp {
			return true // skip: other service or not up
		}
		alive = append(alive, candidate)
		return true
	})
	return alive, nil
}
// GetServiceInstance returns the instance stored under instanceID, or an
// error when no such instance is registered.
func (sr *ServiceRegistry) GetServiceInstance(instanceID string) (*ServiceInstance, error) {
	sr.mutex.RLock()
	defer sr.mutex.RUnlock()

	if stored, found := sr.services.Load(instanceID); found {
		return stored.(*ServiceInstance), nil
	}
	return nil, fmt.Errorf("service instance %s not found", instanceID)
}
// startHealthCheck runs checkHealth every 10 seconds. The loop has no exit
// condition, so the function does not return.
func (sr *ServiceRegistry) startHealthCheck() {
	const checkInterval = 10 * time.Second

	ticker := time.NewTicker(checkInterval)
	defer ticker.Stop()

	for {
		<-ticker.C
		sr.checkHealth()
	}
}
// checkHealth marks every instance whose last heartbeat is older than the
// registry TTL as ServiceStatusDown.
func (sr *ServiceRegistry) checkHealth() {
	sr.mutex.Lock()
	defer sr.mutex.Unlock()

	now := time.Now()
	markExpired := func(_, v interface{}) bool {
		if inst := v.(*ServiceInstance); now.Sub(inst.Heartbeat) > sr.ttl {
			inst.Status = ServiceStatusDown
		}
		return true
	}
	sr.services.Range(markExpired)
}
7. 监控与指标
完善的监控体系是保障分布式任务调度系统稳定运行的重要手段。
7.1 监控指标
// SchedulerMetrics holds the scheduler metrics.
type SchedulerMetrics struct {
	JobsScheduled    *prometheus.CounterVec   // jobs scheduled
	SchedulingErrors *prometheus.CounterVec   // scheduling errors
	ScheduleLatency  *prometheus.HistogramVec // scheduling latency
}
// ExecutorMetrics holds the executor metrics.
type ExecutorMetrics struct {
	JobsSubmitted    *prometheus.CounterVec   // jobs submitted
	JobsCompleted    *prometheus.CounterVec   // jobs completed
	ExecutionErrors  *prometheus.CounterVec   // execution errors
	ExecutionLatency *prometheus.HistogramVec // execution latency
}
// NewSchedulerMetrics creates the scheduler metrics. The collectors are only
// constructed here; this function does not register them.
func NewSchedulerMetrics() *SchedulerMetrics {
	scheduled := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "scheduler_jobs_scheduled_total",
			Help: "Total number of jobs scheduled",
		},
		[]string{"job_type"},
	)
	schedulingErrors := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "scheduler_errors_total",
			Help: "Total number of scheduling errors",
		},
		[]string{"error_type"},
	)
	latency := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "scheduler_schedule_latency_seconds",
			Help:    "Scheduling latency in seconds",
			Buckets: prometheus.DefBuckets,
		},
		[]string{"job_type"},
	)

	return &SchedulerMetrics{
		JobsScheduled:    scheduled,
		SchedulingErrors: schedulingErrors,
		ScheduleLatency:  latency,
	}
}
// NewExecutorMetrics creates the executor metrics. The collectors are only
// constructed here; this function does not register them.
func NewExecutorMetrics() *ExecutorMetrics {
	submitted := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "executor_jobs_submitted_total",
			Help: "Total number of jobs submitted",
		},
		[]string{"job_type"},
	)
	completed := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "executor_jobs_completed_total",
			Help: "Total number of jobs completed",
		},
		[]string{"job_type", "result"},
	)
	executionErrors := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "executor_errors_total",
			Help: "Total number of execution errors",
		},
		[]string{"error_type"},
	)
	latency := prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "executor_execution_latency_seconds",
			Help:    "Execution latency in seconds",
			Buckets: prometheus.DefBuckets,
		},
		[]string{"job_type"},
	)

	return &ExecutorMetrics{
		JobsSubmitted:    submitted,
		JobsCompleted:    completed,
		ExecutionErrors:  executionErrors,
		ExecutionLatency: latency,
	}
}
// SystemMonitor is the system monitor.
type SystemMonitor struct {
	metrics *SystemMetrics // system metrics updated by the monitor
	ticker  *time.Ticker   // created by Start
	quit    chan struct{}  // closed by Stop
	wg      sync.WaitGroup // tracks the collection goroutine
}
// SystemMetrics holds the system-level metrics.
type SystemMetrics struct {
	// System metrics.
	CPUUsage    *prometheus.GaugeVec
	MemoryUsage *prometheus.GaugeVec
	Goroutines  *prometheus.GaugeVec

	// Job metrics.
	TotalJobs   *prometheus.GaugeVec
	RunningJobs *prometheus.GaugeVec
	FailedJobs  *prometheus.GaugeVec
	SuccessRate *prometheus.GaugeVec

	// Executor metrics.
	ActiveExecutors *prometheus.GaugeVec
	ExecutorQueue   *prometheus.GaugeVec
}
// NewSystemMonitor creates a system monitor with all of its gauges and an
// open quit channel. No ticker is created here; Start does that.
func NewSystemMonitor() *SystemMonitor {
	// gauge builds a gauge vector with a single label.
	gauge := func(name, help, label string) *prometheus.GaugeVec {
		return prometheus.NewGaugeVec(
			prometheus.GaugeOpts{Name: name, Help: help},
			[]string{label},
		)
	}

	metrics := &SystemMetrics{
		CPUUsage:        gauge("system_cpu_usage_percent", "System CPU usage percentage", "component"),
		MemoryUsage:     gauge("system_memory_usage_bytes", "System memory usage in bytes", "component"),
		Goroutines:      gauge("system_goroutines", "Number of goroutines", "component"),
		TotalJobs:       gauge("system_total_jobs", "Total number of jobs", "job_status"),
		RunningJobs:     gauge("system_running_jobs", "Number of running jobs", "job_type"),
		FailedJobs:      gauge("system_failed_jobs", "Number of failed jobs", "job_type"),
		SuccessRate:     gauge("system_success_rate", "Job success rate", "job_type"),
		ActiveExecutors: gauge("system_active_executors", "Number of active executors", "executor_type"),
		ExecutorQueue:   gauge("system_executor_queue_size", "Executor queue size", "queue_type"),
	}

	return &SystemMonitor{
		metrics: metrics,
		quit:    make(chan struct{}),
	}
}
// Start begins monitoring: it creates a ticker with the given interval and
// launches the metrics collection goroutine.
func (sm *SystemMonitor) Start(interval time.Duration) {
	collectTicker := time.NewTicker(interval)
	sm.ticker = collectTicker

	sm.wg.Add(1)
	go sm.collectMetrics()
}
// collectMetrics is the collection loop. It calls collectSystemMetrics on
// every ticker tick and returns once the quit channel is closed.
func (sm *SystemMonitor) collectMetrics() {
	defer sm.wg.Done()

	for {
		select {
		case <-sm.quit:
			return
		case <-sm.ticker.C:
			sm.collectSystemMetrics()
		}
	}
}
// collectSystemMetrics updates the CPU, memory and goroutine gauges, all
// under the "scheduler" label value.
func (sm *SystemMonitor) collectSystemMetrics() {
	const component = "scheduler"

	// CPU usage: simplified to a constant 0. A real implementation needs
	// system calls to obtain the actual CPU usage.
	sm.metrics.CPUUsage.WithLabelValues(component).Set(0.0)

	// Memory usage: currently allocated bytes reported by the runtime.
	var stats runtime.MemStats
	runtime.ReadMemStats(&stats)
	allocated := float64(stats.Alloc)
	sm.metrics.MemoryUsage.WithLabelValues(component).Set(allocated)

	// Number of goroutines.
	goroutines := float64(runtime.NumGoroutine())
	sm.metrics.Goroutines.WithLabelValues(component).Set(goroutines)
}
// Stop signals the collection goroutine to exit, stops the ticker if Start
// created one, and waits for the goroutine to finish.
//
// Calling Stop again after it has returned is a no-op; previously the second
// call panicked with "close of closed channel". Stop is not safe to call from
// several goroutines at the same time.
func (sm *SystemMonitor) Stop() {
	select {
	case <-sm.quit:
		// Already closed by an earlier Stop.
	default:
		close(sm.quit)
	}
	if sm.ticker != nil {
		sm.ticker.Stop()
	}
	sm.wg.Wait()
}
8. 总结
分布式任务调度系统的架构设计是一个复杂的工程问题,需要考虑多个方面的因素:
- 系统架构:合理的架构设计包括任务存储、调度器、执行器、服务注册中心等核心组件
- 数据存储:可靠的任务和执行记录存储机制是系统稳定运行的基础
- 调度机制:高效的调度算法和Cron表达式解析能力确保任务按时执行
- 执行管理:通过工作协程池和任务注册机制实现任务的并发执行
- 服务发现:在分布式环境中实现服务的自动注册和发现
- 监控告警:完善的监控体系帮助及时发现和解决问题
在实际应用中,需要根据具体的业务场景和性能要求选择合适的技术方案,并持续优化以达到最佳效果。通过合理的设计和实现,我们可以构建出高性能、高可用、易扩展的分布式任务调度系统,为企业的各种定时和异步任务提供可靠的执行保障。
下一章我们将深入探讨分布式任务调度系统中的复杂调度策略和任务编排机制。