LangChainGo 智能体实战:构建一个智能日志分析告警AI Agent🦜️

0 阅读9分钟

构建智能日志分析器.png

前言

本篇文章给各位介绍如何使用 LangChainGo 库构建一个智能日志分析器智能体,通过给 AI 合适的提示词实现智能分析日志的问题,然后实现告警的效果。

实现的效果

一个命令行工具:

  • 解析各种格式的日志文件(JSON、结构化文本等)
  • 识别错误模式和异常情况
  • 总结日志活动和趋势
  • 基于检测到的问题建议执行动作
  • 为严重问题生成告警

环境准备

  • Go 1.21+
  • LLM API Key(OpenAI、Anthropic)
  • 需要分析的样例日志文件

开始

首先我们使用 Goland 新建一个工程,然后导入以下依赖:

go get github.com/tmc/langchaingo
go get github.com/sirupsen/logrus

结构图

日志相关结构体

package main

import (
    "fmt"
    "time"
)

// LogEntry is one parsed log line.
type LogEntry struct {
    // Timestamp is the time attached to the entry; it is always serialized.
    Timestamp time.Time `json:"timestamp"`
    // Level is the severity label taken from, or inferred for, the line.
    Level     string    `json:"level,omitempty"`
    // Message is the human-readable part of the line.
    Message   string    `json:"message,omitempty"`
    // Source is filled from the "source" key of JSON log lines.
    Source    string    `json:"source,omitempty"`
    // Raw is the original, unmodified line text.
    Raw       string    `json:"raw,omitempty"`
}

// LogAnalysis is the result of analyzing a batch of log entries.
type LogAnalysis struct {
    // TotalEntries is the number of entries that were analyzed.
    TotalEntries    int            `json:"total_entries"`
    // ErrorCount counts entries whose level is ERROR or FATAL.
    ErrorCount      int            `json:"error_count"`
    // WarningCount counts entries whose level is WARN or WARNING.
    WarningCount    int            `json:"warning_count"`
    // TopErrors holds the most frequent normalized error messages.
    TopErrors       []ErrorPattern `json:"top_errors"`
    // TimeRange spans the timestamps of the first and last entry.
    TimeRange       TimeRange      `json:"time_range"`
    // Recommendations is decoded from the model's JSON response.
    Recommendations []string       `json:"recommendations"`
    // Anomalies is decoded from the model's JSON response.
    Anomalies       []Anomaly      `json:"anomalies"`
}

// ErrorPattern groups error messages that share the same normalized text.
type ErrorPattern struct {
    // Pattern is the normalized message used as the grouping key.
    Pattern string `json:"pattern"`
    // Count is how many messages normalized to Pattern.
    Count   int    `json:"count"`
    // Example is one original message that produced Pattern.
    Example string `json:"example"`
}

// TimeRange is the time span covered by a set of log entries.
type TimeRange struct {
    // Start is the timestamp of the first entry.
    Start time.Time `json:"start"`
    // End is the timestamp of the last entry.
    End   time.Time `json:"end"`
}

// Anomaly is one unusual finding reported by the model.
// The value sets mentioned below are what the analysis prompt asks for; the
// model's output is not validated against them.
type Anomaly struct {
    // Type is the category, e.g. error_spike, performance, security or other.
    Type        string   `json:"type"`
    // Description explains what was detected.
    Description string   `json:"description"`
    // Severity is requested as critical, high, medium or low.
    Severity    string   `json:"severity"`
    // Examples lists log entries illustrating the anomaly.
    Examples    []string `json:"examples"`
}

以上几个结构体是用来保存日志分析结果的基本信息。

分析器Analyzer

Analyzer 结构体是我们这个智能体的核心,它包含了一个 llms.Model 类型的字段,用于调用 langchaingo 库的能力。

package main

import (
    "bufio"
    "context"
    "encoding/json"
    "fmt"
    "os"
    "regexp"
    "sort"
    "strings"
    "time"

    "github.com/tmc/langchaingo/llms"
    "github.com/tmc/langchaingo/llms/openai"
    "github.com/tmc/langchaingo/prompts"
)

// LogAnalyzer parses log files and analyzes their entries with the help of
// a language model.
type LogAnalyzer struct {
    // llm is the model client used for the AI part of the analysis.
    llm llms.Model
}

// NewLogAnalyzer creates a LogAnalyzer whose model client is configured from
// config.LlmConfig (endpoint, API key and model name).
// It returns an error when the client cannot be constructed.
func NewLogAnalyzer(config *AppConfig) (*LogAnalyzer, error) {
	clientOptions := []openai.Option{
		openai.WithBaseURL(config.LlmConfig.EndPoint),
		openai.WithToken(config.LlmConfig.ApiKey),
		openai.WithModel(config.LlmConfig.Model),
	}

	client, err := openai.New(clientOptions...)
	if err != nil {
		return nil, fmt.Errorf("failed to create LLM: %w", err)
	}

	analyzer := &LogAnalyzer{llm: client}
	return analyzer, nil
}

大模型接口的地址、密钥、模型名称都是从配置文件中读取的,这样便于修改配置。

解析日志文件

Analyzer 组件会根据文件名来解析日志文件,然后返回 []LogEntry,这个方法主要是根据不同的日志类型来解析日志文件。

// ParseLogFile reads filename line by line and turns every non-blank line
// into a LogEntry.
//
// Each line is tried, in order, as:
//  1. a JSON object (lines whose first non-space character is '{'),
//  2. one of the structured text layouts below,
//  3. unstructured text, stamped with the current time and a level inferred
//     from keywords in the line.
//
// It returns the entries parsed so far together with any error reported by
// the scanner, or an error if the file cannot be opened.
func (la *LogAnalyzer) ParseLogFile(filename string) ([]LogEntry, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, fmt.Errorf("opening file: %w", err)
	}
	defer file.Close()

	// Structured text layouts, tried in order. The literal '[' and ']' must be
	// escaped: unescaped they form a character class, which makes the standard
	// layout never match and makes the access-log layout fail to compile.
	patterns := []*regexp.Regexp{
		// Standard format: 2023-01-01 12:00:00 [ERROR] message
		regexp.MustCompile(`^(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+\[(\w+)\]\s+(.+)$`),
		// Nginx/Apache format. parseStructuredLog only understands the
		// standard timestamp layout, so these matches currently end up in
		// the unstructured fallback below.
		regexp.MustCompile(`^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}).*\[([^\]]+)\].*"([^"]*)".*(\d{3})`),
	}

	var entries []LogEntry
	scanner := bufio.NewScanner(file)
	// Allow lines of up to 1 MiB; the default limit of 64 KiB is easily
	// exceeded by JSON logs or stack traces and would abort the whole scan.
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	for scanner.Scan() {
		line := scanner.Text()
		trimmed := strings.TrimSpace(line)
		if trimmed == "" {
			continue
		}

		// Try JSON first.
		if trimmed[0] == '{' {
			var jsonEntry map[string]any
			if err := json.Unmarshal([]byte(line), &jsonEntry); err == nil {
				entries = append(entries, parseJSONLog(jsonEntry, line))
				continue
			}
		}

		// Try the structured layouts.
		entry := LogEntry{Raw: line}
		for _, pattern := range patterns {
			if matches := pattern.FindStringSubmatch(line); matches != nil {
				entry = parseStructuredLog(matches, line)
				break
			}
		}

		// Fallback: no timestamp could be extracted, treat as unstructured.
		if entry.Timestamp.IsZero() {
			entry = LogEntry{
				Timestamp: time.Now(), // use the current time as a stand-in
				Level:     inferLogLevel(line),
				Message:   line,
				Raw:       line,
			}
		}

		entries = append(entries, entry)
	}

	return entries, scanner.Err()
}

假如是以 { 开头的,那么就被判定为是 JSON 类型的日志:

// parseJSONLog builds a LogEntry from a decoded JSON log line.
//
// The keys "timestamp", "level", "message" and "source" are copied when they
// hold string values; anything else is ignored. The timestamp must be in
// RFC 3339 form, otherwise Timestamp stays at its zero value. raw is stored
// unchanged in the Raw field.
func parseJSONLog(data map[string]any, raw string) LogEntry {
	// text returns the string stored under key, if there is one.
	text := func(key string) (string, bool) {
		value, isString := data[key].(string)
		return value, isString
	}

	result := LogEntry{Raw: raw}

	if stamp, found := text("timestamp"); found {
		parsed, err := time.Parse(time.RFC3339, stamp)
		if err == nil {
			result.Timestamp = parsed
		}
	}
	if value, found := text("level"); found {
		result.Level = value
	}
	if value, found := text("message"); found {
		result.Message = value
	}
	if value, found := text("source"); found {
		result.Source = value
	}

	return result
}

否则使用正则表达式来匹配结构化日志,然后返回一个 LogEntry 类型的结构体。pattern.FindStringSubmatch(line) 方法返回的是一个 []string 切片:以标准格式的正则表达式为例,它有 3 个捕获组,那么返回的切片长度就是 4,第 0 个元素是整个匹配到的文本,各捕获组的内容从第 1 个元素开始获取即可。

// parseStructuredLog converts the submatches of a structured-text pattern
// into a LogEntry.
//
// matches is expected to hold the full match followed by the timestamp,
// level and message groups. With fewer than four elements only Raw is set.
// The timestamp must use the "2006-01-02 15:04:05" layout; otherwise
// Timestamp stays at its zero value while level and message are still set.
func parseStructuredLog(matches []string, raw string) LogEntry {
	if len(matches) < 4 {
		return LogEntry{Raw: raw}
	}

	const layout = "2006-01-02 15:04:05"

	parsed := LogEntry{
		Raw:     raw,
		Level:   matches[2],
		Message: matches[3],
	}
	if stamp, err := time.Parse(layout, matches[1]); err == nil {
		parsed.Timestamp = stamp
	}
	return parsed
}

最后还有一个兜底的操作,假如 entry.Timestamp.IsZero() 为 true(即前面的解析都没有得到时间戳),那么就把该行当作非结构化日志处理:以当前时间作为时间戳,并通过判断字符串是否包含关键词来推断日志的等级。

// inferLogLevel guesses a log level from keywords contained in line.
//
// The check is case-insensitive and the first matching rule wins:
// "error" or "fatal" give ERROR, "warn" gives WARN, "debug" gives DEBUG.
// Lines without any of these keywords are reported as INFO.
func inferLogLevel(line string) string {
	rules := []struct {
		keyword string
		level   string
	}{
		{"error", "ERROR"},
		{"fatal", "ERROR"},
		{"warn", "WARN"},
		{"debug", "DEBUG"},
	}

	text := strings.ToLower(line)
	for _, rule := range rules {
		if strings.Contains(text, rule.keyword) {
			return rule.level
		}
	}
	return "INFO"
}

分析日志文件

分析日志文件是整个应用的核心业务逻辑,因为这个流程里面包含了与 AI 大模型交互的步骤。AnalyzeLogs 方法接收一个 []LogEntry 参数,最后返回分析的结果也就是 LogAnalysis 类型的结构体。

// AnalyzeLogs summarizes entries and enriches the summary with the model's
// findings.
//
// It counts ERROR/FATAL and WARN/WARNING entries (case-insensitively), takes
// the time range from the first and last entry, derives the most frequent
// error patterns and then asks the model for anomalies and recommendations.
// An empty input yields an empty analysis without calling the model.
// It returns an error when the AI analysis fails.
func (la *LogAnalyzer) AnalyzeLogs(entries []LogEntry) (*LogAnalysis, error) {
	total := len(entries)
	if total == 0 {
		return &LogAnalysis{}, nil
	}

	// Basic statistics. The range relies on the entries being in file order.
	result := new(LogAnalysis)
	result.TotalEntries = total
	result.TimeRange.Start = entries[0].Timestamp
	result.TimeRange.End = entries[total-1].Timestamp

	// Tally entries per level and collect the error messages.
	failures := []string{}
	for i := range entries {
		level := strings.ToUpper(entries[i].Level)
		if level == "ERROR" || level == "FATAL" {
			result.ErrorCount++
			failures = append(failures, entries[i].Message)
		} else if level == "WARN" || level == "WARNING" {
			result.WarningCount++
		}
	}

	result.TopErrors = findErrorPatterns(failures)

	// Let the model look for anything the counters cannot show.
	if err := la.performAIAnalysis(entries, result); err != nil {
		return nil, fmt.Errorf("AI analysis failed: %w", err)
	}
	return result, nil
}

但是在调用大模型的能力之前,我们会首先使用 findErrorPatterns 函数统计各错误模式(ErrorPattern)出现的次数,按 Count 属性降序排序后,取前 10 个(Top10)作为结果切片。

// findErrorPatterns groups messages by their normalized form and returns the
// most frequent groups, at most ten, ordered by descending count.
//
// Patterns with the same count keep the order in which they first appeared
// in messages, so the result is deterministic for a given input. Each
// pattern carries the first message that produced it as its example.
// It returns nil when messages is empty.
func findErrorPatterns(messages []string) []ErrorPattern {
	const maxPatterns = 10

	counts := make(map[string]int)
	examples := make(map[string]string)
	var order []string // distinct patterns in first-seen order

	for _, msg := range messages {
		// Normalize error messages by removing specific values.
		pattern := normalizeErrorMessage(msg)
		if _, seen := counts[pattern]; !seen {
			order = append(order, pattern)
			examples[pattern] = msg
		}
		counts[pattern]++
	}

	// A stable sort over the first-seen order breaks ties deterministically;
	// sorting freshly collected map keys would not.
	sort.SliceStable(order, func(i, j int) bool {
		return counts[order[i]] > counts[order[j]]
	})
	if len(order) > maxPatterns {
		order = order[:maxPatterns]
	}

	var result []ErrorPattern
	for _, pattern := range order {
		result = append(result, ErrorPattern{
			Pattern: pattern,
			Count:   counts[pattern],
			Example: examples[pattern],
		})
	}
	return result
}

normalizeErrorMessage 函数主要是将日志消息里面的数字替换为 XXX,实际的 UUID 替换为字符串 "UUID",实际的邮箱替换为 EMAIL

// Patterns used by normalizeErrorMessage, compiled once instead of on every
// call.
var (
	normalizeUUIDPattern   = regexp.MustCompile(`[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}`)
	normalizeEmailPattern  = regexp.MustCompile(`\b\w+@\w+\.\w+\b`)
	normalizeDigitsPattern = regexp.MustCompile(`\d+`)
)

// normalizeErrorMessage replaces the variable parts of msg with placeholders
// so that messages differing only in those values compare equal: UUIDs
// become "UUID", e-mail addresses become "EMAIL" and remaining digit runs
// become "XXX".
func normalizeErrorMessage(msg string) string {
	// UUIDs and e-mail addresses go first: replacing digits beforehand would
	// rewrite the digits inside a UUID and stop the UUID pattern from matching.
	normalized := normalizeUUIDPattern.ReplaceAllString(msg, "UUID")
	normalized = normalizeEmailPattern.ReplaceAllString(normalized, "EMAIL")
	return normalizeDigitsPattern.ReplaceAllString(normalized, "XXX")
}

performAIAnalysis 方法是最核心的逻辑,这里面使用了 AI 大模型的能力:

// performAIAnalysis asks the model to look for anomalies and to suggest
// recommendations, and stores both in analysis.
//
// The prompt contains the counters and top error patterns already present in
// analysis plus a sample of the last entries (at most 50). The model is asked
// to answer in JSON, which is decoded into analysis.Anomalies and
// analysis.Recommendations.
//
// It returns an error when the prompt cannot be rendered, the model call
// fails, the model returns no choices, or the answer is not valid JSON.
func (la *LogAnalyzer) performAIAnalysis(entries []LogEntry, analysis *LogAnalysis) error {
	// Prepare sample of entries for AI analysis
	const maxSample = 50
	sampleSize := maxSample
	if len(entries) < sampleSize {
		sampleSize = len(entries)
	}

	sample := entries[len(entries)-sampleSize:] // Last N entries

	template := prompts.NewPromptTemplate(`
You are an expert system administrator analyzing application logs. Based on the log data provided, identify:

1. **Anomalies**: Unusual patterns, spikes, or unexpected behaviors
2. **Recommendations**: Specific actions to improve system reliability
3. **Critical Issues**: Problems requiring immediate attention

Log Summary:
- Total Entries: {{.total_entries}}
- Errors: {{.error_count}}
- Warnings: {{.warning_count}}
- Time Range: {{.time_range}}

Top Error Patterns:
{{range .top_errors}}
- {{.Pattern}} ({{.Count}} occurrences)
{{end}}

Recent Log Sample:
{{range .sample}}
{{.timestamp}} [{{.level}}] {{.message}}
{{end}}

Respond in JSON format:
{
  "anomalies": [
    {
      "type": "error_spike|performance|security|other",
      "description": "What was detected",
      "severity": "critical|high|medium|low",
      "examples": ["example log entries"]
    }
  ],
  "recommendations": [
    "Specific actionable recommendations"
  ]
}`, []string{"total_entries", "error_count", "warning_count", "time_range", "top_errors", "sample"})

	// The template reads the sample through lower-case map keys.
	sampleData := make([]map[string]string, len(sample))
	for i, entry := range sample {
		sampleData[i] = map[string]string{
			"timestamp": entry.Timestamp.Format(time.RFC3339),
			"level":     entry.Level,
			"message":   entry.Message,
		}
	}

	prompt, err := template.Format(map[string]any{
		"total_entries": analysis.TotalEntries,
		"error_count":   analysis.ErrorCount,
		"warning_count": analysis.WarningCount,
		"time_range":    fmt.Sprintf("%s to %s", analysis.TimeRange.Start.Format(time.RFC3339), analysis.TimeRange.End.Format(time.RFC3339)),
		"top_errors":    analysis.TopErrors,
		"sample":        sampleData,
	})
	if err != nil {
		return fmt.Errorf("formatting prompt: %w", err)
	}

	ctx := context.Background()
	response, err := la.llm.GenerateContent(ctx, []llms.MessageContent{
		llms.TextParts(llms.ChatMessageTypeHuman, prompt),
	}, llms.WithJSONMode())
	if err != nil {
		return fmt.Errorf("generating analysis: %w", err)
	}
	// Guard the index below: an answer without choices would otherwise panic.
	if response == nil || len(response.Choices) == 0 {
		return fmt.Errorf("generating analysis: model returned no choices")
	}

	var aiResult struct {
		Anomalies       []Anomaly `json:"anomalies"`
		Recommendations []string  `json:"recommendations"`
	}

	if err := json.Unmarshal([]byte(response.Choices[0].Content), &aiResult); err != nil {
		return fmt.Errorf("parsing AI response: %w", err)
	}

	analysis.Anomalies = aiResult.Anomalies
	analysis.Recommendations = aiResult.Recommendations

	return nil
}

这一块的核心就是使用了 Go Template 语法的提示词模板,给 AI 提供一段需要检视的日志内容,然后让 AI 输出 JSON 格式的数据。最后再返回给后端解析 JSON 为结构体。

打印分析报告

PrintReport 方法主要是用于在控制台上面打印分析结果的。它会打印出 Top5的错误、所有的异常信息还有建议。

// PrintReport writes a human-readable report to standard output.
//
// The report has a summary, followed by up to five top error patterns, all
// detected anomalies and all recommendations. Sections without content are
// left out.
func (a *LogAnalysis) PrintReport() {
	const stamp = "2006-01-02 15:04:05"

	fmt.Printf("📊 Log Analysis Report\n")
	fmt.Printf("=====================\n\n")

	fmt.Printf("📈 Summary:\n")
	fmt.Printf("  Total Entries: %d\n", a.TotalEntries)
	fmt.Printf("  Errors: %d\n", a.ErrorCount)
	fmt.Printf("  Warnings: %d\n", a.WarningCount)
	fmt.Printf("  Time Range: %s to %s\n\n",
		a.TimeRange.Start.Format(stamp),
		a.TimeRange.End.Format(stamp))

	if top := a.TopErrors; len(top) > 0 {
		// Only the five most frequent patterns are shown.
		if len(top) > 5 {
			top = top[:5]
		}
		fmt.Printf("🔴 Top Error Patterns:\n")
		for idx := range top {
			fmt.Printf("  %d. %s (%d occurrences)\n", idx+1, top[idx].Pattern, top[idx].Count)
		}
		fmt.Println()
	}

	if len(a.Anomalies) > 0 {
		fmt.Printf("⚠️  Detected Anomalies:\n")
		for idx := range a.Anomalies {
			item := a.Anomalies[idx]
			fmt.Printf("  %s - %s (%s)\n", item.Type, item.Description, item.Severity)
		}
		fmt.Println()
	}

	if len(a.Recommendations) > 0 {
		fmt.Printf("💡 Recommendations:\n")
		for idx, advice := range a.Recommendations {
			fmt.Printf("  %d. %s\n", idx+1, advice)
		}
		fmt.Println()
	}
}

analyzeFile辅助函数

analyzeFile 函数整合了解析日志文件、分析日志条目、打印报告、输出报告内容到本地文件等流程。

// analyzeFile runs the whole pipeline for one log file: parse it, analyze
// the entries, print the report and, when outputFile is not empty, save the
// analysis as indented JSON to outputFile.
//
// It returns the first error encountered, wrapped with the failing step.
func analyzeFile(analyzer *LogAnalyzer, filename, outputFile string) error {
	fmt.Printf("🔍 Analyzing %s...\n", filename)

	entries, parseErr := analyzer.ParseLogFile(filename)
	if parseErr != nil {
		return fmt.Errorf("parsing log file: %w", parseErr)
	}

	report, analyzeErr := analyzer.AnalyzeLogs(entries)
	if analyzeErr != nil {
		return fmt.Errorf("analyzing logs: %w", analyzeErr)
	}

	report.PrintReport()

	// Without an output file the console report is all there is to do.
	if outputFile == "" {
		return nil
	}

	payload, err := json.MarshalIndent(report, "", "  ")
	if err != nil {
		return fmt.Errorf("marshaling report: %w", err)
	}

	err = os.WriteFile(outputFile, payload, 0644)
	if err != nil {
		return fmt.Errorf("writing report: %w", err)
	}

	fmt.Printf("📄 Report saved to %s\n", outputFile)
	return nil
}

main函数

最后,我们来实现一下 main 函数。main 函数主要完成以下操作:定义命令行参数、加载配置文件、创建日志分析器、分析日志文件。同时,假如命令行参数 watch 为 true,那么这个程序不会退出,而是每隔 30s 重新扫描并分析一次日志文件。

// main parses the command-line flags, loads the configuration, creates the
// analyzer and analyzes the log file once, or every 30 seconds when -watch
// is set.
func main() {
	logPath := flag.String("file", "sample.log", "Log file to analyze")
	reportPath := flag.String("output", "report.json", "Output file for JSON report")
	watchMode := flag.Bool("watch", false, "Watch file for changes")
	flag.Parse()

	if *logPath == "" {
		fmt.Println("Usage: log-analyzer -file=application.log")
		os.Exit(1)
	}

	config, err := LoadConfig("config-example.yml")
	if err != nil {
		log.Fatal(err)
	}
	analyzer, err := NewLogAnalyzer(config)
	if err != nil {
		log.Fatal(err)
	}

	// One-shot mode: a failure ends the program.
	if !*watchMode {
		if err := analyzeFile(analyzer, *logPath, *reportPath); err != nil {
			log.Fatal(err)
		}
		return
	}

	// Watch mode - simplified version: re-analyze on a fixed interval and
	// keep going after a failed run.
	fmt.Printf("👀 Watching %s for changes...\n", *logPath)
	for {
		if err := analyzeFile(analyzer, *logPath, *reportPath); err != nil {
			log.Printf("Analysis error: %v", err)
		}
		time.Sleep(30 * time.Second)
	}
}

创建sample.log

在这里我们创建一个 sample.log 文件,里面存放一些少量简单的日志信息供测试用:

2024-01-15 10:30:01 [INFO] Application started successfully
2024-01-15 10:30:02 [INFO] Database connection established
2024-01-15 10:30:15 [ERROR] Failed to process user request: invalid email format user@
2024-01-15 10:30:16 [WARN] High memory usage detected: 85%
2024-01-15 10:30:17 [ERROR] Database timeout after 30s
2024-01-15 10:30:18 [ERROR] Failed to process user request: invalid email format admin@
2024-01-15 10:30:19 [INFO] Request processed successfully
2024-01-15 10:30:25 [ERROR] Database timeout after 30s
2024-01-15 10:30:30 [FATAL] Out of memory error - application terminating
2024-01-15 10:30:31 [INFO] Application shutdown initiated

运行效果

输出结果如下:

image.png

report.json 文件中也有了相应的内容:

image.png

扩展功能

持续监控

如果你想要持续性监控,那么你可以考虑使用 fsnotify 库,创建 monitor.go 文件如下:

package main

import (
    "context"
    "fmt"
    "log"
    "strings"
    "time"

    "github.com/fsnotify/fsnotify"
    "github.com/tmc/langchaingo/chains"
    "github.com/tmc/langchaingo/llms"
    "github.com/tmc/langchaingo/prompts"
)

// LogMonitor watches a log file and raises alerts for critical issues.
type LogMonitor struct {
    // analyzer parses and analyzes the watched file.
    analyzer    *LogAnalyzer
    // watcher delivers file-system events for the watched file.
    watcher     *fsnotify.Watcher
    // alertChain turns an analysis into a short alert message.
    alertChain  chains.Chain
    // thresholds decides when the recent entries warrant an alert.
    thresholds  MonitoringThresholds
}

// MonitoringThresholds holds the limits that trigger an alert.
type MonitoringThresholds struct {
    // ErrorsPerMinute is the number of ERROR/FATAL entries among the recent
    // entries at which an alert is raised.
    ErrorsPerMinute   int
    // CriticalKeywords are lower-case substrings; a message containing any
    // of them raises an alert immediately.
    CriticalKeywords  []string
    // ResponseTimeLimit is set by NewLogMonitor but not read by any code
    // shown here.
    ResponseTimeLimit time.Duration
}

// NewLogMonitor creates a LogMonitor that uses analyzer for parsing and
// analysis and the analyzer's model for wording alerts.
//
// The monitor starts with default thresholds: 10 errors per minute, the
// keywords "fatal", "out of memory" and "database down", and a response
// time limit of five seconds. It returns an error when the file watcher
// cannot be created.
func NewLogMonitor(analyzer *LogAnalyzer) (*LogMonitor, error) {
	fsWatcher, err := fsnotify.NewWatcher()
	if err != nil {
		return nil, err
	}

	// Prompt used by the alert chain for notifications.
	alertPrompt := prompts.NewPromptTemplate(`
Generate a concise alert message for this log analysis:

{{.analysis}}

Format as: [SEVERITY] Brief description - Action needed
Keep under 140 characters.`, []string{"analysis"})

	defaults := MonitoringThresholds{
		ErrorsPerMinute:   10,
		CriticalKeywords:  []string{"fatal", "out of memory", "database down"},
		ResponseTimeLimit: 5 * time.Second,
	}

	monitor := &LogMonitor{
		analyzer:   analyzer,
		watcher:    fsWatcher,
		alertChain: chains.NewLLMChain(analyzer.llm, alertPrompt),
		thresholds: defaults,
	}
	return monitor, nil
}

// Start watches filename and checks for alerts after every write to it.
//
// It blocks until one of the watcher's channels is closed, in which case it
// returns nil. Watcher errors are logged and do not stop the loop. It
// returns an error when filename cannot be added to the watcher.
func (lm *LogMonitor) Start(filename string) error {
	if err := lm.watcher.Add(filename); err != nil {
		return err
	}

	fmt.Printf("🚨 Monitoring %s for critical issues...\n", filename)

	for {
		select {
		case watchErr, open := <-lm.watcher.Errors:
			if !open {
				return nil
			}
			log.Printf("Watcher error: %v", watchErr)

		case ev, open := <-lm.watcher.Events:
			if !open {
				return nil
			}
			// Each write starts its own check in a new goroutine.
			if ev.Op&fsnotify.Write != 0 {
				go lm.checkForAlerts(filename)
			}
		}
	}
}

// checkForAlerts re-parses filename and, when the entries of the last minute
// cross the alert thresholds, analyzes them and prints an alert message
// generated by the alert chain. Failures are logged, not returned.
func (lm *LogMonitor) checkForAlerts(filename string) {
	// The whole file is parsed again on every call.
	all, parseErr := lm.analyzer.ParseLogFile(filename)
	if parseErr != nil {
		log.Printf("Error parsing file: %v", parseErr)
		return
	}

	// Only the entries of the last minute are considered.
	lastMinute := lm.getRecentEntries(all, time.Minute)
	if !lm.shouldAlert(lastMinute) {
		return
	}

	report, analyzeErr := lm.analyzer.AnalyzeLogs(lastMinute)
	if analyzeErr != nil {
		log.Printf("Error analyzing logs: %v", analyzeErr)
		return
	}

	summary := fmt.Sprintf("Analysis: %+v", report)
	message, runErr := chains.Run(context.Background(), lm.alertChain, summary)
	if runErr != nil {
		log.Printf("Error generating alert: %v", runErr)
		return
	}

	fmt.Printf("🚨 ALERT: %s\n", message)
	// Here you would send to Slack, email, etc.
}

// getRecentEntries returns the trailing run of entries whose timestamps are
// not older than duration, in their original order.
//
// The scan starts at the end and stops at the first entry older than the
// cutoff, so newer entries located before such an entry are not included.
// The result is a copy and does not share storage with entries. It returns
// nil when no entry qualifies.
func (lm *LogMonitor) getRecentEntries(entries []LogEntry, duration time.Duration) []LogEntry {
	cutoff := time.Now().Add(-duration)

	// Walk back to the start of the recent suffix.
	start := len(entries)
	for start > 0 && !entries[start-1].Timestamp.Before(cutoff) {
		start--
	}
	if start == len(entries) {
		return nil
	}

	// One copy of the suffix instead of prepending element by element.
	recent := make([]LogEntry, len(entries)-start)
	copy(recent, entries[start:])
	return recent
}

// shouldAlert reports whether entries warrant an alert.
//
// It returns true as soon as a message contains one of the configured
// critical keywords (compared against the lower-cased message), or when the
// number of ERROR/FATAL entries reaches thresholds.ErrorsPerMinute.
func (lm *LogMonitor) shouldAlert(entries []LogEntry) bool {
	errorCount := 0
	for _, entry := range entries {
		// Compare levels case-insensitively, as AnalyzeLogs does; JSON logs
		// frequently carry lower-case levels.
		switch strings.ToUpper(entry.Level) {
		case "ERROR", "FATAL":
			errorCount++
		}

		// Check for critical keywords.
		message := strings.ToLower(entry.Message)
		for _, keyword := range lm.thresholds.CriticalKeywords {
			if strings.Contains(message, keyword) {
				return true
			}
		}
	}

	return errorCount >= lm.thresholds.ErrorsPerMinute
}

可观测性系统集成

异常日志分析还可以集成到第三方可观测系统,比如 Prometheus,然后通过 Slack 推送。

创建 integration.go 文件:

package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "net/http"
)

// SlackAlert is the JSON body posted to a Slack webhook.
type SlackAlert struct {
    // Text is the message text, serialized under the "text" key.
    Text string `json:"text"`
}

// sendSlackAlert posts message, prefixed with "Log Alert: ", as JSON to
// webhookURL.
//
// It returns an error when the payload cannot be encoded, the request fails,
// or the webhook answers with a status outside the 2xx range.
// The request goes through http.Post and therefore the default HTTP client,
// which has no timeout.
func (lm *LogMonitor) sendSlackAlert(message string, webhookURL string) error {
	alert := SlackAlert{Text: fmt.Sprintf("Log Alert: %s", message)}

	payload, err := json.Marshal(alert)
	if err != nil {
		return fmt.Errorf("encoding slack alert: %w", err)
	}

	resp, err := http.Post(webhookURL, "application/json", bytes.NewReader(payload))
	if err != nil {
		return fmt.Errorf("posting slack alert: %w", err)
	}
	defer resp.Body.Close()

	// A completed request is not a delivered alert: the webhook may reject it.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return fmt.Errorf("posting slack alert: unexpected status %s", resp.Status)
	}

	return nil
}

// MetricsCollector accumulates error and warning totals across analyses so
// they can be exported as Prometheus metrics.
type MetricsCollector struct {
    // errorCount is the running total of error entries.
    errorCount   int
    // warningCount is the running total of warning entries.
    warningCount int
}

// UpdateFromAnalysis adds the error and warning counts of analysis to the
// collector's running totals.
func (mc *MetricsCollector) UpdateFromAnalysis(analysis *LogAnalysis) {
	errs, warns := analysis.ErrorCount, analysis.WarningCount
	mc.errorCount, mc.warningCount = mc.errorCount+errs, mc.warningCount+warns
}

// PrometheusMetrics renders the collected totals as two counters,
// log_errors_total and log_warnings_total, each preceded by its HELP and
// TYPE lines. The text starts with a blank line.
func (mc *MetricsCollector) PrometheusMetrics() string {
	const exposition = `
# HELP log_errors_total Total number of error log entries
# TYPE log_errors_total counter
log_errors_total %d

# HELP log_warnings_total Total number of warning log entries  
# TYPE log_warnings_total counter
log_warnings_total %d
`
	errs, warns := mc.errorCount, mc.warningCount
	return fmt.Sprintf(exposition, errs, warns)
}