前言
本篇文章给各位介绍如何使用 LangChainGo 库构建一个智能日志分析智能体:通过给 AI 提供合适的提示词,让它智能地分析日志中的问题,进而实现告警的效果。
实现的效果
一个命令行工具:
- 解析各种格式的日志文件(JSON、结构化文本等)
- 识别错误模式和异常情况
- 总结日志活动和趋势
- 基于检测到的问题建议执行动作
- 为严重问题生成告警
环境准备
- Go 1.21+
- LLM API Key(OpenAI、Anthropic)
- 需要分析的样例日志文件
开始
首先我们使用 Goland 新建一个工程,然后导入以下依赖:
go get github.com/tmc/langchaingo
go get github.com/sirupsen/logrus
结构图
日志相关结构体
package main
import (
"fmt"
"time"
)
// LogEntry is a single parsed log line.
type LogEntry struct {
	// Timestamp is the time attached to the entry by the parser.
	Timestamp time.Time `json:"timestamp"`
	// Level is the severity label of the entry, e.g. "ERROR" or "WARN".
	Level string `json:"level,omitempty"`
	// Message is the log text.
	Message string `json:"message,omitempty"`
	// Source is the origin of the entry when the log line provides one.
	Source string `json:"source,omitempty"`
	// Raw is the original, unparsed line.
	Raw string `json:"raw,omitempty"`
}
// LogAnalysis is the result of analyzing a set of log entries.
type LogAnalysis struct {
	// TotalEntries is the number of entries that were analyzed.
	TotalEntries int `json:"total_entries"`
	// ErrorCount is the number of ERROR and FATAL entries.
	ErrorCount int `json:"error_count"`
	// WarningCount is the number of WARN and WARNING entries.
	WarningCount int `json:"warning_count"`
	// TopErrors lists the most frequent normalized error messages.
	TopErrors []ErrorPattern `json:"top_errors"`
	// TimeRange is the period covered by the analyzed entries.
	TimeRange TimeRange `json:"time_range"`
	// Recommendations holds the actions suggested by the language model.
	Recommendations []string `json:"recommendations"`
	// Anomalies holds the anomalies reported by the language model.
	Anomalies []Anomaly `json:"anomalies"`
}
// ErrorPattern is a group of error messages that share the same normalized
// form.
type ErrorPattern struct {
	// Pattern is the normalized message shared by the group.
	Pattern string `json:"pattern"`
	// Count is the number of messages in the group.
	Count int `json:"count"`
	// Example is one original message that belongs to the group.
	Example string `json:"example"`
}
// TimeRange is a span of time delimited by two instants.
type TimeRange struct {
	// Start is the beginning of the span.
	Start time.Time `json:"start"`
	// End is the end of the span.
	End time.Time `json:"end"`
}
// Anomaly is one unusual condition reported for the analyzed logs. Values are
// decoded from the language model's JSON answer.
type Anomaly struct {
	// Type is the anomaly category; the prompt asks the model for one of
	// error_spike, performance, security or other.
	Type string `json:"type"`
	// Description explains what was detected.
	Description string `json:"description"`
	// Severity is the urgency; the prompt asks the model for one of
	// critical, high, medium or low.
	Severity string `json:"severity"`
	// Examples holds log lines that illustrate the anomaly.
	Examples []string `json:"examples"`
}
以上几个结构体是用来保存日志分析结果的基本信息。
分析器Analyzer
LogAnalyzer 结构体是我们这个智能体的核心,它包含了一个 llms.Model 类型的字段,用于通过 langchaingo 库调用大模型的能力。
package main
import (
"bufio"
"context"
"encoding/json"
"fmt"
"os"
"regexp"
"sort"
"strings"
"time"
"github.com/tmc/langchaingo/llms"
"github.com/tmc/langchaingo/llms/openai"
"github.com/tmc/langchaingo/prompts"
)
// LogAnalyzer parses log files and analyzes their entries with the help of a
// language model.
type LogAnalyzer struct {
	// llm is the model used for the AI part of the analysis.
	llm llms.Model
}
// NewLogAnalyzer creates a LogAnalyzer whose model is an OpenAI client
// configured from config.LlmConfig: the endpoint is used as base URL, the API
// key as token, and the model name selects the model.
//
// It returns an error when the client cannot be created.
func NewLogAnalyzer(config *AppConfig) (*LogAnalyzer, error) {
	model, err := openai.New(
		openai.WithBaseURL(config.LlmConfig.EndPoint),
		openai.WithToken(config.LlmConfig.ApiKey),
		openai.WithModel(config.LlmConfig.Model),
	)
	if err != nil {
		return nil, fmt.Errorf("failed to create LLM: %w", err)
	}

	analyzer := &LogAnalyzer{llm: model}
	return analyzer, nil
}
大模型接口的地址、密钥、模型名称都是从配置文件中读取的,这样便于修改配置。
解析日志文件
Analyzer 组件会根据文件名来解析日志文件,然后返回 []LogEntry,这个方法主要是根据不同的日志类型来解析日志文件。
// structuredLogPattern matches lines such as
// "2023-01-01 12:00:00 [ERROR] message" and captures the timestamp, the level
// and the message.
var structuredLogPattern = regexp.MustCompile(`^(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+\[(\w+)\]\s+(.+)$`)

// accessLogPattern matches Nginx/Apache common and combined log format lines
// and captures the client IP, the bracketed time, the request line and the
// status code.
var accessLogPattern = regexp.MustCompile(`^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s+\S+\s+\S+\s+\[([^\]]+)\]\s+"([^"]*)"\s+(\d{3})\b`)

// accessLogTimeLayout is the layout of the bracketed time in an access log
// line, e.g. "10/Oct/2000:13:55:36 -0700".
const accessLogTimeLayout = "02/Jan/2006:15:04:05 -0700"

// maxLogLineBytes is the longest single line ParseLogFile accepts.
const maxLogLineBytes = 1 << 20

// ParseLogFile reads filename line by line and converts every non-blank line
// into a LogEntry. Each line is tried, in order, as:
//
//  1. a JSON object (handled by parseJSONLog),
//  2. a "2006-01-02 15:04:05 [LEVEL] message" line (parseStructuredLog),
//  3. an Nginx/Apache access log line (parseAccessLog),
//  4. unstructured text: the level is guessed with inferLogLevel and the
//     current time is used as timestamp.
//
// A line that matches pattern 2 or 3 but whose timestamp cannot be parsed is
// treated as unstructured text. JSON entries are kept as parsed, even when
// they carry no usable timestamp.
//
// It returns an error when the file cannot be opened or read; no entries are
// returned in that case.
func (la *LogAnalyzer) ParseLogFile(filename string) ([]LogEntry, error) {
	file, err := os.Open(filename)
	if err != nil {
		return nil, fmt.Errorf("opening file: %w", err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	// The default Scanner limit is 64 KiB per line; long JSON records or
	// stack traces would otherwise abort the whole scan.
	scanner.Buffer(make([]byte, 0, 64*1024), maxLogLineBytes)

	var entries []LogEntry
	for scanner.Scan() {
		line := scanner.Text()
		trimmed := strings.TrimSpace(line)
		if trimmed == "" {
			continue
		}

		// Try JSON first.
		if trimmed[0] == '{' {
			var fields map[string]any
			if err := json.Unmarshal([]byte(trimmed), &fields); err == nil {
				entries = append(entries, parseJSONLog(fields, line))
				continue
			}
		}

		// Try the structured text formats.
		var entry LogEntry
		if m := structuredLogPattern.FindStringSubmatch(trimmed); m != nil {
			entry = parseStructuredLog(m, line)
		} else if m := accessLogPattern.FindStringSubmatch(trimmed); m != nil {
			entry = parseAccessLog(m, line)
		}

		// Fallback: no pattern produced a timestamp, treat as unstructured.
		if entry.Timestamp.IsZero() {
			entry = LogEntry{
				Timestamp: time.Now(), // no timestamp in the line; use the parse time
				Level:     inferLogLevel(line),
				Message:   line,
				Raw:       line,
			}
		}
		entries = append(entries, entry)
	}
	if err := scanner.Err(); err != nil {
		return nil, fmt.Errorf("reading file: %w", err)
	}
	return entries, nil
}

// parseAccessLog converts the submatches of accessLogPattern into a LogEntry.
// The client IP becomes the Source, the message is "<request> -> <status>",
// and the level is derived from the HTTP status: 5xx is ERROR, 4xx is WARN,
// anything else is INFO. Timestamp stays zero when the time cannot be parsed.
func parseAccessLog(matches []string, raw string) LogEntry {
	entry := LogEntry{Raw: raw}
	if len(matches) < 5 {
		return entry
	}

	clientIP, stamp, request, status := matches[1], matches[2], matches[3], matches[4]
	if t, err := time.Parse(accessLogTimeLayout, stamp); err == nil {
		entry.Timestamp = t
	}
	switch status[0] {
	case '5':
		entry.Level = "ERROR"
	case '4':
		entry.Level = "WARN"
	default:
		entry.Level = "INFO"
	}
	entry.Source = clientIP
	entry.Message = request + " -> " + status
	return entry
}
假如是以 { 开头的,那么就被判定为是 JSON 类型的日志:
// parseJSONLog builds a LogEntry from a decoded JSON log object. It copies
// the "level", "message" and "source" values when they are JSON strings and
// parses "timestamp" as RFC 3339. Missing or malformed fields are left at
// their zero value. raw is stored unchanged in the entry's Raw field.
func parseJSONLog(data map[string]any, raw string) LogEntry {
	parsed := LogEntry{Raw: raw}

	// A failed type assertion yields "", which time.Parse rejects, so the
	// timestamp stays zero exactly when the field is absent or invalid.
	stamp, _ := data["timestamp"].(string)
	if when, err := time.Parse(time.RFC3339, stamp); err == nil {
		parsed.Timestamp = when
	}

	// For the string fields a failed assertion leaves the zero value "".
	parsed.Level, _ = data["level"].(string)
	parsed.Message, _ = data["message"].(string)
	parsed.Source, _ = data["source"].(string)

	return parsed
}
否则使用正则表达式来匹配结构化日志,然后返回一个 LogEntry 类型的结构体。pattern.FindStringSubmatch(line) 方法返回的是一个 []string 切片:因为上述代码定义的正则表达式有 3 个捕获组,所以返回的切片长度就是 4,第 0 个元素是整个匹配到的文本,各捕获组的内容从第 1 个元素开始获取即可。
// parseStructuredLog converts the submatches of a
// "2006-01-02 15:04:05 [LEVEL] message" line into a LogEntry. matches must
// hold the whole match followed by the timestamp, level and message groups;
// with fewer elements only Raw is set.
//
// The timestamp carries no zone, so it is interpreted in the local time zone.
// That keeps it comparable with time.Now() when recent entries are selected.
// Timestamp stays zero when the text cannot be parsed.
func parseStructuredLog(matches []string, raw string) LogEntry {
	entry := LogEntry{Raw: raw}
	if len(matches) < 4 {
		return entry
	}

	if t, err := time.ParseInLocation("2006-01-02 15:04:05", matches[1], time.Local); err == nil {
		entry.Timestamp = t
	}
	entry.Level = matches[2]
	entry.Message = matches[3]
	return entry
}
最后还有一个兜底的操作:假如 LogEntry 的 Timestamp.IsZero() 返回 true(也就是前面的解析没有得到时间戳),就把这一行当作非结构化日志,通过判断字符串中是否包含关键词来推断日志的等级。
// inferLogLevel guesses the level of an unstructured log line from keywords,
// ignoring case. "error" or "fatal" gives "ERROR", otherwise "warn" gives
// "WARN", otherwise "debug" gives "DEBUG"; a line with none of them is "INFO".
func inferLogLevel(line string) string {
	// Rules are checked in order; the first keyword found decides the level.
	rules := []struct {
		keyword string
		level   string
	}{
		{"error", "ERROR"},
		{"fatal", "ERROR"},
		{"warn", "WARN"},
		{"debug", "DEBUG"},
	}

	text := strings.ToLower(line)
	for _, rule := range rules {
		if strings.Contains(text, rule.keyword) {
			return rule.level
		}
	}
	return "INFO"
}
分析日志文件
分析日志文件是整个应用的核心业务逻辑,因为这个流程里面包含了与 AI 大模型交互的步骤。AnalyzeLogs 方法接收一个 []LogEntry 参数,最后返回分析的结果也就是 LogAnalysis 类型的结构体。
// AnalyzeLogs summarizes entries and asks the language model for anomalies
// and recommendations.
//
// The summary contains the number of entries, the number of ERROR/FATAL and
// WARN/WARNING entries (levels are compared case-insensitively), the most
// frequent error patterns, and the time range covered. The time range is the
// earliest and latest non-zero timestamp, so it is correct even when entries
// are out of order or some have no timestamp.
//
// An empty input yields an empty analysis without calling the model. When
// the AI analysis fails, the error is returned and no analysis is produced.
func (la *LogAnalyzer) AnalyzeLogs(entries []LogEntry) (*LogAnalysis, error) {
	if len(entries) == 0 {
		return &LogAnalysis{}, nil
	}

	analysis := &LogAnalysis{TotalEntries: len(entries)}

	var errorMessages []string
	for i := range entries {
		entry := &entries[i]

		// Track the covered period; entries without a timestamp are ignored.
		if ts := entry.Timestamp; !ts.IsZero() {
			if analysis.TimeRange.Start.IsZero() || ts.Before(analysis.TimeRange.Start) {
				analysis.TimeRange.Start = ts
			}
			if ts.After(analysis.TimeRange.End) {
				analysis.TimeRange.End = ts
			}
		}

		// Count by level.
		switch strings.ToUpper(entry.Level) {
		case "ERROR", "FATAL":
			analysis.ErrorCount++
			errorMessages = append(errorMessages, entry.Message)
		case "WARN", "WARNING":
			analysis.WarningCount++
		}
	}

	// Find error patterns.
	analysis.TopErrors = findErrorPatterns(errorMessages)

	// Use AI for deeper analysis.
	if err := la.performAIAnalysis(entries, analysis); err != nil {
		return nil, fmt.Errorf("AI analysis failed: %w", err)
	}
	return analysis, nil
}
但是在调用大模型的能力之前,我们会首先使用 findErrorPatterns 函数对错误消息进行归一化和计数,按 Count 属性降序排序后,取前 10 个错误模式(ErrorPattern)组成切片。
// findErrorPatterns groups messages by their normalized form (see
// normalizeErrorMessage) and returns at most the 10 most frequent groups,
// most frequent first. Groups with the same count are ordered by pattern so
// the result is deterministic. Each group carries the first message seen as
// its example. It returns nil when messages is empty.
func findErrorPatterns(messages []string) []ErrorPattern {
	const maxPatterns = 10

	counts := make(map[string]int)
	examples := make(map[string]string)
	for _, msg := range messages {
		// Normalize error messages by removing specific values.
		pattern := normalizeErrorMessage(msg)
		counts[pattern]++
		if _, seen := examples[pattern]; !seen {
			examples[pattern] = msg
		}
	}
	if len(counts) == 0 {
		return nil
	}

	result := make([]ErrorPattern, 0, len(counts))
	for pattern, count := range counts {
		result = append(result, ErrorPattern{
			Pattern: pattern,
			Count:   count,
			Example: examples[pattern],
		})
	}

	// Sort by frequency; break ties by pattern because map iteration order
	// is random and sort.Slice is not stable.
	sort.Slice(result, func(i, j int) bool {
		if result[i].Count != result[j].Count {
			return result[i].Count > result[j].Count
		}
		return result[i].Pattern < result[j].Pattern
	})

	if len(result) > maxPatterns {
		result = result[:maxPatterns]
	}
	return result
}
normalizeErrorMessage 函数主要是将日志消息里面的数字替换为 XXX,实际的 UUID 替换为字符串 "UUID",实际的邮箱替换为 EMAIL。
// Patterns for the variable parts of an error message. They are compiled
// once instead of on every call.
var (
	uuidPattern   = regexp.MustCompile(`[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}`)
	emailPattern  = regexp.MustCompile(`\b\w+@\w+\.\w+\b`)
	numberPattern = regexp.MustCompile(`\d+`)
)

// normalizeErrorMessage replaces the variable parts of msg with placeholders
// so that messages differing only in those values compare equal: UUIDs
// become "UUID", e-mail addresses become "EMAIL" and remaining digit runs
// become "XXX".
func normalizeErrorMessage(msg string) string {
	// UUIDs must be replaced before digit runs: replacing digits first would
	// break the UUID up so that it no longer matches.
	normalized := uuidPattern.ReplaceAllString(msg, "UUID")
	normalized = emailPattern.ReplaceAllString(normalized, "EMAIL")
	normalized = numberPattern.ReplaceAllString(normalized, "XXX")
	return normalized
}
performAIAnalysis 方法是最核心的逻辑,这里面使用了 AI 大模型的能力:
// performAIAnalysis asks the language model to look for anomalies and to
// recommend actions, and stores the answer in analysis.Anomalies and
// analysis.Recommendations.
//
// The prompt contains the statistics already present in analysis plus the
// last 50 entries (or all of them when there are fewer). The model is asked
// to answer in JSON, which is decoded into the two result fields.
//
// It returns an error when the prompt cannot be rendered, the model call
// fails, the model returns no choice, or the answer is not valid JSON.
func (la *LogAnalyzer) performAIAnalysis(entries []LogEntry, analysis *LogAnalysis) error {
	// Prepare sample of entries for AI analysis.
	sampleSize := 50
	if len(entries) < sampleSize {
		sampleSize = len(entries)
	}
	sample := entries[len(entries)-sampleSize:] // last N entries

	template := prompts.NewPromptTemplate(`
You are an expert system administrator analyzing application logs. Based on the log data provided, identify:
1. **Anomalies**: Unusual patterns, spikes, or unexpected behaviors
2. **Recommendations**: Specific actions to improve system reliability
3. **Critical Issues**: Problems requiring immediate attention
Log Summary:
- Total Entries: {{.total_entries}}
- Errors: {{.error_count}}
- Warnings: {{.warning_count}}
- Time Range: {{.time_range}}
Top Error Patterns:
{{range .top_errors}}
- {{.Pattern}} ({{.Count}} occurrences)
{{end}}
Recent Log Sample:
{{range .sample}}
{{.timestamp}} [{{.level}}] {{.message}}
{{end}}
Respond in JSON format:
{
"anomalies": [
{
"type": "error_spike|performance|security|other",
"description": "What was detected",
"severity": "critical|high|medium|low",
"examples": ["example log entries"]
}
],
"recommendations": [
"Specific actionable recommendations"
]
}`, []string{"total_entries", "error_count", "warning_count", "time_range", "top_errors", "sample"})

	// The template reads each sample entry through map keys.
	sampleData := make([]map[string]string, len(sample))
	for i, entry := range sample {
		sampleData[i] = map[string]string{
			"timestamp": entry.Timestamp.Format(time.RFC3339),
			"level":     entry.Level,
			"message":   entry.Message,
		}
	}

	prompt, err := template.Format(map[string]any{
		"total_entries": analysis.TotalEntries,
		"error_count":   analysis.ErrorCount,
		"warning_count": analysis.WarningCount,
		"time_range":    fmt.Sprintf("%s to %s", analysis.TimeRange.Start.Format(time.RFC3339), analysis.TimeRange.End.Format(time.RFC3339)),
		"top_errors":    analysis.TopErrors,
		"sample":        sampleData,
	})
	if err != nil {
		return fmt.Errorf("formatting prompt: %w", err)
	}

	ctx := context.Background()
	response, err := la.llm.GenerateContent(ctx, []llms.MessageContent{
		llms.TextParts(llms.ChatMessageTypeHuman, prompt),
	}, llms.WithJSONMode())
	if err != nil {
		return fmt.Errorf("generating analysis: %w", err)
	}
	// Indexing Choices[0] on an empty answer would panic.
	if response == nil || len(response.Choices) == 0 {
		return fmt.Errorf("generating analysis: model returned no choices")
	}

	var aiResult struct {
		Anomalies       []Anomaly `json:"anomalies"`
		Recommendations []string  `json:"recommendations"`
	}
	if err := json.Unmarshal([]byte(response.Choices[0].Content), &aiResult); err != nil {
		return fmt.Errorf("parsing AI response: %w", err)
	}

	analysis.Anomalies = aiResult.Anomalies
	analysis.Recommendations = aiResult.Recommendations
	return nil
}
这一块的核心就是使用了 Go Template 语法的提示词模板,给 AI 提供一段需要检视的日志内容,然后让 AI 输出 JSON 格式的数据。最后再返回给后端解析 JSON 为结构体。
打印分析报告
PrintReport 方法用于在控制台上打印分析结果:它会打印出统计摘要、Top 5 的错误模式、所有的异常信息以及建议。
// PrintReport writes a human-readable report to standard output: the summary
// counters and time range, then up to five error patterns, every anomaly and
// every recommendation. A section is omitted when it has nothing to show.
func (la *LogAnalysis) PrintReport() {
	const stampLayout = "2006-01-02 15:04:05"

	fmt.Print("📊 Log Analysis Report\n")
	fmt.Print("=====================\n\n")

	fmt.Print("📈 Summary:\n")
	fmt.Printf(" Total Entries: %d\n", la.TotalEntries)
	fmt.Printf(" Errors: %d\n", la.ErrorCount)
	fmt.Printf(" Warnings: %d\n", la.WarningCount)
	from := la.TimeRange.Start.Format(stampLayout)
	until := la.TimeRange.End.Format(stampLayout)
	fmt.Printf(" Time Range: %s to %s\n\n", from, until)

	// Only the five most frequent patterns are printed.
	shown := la.TopErrors
	if len(shown) > 5 {
		shown = shown[:5]
	}
	if len(shown) > 0 {
		fmt.Print("🔴 Top Error Patterns:\n")
		for rank, p := range shown {
			fmt.Printf(" %d. %s (%d occurrences)\n", rank+1, p.Pattern, p.Count)
		}
		fmt.Println()
	}

	if len(la.Anomalies) > 0 {
		fmt.Print("⚠️ Detected Anomalies:\n")
		for idx := range la.Anomalies {
			found := la.Anomalies[idx]
			fmt.Printf(" %s - %s (%s)\n", found.Type, found.Description, found.Severity)
		}
		fmt.Println()
	}

	if len(la.Recommendations) > 0 {
		fmt.Print("💡 Recommendations:\n")
		for idx, advice := range la.Recommendations {
			fmt.Printf(" %d. %s\n", idx+1, advice)
		}
		fmt.Println()
	}
}
analyzeFile辅助函数
analyzeFile 函数整合了解析日志文件、分析日志条目、打印报告、输出报告内容到本地文件等流程。
// analyzeFile runs the complete pipeline for one log file: parse it, analyze
// the entries, print the report and, when outputFile is not empty, save the
// analysis to that path as indented JSON.
//
// It returns the first error of any step, wrapped with the step's name.
func analyzeFile(analyzer *LogAnalyzer, filename, outputFile string) error {
	fmt.Printf("🔍 Analyzing %s...\n", filename)

	parsed, err := analyzer.ParseLogFile(filename)
	if err != nil {
		return fmt.Errorf("parsing log file: %w", err)
	}

	report, err := analyzer.AnalyzeLogs(parsed)
	if err != nil {
		return fmt.Errorf("analyzing logs: %w", err)
	}
	report.PrintReport()

	// Saving the report is optional.
	if outputFile == "" {
		return nil
	}

	encoded, err := json.MarshalIndent(report, "", " ")
	if err != nil {
		return fmt.Errorf("marshaling report: %w", err)
	}
	if err = os.WriteFile(outputFile, encoded, 0644); err != nil {
		return fmt.Errorf("writing report: %w", err)
	}
	fmt.Printf("📄 Report saved to %s\n", outputFile)
	return nil
}
main函数
最后,我们来实现一下 main 函数。main 函数主要完成定义命令行参数、加载配置文件、创建日志分析器、分析日志文件这几件事。同时,假如命令行参数 watch 为 true,程序就不会退出,而是每隔 30s 重新扫描并分析一次日志文件。
// main parses the command line, loads the configuration, creates the
// analyzer and analyzes the log file. With -watch the analysis is repeated
// every 30 seconds until the process is stopped; errors in watch mode are
// logged and do not end the program.
func main() {
	file := flag.String("file", "sample.log", "Log file to analyze")
	output := flag.String("output", "report.json", "Output file for JSON report")
	watch := flag.Bool("watch", false, "Watch file for changes")
	flag.Parse()

	if *file == "" {
		fmt.Println("Usage: log-analyzer -file=application.log")
		os.Exit(1)
	}

	config, err := LoadConfig("config-example.yml")
	if err != nil {
		log.Fatal(err)
	}

	analyzer, err := NewLogAnalyzer(config)
	if err != nil {
		log.Fatal(err)
	}

	// One-shot mode: analyze once and exit.
	if !*watch {
		if err := analyzeFile(analyzer, *file, *output); err != nil {
			log.Fatal(err)
		}
		return
	}

	// Watch mode - simplified version: re-analyze on a fixed interval.
	fmt.Printf("👀 Watching %s for changes...\n", *file)
	for {
		if err := analyzeFile(analyzer, *file, *output); err != nil {
			log.Printf("Analysis error: %v", err)
		}
		time.Sleep(30 * time.Second)
	}
}
创建sample.log
在这里我们创建一个 sample.log 文件,里面存放一些少量简单的日志信息供测试用:
2024-01-15 10:30:01 [INFO] Application started successfully
2024-01-15 10:30:02 [INFO] Database connection established
2024-01-15 10:30:15 [ERROR] Failed to process user request: invalid email format user@
2024-01-15 10:30:16 [WARN] High memory usage detected: 85%
2024-01-15 10:30:17 [ERROR] Database timeout after 30s
2024-01-15 10:30:18 [ERROR] Failed to process user request: invalid email format admin@
2024-01-15 10:30:19 [INFO] Request processed successfully
2024-01-15 10:30:25 [ERROR] Database timeout after 30s
2024-01-15 10:30:30 [FATAL] Out of memory error - application terminating
2024-01-15 10:30:31 [INFO] Application shutdown initiated
运行效果
输出结果如下:
report.json 文件中也有了相应的内容:
扩展功能
持续监控
如果你想要持续性监控,那么你可以考虑使用 fsnotify 库,创建 monitor.go 文件如下:
package main
import (
	"context"
	"fmt"
	"log"
	"strings"
	"time"

	"github.com/fsnotify/fsnotify"
	"github.com/tmc/langchaingo/chains"
	"github.com/tmc/langchaingo/llms"
	"github.com/tmc/langchaingo/prompts"
)
// LogMonitor watches a log file and prints an alert when recently written
// entries look critical.
type LogMonitor struct {
	// analyzer parses and analyzes the watched file.
	analyzer *LogAnalyzer
	// watcher delivers file system events for the watched file.
	watcher *fsnotify.Watcher
	// alertChain turns an analysis into a short alert message.
	alertChain chains.Chain
	// thresholds decides when an alert is raised.
	thresholds MonitoringThresholds
}
// MonitoringThresholds holds the limits that trigger an alert.
type MonitoringThresholds struct {
	// ErrorsPerMinute is the number of error entries among the recent
	// entries at which an alert is raised.
	ErrorsPerMinute int
	// CriticalKeywords are substrings that raise an alert as soon as one of
	// them occurs in a message.
	CriticalKeywords []string
	// ResponseTimeLimit is not consulted by the code shown here.
	// NOTE(review): presumably a limit for slow responses; confirm.
	ResponseTimeLimit time.Duration
}
// NewLogMonitor creates a LogMonitor that uses analyzer for parsing and
// analysis and analyzer's model for wording alerts. The default thresholds
// are 10 errors per minute, the keywords "fatal", "out of memory" and
// "database down", and a response time limit of 5 seconds.
//
// It returns an error when the file system watcher cannot be created.
func NewLogMonitor(analyzer *LogAnalyzer) (*LogMonitor, error) {
	fsWatcher, err := fsnotify.NewWatcher()
	if err != nil {
		return nil, err
	}

	// Prompt for the chain that words the alert notifications.
	alertPrompt := prompts.NewPromptTemplate(`
Generate a concise alert message for this log analysis:
{{.analysis}}
Format as: [SEVERITY] Brief description - Action needed
Keep under 140 characters.`, []string{"analysis"})

	defaults := MonitoringThresholds{
		ErrorsPerMinute:   10,
		CriticalKeywords:  []string{"fatal", "out of memory", "database down"},
		ResponseTimeLimit: 5 * time.Second,
	}

	monitor := &LogMonitor{
		analyzer:   analyzer,
		watcher:    fsWatcher,
		alertChain: chains.NewLLMChain(analyzer.llm, alertPrompt),
		thresholds: defaults,
	}
	return monitor, nil
}
// Start watches filename and checks it for alert conditions whenever it is
// written to. It blocks until the watcher's event or error channel is closed
// and then returns nil; watcher errors are logged and do not stop the loop.
// It returns an error when the file cannot be added to the watcher.
//
// Checks run one at a time on a single worker goroutine. Write events that
// arrive while a check is running are merged into one follow-up check, so a
// burst of writes cannot start an unbounded number of concurrent analyses.
// Start waits for a running check before it returns.
func (lm *LogMonitor) Start(filename string) error {
	if err := lm.watcher.Add(filename); err != nil {
		return err
	}
	fmt.Printf("🚨 Monitoring %s for critical issues...\n", filename)

	// pending holds at most one queued check request.
	pending := make(chan struct{}, 1)
	finished := make(chan struct{})
	go func() {
		defer close(finished)
		for range pending {
			lm.checkForAlerts(filename)
		}
	}()
	defer func() {
		close(pending)
		<-finished
	}()

	for {
		select {
		case event, ok := <-lm.watcher.Events:
			if !ok {
				return nil
			}
			if event.Op&fsnotify.Write == fsnotify.Write {
				select {
				case pending <- struct{}{}:
				default:
					// A check is already queued; it re-reads the file and
					// therefore sees this write as well.
				}
			}
		case err, ok := <-lm.watcher.Errors:
			if !ok {
				return nil
			}
			log.Printf("Watcher error: %v", err)
		}
	}
}
// checkForAlerts re-parses the whole file, keeps the entries of the last
// minute and, when they meet the alert thresholds, analyzes them and prints
// an alert message generated by the alert chain. Failures are logged and end
// the check; nothing is returned to the caller.
func (lm *LogMonitor) checkForAlerts(filename string) {
	all, err := lm.analyzer.ParseLogFile(filename)
	if err != nil {
		log.Printf("Error parsing file: %v", err)
		return
	}

	// Only the entries of the last minute are considered.
	lastMinute := lm.getRecentEntries(all, time.Minute)
	if !lm.shouldAlert(lastMinute) {
		return
	}

	analysis, err := lm.analyzer.AnalyzeLogs(lastMinute)
	if err != nil {
		log.Printf("Error analyzing logs: %v", err)
		return
	}

	summary := fmt.Sprintf("Analysis: %+v", analysis)
	alert, err := chains.Run(context.Background(), lm.alertChain, summary)
	if err != nil {
		log.Printf("Error generating alert: %v", err)
		return
	}

	fmt.Printf("🚨 ALERT: %s\n", alert)
	// Here you would send to Slack, email, etc.
}
// getRecentEntries returns the trailing entries whose timestamp is not older
// than duration, in their original order. It walks back from the end and
// stops at the first older entry, so entries are expected to be in
// chronological order. The result is a copy; it is nil when no entry is
// recent enough.
func (lm *LogMonitor) getRecentEntries(entries []LogEntry, duration time.Duration) []LogEntry {
	cutoff := time.Now().Add(-duration)

	// Find where the recent tail starts instead of prepending entry by
	// entry, which would copy the result on every step.
	start := len(entries)
	for start > 0 && !entries[start-1].Timestamp.Before(cutoff) {
		start--
	}
	return append([]LogEntry(nil), entries[start:]...)
}
// shouldAlert reports whether entries justify an alert: either a message
// contains one of the critical keywords, or the number of ERROR/FATAL
// entries reaches the ErrorsPerMinute threshold. Levels and keywords are
// compared case-insensitively, matching how AnalyzeLogs counts levels.
func (lm *LogMonitor) shouldAlert(entries []LogEntry) bool {
	errorCount := 0
	for i := range entries {
		entry := &entries[i]

		switch strings.ToUpper(entry.Level) {
		case "ERROR", "FATAL":
			errorCount++
		}

		// Check for critical keywords; lower-case the message only once.
		message := strings.ToLower(entry.Message)
		for _, keyword := range lm.thresholds.CriticalKeywords {
			if strings.Contains(message, strings.ToLower(keyword)) {
				return true
			}
		}
	}
	return errorCount >= lm.thresholds.ErrorsPerMinute
}
可观测性系统集成
异常日志分析还可以集成到第三方可观测系统,比如 Prometheus,然后通过 Slack 推送。
创建 integration.go 文件:
package main
import (
"bytes"
"encoding/json"
"fmt"
"net/http"
)
// SlackAlert is the JSON body posted to a Slack webhook.
type SlackAlert struct {
	// Text is the message text.
	Text string `json:"text"`
}
// sendSlackAlert posts message, prefixed with "Log Alert: ", as JSON to the
// Slack webhook at webhookURL.
//
// It returns an error when the payload cannot be encoded, the request fails,
// or the webhook answers with a status outside the 2xx range.
func (lm *LogMonitor) sendSlackAlert(message string, webhookURL string) error {
	alert := SlackAlert{Text: fmt.Sprintf("Log Alert: %s", message)}
	payload, err := json.Marshal(alert)
	if err != nil {
		return fmt.Errorf("encoding slack alert: %w", err)
	}

	resp, err := http.Post(webhookURL, "application/json", bytes.NewReader(payload))
	if err != nil {
		return fmt.Errorf("posting slack alert: %w", err)
	}
	defer resp.Body.Close()

	// A transport-level success can still be a rejected request.
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return fmt.Errorf("slack webhook returned %s", resp.Status)
	}
	return nil
}
// MetricsCollector accumulates error and warning counts for export in the
// Prometheus text format.
type MetricsCollector struct {
	// errorCount is the running total of error entries.
	errorCount int
	// warningCount is the running total of warning entries.
	warningCount int
}
// UpdateFromAnalysis adds the error and warning counts of analysis to the
// collector's running totals.
func (mc *MetricsCollector) UpdateFromAnalysis(analysis *LogAnalysis) {
	newErrors, newWarnings := analysis.ErrorCount, analysis.WarningCount
	mc.errorCount = mc.errorCount + newErrors
	mc.warningCount = mc.warningCount + newWarnings
}
// PrometheusMetrics renders the collected totals as two counters,
// log_errors_total and log_warnings_total, each with HELP and TYPE lines.
// The returned text starts with an empty line.
func (mc *MetricsCollector) PrometheusMetrics() string {
	const exposition = `
# HELP log_errors_total Total number of error log entries
# TYPE log_errors_total counter
log_errors_total %d
# HELP log_warnings_total Total number of warning log entries
# TYPE log_warnings_total counter
log_warnings_total %d
`
	errorTotal, warningTotal := mc.errorCount, mc.warningCount
	return fmt.Sprintf(exposition, errorTotal, warningTotal)
}