作者：Benjamin Trent

作为一名前系统管理员，我个人对 categorize_text 给日志探索带来的意义感到兴奋。这个新的 Elasticsearch 功能正是我当年希望拥有的东西——那时我常常要花费大量时间筛选海量日志，才能找出令人不安的模式。categorize_text 在查询时把常见的日志模式直接呈现在眼前。此功能与 Elasticsearch 已经广泛且强大的聚合框架相结合，可缩短获取信息的时间，让探索堆积如山的日志变得更加容易。在 Kibana 中自动对日志进行聚类、计算统计数据并进行可视化，对任何 SRE 或管理员来说都是有力的工具。

categorize_text 聚合是如何工作的？

categorize_text 从文档 _source 读取文本，并使用自定义 tokenizer ml_standard 创建 token，该 tokenizer 专门为一般的机器生成文本而构建。事实上，异常检测中提供的许多选项同样可在 categorize_text 中使用。分析文本后，将使用 DRAIN 算法的修改版本对 token 进行聚类。DRAIN 构建一棵 token 树，并认为较早出现的 token 更重要。我们稍微修改了算法，以允许在构建类别时合并文本中较早的 token。本质上，具有高可变性的 token 被删除，而更一致的 token 形成类别定义。

文本分类示例

下面展示 categorize_text 如何解析这两行 NGINX 日志。



1.  {"message": "2018/11/26 18:09:45 [error] 8#8: *4781 open() \"/etc/nginx/html/wan.php\" failed (2: No such file or directory), client: 154.91.201.90, server: _, request: \"POST /wan.php HTTP/1.1\", host: \"35.246.148.213\""},
2.  {"message": "2018/11/20 17:26:36 [error] 8#8: *3672 open() \"/etc/nginx/html/pe.php\" failed (2: No such file or directory), client: 139.159.210.222, server: _, request: \"POST /pe.php HTTP/1.1\", host: \"35.246.148.213\""}

使用默认设置时，这两行日志将被归入以下类别：

error open * failed No such file or directory client server request * host

公共 token 包含在类别定义中，而可变 token（本例中为 URL 文件路径）则以 * 代替。

既然我们已经从高层次上了解了它的工作原理，那么该如何使用它呢？

可视化日志类别的示例

让我们研究 categorize_text 聚合的三个用例，它们可以为身为系统管理员的你提供帮助：按类别识别随时间出现的问题、显示最常见的错误类别，以及类别趋势可视化。以下示例均使用 Kibana Vega 在查询时可视化日志类别。

比较不同日期的顶级类别

以下示例显示了两天内 NGINX 错误的不同顶级类别。在将之前已知的 “好日子” 与系统行为不稳定的日子进行比较时，这很有用。



1.  {
2.    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
3.    "title": "Text categories between two days",
4.    "data": {
5.      "url": {
6.        "index": "filebeat-*",
7.        "body": {
8.          "size": 0,
9.          "query": {
10.            "bool": {
11.              "filter": [
12.                {"term": {"event.dataset": "nginx.error"}},
13.                {
14.                  "bool": {
15.                    "should": [
16.                      {
17.                        "range": {
18.                          "@timestamp": {
19.                            "gte": "2021-02-25T00:00:00.000Z",
20.                            "lte": "2021-02-25T12:00:00.000Z"
21.                          }
22.                        }
23.                      },
24.                      {
25.                        "range": {
26.                          "@timestamp": {
27.                            "gte": "2021-02-26T00:00:00.000Z",
28.                            "lte": "2021-02-26T12:00:00.000Z"
29.                          }
30.                        }
31.                      }
32.                    ],
33.                    "minimum_should_match": 1
34.                  }
35.                }
36.              ]
37.            }
38.          },
39.          "aggs": {
40.            "sample": {
41.              "sampler": {"shard_size": 5000},
42.              "aggs": {
43.                "categories": {
44.                  "categorize_text": {
45.                    "field": "message",
46.                    "similarity_threshold": 20,
47.                    "max_unique_tokens": 20
48.                  },
49.                  "aggs": {
50.                    "time_buckets": {
51.                      "filters": {
52.                        "filters": {
53.                          "first": {
54.                            "range": {
55.                              "@timestamp": {
56.                                "gte": "2021-02-25T00:00:00.000Z",
57.                                "lte": "2021-02-25T12:00:00.000Z"
58.                              }
59.                            }
60.                          },
61.                          "second": {
62.                            "range": {
63.                              "@timestamp": {
64.                                "gte": "2021-02-26T00:00:00.000Z",
65.                                "lte": "2021-02-26T12:00:00.000Z"
66.                              }
67.                            }
68.                          }
69.                        }
70.                      }
71.                    }
72.                  }
73.                }
74.              }
75.            }
76.          }
77.        }
78.      },
79.      "format": {"property": "aggregations.sample.categories.buckets"}
80.    },
81.    "transform": [
82.      {
83.        "fold": [
84.          "time_buckets.buckets.first.doc_count",
85.          "time_buckets.buckets.second.doc_count"
86.        ],
87.        "as": ["subKey", "subValue"]
88.      }
89.    ],
90.    "mark": "bar",
91.    "encoding": {
92.      "x": {"field": "subKey", "type": "ordinal", "axis": {"title": null}},
93.      "y": {
94.        "field": "subValue",
95.        "type": "quantitative",
96.        "axis": {"title": "Document count"}
97.      },
98.      "color": {"field": "key"},
99.      "tooltip": [
100.        {"field": "key", "type": "nominal", "title": "category"},
101.        {"field": "subValue", "type": "quantitative", "title": "Count"}
102.      ]
103.    },
104.    "layer": [{"mark": "bar", "encoding": {"color": {"field": "key"}}}]
105.  }

如果你对如何使用 Vega 来进行可视化还不是很了解的话，请参阅文章 “Kibana：Vega 可视化入门 - 定制自己的可视化图”。

通过术语（terms）聚合收集顶级类别

此术语（terms）聚合示例显示了每个类别中哪些词项值最为普遍。在本例中，使用的词项是 Kubernetes pod 名称（kubernetes.pod.name 字段）。



1.  {
2.    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
3.    "title": "Event counts from all indexes",
4.    "data": {
5.      "url": {
6.        "%context%": true,
7.        "%timefield%": "@timestamp",
8.        "index": "filebeat-8.0.0-*",
9.        "body": {
10.          "aggs": {
11.            "sample": {
12.              "sampler": {"shard_size": 5000},
13.              "aggs": {
14.                "categories": {
15.                  "categorize_text": {
16.                    "field": "message",
17.                    "similarity_threshold": 20,
18.                    "max_unique_tokens": 20
19.                  },
20.                  "aggs": {
21.                    "k8_pod": {
22.                      "terms": {"field": "kubernetes.pod.name", "size": 5}
23.                    }
24.                  }
25.                }
26.              }
27.            }
28.          },
29.          "size": 0
30.        }
31.      },
32.      "format": {"property": "aggregations.sample.categories.buckets"}
33.    },
34.    "transform": [
35.      {"flatten": ["k8_pod.buckets"], "as": ["k8_pod_buckets"]}
36.    ],
37.    "mark": "bar",
38.    "encoding": {
39.      "x": {"field": "key", "type": "ordinal", "axis": {"title": false}},
40.      "y": {
41.        "field": "doc_count",
42.        "type": "quantitative",
43.        "axis": {"title": "Document count"}
44.      },
45.      "color": {"field": "k8_pod_buckets.key"},
46.      "tooltip": [{
47.        "field": "k8_pod_buckets",
48.        "type": "nominal",
49.        "title": "category"
50.      }, {
51.        "field": "k8_pod_buckets.doc_count",
52.        "type": "quantitative",
53.        "title": "Count"
54.      }]
55.    }
56.  }

随着时间的推移可视化类别趋势

此分析可用于探索奇怪的日志记录峰值，并帮助确定哪些类别对峰值的贡献最大。



1.  {
2.    "$schema": "https://vega.github.io/schema/vega-lite/v5.json",
3.    "title": "top categories every 15m",
4.    "data": {
5.      "url": {
6.        "%context%": true,
7.        "%timefield%": "@timestamp",
8.        "index": "filebeat-8.0.0-*",
9.        "body": {
10.          "aggs": {
11.            "categories": {
12.              "categorize_text": {
13.                "field": "message",
14.                "similarity_threshold": 20,
15.                "max_unique_tokens": 20
16.              },
17.              "aggs": {
18.                "time_buckets": {
19.                  "date_histogram": {
20.                    "field": "@timestamp",
21.                    "interval": "15m",
22.                    "min_doc_count": 1
23.                  }
24.                }
25.              }
26.            }
27.          },
28.          "size": 0
29.        }
30.      },
31.      "format": {"property": "aggregations.categories.buckets"}
32.    },
33.    "transform": [{"flatten": ["time_buckets.buckets"], "as": ["buckets"]}],
34.    "mark": "area",
35.    "encoding": {
36.      "tooltip": [
37.        {"field": "buckets.key", "type": "temporal", "title": "Date"},
38.        {"field": "key", "type": "nominal", "title": "Category"},
39.        {"field": "buckets.doc_count", "type": "quantitative", "title": "Count"}
40.      ],
41.      "x": {"field": "buckets.key", "type": "temporal", "axis": {"title": "category"}},
42.      "y": {
43.        "field": "buckets.doc_count",
44.        "type": "quantitative",
45.        "stack": true,
46.        "axis": {"title": "Document count"}
47.      },
48.      "color": {"field": "key", "type": "nominal"}
49.    },
50.    "layer": [
51.      {"mark": "area"},
52.      {
53.        "mark": "point",
54.        "selection": {
55.          "pointhover": {
56.            "type": "single",
57.            "on": "mouseover",
58.            "clear": "mouseout",
59.            "empty": "none",
60.            "fields": ["buckets.key", "key"],
61.            "nearest": true
62.          }
63.        },
64.        "encoding": {
65.          "size": {
66.            "condition": {"selection": "pointhover", "value": 100},
67.            "value": 5
68.          },
69.          "fill": {"condition": {"selection": "pointhover", "value": "white"}}
70.        }
71.      }
72.    ]
73.  }

试试看

这些示例只是 7.16 技术预览版中发布的 categorize_text 聚合的开始。对机器生成的文本进行分类和 Elasticsearch 中强大的聚合框架为你提供了大量的日志和数据探索机会。立即启动 Elastic Cloud 集群并试一试。我们很想听听你的反馈——在我们的讨论论坛或社区 Slack 频道中加入有关 Elastic 机器学习的对话。

Elasticsearch：使用 Elasticsearch categorize_text 聚合对日志进行分类