By David Pilato
For Elasticsearch®, we know that joins should be done at "index time" rather than at query time. This blog post is the start of a series of three, as there are several approaches we can take within the Elastic® ecosystem. We will cover how to do it within Elasticsearch itself; the next post will cover doing this with Logstash, a centralized component; and the final post will show how to do this at the edge with Elastic Agent/Beats.
As a simple example, let's say we run an e-commerce website and collect logs in kibana_sample_data_logs:
{
  "agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.50 Safari/534.24",
  "bytes": 1831,
  "clientip": "30.156.16.164",
  "extension": "",
  "geo": {
    "srcdest": "US:IN",
    "src": "US",
    "dest": "IN",
    "coordinates": {
      "lat": 55.53741389,
      "lon": -132.3975144
    }
  },
  "host": "elastic-elastic-elastic.org",
  "index": "kibana_sample_data_logs",
  "ip": "30.156.16.163",
  "machine": {
    "ram": 9663676416,
    "os": "win xp"
  },
  "memory": 73240,
  "message": "30.156.16.163 - - [2018-09-01T12:43:49.756Z] \"GET /wp-login.php HTTP/1.1\" 404 1831 \"-\" \"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.50 Safari/534.24\"",
  "phpmemory": 73240,
  "referer": "http://www.elastic-elastic-elastic.com/success/timothy-l-kopra",
  "request": "/wp-login.php",
  "response": 404,
  "tags": [
    "success",
    "info"
  ],
  "timestamp": "2023-03-18T12:43:49.756Z",
  "url": "https://elastic-elastic-elastic.org/wp-login.php",
  "utc_time": "2023-03-18T12:43:49.756Z",
  "event": {
    "dataset": "sample_web_logs"
  }
}
Note that you can easily import this dataset using the Kibana® sample data sets, by clicking the "Add data" button in the "Sample web logs" box.
We also have a vip index that contains information about our customers:
{
  "ip" : "30.156.16.164",
  "vip": true,
  "name": "David P"
}
To import this sample dataset, we just need to run:
DELETE /vip

PUT /vip
{
  "mappings": {
    "properties": {
      "ip": { "type": "keyword" },
      "name": { "type": "text" },
      "vip": { "type": "boolean" }
    }
  }
}

POST /vip/_bulk
{ "index" : { } }
{ "ip" : "30.156.16.164", "vip": true, "name": "David P" }
{ "index" : { } }
{ "ip" : "164.85.94.243", "vip": true, "name": "Philipp K" }
{ "index" : { } }
{ "ip" : "50.184.59.162", "vip": true, "name": "Adrienne V" }
{ "index" : { } }
{ "ip" : "236.212.255.77", "vip": true, "name": "Carly R" }
{ "index" : { } }
{ "ip" : "16.241.165.21", "vip": true, "name": "Naoise R" }
{ "index" : { } }
{ "ip" : "246.106.125.113", "vip": true, "name": "Iulia F" }
{ "index" : { } }
{ "ip" : "81.194.200.150", "vip": true, "name": "Jelena Z" }
{ "index" : { } }
{ "ip" : "111.237.144.54", "vip": true, "name": "Matt R" }
To perform "joins at index time," we need to enrich our dataset so that the final log looks like this:
{
  "agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.50 Safari/534.24",
  "bytes": 1831,
  "clientip": "30.156.16.164",
  "extension": "",
  "geo": {
    "srcdest": "US:IN",
    "src": "US",
    "dest": "IN",
    "coordinates": {
      "lat": 55.53741389,
      "lon": -132.3975144
    }
  },
  "host": "elastic-elastic-elastic.org",
  "index": "kibana_sample_data_logs",
  "ip": "30.156.16.163",
  "machine": {
    "ram": 9663676416,
    "os": "win xp"
  },
  "memory": 73240,
  "message": "30.156.16.163 - - [2018-09-01T12:43:49.756Z] \"GET /wp-login.php HTTP/1.1\" 404 1831 \"-\" \"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.50 Safari/534.24\"",
  "phpmemory": 73240,
  "referer": "http://www.elastic-elastic-elastic.com/success/timothy-l-kopra",
  "request": "/wp-login.php",
  "response": 404,
  "tags": [
    "success",
    "info"
  ],
  "timestamp": "2023-03-18T12:43:49.756Z",
  "url": "https://elastic-elastic-elastic.org/wp-login.php",
  "utc_time": "2023-03-18T12:43:49.756Z",
  "event": {
    "dataset": "sample_web_logs"
  },
  "vip": true,
  "name": "David P"
}
You can do this out of the box with the Elasticsearch enrich processor in an ingest pipeline. Let's see how.
Enriching Elasticsearch data within Elasticsearch
Ingest pipeline
Let's start with an ingest pipeline.
We can begin with an empty one, which we will use to simulate the behavior we want. We don't need the full field set of the original dataset, so we have simplified it:
POST /_ingest/pipeline/_simulate
{
  "docs": [
    {
      "_source": {
        "clientip": "30.156.16.164"
      }
    }
  ],
  "pipeline": {
    "processors": []
  }
}
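With an empty processor list, the simulation simply echoes the document back, which confirms the request shape before we add any logic. The response looks roughly like this (your _ingest timestamp will differ):
{
  "docs": [
    {
      "doc": {
        "_index": "_index",
        "_id": "_id",
        "_version": "-3",
        "_source": {
          "clientip": "30.156.16.164"
        },
        "_ingest": {
          "timestamp": "2023-04-06T17:12:00.000000000Z"
        }
      }
    }
  ]
}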
We now need to add an enrich processor to our pipeline. But for that, we first need to create an enrich policy:
PUT /_enrich/policy/vip-policy
{
  "match": {
    "indices": "vip",
    "match_field": "ip",
    "enrich_fields": ["name", "vip"]
  }
}
Once the enrich policy has been created, we can run it with the execute enrich policy API:
PUT /_enrich/policy/vip-policy/_execute
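Behind the scenes, this builds a dedicated system index from the vip data. If you want to double-check the policy definition at any point, the get enrich policy API returns it:
GET /_enrich/policy/vip-policy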
We can now simulate it:
POST /_ingest/pipeline/_simulate
{
  "docs": [
    {
      "_source": {
        "clientip": "30.156.16.164"
      }
    }
  ],
  "pipeline": {
    "processors": [{
      "enrich": {
        "policy_name": "vip-policy",
        "field": "clientip",
        "target_field": "enriched"
      }
    }]
  }
}
This gives a response like:
{
  "docs": [
    {
      "doc": {
        "_index": "_index",
        "_id": "_id",
        "_version": "-3",
        "_source": {
          "enriched": {
            "name": "David P",
            "vip": true,
            "ip": "30.156.16.164"
          },
          "clientip": "30.156.16.164"
        },
        "_ingest": {
          "timestamp": "2023-04-06T17:14:29.127569953Z"
        }
      }
    }
  ]
}
We just need to clean up the data a bit to get the structure we expect:
POST /_ingest/pipeline/_simulate
{
  "docs": [
    {
      "_source": {
        "clientip": "30.156.16.164"
      }
    }
  ],
  "pipeline": {
    "processors": [{
      "enrich": {
        "policy_name": "vip-policy",
        "field": "clientip",
        "target_field": "enriched"
      }
    },{
      "rename": {
        "field": "enriched.name",
        "target_field": "name"
      }
    },{
      "rename": {
        "field": "enriched.vip",
        "target_field": "vip"
      }
    },{
      "remove": {
        "field": "enriched"
      }
    }]
  }
}
This now gives the expected result:
{
  "docs": [
    {
      "doc": {
        "_index": "_index",
        "_id": "_id",
        "_version": "-3",
        "_source": {
          "name": "David P",
          "vip": true,
          "clientip": "30.156.16.164"
        },
        "_ingest": {
          "timestamp": "2023-04-06T17:16:08.175186282Z"
        }
      }
    }
  ]
}
We can now store the final pipeline:
PUT /_ingest/pipeline/vip
{
  "processors": [{
    "enrich": {
      "policy_name": "vip-policy",
      "field": "clientip",
      "target_field": "enriched"
    }
  },{
    "rename": {
      "field": "enriched.name",
      "target_field": "name",
      "ignore_failure": true
    }
  },{
    "rename": {
      "field": "enriched.vip",
      "target_field": "vip",
      "ignore_failure": true
    }
  },{
    "remove": {
      "field": "enriched",
      "ignore_failure": true
    }
  }]
}
Note that we changed it slightly by adding some ignore_failure directives, because we might not find a matching entry in the vip index for every log line.
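We can verify that behavior by simulating the stored pipeline with an address that is not in the vip index (10.0.0.1 below is an arbitrary, unknown IP): the enrich processor finds no match, the rename and remove steps are skipped thanks to ignore_failure, and the document passes through unchanged:
POST /_ingest/pipeline/vip/_simulate
{
  "docs": [
    {
      "_source": {
        "clientip": "10.0.0.1"
      }
    }
  ]
}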
We can create the destination index with the same mapping as the source index:
# Get the source mapping
GET /kibana_sample_data_logs/_mapping

# Create the destination index
PUT /kibana_sample_data_logs_new
{
  // Paste the source mappings structure
  "mappings": {
    "properties": {
      // And add the properties we are adding
      "name": {
        "type": "keyword"
      },
      "vip": {
        "type": "boolean"
      }
    }
  }
}
And call the reindex API:
POST _reindex
{
  "source": {
    "index": "kibana_sample_data_logs"
  },
  "dest": {
    "index": "kibana_sample_data_logs_new",
    "pipeline": "vip"
  }
}
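The sample dataset is small enough for this call to return almost immediately. On a larger index, a common pattern (sketched here) is to run the reindex asynchronously and poll the tasks API with the task id returned in the response:
POST _reindex?wait_for_completion=false
{
  "source": {
    "index": "kibana_sample_data_logs"
  },
  "dest": {
    "index": "kibana_sample_data_logs_new",
    "pipeline": "vip"
  }
}

# Replace <task_id> with the "task" value from the response above
GET _tasks/<task_id>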
Let's check that the job is done:
GET /kibana_sample_data_logs_new/_search?filter_path=aggregations.by_name.buckets
{
  "size": 0,
  "aggs": {
    "by_name": {
      "terms": {
        "field": "name"
      }
    }
  }
}
The command above gives a response like this:
{
  "aggregations": {
    "by_name": {
      "buckets": [
        {
          "key": "David P",
          "doc_count": 100
        },
        {
          "key": "Philipp K",
          "doc_count": 29
        },
        {
          "key": "Adrienne V",
          "doc_count": 26
        },
        {
          "key": "Carly R",
          "doc_count": 26
        },
        {
          "key": "Iulia F",
          "doc_count": 25
        },
        {
          "key": "Naoise R",
          "doc_count": 25
        },
        {
          "key": "Jelena Z",
          "doc_count": 24
        },
        {
          "key": "Matt R",
          "doc_count": 24
        }
      ]
    }
  }
}
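Note that the reindex only enriched documents that already existed. If this index keeps receiving new logs (an assumption about your setup rather than something the sample data requires), one option is to attach the pipeline as the index's default pipeline, so every newly indexed document goes through it automatically:
PUT /kibana_sample_data_logs_new/_settings
{
  "index": {
    "default_pipeline": "vip"
  }
}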
Enriching with runtime fields
Another way to enrich the data is to do it at search time rather than at index time. This goes against the first sentence of this post, but sometimes you need to make trade-offs. Here, we are trading search speed for flexibility.
The lookup runtime field allows enriching the hits in the search response, but such a field cannot be used to query or aggregate the data. Here is a simple example of this feature:
GET kibana_sample_data_logs/_search?filter_path=hits.hits.fields
{
  "size": 1,
  "query": {
    "match": {
      "clientip": "30.156.16.164"
    }
  },
  "runtime_mappings": {
    "enriched": {
      "type": "lookup",
      "target_index": "vip",
      "input_field": "clientip",
      "target_field": "ip",
      "fetch_fields": ["name", "vip"]
    }
  },
  "fields": [
    "clientip",
    "enriched"
  ],
  "_source": false
}
The command above gives this response:
{
  "hits": {
    "hits": [
      {
        "fields": {
          "enriched": [
            {
              "name": [
                "David P"
              ],
              "vip": [
                true
              ]
            }
          ],
          "clientip": [
            "30.156.16.164"
          ]
        }
      }
    ]
  }
}
Note that this can also be added as part of the mappings:
PUT kibana_sample_data_logs/_mappings
{
  "runtime": {
    "enriched": {
      "type": "lookup",
      "target_index": "vip",
      "input_field": "clientip",
      "target_field": "ip",
      "fetch_fields": ["name", "vip"]
    }
  }
}

GET kibana_sample_data_logs/_search
{
  "size": 1,
  "query": {
    "match": {
      "clientip": "30.156.16.164"
    }
  },
  "fields": [
    "clientip",
    "enriched"
  ]
}
However, if you want to be able to search or aggregate on these fields, you need to actually emit some content at search time.
Note that in that case we can't use this method to do a lookup in another index. So, because (and only because) our lookup list is small, we can use a script with the list embedded as a parameter to do the "enrichment" on the fly:
PUT kibana_sample_data_logs/_mappings
{
  "runtime": {
    "name": {
      "type": "keyword",
      "script": {
        "source":
        """
        // Walk the lookup table passed in as a parameter and emit
        // the name matching this document's client IP, if any.
        for (int i = 0; i < params.lookup.length; i++) {
          if (params.lookup[i].ip == doc['clientip'].value) {
            emit(params.lookup[i].name);
            break;
          }
        }
        """,
        "lang": "painless",
        "params": {
          "lookup": [
            { "ip" : "30.156.16.164", "vip": true, "name": "David P" },
            { "ip" : "164.85.94.243", "vip": true, "name": "Philipp K" },
            { "ip" : "50.184.59.162", "vip": true, "name": "Adrienne V" },
            { "ip" : "236.212.255.77", "vip": true, "name": "Carly R" },
            { "ip" : "16.241.165.21", "vip": true, "name": "Naoise R" },
            { "ip" : "246.106.125.113", "vip": true, "name": "Iulia F" },
            { "ip" : "81.194.200.150", "vip": true, "name": "Jelena Z" },
            { "ip" : "111.237.144.54", "vip": true, "name": "Matt R" }
          ]
        }
      }
    },
    "vip": {
      "type": "boolean",
      "script": {
        "source":
        """
        // Same lookup as for the name field, emitting the vip flag instead.
        for (int i = 0; i < params.lookup.length; i++) {
          if (params.lookup[i].ip == doc['clientip'].value) {
            emit(params.lookup[i].vip);
            break;
          }
        }
        """,
        "lang": "painless",
        "params": {
          "lookup": [
            { "ip" : "30.156.16.164", "vip": true, "name": "David P" },
            { "ip" : "164.85.94.243", "vip": true, "name": "Philipp K" },
            { "ip" : "50.184.59.162", "vip": true, "name": "Adrienne V" },
            { "ip" : "236.212.255.77", "vip": true, "name": "Carly R" },
            { "ip" : "16.241.165.21", "vip": true, "name": "Naoise R" },
            { "ip" : "246.106.125.113", "vip": true, "name": "Iulia F" },
            { "ip" : "81.194.200.150", "vip": true, "name": "Jelena Z" },
            { "ip" : "111.237.144.54", "vip": true, "name": "Matt R" }
          ]
        }
      }
    }
  }
}
We can now aggregate on these runtime fields:
GET /kibana_sample_data_logs/_search?filter_path=aggregations.by_name.buckets
{
  "size": 0,
  "aggs": {
    "by_name": {
      "terms": {
        "field": "name"
      }
    }
  }
}
This gives the same result we saw earlier, though of course a bit slower:
{
  "aggregations": {
    "by_name": {
      "buckets": [
        {
          "key": "David P",
          "doc_count": 100
        },
        {
          "key": "Philipp K",
          "doc_count": 29
        },
        {
          "key": "Adrienne V",
          "doc_count": 26
        },
        {
          "key": "Carly R",
          "doc_count": 26
        },
        {
          "key": "Iulia F",
          "doc_count": 25
        },
        {
          "key": "Naoise R",
          "doc_count": 25
        },
        {
          "key": "Jelena Z",
          "doc_count": 24
        },
        {
          "key": "Matt R",
          "doc_count": 24
        }
      ]
    }
  }
}
Again, this method does not scale to large indices or large lookup lists, so reindexing the data as we saw in the first part would be the preferred approach.
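If you go the reindex route, a common last step, sketched here with a hypothetical alias name, is to point applications at the enriched index through an alias so the switch stays transparent:
POST /_aliases
{
  "actions": [
    {
      "add": {
        "index": "kibana_sample_data_logs_new",
        "alias": "weblogs"
      }
    }
  ]
}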
The release and timing of any features or functionality described in this post remain at Elastic's sole discretion. Any features or functionality not currently available may not be delivered on time or at all.