Elasticsearch: Prefix queries - 前缀查询

1,421 阅读10分钟

本文由 简悦 SimpRead 转码, 原文地址 blog.csdn.net

Prefix queries 被用于在查询时返回在提供的字段中包含特定前缀的文档。有时我们可能想使用前缀查询单词,例如 Leonardo 的 Leo 或 Marlon Brando、Mark Hamill 或 Martin Balsam 的 Mar。 Elasticsearch 提供了一个前缀查询,用于获取匹配单词开头部分(前缀)的记录。 

准备数据

示例

我们先准备数据。我们想创建如下的一个 movies 的索引:



1.  PUT movies
2.  {
3.    "settings": {
4.      "analysis": {
5.        "analyzer": {
6.          "en_analyzer": {
7.            "tokenizer": "standard",
8.            "filter": [
9.              "lowercase",
10.              "stop"
11.            ]
12.          },
13.          "shingle_analyzer": {
14.            "type": "custom",
15.            "tokenizer": "standard",
16.            "filter": [
17.              "lowercase",
18.              "shingle_filter"
19.            ]
20.          }
21.        },
22.        "filter": {
23.          "shingle_filter": {
24.            "type": "shingle",
25.            "min_shingle_size": 2,
26.            "max_shingle_size": 3
27.          }
28.        }
29.      }
30.    },
31.    "mappings": {
32.      "properties": {
33.        "title": {
34.          "type": "text",
35.          "analyzer": "en_analyzer",
36.          "fields": {
37.            "suggest": {
38.              "type": "text",
39.              "analyzer": "shingle_analyzer"
40.            }
41.          }
42.        },
43.        "actors": {
44.          "type": "text",
45.          "analyzer": "en_analyzer",
46.          "fields": {
47.            "keyword": {
48.              "type": "keyword",
49.              "ignore_above": 256
50.            }
51.          }
52.        },
53.        "description": {
54.          "type": "text",
55.          "analyzer": "en_analyzer",
56.          "fields": {
57.            "keyword": {
58.              "type": "keyword",
59.              "ignore_above": 256
60.            }
61.          }
62.        },
63.        "director": {
64.          "type": "text",
65.          "fields": {
66.            "keyword": {
67.              "type": "keyword",
68.              "ignore_above": 256
69.            }
70.          }
71.        },
72.        "genre": {
73.          "type": "text",
74.          "fields": {
75.            "keyword": {
76.              "type": "keyword",
77.              "ignore_above": 256
78.            }
79.          }
80.        },
81.        "metascore": {
82.          "type": "long"
83.        },
84.        "rating": {
85.          "type": "float"
86.        },
87.        "revenue": {
88.          "type": "float"
89.        },
90.        "runtime": {
91.          "type": "long"
92.        },
93.        "votes": {
94.          "type": "long"
95.        },
96.        "year": {
97.          "type": "long"
98.        },
99.        "title_suggest": {
100.          "type": "completion",
101.          "analyzer": "simple",
102.          "preserve_separators": true,
103.          "preserve_position_increments": true,
104.          "max_input_length": 50
105.        }
106.      }
107.    }
108.  }


我们接下来使用 _bulk 命令来写入一些文档到这个索引中去。我们使用这个链接中的内容。我们使用如下的方法:



1.  POST movies/_bulk
2.  {"index": {}}
3.  {"title": "Guardians of the Galaxy", "genre": "Action,Adventure,Sci-Fi", "director": "James Gunn", "actors": "Chris Pratt, Vin Diesel, Bradley Cooper, Zoe Saldana", "description": "A group of intergalactic criminals are forced to work together to stop a fanatical warrior from taking control of the universe.", "year": 2014, "runtime": 121, "rating": 8.1, "votes": 757074, "revenue": 333.13, "metascore": 76}
4.  {"index": {}}
5.  {"title": "Prometheus", "genre": "Adventure,Mystery,Sci-Fi", "director": "Ridley Scott", "actors": "Noomi Rapace, Logan Marshall-Green, Michael Fassbender, Charlize Theron", "description": "Following clues to the origin of mankind, a team finds a structure on a distant moon, but they soon realize they are not alone.", "year": 2012, "runtime": 124, "rating": 7, "votes": 485820, "revenue": 126.46, "metascore": 65}

7.  ....


在上面,为了说明的方便,我省去了其它的文档。你需要把整个 movies.txt 的文件拷贝过来,并全部写入到 Elasticsearch 中。它共有1000 个文档。

Prefix 查询

我们使用如下的例子来进行查询:



1.  GET movies/_search?filter_path=**.hits
2.  {
3.    "_source": false, 
4.    "fields": [
5.      "actors"
6.    ], 
7.    "query": {
8.      "prefix": {
9.        "actors.keyword": {
10.          "value": "Mar"
11.        }
12.      }
13.    }
14.  }


当我们搜索前缀 Mar 时,上面的查询获取了演员以 Mar 开头的电影。请注意,我们正在 actors.keyword 字段上运行前缀查询。它是一个 keyword 字段。返回的结果为:



1.  {
2.    "hits": {
3.      "hits": [
4.        {
5.          "_index": "movies",
6.          "_id": "RgJfWIYBfOmyc7Qq5geX",
7.          "_score": 1,
8.          "fields": {
9.            "actors": [
10.              "Mark Wahlberg, Michelle Monaghan, J.K. Simmons, John Goodman"
11.            ]
12.          }
13.        },
14.        {
15.          "_index": "movies",
16.          "_id": "SQJfWIYBfOmyc7Qq5geX",
17.          "_score": 1,
18.          "fields": {
19.            "actors": [
20.              "Mark Wahlberg, Kurt Russell, Douglas M. Griffin, James DuMont"
21.            ]
22.          }
23.        },
24.        {
25.          "_index": "movies",
26.          "_id": "awJfWIYBfOmyc7Qq5geX",
27.          "_score": 1,
28.          "fields": {
29.            "actors": [
30.              "Mario Casas, Ana Wagener, José Coronado, Bárbara Lennie"
31.            ]
32.          }
33.        },
34.        {
35.          "_index": "movies",
36.          "_id": "ggJfWIYBfOmyc7Qq5geX",
37.          "_score": 1,
38.          "fields": {
39.            "actors": [
40.              "Mark Wahlberg, Nicola Peltz, Jack Reynor, Stanley Tucci"
41.            ]
42.          }
43.        },
44.        {
45.          "_index": "movies",
46.          "_id": "mgJfWIYBfOmyc7Qq5geX",
47.          "_score": 1,
48.          "fields": {
49.            "actors": [
50.              "Mark Rylance, Ruby Barnhill, Penelope Wilton,Jemaine Clement"
51.            ]
52.          }
53.        },
54.        {
55.          "_index": "movies",
56.          "_id": "xAJfWIYBfOmyc7Qq5geX",
57.          "_score": 1,
58.          "fields": {
59.            "actors": [
60.              "Mark Ruffalo, Michael Keaton, Rachel McAdams, Liev Schreiber"
61.            ]
62.          }
63.        },
64.        {
65.          "_index": "movies",
66.          "_id": "3gJfWIYBfOmyc7Qq5geX",
67.          "_score": 1,
68.          "fields": {
69.            "actors": [
70.              "Mark Huberman, Susan Loughnane, Steve Oram,Catherine Walker"
71.            ]
72.          }
73.        },
74.        {
75.          "_index": "movies",
76.          "_id": "EwJfWIYBfOmyc7Qq5giX",
77.          "_score": 1,
78.          "fields": {
79.            "actors": [
80.              "Martin Freeman, Ian McKellen, Richard Armitage,Andy Serkis"
81.            ]
82.          }
83.        },
84.        {
85.          "_index": "movies",
86.          "_id": "MQJfWIYBfOmyc7Qq5giX",
87.          "_score": 1,
88.          "fields": {
89.            "actors": [
90.              "Mark Wahlberg, Taylor Kitsch, Emile Hirsch, Ben Foster"
91.            ]
92.          }
93.        },
94.        {
95.          "_index": "movies",
96.          "_id": "tgJfWIYBfOmyc7Qq5giY",
97.          "_score": 1,
98.          "fields": {
99.            "actors": [
100.              "Marilyn Manson, Mark Boone Junior, Sam Quartin, Niko Nicotera"
101.            ]
102.          }
103.        }
104.      ]
105.    }
106.  }


很显然,返回的 actors 字段的值都是以 Mar 为开头的。

注意前缀查询是一个昂贵的查询 - 有时会破坏集群的稳定性。

为简洁起见,我们不必在字段名下面再嵌套一个包含 value 的对象。 相反,你可以直接把前缀作为该字段的值,使用如下所示的简写形式:



1.  GET movies/_search?filter_path=**.hits
2.  {
3.    "_source": false,
4.    "fields": [
5.      "actors"
6.    ],
7.    "query": {
8.      "prefix": {
9.        "actors.keyword": "Mar"
10.      }
11.    }
12.  }


由于我们希望在结果中找出匹配的字段,因此我们将通过在查询中添加高亮来突出显示结果。 我们向前缀查询添加一个 highlight 显示块。 这会突出一个或多个匹配的字段,如下面的清单所示。



1.  GET movies/_search?filter_path=**.hits
2.  {
3.    "_source": false,
4.    "query": {
5.      "prefix": {
6.        "actors.keyword": "Mar"
7.      }
8.    },
9.    "highlight": {
10.      "fields": {
11.        "actors.keyword": {}
12.      }
13.    }
14.  }


上面的搜索结果显示:



1.  {
2.    "hits": {
3.      "hits": [
4.        {
5.          "_index": "movies",
6.          "_id": "RgJfWIYBfOmyc7Qq5geX",
7.          "_score": 1,
8.          "highlight": {
9.            "actors.keyword": [
10.              "<em>Mark Wahlberg, Michelle Monaghan, J.K. Simmons, John Goodman</em>"
11.            ]
12.          }
13.        },
14.        {
15.          "_index": "movies",
16.          "_id": "SQJfWIYBfOmyc7Qq5geX",
17.          "_score": 1,
18.          "highlight": {
19.            "actors.keyword": [
20.              "<em>Mark Wahlberg, Kurt Russell, Douglas M. Griffin, James DuMont</em>"
21.            ]
22.          }
23.        },
24.        {
25.          "_index": "movies",
26.          "_id": "awJfWIYBfOmyc7Qq5geX",
27.          "_score": 1,
28.          "highlight": {
29.            "actors.keyword": [
30.              "<em>Mario Casas, Ana Wagener, José Coronado, Bárbara Lennie</em>"
31.            ]
32.          }
33.        },
34.        {
35.          "_index": "movies",
36.          "_id": "ggJfWIYBfOmyc7Qq5geX",
37.          "_score": 1,
38.          "highlight": {
39.            "actors.keyword": [
40.              "<em>Mark Wahlberg, Nicola Peltz, Jack Reynor, Stanley Tucci</em>"
41.            ]
42.          }
43.        },
44.        {
45.          "_index": "movies",
46.          "_id": "mgJfWIYBfOmyc7Qq5geX",
47.          "_score": 1,
48.          "highlight": {
49.            "actors.keyword": [
50.              "<em>Mark Rylance, Ruby Barnhill, Penelope Wilton,Jemaine Clement</em>"
51.            ]
52.          }
53.        },
54.        {
55.          "_index": "movies",
56.          "_id": "xAJfWIYBfOmyc7Qq5geX",
57.          "_score": 1,
58.          "highlight": {
59.            "actors.keyword": [
60.              "<em>Mark Ruffalo, Michael Keaton, Rachel McAdams, Liev Schreiber</em>"
61.            ]
62.          }
63.        },
64.        {
65.          "_index": "movies",
66.          "_id": "3gJfWIYBfOmyc7Qq5geX",
67.          "_score": 1,
68.          "highlight": {
69.            "actors.keyword": [
70.              "<em>Mark Huberman, Susan Loughnane, Steve Oram,Catherine Walker</em>"
71.            ]
72.          }
73.        },
74.        {
75.          "_index": "movies",
76.          "_id": "EwJfWIYBfOmyc7Qq5giX",
77.          "_score": 1,
78.          "highlight": {
79.            "actors.keyword": [
80.              "<em>Martin Freeman, Ian McKellen, Richard Armitage,Andy Serkis</em>"
81.            ]
82.          }
83.        },
84.        {
85.          "_index": "movies",
86.          "_id": "MQJfWIYBfOmyc7Qq5giX",
87.          "_score": 1,
88.          "highlight": {
89.            "actors.keyword": [
90.              "<em>Mark Wahlberg, Taylor Kitsch, Emile Hirsch, Ben Foster</em>"
91.            ]
92.          }
93.        },
94.        {
95.          "_index": "movies",
96.          "_id": "tgJfWIYBfOmyc7Qq5giY",
97.          "_score": 1,
98.          "highlight": {
99.            "actors.keyword": [
100.              "<em>Marilyn Manson, Mark Boone Junior, Sam Quartin, Niko Nicotera</em>"
101.            ]
102.          }
103.        }
104.      ]
105.    }
106.  }


我们之前讨论过,前缀查询在运行查询时会施加额外的计算压力。 幸运的是,有一种方法可以加快这种性能不佳的前缀查询 —— 我们将在下一节中讨论。

加速前缀查询

前缀查询之所以运行起来很慢,是因为引擎必须在查询时根据前缀(任何带字母的单词)得出结果。 不过,有一种机制可以加快它们的速度:在字段上使用 index_prefixes 参数。

我们可以在开发映射模式时在字段上设置 index_prefixes 参数。 例如,下面清单中的映射定义在我们为本练习创建的新索引 new_movies 上使用附加参数 index_prefixes 设置 title 字段(请记住,title 字段是 text 数据类型)。我们按照如下的命令来创建这个新索引:



1.  PUT new_movies
2.  {
3.    "settings": {
4.      "analysis": {
5.        "analyzer": {
6.          "en_analyzer": {
7.            "tokenizer": "standard",
8.            "filter": [
9.              "lowercase",
10.              "stop"
11.            ]
12.          },
13.          "shingle_analyzer": {
14.            "type": "custom",
15.            "tokenizer": "standard",
16.            "filter": [
17.              "lowercase",
18.              "shingle_filter"
19.            ]
20.          }
21.        },
22.        "filter": {
23.          "shingle_filter": {
24.            "type": "shingle",
25.            "min_shingle_size": 2,
26.            "max_shingle_size": 3
27.          }
28.        }
29.      }
30.    },
31.    "mappings": {
32.      "properties": {
33.        "title": {
34.          "type": "text",
35.          "index_prefixes": {}
36.        },
37.        "actors": {
38.          "type": "text",
39.          "analyzer": "en_analyzer",
40.          "fields": {
41.            "keyword": {
42.              "type": "keyword",
43.              "ignore_above": 256
44.            }
45.          }
46.        },
47.        "description": {
48.          "type": "text",
49.          "analyzer": "en_analyzer",
50.          "fields": {
51.            "keyword": {
52.              "type": "keyword",
53.              "ignore_above": 256
54.            }
55.          }
56.        },
57.        "director": {
58.          "type": "text",
59.          "fields": {
60.            "keyword": {
61.              "type": "keyword",
62.              "ignore_above": 256
63.            }
64.          }
65.        },
66.        "genre": {
67.          "type": "text",
68.          "fields": {
69.            "keyword": {
70.              "type": "keyword",
71.              "ignore_above": 256
72.            }
73.          }
74.        },
75.        "metascore": {
76.          "type": "long"
77.        },
78.        "rating": {
79.          "type": "float"
80.        },
81.        "revenue": {
82.          "type": "float"
83.        },
84.        "runtime": {
85.          "type": "long"
86.        },
87.        "votes": {
88.          "type": "long"
89.        },
90.        "year": {
91.          "type": "long"
92.        },
93.        "title_suggest": {
94.          "type": "completion",
95.          "analyzer": "simple",
96.          "preserve_separators": true,
97.          "preserve_position_increments": true,
98.          "max_input_length": 50
99.        }
100.      }
101.    }
102.  }


在上面,我们为 new_movies 的 title 字段添加了如下的 index_prefixes 项:

 1.        "title": {
2.          "type": "text",
3.          "index_prefixes": {},
4.          "analyzer": "en_analyzer",
5.          "fields": {
6.            "suggest": {
7.              "type": "text",
8.              "analyzer": "shingle_analyzer"
9.            }
10.          }
11.        }

从清单中的代码可以看出,title 属性包含一个附加属性 index_prefixes。 这向引擎表明,在索引过程中,它应该创建带有预置前缀的字段并存储这些值。 我们使用如下的代码来写入数据到这个索引中:



1.  POST _reindex
2.  {
3.    "source": {
4.      "index": "movies"
5.    },
6.    "dest": {
7.      "index": "new_movies"
8.    }
9.  }


我们使用 reindex 把之前的 movies 里的文档写入到 new_movies 索引中去。

因为我们在上面显示的列表中的 title 字段上设置了 index_prefixes,所以 Elasticsearch 默认会为长度在 2 到 5 个字符之间的前缀建立索引(即 min_chars 为 2,max_chars 为 5)。 这样,当我们运行前缀查询时,就不需要计算前缀了。 相反,它从存储中获取它们。

当然,我们可以更改 Elasticsearch 在索引期间尝试为我们创建的前缀的默认最小和最大大小。 这是通过设置 index_prefixes 对象中的 min_chars 和 max_chars 参数来完成的,如下面的清单所示。



1.  PUT my-index-000001
2.  {
3.    "mappings": {
4.      "properties": {
5.        "full_name": {
6.          "type": "text",
7.          "index_prefixes": {
8.            "min_chars" : 1,
9.            "max_chars" : 10
10.          }
11.        }
12.      }
13.    }
14.  }


在清单中,我们要求引擎预先创建最小和最大字符长度分别为 1 个和 10 个字符的前缀。 注意,min_chars 必须大于 0,max_chars 应小于 20 个字符。 这样,我们就可以在索引过程中自定义 Elasticsearch 应该预先创建的前缀。

我们接着可以对 title 字段做类似下面的搜索:



1.  GET new_movies/_search?filter_path=**.hits
2.  {
3.    "_source": false,
4.    "fields": [
5.      "title"
6.    ], 
7.    "query": {
8.      "prefix": {
9.        "title": {
10.          "value": "ga"
11.        }
12.      }
13.    }
14.  }


在上面的搜索中,我们查询 title 字段里含有以 ga 为开头的单词的文档。上述搜索返回如下的结果:



1.  {
2.    "hits": {
3.      "hits": [
4.        {
5.          "_index": "new_movies",
6.          "_id": "BAJfWIYBfOmyc7Qq5geX",
7.          "_score": 1,
8.          "fields": {
9.            "title": [
10.              "Guardians of the Galaxy"
11.            ]
12.          }
13.        },
14.        {
15.          "_index": "new_movies",
16.          "_id": "jQJfWIYBfOmyc7Qq5geX",
17.          "_score": 1,
18.          "fields": {
19.            "title": [
20.              "The Great Gatsby"
21.            ]
22.          }
23.        },
24.        {
25.          "_index": "new_movies",
26.          "_id": "lQJfWIYBfOmyc7Qq5geX",
27.          "_score": 1,
28.          "fields": {
29.            "title": [
30.              "Ah-ga-ssi"
31.            ]
32.          }
33.        },
34.        {
35.          "_index": "new_movies",
36.          "_id": "mwJfWIYBfOmyc7Qq5geX",
37.          "_score": 1,
38.          "fields": {
39.            "title": [
40.              "The Hunger Games"
41.            ]
42.          }
43.        },
44.        {
45.          "_index": "new_movies",
46.          "_id": "sAJfWIYBfOmyc7Qq5geX",
47.          "_score": 1,
48.          "fields": {
49.            "title": [
50.              "Beyond the Gates"
51.            ]
52.          }
53.        },
54.        {
55.          "_index": "new_movies",
56.          "_id": "ygJfWIYBfOmyc7Qq5geX",
57.          "_score": 1,
58.          "fields": {
59.            "title": [
60.              "The Imitation Game"
61.            ]
62.          }
63.        },
64.        {
65.          "_index": "new_movies",
66.          "_id": "jQJfWIYBfOmyc7Qq5giY",
67.          "_score": 1,
68.          "fields": {
69.            "title": [
70.              "Whisky Galore"
71.            ]
72.          }
73.        },
74.        {
75.          "_index": "new_movies",
76.          "_id": "nAJfWIYBfOmyc7Qq5giY",
77.          "_score": 1,
78.          "fields": {
79.            "title": [
80.              "The Hunger Games: Mockingjay - Part 2"
81.            ]
82.          }
83.        },
84.        {
85.          "_index": "new_movies",
86.          "_id": "1QJfWIYBfOmyc7Qq5giY",
87.          "_score": 1,
88.          "fields": {
89.            "title": [
90.              "Sherlock Holmes: A Game of Shadows"
91.            ]
92.          }
93.        },
94.        {
95.          "_index": "new_movies",
96.          "_id": "2gJfWIYBfOmyc7Qq5giY",
97.          "_score": 1,
98.          "fields": {
99.            "title": [
100.              "American Gangster"
101.            ]
102.          }
103.        }
104.      ]
105.    }
106.  }


很显然,返回的结果里都含有以 "ga" 为开头的单词。