Elasticsearch进阶笔记第三十四篇Elasticsearch高手进阶篇(73) elasticsearch高手

Elasticsearch高手进阶篇(73)

elasticsearch高手进阶_基于term vector深入探查数据的情况

term vector介绍

获取document中的某个field内的各个term的统计信息

term information: term frequency in the field, term positions, start and end offsets, term payloads

term statistics: 设置term_statistics=true; total term frequency, 一个term在所有document中出现的频率; document frequency，有多少document包含这个term

field statistics: document count，有多少document包含这个field; sum of document frequency，一个field中所有term的df之和; sum of total term frequency，一个field中的所有term的tf之和

 GET /twitter/tweet/1/_termvectors
 GET /twitter/tweet/1/_termvectors?fields=text

term statistics和field statistics并不精准，因为统计的时候不会考虑有的doc可能已经被删除了（被删除的doc仍然可能被算在统计结果里）

我告诉大家，其实很少用，用的时候，一般来说，就是你需要对一些数据做探查的时候。比如说，你想要看到某个term，某个词条，大话西游，这个词条，在多少个document中出现了。或者说某个field，film_desc，电影的说明信息，有多少个doc包含了这个说明信息。

index-time term vector实验

term vector，涉及了很多的term和field相关的统计信息，有两种方式可以采集到这个统计信息

index-time，你在mapping里配置一下，然后建立索引的时候，就直接给你生成这些term和field的统计信息了
query-time，你之前没有生成过任何的Term vector信息，然后在查看term vector的时候，直接就可以看到了，会on the fly，现场计算出各种统计信息，然后返回给你

掌握，如何采集term vector信息

看懂term vector信息，你能掌握利用term vector进行数据探查

建立索引

 PUT /waws_index
 {
   "mappings": {
     "waws_type": {
       "properties": {
         "text": {
             "type": "text",
             "term_vector": "with_positions_offsets_payloads",
             "store" : true,
             "analyzer" : "fulltext_analyzer"
          },
          "fullname": {
             "type": "text",
             "analyzer" : "fulltext_analyzer"
         }
       }
     }
   },
   "settings" : {
     "index" : {
       "number_of_shards" : 1,
       "number_of_replicas" : 0
     },
     "analysis": {
       "analyzer": {
         "fulltext_analyzer": {
           "type": "custom",
           "tokenizer": "whitespace",
           "filter": [
             "lowercase",
             "type_as_payload"
           ]
         }
       }
     }
   }
 }

存入数据

 PUT /waws_index/waws_type/1
 {
   "fullname" : "Leo Li",
   "text" : "hello test test test "
 }
 
 PUT /waws_index/waws_type/2
 {
   "fullname" : "Leo Li",
   "text" : "other hello test ..."
 }

获取数据

 GET /waws_index/waws_type/1/_termvectors
 {
   "fields" : ["text"],
   "offsets" : true,
   "payloads" : true,
   "positions" : true,
   "term_statistics" : true,
   "field_statistics" : true
 }
 
 {
   "_index": "waws_index",
   "_type": "waws_type",
   "_id": "1",
   "_version": 1,
   "found": true,
   "took": 19,
   "term_vectors": {
     "text": {
       "field_statistics": {
         "sum_doc_freq": 6,   # 一个field中所有term的df之和
         "doc_count": 2,      # 一共两个doc
         "sum_ttf": 8         # 一个field中所有term的tf之和(同一个term出现多次就算多次)
       },
       "terms": {
         "hello": {             # hello这个term(词条)
           "doc_freq": 2,       # 有多少doc包含这个term
           "ttf": 2,            # 一个term在所有document中出现的频率
           "term_freq": 1,      # hello在doc1中包含几次
           "tokens": [
             {
               "position": 0,            # 位置
               "start_offset": 0,        # 起始偏移量
               "end_offset": 5,          # 终止偏移量
               "payload": "d29yZA=="
             }
           ]
         },
         "test": {
           "doc_freq": 2,
           "ttf": 4,
           "term_freq": 3,
           "tokens": [
             {
               "position": 1,
               "start_offset": 6,
               "end_offset": 10,
               "payload": "d29yZA=="
             },
             {
               "position": 2,
               "start_offset": 11,
               "end_offset": 15,
               "payload": "d29yZA=="
             },
             {
               "position": 3,
               "start_offset": 16,
               "end_offset": 20,
               "payload": "d29yZA=="
             }
           ]
         }
       }
     }
   }
 }

query-time term vector实验

 GET /waws_index/waws_type/1/_termvectors
 {
   "fields" : ["fullname"],
   "offsets" : true,
   "positions" : true,
   "term_statistics" : true,
   "field_statistics" : true
 }
 
 {
   "_index": "waws_index",
   "_type": "waws_type",
   "_id": "1",
   "_version": 1,
   "found": true,
   "took": 39,
   "term_vectors": {
     "fullname": {
       "field_statistics": {
         "sum_doc_freq": 4,
         "doc_count": 2,
         "sum_ttf": 4
       },
       "terms": {
         "leo": {
           "doc_freq": 2,
           "ttf": 2,
           "term_freq": 1,
           "tokens": [
             {
               "position": 0,
               "start_offset": 0,
               "end_offset": 3
             }
           ]
         },
         "li": {
           "doc_freq": 2,
           "ttf": 2,
           "term_freq": 1,
           "tokens": [
             {
               "position": 1,
               "start_offset": 4,
               "end_offset": 6
             }
           ]
         }
       }
     }
   }
 }

一般来说，如果条件允许，你就用query time的term vector就可以了，你要探查什么数据，现场去探查一下就好了

手动指定doc的term vector

 GET /waws_index/waws_type/_termvectors
 {
   "doc" : {
     "fullname" : "Leo Li",
     "text" : "hello test test test"
   },
   "fields" : ["text"],
   "offsets" : true,
   "payloads" : true,
   "positions" : true,
   "term_statistics" : true,
   "field_statistics" : true
 }
 
 {
   "_index": "waws_index",
   "_type": "waws_type",
   "_version": 0,
   "found": true,
   "took": 1,
   "term_vectors": {
     "text": {
       "field_statistics": {
         "sum_doc_freq": 6,
         "doc_count": 2,
         "sum_ttf": 8
       },
       "terms": {
         "hello": {
           "doc_freq": 2,
           "ttf": 2,
           "term_freq": 1,
           "tokens": [
             {
               "position": 0,
               "start_offset": 0,
               "end_offset": 5
             }
           ]
         },
         "test": {
           "doc_freq": 2,
           "ttf": 4,
           "term_freq": 3,
           "tokens": [
             {
               "position": 1,
               "start_offset": 6,
               "end_offset": 10
             },
             {
               "position": 2,
               "start_offset": 11,
               "end_offset": 15
             },
             {
               "position": 3,
               "start_offset": 16,
               "end_offset": 20
             }
           ]
         }
       }
     }
   }
 }

手动指定一个doc，实际上不是要指定doc，而是要指定你想要探查的词条，比如hello test，那么就可以把它放在一个field中

将这些term分词，然后对每个term，都去计算它在现有的所有doc中的一些统计信息

这个挺有用的，可以让你手动指定要探查的term的数据情况，你就可以指定探查“大话西游”这个词条的统计信息

手动指定analyzer来生成term vector

 GET /waws_index/waws_type/_termvectors
 {
   "doc" : {
     "fullname" : "Leo Li",
     "text" : "hello test test test"
   },
   "fields" : ["text"],
   "offsets" : true,
   "payloads" : true,
   "positions" : true,
   "term_statistics" : true,
   "field_statistics" : true,
   "per_field_analyzer" : {
     "text": "standard"
   }
 }
 
 {
   "_index": "waws_index",
   "_type": "waws_type",
   "_version": 0,
   "found": true,
   "took": 0,
   "term_vectors": {
     "text": {
       "field_statistics": {
         "sum_doc_freq": 6,
         "doc_count": 2,
         "sum_ttf": 8
       },
       "terms": {
         "hello": {
           "doc_freq": 2,
           "ttf": 2,
           "term_freq": 1,
           "tokens": [
             {
               "position": 0,
               "start_offset": 0,
               "end_offset": 5
             }
           ]
         },
         "test": {
           "doc_freq": 2,
           "ttf": 4,
           "term_freq": 3,
           "tokens": [
             {
               "position": 1,
               "start_offset": 6,
               "end_offset": 10
             },
             {
               "position": 2,
               "start_offset": 11,
               "end_offset": 15
             },
             {
               "position": 3,
               "start_offset": 16,
               "end_offset": 20
             }
           ]
         }
       }
     }
   }
 }

terms filter

 GET /waws_index/waws_type/_termvectors
 {
   "doc" : {
     "fullname" : "Leo Li",
     "text" : "hello test test test"
   },
   "fields" : ["text"],
   "offsets" : true,
   "payloads" : true,
   "positions" : true,
   "term_statistics" : true,
   "field_statistics" : true,
   "filter" : {
       "max_num_terms" : 3,
       "min_term_freq" : 1,
       "min_doc_freq" : 1
     }
 }
 
 {
   "_index": "waws_index",
   "_type": "waws_type",
   "_version": 0,
   "found": true,
   "took": 1,
   "term_vectors": {
     "text": {
       "field_statistics": {
         "sum_doc_freq": 6,
         "doc_count": 2,
         "sum_ttf": 8
       },
       "terms": {
         "hello": {
           "doc_freq": 2,
           "ttf": 2,
           "term_freq": 1,
           "tokens": [
             {
               "position": 0,
               "start_offset": 0,
               "end_offset": 5
             }
           ],
           "score": 1
         },
         "test": {
           "doc_freq": 2,
           "ttf": 4,
           "term_freq": 3,
           "tokens": [
             {
               "position": 1,
               "start_offset": 6,
               "end_offset": 10
             },
             {
               "position": 2,
               "start_offset": 11,
               "end_offset": 15
             },
             {
               "position": 3,
               "start_offset": 16,
               "end_offset": 20
             }
           ],
           "score": 3
         }
       }
     }
   }
 }

这个就是说，根据term统计信息，过滤出你想要看到的term vector统计结果，也挺有用的。比如你探查数据吧，可以过滤掉一些出现频率过低的term，就不考虑了

multi term vector

 GET _mtermvectors
 {
    "docs": [
       {
          "_index": "waws_index",
          "_type": "waws_type",
          "_id": "2",
          "term_statistics": true
       },
       {
          "_index": "my_index",
          "_type": "my_type",
          "_id": "1",
          "fields": [
             "text"
          ]
       }
    ]
 }
 
 {
   "docs": [
     {
       "_index": "waws_index",
       "_type": "waws_type",
       "_id": "2",
       "_version": 1,
       "found": true,
       "took": 0,
       "term_vectors": {
         "text": {
           "field_statistics": {
             "sum_doc_freq": 6,
             "doc_count": 2,
             "sum_ttf": 8
           },
           "terms": {
             "...": {
               "doc_freq": 1,
               "ttf": 1,
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 3,
                   "start_offset": 17,
                   "end_offset": 20,
                   "payload": "d29yZA=="
                 }
               ]
             },
             "hello": {
               "doc_freq": 2,
               "ttf": 2,
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 1,
                   "start_offset": 6,
                   "end_offset": 11,
                   "payload": "d29yZA=="
                 }
               ]
             },
             "other": {
               "doc_freq": 1,
               "ttf": 1,
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 0,
                   "start_offset": 0,
                   "end_offset": 5,
                   "payload": "d29yZA=="
                 }
               ]
             },
             "test": {
               "doc_freq": 2,
               "ttf": 4,
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 2,
                   "start_offset": 12,
                   "end_offset": 16,
                   "payload": "d29yZA=="
                 }
               ]
             }
           }
         }
       }
     },
     {
       "_index": "my_index",
       "_type": "my_type",
       "_id": "1",
       "error": {
         "root_cause": [
           {
             "type": "index_not_found_exception",
             "reason": "no such index",
             "index_uuid": "_na_",
             "index": "my_index"
           }
         ],
         "type": "index_not_found_exception",
         "reason": "no such index",
         "index_uuid": "_na_",
         "index": "my_index"
       }
     }
   ]
 }

第二个

 GET /waws_index/_mtermvectors
 {
    "docs": [
       {
          "_type": "test",
          "_id": "2",
          "fields": [
             "text"
          ],
          "term_statistics": true
       },
       {
          "_type": "test",
          "_id": "1"
       }
    ]
 }
 
 {
   "docs": [
     {
       "_index": "waws_index",
       "_type": "test",
       "_id": "2",
       "_version": 0,
       "found": false,
       "took": 0
     },
     {
       "_index": "waws_index",
       "_type": "test",
       "_id": "1",
       "_version": 0,
       "found": false,
       "took": 0
     }
   ]
 }

第三个

 GET /waws_index/waws_type/_mtermvectors
 {
    "docs": [
       {
          "_id": "2",
          "fields": [
             "text"
          ],
          "term_statistics": true
       },
       {
          "_id": "1"
       }
    ]
 }
 
 {
   "docs": [
     {
       "_index": "waws_index",
       "_type": "waws_type",
       "_id": "2",
       "_version": 1,
       "found": true,
       "took": 0,
       "term_vectors": {
         "text": {
           "field_statistics": {
             "sum_doc_freq": 6,
             "doc_count": 2,
             "sum_ttf": 8
           },
           "terms": {
             "...": {
               "doc_freq": 1,
               "ttf": 1,
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 3,
                   "start_offset": 17,
                   "end_offset": 20,
                   "payload": "d29yZA=="
                 }
               ]
             },
             "hello": {
               "doc_freq": 2,
               "ttf": 2,
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 1,
                   "start_offset": 6,
                   "end_offset": 11,
                   "payload": "d29yZA=="
                 }
               ]
             },
             "other": {
               "doc_freq": 1,
               "ttf": 1,
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 0,
                   "start_offset": 0,
                   "end_offset": 5,
                   "payload": "d29yZA=="
                 }
               ]
             },
             "test": {
               "doc_freq": 2,
               "ttf": 4,
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 2,
                   "start_offset": 12,
                   "end_offset": 16,
                   "payload": "d29yZA=="
                 }
               ]
             }
           }
         }
       }
     },
     {
       "_index": "waws_index",
       "_type": "waws_type",
       "_id": "1",
       "_version": 1,
       "found": true,
       "took": 0,
       "term_vectors": {
         "text": {
           "field_statistics": {
             "sum_doc_freq": 6,
             "doc_count": 2,
             "sum_ttf": 8
           },
           "terms": {
             "hello": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 0,
                   "start_offset": 0,
                   "end_offset": 5,
                   "payload": "d29yZA=="
                 }
               ]
             },
             "test": {
               "term_freq": 3,
               "tokens": [
                 {
                   "position": 1,
                   "start_offset": 6,
                   "end_offset": 10,
                   "payload": "d29yZA=="
                 },
                 {
                   "position": 2,
                   "start_offset": 11,
                   "end_offset": 15,
                   "payload": "d29yZA=="
                 },
                 {
                   "position": 3,
                   "start_offset": 16,
                   "end_offset": 20,
                   "payload": "d29yZA=="
                 }
               ]
             }
           }
         }
       }
     }
   ]
 }

第四个

 GET /_mtermvectors
 {
    "docs": [
       {
          "_index": "waws_index",
          "_type": "waws_type",
          "doc" : {
             "fullname" : "Leo Li",
             "text" : "hello test test test"
          }
       },
       {
          "_index": "waws_index",
          "_type": "waws_type",
          "doc" : {
            "fullname" : "Leo Li",
            "text" : "other hello test ..."
          }
       }
    ]
 }
 
 {
   "docs": [
     {
       "_index": "waws_index",
       "_type": "waws_type",
       "_version": 0,
       "found": true,
       "took": 0,
       "term_vectors": {
         "fullname": {
           "field_statistics": {
             "sum_doc_freq": 4,
             "doc_count": 2,
             "sum_ttf": 4
           },
           "terms": {
             "leo": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 0,
                   "start_offset": 0,
                   "end_offset": 3
                 }
               ]
             },
             "li": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 1,
                   "start_offset": 4,
                   "end_offset": 6
                 }
               ]
             }
           }
         },
         "text": {
           "field_statistics": {
             "sum_doc_freq": 6,
             "doc_count": 2,
             "sum_ttf": 8
           },
           "terms": {
             "hello": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 0,
                   "start_offset": 0,
                   "end_offset": 5
                 }
               ]
             },
             "test": {
               "term_freq": 3,
               "tokens": [
                 {
                   "position": 1,
                   "start_offset": 6,
                   "end_offset": 10
                 },
                 {
                   "position": 2,
                   "start_offset": 11,
                   "end_offset": 15
                 },
                 {
                   "position": 3,
                   "start_offset": 16,
                   "end_offset": 20
                 }
               ]
             }
           }
         }
       }
     },
     {
       "_index": "waws_index",
       "_type": "waws_type",
       "_version": 0,
       "found": true,
       "took": 0,
       "term_vectors": {
         "text": {
           "field_statistics": {
             "sum_doc_freq": 6,
             "doc_count": 2,
             "sum_ttf": 8
           },
           "terms": {
             "...": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 3,
                   "start_offset": 17,
                   "end_offset": 20
                 }
               ]
             },
             "hello": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 1,
                   "start_offset": 6,
                   "end_offset": 11
                 }
               ]
             },
             "other": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 0,
                   "start_offset": 0,
                   "end_offset": 5
                 }
               ]
             },
             "test": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 2,
                   "start_offset": 12,
                   "end_offset": 16
                 }
               ]
             }
           }
         },
         "fullname": {
           "field_statistics": {
             "sum_doc_freq": 4,
             "doc_count": 2,
             "sum_ttf": 4
           },
           "terms": {
             "leo": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 0,
                   "start_offset": 0,
                   "end_offset": 3
                 }
               ]
             },
             "li": {
               "term_freq": 1,
               "tokens": [
                 {
                   "position": 1,
                   "start_offset": 4,
                   "end_offset": 6
                 }
               ]
             }
           }
         }
       }
     }
   ]
 }