es - elasticsearch的内建分析器

阅读量 86 · 阅读时长约 4 分钟

世界上并没有完美的程序,但是我们并不因此而沮丧,因为写程序就是一个不断追求完美的过程。

es的内建分析器主要有 :

  1. standard
  2. simple
  3. whitespace
  4. stop
  5. keyword
  6. pattern
  7. fingerprint
  8. language

下面具体介绍并展示分词效果 :

# 内建分析器 - standard
# 去除了绝大部分标点符号
# 英文按单词分割并转为小写
# 中文按字分割
POST _analyze
{
  "analyzer": "standard",
  "text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}

# 结果
{
  "tokens" : [
    {
      "token" : "the",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "<ALPHANUM>",
      "position" : 0
    },
    {
      "token" : "2",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "<NUM>",
      "position" : 1
    },
    {
      "token" : "quick",
      "start_offset" : 6,
      "end_offset" : 11,
      "type" : "<ALPHANUM>",
      "position" : 2
    },
    {
      "token" : "brown",
      "start_offset" : 12,
      "end_offset" : 17,
      "type" : "<ALPHANUM>",
      "position" : 3
    },
    {
      "token" : "foxes",
      "start_offset" : 18,
      "end_offset" : 23,
      "type" : "<ALPHANUM>",
      "position" : 4
    },
    {
      "token" : "jumped",
      "start_offset" : 24,
      "end_offset" : 30,
      "type" : "<ALPHANUM>",
      "position" : 5
    },
    {
      "token" : "over",
      "start_offset" : 31,
      "end_offset" : 35,
      "type" : "<ALPHANUM>",
      "position" : 6
    },
    {
      "token" : "the",
      "start_offset" : 36,
      "end_offset" : 39,
      "type" : "<ALPHANUM>",
      "position" : 7
    },
    {
      "token" : "lazy",
      "start_offset" : 40,
      "end_offset" : 44,
      "type" : "<ALPHANUM>",
      "position" : 8
    },
    {
      "token" : "dog's",
      "start_offset" : 45,
      "end_offset" : 50,
      "type" : "<ALPHANUM>",
      "position" : 9
    },
    {
      "token" : "bone",
      "start_offset" : 51,
      "end_offset" : 55,
      "type" : "<ALPHANUM>",
      "position" : 10
    },
    {
      "token" : "我",
      "start_offset" : 57,
      "end_offset" : 58,
      "type" : "<IDEOGRAPHIC>",
      "position" : 11
    },
    {
      "token" : "是",
      "start_offset" : 61,
      "end_offset" : 62,
      "type" : "<IDEOGRAPHIC>",
      "position" : 12
    },
    {
      "token" : "中",
      "start_offset" : 62,
      "end_offset" : 63,
      "type" : "<IDEOGRAPHIC>",
      "position" : 13
    },
    {
      "token" : "国",
      "start_offset" : 63,
      "end_offset" : 64,
      "type" : "<IDEOGRAPHIC>",
      "position" : 14
    },
    {
      "token" : "人",
      "start_offset" : 64,
      "end_offset" : 65,
      "type" : "<IDEOGRAPHIC>",
      "position" : 15
    }
  ]
}
# 内建分析器 - simple
# 按所有非字母字符分割(数字、标点、空格等都作为分隔符并被丢弃)
# 英文单词转为小写
# 中文是字母字符,故只在标点等非字母处被切开
POST _analyze
{
  "analyzer": "simple",
   "text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}

# 结果
{
  "tokens" : [
    {
      "token" : "the",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "quick",
      "start_offset" : 6,
      "end_offset" : 11,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "brown",
      "start_offset" : 12,
      "end_offset" : 17,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "foxes",
      "start_offset" : 18,
      "end_offset" : 23,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "jumped",
      "start_offset" : 24,
      "end_offset" : 30,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "over",
      "start_offset" : 31,
      "end_offset" : 35,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "the",
      "start_offset" : 36,
      "end_offset" : 39,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "lazy",
      "start_offset" : 40,
      "end_offset" : 44,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "dog",
      "start_offset" : 45,
      "end_offset" : 48,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "s",
      "start_offset" : 49,
      "end_offset" : 50,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "bone",
      "start_offset" : 51,
      "end_offset" : 55,
      "type" : "word",
      "position" : 10
    },
    {
      "token" : "我",
      "start_offset" : 57,
      "end_offset" : 58,
      "type" : "word",
      "position" : 11
    },
    {
      "token" : "是中国人",
      "start_offset" : 61,
      "end_offset" : 65,
      "type" : "word",
      "position" : 12
    }
  ]
}
# 内建分析器 - whitespace
# 按任意空白字符(空格、制表符、换行等)分割
# 不做大小写转换,也不去除标点
POST _analyze
{
  "analyzer": "whitespace",
  "text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}

# 结果
{
  "tokens" : [
    {
      "token" : "The",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "2",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "QUICK",
      "start_offset" : 6,
      "end_offset" : 11,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "Brown-Foxes",
      "start_offset" : 12,
      "end_offset" : 23,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "jumped",
      "start_offset" : 24,
      "end_offset" : 30,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "over",
      "start_offset" : 31,
      "end_offset" : 35,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "the",
      "start_offset" : 36,
      "end_offset" : 39,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "lazy",
      "start_offset" : 40,
      "end_offset" : 44,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "dog's",
      "start_offset" : 45,
      "end_offset" : 50,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "bone.",
      "start_offset" : 51,
      "end_offset" : 56,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "我,“”是中国人",
      "start_offset" : 57,
      "end_offset" : 65,
      "type" : "word",
      "position" : 10
    }
  ]
}
# 内建分析器 - stop
# 分词效果与simple类似
# 去除停用词(默认英文)
POST _analyze
{
  "analyzer": "stop",
  "text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}

# 结果
{
  "tokens" : [
    {
      "token" : "quick",
      "start_offset" : 6,
      "end_offset" : 11,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "brown",
      "start_offset" : 12,
      "end_offset" : 17,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "foxes",
      "start_offset" : 18,
      "end_offset" : 23,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "jumped",
      "start_offset" : 24,
      "end_offset" : 30,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "over",
      "start_offset" : 31,
      "end_offset" : 35,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "lazy",
      "start_offset" : 40,
      "end_offset" : 44,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "dog",
      "start_offset" : 45,
      "end_offset" : 48,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "s",
      "start_offset" : 49,
      "end_offset" : 50,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "bone",
      "start_offset" : 51,
      "end_offset" : 55,
      "type" : "word",
      "position" : 10
    },
    {
      "token" : "我",
      "start_offset" : 57,
      "end_offset" : 58,
      "type" : "word",
      "position" : 11
    },
    {
      "token" : "是中国人",
      "start_offset" : 61,
      "end_offset" : 65,
      "type" : "word",
      "position" : 12
    }
  ]
}
# 内建分析器 - keyword
# 不分词,原样返回
POST _analyze
{
  "analyzer": "keyword",
  "text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}

# 结果
{
  "tokens" : [
    {
      "token" : "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
      "start_offset" : 0,
      "end_offset" : 56,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "我,“”是中国人",
      "start_offset" : 57,
      "end_offset" : 65,
      "type" : "word",
      "position" : 1
    }
  ]
}
# 内建分析器 - pattern
# 可以自定义pattern :
#  可配置项 : 
#    pattern 
#    flags 
#    lowercase(默认true) 
#    stopwords 
#    stopwords_path
# 默认按所有非单词字符分割 : \\W+ (即非字母、数字、下划线的字符)
POST _analyze
{
  "analyzer": "pattern",
  "text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}

# 结果
{
  "tokens" : [
    {
      "token" : "the",
      "start_offset" : 0,
      "end_offset" : 3,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "2",
      "start_offset" : 4,
      "end_offset" : 5,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "quick",
      "start_offset" : 6,
      "end_offset" : 11,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "brown",
      "start_offset" : 12,
      "end_offset" : 17,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "foxes",
      "start_offset" : 18,
      "end_offset" : 23,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "jumped",
      "start_offset" : 24,
      "end_offset" : 30,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "over",
      "start_offset" : 31,
      "end_offset" : 35,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "the",
      "start_offset" : 36,
      "end_offset" : 39,
      "type" : "word",
      "position" : 7
    },
    {
      "token" : "lazy",
      "start_offset" : 40,
      "end_offset" : 44,
      "type" : "word",
      "position" : 8
    },
    {
      "token" : "dog",
      "start_offset" : 45,
      "end_offset" : 48,
      "type" : "word",
      "position" : 9
    },
    {
      "token" : "s",
      "start_offset" : 49,
      "end_offset" : 50,
      "type" : "word",
      "position" : 10
    },
    {
      "token" : "bone",
      "start_offset" : 51,
      "end_offset" : 55,
      "type" : "word",
      "position" : 11
    }
  ]
}
# 自定义分析器
# 内建分析器 - pattern
# 自定义配置,使用下划线分割
PUT /test_analyzer
{
  "settings": {
    "analysis": {
      "analyzer": {
        "under_line_analyzer" : {
          "type" : "pattern",
          "pattern" : "_",
          "lowercase" : true
        }
      }
    }
  }
}

POST /test_analyzer/_analyze
{
  "analyzer": "under_line_analyzer",
  "text": ["hello_this_is_my_###_analyzer_我_中国"]
}

# 结果
{
  "tokens" : [
    {
      "token" : "hello",
      "start_offset" : 0,
      "end_offset" : 5,
      "type" : "word",
      "position" : 0
    },
    {
      "token" : "this",
      "start_offset" : 6,
      "end_offset" : 10,
      "type" : "word",
      "position" : 1
    },
    {
      "token" : "is",
      "start_offset" : 11,
      "end_offset" : 13,
      "type" : "word",
      "position" : 2
    },
    {
      "token" : "my",
      "start_offset" : 14,
      "end_offset" : 16,
      "type" : "word",
      "position" : 3
    },
    {
      "token" : "###",
      "start_offset" : 17,
      "end_offset" : 20,
      "type" : "word",
      "position" : 4
    },
    {
      "token" : "analyzer",
      "start_offset" : 21,
      "end_offset" : 29,
      "type" : "word",
      "position" : 5
    },
    {
      "token" : "我",
      "start_offset" : 30,
      "end_offset" : 31,
      "type" : "word",
      "position" : 6
    },
    {
      "token" : "中国",
      "start_offset" : 32,
      "end_offset" : 34,
      "type" : "word",
      "position" : 7
    }
  ]
}
# 内建分析器 - fingerprint
# 将分词结果转小写、排序、去重后拼接为单个 token
# 常用于重复检测与辅助聚类
POST _analyze
{
  "analyzer": "fingerprint",
  "text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}

# 结果
{
  "tokens" : [
    {
      "token" : "2 bone brown dog's foxes jumped lazy over quick the",
      "start_offset" : 0,
      "end_offset" : 56,
      "type" : "fingerprint",
      "position" : 0
    },
    {
      "token" : "中 人 国 我 是",
      "start_offset" : 57,
      "end_offset" : 65,
      "type" : "fingerprint",
      "position" : 2
    }
  ]
}
# 内建分析器 - 特定语言的分析器 根据需要自己配置