There is no perfect program in the world, but that does not discourage us: writing programs is a never-ending pursuit of perfection.
The main built-in analyzers in Elasticsearch are:
- standard
- simple
- whitespace
- stop
- keyword
- pattern
- fingerprint
- language
Each one is introduced below, together with its tokenization output:
# Built-in analyzer - standard
# Removes most punctuation
# Splits English text by word and lowercases it
# Splits Chinese text character by character
POST _analyze
{
"analyzer": "standard",
"text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}
# Result
{
"tokens" : [
{
"token" : "the",
"start_offset" : 0,
"end_offset" : 3,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "2",
"start_offset" : 4,
"end_offset" : 5,
"type" : "<NUM>",
"position" : 1
},
{
"token" : "quick",
"start_offset" : 6,
"end_offset" : 11,
"type" : "<ALPHANUM>",
"position" : 2
},
{
"token" : "brown",
"start_offset" : 12,
"end_offset" : 17,
"type" : "<ALPHANUM>",
"position" : 3
},
{
"token" : "foxes",
"start_offset" : 18,
"end_offset" : 23,
"type" : "<ALPHANUM>",
"position" : 4
},
{
"token" : "jumped",
"start_offset" : 24,
"end_offset" : 30,
"type" : "<ALPHANUM>",
"position" : 5
},
{
"token" : "over",
"start_offset" : 31,
"end_offset" : 35,
"type" : "<ALPHANUM>",
"position" : 6
},
{
"token" : "the",
"start_offset" : 36,
"end_offset" : 39,
"type" : "<ALPHANUM>",
"position" : 7
},
{
"token" : "lazy",
"start_offset" : 40,
"end_offset" : 44,
"type" : "<ALPHANUM>",
"position" : 8
},
{
"token" : "dog's",
"start_offset" : 45,
"end_offset" : 50,
"type" : "<ALPHANUM>",
"position" : 9
},
{
"token" : "bone",
"start_offset" : 51,
"end_offset" : 55,
"type" : "<ALPHANUM>",
"position" : 10
},
{
"token" : "我",
"start_offset" : 57,
"end_offset" : 58,
"type" : "<IDEOGRAPHIC>",
"position" : 11
},
{
"token" : "是",
"start_offset" : 61,
"end_offset" : 62,
"type" : "<IDEOGRAPHIC>",
"position" : 12
},
{
"token" : "中",
"start_offset" : 62,
"end_offset" : 63,
"type" : "<IDEOGRAPHIC>",
"position" : 13
},
{
"token" : "国",
"start_offset" : 63,
"end_offset" : 64,
"type" : "<IDEOGRAPHIC>",
"position" : 14
},
{
"token" : "人",
"start_offset" : 64,
"end_offset" : 65,
"type" : "<IDEOGRAPHIC>",
"position" : 15
}
]
}
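# The standard analyzer is also configurable (max_token_length, stopwords, stopwords_path).
# A minimal sketch; the index and analyzer names below are made up for illustration:
PUT /test_standard
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_standard_analyzer": {
          "type": "standard",
          "max_token_length": 5,
          "stopwords": "_english_"
        }
      }
    }
  }
}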
# Built-in analyzer - simple
# Removes all non-letter characters (note that digits are dropped too)
# Splits English text on any non-letter character and lowercases it
# Splits Chinese text on punctuation (CJK characters count as letters, so runs of them stay together)
POST _analyze
{
"analyzer": "simple",
"text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}
# Result
{
"tokens" : [
{
"token" : "the",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 0
},
{
"token" : "quick",
"start_offset" : 6,
"end_offset" : 11,
"type" : "word",
"position" : 1
},
{
"token" : "brown",
"start_offset" : 12,
"end_offset" : 17,
"type" : "word",
"position" : 2
},
{
"token" : "foxes",
"start_offset" : 18,
"end_offset" : 23,
"type" : "word",
"position" : 3
},
{
"token" : "jumped",
"start_offset" : 24,
"end_offset" : 30,
"type" : "word",
"position" : 4
},
{
"token" : "over",
"start_offset" : 31,
"end_offset" : 35,
"type" : "word",
"position" : 5
},
{
"token" : "the",
"start_offset" : 36,
"end_offset" : 39,
"type" : "word",
"position" : 6
},
{
"token" : "lazy",
"start_offset" : 40,
"end_offset" : 44,
"type" : "word",
"position" : 7
},
{
"token" : "dog",
"start_offset" : 45,
"end_offset" : 48,
"type" : "word",
"position" : 8
},
{
"token" : "s",
"start_offset" : 49,
"end_offset" : 50,
"type" : "word",
"position" : 9
},
{
"token" : "bone",
"start_offset" : 51,
"end_offset" : 55,
"type" : "word",
"position" : 10
},
{
"token" : "我",
"start_offset" : 57,
"end_offset" : 58,
"type" : "word",
"position" : 11
},
{
"token" : "是中国人",
"start_offset" : 61,
"end_offset" : 65,
"type" : "word",
"position" : 12
}
]
}
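# simple is equivalent to the lowercase tokenizer with no token filters.
# A sketch rebuilding it as a custom analyzer; the names below are made up:
PUT /test_simple_rebuilt
{
  "settings": {
    "analysis": {
      "analyzer": {
        "rebuilt_simple": {
          "tokenizer": "lowercase",
          "filter": []
        }
      }
    }
  }
}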
# Built-in analyzer - whitespace
# Splits on whitespace only
# Does not change case
POST _analyze
{
"analyzer": "whitespace",
"text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}
# Result
{
"tokens" : [
{
"token" : "The",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 0
},
{
"token" : "2",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 1
},
{
"token" : "QUICK",
"start_offset" : 6,
"end_offset" : 11,
"type" : "word",
"position" : 2
},
{
"token" : "Brown-Foxes",
"start_offset" : 12,
"end_offset" : 23,
"type" : "word",
"position" : 3
},
{
"token" : "jumped",
"start_offset" : 24,
"end_offset" : 30,
"type" : "word",
"position" : 4
},
{
"token" : "over",
"start_offset" : 31,
"end_offset" : 35,
"type" : "word",
"position" : 5
},
{
"token" : "the",
"start_offset" : 36,
"end_offset" : 39,
"type" : "word",
"position" : 6
},
{
"token" : "lazy",
"start_offset" : 40,
"end_offset" : 44,
"type" : "word",
"position" : 7
},
{
"token" : "dog's",
"start_offset" : 45,
"end_offset" : 50,
"type" : "word",
"position" : 8
},
{
"token" : "bone.",
"start_offset" : 51,
"end_offset" : 56,
"type" : "word",
"position" : 9
},
{
"token" : "我,“”是中国人",
"start_offset" : 57,
"end_offset" : 65,
"type" : "word",
"position" : 10
}
]
}
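# An analyzer takes effect at index time once it is bound to a text field in the mapping.
# A sketch; the index and field names below are made up:
PUT /test_whitespace_mapping
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "whitespace"
      }
    }
  }
}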
# Built-in analyzer - stop
# Tokenizes like the simple analyzer
# but also removes stop words (English by default)
POST _analyze
{
"analyzer": "stop",
"text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}
# Result
{
"tokens" : [
{
"token" : "quick",
"start_offset" : 6,
"end_offset" : 11,
"type" : "word",
"position" : 1
},
{
"token" : "brown",
"start_offset" : 12,
"end_offset" : 17,
"type" : "word",
"position" : 2
},
{
"token" : "foxes",
"start_offset" : 18,
"end_offset" : 23,
"type" : "word",
"position" : 3
},
{
"token" : "jumped",
"start_offset" : 24,
"end_offset" : 30,
"type" : "word",
"position" : 4
},
{
"token" : "over",
"start_offset" : 31,
"end_offset" : 35,
"type" : "word",
"position" : 5
},
{
"token" : "lazy",
"start_offset" : 40,
"end_offset" : 44,
"type" : "word",
"position" : 7
},
{
"token" : "dog",
"start_offset" : 45,
"end_offset" : 48,
"type" : "word",
"position" : 8
},
{
"token" : "s",
"start_offset" : 49,
"end_offset" : 50,
"type" : "word",
"position" : 9
},
{
"token" : "bone",
"start_offset" : 51,
"end_offset" : 55,
"type" : "word",
"position" : 10
},
{
"token" : "我",
"start_offset" : 57,
"end_offset" : 58,
"type" : "word",
"position" : 11
},
{
"token" : "是中国人",
"start_offset" : 61,
"end_offset" : 65,
"type" : "word",
"position" : 12
}
]
}
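# The stop analyzer accepts a custom list via stopwords (or a file via stopwords_path).
# A sketch; the index, analyzer name, and word list below are made up:
PUT /test_stop
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_stop_analyzer": {
          "type": "stop",
          "stopwords": ["the", "over", "lazy"]
        }
      }
    }
  }
}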
# Built-in analyzer - keyword
# No tokenization; each input is returned unchanged as a single token
POST _analyze
{
"analyzer": "keyword",
"text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}
# Result
{
"tokens" : [
{
"token" : "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
"start_offset" : 0,
"end_offset" : 56,
"type" : "word",
"position" : 0
},
{
"token" : "我,“”是中国人",
"start_offset" : 57,
"end_offset" : 65,
"type" : "word",
"position" : 1
}
]
}
# Built-in analyzer - pattern
# The splitting pattern is configurable
# Configurable options:
# pattern
# flags
# lowercase (default: true)
# stopwords
# stopwords_path
# By default it splits on every non-word character (\W+), so CJK text is dropped entirely
POST _analyze
{
"analyzer": "pattern",
"text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}
# Result
{
"tokens" : [
{
"token" : "the",
"start_offset" : 0,
"end_offset" : 3,
"type" : "word",
"position" : 0
},
{
"token" : "2",
"start_offset" : 4,
"end_offset" : 5,
"type" : "word",
"position" : 1
},
{
"token" : "quick",
"start_offset" : 6,
"end_offset" : 11,
"type" : "word",
"position" : 2
},
{
"token" : "brown",
"start_offset" : 12,
"end_offset" : 17,
"type" : "word",
"position" : 3
},
{
"token" : "foxes",
"start_offset" : 18,
"end_offset" : 23,
"type" : "word",
"position" : 4
},
{
"token" : "jumped",
"start_offset" : 24,
"end_offset" : 30,
"type" : "word",
"position" : 5
},
{
"token" : "over",
"start_offset" : 31,
"end_offset" : 35,
"type" : "word",
"position" : 6
},
{
"token" : "the",
"start_offset" : 36,
"end_offset" : 39,
"type" : "word",
"position" : 7
},
{
"token" : "lazy",
"start_offset" : 40,
"end_offset" : 44,
"type" : "word",
"position" : 8
},
{
"token" : "dog",
"start_offset" : 45,
"end_offset" : 48,
"type" : "word",
"position" : 9
},
{
"token" : "s",
"start_offset" : 49,
"end_offset" : 50,
"type" : "word",
"position" : 10
},
{
"token" : "bone",
"start_offset" : 51,
"end_offset" : 55,
"type" : "word",
"position" : 11
}
]
}
# Custom analyzer
# based on the built-in pattern analyzer
# Custom configuration: split on underscores
PUT /test_analyzer
{
"settings": {
"analysis": {
"analyzer": {
"under_line_analyzer" : {
"type" : "pattern",
"pattern" : "_",
"lowercase" : true
}
}
}
}
}
POST /test_analyzer/_analyze
{
"analyzer": "under_line_analyzer",
"text": ["hello_this_is_my_###_analyzer_我_中国"]
}
# Result
{
"tokens" : [
{
"token" : "hello",
"start_offset" : 0,
"end_offset" : 5,
"type" : "word",
"position" : 0
},
{
"token" : "this",
"start_offset" : 6,
"end_offset" : 10,
"type" : "word",
"position" : 1
},
{
"token" : "is",
"start_offset" : 11,
"end_offset" : 13,
"type" : "word",
"position" : 2
},
{
"token" : "my",
"start_offset" : 14,
"end_offset" : 16,
"type" : "word",
"position" : 3
},
{
"token" : "###",
"start_offset" : 17,
"end_offset" : 20,
"type" : "word",
"position" : 4
},
{
"token" : "analyzer",
"start_offset" : 21,
"end_offset" : 29,
"type" : "word",
"position" : 5
},
{
"token" : "我",
"start_offset" : 30,
"end_offset" : 31,
"type" : "word",
"position" : 6
},
{
"token" : "中国",
"start_offset" : 32,
"end_offset" : 34,
"type" : "word",
"position" : 7
}
]
}
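# pattern can also split on other delimiters; flags takes pipe-separated Java regex flags
# (e.g. "CASE_INSENSITIVE|COMMENTS"). A sketch splitting on commas; the names below are made up:
PUT /test_pattern_csv
{
  "settings": {
    "analysis": {
      "analyzer": {
        "csv_analyzer": {
          "type": "pattern",
          "pattern": ",",
          "lowercase": true
        }
      }
    }
  }
}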
# Built-in analyzer - fingerprint
# Used to assist clustering: it lowercases, sorts, and deduplicates the tokens, then joins them into a single token
POST _analyze
{
"analyzer": "fingerprint",
"text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.", "我,“”是中国人"]
}
# Result
{
"tokens" : [
{
"token" : "2 bone brown dog's foxes jumped lazy over quick the",
"start_offset" : 0,
"end_offset" : 56,
"type" : "fingerprint",
"position" : 0
},
{
"token" : "中 人 国 我 是",
"start_offset" : 57,
"end_offset" : 65,
"type" : "fingerprint",
"position" : 2
}
]
}
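# fingerprint is configurable too (separator, max_output_size, stopwords, stopwords_path).
# A sketch that also strips English stop words; the names below are made up:
PUT /test_fingerprint
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_fingerprint_analyzer": {
          "type": "fingerprint",
          "stopwords": "_english_"
        }
      }
    }
  }
}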
# Built-in analyzers - language-specific analyzers (english, french, etc.); configure them as needed
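# For example, the built-in english analyzer removes English stop words and stems tokens
# ("jumped" should become "jump", "foxes" should become "fox") - expected behavior, output not verified:
POST _analyze
{
  "analyzer": "english",
  "text": ["The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."]
}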