ElasticSearch中安装ansj分词,且设置自定义分词器并且每个分词器使用自己定义的词库_基于ansj分词

530 阅读3分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路。

一、ElasticSearch中安装ansj分词

参考资料:github.com/NLPchina/el…

注:在ES中安装ansj分词时,需要选择与ES版本相对应的ansj插件版本,否则可能不兼容。

可以点击如下处下载对应的版本 image.png

image.png 安装ansj分词:

1.进入Elasticsearch目录运行如下命令

./bin/elasticsearch-plugin install https://github.com/NLPchina/elasticsearch-analysis-ansj/releases/download/v5.5.0/elasticsearch-analysis-ansj-5.5.0.0-release.zip

2.如果虚拟机无法联网,可以先下载github.com/NLPchina/el… 然后放在es的plugins目录下解压,解压完成后删除安装包

在ansj分词的源码中找到Library文件夹和ansj_library.properties文件,然后导入到es中,放于同级目录

安装完成后启动es时,控制台可能会报错,提示找不到Library文件夹。此时根据报错提示的路径,将library文件夹放到对应的位置即可。

image.png

image.png

安装ansj分词时,可能出现的错误:

(1)如果es启动报如下错误的时候(莫名其妙突然出现的错误)

image.png  解决方法:在jdk的\jre\lib\security\java.policy文件中添加如下内容:

grant { 

//对系统和用户目录“读”的权限

permission java.util.PropertyPermission "user.dir", "read";

permission java.util.PropertyPermission "user.home", "read";

permission java.util.PropertyPermission "java.home", "read";

permission java.util.PropertyPermission "java.class.path", "read";

//permission java.util.PropertyPermission "user.name", "read"; 

//对线程和线程组的操作权限

permission java.lang.RuntimePermission "modifyThread";

permission java.lang.RuntimePermission "modifyThreadGroup";

//操作Socket端口的各种权限

permission java.net.SocketPermission "-", "listen";

permission java.net.SocketPermission "-", "accept";

permission java.net.SocketPermission "-", "connect";

//读写文件的权限

permission java.io.FilePermission "-", "read";

permission java.io.FilePermission "-", "write";

//退出系统的权限,例如System.exit(0)

permission java.lang.RuntimePermission "exitVM";

permission java.io.FilePermission "<<ALL FILES>>", "read,write";

};

如果配置完仍然出错,请重启虚拟机,然后再次启动es。

(2)如果出现下面错误:

image.png

解决方法:

vi /etc/security/limits.d/90-nproc.conf 
#es error

* soft nofile 65536

* hard nofile 131072

* soft nproc 2048

* hard nproc 4096
vi /etc/sysctl.conf 
vm.max_map_count=655360
sysctl -p

不同版本引发的错误操作:

如果你想把ansj作为你的默认分词需要在elasticsearch.yml加入如下配置:

#默认分词器,索引
index.analysis.analyzer.default.type: index_ansj

#默认分词器,查询
index.analysis.analyzer.default_search.type: query_ansj

这种方法在5.X版本中不适用了,加入之后就会报错

image.png

解决方法:

注释掉主配置文件(elasticsearch.yml)里的上述两项配置,官方建议在创建索引时再设置,如下:

curl -XPUT 'http://192.168.0.111:9200/asdf?pretty' -d '{
    "mappings":{
        "asdf":{
            "properties":{
                "name":{
                    "type":"string",
                    "analyzer":"dic_ansj",
                    "search_analyzer":"query_ansj"
                }
            }
        }
    }
}'

image.png

二、ES中如何设置自定义分词器并且每个分词器使用自己定义的词库?

1.首先在ansj.cfg.yml中配置

image.png

image.png

然后在ansj_library.properties文件中添加词典放置路径,ansj_library.properties和library文件夹放在同一路径下

image.png

image.png

curl -XPUT  'http://localhost:9200/fencitest3?pretty'  -d'
{
	"settings": {
		"analysis": {
			"analyzer": {
				"my_xm_analyzer": {
					"type": "custom",
					"tokenizer": "xm_dic"
				}
			},
			"tokenizer": {
				"xm_dic": {
					"type": "dic_ansj",
					"dic": "dicxm",
					"stop": "stop",
					"ambiguity": "ambiguity",
					"synonyms": "synonyms",
					"isNameRecognition": "true",
					"isNumRecognition": true,
					"isQuantifierRecognition": true,
					"isRealName": false
				}
			}
		}
	},
	"mappings": {
		"fencitest3": {
			"properties": {
				"title": {
					"type": "string",
					"analyzer": "my_xm_analyzer"
				}
			}
		}
	}
}'

curl -XGET 'http://localhost:9200/fencitest3/_analyze?pretty&analyzer=my_xm_analyzer'  -d '网五河是一个名字'

image.png

如果要在一个es中自定义多个分词器应如下

curl -XPUT   'http://localhost:9200/fencitest3?pretty'  -d '{

"settings":{

"analysis": {

"analyzer": {

"my_xm_analyzer": {

"type": "custom", "tokenizer": "xm_dic"

},

"my_bm_analyzer": {

"type": "custom", "tokenizer": "bm_dic"

}


},

"tokenizer": {

"xm_dic": {

"type":"dic_ansj",

"dic": "dicxm",

"stop": "stop",

"ambiguity": "ambiguity",

"synonyms": "synonyms",

"isNameRecognition":"true",

"isNumRecognition": true,

"isQuantifierRecognition": true,

"isRealName": false

},

"bm_dic": {

"type":"dic_ansj",

"dic": "dicbm",

"stop": "stop",

"ambiguity": "ambiguity",

"synonyms": "synonyms",

"isNameRecognition":"true",

"isNumRecognition": true,

"isQuantifierRecognition": true,

"isRealName": false

}

}

}

},

"mappings":{

"fencitest3": {

"properties": {

"title": {"type": "string", "analyzer": "my_xm_analyzer"},

"name": {"type": "string", "analyzer": "my_bm_analyzer"}

}

}

}

}'
curl -XPUT  'http://localhost:9200/fencitest4?pretty'  -d '
{
	"settings": {
		"analysis": {
			"analyzer": {
				"my_xm_analyzer": {
					"type": "custom",
					"tokenizer": "xm_dic"
				},
				"my_bm_analyzer": {
					"type": "custom",
					"tokenizer": "bm_dic"
				}
			},
			"tokenizer": {
				"xm_dic": {
					"type": "dic_ansj",
					"dic": "dicxm",
					"stop": "stop",
					"ambiguity": "ambiguity",
					"synonyms": "synonyms",
					"isNameRecognition": "true",
					"isNumRecognition": true,
					"isQuantifierRecognition": true,
					"isRealName": false
				},
				"bm_dic": {
					"type": "dic_ansj",
					"dic": "dicbm",
					"stop": "stop",
					"ambiguity": "ambiguity",
					"synonyms": "synonyms",
					"isNameRecognition": "true",
					"isNumRecognition": true,
					"isQuantifierRecognition": true,
					"isRealName": false
				}
			}
		}
	},
	"mappings": {
		"fencitest4": {
			"properties": {
				"title": {
					"type": "string",
					"analyzer": "my_xm_analyzer"
				},
				"name": {
					"type": "string",
					"analyzer": "my_bm_analyzer"
				}
			}
		}
	}
}
'