持续创作,加速成长!这是我参与「掘金日新计划 · 10 月更文挑战」的第30天,点击查看活动详情
前言
昨天的文章中,我们希望使用ES数据库中的Suggester API完成搜索推荐或者自动补全的功能。该API共有四种方法:Term Suggester、Phrase Suggester、Completion Suggester、Context Suggester,分别适用于单词纠错、补全;短语纠错、补全;自动补全功能;上下文补全功能。本次使用的是Completion Suggester,该方法能够将数据保存在内存中的FST中,因此性能能够保证,可以满足检索框实时展示补全结果的查询效率。昨天已经完成了索引创建以及简单的数据插入,并测试了结果,能够满足我们的需求;今天将把全部数据插入到ES数据库,并构建索引,完成自动补全接口。
数据插入ES
将之前的漏洞数据库作为seed,插入到ES数据库中,如下:
建立索引:
PUT方法:
http://127.0.0.1:19200/vulnerability_index?pretty=true
body:
{
"settings": {
"number_of_shards": 1
},
"mappings": {
"properties": {
"keywords": {
"type": "completion",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
},
"labels": {
"type": "completion",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
}
}
}
}
#python 插入数据
data2ES.py
import codecs
from elasticsearch import Elasticsearch
def cleanData(tempList):
    """Normalize raw affected-product rows into dictionaries.

    Each row is indexed positionally: elements 0-3 are stored as ``type``,
    ``company``, ``product`` and ``version``; every remaining element is
    treated as version-range text and condensed into ``influence``.

    The range text is joined with spaces, the wording "From", "(including)",
    "(excluding)", "Up to" and every "-" are blanked out, and the surviving
    whitespace-separated tokens are re-joined with "-".

    Args:
        tempList: Iterable of rows; each row is a sequence of strings with
            at least four elements.

    Returns:
        A list with one dict per row, holding the keys ``type``,
        ``company``, ``product``, ``version`` and ``influence``.

    Raises:
        IndexError: If a row has fewer than four elements.
    """
    # Substrings removed from the range text, applied in this order.
    noise = ("From", "(including)", "(excluding)", "Up to", "-")
    returnList = []
    for row in tempList:
        range_text = ' '.join(row[4:])
        for fragment in noise:
            range_text = range_text.replace(fragment, " ")
        returnList.append({
            "type": row[0],
            "company": row[1],
            "product": row[2],
            "version": row[3],
            # split() without arguments never yields empty strings, so the
            # tokens can be joined directly.
            "influence": '-'.join(range_text.split()),
        })
    return returnList
def insert2es():
    """Index the keywords found in ``aliyunSpider.txt`` into Elasticsearch.

    Every non-empty line of the file is evaluated into a dict. For records
    that contain a ``"solution"`` key, the CVE number and each company and
    product name produced by ``cleanData(dic["data"])`` are written to
    ``vulnerability_index`` as ``{"keywords": <value>, "labels": <kind>}``.
    A value is indexed only the first time it is seen, regardless of which
    kind it was first seen as.

    Progress is printed every 1000 lines, followed by the total number of
    lines processed and ``Finish``.
    """
    # NOTE(review): the surrounding walkthrough talks to port 19200 while
    # this client uses 9200 -- confirm which port is correct for your setup.
    es = Elasticsearch("http://127.0.0.1:9200", request_timeout=3600)
    count = 0
    seen = set()

    def index_keyword(keyword, label):
        """Index ``keyword`` under ``label`` unless it was already indexed."""
        if keyword in seen:
            return
        seen.add(keyword)
        data = {"keywords": keyword, "labels": label}
        es.index(index='vulnerability_index', doc_type='_doc', body=data)

    # Stream the file line by line and make sure the handle is closed.
    with codecs.open("aliyunSpider.txt", 'r', 'UTF-8') as source:
        for line in source:
            text = line.strip()
            if not text:
                # A blank line carries no record and cannot be evaluated.
                continue
            count += 1
            if count % 1000 == 0:
                print("处理了 {} 条".format(count))
            # SECURITY NOTE: eval() executes arbitrary code found in the
            # file. If the lines are plain Python literals, switch to
            # ast.literal_eval; the call is kept here so that existing
            # data files keep loading exactly as before.
            dic = eval(text)
            if "solution" not in dic:
                continue
            index_keyword(dic["cve_number"], "cve_number")
            for item in cleanData(dic["data"]):
                index_keyword(item["company"], "company")
                index_keyword(item["product"], "product")
    print(count)
    print("Finish")


if __name__ == '__main__':
    insert2es()
查看数据插入结果
GET方法
http://127.0.0.1:19200/_cat/indices?v
health status index uuid pri rep docs.count docs.deleted store.size pri.store.size
yellow open vulnerability_index _5k5hduWTNeDYIyRgV9Huw 1 1 111450 0 17.1mb 17.1mb
测试原生suggest
http://127.0.0.1:19200/vulnerability_index/_doc/_search?pretty=true
{
"suggest": {
"suggest": {
"text": "face",
"completion": {
"field": "keywords",
"skip_duplicates": true
}
}
}
}
{
"took": 1336,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 0,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"suggest": {
"suggest": [
{
"text": "face",
"offset": 0,
"length": 4,
"options": [
{
"text": "facebook",
"_index": "vulnerability_index",
"_type": "_doc",
"_id": "sPdCJ4QBabWewAf3Fmhn",
"_score": 1.0,
"_source": {
"keywords": "facebook",
"labels": "company"
}
},
{
"text": "facebook-wall-and-social-integration",
"_index": "vulnerability_index",
"_type": "_doc",
"_id": "cvdPJ4QBabWewAf3FLJ0",
"_score": 1.0,
"_source": {
"keywords": "facebook-wall-and-social-integration",
"labels": "product"
}
},
{
"text": "facebook-wall-and-social-integration_project",
"_index": "vulnerability_index",
"_type": "_doc",
"_id": "cfdPJ4QBabWewAf3FLJS",
"_score": 1.0,
"_source": {
"keywords": "facebook-wall-and-social-integration_project",
"labels": "company"
}
},
{
"text": "facebook_for_woocommerce",
"_index": "vulnerability_index",
"_type": "_doc",
"_id": "g_mcJ4QBabWewAf3n1m5",
"_score": 1.0,
"_source": {
"keywords": "facebook_for_woocommerce",
"labels": "product"
}
},
{
"text": "facesentry_access_control_system",
"_index": "vulnerability_index",
"_type": "_doc",
"_id": "sfhzJ4QBabWewAf3JXqq",
"_score": 1.0,
"_source": {
"keywords": "facesentry_access_control_system",
"labels": "product"
}
}
]
}
]
}
}
python flask接口
from Dto.ResultDto import ResultDto
from Utils.enumCode import Code
from Utils.enumMsg import Msg
from Utils.esUtils import esUtils
class suggest:
    """Service object that answers completion queries through ``esUtils``."""

    def __init__(self):
        # Helper that performs the actual suggest call.
        self.es = esUtils()

    def getSuggest(self, index=None, tag=None, query=None, limit=None):
        """Run a suggest query and wrap the outcome in a ``ResultDto``.

        Args:
            index: Forwarded to ``esUtils.suggest`` as ``index``.
            tag: Forwarded to ``esUtils.suggest`` as ``tag``.
            query: Forwarded to ``esUtils.suggest`` as ``query``.
            limit: Forwarded to ``esUtils.suggest`` as ``suggest_size``.

        Returns:
            The ``retrunMsg`` attribute of a ``ResultDto``: a success result
            carrying the suggest data, or a service-error result whose
            message includes the exception text when anything fails.
        """
        try:
            suggestions = self.es.suggest(
                index=index,
                tag=tag,
                query=query,
                suggest_size=limit,
            )
            success = ResultDto(
                code=Code.Success.value,
                msg=Msg.Success.value,
                data=suggestions,
            )
            return success.retrunMsg
        except Exception as e:
            # Any failure is reported to the caller as a service error
            # instead of propagating.
            failure = ResultDto(
                code=Code.ServiceError.value,
                msg=f"{Msg.ServiceError.value}:{e}",
                data="联想词查询失败",
            )
            return failure.retrunMsg
@app.route('/getSuggest', methods=['GET'])
def getSuggest():
    """Handle ``GET /getSuggest`` and return completion suggestions.

    Query parameters:
        labels: Passed to the service as ``index``.
        tag: Passed to the service as ``tag``.
        query: Passed to the service as ``query``.
        limit: Optional integer passed to the service as ``limit``.

    Returns:
        Whatever ``suggest().getSuggest`` returns for these arguments.
    """
    # NOTE(review): the index argument is read from the "labels" parameter;
    # confirm that this naming is intentional.
    index = request.args.get("labels")
    tag = request.args.get("tag")
    query = request.args.get("query")
    # type=int yields None instead of raising when "limit" is missing or not
    # a number; None is also the service method's own default for limit.
    limit = request.args.get("limit", type=int)
    return suggest().getSuggest(index, tag, query, limit)
对后端代码进行重新部署,启动程序并测试: