【大数据】Python封装ES的Suggest接口

357 阅读2分钟

image.png

持续创作,加速成长!这是我参与「掘金日新计划 · 10 月更文挑战」的第30天,点击查看活动详情

前言

昨天的文章中,我们希望使用ES数据库中的Suggester API完成搜索推荐或者自动补全的功能。该API共有四种方法:Term Suggester、Phrase Suggester、Completion Suggester、Context Suggester,分别适用于单词纠错、补全;短语纠错、补全;自动补全功能;上下文补全功能。本次使用的是Completion Suggester,该方法将数据保存在内存中的FST里,因此性能能够保证,可以满足检索框实时展示补全结果的查询效率。昨天已经完成了索引创建以及简单的数据插入,并测试了结果,能够满足我们的需求。今天将把全部数据插入到ES数据库,并构建索引,完成自动补全接口。


数据插入ES

将之前的漏洞数据库作为seed,插入到ES数据库中,如下:

建立索引:

PUT方法:

http://127.0.0.1:19200/vulnerability_index?pretty=true

body:

{
    "settings": {
        "number_of_shards": 1
    },
    "mappings": {
        "properties": {
            "keywords": {
                "type": "completion",
                "analyzer": "ik_max_word",
                "search_analyzer": "ik_smart"
            },
            "labels": {
                "type": "completion",
                "analyzer": "ik_max_word",
                "search_analyzer": "ik_smart"
            }
        }
    }
}

#python 插入数据

data2ES.py

import codecs
from elasticsearch import Elasticsearch


def cleanData(tempList):
    """Convert positional affected-product rows into dictionaries.

    For every row, fields 0-3 are copied to the keys ``type``, ``company``,
    ``product`` and ``version``. All remaining fields are joined with
    spaces, the range markers ("From", "(including)", "(excluding)",
    "Up to") and hyphens are blanked out, and the surviving tokens are
    re-joined with "-" to form ``influence``.

    Args:
        tempList: iterable of indexable rows with at least four string fields.

    Returns:
        A list with one dictionary per input row, in input order.
    """
    # Applied in this exact order, each marker is replaced by a single space.
    markers = ("From", "(including)", "(excluding)", "Up to", "-")
    records = []
    for row in tempList:
        kind = row[0]
        company = row[1]
        product = row[2]
        version = row[3]
        text = ' '.join(row[4:])
        for marker in markers:
            text = text.replace(marker, " ")
        # split() with no argument never yields empty tokens.
        records.append({
            "type": kind,
            "company": company,
            "product": product,
            "version": version,
            "influence": '-'.join(text.split()),
        })
    return records


def insert2es():
    """Index suggest keywords from the crawled vulnerability file.

    Reads ``aliyunSpider.txt`` line by line; each non-blank line is parsed
    as a Python dict literal. For every record that contains a
    ``"solution"`` key, the record's ``cve_number`` and the ``company`` /
    ``product`` of each row returned by ``cleanData(record["data"])`` are
    indexed into ``vulnerability_index`` as
    ``{"keywords": <value>, "labels": <role>}``.

    A keyword is indexed at most once across all roles, so it keeps the
    label of the first role it was seen in. Progress is printed every
    1000 parsed lines and the total number of parsed lines at the end.

    Raises:
        ValueError, SyntaxError: if a line is not a valid Python literal.
    """
    import ast  # function-scope import; only needed to parse input lines

    # NOTE(review): the REST examples in this article target port 19200
    # while this client uses 9200 -- confirm which one the deployment uses.
    es = Elasticsearch("http://127.0.0.1:9200", request_timeout=3600)
    count = 0
    seen = set()

    def index_once(keyword, label):
        # Send a keyword only the first time it appears, whatever its label.
        if keyword in seen:
            return
        seen.add(keyword)
        data = {"keywords": keyword, "labels": label}
        es.index(index='vulnerability_index', doc_type='_doc', body=data)

    # The context manager closes the file even if parsing or indexing fails.
    with codecs.open("aliyunSpider.txt", 'r', 'UTF-8') as source:
        for line in source:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a record; skip it.
                continue
            count += 1
            # SECURITY: the input is crawled data, so it is parsed with
            # ast.literal_eval (literals only) instead of eval, which would
            # execute arbitrary expressions embedded in the file.
            dic = ast.literal_eval(line)
            if "solution" in dic:
                index_once(dic["cve_number"], "cve_number")
                for item in cleanData(dic["data"]):
                    index_once(item["company"], "company")
                    index_once(item["product"], "product")
            if count % 1000 == 0:
                print("处理了 {} 条".format(count))

    print(count)
    print("Finish")



# Script entry point: run the import only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    insert2es()

查看数据插入结果

GET方法

http://127.0.0.1:19200/_cat/indices?v
health status index               uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   vulnerability_index _5k5hduWTNeDYIyRgV9Huw   1   1     111450            0     17.1mb         17.1mb

image.png

测试原生suggest

http://127.0.0.1:19200/vulnerability_index/_doc/_search?pretty=true
{
    "suggest": {
        "suggest": {
            "text": "face",
            "completion": {
                "field": "keywords",
                "skip_duplicates": true
            }
        }
    }
}
{
    "took": 1336,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 0,
            "relation": "eq"
        },
        "max_score": null,
        "hits": []
    },
    "suggest": {
        "suggest": [
            {
                "text": "face",
                "offset": 0,
                "length": 4,
                "options": [
                    {
                        "text": "facebook",
                        "_index": "vulnerability_index",
                        "_type": "_doc",
                        "_id": "sPdCJ4QBabWewAf3Fmhn",
                        "_score": 1.0,
                        "_source": {
                            "keywords": "facebook",
                            "labels": "company"
                        }
                    },
                    {
                        "text": "facebook-wall-and-social-integration",
                        "_index": "vulnerability_index",
                        "_type": "_doc",
                        "_id": "cvdPJ4QBabWewAf3FLJ0",
                        "_score": 1.0,
                        "_source": {
                            "keywords": "facebook-wall-and-social-integration",
                            "labels": "product"
                        }
                    },
                    {
                        "text": "facebook-wall-and-social-integration_project",
                        "_index": "vulnerability_index",
                        "_type": "_doc",
                        "_id": "cfdPJ4QBabWewAf3FLJS",
                        "_score": 1.0,
                        "_source": {
                            "keywords": "facebook-wall-and-social-integration_project",
                            "labels": "company"
                        }
                    },
                    {
                        "text": "facebook_for_woocommerce",
                        "_index": "vulnerability_index",
                        "_type": "_doc",
                        "_id": "g_mcJ4QBabWewAf3n1m5",
                        "_score": 1.0,
                        "_source": {
                            "keywords": "facebook_for_woocommerce",
                            "labels": "product"
                        }
                    },
                    {
                        "text": "facesentry_access_control_system",
                        "_index": "vulnerability_index",
                        "_type": "_doc",
                        "_id": "sfhzJ4QBabWewAf3JXqq",
                        "_score": 1.0,
                        "_source": {
                            "keywords": "facesentry_access_control_system",
                            "labels": "product"
                        }
                    }
                ]
            }
        ]
    }
}

python flask接口

from Dto.ResultDto import ResultDto
from Utils.enumCode import Code
from Utils.enumMsg import Msg
from Utils.esUtils import esUtils


class suggest():
    """Service object that answers suggestion queries through ``esUtils``."""

    def __init__(self):
        """Create the ``esUtils`` helper used for all suggestion lookups."""
        self.es = esUtils()

    def getSuggest(self, index = None, tag=None, query=None, limit=None):
        """Run a suggestion lookup and wrap the outcome in a ``ResultDto``.

        Args:
            index: forwarded to ``esUtils.suggest`` as ``index``.
            tag: forwarded to ``esUtils.suggest`` as ``tag``.
            query: forwarded to ``esUtils.suggest`` as ``query``.
            limit: forwarded to ``esUtils.suggest`` as ``suggest_size``.

        Returns:
            The ``retrunMsg`` of a success ``ResultDto`` carrying the lookup
            result, or of a service-error ``ResultDto`` if anything in the
            lookup or in building the success response raises.
        """
        try:
            found = self.es.suggest(index=index, tag=tag, query=query, suggest_size=limit)
            success = ResultDto(code=Code.Success.value, msg=Msg.Success.value, data=found)
            return success.retrunMsg
        except Exception as e:
            # Any failure is reported to the caller as a service error
            # together with the exception text.
            failure = ResultDto(
                code=Code.ServiceError.value,
                msg=f"{Msg.ServiceError.value}:{e}",
                data="联想词查询失败",
            )
            return failure.retrunMsg
@app.route('/getSuggest', methods=['GET'])
def getSuggest():
    """Handle ``GET /getSuggest`` by delegating to ``suggest().getSuggest``.

    Query-string parameters read: ``labels`` (passed on as the index
    argument), ``tag``, ``query`` and ``limit``.

    Returns:
        Whatever ``suggest().getSuggest`` returns for these arguments.
    """
    index = request.args.get("labels")
    tag = request.args.get("tag")
    query = request.args.get("query")
    # type=int yields None for a missing or non-numeric "limit" instead of
    # letting int() raise and turn the request into a 500.
    # NOTE(review): assumes the service accepts limit=None (it is the
    # default in suggest.getSuggest) -- confirm esUtils.suggest handles it.
    limit = request.args.get("limit", default=None, type=int)
    return suggest().getSuggest(index, tag, query, limit)

重新部署后端代码,启动程序并进行测试:

image.png