springboot 2.x集成elasticSearch

157 阅读29分钟

依赖

<dependency>
  <groupId>org.elasticsearch.client</groupId>
  <artifactId>elasticsearch-rest-client</artifactId>
  <version>7.17.28</version>
</dependency>

    <dependency>
      <groupId>co.elastic.clients</groupId>
      <artifactId>elasticsearch-java</artifactId>
      <version>7.17.28</version>
      <exclusions>
        <exclusion>
          <groupId>org.elasticsearch.client</groupId>
          <artifactId>elasticsearch-rest-client</artifactId>
        </exclusion>
      </exclusions>
    </dependency>

    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>2.17.0</version>
    </dependency>
    <dependency>
      <groupId>jakarta.json</groupId>
      <artifactId>jakarta.json-api</artifactId>
      <version>2.0.1</version>
    </dependency>

工具类

package org.example.esbak;

import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Configuration;

/**
 * Holds Elasticsearch connection settings injected from application properties.
 * Values are exposed as public static fields so non-Spring-managed code
 * (e.g. the client factory) can read them.
 *
 * NOTE(review): the static fields are only populated once Spring has
 * initialized this bean — confirm no caller reads them before that.
 */
@Configuration
public class EsConfig {

    // host:port style URL, e.g. "localhost:9200"
    public static String url;

    // Base64-encoded "apiKeyId:apiKeySecret" used for ApiKey auth
    public static String apiKeyValue;

    // Credentials for basic auth
    public static String userName;

    public static String password;

    public static String ip;

    public static Integer port;


    @Value("${elasticsearch.url:localhost:9200}")
    public void setUrl(String url) {
        // Assign via the class name: the field is static, `this.` is misleading here.
        EsConfig.url = url;
    }


    @Value("${elasticsearch.apiKeyValue:aWxkbFVwWUItaVZFbVpXZEJTc0c6U1dnQ0J2OFdULW1CcVE4YTAxN3BvQQ==}")
    public void setApiKey(String apiKeyValue) {
        EsConfig.apiKeyValue = apiKeyValue;
    }


    @Value("${elasticsearch.username:elastic}")
    public void setUserName(String userName) {
        EsConfig.userName = userName;
    }

    @Value("${elasticsearch.password:123456}")
    public void setPassword(String password) {
        EsConfig.password = password;
    }

    @Value("${elasticsearch.ip:localhost}")
    public void setIp(String ip) {
        EsConfig.ip = ip;
    }

    @Value("${elasticsearch.port:9200}")
    public void setPort(Integer port) {
        EsConfig.port = port;
    }
}
package org.example.esbak;

import co.elastic.clients.elasticsearch.ElasticsearchClient;
import co.elastic.clients.json.jackson.JacksonJsonpMapper;
import co.elastic.clients.transport.ElasticsearchTransport;
import co.elastic.clients.transport.rest_client.RestClientTransport;

import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.message.BasicHeader;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestClientBuilder;
import org.springframework.stereotype.Component;

/**
 * Lazily builds a shared {@link ElasticsearchClient} singleton, either with
 * basic (username/password) auth or with an ApiKey header. Both factory
 * methods populate the SAME static field, so whichever is called first
 * determines the authentication mode for the whole process.
 */
@Component
public class Es7Client {

    // Original code referenced an undeclared `log`; declare a JDK logger so the class compiles.
    private static final Logger LOG = Logger.getLogger(Es7Client.class.getName());

    // volatile is required for correct double-checked locking.
    private static volatile ElasticsearchClient restClient = null;

    /**
     * Returns the singleton client, creating it on first use with
     * username/password credentials taken from {@link EsConfig}.
     */
    public ElasticsearchClient getInstance() {
        if (restClient == null) {
            // Lock on the class: the guarded field is static, so locking on `this`
            // would not protect it across multiple bean instances.
            synchronized (Es7Client.class) {
                if (restClient == null) { // re-check inside the lock (double-checked locking)
                    final CredentialsProvider credentialsProvider =
                            new BasicCredentialsProvider();
                    credentialsProvider.setCredentials(AuthScope.ANY,
                            new UsernamePasswordCredentials(EsConfig.userName, EsConfig.password));

                    RestClientBuilder builder = RestClient.builder(
                                    new HttpHost(EsConfig.ip, EsConfig.port))
                            .setHttpClientConfigCallback(httpClientBuilder -> httpClientBuilder
                                    .setDefaultCredentialsProvider(credentialsProvider));
                    ElasticsearchTransport transport = new RestClientTransport(
                            builder.build(), new JacksonJsonpMapper());
                    restClient = new ElasticsearchClient(transport);
                }
            }
        }
        return restClient;
    }

    /**
     * Returns the singleton client, creating it on first use with an
     * "Authorization: ApiKey ..." default header. The key value is the
     * Base64 encoding of "apiKeyId:apiKeySecret".
     */
    public ElasticsearchClient getRestClientByApiKey() {
        if (restClient == null) {
            synchronized (Es7Client.class) {
                if (restClient == null) { // re-check inside the lock
                    RestClientBuilder builder = RestClient.builder(
                            new HttpHost(EsConfig.ip, EsConfig.port, "http"));
                    Header[] defaultHeaders =
                            new Header[]{new BasicHeader("Authorization",
                                    "ApiKey " + EsConfig.apiKeyValue)};
                    builder.setDefaultHeaders(defaultHeaders);
                    ElasticsearchTransport transport = new RestClientTransport(
                            builder.build(), new JacksonJsonpMapper());
                    restClient = new ElasticsearchClient(transport);
                }
            }
        }
        return restClient;
    }

    /**
     * Closes the shared client if one was created.
     * NOTE(review): with elasticsearch-java it is usually the transport that
     * is closed — confirm the client version exposes close() as intended.
     */
    public static void closeEsClient() {
        try {
            if (restClient != null) {
                restClient.close();
                restClient = null; // allow recreation after an explicit close
            }
        } catch (Exception e) {
            LOG.log(Level.SEVERE, "关闭失败:", e);
        }
    }

}

创建索引

PUT /index_title
{
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 1   
  },
  "mappings": {
    "properties": {
      "title": { "type": "keyword" },      
      "price": { "type": "float" },    
      "create_time": { 
        "type": "date", 
        "format": "yyyy-MM-dd HH:mm:ss" 
      },
	  "note":{"type":"text"}
    }
  }
}
// Creates the "index_user_info" index with 1 shard / 3 replicas and an explicit
// mapping: id (text), name (text with the ik_max_word analyzer), height (double),
// createTime (date formatted yyyy-MM-dd). Uses the ApiKey-authenticated client.
@PostMapping(value = "/esTest")
public void esTest() {

    ElasticsearchClient instance = esClient.getRestClientByApiKey();
    try {
        // Fluent builder of elasticsearch-java: index name, then settings, then mappings.
        CreateIndexResponse createIndexResponse = instance.indices().create(c ->
                c.index("index_user_info")
                        .settings(s -> s.numberOfShards("1").numberOfReplicas("3"))
                        .mappings(m -> m.properties("id", p -> p.text(d -> d))
                                .properties("name", p -> p.text(t -> t.analyzer("ik_max_word")))
                                .properties("height", p -> p.double_(d -> d))
                                .properties("createTime", p -> p.date(d -> d.format("yyyy-MM-dd")))
                        )
        );
        // acknowledged == true means the cluster accepted the index creation request.
        boolean acknowledged = createIndexResponse.acknowledged();
        log.info("创建index_user_info索引返回结果:{}",acknowledged);
    } catch (IOException e) {
        // Network/serialization failure talking to Elasticsearch — surface as unchecked.
        throw new RuntimeException(e);
    }


}
DELETE /your_index_name

删除索引

/**
 * Deletes the given index, but only if it currently exists.
 */
@PostMapping(value = "/deleteIndex/{indexId}")
public void deleteIndex(@PathVariable(value = "indexId")String indexId) {
    try {
        // Probe for existence first so we never issue a delete against a missing index.
        BooleanResponse existsResponse =
                esClient.getInstance().indices().exists(ExistsRequest.of(r -> r.index(indexId)));
        if (!existsResponse.value()) {
            return;
        }
        // Index is present — remove it.
        DeleteIndexResponse response = esClient.getInstance()
                .indices()
                .delete(DeleteIndexRequest.of(r -> r.index(indexId)));
        if (response.acknowledged()) {
            log.info("删除索引成功{}",indexId);
        }
    } catch (IOException e) {
        log.error("删除索引{}失败",indexId);
        throw new RuntimeException(e);
    }

}

推送数据

PUT /index_title/_doc/1
{
  "title": "John Ming",
  "price": 60,
  "create_time": "2025-05-14 10:00:00",
  "note": "备注1"
}
GET /index_user_info/_search
{
  "query": {
    "match_all": {}  
  },
  "size": 10         
}
{
  "took" : 858,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "03aNcZYB0cCRs9DKk9c0",
        "_score" : 1.0,
        "_source" : {
          "name" : "张名",
          "id" : "1",
          "height" : 172.1,
          "createTime" : "2025-04-26"
        }
      },
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "1HaNcZYB0cCRs9DKk9c0",
        "_score" : 1.0,
        "_source" : {
          "name" : "刘高",
          "id" : "2",
          "height" : 175.1,
          "createTime" : "2025-04-26"
        }
      },
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "1XaNcZYB0cCRs9DKk9c0",
        "_score" : 1.0,
        "_source" : {
          "name" : "陈古",
          "id" : "3",
          "height" : 180.1,
          "createTime" : "2025-04-26"
        }
      }
    ]
  }
}

POST /_bulk
{ "index": { "_index": "index_title", "_id": "4" }}
{ "title": "Data Analytics", "price": 59.99 , "create_time": "2025-05-14 10:00:00","note": "备注1"}
{ "index": { "_index": "index_title", "_id": "5" }}
{ "title": "Data AB", "price": 69.99 , "create_time": "2025-05-14 10:00:00", "note": "备注1"}
{ "index": { "_index": "index_title", "_id": "6" }}
{ "title": "Data AC", "price": 80.99 , "create_time": "2025-05-14 10:00:00","note": "备注1"}
{ "index": { "_index": "index_title", "_id": "7" }}
{ "title": "Data AD", "price": 90.99 , "create_time": "2025-05-14 10:00:00","note": "备注1" }
/**
 * Pushes three sample UserInfo documents into the given index via the bulk API.
 * Bulk responses can partially fail, so each failed item is logged individually
 * (consistent with batchDeleteData).
 */
@PostMapping(value = "/pushData/{indexId}")
public void pushData(@PathVariable(value = "indexId")String indexId) {
    // 模拟数据
    DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd");
    String dateStr = formatter.format(LocalDateTime.now());
    List<UserInfo> userInfoList = new ArrayList<>();
    userInfoList.add(new UserInfo().setId("1").setName("张名").setHeight(172.1).setCreateTime(dateStr));
    userInfoList.add(new UserInfo().setId("2").setName("刘高").setHeight(175.1).setCreateTime(dateStr));
    userInfoList.add(new UserInfo().setId("3").setName("陈古").setHeight(180.1).setCreateTime(dateStr));

    // One index operation per document; ES assigns the _id automatically.
    BulkRequest.Builder builder = new BulkRequest.Builder();
    for (UserInfo userInfo : userInfoList) {
        builder.operations(op -> op.index(idx -> idx.index(indexId).document(userInfo)));
    }
    try {
        BulkResponse bulk = esClient.getInstance().bulk(builder.build());
        if (bulk.errors()) {
            log.error("推送失败{}",  JSON.toJSONString(bulk));
            // Log each failed item so the root cause is visible, not just the whole response.
            bulk.items().forEach(item -> {
                if (item.error() != null) {
                    log.error("推送失败 {} {}", item.id(), item.error().reason());
                }
            });
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

删除数据

  • 已知_id的情况下
DELETE /索引名称/_doc/文档ID
DELETE /index_title/_doc/1
  • 先查询,再删除
  POST /index_title/_delete_by_query
{
  "query" :{
    "term" :{
      "title": "John Do"
    }
  }
}
  • 批量删除
  POST /_bulk
{ "delete": { "_index": "index_title", "_id": "8" }}
{ "delete": { "_index": "index_title", "_id": "9" }}
// 删除数据
/**
 * Demonstrates two deletion styles: first deletes the document whose _id is "1",
 * then deletes all documents whose "id" field equals "1" via delete-by-query.
 */
@PostMapping(value = "/deleteData/{indexId}")
public void  deleteData(@PathVariable(value = "indexId")String indexId) {

    // id为docId:按文档 _id 精确删除
    DeleteRequest deleteRequest = DeleteRequest.of(dr -> dr.index(indexId).id("1"));
    try {
        DeleteResponse delete = esClient.getInstance().
                delete(deleteRequest);
        if (delete != null) {
            // result() tells whether the document was actually Deleted or NotFound.
            log.info("删除结果 {}", delete.result());
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // 先查询,再删除:按字段条件删除
    DeleteByQueryRequest request = DeleteByQueryRequest.of(b -> b
            .index(indexId)
            .query(q -> q
                    .term(t -> t
                            .field("id")
                            .value(v -> v.stringValue("1"))
                    )
            )
    );

    try {
        DeleteByQueryResponse deleteByQueryResponse = esClient.getInstance().deleteByQuery(request);
        Long deleted = deleteByQueryResponse.deleted();
        log.info("删除的个数{}",deleted);
    } catch (Exception e) {
        // 原代码静默吞掉异常;至少记录下来,避免排障困难
        log.error("deleteByQuery失败 index={}", indexId, e);
    }
}

批量删除

// 批量删除
/**
 * Demonstrates batch deletion: first removes documents by _id via the bulk API
 * (logging any per-item failure), then removes documents whose "id" field is in
 * the same list via delete-by-query.
 */
@PostMapping(value = "/batchDeleteData/{indexId}")
public void  batchDeleteData(@PathVariable(value = "indexId")String indexId) {

    // 文档ID删除

    List<String> ids = new ArrayList<>();
    ids.add("4");
    ids.add("5");
    BulkRequest.Builder builder = new BulkRequest.Builder();
    for (String a : ids) {
        // id为文档的_id
        builder.operations(op -> op.delete(idx -> idx.index(indexId).id(a)));
    }
    // 执行批量操作
    try {
        BulkResponse bulk = esClient.getInstance().bulk(builder.build());

        // 异常处理:bulk 可能部分失败,逐项检查
        if (bulk.errors()) {
            bulk.items().forEach(e -> {
                if (e.error() != null) {
                    log.error("删除失败 {} {}", e.id(), e.error().reason());
                }
            });
        }

    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    // 先查询,再删除:字段 id 属于 ids 列表即匹配
    DeleteByQueryRequest request = DeleteByQueryRequest.of(b -> b
            .index(indexId)
            .query(q -> q
                    .terms(t -> t
                            .field("id")
                            .terms(ts -> ts
                                    .value(ids.stream()
                                            .map(FieldValue::of).collect(Collectors.toList()))
                            )
                    )
            )
    );
    try {
        DeleteByQueryResponse deleteByQueryResponse = esClient.getInstance().deleteByQuery(request);
        Long deleted = deleteByQueryResponse.deleted();
        log.info("删除的个数{}", deleted);
    } catch (Exception e) {
        // 原代码静默吞掉异常;至少记录下来,避免排障困难
        log.error("deleteByQuery失败 index={}", indexId, e);
    }
}

查询数据

  • 普通查询: Term(精确匹配)、Match(全文匹配)、Range(范围查询)等,通常仅支持单一条件查询
  • BoolQuery查询:通过 must(AND)、should(OR)、must_not(NOT)、filter(过滤,不计算相关性得分)等子句组合多个查询逻辑
  • 区别:Match Query 仅支持分词后的全文检索,Term Query 仅支持未分词的精确匹配,‌BoolQuery‌ 可以混合不同字段、不同查询类型的条件,结合 term(精确匹配 keyword 字段)与 match_phrase(短语匹配 text 字段),在 filter 子句中使用 range 过滤数值或日期范围

分页查询的方式

  • page/size 分页:这是较常用的一种分页方式, 使用简单,page指定页号, size指定返回的文档数, 但有记录数10000的限制(from+size<=10000), 且当越往后翻页时,性能越差,即不适合进行深分页
  • search after 分页: search after 利用游标来帮我们解决实时滚动的问题。 搜索时需要指定排序,并且保证排序是唯一的, 第一排序不唯一时,可使用多排序, 以消除排序不稳定的现象, 若业务字段无法保证排序唯一性, 可以通过最后加入 _id 来保证排序唯一性。 由于是滚动分页,因此不支持跳页
  • scroll 分页: 这也是滚动分页的一种方式,类似search after,但这种分页方式在分页遍历过程中, 新增的数据不会插进结果集中, scroll 分页最大的缺点是不支持高并发场景, 仅适用于管理后台数据导出等少量人员操作的场景,返回值中有scroll_id是有时效的, scroll=1m表示生成后1分钟内有效

查询语法介绍

match语法

GET /index_user_info/_search
{
  "query": {
    "match": {"name":"陈古"}  
  }
}

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 3,
      "relation" : "eq"
    },
    "max_score" : 2.0572429,
    "hits" : [
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "1XaNcZYB0cCRs9DKk9c0",
        "_score" : 2.0572429,
        "_source" : {
          "name" : "陈古",
          "id" : "3",
          "height" : 180.1,
          "createTime" : "2025-04-26"
        }
      },
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "9clkx5YBNRvU4PL57GzP",
        "_score" : 1.7460635,
        "_source" : {
          "name" : "陈古51",
          "id" : "9",
          "height" : 180.1,
          "createTime" : "2025-05-13"
        }
      },
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "grnOw5YBwm5zQPEO_DwH",
        "_score" : 1.7460635,
        "_source" : {
          "name" : "陈古5",
          "id" : "6",
          "height" : 180.1,
          "createTime" : "2025-05-12"
        }
      }
    ]
  }
}
  • match:基础的全文检索, 对查询文本进行分词后匹配 分词后默认使用 OR 逻辑匹配任意词项(可通过 operator 设置为 AND) 不关注词项顺序和位置
GET /index_user_info/_search
{
  "query": {
    "match": {"name":"陈古"}  
  }
}

  • match_all:匹配所有文档,无查询条件,通常结合过滤或分页使用
GET /index_user_info/_search
{
  "query": {
    "match_all": {}  
  }
}

  • match_phrase:精确短语匹配,要求词项顺序一致且位置相邻, 支持 slop 参数,允许词项间有一定间隔(默认 slop=0)
GET /index_user_info/_search
{
  "query": {
    "match_phrase": {
      "name": {
        "query": "李开复",
        "slop": 1,
        "analyzer": ""
      }
    }  
  }
}
  • match_phrase_prefix:短语匹配,但最后一个词项匹配前缀 适用于实现“搜索即输入”的自动补全功能, 通过 max_expansions 限制前缀扩展数量(默认 50)
GET /index_user_info/_search
{
  "query": {
    "match_phrase_prefix": {
      "name": {
        "query": "陈古",
        "max_expansions": 10
      }
    }  
  }
}
  • multi_match:在多个字段上执行同一查询, 可通过 type 指定匹配策略(如 best_fields、most_fields), 可对字段加权(如 title^3)
GET /index_user_info/_search
{
  "query": {
    "multi_match": {
      "query": "陈古",
      "fields": ["id","name^2"],
      "type": "most_fields"
    }  
  }
}

term语法

  • term: 查询直接匹配索引中的原始词项(未经分词处理), 要求查询值与文档字段值完全一致, 避免对 text 字段使用 term‌ text 字段默认会被分词处理(如转为小写), 直接使用 term 可能无法匹配。若需精确匹配文本,应使用 keyword 类型字段 term 是包含操作,而非等值判断‌ 若字段值为数组(如 "tags": ["search", "open_source"]), term 会匹配包含该词项的文档,而非要求完全相等
GET /index_user_info/_search
{
  "query": {
    "term": {
      "id": {
        "value": "10"
      }
    } 
  }
}
  • terms:terms查询用于匹配字段中包含任意一个指定值的文档, 适用于同时搜索多个精确值的场景, 与 term 查询类似,terms 直接匹配索引中的原始词项(未经分词), 要求查询值与文档字段值完全一致, 查询值需以数组形式传递,支持数值、字符串(keyword 类型)、布尔值等结构化数据 boost‌:调整查询结果的权重,影响相关性评分(需与 bool 查询结合时使用), 避免对 text 字段使用‌ text 类型字段默认分词存储,若需精确匹配多个值,应使用 .keyword 子字段, ‌查询逻辑:包含而非等值‌ terms 匹配的是字段值包含数组中任意值的文档。 若字段值为数组(如 ["apple", "banana"]),查询值与数组元素有交集即匹配, ‌性能优化‌: 使用 constant_score 过滤器跳过评分计算,提高查询速度, 避免一次性传入大量查询值(如超过 1000 个),可能引发性能问题
GET /index_user_info/_search
{
  "query": {
    "terms": {
      "id": [
        "9",
        "10"
      ]
    } 
  }
}
GET /index_user_info/_search
{
  "query": {
    "constant_score": {
      "filter": {
        "terms": {
          "id": [
            "9",
            "10"
          ]
        }
      },
      "boost": 1.2
    }
  }
}

range语法

  • range: range 查询用于匹配字段值在指定区间内的文档, 支持数值、日期、字符串等有序类型字段的筛选 示例:筛选 price 在 100 到 500 之间, 或 register_date 在 2025-01-01 至 2025-05-13 内的文档 支持以下比较操作符定义区间边界: gt:大于 gte:大于等于 lt:小于 lte:小于等于 可组合使用(如 gt 与 lte 结合) 日期类型字段需指定 format 参数匹配索引中的日期格式(如 yyyy-MM-dd),支持时区修正(time_zone) 字段类型匹配‌ 数值类型(integer/long/double)直接比较数值大小 日期类型需严格遵循 format 定义格式,否则查询失败 支持通配符匹配字段名(如 logs-*-timestamp),但需注意性能影响(避免匹配过多字段) 结合 constant_score 过滤器跳过评分计算 避免大范围查询(如时间段跨度过长),建议分页或结合时间分区策略

GET /my_index/_search
{
  "query": {
    "range": {
      "<field>": {
        "gte": "<最小值>",  
        "lte": "<最大值>",  
        "format": "yyyy-MM-dd",  // 日期格式(可选)
        "time_zone": "+08:00",   // 时区(可选)
        "boost": 2.0             // 权重(可选)
      }
    }
  }
}
GET /index_user_info/_search
{
  "query": {
    "range": {
      "height": {
        "gte": 10,
        "lte": 200
      }
    }
  }
}

GET /index_user_info/_search
{
  "query": {
    "range": {
      "createTime": {
        "gte": "2025-05-12",
        "lte": "2025-05-12",
        "format": "yyyy-MM-dd",
        "time_zone": "+08:00"
      }
    }
  }
}

复合查询(bool

  • bool查询:‌布尔逻辑组合‌ bool 查询通过 must(AND)、 should(OR)、 must_not(NOT)、 filter(非评分过滤) 四种子句组合多个独立查询条件, 实现复杂逻辑筛选 示例:筛选 status 为 active(must)、 price 大于 100(must) 且不包含标签 expired(must_not)的商品。 ‌灵活的评分机制‌ must 和 should 子句影响文档相关性评分(_score), 而 filter 和 must_not 仅过滤文档,不参与评分 filter 子句会启用缓存机制,提升重复查询性能。 ‌嵌套查询支持‌ 可在 bool 子句中嵌套其他 bool 查询, 实现多层级逻辑组合(如 (A AND B) OR (C AND D)) ‌性能优化‌: 优先使用 filter 替代 must 处理非评分条件,减少不必要的评分计算 避免在 should 子句中添加过多条件,可能降低查询效率 ‌字段类型匹配‌: 精确值匹配(如 status)需使用 term 查询,且字段应为 keyword 类型 text 类型字段需通过 .keyword 子字段进行精确过滤 ‌嵌套查询深度控制‌ 多层嵌套 bool 查询可能增加复杂度,建议结合业务需求简化逻辑
GET /index/_search
{
  "query": {
    "bool": {
      "must": [         // 必须满足的所有条件(AND)
        { "term": { "status": "active" } }
      ],
      "must_not": [     // 必须不满足的条件(NOT)
        { "term": { "tag": "expired" } }
      ],
      "should": [       // 至少满足一个条件(OR)
        { "match": { "description": "urgent" } }
      ],
      "filter": [       // 过滤条件(不评分)
        { "range": { "price": { "gte": 100 } } }
      ],
      "minimum_should_match": 1  // should 子句最小匹配数
    }
  }
}
GET /index_user_info/_search
{
  "query": {
    "bool": {"must": [
      {"term": {
        "id": {
          "value": "9"
        }
      }}
    ]}
  }
}

GET /index_user_info/_search
{
  "query": {
    "bool": {
      
      "must_not": [
        {"term": {
          "name": {
            "value": "陈古"
          }
        }}
      ],
      "should": [
        {"match_phrase_prefix": {
          "name": "刘"
        }}
      ],
      "filter": [
        {"range": {
          "createTime": {
            "gte": "2025-05-12",
            "lte": "2025-05-12"
          }
        }}
      ], 
      "minimum_should_match": 1
      
    }
  }
}

分页和排序实例

  • ‌默认排序规则‌ Elasticsearch 默认按相关性评分 _score 降序排序 ‌自定义排序‌ 支持对以下字段类型排序: ‌数值类型‌:integer、long、double ‌日期类型‌:按时间戳排序(需指定 format) ‌Keyword类型‌:精确值按字典序排序 ‌地理坐标‌:按距离排序(需使用 geo_distance) ‌Text字段排序‌ text 类型字段需通过 .keyword 子字段排序(未经分词处理)

  • from+size

GET /index_user_info/_search
{
  "from": 0,
  "size": 100,
  "sort": [
    {
      "height": {
        "order": "desc"
      },
      "createTime": {"order": "desc"}
    }
  ],
  "query": {"match": {
    "name": "陈古"
  }}
}


‌使用 .keyword 子字段排序‌
‌原理‌:创建索引时自动生成未分词的 keyword 子字段,保留完整原始值

PUT /my_index
{
  "mappings": {
    "properties": {
      "title": {
        "type": "text",
        "fields": {
          "keyword": {  // 自动生成的 keyword 子字段
            "type": "keyword",
            "ignore_above": 256
          }
        }
      }
    }
  }
}

// 排序时指定 keyword 子字段
GET /my_index/_search
{
  "sort": [
    { "title.keyword": "asc" }  // 按完整原始值排序
  ]
}
‌优点‌:内存占用低,性能稳定。
‌限制‌:原始值超过 ignore_above 长度时会被截断
  • search_after方式排序
GET /index_user_info/_search
{
  
  "size": 2,
  "query":
  { 
    "match_all": {} 
    
  },
  "sort": [
    { 
      
      "createTime": "desc"    // 排序字段
      
    }, 
    { 
      "_id": "asc"   // 辅助字段
      
    }          
  ]

}

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 8,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "-Mlox5YBNRvU4PL5mmzf",
        "_score" : null,
        "_source" : {
          "name" : "张志明",
          "id" : "10",
          "height" : 172.1,
          "createTime" : "2025-05-13"
        },
        "sort" : [
          1747094400000,
          "-Mlox5YBNRvU4PL5mmzf"
        ]
      },
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "-clox5YBNRvU4PL5mmzf",
        "_score" : null,
        "_source" : {
          "name" : "李开复",
          "id" : "11",
          "height" : 175.1,
          "createTime" : "2025-05-13"
        },
        "sort" : [
          1747094400000,
          "-clox5YBNRvU4PL5mmzf"
        ]
      }
    ]
  }
}
  • 记录下信息,传递到下一页
 "sort" : [
          1747094400000,
          "-clox5YBNRvU4PL5mmzf"
        ]
GET /index_user_info/_search
{
  
  "size": 2,
  "query":
  { 
    "match_all": {} 
    
  },
  "sort": [
    { 
      
      "createTime": "desc"    
      
    }, 
    { 
      "_id": "asc"   
      
    }          
  ],
  "search_after": [
    1747094400000,
          "-clox5YBNRvU4PL5mmzf"
    ]
}

  • scroll Scroll 的排序规则仅在‌初始化阶段‌生效,后续滚动请求沿用初始排序 ‌适用场景‌ ‌批量导出‌:全量数据离线处理(如日志备份)。 ‌一致性快照‌:保持排序结果不变(即使索引数据更新) ‌性能优化建议‌ ‌避免高开销排序‌:禁用 _score 计算("track_scores": false) ‌简化排序字段‌:优先使用 _doc 或数值/日期字段 ‌控制窗口时间‌:合理设置 scroll 参数(如 scroll=2m)避免资源泄漏
POST /logs/_search?scroll=2m
{
  "size": 500,
  "sort": [
    { "timestamp": "desc" },  // 主排序字段(日期类型)
    { "_doc": "asc" }          // 辅助排序(确保唯一性)
  ],
  "query": {
    "range": { "timestamp": { "gte": "2025-05-01" }}
  },
  "track_scores": false  // 关闭相关性评分计算
}
响应中携带 `_scroll_id`:用于后续滚动请求
POST /_search/scroll { "scroll": "2m", "scroll_id": "DXF1ZXJ5...==_16" }
(后续请求无需重复指定排序参数)
POST /index_user_info/_search?scroll=1m
{
  
  "size": 2,
   "query": {"match_all": {}},
   "sort": [
     {
       "createTime": {
         "order": "desc"
       },
       "_id" :{
         "order": "asc"
       }
     }
   ],
   "track_scores": false

}

{
  "_scroll_id" : "FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFkdySU0ySm9sUmhPVTVfbjl3NnhIRmcAAAAAAAAtHRYxR0Y0Q0kzLVJUcUFCMW5oUnlnV1J3",
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 8,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "-Mlox5YBNRvU4PL5mmzf",
        "_score" : null,
        "_source" : {
          "name" : "张志明",
          "id" : "10",
          "height" : 172.1,
          "createTime" : "2025-05-13"
        },
        "sort" : [
          1747094400000,
          "-Mlox5YBNRvU4PL5mmzf"
        ]
      },
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "-clox5YBNRvU4PL5mmzf",
        "_score" : null,
        "_source" : {
          "name" : "李开复",
          "id" : "11",
          "height" : 175.1,
          "createTime" : "2025-05-13"
        },
        "sort" : [
          1747094400000,
          "-clox5YBNRvU4PL5mmzf"
        ]
      }
    ]
  }
}
POST /_search/scroll
{
  "scroll": "2m", 
  "scroll_id" : "FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFkdySU0ySm9sUmhPVTVfbjl3NnhIRmcAAAAAAAAt4hYxR0Y0Q0kzLVJUcUFCMW5oUnlnV1J3"
}
{
  "_scroll_id" : "FGluY2x1ZGVfY29udGV4dF91dWlkDXF1ZXJ5QW5kRmV0Y2gBFkdySU0ySm9sUmhPVTVfbjl3NnhIRmcAAAAAAAAt4hYxR0Y0Q0kzLVJUcUFCMW5oUnlnV1J3",
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 8,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "-slox5YBNRvU4PL5mmzf",
        "_score" : null,
        "_source" : {
          "name" : "赵子房",
          "id" : "12",
          "height" : 180.1,
          "createTime" : "2025-05-13"
        },
        "sort" : [
          1747094400000,
          "-slox5YBNRvU4PL5mmzf"
        ]
      },
      {
        "_index" : "index_user_info",
        "_type" : "_doc",
        "_id" : "88lkx5YBNRvU4PL57GzP",
        "_score" : null,
        "_source" : {
          "name" : "张名12",
          "id" : "7",
          "height" : 172.1,
          "createTime" : "2025-05-13"
        },
        "sort" : [
          1747094400000,
          "88lkx5YBNRvU4PL57GzP"
        ]
      }
    ]
  }
}

聚合查询(aggs

  • ‌桶聚合(Bucket Aggregations)
  • ‌指标聚合(Metric Aggregations)
  • ‌管道聚合(Pipeline Aggregations)‌
一、聚合分类与核心功能
‌桶聚合(Bucket Aggregations)‌
将文档按规则分组形成「桶」,类似 SQL 的 GROUP BY

‌常见类型‌:
terms:按字段唯一值分组(需使用 .keyword 字段处理文本)
date_histogram:按时间间隔分组(如按月/小时统计)
range:自定义数值或时间范围分组

// 按 job 字段分组统计(需使用 keyword 类型)
GET /test/_search
{
  "size": 0,
  "aggs": {
    "job_count": {
      "terms": { "field": "job.keyword" }
    }
  }
}
‌指标聚合(Metric Aggregations)‌
对数值字段进行统计计算(如求和、求平均等)

‌核心指标‌:
avg、sum、min、max、stats(综合统计)、value_count(计数)


// 计算价格字段的平均值与最大值
"aggs": {
  "avg_price": { "avg": { "field": "price" } },
  "max_price": { "max": { "field": "price" } }
}

‌管道聚合(Pipeline Aggregations)‌
基于其他聚合结果二次计算(如百分比、移动平均等)

// 计算满足条件的文档占比(引用其他聚合结果)
"aggs": {
  "percentage_agg": {
    "bucket_script": {
      "buckets_path": {
        "total": "total_count", 
        "filtered": "filtered_count"
      },
      "script": "params.filtered / params.total * 100"
    }
  }
}
二、组合嵌套与高级用法
‌嵌套聚合‌
桶内嵌套指标或子桶,实现多维分析


// 先按品牌分组,再计算每组平均价格
"aggs": {
  "brand_agg": {
    "terms": { "field": "brand.keyword" },
    "aggs": { "avg_price": { "avg": { "field": "price" } } }
  }
}
‌聚合范围限定‌
结合 query 条件筛选参与聚合的文档


GET /hotel/_search
{
  "query": {
    "range": { "price": { "gte": 200 } }  // 仅统计价格≥200的文档
  },
  "aggs": { ... }
}
‌动态脚本聚合‌
使用 Painless 脚本实现复杂逻辑


// 统计字段非空率(通过脚本判断)
"aggs": {
  "filtered_count": {
    "value_count": {
      "script": "doc['my_field'].size() != 0 ? 1 : 0"
    }
  }
}
三、性能优化与注意事项
‌字段类型限制‌

参与聚合的字段需为 keyword、numeric 或 date 类型,text 类型需转换为 .keyword
‌内存与效率优化‌

设置 size: 0 避免返回原始文档
限制返回桶数量(如 terms 聚合的 size 参数)
‌实时性权衡‌

高频更新的索引聚合结果可能滞后,可通过 refresh_interval 调整刷新频率
  • 例子1
GET /index_title/_search
{
  "size": 0,
  "aggs": {
    "title_count": {
     "terms": {
       "field": "title"
    }
    }
  }
}
{
  "took" : 18,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 9,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "title_count" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Data AC",
          "doc_count" : 2
        },
        {
          "key" : "Data AD",
          "doc_count" : 2
        },
        {
          "key" : "Data AB",
          "doc_count" : 1
        },
        {
          "key" : "Data Analytics",
          "doc_count" : 1
        },
        {
          "key" : "Elasticsearch Guide",
          "doc_count" : 1
        },
        {
          "key" : "John Doe",
          "doc_count" : 1
        },
        {
          "key" : "John Ming",
          "doc_count" : 1
        }
      ]
    }
  }
}


  • 例子2
GET /index_title/_search
{
  "size": 0,
  "aggs": {
    "title_count": {
     "terms": {
       "field": "note.keyword"
    }
    }
  }
}
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 9,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "title_count" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [ ]
    }
  }
}

  • 例子3
GET /index_title/_search
{
  "size": 0,
  "aggs": {
    "avg_price": {
     "avg": {
       "field": "price"
    }
    }
  }
}
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 9,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "avg_price" : {
      "value" : 71.65777693854437
    }
  }
}

  • 例子4
GET /index_title/_search
{
  "size": 0,
  "aggs": {
    "max_price": {
     "max": {
       "field": "price"
    }
    }
  }
}
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 9,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "max_price" : {
      "value" : 103.98999786376953
    }
  }
}

  • 例子5,先分组,再求平均值
GET /index_title/_search
{
  "size": 0,
  "aggs": {
  "title_agg": {
    "terms": { "field": "title" },
    "aggs": { "avg_price": { "avg": { "field": "price" } } }
  }
}
}
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 9,
      "relation" : "eq"
    },
    "max_score" : null,
    "hits" : [ ]
  },
  "aggregations" : {
    "title_agg" : {
      "doc_count_error_upper_bound" : 0,
      "sum_other_doc_count" : 0,
      "buckets" : [
        {
          "key" : "Data AC",
          "doc_count" : 2,
          "avg_price" : {
            "value" : 91.98999786376953
          }
        },
        {
          "key" : "Data AD",
          "doc_count" : 2,
          "avg_price" : {
            "value" : 97.48999786376953
          }
        },
        {
          "key" : "Data AB",
          "doc_count" : 1,
          "avg_price" : {
            "value" : 69.98999786376953
          }
        },
        {
          "key" : "Data Analytics",
          "doc_count" : 1,
          "avg_price" : {
            "value" : 59.9900016784668
          }
        },
        {
          "key" : "Elasticsearch Guide",
          "doc_count" : 1,
          "avg_price" : {
            "value" : 45.9900016784668
          }
        },
        {
          "key" : "John Doe",
          "doc_count" : 1,
          "avg_price" : {
            "value" : 29.989999771118164
          }
        },
        {
          "key" : "John Ming",
          "doc_count" : 1,
          "avg_price" : {
            "value" : 60.0
          }
        }
      ]
    }
  }
}

模糊查询

  • 通配符查询(Wildcard Query) 匹配包含特定通配符模式的词项,适用于简单模糊匹配
GET /index_title/_search
{
  "query": {
    "wildcard": {
      "title": {
        "value": "John*"
      }
    }
  }
}

{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 1.0,
        "_source" : {
          "title" : "John Ming",
          "price" : 60,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "title" : "John Doe",
          "price" : 29.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      }
    ]
  }
}

  • ‌前缀查询(Prefix Query)‌ ‌用途‌:匹配以指定前缀开头的词项,如搜索用户输入提示
GET /index_title/_search
{
  "query": {
    "prefix": {
      "title": {
        "value": "John"
      }
    }
  }
}

{
  "took" : 0,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 2,
      "relation" : "eq"
    },
    "max_score" : 1.0,
    "hits" : [
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 1.0,
        "_source" : {
          "title" : "John Ming",
          "price" : 60,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 1.0,
        "_source" : {
          "title" : "John Doe",
          "price" : 29.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      }
    ]
  }
}

  • ‌正则表达式查询(Regexp Query)‌ ‌用途‌:使用正则表达式匹配复杂模式,如邮箱或电话号码验证
GET /users/_search
{
  "query": {
    "regexp": { "email": ".*@example\\.com" }
  }
}

  • ‌模糊查询(Fuzzy Query)‌ ‌用途‌:基于编辑距离(Levenshtein 算法)匹配近似词,适合拼写纠错
GET /index_title/_search
{
  "query": {
    "fuzzy": {
      "note": {
        "value": "备注1",
        "fuzziness": 2
      }
    }
  }
}

{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 3,
    "successful" : 3,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 9,
      "relation" : "eq"
    },
    "max_score" : 0.0,
    "hits" : [
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "5",
        "_score" : 0.0,
        "_source" : {
          "title" : "Data AB",
          "price" : 69.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "7",
        "_score" : 0.0,
        "_source" : {
          "title" : "Data AD",
          "price" : 90.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "2",
        "_score" : 0.0,
        "_source" : {
          "title" : "John Ming",
          "price" : 60,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "3",
        "_score" : 0.0,
        "_source" : {
          "title" : "Elasticsearch Guide",
          "price" : 45.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "4",
        "_score" : 0.0,
        "_source" : {
          "title" : "Data Analytics",
          "price" : 59.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "10",
        "_score" : 0.0,
        "_source" : {
          "title" : "Data AC",
          "price" : 102.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "1",
        "_score" : 0.0,
        "_source" : {
          "title" : "John Doe",
          "price" : 29.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "6",
        "_score" : 0.0,
        "_source" : {
          "title" : "Data AC",
          "price" : 80.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      },
      {
        "_index" : "index_title",
        "_type" : "_doc",
        "_id" : "11",
        "_score" : 0.0,
        "_source" : {
          "title" : "Data AD",
          "price" : 103.99,
          "create_time" : "2025-05-14 10:00:00",
          "note" : "备注1"
        }
      }
    ]
  }
}

  • 模糊查询总结:注意模糊查询性能问题
 一:自定义分词 + match_phrase‌ 的组合可优化短语匹配的精准性与灵活性
 以下是核心实现策略与注意事项:
 1、‌自定义分词器配置‌
通过 nGram 或 edge_ngram 分词器增强灵活性,例如中文场景结合 ik 分词器
PUT /my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ik_ngram": {
          "tokenizer": "ik_max_word",  // 中文分词
          "filter": ["ngram_filter"]   // 添加 nGram 切分
        }
      },
      "filter": {
        "ngram_filter": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 3
        }
      }
    }
  }
}

‌2、索引数据时应用分词规则‌
定义字段映射时指定自定义分词器:
PUT /my_index/_mapping
{
  "properties": {
    "title": {
      "type": "text",
      "analyzer": "ik_ngram"  // 使用自定义分词器
    }
  }
}

‌3、查询时使用 match_phrase‌
结合分词结果进行短语匹配,保持词序一致:
GET /my_index/_search
{
  "query": {
    "match_phrase": {
      "title": {
        "query": "人大四次会议开幕",
        "slop": 1  // 允许词间间隔 1 个位置
      }
    }
  }
}

二、适用场景
‌中文模糊匹配‌
解决中文分词颗粒度问题(如「人大四次」无法匹配「人大四次会议」的分词结果)
‌顺序敏感型搜索‌
例如法律条文、地址匹配等要求词序严格一致的场景
‌搜索性能优化‌
相比 wildcard 查询,通过预分词减少全字段扫描

三、存在的问题与优化
‌索引体积膨胀‌
nGram 分词导致索引体积显著增大,需权衡存储与查询性能
‌优化‌:针对核心字段使用自定义分词,非核心字段采用默认分词

‌查询性能开销‌
match_phrase 需计算词项位置,大数据量时性能低于普通 match 查询
‌优化‌:

合理设置 slop 参数(允许词间间隔)提升召回率
结合 bool 查询混合 match 过滤初步结果
‌分词策略冲突‌
若查询文本分词结果与索引分词不一致,会导致匹配失败(如「人大四次」分词为 ["人大", "四次"],但索引中为 ["人大四次", "会议"])
‌优化‌:

统一查询与索引的分词器配置
添加同义词扩展词库减少分词差异

四、典型问题案例
// 文档内容
{"title": "人大四次会议开幕"}

// 查询语句
GET /my_index/_search
{
  "query": {
    "match_phrase": {
      "title": "人大四次"  // 默认分词为 ["人大", "四次"]
    }
  }
}
‌问题原因‌:索引中 title 字段可能分词为 ["人大四次会议", "开幕"],无法匹配查询词项顺序

‌解决方案‌:

调整分词器 min_gram=2 并添加 slop 参数
使用 ik_max_word + nGram 组合分词提升细粒度

通过合理配置分词策略与参数调优,可显著提升 match_phrase 的召回率与查询效率
PUT /index_ik_data
{
  "settings": {
    "analysis": {
      "analyzer": {
        "ik_ngram": {
          "tokenizer": "ik_max_word",  
          "filter": ["ngram_filter"]   
        }
      },
      "filter": {
        "ngram_filter": {
          "type": "ngram",
          "min_gram": 2,
          "max_gram": 3
        }
      }
    }
  },
  "mappings": {
  "properties": {
    "title": {
      "type": "text",
      "analyzer": "ik_ngram"  
    }
  }
  }
}

PUT /index_ik_data/_doc/1
{
  "title": "中华人民共和国人大四次会议开幕"
}




GET /index_ik_data/_search
{
  "query": {
    "match_phrase": {
      "title": {
        "query": "人大四次会议开幕",
        "slop": 1  
      }
    }
  }
}

GET /index_ik_data/_search
{
  "query": {
    "match_phrase": {
      "title": "人大四次"  
    }
  }
}

DELETE /index_ik_data

分词器


一、分词器选择与安装
‌内置分词器适用场景‌

‌standard‌:默认分词器,适合英文文本(按空格/标点拆分并转小写)
‌simple‌:仅按非字母字符拆分,适合标准化文本的简单处理
中文场景推荐 ‌IK 分词器‌(细分 ik_smart 和 ik_max_word 模式)
‌IK 分词器安装‌
下载与 Elasticsearch 完全匹配的版本(如 7.17.3 对应插件版本一致)。
解压至 plugins/ik 目录并重启服务,通过 elasticsearch-plugin list 验证安装状态。
二、分词器配置与使用
‌索引 Mapping 定义‌

PUT /my_index
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "ik_max_word",  // 索引时分词(细粒度)
        "search_analyzer": "ik_smart"  // 搜索时分词(粗粒度)
      }
    }
  }
}
analyzer 定义索引存储的分词策略,search_analyzer 控制搜索匹配逻辑
‌动态测试分词效果‌

GET /_analyze
{
  "text": "我爱美羊羊",
  "analyzer": "ik_smart"
}
通过 _analyze API 验证分词结果,调整算法参数
三、自定义词库扩展
‌本地词库配置‌

编辑 config/main.dic 文件添加新词(需分词器重启生效)
适用于静态词库维护,如专业术语
‌远程词库(推荐)‌

配置 config/IKAnalyzer.cfg.xml 指定远程 URL:

<entry key="remote_ext_dict">http://your-nginx-server/es_dict.txt</entry>
定时热更新(默认 60 秒轮询),支持动态扩展网络热词
四、注意事项
‌版本兼容性‌:插件版本需与 Elasticsearch 严格匹配,否则启动失败
‌性能权衡‌:ik_max_word 存储开销更高,搜索时建议用 ik_smart 减少计算量
‌热更新限制‌:远程词库更新仅增删词项,不支持修改已有词权重
通过以上策略,可平衡分词准确性、维护成本及系统性能,实现灵活高效的中文搜索体验

四、分词器核心组件
Elasticsearch 的分词器由三部分组成

‌Character Filters(字符过滤器)‌:预处理原始文本(如删除 HTML 标签、符号替换);
‌Tokenizer(分词器)‌:按规则切分文本(如按空格分割);
‌Token Filters(词元过滤器)‌:标准化处理分词结果(如转小写、停用词移除、同义词扩展)
五、自定义分词器实现步骤
1. 基于外部词库扩展
‌适用场景‌:需添加行业术语、新词或停用词。

‌创建词库文件‌:新建文本文件(如 custom_words.txt),每行一个词

特殊术语1
特殊术语2
‌部署词库‌:
将文件放入 $ES_HOME/config/analysis/ 目录
通过配置 synonyms_path 或 keywords_path 指定路径
2. 配置自定义分析器
在索引配置中定义分析器组件


PUT /my_index
{
  "settings": {
    "analysis": {
      "filter": {
        "my_stopwords": {
          "type": "stop",
          "stopwords": ["的", "了", "是"]  // 自定义停用词
        },
        "my_synonyms": {
          "type": "synonym",
          "synonyms_path": "analysis/synonyms.txt"  // 同义词文件路径
        }
      },
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom",
          "char_filter": ["html_strip"],  // 字符过滤器
          "tokenizer": "ik_max_word",      // 使用 IK 分词器
          "filter": ["lowercase", "my_stopwords", "my_synonyms"]
        }
      }
    }
  }
}
3. 应用自定义分词器
在字段映射中指定分词器

PUT /my_index/_mapping
{
  "properties": {
    "content": {
      "type": "text",
      "analyzer": "my_custom_analyzer"  // 应用自定义分析器
    }
  }
}
4. 重启并验证
‌重启 Elasticsearch 服务‌
‌通过 API 测试分词效果‌

GET /my_index/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "测试文本包含特殊术语1"
}
六、高级自定义(插件开发)
‌适用场景‌:需完全定制分词逻辑(如专用算法)

编写 Java 插件类实现 AnalysisPlugin 扩展点,提供自定义的 TokenizerFactory/AnalyzerProvider 实现分词逻辑
打包为插件并部署到 Elasticsearch 的 plugins 目录
在索引配置中引用插件分词器。
七、典型实践推荐
‌中文分词‌:优先集成 IK 分词器,通过 ik_smart(粗粒度)或 ik_max_word(细粒度)模式优化
‌同义词扩展‌:通过 synonym 过滤器加载外部词库文件
‌停用词过滤‌:结合业务场景动态更新停用词列表
通过上述方法,可灵活适配不同语言和业务场景的分词需求。调试时建议结合 _analyze API 实时验证分词效果
PUT /my_index
{
  "mappings": {
    "properties": {
      "content": {
        "type": "text",
        "analyzer": "ik_max_word",  
        "search_analyzer": "ik_smart" 
      }
    }
  }
}

PUT /my_index/_doc/1
{
  "title": "美羊羊,我爱美羊羊"
}


GET /my_index/_analyze
{
  "text": "我爱美羊羊",
  "analyzer": "ik_smart"
}
  • 倒排索引
1. 倒排索引的基本概念‌
‌倒排索引‌是一种将文档中的‌词项(Term)‌映射到包含这些词项的‌文档集合‌的数据结构。
与传统数据库的“正排索引”(通过文档ID查找内容)不同,
倒排索引是通过‌关键词反向追溯文档‌,
类似于书籍末页的“关键词索引”。

‌举例说明‌
假设有以下三个文档:

‌Doc1‌: "苹果是一种水果"
‌Doc2‌: "苹果公司生产手机"
‌Doc3‌: "水果手机很流行"
倒排索引会将这些文档拆解为词项,并记录每个词项所在的文档信息:

text
Copy Code
词项    | 出现的文档及位置
-------------------------------
苹果    → Doc1(位置0), Doc2(位置0)
水果    → Doc1(位置2), Doc3(位置0)
手机    → Doc2(位置3), Doc3(位置2)
公司    → Doc2(位置1)
生产    → Doc2(位置2)
...
‌2. 倒排索引的核心组成‌
‌词项字典(Term Dictionary)‌:所有文档中唯一的词项列表,通常排序后存储在内存中以加速查找。
‌倒排列表(Postings List)‌:每个词项对应的文档ID列表,以及附加信息(如词频、位置、偏移量等)。
‌文档ID(Doc ID)‌:包含该词项的文档标识。
‌词频(Term Frequency, TF)‌:词项在文档中出现的次数(用于相关性评分)。
‌位置(Position)‌:词项在文档中的位置(用于短语查询或邻近搜索)。
‌偏移量(Offset)‌:词项在文本中的起始和结束字符位置。
‌3. Elasticsearch 中倒排索引的构建流程‌
‌文本分析(Text Analysis)‌:

‌分词(Tokenization)‌:将文本拆分为独立的词项(如“苹果公司” → “苹果”和“公司”)。
‌过滤(Filtering)‌:移除停用词(如“的”、“是”),并进行大小写转换、词干提取(如“running” → “run”)等操作。
最终生成标准化的词项列表。
‌索引写入‌:

将词项与文档的映射关系(包括位置、词频等信息)写入倒排索引。
‌4. 倒排索引的优势‌
‌高效查询‌:直接通过词项定位文档,无需逐条扫描所有文档。
‌支持复杂搜索‌:
‌布尔查询‌(AND/OR/NOT):合并多个词项的倒排列表。
‌短语查询‌:利用位置信息匹配连续的词项序列。
‌模糊查询‌:通过编辑距离(如“appel” → “apple”)匹配近似词项。
‌相关性评分‌:基于词频(TF)、逆文档频率(IDF)等计算文档与查询的相关性(如TF-IDF算法或BM25)。
‌5. 倒排索引的实际应用示例‌
‌搜索“苹果手机”‌:
拆分为词项“苹果”和“手机”。
查询倒排索引,找到包含这两个词项的文档(如Doc2和Doc3)。
根据位置信息判断是否为连续短语(如Doc2中“苹果”在位置0,“公司”在位置1,“生产”在位置2,“手机”在位置3,因此“苹果手机”不连续;Doc3中“水果”在位置0,“手机”在位置2,也不连续)。
返回相关性评分最高的结果。
‌6. 与其他索引的对比‌
‌正排索引(Forward Index)‌:通过文档ID查找内容,适用于文档检索,但无法高效支持关键词搜索。
‌B-Tree索引(传统数据库)‌:适合精确匹配和范围查询,但在全文搜索中性能较低。
‌总结‌
Elasticsearch 的倒排索引通过词项到文档的反向映射,
实现了高效的全文搜索能力。结合分词、过滤、评分等机制,
使其能够快速处理复杂的查询需求(如布尔逻辑、短语匹配、模糊搜索等),
成为大规模文本搜索场景下的核心工具。