记录一次ES配置ik插件远程字典的坑

631 阅读3分钟

问题

环境:

elasticsearch:7.4.2

elasticsearch-analysis-ik:7.4.2

加载远程字典代码

/**
 * Returns the custom word list produced by {@code esService.deploy(response)} as the
 * response body of {@code GET /load-customwords}.
 *
 * @param response servlet response, handed to the service so it can set response headers
 * @return the dictionary text, or an empty string when the service returns {@code null}
 */
@GetMapping(value = "/load-customwords")
public String loadCustomWords(HttpServletResponse response) {
    String res = esService.deploy(response);
    // The previous getBytes() -> new String(bytes, "utf-8") round trip encoded with the
    // platform default charset and decoded as UTF-8, which garbles non-ASCII words on any
    // platform whose default charset is not UTF-8. A Java String needs no re-encoding here.
    return res == null ? "" : res;
}
/**
 * Builds the custom dictionary body: every word name returned by
 * {@code customWordDao.getAllWords()}, joined with {@code '\n'}.
 *
 * <p>When the list is non-empty, the {@code Last-Modified} and {@code ETag} response headers
 * are set from the created/updated timestamps of the first entry.
 * NOTE(review): this only reflects changes to other entries if the DAO returns the most
 * recently changed word first — confirm the query ordering.
 *
 * @param response servlet response that receives the version headers
 * @return the newline-separated word list, or an empty string when there are no words
 */
@Override
public String deploy(HttpServletResponse response) {
    List<CustomWord> allWords = customWordDao.getAllWords();

    StringBuilder buffer = new StringBuilder();

    if (CollectionUtils.isNotEmpty(allWords)) {
        logger.info("开始加载自定义词典...");
        String separator = "";
        for (CustomWord word : allWords) {
            if (separator.isEmpty()) {
                // First entry: publish the version headers before writing any content.
                try {
                    // "HH" (24-hour clock) instead of "hh": with the 12-hour pattern and no
                    // AM/PM marker, 01:00 and 13:00 format to the same header value.
                    // Assumes DateUtil.getDateTime takes a SimpleDateFormat-style pattern.
                    response.setHeader("Last-Modified", DateUtil.getDateTime("yyyy-MM-dd HH:mm:ss", word.getCreatedTime()));
                    response.setHeader("ETag", DateUtil.getDateTime("yyyy-MM-dd HH:mm:ss", word.getUpdatedTime()));
                } catch (Exception e) {
                    // Best effort: the word list is still returned when the headers cannot be set.
                    logger.error("设置自定义词典响应头失败", e);
                }
            }
            buffer.append(separator).append(word.getWordName());
            separator = "\n";
        }
        logger.info("加载自定义词典完毕...");
    }
    return buffer.toString();
}

配置

image.png

使用

问题

没有加载字典!!!

最关键的是,我用5.6.2的版本和7.12.0的版本都去试了一下,发现热更新加载远程分词都ok,我们公司使用的7.4.2的版本竟然无法加载...瞬间气的跳脚的节奏。

怎么办?不可能让公司直接换es的版本,那就只能去看下源码了,呜呜...

源码

分别下载v5.6.2,v7.12.0,v7.4.2 的ik源码,不知道怎么下载的可看如下:

  1. 进入官网地址

image.png

  2. 打开releases

image.png

  3. 选择对应的版本

image.png

  4. 下载源码

image.png

v5.6.2

加载远程字典的核心方法getRemoteWords()

/**
 * Downloads custom dictionary entries from a remote server.
 *
 * <p>Issues a GET request to {@code location} and, on a 200 response, reads the body line
 * by line. The charset is taken from the {@code charset=} part of the Content-Type header
 * when present, otherwise UTF-8 is used. The body is read without any content-length check.
 *
 * @param location URL of the remote dictionary
 * @return the lines read from the response; empty when the status is not 200 or there is no
 *         entity, and possibly partial when an I/O error interrupts reading
 */
private static List<String> getRemoteWords(String location) {

   List<String> buffer = new ArrayList<String>();
   RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)
         .setSocketTimeout(60 * 1000).build();
   HttpGet get = new HttpGet(location);
   get.setConfig(rc);
   // try-with-resources closes the client and the response on every path, including
   // exceptions; previously the client was never closed and the response leaked on errors.
   try (CloseableHttpClient httpclient = HttpClients.createDefault();
         CloseableHttpResponse response = httpclient.execute(get)) {
      if (response.getStatusLine().getStatusCode() == 200 && response.getEntity() != null) {

         String charset = "UTF-8";
         // Resolve the charset, defaulting to UTF-8. The header is null-checked because a
         // response without Content-Type previously caused an uncaught NullPointerException.
         if (response.getEntity().getContentType() != null
               && response.getEntity().getContentType().getValue() != null
               && response.getEntity().getContentType().getValue().contains("charset=")) {
            String contentType = response.getEntity().getContentType().getValue();
            charset = contentType.substring(contentType.lastIndexOf("=") + 1);
         }
         try (BufferedReader in = new BufferedReader(
               new InputStreamReader(response.getEntity().getContent(), charset))) {
            String line;
            while ((line = in.readLine()) != null) {
               buffer.add(line);
            }
         }
      }
   } catch (IllegalStateException | IOException e) {
      // location fills the {} placeholder; the exception goes last so it is logged as the throwable.
      logger.error("getRemoteWords {} error", location, e);
   }
   return buffer;
}

v7.12.0

加载远程字典的核心方法getRemoteWordsUnprivileged()

/**
 * Downloads custom dictionary entries from a remote server.
 *
 * <p>Issues a GET request to {@code location}. On a 200 response with an entity, the body is
 * read line by line when the entity reports a positive content length or is chunked. The
 * charset is taken from the {@code charset=} part of the Content-Type header when present,
 * otherwise UTF-8 is used.
 *
 * @param location URL of the remote dictionary
 * @return the lines read from the response; empty when nothing was read, and possibly
 *         partial when an I/O error interrupts reading
 */
private static List<String> getRemoteWordsUnprivileged(String location) {

   List<String> buffer = new ArrayList<String>();
   RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)
         .setSocketTimeout(60 * 1000).build();
   HttpGet get = new HttpGet(location);
   get.setConfig(rc);
   // try-with-resources closes the client and the response on every path, including
   // exceptions; previously the client was never closed and the response leaked on errors.
   try (CloseableHttpClient httpclient = HttpClients.createDefault();
         CloseableHttpResponse response = httpclient.execute(get)) {
      if (response.getStatusLine().getStatusCode() != 200) {
         return buffer;
      }
      HttpEntity entity = response.getEntity();
      if (entity == null) {
         return buffer;
      }

      // Resolve the charset, defaulting to UTF-8.
      String charset = "UTF-8";
      Header contentType = entity.getContentType();
      if (contentType != null) {
         String typeValue = contentType.getValue();
         if (typeValue != null && typeValue.contains("charset=")) {
            charset = typeValue.substring(typeValue.lastIndexOf("=") + 1);
         }
      }

      // Read the body when its length is known and positive, or when it is chunked
      // (chunked responses do not report a content length).
      if (entity.getContentLength() > 0 || entity.isChunked()) {
         try (BufferedReader in = new BufferedReader(new InputStreamReader(entity.getContent(), charset))) {
            String line;
            while ((line = in.readLine()) != null) {
               buffer.add(line);
            }
         }
      }
   } catch (IllegalStateException | IOException e) {
      // location fills the {} placeholder; the exception goes last so it is logged as the throwable.
      logger.error("getRemoteWords {} error", location, e);
   }
   return buffer;
}

v7.4.2

加载远程字典的核心方法getRemoteWordsUnprivileged()

/**
 * Downloads custom dictionary entries from a remote server.
 *
 * <p>Issues a GET request to {@code location}. On a 200 response with an entity, the body is
 * read line by line only when the entity reports a positive content length. The charset is
 * taken from the {@code charset=} part of the Content-Type header when present, otherwise
 * UTF-8 is used.
 *
 * @param location URL of the remote dictionary
 * @return the lines read from the response; empty when nothing was read, and possibly
 *         partial when an I/O error interrupts reading
 */
private static List<String> getRemoteWordsUnprivileged(String location) {

   List<String> buffer = new ArrayList<String>();
   RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)
         .setSocketTimeout(60 * 1000).build();
   HttpGet get = new HttpGet(location);
   get.setConfig(rc);
   // try-with-resources closes the client and the response on every path, including
   // exceptions; previously the client was never closed and the response leaked on errors.
   try (CloseableHttpClient httpclient = HttpClients.createDefault();
         CloseableHttpResponse response = httpclient.execute(get)) {
      if (response.getStatusLine().getStatusCode() != 200) {
         return buffer;
      }
      HttpEntity entity = response.getEntity();
      if (entity == null) {
         return buffer;
      }

      // Resolve the charset, defaulting to UTF-8.
      String charset = "UTF-8";
      Header contentType = entity.getContentType();
      if (contentType != null) {
         String typeValue = contentType.getValue();
         if (typeValue != null && typeValue.contains("charset=")) {
            charset = typeValue.substring(typeValue.lastIndexOf("=") + 1);
         }
      }

      // NOTE: only a positive content length passes this check. A response whose length is
      // unknown (getContentLength() < 0, e.g. no Content-Length header) is skipped and the
      // method returns an empty list. The condition is kept as in this plugin version.
      if (entity.getContentLength() > 0) {
         try (BufferedReader in = new BufferedReader(new InputStreamReader(entity.getContent(), charset))) {
            String line;
            while ((line = in.readLine()) != null) {
               buffer.add(line);
            }
         }
      }
   } catch (IllegalStateException | IOException e) {
      // location fills the {} placeholder; the exception goes last so it is logged as the throwable.
      logger.error("getRemoteWords {} error", location, e);
   }
   return buffer;
}

对比三个版本读取响应内容的代码可以发现:v5.6.2 没有对响应内容的长度做任何校验,直接逐行读取(line = in.readLine());v7.12.0 在读取流之前做了校验 entity.getContentLength() > 0 || entity.isChunked();而 v7.4.2 虽然也做了校验,但只校验了内容长度 entity.getContentLength() > 0。接下来我们把这个值打印出来看看。

image.png

重新打包

  1. 将打包好的插件解压重新安装到es

image.png

  2. 触发远程字典加载

image.png

  3. 日志

image.png

此时我们发现,这个值为 -1,所以代码根本没有进入判断分支去读取内容,最后返回的 list 是个空列表。

  4. 加上v7.12.0的判断重新打包安装

image.png

  5. 分词加载成功,分词生效

image.png

解决方案

从上面的分析来看,源码是存在一点问题的,那我们要么就在线上重新安装自己打包的插件,可是公司的环境,说不定运维人员没被通知到,从而安装了自己从网上下载下来的插件呢?那有没有不污染源码的情况解决这个问题呢?

首先我们有个疑问:entity.getContentLength() 为啥会等于 -1 呢?按照 HttpClient 的约定,当响应体长度未知时(例如响应没有 Content-Length 头、使用 chunked 分块传输),该方法会返回负数。弄清楚这一点就好解决了:我们能不能让服务端返回明确的长度,从而通过这个判断呢?

我们强行设置文件长度 response.setContentLength(allWords.size());(注意:Content-Length 的语义是响应体的字节数,而不是词条个数;更严谨的做法是传入词典内容按 UTF-8 编码后的字节长度,否则响应内容可能被截断。)

image.png

然后测试一下,发现,成功解决!!!