长文本业务数据index mapping分析

115 阅读1分钟

以邮件为例,邮件的信息包括id、用户id、邮件类型、发件人、收件人、发送时间、邮件主题、邮件内容等。

setting参数refresh_interval:为-1时,表示关闭索引的自动刷新;为正数时,新写入的文档需等待相应的时间间隔后,才可以在es索引中被搜索到。邮件搜索可以容忍最新写入的文档暂时搜索不到,因此可以适当调大刷新间隔(如下例中的10s),以降低刷新开销。

mapping参数:

  • id:邮件id,同uid(用户id)、type(邮件类型)、from(发件人)、to(收件人),均设置为keyword,不需要分词,精确匹配。
  • send_time(发送时间):定义为long类型,需要根据该字段进行排序;数值类型的doc_values默认为true,无需显式设置。
  • subject(邮件主题),定义为text类型,需要分词支持模糊匹配等
  • content(邮件内容),可能存在html标签,写入时需要去除html标签,需要分词支持模糊匹配等
PUT mail-test
{
  "settings": {
    "refresh_interval": "10s",
    "number_of_shards": 5,
    "number_of_replicas": 1,
    "sort.field": [
      "send_time"
    ],
    "sort.order": [
      "desc"
    ],
    "search.slowlog.threshold.query.warn": "500ms",
    "search.slowlog.threshold.fetch.warn": "1s",
    "indexing.slowlog.threshold.index.warn": "1s",
    "analysis": {
      "analyzer": {
        "ik_index_analyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter": [
            "lowercase"
          ]
        },
        "ik_index_html_strip_analyzer": {
          "type": "custom",
          "char_filter": [
            "html_strip"
          ],
          "tokenizer": "ik_smart",
          "filter": [
            "lowercase"
          ]
        },
        "ik_search_analyzer": {
          "type": "custom",
          "tokenizer": "ik_smart",
          "filter": [
            "lowercase"
          ]
        }
      }
    }
  },
  "mappings": {
    "_source": {
      "excludes": [
        "content"
      ]
    },
    "properties": {
      "id": {
        "type": "keyword",
        "doc_values": false
      },
      "uid": {
        "type": "keyword",
        "doc_values": true
      },
      "type": {
        "type": "keyword",
        "doc_values": false
      },
      "from": {
        "type": "keyword",
        "doc_values": false
      },
      "to": {
        "type": "keyword",
        "doc_values": false
      },
      "send_time": {
        "type": "long"
      },
      "subject": {
        "type": "text",
        "analyzer": "ik_index_analyzer",
        "search_analyzer": "ik_search_analyzer"
      },
      "content": {
        "type": "text",
        "analyzer": "ik_index_html_strip_analyzer",
        "search_analyzer": "ik_search_analyzer"
      }
    }
  }
}