记录动态查询eslasticsearch中nested数据问题

1,353 阅读5分钟

目的:

订单和订单详情数据存储在elasticsearch中,过滤条件筛选数据,通过返回的数据长度判断是否是复购(数据长度大于一就是复购)

创建索引:

PUT /test/
{"settings":{"number_of_shards":1,"number_of_replicas":0},"mappings":{"properties":{"mobile":{"type":"keyword"},"name":{"type":"keyword"},"sex":{"type":"keyword"},"age":{"type":"integer"},"province":{"type":"keyword"},"city":{"type":"keyword"},"idcard":{"type":"keyword"},"repurchase":{"type":"byte"},"condition":{"type":"keyword"},"createdAt":{"type":"date","format":"yyyy-MM-dd HH:mm:ss"},"updatedAt":{"type":"date","format":"yyyy-MM-dd HH:mm:ss"},"orders":{"type":"nested","properties":{"orderId":{"type":"keyword"},"ticketId":{"type":"keyword"},"ticketFullname":{"type":"text"},"bookingTime":{"type":"date","format":"yyyy-MM-dd HH:mm:ss"},"bookingChannelId":{"type":"integer"},"bookingChannelName":{"type":"keyword"},"orderPayAmount":{"type":"integer"},"bookingNum":{"type":"integer"},"orderStatusText":{"type":"keyword"},"orderStatus":{"type":"byte"},"vertifyTime":{"type":"date","format":"yyyy-MM-dd HH:mm:ss"},"bookingPrice":{"type":"integer"},"isAround":{"type":"byte"},"landType":{"type":"keyword"},"landId":{"type":"integer"},"landName":{"type":"keyword"},"landPublisherId":{"type":"integer"},"landPublisherName":{"type":"keyword"}}}}}}

构建数据:

POST /test/_doc
{
  "createdAt": "2021-08-25 18:06:23",
  "province": "辽宁省",
  "city": "沈阳市",
  "sortId": 159935252540211100,
  "sex": "女",
  "mobile": "xxxxxxxxx",
  "name": "测试用户",
  "age": 31,
  "updatedAt": "2021-08-25 18:06:23",
  "condition": [
    "1596547641,18,530534",
    "1596547607,19,530535"
  ],
  "orders": [
    {
      "orderId": "102577843",
      "ticketId": 530534,
      "ticketFullname": "测试数据",
      "bookingTime": "2020-08-06 21:25:41",
      "bookingChannelId": 18,
      "bookingChannelName": "微平台",
      "orderPayAmount": 9400,
      "bookingNum": 1,
      "orderStatusText": "测试",
      "orderStatus": 1,
      "vertifyTime": "2020-08-26 14:50:04",
      "bookingPrice": 9400,
      "isAround": 0,
      "landType": "A",
      "landId": 194128,
      "landName": "测试数据",
      "landPublisherId": 433029,
      "landPublisherName": "测试用户"
    },
    {
      "orderId": "102577915",
      "ticketId": 530535,
      "ticketFullname": "测试数据",
      "bookingTime": "2020-08-04 21:26:47",
      "bookingChannelId": 19,
      "bookingChannelName": "微平台",
      "orderPayAmount": 9400,
      "bookingNum": 1,
      "orderStatusText": "测试",
      "orderStatus": 1,
      "vertifyTime": "2020-08-26 14:50:03",
      "bookingPrice": 9400,
      "isAround": 0,
      "landType": "A",
      "landId": 194128,
      "landName": "测试数据",
      "landPublisherId": 433028,
      "landPublisherName": "测试用户"
    }
  ]
}

查询数据:

GET /test/_search
{
  "from": 0,
  "size": 10,
  "query": {
    "bool": {
      "must": [
        {
          "term": {
            "sex": "女"
          }
        },
        {
          "range": {
            "age": {
              "gte": 10,
              "lte": 40
            }
          }
        },
        {
          "bool": {
            "should": [
              {
                "bool": {
                  "must": [
                    {
                      "match_phrase": {
                        "province": "辽宁省"
                      }
                    },
                    {
                      "match_phrase": {
                        "city": "沈阳市"
                      }
                    }
                  ]
                }
              },
              {
                "bool": {
                  "must": [
                    {
                      "match_phrase": {
                        "province": "辽宁省"
                      }
                    },
                    {
                      "match_phrase": {
                        "city": "抚顺市"
                      }
                    }
                  ]
                }
              }
            ]
          }
        },
        {
          "nested": {
            "path": "orders",
            "inner_hits": {
              "from": 0,
              "size": 100
            },
            "query": {
              "bool": {
                "must": [
                  {
                    "range": {
                      "orders.bookingTime": {
                        "gte": "2020-08-04 21:26:47",                        "lte": "2020-08-04 21:26:47",                        "format": "yyyy-MM-dd HH:mm:ss"
                      }
                    }
                  },
                  {
                    "terms": {
                      "orders.ticketId": [
                        530535
                      ]
                    }
                  },
                  {
                    "terms": {
                      "orders.bookingChannelId": [
                        19
                      ]
                    }
                  }
                ]
              }
            }
          }
        }
      ]
    }
  },
  "sort": {
    "sortId": {
      "order": "desc"
    }
  },
  "_source": {
    "excludes": [
      "orders"
    ]
  }
}

查询结果:

说明:

nested数据查询,只要数据满足其中一条数据,那么剩下的数据都会返回。查询nested数据可以添加inner_hits参数去只显示子集

"nested": {
            "path": "orders",
            "inner_hits": {
              "from": 0,
              "size": 100  //返回的子集长度
            },

上述查询语句中的_source参数可以用来过滤父级文档字段的显示与隐藏

"_source": {
    "excludes": [
      "orders"   //表示排除字段
    ],
    "includes": [
        "name"  //表示包含字段
    ]
  }

目前返回的数据结构已经基本能满足业务需求,但是怎么去判断是否复购还是没有解决。目前想到的方案有三种,各有弊端,暂时没有一种完美的解决方案。

方案一:

通过filter去过滤字段,在数据里面添加一个冗余字段(condition),并且是数组类型(因为script里面只能获取到数组的数据而不能获取到nested里面数据,es官网描述只能获取简单类型),每个关于nested里面的查询条件以逗号分割。

目前只要考虑到的问题是:查询效率会随着查询数量,逐渐递减。

创建一个painless脚本
POST _scripts/repurchase/
{
  "script": {
    "lang": "painless",
    "source": """
            int count = 0;
            for (int i = 0; i < doc['condition.keyword'].length; i++) {
              boolean conditional = true;
              String[] array= /,/.split(doc['condition.keyword'][i]);
              if(params.bookingChannelId!=null){
                conditional = conditional && params.bookingChannelId.contains(Integer.parseInt(array[1]));
              }
              if(params.ticketId!=null){
                conditional = conditional && params.ticketId.contains(Integer.parseInt(array[2]));
              }
              if(params.startBookingTime!=null){
                conditional = conditional && Integer.parseInt(array[0])>=params.startBookingTime;
              }
              if(params.endBookingTime!=null){
                conditional = conditional && Integer.parseInt(array[0])<=params.endBookingTime;
              }
              if(conditional){
                count++;
                if(count>1){break;}
              }
            }
            if(params.repurchase == 1){
              return count > 1;
            }else{
              return count == 1;
            }
            """
  }
}

然后查询条件里面加上
"filter": {
        "script": {
          "script": {
            "params": {
              "startBookingTime": 1596547607,              "endBookingTime": 1596547607,              "repurchase": 1, //1-复购,2-不是复购
              "bookingChannelId": 19,
              "ticketId": 530535
            },
            "id": "repurchase"
          }
        }
      }

方案二:

更改数据结构,以订单维度存储数据,通过组合聚合的方式(考虑到后期可能会通过多字段聚合数据,目前是以手机号做聚合,而且scroll分页比较方便)。

存在的问题:聚合桶计算之后如果总共有一百条数据,但是前五十条数据不是复购,后五十条数据是复购,那么查询复购通过size去控制会出现前面是空桶的情况,不好做分页,而且composite里面不支持min_dou_count字段去控制结果集(不知道是不是写的不对)。

GET /test/_search
{
  "size": 0,
  "query": {
    //查询条件
  },
  "aggs": {
    "my_buckets": {
      "composite": {
        "size": 100,
        "sources": [
          {
            "group_by_mobile": {
              "terms": {
                "field": "ticketTakerMobile.keyword" //以手机号分组,如果mapping里面这个字段不是keyword类型,需要加上keyword
              }
            }
          }
        ],
        "after": {
          "group_by_mobile": "xxxxxx" //从那个字段向下查询,不支持随机或者向上分页
        }
      },
      "aggs": {
        "total": {
          "value_count": {
            "field": "orderId.keyword" //计算聚合桶里面的数量
          }
        },
        "sales_bucket_filter": {
          "bucket_selector": {
            "buckets_path": {
              "totals": "total"
            },
            "script": "params.totals > 1" //筛选聚合桶大于一的数据
          }
        }
      }
    }
  }
}

方案三:

这个方案和方案二比较类似,都是以订单维度去聚合数据

存在的问题:es默认的size大小是一万条数据,但是可以通过修改配置去解决,但是潜在的问题是数据量过大会不会将内存沾满。

// 修改ES中所有的index配置PUT _all/_settings{  "index":{    "max_result_window": 100000  }}// 修改ES中指定的index配置PUT index_name/_settings{  "index":{    "max_result_window": 100000  }}
GET /test/_search
{
  "size": 0, //不返回原始数据
  "query": {
    //查询条件
  },
  "aggs": {
    "group_by_mobile": {
      "terms": {
        "field": "ticketTakerMobile.keyword",
        "size": 100
      },
      "aggregations": {
        "top": {
          "top_hits": {
            "from": 0,
            "size": 100, // 桶内显示的最大数据长度
            "version": false,
            "seq_no_primary_term": false,
            "explain": false,
            "_source": {
              "excludes": [] //桶内展示的字段
            }
          }
        },
        "bucket_field": {
          "bucket_sort": {
            "sort": [],
            "from": 20,  //偏移量
            "size": 100  //展示的总数据条数
          }
        },
        "total": {
          "value_count": {
            "field": "orderId.keyword"
          }
        },
        "sales_bucket_filter": {
          "bucket_selector": {
            "buckets_path": {
              "totals": "total"
            },
            "script": "params.totals > 1"  //筛选复购
          }
        }
      }
    }
  }
}

结语:

接触elasticsearch时间不多,感觉painless的语法结构和java类似,但是使用java的某些函数去判断会出错。目前采用的是方案一去做处理,不知道还有没有其他完美的解决方案,存在错误欢迎指正。