目的:
订单和订单详情数据存储在elasticsearch中,过滤条件筛选数据,通过返回的数据长度判断是否是复购(数据长度大于一就是复购)
创建索引:
PUT /test/
{"settings":{"number_of_shards":1,"number_of_replicas":0},"mappings":{"properties":{"mobile":{"type":"keyword"},"name":{"type":"keyword"},"sex":{"type":"keyword"},"age":{"type":"integer"},"province":{"type":"keyword"},"city":{"type":"keyword"},"idcard":{"type":"keyword"},"repurchase":{"type":"byte"},"condition":{"type":"keyword"},"createdAt":{"type":"date","format":"yyyy-MM-dd HH:mm:ss"},"updatedAt":{"type":"date","format":"yyyy-MM-dd HH:mm:ss"},"orders":{"type":"nested","properties":{"orderId":{"type":"keyword"},"ticketId":{"type":"keyword"},"ticketFullname":{"type":"text"},"bookingTime":{"type":"date","format":"yyyy-MM-dd HH:mm:ss"},"bookingChannelId":{"type":"integer"},"bookingChannelName":{"type":"keyword"},"orderPayAmount":{"type":"integer"},"bookingNum":{"type":"integer"},"orderStatusText":{"type":"keyword"},"orderStatus":{"type":"byte"},"vertifyTime":{"type":"date","format":"yyyy-MM-dd HH:mm:ss"},"bookingPrice":{"type":"integer"},"isAround":{"type":"byte"},"landType":{"type":"keyword"},"landId":{"type":"integer"},"landName":{"type":"keyword"},"landPublisherId":{"type":"integer"},"landPublisherName":{"type":"keyword"}}}}}}
构建数据:
POST /test/_doc
{
"createdAt": "2021-08-25 18:06:23",
"province": "辽宁省",
"city": "沈阳市",
"sortId": 159935252540211100,
"sex": "女",
"mobile": "xxxxxxxxx",
"name": "测试用户",
"age": 31,
"updatedAt": "2021-08-25 18:06:23",
"condition": [
"1596547641,18,530534",
"1596547607,19,530535"
],
"orders": [
{
"orderId": "102577843",
"ticketId": 530534,
"ticketFullname": "测试数据",
"bookingTime": "2020-08-06 21:25:41",
"bookingChannelId": 18,
"bookingChannelName": "微平台",
"orderPayAmount": 9400,
"bookingNum": 1,
"orderStatusText": "测试",
"orderStatus": 1,
"vertifyTime": "2020-08-26 14:50:04",
"bookingPrice": 9400,
"isAround": 0,
"landType": "A",
"landId": 194128,
"landName": "测试数据",
"landPublisherId": 433029,
"landPublisherName": "测试用户"
},
{
"orderId": "102577915",
"ticketId": 530535,
"ticketFullname": "测试数据",
"bookingTime": "2020-08-04 21:26:47",
"bookingChannelId": 19,
"bookingChannelName": "微平台",
"orderPayAmount": 9400,
"bookingNum": 1,
"orderStatusText": "测试",
"orderStatus": 1,
"vertifyTime": "2020-08-26 14:50:03",
"bookingPrice": 9400,
"isAround": 0,
"landType": "A",
"landId": 194128,
"landName": "测试数据",
"landPublisherId": 433028,
"landPublisherName": "测试用户"
}
]
}
查询数据:
GET /test/_search
{
"from": 0,
"size": 10,
"query": {
"bool": {
"must": [
{
"term": {
"sex": "女"
}
},
{
"range": {
"age": {
"gte": 10,
"lte": 40
}
}
},
{
"bool": {
"should": [
{
"bool": {
"must": [
{
"match_phrase": {
"province": "辽宁省"
}
},
{
"match_phrase": {
"city": "沈阳市"
}
}
]
}
},
{
"bool": {
"must": [
{
"match_phrase": {
"province": "辽宁省"
}
},
{
"match_phrase": {
"city": "抚顺市"
}
}
]
}
}
]
}
},
{
"nested": {
"path": "orders",
"inner_hits": {
"from": 0,
"size": 100
},
"query": {
"bool": {
"must": [
{
"range": {
"orders.bookingTime": {
"gte": "2020-08-04 21:26:47", "lte": "2020-08-04 21:26:47", "format": "yyyy-MM-dd HH:mm:ss"
}
}
},
{
"terms": {
"orders.ticketId": [
530535
]
}
},
{
"terms": {
"orders.bookingChannelId": [
19
]
}
}
]
}
}
}
}
]
}
},
"sort": {
"sortId": {
"order": "desc"
}
},
"_source": {
"excludes": [
"orders"
]
}
}
查询结果:
说明:
nested数据查询,只要数据满足其中一条数据,那么剩下的数据都会返回。查询nested数据可以添加inner_hits参数去只显示子集
"nested": {
"path": "orders",
"inner_hits": {
"from": 0,
"size": 100 //返回的子集长度
},
上述查询语句中的_source参数可以用来过滤父级文档字段的显示与隐藏
"_source": {
"excludes": [
"orders" //表示排除字段
],
"includes": [
"name" //表示包含字段
]
}
目前返回的数据结构已经基本能满足业务需求,但是怎么去判断是否复购还是没有解决。目前想到的方案有三种,各有弊端,暂时没有一种完美的解决方案。
方案一:
通过filter去过滤字段,在数据里面添加一个冗余字段(condition),并且是数组类型(因为script里面只能获取到数组的数据而不能获取到nested里面数据,es官网描述只能获取简单类型),每个关于nested里面的查询条件以逗号分割。
目前只要考虑到的问题是:查询效率会随着查询数量,逐渐递减。
创建一个painless脚本
POST _scripts/repurchase/
{
"script": {
"lang": "painless",
"source": """
int count = 0;
for (int i = 0; i < doc['condition.keyword'].length; i++) {
boolean conditional = true;
String[] array= /,/.split(doc['condition.keyword'][i]);
if(params.bookingChannelId!=null){
conditional = conditional && params.bookingChannelId.contains(Integer.parseInt(array[1]));
}
if(params.ticketId!=null){
conditional = conditional && params.ticketId.contains(Integer.parseInt(array[2]));
}
if(params.startBookingTime!=null){
conditional = conditional && Integer.parseInt(array[0])>=params.startBookingTime;
}
if(params.endBookingTime!=null){
conditional = conditional && Integer.parseInt(array[0])<=params.endBookingTime;
}
if(conditional){
count++;
if(count>1){break;}
}
}
if(params.repurchase == 1){
return count > 1;
}else{
return count == 1;
}
"""
}
}
然后查询条件里面加上
"filter": {
"script": {
"script": {
"params": {
"startBookingTime": 1596547607, "endBookingTime": 1596547607, "repurchase": 1, //1-复购,2-不是复购
"bookingChannelId": 19,
"ticketId": 530535
},
"id": "repurchase"
}
}
}
方案二:
更改数据结构,以订单维度存储数据,通过组合聚合的方式(考虑到后期可能会通过多字段聚合数据,目前是以手机号做聚合,而且scroll分页比较方便)。
存在的问题:聚合桶计算之后如果总共有一百条数据,但是前五十条数据不是复购,后五十条数据是复购,那么查询复购通过size去控制会出现前面是空桶的情况,不好做分页,而且composite里面不支持min_dou_count字段去控制结果集(不知道是不是写的不对)。
GET /test/_search
{
"size": 0,
"query": {
//查询条件
},
"aggs": {
"my_buckets": {
"composite": {
"size": 100,
"sources": [
{
"group_by_mobile": {
"terms": {
"field": "ticketTakerMobile.keyword" //以手机号分组,如果mapping里面这个字段不是keyword类型,需要加上keyword
}
}
}
],
"after": {
"group_by_mobile": "xxxxxx" //从那个字段向下查询,不支持随机或者向上分页
}
},
"aggs": {
"total": {
"value_count": {
"field": "orderId.keyword" //计算聚合桶里面的数量
}
},
"sales_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"totals": "total"
},
"script": "params.totals > 1" //筛选聚合桶大于一的数据
}
}
}
}
}
}
方案三:
这个方案和方案二比较类似,都是以订单维度去聚合数据
存在的问题:es默认的size大小是一万条数据,但是可以通过修改配置去解决,但是潜在的问题是数据量过大会不会将内存沾满。
// 修改ES中所有的index配置PUT _all/_settings{ "index":{ "max_result_window": 100000 }}// 修改ES中指定的index配置PUT index_name/_settings{ "index":{ "max_result_window": 100000 }}
GET /test/_search
{
"size": 0, //不返回原始数据
"query": {
//查询条件
},
"aggs": {
"group_by_mobile": {
"terms": {
"field": "ticketTakerMobile.keyword",
"size": 100
},
"aggregations": {
"top": {
"top_hits": {
"from": 0,
"size": 100, // 桶内显示的最大数据长度
"version": false,
"seq_no_primary_term": false,
"explain": false,
"_source": {
"excludes": [] //桶内展示的字段
}
}
},
"bucket_field": {
"bucket_sort": {
"sort": [],
"from": 20, //偏移量
"size": 100 //展示的总数据条数
}
},
"total": {
"value_count": {
"field": "orderId.keyword"
}
},
"sales_bucket_filter": {
"bucket_selector": {
"buckets_path": {
"totals": "total"
},
"script": "params.totals > 1" //筛选复购
}
}
}
}
}
}
结语:
接触elasticsearch时间不多,感觉painless的语法结构和java类似,但是使用java的某些函数去判断会出错。目前采用的是方案一去做处理,不知道还有没有其他完美的解决方案,存在错误欢迎指正。