Elasticsearch高手进阶篇(49)
深入聚合数据分析_cardinality算法之优化内存开销以及HLL算法
cardinality,count(distinct),5%的错误率,性能在100ms左右
precision_threshold优化准确率和内存开销
GET /waws_tvs/sales/_search
{
"size" : 0,
"aggs" : {
"distinct_brand" : {
"cardinality" : {
"field" : "brand",
"precision_threshold" : 100
}
}
}
}
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 8,
"max_score": 0,
"hits": []
},
"aggregations": {
"distinct_brand": {
"value": 4
}
}
}
-
precision_threshold
- brand去重,如果brand的unique value,在100个以内,小米,长虹,三星,TCL,HTL
- 在多少个unique value以内,cardinality,几乎保证100%准确
-
cardinality算法
- 会占用precision_threshold * 8 byte 内存消耗,100 * 8 = 800个字节,占用内存很小,而且unique value如果的确在该阈值以内,那么可以确保100%准确;precision_threshold设为100时,即使有数百万的unique value,错误率也在5%以内
- precision_threshold,值设置的越大,占用内存越大,1000 * 8 = 8000 byte,8000 / 1000 = 8KB,可以确保更多unique value的场景下,100%的准确
field,去重,count,这时候,unique value,10000,precision_threshold=10000,10000 * 8 = 80000个byte,80KB
HyperLogLog++ (HLL)算法性能优化
-
cardinality底层算法:
-
HLL算法,HLL算法的性能
- 会对所有的unique value取hash值,通过hash值近似去求distinct count,存在误差
-
-
默认情况下,发送一个cardinality请求的时候,会动态地对所有的field value,取hash值;
- 将取hash值的操作,前移到建立索引的时候
PUT /waws_tvs/
{
"mappings": {
"sales": {
"properties": {
"brand": {
"type": "text",
"fields": {
"hash": {
"type": "murmur3"
}
}
}
}
}
}
}
GET /waws_tvs/sales/_search
{
"size" : 0,
"aggs" : {
"distinct_brand" : {
"cardinality" : {
"field" : "brand.hash",
"precision_threshold" : 100
}
}
}
}
Elasticsearch高手进阶篇(50)
深入聚合数据分析_percentiles百分比算法以及网站访问时延统计
需求:比如有一个网站,记录下了每次请求的访问的耗时,需要统计tp50,tp90,tp99
- tp50:50%的请求的耗时最长在多长时间
- tp90:90%的请求的耗时最长在多长时间
- tp99:99%的请求的耗时最长在多长时间
-
percentiles
- 数据在这个设置的百分比列表上的计算数值
-
设置索引
PUT /waws_website
{
"mappings": {
"waws_logs": {
"properties": {
"latency": {
"type": "long"
},
"province": {
"type": "keyword"
},
"timestamp": {
"type": "date"
}
}
}
}
}
- 添加数据
POST /waws_website/waws_logs/_bulk
{ "index": {}}
{ "latency" : 105, "province" : "江苏", "timestamp" : "2016-10-28" }
{ "index": {}}
{ "latency" : 83, "province" : "江苏", "timestamp" : "2016-10-29" }
{ "index": {}}
{ "latency" : 92, "province" : "江苏", "timestamp" : "2016-10-29" }
{ "index": {}}
{ "latency" : 112, "province" : "江苏", "timestamp" : "2016-10-28" }
{ "index": {}}
{ "latency" : 68, "province" : "江苏", "timestamp" : "2016-10-28" }
{ "index": {}}
{ "latency" : 76, "province" : "江苏", "timestamp" : "2016-10-29" }
{ "index": {}}
{ "latency" : 101, "province" : "新疆", "timestamp" : "2016-10-28" }
{ "index": {}}
{ "latency" : 275, "province" : "新疆", "timestamp" : "2016-10-29" }
{ "index": {}}
{ "latency" : 166, "province" : "新疆", "timestamp" : "2016-10-29" }
{ "index": {}}
{ "latency" : 654, "province" : "新疆", "timestamp" : "2016-10-28" }
{ "index": {}}
{ "latency" : 389, "province" : "新疆", "timestamp" : "2016-10-28" }
{ "index": {}}
{ "latency" : 302, "province" : "新疆", "timestamp" : "2016-10-29" }
- percentiles
GET /waws_website/waws_logs/_search
{
"size": 0,
"aggs": {
"latency_percentiles": {
"percentiles": {
"field": "latency",
"percents": [
50,
95,
99
]
}
},
"latency_avg": {
"avg": {
"field": "latency"
}
}
}
}
{
"took": 7,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 12,
"max_score": 0,
"hits": []
},
"aggregations": {
"latency_avg": {
"value": 201.91666666666666
},
"latency_percentiles": {
"values": {
"50.0": 108.5,
"95.0": 508.24999999999983,
"99.0": 624.8500000000001
}
}
}
}
- 50%的请求,耗时数值的最大值是多少(percentiles的结果是近似值,不是完全准确的)
GET /waws_website/waws_logs/_search
{
"size": 0,
"aggs": {
"group_by_province": {
"terms": {
"field": "province"
},
"aggs": {
"latency_percentiles": {
"percentiles": {
"field": "latency",
"percents": [
50,
95,
99
]
}
},
"latency_avg": {
"avg": {
"field": "latency"
}
}
}
}
}
}
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 12,
"max_score": 0,
"hits": []
},
"aggregations": {
"group_by_province": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{
"key": "新疆",
"doc_count": 6,
"latency_avg": {
"value": 314.5
},
"latency_percentiles": {
"values": {
"50.0": 288.5,
"95.0": 587.75,
"99.0": 640.75
}
}
},
{
"key": "江苏",
"doc_count": 6,
"latency_avg": {
"value": 89.33333333333333
},
"latency_percentiles": {
"values": {
"50.0": 87.5,
"95.0": 110.25,
"99.0": 111.65
}
}
}
]
}
}
}