用户画像(一):clickhouse AggregateFunction(groupBitmap, UInt64)

495 阅读2分钟

1.groupBitmap
用于存储UInt64数值,且自动去重,这对于我们存储uid非常的适合。

2.使用
通过配置宽表,我们可以得出一些基本的画像规则,用来制定一些常用的标签属性,如性别,地理位置,星座等。

--字符型标签
select uid,
       case
           when birthday in ('1970-01-01', '1990-03-21', '2000-01-01') then '未知'
           when birthday >= toDate(concat(toString(toYear(birthday)) as yy, '-01-20')) and
                birthday <= toDate(concat(yy, '-02-18')) then '水瓶座'
           when birthday >= toDate(concat(yy, '-02-19')) and birthday <= toDate(concat(yy, '-03-20')) then '双鱼座'
           when birthday >= toDate(concat(yy, '-03-21')) and birthday <= toDate(concat(yy, '-04-19')) then '白羊座'
           when birthday >= toDate(concat(yy, '-04-20')) and birthday <= toDate(concat(yy, '-05-20')) then '金牛座'
           when birthday >= toDate(concat(yy, '-05-21')) and birthday <= toDate(concat(yy, '-06-21')) then '双子座'
           when birthday >= toDate(concat(yy, '-06-22')) and birthday <= toDate(concat(yy, '-07-22')) then '巨蟹座'
           when birthday >= toDate(concat(yy, '-07-23')) and birthday <= toDate(concat(yy, '-08-22')) then '狮子座'
           when birthday >= toDate(concat(yy, '-08-23')) and birthday <= toDate(concat(yy, '-09-22')) then '处女座'
           when birthday >= toDate(concat(yy, '-09-23')) and birthday <= toDate(concat(yy, '-10-23')) then '天秤座'
           when birthday >= toDate(concat(yy, '-10-24')) and birthday <= toDate(concat(yy, '-11-22')) then '天蝎座'
           when birthday >= toDate(concat(yy, '-11-23')) and birthday <= toDate(concat(yy, '-12-21')) then '射手座'
           when (birthday >= toDate(concat(yy, '-12-22')) and birthday <= toDate(concat(yy, '-12-31'))) or
                (birthday >= toDate(concat(yy, '-01-01')) and birthday <= toDate(concat(yy, '-01-19'))) then '摩羯座'
           else '未知' end as constellation
from default.user_wide_all;

新建画像的存储表

create table user_label_string on_cluster ck_cluster
(
    label_name  String,
    label_value String,
    bitmap_uid  AggregateFunction(groupBitmap, UInt64)
)
    engine = ReplicatedAggregatingMergeTree('/clickhouse/dsp_iop/tables/{shard}/user_label_string', '{replica}')
        PARTITION BY label_name
        ORDER BY (label_name, label_value)
        SETTINGS index_granularity = 128;
create table user_label_string_all
(
    label_name  String,
    label_value String,
    bitmap_uid  AggregateFunction(groupBitmap, UInt64)
)
    engine = Distributed('ck_cluster', 'dsp_iop', 'user_label_string', rand());

写入数据:

insert into user_label_string(bitmap_uid, label_name ,label_value)
SELECT groupBitmapState(toUInt64(uid)) as bm,
       'constellation',
       constellation
from (
select uid,
       case
           when birthday in ('1970-01-01', '1990-03-21', '2000-01-01') then '未知'
           when birthday >= toDate(concat(toString(toYear(birthday)) as yy, '-01-20')) and
                birthday <= toDate(concat(yy, '-02-18')) then '水瓶座'
           when birthday >= toDate(concat(yy, '-02-19')) and birthday <= toDate(concat(yy, '-03-20')) then '双鱼座'
           when birthday >= toDate(concat(yy, '-03-21')) and birthday <= toDate(concat(yy, '-04-19')) then '白羊座'
           when birthday >= toDate(concat(yy, '-04-20')) and birthday <= toDate(concat(yy, '-05-20')) then '金牛座'
           when birthday >= toDate(concat(yy, '-05-21')) and birthday <= toDate(concat(yy, '-06-21')) then '双子座'
           when birthday >= toDate(concat(yy, '-06-22')) and birthday <= toDate(concat(yy, '-07-22')) then '巨蟹座'
           when birthday >= toDate(concat(yy, '-07-23')) and birthday <= toDate(concat(yy, '-08-22')) then '狮子座'
           when birthday >= toDate(concat(yy, '-08-23')) and birthday <= toDate(concat(yy, '-09-22')) then '处女座'
           when birthday >= toDate(concat(yy, '-09-23')) and birthday <= toDate(concat(yy, '-10-23')) then '天秤座'
           when birthday >= toDate(concat(yy, '-10-24')) and birthday <= toDate(concat(yy, '-11-22')) then '天蝎座'
           when birthday >= toDate(concat(yy, '-11-23')) and birthday <= toDate(concat(yy, '-12-21')) then '射手座'
           when (birthday >= toDate(concat(yy, '-12-22')) and birthday <= toDate(concat(yy, '-12-31'))) or
                (birthday >= toDate(concat(yy, '-01-01')) and birthday <= toDate(concat(yy, '-01-19'))) then '摩羯座'
           else '未知' end as constellation
from default.dwd_user_wide_all
     ) group by constellation;

3.查询

SELECT
    label_name,
    label_value,
    bitmapCardinality(bitmap_uid) AS bm
FROM dsp_iop.user_label_string_all
WHERE label_name = 'constellation'

Query id: dfc31066-c7dd-419a-ba5d-110339c2785d

┌─label_name────┬─label_value─┬───────bm─┐
│ constellation │ 双子座      │  796268 │
│ constellation │ 双鱼座      │  725469 │
│ constellation │ 处女座      │  877766 │
│ constellation │ 天秤座      │  828114 │
│ constellation │ 天蝎座      │  931031 │
│ constellation │ 射手座      │  724913 │
│ constellation │ 巨蟹座      │  831805 │
│ constellation │ 摩羯座      │ 1464983 │
│ constellation │ 未知        │ 3401269 │
│ constellation │ 水瓶座      │  760884 │
│ constellation │ 狮子座      │  733547 │
│ constellation │ 白羊座      │  628192 │
│ constellation │ 金牛座      │  690814 │
└───────────────┴─────────────┴──────────┘