hive表中连续N天问题的实现

109 阅读2分钟

建表

--切换数据库
use db_function;

--建表
create table tb_login(
  userid string,
  logintime string
) row format delimited fields terminated by '\t';

创建数据:vim /export/data/login.log

A       2021-03-22
B       2021-03-22
C       2021-03-22
A       2021-03-23
C       2021-03-23
A       2021-03-24
B       2021-03-24

加载数据

load data local inpath '/export/data/login.log' into table tb_login;

查询数据

select * from tb_login;

1.png 统计连续两天登录

select
  userid,
  logintime,
  --本次登陆日期的第二天
  date_add(logintime,1) as nextday,
  --按照用户id分区,按照登陆日期排序,取下一次登陆时间,取不到就为0
  lead(logintime,1,0) over (partition by userid order by logintime) as nextlogin
from tb_login;

2.png

with t1 as (
  select
    userid,
    logintime,
    --本次登陆日期的第二天
      date_add(logintime,1) as nextday,
    --按照用户id分区,按照登陆日期排序,取下一次登陆时间,取不到就为0
     lead(logintime,1,0) over (partition by userid order by logintime) as nextlogin
from tb_login )
select distinct userid from t1 where nextday = nextlogin;

统计连续N天登录

select
  userid,
  logintime,
  --本次登陆日期的第N天
  date_add(logintime,N-1) as nextday,
  --按照用户id分区,按照登陆日期排序,取下下一次登陆时间,取不到就为0
  lead(logintime,N-1,0) over (partition by userid order by logintime) as nextlogin
from tb_login;

在这张表中,只要nextlogin和nextday相等则表示符合连续N天登录,再进行用户id去重即可得到连续N天登录的用户id,
所以将上面的表用with封装成临时表t1,再对临时表进行筛选即可得出答案。

with t1 as (
select
  userid,
  logintime,
  --本次登陆日期的第三天
  date_add(logintime,2) as nextday,
  --按照用户id分区,按照登陆日期排序,取下下一次登陆时间,取不到就为0
  lead(logintime,2,0) over (partition by userid order by logintime) as nextlogin
from tb_login )
select distinct userid from t1 where nextday = nextlogin;

本文参考了:(44条消息) hive表中连续N天问题的实现_hive连续登录三天_万里长江横渡的博客-CSDN博客