hive sql专题

138 阅读13分钟

1. 行转列(转置)

行转列的常规做法是,group by+sum(if())【或count(if())】

yearmonthamount
199111.1
199121.2
199131.3
199141.4
199212.1
199222.2
199232.3
199242.4

查成这样一个结果

yearm1m2m3m4
19911.11.21.31.4
19922.12.22.32.4

解答

  • use test_sql;
    set hive.exec.mode.local.auto=true;
    create table table2(year int,month int ,amount double) ;
     insert overwrite table table2 values
               (1991,1,1.1),
               (1991,2,1.2),
               (1991,3,1.3),
               (1991,4,1.4),
               (1992,1,2.1),
               (1992,2,2.2),
               (1992,3,2.3),
               (1992,4,2.4);
    select * from table2;
    --行转列
    --常规做法是,group by+sum(if())
    --SQLserver中有pivot专门用来行转列
    --原始写法
    select year,
           sum(a) as m1,
           sum(b) as m2,
           sum(c) as m3,
           sum(d) as m4
    from
        (select *,
                if(month=1,amount,0) a,
                if(month=2,amount,0) b,
                if(month=3,amount,0) c,
                if(month=4,amount,0) d
         from table2) t
    group by t.year
    ;
    --简化写法
    select year,
           sum(if(month=1,amount,0)) m1,
           sum(if(month=2,amount,0)) m2,
           sum(if(month=3,amount,0)) m3,
           sum(if(month=4,amount,0)) m4
    from table2
    group by year;  
    

    华泰证券2

  • 查询课程编号“2”的成绩比课程编号“1”低的所有同学的学号、姓名。

  • 【这是行转列的衍生题】

  • create table student(sid int, sname string, gender string, class_id int);
    insert overwrite table student
    values (1, '张三', '女', 1),
           (2, '李四', '女', 1),
           (3, '王五', '男', 2);
    create table  course (cid int, cname string, teacher_id int);
    insert overwrite table course
    values (1, '生物', 1),
           (2, '体育', 1),
           (3, '物理', 2);
    select * from course;
    
    create table score (sid int, student_id int, course_id int, number int);
    insert overwrite table score
    values (1, 1, 1, 58),
           (4, 1, 2, 50),
           (2, 1, 2, 68),
           (3, 2, 2, 89);
    with t1 as(
        select student_id,
           sum(if(course_id=2,number,0)) as pe, --体育
           sum(if(course_id=1,number,0)) as bio --生物
    from score
    group by student_id
    having pe<bio)
    select sid, sname
    from t1
    join student
    on t1.student_id = sid
    ;
    
    

    腾讯游戏

表table如下:

DDateshengfu
2015-05-09
2015-05-09
2015-05-09
2015-05-09
2015-05-10
2015-05-10
2015-05-10

如果要生成下列结果, 该如何写sql语句?

DDate
2015-05-0922
2015-05-1012
--建表
create table table1(DDate string, shengfu string) ;
insert overwrite table table1 values ('2015-05-09', "胜"),
       ('2015-05-09', "胜"),
       ('2015-05-09', "负"),
       ('2015-05-09', "负"),
       ('2015-05-10', "胜"),
       ('2015-05-10', "负"),
       ('2015-05-10', "负");

select DDate,
       SUM(case when shengfu = '胜' then 1 else 0 end) `胜`,
       SUM(case when shengfu = '负' then 1 else 0 end) `负`
from table1
group by DDate;

腾讯QQ

假设tableA如表5, tableB如表6,

表5

qq号(字段名:qq)游戏(字段名:game)
10000a
10000b
10000c
20000c
20000d

表6

qq号(字段名:qq)游戏(字段名:game)
10000a_b_c
20000c_d

请写出以下sql逻辑:

a, 将tableA输出为tableB的格式; 【行转列】

b, 将tableB输出为tableA的格式; 【列转行】

create table tableA(qq string, game string) 
insert overwrite table tableA values 
       (10000, 'a'),
       (10000, 'b'),
       (10000, 'c'),
       (20000, 'c'),
       (20000, 'd');

create table tableB(qq string, game string) ;
insert overwrite table tableB values 
(10000, 'a_b_c'),
(20000, 'c_d');
       
--将tableA输出为tableB的格式;  
select qq,
       concat_ws('_', collect_list(game)) game
from tableA
group by qq;   
       
--将tableB输出为tableA的格式;    
select qq,
       tmp.game
from tableB lateral view explode(split(game, '_')) tmp as game;

连续N天登陆

  • --核心代码
    ->distinct
    ->date_add(dt,N-1) as date2
    ->lead(dt,N-1) over(partition by userid order by dt) as date3
    ->where date2=date3
    ->distinct
    
    

    OPPO

3、以下为用户登陆游戏的日期,用一条sQL语句查询出连续三天登录的人员姓名

namedate
张三2021-01-01
张三2021-01-02
张三2021-01-03
张三2021-01-02
李四2021-01-01
李四2021-01-02
王五2021-01-03
王五2021-01-02
王五2021-01-02
create table game(name string,  `date` string);
insert overwrite table game values
('张三','2021-01-01'),
('张三','2021-01-02'),
('张三','2021-01-03'),
('张三','2021-01-02'),

('张三','2021-01-07'),
('张三','2021-01-08'),
('张三','2021-01-09'),

('李四','2021-01-01'),
('李四','2021-01-02'),
('王五','2021-01-03'),
('王五','2021-01-02'),
('王五','2021-01-02');
with t1 as ( select distinct  name,date from game),
     t2 as ( select *,
                    row_number() over (partition by name order by date) rn
         from t1),
     t3 as ( select *,date_sub(date,rn) date2 from t2 )
     select distinct name from t3 group by name,date2 having count(1)>=3;
     
     
--方案二
select * from game;
with t1 as (
    select distinct name,`date` from game
),
    t2 as (
        select *,
               date_add(`date`,3-1) as date2,
               lead(`date`,3-1) over(partition by name order by `date`) as date3
          from t1
    )
select distinct name from t2 where date2=date3;
--方案二的写法2
with t1 as (
    select distinct name,`date` from game
),
    t2 as (
        select *,
               lead(`date`,3-1) over(partition by name order by `date`) as date3
          from t1
    )
select distinct name from t2 where datediff(date3,`date`)=2 ;

脉脉

用户每日登陆脉脉会访问app不同的模块,现有两个表

表1记录了每日脉脉活跃用户的uid和不同模块的活跃时长

表2记录了脉脉所有注册用户的一些属性

表1:maimai.dau

duidmoduleactive_duration列说明
2020-01-011jobs324d:活跃的日期uid:用户的唯一编码module:用户活跃模块actre.duration:该模块下对应的活跃时长(单位:s)
2020-01-012feeds445
2020-01-013im345
2020-01-022network765
2020-01-023jobs342

在过去一个月内,曾连续两天活跃的用户

-- 建表
-- 表1 dau   记录了每日脉脉活跃用户的uid和不同模块的活跃时长
create table dau(d string, uid int, module string, active_duration int);
insert overwrite table dau
values ('2020-01-01', 1, 'jobs', 324),
       ('2020-01-01', 2, 'feeds', 445),
       ('2020-01-01', 3, 'im', 345),
       ('2020-01-02', 2, 'network', 765),
       ('2020-01-02', 3, 'jobs', 342);
select *from dau;


with t1 as (
    select DISTINCT d, uid from dau),
     t2 as (
         select *,
                date_sub(d, (row_number() over (partition by uid order by d))) dis
         from t1
         where d <= `current_date`()
           and d >= date_sub((`current_date`()), 30)),
t3 as (
    select uid,
           min(d)   `开始日期`,
           max(d)   `结束日期`,
           count(1) `连续登入天数`
    from t2
    group by uid,dis
    having count(*) >= 2 
)
select DISTINCT uid from t3 ;

广州银行

有一张表C_T(列举了部分数据)表示持卡人消费记录,表结构如下:

CARD NERVARCHAR2卡号,
C_MONTHNUMBER消费月份,
C_DATEDATE消费日期,
C_TYPEVARCHAR2消费类型
C_ATMNUMBER消费金额

每个月每张卡连续消费的最大天数(如卡在当月只有一次消费则为1)。

连续消费天数:指一楼时间内连续每天都有消费,同一天有多笔消费算一天消费,不能跨月份统计。

create table c_t
(
    card_nbr string,
    c_month  string,
    c_date   string,
    c_type   string,
    c_atm    decimal
);
insert overwrite table c_t values
                               (1,'2022-01','2022-01-01','网购',100),
                               (1,'2022-01','2022-01-02','网购',200),
                               (1,'2022-01','2022-01-03','网购',300),
                               (1,'2022-01','2022-01-15','网购',100),
                               (1,'2022-01','2022-01-16','网购',200),
                               (2,'2022-01','2022-01-06','网购',500),
                               (2,'2022-01','2022-01-07','网购',800),
                               (1,'2022-02','2022-02-01','网购',100),
                               (1,'2022-02','2022-02-02','网购',200),
                               (1,'2022-02','2022-02-03','网购',300),
                               (2,'2022-02','2022-02-06','网购',500),
                               (2,'2022-02','2022-02-07','网购',800);
with t1 as (select distinct card_nbr,c_month,c_date from c_t),
     t2 as (select *,row_number() over (partition by card_nbr,c_month order by c_date) rn from t1  ),
     t3 as (select *,date_sub(c_date,rn) dt2 from t2  ),
     t4 as (select  dt2,card_nbr,c_month,count(1) as cnt from t3 group by dt2,card_nbr,c_month),
     t5 as ( select *,row_number() over (partition by card_nbr,c_month order by cnt desc) as rn from t4)
select card_nbr,c_month,cnt from t5 where rn=1

N日留存率

  • 核心代码

  • 主要还是要利用好行转列的函数
    -> where 日期 in (首日,1天后,7天后)
    -> group by 用户
    ->count(if(日期=首日,1,null))  as cnt
      count(if(日期=1天后,1,null)) as cnt2
      count(if(日期=7天后,1,null)) as cnt8
    ->having cnt>0
    ->count(user_id) as 首日总数
      count(if(cnt2>0,1,null)) as 次日留存数
      count(if(cnt8>0,1,null)) as 7日留存数
    ->次日留存数/首日总数 as 次日留存率
      7日留存数/首日总数 as 7日留存率
    
  • 先按用户分组,得到每个用户的各相关日期的登录情况。

    • select cuid,
             count(if(event_day='2020-04-01',1,null)) as cnt,
             count(if(event_day='2020-04-02',1,null)) as cnt2,
             count(if(event_day='2020-04-08',1,null)) as cnt8
        from tb_cuid_1d
        --提前过滤数据
        where event_day in ('2020-04-01','2020-04-02','2020-04-08')
      group by cuid
      -- 2020-04-01必须登录,剔除掉2020-04-01没登录的
      having cnt>0
      

image.png

  • 再对上面的用户汇总

    • select count(cnt) as uv,
             count(if(cnt2!=0,1,null)) as uv2,
             count(if(cnt8!=0,1,null)) as uv8
      
    • image.png

  • 最后再用 【后续日期的留存数】除以【首日总数】,就是【留存率】

腾讯视频号游戏直播

表:tableA

ds(日期)deviceuser_idis_active
2020-03-01ios00010
2020-03-01ios00021
2020-03-01android00031
2020-03-02ios00010
2020-03-02ios00020
2020-03-02android00031

20200301的ios设备用户活跃的次日留存率是多少?

use test_sql;
set hive.exec.mode.local.auto=true;
--腾讯视频号游戏直播
drop table if exists tableA;
create table tableA
(ds string comment '(日期)'  ,device string,user_id string,is_active int) ;
insert overwrite table  tableA values
('2020-03-01','ios','0001',0),
('2020-03-01','ios','0002',1),
('2020-03-01','ios','0004',1),
('2020-03-01','android','0003',1),
('2020-03-02','ios','0001',0),
('2020-03-02','ios','0002',0),
('2020-03-02','android','0003',1),
('2020-03-02','ios','0005',1) ,
('2020-03-02','ios','0004',1) ;



--方案1,过程见下面的顺序编号
with t1 as (
    select user_id,
           --3-一个用户如果在'2020-03-01'活跃,则cnt1>0
           count(if(ds = '2020-03-01', 1, null)) cnt1,
           --4-一个用户如果在'2020-03-02'活跃,则cnt2>0
           count(if(ds = '2020-03-02' and is_active = 1, 1, null)) cnt2
    from tableA
    --1-预先全局过滤
    where device = 'ios'
      and ( (ds='2020-03-01' and is_active = 1) or ds='2020-03-02')
    --2-按用户分组
    group by user_id
    --6-只筛选'2020-03-01'活跃的用户,他在'2020-03-02'是否活跃,看cnt2=0则不活跃,>0则活跃
    having cnt1 > 0
)
select count(cnt1)                               sum1,--'2020-03-01'的活跃数
       count(if(cnt2 > 0, user_id, null))        sum2,----并且在次日依然活跃的用户数
       count(if(cnt2 > 0, user_id, null)) / count(cnt1) rate--次日留存率
from t1;


百度

有1张表

create table if not exists tb_cuid_1d
(
    cuid         string comment '用户的唯一标识',
    os           string comment '平台',
    soft_version string comment '版本',
    event_day    string comment '日期',
    visit_time    int comment '用户访问时间戳',
    duration     decimal comment '用户访问时长',
    ext          array<string> comment '扩展字段'
);
insert overwrite table tb_cuid_1d values
 (1,'android',1,'2020-04-01',1234567,100,`array`('')),
 (1,'android',1,'2020-04-02',1234567,100,`array`('')),
 (1,'android',1,'2020-04-08',1234567,100,`array`('')),
 (2,'android',1,'2020-04-01',1234567,100,`array`('')),
 (3,'android',1,'2020-04-02',1234567,100,`array`(''));
 
 

写出用户表 tb_cuid_1d的 20200401 的次日、次7日留存的具体HQL :

一条sql统计出以下指标 (4.1号uv,4.1号在4.2号的留存uv,4.1号在4.8号的留存uv);

--一个理解简单,但是性能不快的做法
select count(a.cuid) uv,
       count(b.cuid) uv2,
       count(c.cuid) uv7
from (select distinct event_day, cuid from tb_cuid_1d where event_day='2020-04-01') as a
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='2020-04-02') as b on a.cuid=b.cuid
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='2020-04-08') as c on a.cuid=c.cuid;
--另一个理解稍微复杂,但是性能快的做法
with t1 as (
    select cuid,
           count(if(event_day='2020-04-01',1,null)) as cnt1,
           count(if(event_day='2020-04-02',1,null)) as cnt2,
           count(if(event_day='2020-04-08',1,null)) as cnt8
     from tb_cuid_1d
     where event_day in ('2020-04-01','2020-04-02','2020-04-08')
     group by cuid
     having cnt1 >0
),
     t2 as (select count(cuid)                  as uv1,
                   count(if(cnt2 > 0, 1, null)) as uv2,
                   count(if(cnt8 > 0, 1, null)) as uv7
            from t1
            )
select *,
       uv2 / uv1 as `次日留存率`,
       uv7 / uv1 as `7日留存率`
from t2

分组内top前几

  • 需求常见词:【每组xxx的第一个yyy的zzz】【每组xxx的最后一个】

    【每组xxx的前n个】【每组最xx的前n个】

  • 公式:row_number() over(partition by 组名) as rn,再筛选rn<=N名