hive sql专题1. 行转列(转置) 行转列的常规做法是，group by+sum(if())【或count(if(

1. 行转列(转置)

行转列的常规做法是，group by+sum(if())【或count(if())】

year	month	amount
1991	1	1.1
1991	2	1.2
1991	3	1.3
1991	4	1.4
1992	1	2.1
1992	2	2.2
1992	3	2.3
1992	4	2.4

查成这样一个结果

year	m1	m2	m3	m4
1991	1.1	1.2	1.3	1.4
1992	2.1	2.2	2.3	2.4

解答

use test_sql;
set hive.exec.mode.local.auto=true;
create table table2(year int,month int ,amount double) ;
 insert overwrite table table2 values
           (1991,1,1.1),
           (1991,2,1.2),
           (1991,3,1.3),
           (1991,4,1.4),
           (1992,1,2.1),
           (1992,2,2.2),
           (1992,3,2.3),
           (1992,4,2.4);
select * from table2;
--行转列
--常规做法是，group by+sum(if())
--SQLserver中有pivot专门用来行转列
--原始写法
select year,
       sum(a) as m1,
       sum(b) as m2,
       sum(c) as m3,
       sum(d) as m4
from
    (select *,
            if(month=1,amount,0) a,
            if(month=2,amount,0) b,
            if(month=3,amount,0) c,
            if(month=4,amount,0) d
     from table2) t
group by t.year
;
--简化写法
select year,
       sum(if(month=1,amount,0)) m1,
       sum(if(month=2,amount,0)) m2,
       sum(if(month=3,amount,0)) m3,
       sum(if(month=4,amount,0)) m4
from table2
group by year;

华泰证券2

查询课程编号“2”的成绩比课程编号“1”低的所有同学的学号、姓名。
【这是行转列的衍生题】

create table student(sid int, sname string, gender string, class_id int);
insert overwrite table student
values (1, '张三', '女', 1),
       (2, '李四', '女', 1),
       (3, '王五', '男', 2);
create table  course (cid int, cname string, teacher_id int);
insert overwrite table course
values (1, '生物', 1),
       (2, '体育', 1),
       (3, '物理', 2);
select * from course;

create table score (sid int, student_id int, course_id int, number int);
insert overwrite table score
values (1, 1, 1, 58),
       (4, 1, 2, 50),
       (2, 1, 2, 68),
       (3, 2, 2, 89);
with t1 as(
    select student_id,
       sum(if(course_id=2,number,0)) as pe, --体育
       sum(if(course_id=1,number,0)) as bio --生物
from score
group by student_id
having pe<bio)
select sid, sname
from t1
join student
on t1.student_id = sid
;

腾讯游戏

表table如下：

DDate	shengfu
2015-05-09	胜
2015-05-09	胜
2015-05-09	负
2015-05-09	负
2015-05-10	胜
2015-05-10	负
2015-05-10	负

如果要生成下列结果, 该如何写sql语句?

DDate	胜	负
2015-05-09	2	2
2015-05-10	1	2

--建表
create table table1(DDate string, shengfu string) ;
insert overwrite table table1 values ('2015-05-09', "胜"),
       ('2015-05-09', "胜"),
       ('2015-05-09', "负"),
       ('2015-05-09', "负"),
       ('2015-05-10', "胜"),
       ('2015-05-10', "负"),
       ('2015-05-10', "负");

select DDate,
       SUM(case when shengfu = '胜' then 1 else 0 end) `胜`,
       SUM(case when shengfu = '负' then 1 else 0 end) `负`
from table1
group by DDate;

腾讯QQ

假设tableA如表5, tableB如表6,

表5

qq号（字段名：qq）	游戏（字段名：game）
10000	a
10000	b
10000	c
20000	c
20000	d

表6

qq号（字段名：qq）	游戏（字段名：game）
10000	a_b_c
20000	c_d

请写出以下sql逻辑：

a, 将tableA输出为tableB的格式；【行转列】

b, 将tableB输出为tableA的格式; 【列转行】

create table tableA(qq string, game string) 
insert overwrite table tableA values 
       (10000, 'a'),
       (10000, 'b'),
       (10000, 'c'),
       (20000, 'c'),
       (20000, 'd');

create table tableB(qq string, game string) ;
insert overwrite table tableB values 
(10000, 'a_b_c'),
(20000, 'c_d');
       
--将tableA输出为tableB的格式；  
select qq,
       concat_ws('_', collect_list(game)) game
from tableA
group by qq;   
       
--将tableB输出为tableA的格式;    
select qq,
       tmp.game
from tableB lateral view explode(split(game, '_')) tmp as game;

连续N天登陆

--核心代码
->distinct
->date_add(dt,N-1) as date2
->lead(dt,N-1) over(partition by userid order by dt) as date3
->where date2=date3
->distinct

OPPO

3、以下为用户登陆游戏的日期，用一条sQL语句查询出连续三天登录的人员姓名

name	date
张三	2021-01-01
张三	2021-01-02
张三	2021-01-03
张三	2021-01-02
李四	2021-01-01
李四	2021-01-02
王五	2021-01-03
王五	2021-01-02
王五	2021-01-02

create table game(name string,  `date` string);
insert overwrite table game values
('张三','2021-01-01'),
('张三','2021-01-02'),
('张三','2021-01-03'),
('张三','2021-01-02'),

('张三','2021-01-07'),
('张三','2021-01-08'),
('张三','2021-01-09'),

('李四','2021-01-01'),
('李四','2021-01-02'),
('王五','2021-01-03'),
('王五','2021-01-02'),
('王五','2021-01-02');
with t1 as ( select distinct  name,date from game),
     t2 as ( select *,
                    row_number() over (partition by name order by date) rn
         from t1),
     t3 as ( select *,date_sub(date,rn) date2 from t2 )
     select distinct name from t3 group by name,date2 having count(1)>=3;
     
     
--方案二
select * from game;
with t1 as (
    select distinct name,`date` from game
),
    t2 as (
        select *,
               date_add(`date`,3-1) as date2,
               lead(`date`,3-1) over(partition by name order by `date`) as date3
          from t1
    )
select distinct name from t2 where date2=date3;
--方案二的写法2
with t1 as (
    select distinct name,`date` from game
),
    t2 as (
        select *,
               lead(`date`,3-1) over(partition by name order by `date`) as date3
          from t1
    )
select distinct name from t2 where datediff(date3,`date`)=2 ;

脉脉

用户每日登陆脉脉会访问app不同的模块，现有两个表

表1记录了每日脉脉活跃用户的uid和不同模块的活跃时长

表2记录了脉脉所有注册用户的一些属性

表1：maimai.dau

d	uid	module	active_duration	列说明
2020-01-01	1	jobs	324	d：活跃的日期uid：用户的唯一编码module：用户活跃模块actre.duration：该模块下对应的活跃时长（单位：s）
2020-01-01	2	feeds	445
2020-01-01	3	im	345
2020-01-02	2	network	765
2020-01-02	3	jobs	342
…	…	…	…

在过去一个月内,曾连续两天活跃的用户

-- 建表
-- 表1 dau   记录了每日脉脉活跃用户的uid和不同模块的活跃时长
create table dau(d string, uid int, module string, active_duration int);
insert overwrite table dau
values ('2020-01-01', 1, 'jobs', 324),
       ('2020-01-01', 2, 'feeds', 445),
       ('2020-01-01', 3, 'im', 345),
       ('2020-01-02', 2, 'network', 765),
       ('2020-01-02', 3, 'jobs', 342);
select *from dau;


with t1 as (
    select DISTINCT d, uid from dau),
     t2 as (
         select *,
                date_sub(d, (row_number() over (partition by uid order by d))) dis
         from t1
         where d <= `current_date`()
           and d >= date_sub((`current_date`()), 30)),
t3 as (
    select uid,
           min(d)   `开始日期`,
           max(d)   `结束日期`,
           count(1) `连续登入天数`
    from t2
    group by uid,dis
    having count(*) >= 2 
)
select DISTINCT uid from t3 ;

广州银行

有一张表C_T（列举了部分数据）表示持卡人消费记录，表结构如下：

CARD NER	VARCHAR2	卡号，
C_MONTH	NUMBER	消费月份，
C_DATE	DATE	消费日期，
C_TYPEVAR	CHAR2	消费类型
C_ATM	NUMBER	消费金额

每个月每张卡连续消费的最大天数（如卡在当月只有一次消费则为1）。

连续消费天数：指一楼时间内连续每天都有消费，同一天有多笔消费算一天消费，不能跨月份统计。

create table c_t
(
    card_nbr string,
    c_month  string,
    c_date   string,
    c_type   string,
    c_atm    decimal
);
insert overwrite table c_t values
                               (1,'2022-01','2022-01-01','网购',100),
                               (1,'2022-01','2022-01-02','网购',200),
                               (1,'2022-01','2022-01-03','网购',300),
                               (1,'2022-01','2022-01-15','网购',100),
                               (1,'2022-01','2022-01-16','网购',200),
                               (2,'2022-01','2022-01-06','网购',500),
                               (2,'2022-01','2022-01-07','网购',800),
                               (1,'2022-02','2022-02-01','网购',100),
                               (1,'2022-02','2022-02-02','网购',200),
                               (1,'2022-02','2022-02-03','网购',300),
                               (2,'2022-02','2022-02-06','网购',500),
                               (2,'2022-02','2022-02-07','网购',800);
with t1 as (select distinct card_nbr,c_month,c_date from c_t),
     t2 as (select *,row_number() over (partition by card_nbr,c_month order by c_date) rn from t1  ),
     t3 as (select *,date_sub(c_date,rn) dt2 from t2  ),
     t4 as (select  dt2,card_nbr,c_month,count(1) as cnt from t3 group by dt2,card_nbr,c_month),
     t5 as ( select *,row_number() over (partition by card_nbr,c_month order by cnt desc) as rn from t4)
select card_nbr,c_month,cnt from t5 where rn=1

N日留存率

核心代码

主要还是要利用好行转列的函数
-> where 日期 in (首日,1天后,7天后)
-> group by 用户
->count(if(日期=首日,1,null))  as cnt
  count(if(日期=1天后,1,null)) as cnt2
  count(if(日期=7天后,1,null)) as cnt8
->having cnt>0
->count(user_id) as 首日总数
  count(if(cnt2>0,1,null)) as 次日留存数
  count(if(cnt8>0,1,null)) as 7日留存数
->次日留存数/首日总数 as 次日留存率
  7日留存数/首日总数 as 7日留存率

先按用户分组，得到每个用户的各相关日期的登录情况。

select cuid,
       count(if(event_day='2020-04-01',1,null)) as cnt,
       count(if(event_day='2020-04-02',1,null)) as cnt2,
       count(if(event_day='2020-04-08',1,null)) as cnt8
  from tb_cuid_1d
  --提前过滤数据
  where event_day in ('2020-04-01','2020-04-02','2020-04-08')
group by cuid
-- 2020-04-01必须登录，剔除掉2020-04-01没登录的
having cnt>0

再对上面的用户汇总

select count(cnt) as uv,
       count(if(cnt2!=0,1,null)) as uv2,
       count(if(cnt8!=0,1,null)) as uv8

最后再用【后续日期的留存数】除以【首日总数】，就是【留存率】

腾讯视频号游戏直播

表：tableA

ds(日期)	device	user_id	is_active
2020-03-01	ios	0001	0
2020-03-01	ios	0002	1
2020-03-01	android	0003	1
2020-03-02	ios	0001	0
2020-03-02	ios	0002	0
2020-03-02	android	0003	1

20200301的ios设备用户活跃的次日留存率是多少？

use test_sql;
set hive.exec.mode.local.auto=true;
--腾讯视频号游戏直播
drop table if exists tableA;
create table tableA
(ds string comment '(日期)'  ,device string,user_id string,is_active int) ;
insert overwrite table  tableA values
('2020-03-01','ios','0001',0),
('2020-03-01','ios','0002',1),
('2020-03-01','ios','0004',1),
('2020-03-01','android','0003',1),
('2020-03-02','ios','0001',0),
('2020-03-02','ios','0002',0),
('2020-03-02','android','0003',1),
('2020-03-02','ios','0005',1) ,
('2020-03-02','ios','0004',1) ;



--方案1，过程见下面的顺序编号
with t1 as (
    select user_id,
           --3-一个用户如果在'2020-03-01'活跃，则cnt1>0
           count(if(ds = '2020-03-01', 1, null)) cnt1,
           --4-一个用户如果在'2020-03-02'活跃，则cnt2>0
           count(if(ds = '2020-03-02' and is_active = 1, 1, null)) cnt2
    from tableA
    --1-预先全局过滤
    where device = 'ios'
      and ( (ds='2020-03-01' and is_active = 1) or ds='2020-03-02')
    --2-按用户分组
    group by user_id
    --6-只筛选'2020-03-01'活跃的用户，他在'2020-03-02'是否活跃，看cnt2=0则不活跃，>0则活跃
    having cnt1 > 0
)
select count(cnt1)                               sum1,--'2020-03-01'的活跃数
       count(if(cnt2 > 0, user_id, null))        sum2,----并且在次日依然活跃的用户数
       count(if(cnt2 > 0, user_id, null)) / count(cnt1) rate--次日留存率
from t1;

百度

有1张表

create table if not exists tb_cuid_1d
(
    cuid         string comment '用户的唯一标识',
    os           string comment '平台',
    soft_version string comment '版本',
    event_day    string comment '日期',
    visit_time    int comment '用户访问时间戳',
    duration     decimal comment '用户访问时长',
    ext          array<string> comment '扩展字段'
);
insert overwrite table tb_cuid_1d values
 (1,'android',1,'2020-04-01',1234567,100,`array`('')),
 (1,'android',1,'2020-04-02',1234567,100,`array`('')),
 (1,'android',1,'2020-04-08',1234567,100,`array`('')),
 (2,'android',1,'2020-04-01',1234567,100,`array`('')),
 (3,'android',1,'2020-04-02',1234567,100,`array`(''));

写出用户表 tb_cuid_1d的 20200401 的次日、次7日留存的具体HQL ：

一条sql统计出以下指标（4.1号uv，4.1号在4.2号的留存uv，4.1号在4.8号的留存uv）;

--一个理解简单，但是性能不快的做法
select count(a.cuid) uv,
       count(b.cuid) uv2,
       count(c.cuid) uv7
from (select distinct event_day, cuid from tb_cuid_1d where event_day='2020-04-01') as a
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='2020-04-02') as b on a.cuid=b.cuid
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='2020-04-08') as c on a.cuid=c.cuid;
--另一个理解稍微复杂，但是性能快的做法
with t1 as (
    select cuid,
           count(if(event_day='2020-04-01',1,null)) as cnt1,
           count(if(event_day='2020-04-02',1,null)) as cnt2,
           count(if(event_day='2020-04-08',1,null)) as cnt8
     from tb_cuid_1d
     where event_day in ('2020-04-01','2020-04-02','2020-04-08')
     group by cuid
     having cnt1 >0
),
     t2 as (select count(cuid)                  as uv1,
                   count(if(cnt2 > 0, 1, null)) as uv2,
                   count(if(cnt8 > 0, 1, null)) as uv7
            from t1
            )
select *,
       uv2 / uv1 as `次日留存率`,
       uv7 / uv1 as `7日留存率`
from t2

分组内top前几

需求常见词：【每组xxx的第一个yyy的zzz】【每组xxx的最后一个】

【每组xxx的前n个】【每组最xx的前n个】
公式：row_number() over(partition by 组名) as rn，再筛选rn<=N名