1. 行转列(转置)
行转列的常规做法是,group by+sum(if())【或count(if())】
| year | month | amount |
|---|---|---|
| 1991 | 1 | 1.1 |
| 1991 | 2 | 1.2 |
| 1991 | 3 | 1.3 |
| 1991 | 4 | 1.4 |
| 1992 | 1 | 2.1 |
| 1992 | 2 | 2.2 |
| 1992 | 3 | 2.3 |
| 1992 | 4 | 2.4 |
查成这样一个结果
| year | m1 | m2 | m3 | m4 |
|---|---|---|---|---|
| 1991 | 1.1 | 1.2 | 1.3 | 1.4 |
| 1992 | 2.1 | 2.2 | 2.3 | 2.4 |
解答
-
use test_sql; set hive.exec.mode.local.auto=true; create table table2(year int,month int ,amount double) ; insert overwrite table table2 values (1991,1,1.1), (1991,2,1.2), (1991,3,1.3), (1991,4,1.4), (1992,1,2.1), (1992,2,2.2), (1992,3,2.3), (1992,4,2.4); select * from table2; --行转列 --常规做法是,group by+sum(if()) --SQLserver中有pivot专门用来行转列 --原始写法 select year, sum(a) as m1, sum(b) as m2, sum(c) as m3, sum(d) as m4 from (select *, if(month=1,amount,0) a, if(month=2,amount,0) b, if(month=3,amount,0) c, if(month=4,amount,0) d from table2) t group by t.year ; --简化写法 select year, sum(if(month=1,amount,0)) m1, sum(if(month=2,amount,0)) m2, sum(if(month=3,amount,0)) m3, sum(if(month=4,amount,0)) m4 from table2 group by year;华泰证券2
-
查询课程编号“2”的成绩比课程编号“1”低的所有同学的学号、姓名。
-
【这是行转列的衍生题】
-
create table student(sid int, sname string, gender string, class_id int); insert overwrite table student values (1, '张三', '女', 1), (2, '李四', '女', 1), (3, '王五', '男', 2); create table course (cid int, cname string, teacher_id int); insert overwrite table course values (1, '生物', 1), (2, '体育', 1), (3, '物理', 2); select * from course; create table score (sid int, student_id int, course_id int, number int); insert overwrite table score values (1, 1, 1, 58), (4, 1, 2, 50), (2, 1, 2, 68), (3, 2, 2, 89); with t1 as( select student_id, sum(if(course_id=2,number,0)) as pe, --体育 sum(if(course_id=1,number,0)) as bio --生物 from score group by student_id having pe<bio) select sid, sname from t1 join student on t1.student_id = sid ;腾讯游戏
表table如下:
| DDate | shengfu |
|---|---|
| 2015-05-09 | 胜 |
| 2015-05-09 | 胜 |
| 2015-05-09 | 负 |
| 2015-05-09 | 负 |
| 2015-05-10 | 胜 |
| 2015-05-10 | 负 |
| 2015-05-10 | 负 |
如果要生成下列结果, 该如何写sql语句?
| DDate | 胜 | 负 |
|---|---|---|
| 2015-05-09 | 2 | 2 |
| 2015-05-10 | 1 | 2 |
--建表
create table table1(DDate string, shengfu string) ;
insert overwrite table table1 values ('2015-05-09', "胜"),
('2015-05-09', "胜"),
('2015-05-09', "负"),
('2015-05-09', "负"),
('2015-05-10', "胜"),
('2015-05-10', "负"),
('2015-05-10', "负");
select DDate,
SUM(case when shengfu = '胜' then 1 else 0 end) `胜`,
SUM(case when shengfu = '负' then 1 else 0 end) `负`
from table1
group by DDate;
腾讯QQ
假设tableA如表5, tableB如表6,
表5
| qq号(字段名:qq) | 游戏(字段名:game) |
|---|---|
| 10000 | a |
| 10000 | b |
| 10000 | c |
| 20000 | c |
| 20000 | d |
表6
| qq号(字段名:qq) | 游戏(字段名:game) |
|---|---|
| 10000 | a_b_c |
| 20000 | c_d |
请写出以下sql逻辑:
a, 将tableA输出为tableB的格式; 【行转列】
b, 将tableB输出为tableA的格式; 【列转行】
create table tableA(qq string, game string)
insert overwrite table tableA values
(10000, 'a'),
(10000, 'b'),
(10000, 'c'),
(20000, 'c'),
(20000, 'd');
create table tableB(qq string, game string) ;
insert overwrite table tableB values
(10000, 'a_b_c'),
(20000, 'c_d');
--将tableA输出为tableB的格式;
select qq,
concat_ws('_', collect_list(game)) game
from tableA
group by qq;
--将tableB输出为tableA的格式;
select qq,
tmp.game
from tableB lateral view explode(split(game, '_')) tmp as game;
连续N天登陆
-
--核心代码 ->distinct ->date_add(dt,N-1) as date2 ->lead(dt,N-1) over(partition by userid order by dt) as date3 ->where date2=date3 ->distinctOPPO
3、以下为用户登陆游戏的日期,用一条sQL语句查询出连续三天登录的人员姓名
| name | date |
|---|---|
| 张三 | 2021-01-01 |
| 张三 | 2021-01-02 |
| 张三 | 2021-01-03 |
| 张三 | 2021-01-02 |
| 李四 | 2021-01-01 |
| 李四 | 2021-01-02 |
| 王五 | 2021-01-03 |
| 王五 | 2021-01-02 |
| 王五 | 2021-01-02 |
create table game(name string, `date` string);
insert overwrite table game values
('张三','2021-01-01'),
('张三','2021-01-02'),
('张三','2021-01-03'),
('张三','2021-01-02'),
('张三','2021-01-07'),
('张三','2021-01-08'),
('张三','2021-01-09'),
('李四','2021-01-01'),
('李四','2021-01-02'),
('王五','2021-01-03'),
('王五','2021-01-02'),
('王五','2021-01-02');
with t1 as ( select distinct name,date from game),
t2 as ( select *,
row_number() over (partition by name order by date) rn
from t1),
t3 as ( select *,date_sub(date,rn) date2 from t2 )
select distinct name from t3 group by name,date2 having count(1)>=3;
--方案二
select * from game;
with t1 as (
select distinct name,`date` from game
),
t2 as (
select *,
date_add(`date`,3-1) as date2,
lead(`date`,3-1) over(partition by name order by `date`) as date3
from t1
)
select distinct name from t2 where date2=date3;
--方案二的写法2
with t1 as (
select distinct name,`date` from game
),
t2 as (
select *,
lead(`date`,3-1) over(partition by name order by `date`) as date3
from t1
)
select distinct name from t2 where datediff(date3,`date`)=2 ;
脉脉
用户每日登陆脉脉会访问app不同的模块,现有两个表
表1记录了每日脉脉活跃用户的uid和不同模块的活跃时长
表2记录了脉脉所有注册用户的一些属性
表1:maimai.dau
| d | uid | module | active_duration | 列说明 |
|---|---|---|---|---|
| 2020-01-01 | 1 | jobs | 324 | d:活跃的日期uid:用户的唯一编码module:用户活跃模块actre.duration:该模块下对应的活跃时长(单位:s) |
| 2020-01-01 | 2 | feeds | 445 | |
| 2020-01-01 | 3 | im | 345 | |
| 2020-01-02 | 2 | network | 765 | |
| 2020-01-02 | 3 | jobs | 342 | |
| … | … | … | … |
在过去一个月内,曾连续两天活跃的用户
-- 建表
-- 表1 dau 记录了每日脉脉活跃用户的uid和不同模块的活跃时长
create table dau(d string, uid int, module string, active_duration int);
insert overwrite table dau
values ('2020-01-01', 1, 'jobs', 324),
('2020-01-01', 2, 'feeds', 445),
('2020-01-01', 3, 'im', 345),
('2020-01-02', 2, 'network', 765),
('2020-01-02', 3, 'jobs', 342);
select *from dau;
with t1 as (
select DISTINCT d, uid from dau),
t2 as (
select *,
date_sub(d, (row_number() over (partition by uid order by d))) dis
from t1
where d <= `current_date`()
and d >= date_sub((`current_date`()), 30)),
t3 as (
select uid,
min(d) `开始日期`,
max(d) `结束日期`,
count(1) `连续登入天数`
from t2
group by uid,dis
having count(*) >= 2
)
select DISTINCT uid from t3 ;
广州银行
有一张表C_T(列举了部分数据)表示持卡人消费记录,表结构如下:
| CARD NER | VARCHAR2 | 卡号, |
|---|---|---|
| C_MONTH | NUMBER | 消费月份, |
| C_DATE | DATE | 消费日期, |
| C_TYPEVAR | CHAR2 | 消费类型 |
| C_ATM | NUMBER | 消费金额 |
每个月每张卡连续消费的最大天数(如卡在当月只有一次消费则为1)。
连续消费天数:指一楼时间内连续每天都有消费,同一天有多笔消费算一天消费,不能跨月份统计。
create table c_t
(
card_nbr string,
c_month string,
c_date string,
c_type string,
c_atm decimal
);
insert overwrite table c_t values
(1,'2022-01','2022-01-01','网购',100),
(1,'2022-01','2022-01-02','网购',200),
(1,'2022-01','2022-01-03','网购',300),
(1,'2022-01','2022-01-15','网购',100),
(1,'2022-01','2022-01-16','网购',200),
(2,'2022-01','2022-01-06','网购',500),
(2,'2022-01','2022-01-07','网购',800),
(1,'2022-02','2022-02-01','网购',100),
(1,'2022-02','2022-02-02','网购',200),
(1,'2022-02','2022-02-03','网购',300),
(2,'2022-02','2022-02-06','网购',500),
(2,'2022-02','2022-02-07','网购',800);
with t1 as (select distinct card_nbr,c_month,c_date from c_t),
t2 as (select *,row_number() over (partition by card_nbr,c_month order by c_date) rn from t1 ),
t3 as (select *,date_sub(c_date,rn) dt2 from t2 ),
t4 as (select dt2,card_nbr,c_month,count(1) as cnt from t3 group by dt2,card_nbr,c_month),
t5 as ( select *,row_number() over (partition by card_nbr,c_month order by cnt desc) as rn from t4)
select card_nbr,c_month,cnt from t5 where rn=1
N日留存率
-
核心代码
-
主要还是要利用好行转列的函数 -> where 日期 in (首日,1天后,7天后) -> group by 用户 ->count(if(日期=首日,1,null)) as cnt count(if(日期=1天后,1,null)) as cnt2 count(if(日期=7天后,1,null)) as cnt8 ->having cnt>0 ->count(user_id) as 首日总数 count(if(cnt2>0,1,null)) as 次日留存数 count(if(cnt8>0,1,null)) as 7日留存数 ->次日留存数/首日总数 as 次日留存率 7日留存数/首日总数 as 7日留存率 -
先按用户分组,得到每个用户的各相关日期的登录情况。
-
select cuid, count(if(event_day='2020-04-01',1,null)) as cnt, count(if(event_day='2020-04-02',1,null)) as cnt2, count(if(event_day='2020-04-08',1,null)) as cnt8 from tb_cuid_1d --提前过滤数据 where event_day in ('2020-04-01','2020-04-02','2020-04-08') group by cuid -- 2020-04-01必须登录,剔除掉2020-04-01没登录的 having cnt>0
-
-
再对上面的用户汇总
-
select count(cnt) as uv, count(if(cnt2!=0,1,null)) as uv2, count(if(cnt8!=0,1,null)) as uv8 -
-
-
最后再用 【后续日期的留存数】除以【首日总数】,就是【留存率】
腾讯视频号游戏直播
表:tableA
| ds(日期) | device | user_id | is_active |
|---|---|---|---|
| 2020-03-01 | ios | 0001 | 0 |
| 2020-03-01 | ios | 0002 | 1 |
| 2020-03-01 | android | 0003 | 1 |
| 2020-03-02 | ios | 0001 | 0 |
| 2020-03-02 | ios | 0002 | 0 |
| 2020-03-02 | android | 0003 | 1 |
20200301的ios设备用户活跃的次日留存率是多少?
use test_sql;
set hive.exec.mode.local.auto=true;
--腾讯视频号游戏直播
drop table if exists tableA;
create table tableA
(ds string comment '(日期)' ,device string,user_id string,is_active int) ;
insert overwrite table tableA values
('2020-03-01','ios','0001',0),
('2020-03-01','ios','0002',1),
('2020-03-01','ios','0004',1),
('2020-03-01','android','0003',1),
('2020-03-02','ios','0001',0),
('2020-03-02','ios','0002',0),
('2020-03-02','android','0003',1),
('2020-03-02','ios','0005',1) ,
('2020-03-02','ios','0004',1) ;
--方案1,过程见下面的顺序编号
with t1 as (
select user_id,
--3-一个用户如果在'2020-03-01'活跃,则cnt1>0
count(if(ds = '2020-03-01', 1, null)) cnt1,
--4-一个用户如果在'2020-03-02'活跃,则cnt2>0
count(if(ds = '2020-03-02' and is_active = 1, 1, null)) cnt2
from tableA
--1-预先全局过滤
where device = 'ios'
and ( (ds='2020-03-01' and is_active = 1) or ds='2020-03-02')
--2-按用户分组
group by user_id
--6-只筛选'2020-03-01'活跃的用户,他在'2020-03-02'是否活跃,看cnt2=0则不活跃,>0则活跃
having cnt1 > 0
)
select count(cnt1) sum1,--'2020-03-01'的活跃数
count(if(cnt2 > 0, user_id, null)) sum2,----并且在次日依然活跃的用户数
count(if(cnt2 > 0, user_id, null)) / count(cnt1) rate--次日留存率
from t1;
百度
有1张表
create table if not exists tb_cuid_1d
(
cuid string comment '用户的唯一标识',
os string comment '平台',
soft_version string comment '版本',
event_day string comment '日期',
visit_time int comment '用户访问时间戳',
duration decimal comment '用户访问时长',
ext array<string> comment '扩展字段'
);
insert overwrite table tb_cuid_1d values
(1,'android',1,'2020-04-01',1234567,100,`array`('')),
(1,'android',1,'2020-04-02',1234567,100,`array`('')),
(1,'android',1,'2020-04-08',1234567,100,`array`('')),
(2,'android',1,'2020-04-01',1234567,100,`array`('')),
(3,'android',1,'2020-04-02',1234567,100,`array`(''));
写出用户表 tb_cuid_1d的 20200401 的次日、次7日留存的具体HQL :
一条sql统计出以下指标 (4.1号uv,4.1号在4.2号的留存uv,4.1号在4.8号的留存uv);
--一个理解简单,但是性能不快的做法
select count(a.cuid) uv,
count(b.cuid) uv2,
count(c.cuid) uv7
from (select distinct event_day, cuid from tb_cuid_1d where event_day='2020-04-01') as a
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='2020-04-02') as b on a.cuid=b.cuid
left join (select distinct event_day, cuid from tb_cuid_1d where event_day='2020-04-08') as c on a.cuid=c.cuid;
--另一个理解稍微复杂,但是性能快的做法
with t1 as (
select cuid,
count(if(event_day='2020-04-01',1,null)) as cnt1,
count(if(event_day='2020-04-02',1,null)) as cnt2,
count(if(event_day='2020-04-08',1,null)) as cnt8
from tb_cuid_1d
where event_day in ('2020-04-01','2020-04-02','2020-04-08')
group by cuid
having cnt1 >0
),
t2 as (select count(cuid) as uv1,
count(if(cnt2 > 0, 1, null)) as uv2,
count(if(cnt8 > 0, 1, null)) as uv7
from t1
)
select *,
uv2 / uv1 as `次日留存率`,
uv7 / uv1 as `7日留存率`
from t2
分组内top前几
-
需求常见词:【每组xxx的第一个yyy的zzz】【每组xxx的最后一个】
【每组xxx的前n个】【每组最xx的前n个】
-
公式:row_number() over(partition by 组名) as rn,再筛选rn<=N名