1. 连续三天登录
SELECT
id
FROM
(SELECT
t2.id,
t2.login_date,
DATE_SUB(t2.login_date, INTERVAL t2.rk DAY) AS continus_date
FROM
(SELECT
t1.id,
t1.login_date,
// row_number() 在同一个id里面,根据login_date由大到小排序;一个xin的id是一个新的排序;(每个id的rk都是从1开始)
ROW_NUMBER() OVER(PARTITION BY t1.id ORDER BY t1.login_date) AS rk
FROM
(
SELECT
id,
DATE_FORMAT(login_date,
FROM
login
GROUP BY
id, DATE_FORMAT(login_date,
) AS t1) AS t2) AS t3
GROUP BY
t3.id, t3.continus_date
HAVING COUNT(id) >= 3;
2. 一张用户日活信息表,用户某一天的用户info为空,需要用该用户的最近一天的info去补,请问该如何实现.
SELECT
dt,
uid,
IF(info IS NULL, IF(ABS(DATEDIFF(lag_dt, dt)) <= ABS(DATEDIFF(lead_dt, dt)), lag_info, lead_info), info) AS info
FROM
(SELECT
uid,
info,
DATE_FORMAT(dt, '%Y-%m-%d') AS dt,
// uid分组,dt排序后,当前这条数据的lag_dt等于上一条数据的dt
LAG(dt, 1, '1970-01-01') OVER(PARTITION BY uid ORDER BY dt ASC)AS lag_dt,
// uid分组,dt排序后,当前这条数据的lead_dt等于下一条数据的dt
LEAD(dt, 1, '9999-12-31') OVER(PARTITION BY uid ORDER BY dt ASC) AS lead_dt,
LAG(info, 1, NULL) OVER(PARTITION BY uid ORDER BY dt ASC)AS lag_info,
LEAD(info, 1, NULL) OVER(PARTITION BY uid ORDER BY dt ASC)AS lead_info
FROM
user_info) AS t1
ORDER BY
uid;
3. 会话生成(2次时间超过60s算一个新的会话)
SELECT
user_id,
page_id,
view_timestamp,
// 根据user_id进行分组,view_timestamp排序)
CONCAT(user_id, '-' ,SUM(id) OVER(PARTITION BY user_id ORDER BY view_timestamp)) AS session_id
FROM
(SELECT
user_id,
page_id,
view_timestamp,
IF(view_timestamp - lag_timestamp > 60, 1, 0) AS id
FROM
(SELECT
user_id,
page_id,
view_timestamp,
LAG(view_timestamp, 1, 0) OVER(PARTITION BY user_id ORDER BY view_timestamp) AS lag_timestamp
FROM
page_view_events) AS t1) AS t2
4. 间断连续登录用户(1 3 5 6登录,算连续登录6天)
方法一
SELECT
user_id,
new_login_date
FROM
(
SELECT
id,
login_date,
IF (DATEDIFF(next_login_date, login_date) = 2, array(login_date, DATE_ADD(login_date, 1)), array(login_date)) AS array_date
FROM
(
SELECT
id,
login_date,
LEAD(login_date, 1, '9999-12-31') OVER(PARTITION BY id ORDER BY login_date) AS next_login_date
FROM
(
SELECT
id,
DATE_FORMAT(login_date, '%Y-%m-%d') AS login_date
FROM
login
GROUP BY
id, DATE_FORMAT(login_date, '%Y-%m-%d')
) AS t1
) AS t2
) AS t3
LATERAL VIEW EXPLODE(array_date) tmp AS new_login_date
再回到求连续用户的SQL上.
方法二
SELECT
DISTINCT id
FROM
(
SELECT
id,
login_date,
SUM(diff) OVER(PARTITION BY id ORDER BY login_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS continus_lab
FROM
(
SELECT
id,
login_date,
IF (DATEDIFF(login_date, lag_date) > 1, 1, 0) AS diff
FROM
(
SELECT
id,
login_date,
LAG(login_date, 1, '1970-01-01') OVER(PARTITION BY id ORDER BY login_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lag_date
FROM
(
SELECT
id,
DATE_FORMAT(login_date, '%Y-%m-%d') AS login_date
FROM
login
GROUP BY
id, DATE_FORMAT(login_date, '%Y-%m-%d')
) AS t1
) AS t2
) AS t3
) AS t4
GROUP BY
id, continus_lab
HAVING COUNT(*) >= 3
5. 优惠日期统计(日期去重后的总天数)
方法一
SELECT
name,
SUM(DATEDIFF(end_time, start_time) + 1) AS cnt
FROM
(
SELECT
name,
IF(max_end_time IS NULL, start_time, IF(start_time > max_end_time, start_time, DATE_ADD(max_end_time, INTERVAL 1 DAY))) AS start_time,
end_time
FROM
(
SELECT
name,
start_time,
end_time,
MAX(end_time) OVER(PARTITION BY name ORDER BY start_time ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS max_end_time
FROM
brand
) AS t1
) AS t2
WHERE
end_time > start_time
GROUP BY
names
方法二
SELECT
name,
COUNT(DISTINCT event_time) AS cnt
FROM
(
SELECT
name,
DATE_ADD(start_time, pos) AS event_time
FROM
(
SELECT
name,
start_time,
end_time,
diff,
SPLIT(REPEAT(',', diff), ',') AS arr
FROM
(
SELECT
name,
start_time,
end_time,
datediff(end_time, start_time) AS diff
FROM
brand
) AS t1
) AS t2
LATERAL VIEW POSEXPLODE(arr) tmp AS pos, item
) AS t3
GROUP BY
name
6. 统计同时在线的人数
SELECT
MAX(num) AS max_num
FROM
(
SELECT
uid,
event_time,
SUM(sa) OVER(ORDER BY event_time ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS num
FROM
(
SELECT
uid,
login_time AS event_time,
1 AS sa
FROM
user_status
UNION ALL
SELECT
uid,
logout_time AS event_time,
-1 AS sa
FROM
user_status
) AS t1
) AS t2
7. 去除部门最高工资和最低工资后的平均工资
SELECT
dept_no,
AVG(salary) AS avg_salary
FROM
(
SELECT
uid,
salary,
dept_no,
ROW_NUMBER() OVER(PARTITION BY dept_no ORDER BY salary) AS rk1,
ROW_NUMBER() OVER(PARTITION BY dept_no ORDER BY salary DESC) AS rk2
FROM
dept_salary
) t1
WHERE
rk1 > 1 AND rk2 > 1
GROUP BY
dept_no
8. 求连续点击三次的用户数,而且中间不能有别人的点击
SELECT
DISTINCT uid
FROM
(
SELECT
*,
rk2 - rk1 AS rk
FROM
(
SELECT
uid,
click_time,
ROW_NUMBER() OVER(PARTITION BY uid ORDER BY click_time) AS rk1,
ROW_NUMBER() OVER(ORDER BY click_time) AS rk2
FROM
click
) t1
) t2
GROUP BY
uid, rk
HAVING COUNT(*) >= 3
9. 留存的计算
SELECT
'2020-06-14',
t1.date_id AS create_date,
DATEDIFF('2020-06-14', t1.date_id) AS retention_day,
SUM(IF(t1.date_id = '2020-06-14', 1, 0)) AS retention_count,
COUNT(*) AS new_user_count,
CAST(SUM(IF(login_date_last='2020-06-14', 1, 0)) / COUNT(*) *100 AS DECIMAL(16, 2)) retention_rate
FROM
(
SELECT
user_id,
date_id
FROM
dwd_user_register_inc
WHERE
dt >= date_sub('2020-06-14', 7) AND
dt < '2020-06-14'
) t1
JOIN
(
SELECT
user_id,
login_date_last
FROM
dws_user_login_td
WHERE
dt = '2020-06-14'
) t2
ON
t1.user_id = t2.user_id
GROUP BY
t1.date_id