sql

102 阅读2分钟

1. 连续三天登录

    SELECT
        id
    FROM
        (SELECT
            t2.id,
            t2.login_date,
            DATE_SUB(t2.login_date, INTERVAL t2.rk DAY) AS continus_date
        FROM
            (SELECT
                t1.id,
                t1.login_date,
                // row_number() 在同一个id里面,根据login_date由大到小排序;一个xin的id是一个新的排序;(每个id的rk都是从1开始)
                ROW_NUMBER() OVER(PARTITION BY t1.id ORDER BY t1.login_date) AS rk
            FROM
                (
                    SELECT
                        id,
                        DATE_FORMAT(login_date, '%Y-%m-%d') AS login_date
                    FROM
                        login
                    GROUP BY
                        id, DATE_FORMAT(login_date, '%Y-%m-%d')
                ) AS t1) AS t2) AS t3
    GROUP BY
        t3.id, t3.continus_date
    HAVING COUNT(id) >= 3;

2. 一张用户日活信息表,用户某一天的用户info为空,需要用该用户的最近一天的info去补,请问该如何实现.

 SELECT
        dt,
        uid,
        IF(info IS NULL, IF(ABS(DATEDIFF(lag_dt, dt)) <= ABS(DATEDIFF(lead_dt, dt)), lag_info, lead_info), info) AS info
    FROM
        (SELECT
            uid,
            info,
            DATE_FORMAT(dt, '%Y-%m-%d') AS dt,
            // uid分组,dt排序后,当前这条数据的lag_dt等于上一条数据的dt
            LAG(dt, 1, '1970-01-01') OVER(PARTITION BY uid ORDER BY dt ASC)AS lag_dt,
            // uid分组,dt排序后,当前这条数据的lead_dt等于下一条数据的dt
            LEAD(dt, 1, '9999-12-31') OVER(PARTITION BY uid ORDER BY dt ASC) AS lead_dt,
            LAG(info, 1, NULL) OVER(PARTITION BY uid ORDER BY dt ASC)AS lag_info,
            LEAD(info, 1, NULL) OVER(PARTITION BY uid ORDER BY dt ASC)AS lead_info
        FROM
            user_info) AS t1
    ORDER BY
        uid;

3. 会话生成(2次时间超过60s算一个新的会话)

SELECT
        user_id,
        page_id,
        view_timestamp,
        // 根据user_id进行分组,view_timestamp排序)
        CONCAT(user_id, '-' ,SUM(id) OVER(PARTITION BY user_id ORDER BY view_timestamp)) AS session_id
    FROM
        (SELECT
            user_id,
            page_id,
            view_timestamp,
            IF(view_timestamp - lag_timestamp > 60, 1, 0) AS id
        FROM
            (SELECT
                user_id,
                page_id,
                view_timestamp,
                LAG(view_timestamp, 1, 0) OVER(PARTITION BY user_id ORDER BY view_timestamp) AS lag_timestamp
            FROM
                page_view_events) AS t1) AS t2

4. 间断连续登录用户(1 3 5 6登录,算连续登录6天)

方法一
    SELECT
        user_id,
        new_login_date
    FROM
        (
            SELECT
                id,
                login_date,
                IF (DATEDIFF(next_login_date, login_date) = 2, array(login_date, DATE_ADD(login_date, 1)), array(login_date)) AS array_date
            FROM
                (
                    SELECT
                        id, 
                        login_date,
                        LEAD(login_date, 1, '9999-12-31') OVER(PARTITION BY id ORDER BY login_date) AS next_login_date
                    FROM
                        (
                            SELECT
                                id,
                                DATE_FORMAT(login_date, '%Y-%m-%d') AS login_date
                            FROM
                                login
                            GROUP BY
                                id, DATE_FORMAT(login_date, '%Y-%m-%d')
                        ) AS t1
                ) AS t2
        ) AS t3
    LATERAL VIEW EXPLODE(array_date) tmp AS new_login_date
    再回到求连续用户的SQL上.
    方法二
    SELECT
        DISTINCT id
    FROM
        (
            SELECT
                id, 
                login_date,
                SUM(diff) OVER(PARTITION BY id ORDER BY login_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS continus_lab
            FROM
                (
                    SELECT
                        id, 
                        login_date,
                        IF (DATEDIFF(login_date, lag_date) > 1, 1, 0) AS diff
                    FROM
                        (
                            SELECT
                                id, 
                                login_date,
                                LAG(login_date, 1, '1970-01-01') OVER(PARTITION BY id ORDER BY login_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS lag_date
                            FROM
                                (
                                    SELECT
                                        id,
                                        DATE_FORMAT(login_date, '%Y-%m-%d') AS login_date
                                    FROM
                                        login
                                    GROUP BY
                                        id, DATE_FORMAT(login_date, '%Y-%m-%d')
                                ) AS t1
                        ) AS t2
                ) AS t3
        ) AS t4
    GROUP BY
        id, continus_lab
    HAVING COUNT(*) >= 3

5. 优惠日期统计(日期去重后的总天数)

方法一
    SELECT
        name,
        SUM(DATEDIFF(end_time, start_time) + 1) AS cnt
    FROM
        (
            SELECT
                name,
                IF(max_end_time IS NULL, start_time, IF(start_time > max_end_time, start_time, DATE_ADD(max_end_time, INTERVAL 1 DAY)))  AS start_time,
                end_time
            FROM
                (
                    SELECT
                        name,
                        start_time,
                        end_time,
                        MAX(end_time) OVER(PARTITION BY name ORDER BY start_time ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS max_end_time
                    FROM
                        brand
                ) AS t1
        ) AS t2
    WHERE
        end_time > start_time
    GROUP BY
        names
    方法二
    SELECT
        name,
        COUNT(DISTINCT event_time) AS cnt
    FROM
        (
            SELECT
                name,
                DATE_ADD(start_time, pos) AS event_time
            FROM
                (
                    SELECT
                        name,
                        start_time,
                        end_time,
                        diff,
                        SPLIT(REPEAT(',', diff), ',') AS arr
                    FROM
                        (
                            SELECT
                                name,
                                start_time,
                                end_time,
                                datediff(end_time, start_time) AS diff
                            FROM
                                brand
                        ) AS t1
                ) AS t2 
                LATERAL VIEW POSEXPLODE(arr) tmp AS pos, item
        ) AS t3
    GROUP BY
        name

6. 统计同时在线的人数

SELECT
        MAX(num) AS max_num
    FROM
        (
            SELECT
                uid,
                event_time,
                SUM(sa) OVER(ORDER BY event_time ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS num
            FROM
                (
                    SELECT
                        uid,
                        login_time AS event_time,
                        1 AS sa
                    FROM
                        user_status
                    UNION ALL
                    SELECT
                        uid,
                        logout_time AS event_time,
                        -1 AS sa
                    FROM
                        user_status
                ) AS t1
        ) AS t2

7. 去除部门最高工资和最低工资后的平均工资

SELECT
        dept_no,
        AVG(salary) AS avg_salary
    FROM
        (
            SELECT
                uid,
                salary,
                dept_no,
                ROW_NUMBER() OVER(PARTITION BY dept_no ORDER BY salary) AS rk1,
                ROW_NUMBER() OVER(PARTITION BY dept_no ORDER BY salary DESC) AS rk2
            FROM
                dept_salary
        ) t1
    WHERE
        rk1 > 1 AND rk2 > 1
    GROUP BY
        dept_no

8. 求连续点击三次的用户数,而且中间不能有别人的点击

SELECT
        DISTINCT uid
    FROM
        (
            SELECT
                *,
                rk2 - rk1 AS rk   
            FROM
                (
                    SELECT
                        uid,
                        click_time,
                        ROW_NUMBER() OVER(PARTITION BY uid ORDER BY click_time) AS rk1,
                        ROW_NUMBER() OVER(ORDER BY click_time) AS rk2
                    FROM
                        click
                ) t1
        ) t2
    GROUP BY
        uid, rk  
    HAVING COUNT(*) >= 3

9. 留存的计算

 SELECT
        '2020-06-14',
        t1.date_id AS create_date,
        DATEDIFF('2020-06-14', t1.date_id) AS retention_day,
        SUM(IF(t1.date_id = '2020-06-14', 1, 0)) AS retention_count,
        COUNT(*) AS new_user_count,
        CAST(SUM(IF(login_date_last='2020-06-14', 1, 0)) / COUNT(*) *100 AS DECIMAL(16, 2)) retention_rate
    FROM
        (
            SELECT
                user_id,
                date_id
            FROM
                dwd_user_register_inc
            WHERE
                dt >= date_sub('2020-06-14', 7) AND
                dt < '2020-06-14'
        ) t1
    JOIN
        (
            SELECT
                user_id,
                login_date_last
            FROM
                dws_user_login_td
            WHERE
                dt = '2020-06-14'
        ) t2
    ON 
        t1.user_id = t2.user_id
    GROUP BY
        t1.date_id