1. 窗口函数格式
SELECT
col1[, col2, ...]
WINDOW_FUNCTION() OVER(WINDOW) [AS col_w]
FROM table;
WINDOW_FUNCTION包括:
- SUM
- MIN
- MAX
- AVG
- NTILE
- ROW_NUMBER
- RANK
- DENSE_RANK
- PERCENT_RANK
- LAG
- LEAD
- FIRST_VALUE
- LAST_VALUE
- CUME_DIST
WINDOW的定义:
[PARTITION BY col1 [ORDER BY col2 [ASC|DESC]] [ROWS BETWEEN A AND B]]
A和B用于指定计算行的范围,可以是:
CURRENT ROW:当前行n PRECEDING:往前n行n FOLLOWING:往后n行UNBOUNDED:起点UNBOUNDED PRECEDING:从前面开始的起点UNBOUNDED PRECEDING:到后面的终点
2. 测试数据
CREATE TABLE cookie_1 (
cookie_name STRING,
create_date STRING,
pv INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';
CREATE TABLE cookie_2 (
cookie_name STRING,
create_date STRING,
pv INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';
CREATE TABLE cookie_3 (
cookie_name STRING,
create_time STRING,
url STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';
CREATE TABLE dept_w (
dept_name STRING COMMENT '部门名称',
emp_name STRING COMMENT '员工姓名',
salary INT COMMENT '工资'
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';
INSERT INTO cookie_1 VALUES
('cookie1', '2015-04-10', 1),
('cookie1', '2015-04-11', 5),
('cookie1', '2015-04-12', 7),
('cookie1', '2015-04-13', 3),
('cookie1', '2015-04-14', 2),
('cookie1', '2015-04-15', 4),
('cookie1', '2015-04-16', 4);
INSERT INTO cookie_2 VALUES
('cookie1', '2015-04-10', 1),
('cookie1', '2015-04-11', 5),
('cookie1', '2015-04-12', 7),
('cookie1', '2015-04-13', 3),
('cookie1', '2015-04-14', 2),
('cookie1', '2015-04-15', 4),
('cookie1', '2015-04-16', 4),
('cookie2', '2015-04-10', 2),
('cookie2', '2015-04-11', 3),
('cookie2', '2015-04-12', 5),
('cookie2', '2015-04-13', 6),
('cookie2', '2015-04-14', 3),
('cookie2', '2015-04-15', 9),
('cookie2', '2015-04-16', 7);
INSERT INTO cookie_3 VALUES
('COOKIE_1', '2015-04-10 10:00:02', 'URL_02'),
('COOKIE_1', '2015-04-10 10:00:00', 'URL_01'),
('COOKIE_1', '2015-04-10 10:03:04', 'URL_03'),
('COOKIE_1', '2015-04-10 10:50:05', 'URL_06'),
('COOKIE_1', '2015-04-10 11:00:00', 'URL_07'),
('COOKIE_1', '2015-04-10 10:10:00', 'URL_04'),
('COOKIE_1', '2015-04-10 10:50:01', 'URL_05'),
('COOKIE_2', '2015-04-10 10:00:02', 'URL_22'),
('COOKIE_2', '2015-04-10 10:00:00', 'URL_11'),
('COOKIE_2', '2015-04-10 10:03:04', 'URL_33'),
('COOKIE_2', '2015-04-10 10:50:05', 'URL_66'),
('COOKIE_2', '2015-04-10 11:00:00', 'URL_77'),
('COOKIE_2', '2015-04-10 10:10:00', 'URL_44'),
('COOKIE_2', '2015-04-10 10:50:01', 'URL_55');
INSERT INTO dept_w VALUES
('D1', 'USER1', 1000),
('D1', 'USER2', 2000),
('D1', 'USER3', 3000),
('D2', 'USER4', 4000),
('D2', 'USER5', 5000);
SELECT * FROM cookie_1;
+-----------------------+-----------------------+--------------+
| cookie_1.cookie_name | cookie_1.create_date | cookie_1.pv |
+-----------------------+-----------------------+--------------+
| cookie1 | 2015-04-10 | 1 |
| cookie1 | 2015-04-11 | 5 |
| cookie1 | 2015-04-12 | 7 |
| cookie1 | 2015-04-13 | 3 |
| cookie1 | 2015-04-14 | 2 |
| cookie1 | 2015-04-15 | 4 |
| cookie1 | 2015-04-16 | 4 |
+-----------------------+-----------------------+--------------+
SELECT * FROM cookie_2;
+-----------------------+-----------------------+--------------+
| cookie_2.cookie_name | cookie_2.create_date | cookie_2.pv |
+-----------------------+-----------------------+--------------+
| cookie1 | 2015-04-10 | 1 |
| cookie1 | 2015-04-11 | 5 |
| cookie1 | 2015-04-12 | 7 |
| cookie1 | 2015-04-13 | 3 |
| cookie1 | 2015-04-14 | 2 |
| cookie1 | 2015-04-15 | 4 |
| cookie1 | 2015-04-16 | 4 |
| cookie2 | 2015-04-10 | 2 |
| cookie2 | 2015-04-11 | 3 |
| cookie2 | 2015-04-12 | 5 |
| cookie2 | 2015-04-13 | 6 |
| cookie2 | 2015-04-14 | 3 |
| cookie2 | 2015-04-15 | 9 |
| cookie2 | 2015-04-16 | 7 |
+-----------------------+-----------------------+--------------+
SELECT * FROM cookie_3;
+-----------------------+-----------------------+---------------+
| cookie_3.cookie_name | cookie_3.create_time | cookie_3.url |
+-----------------------+-----------------------+---------------+
| COOKIE_1 | 2015-04-10 10:00:02 | URL_02 |
| COOKIE_1 | 2015-04-10 10:00:00 | URL_01 |
| COOKIE_1 | 2015-04-10 10:03:04 | URL_03 |
| COOKIE_1 | 2015-04-10 10:50:05 | URL_06 |
| COOKIE_1 | 2015-04-10 11:00:00 | URL_07 |
| COOKIE_1 | 2015-04-10 10:10:00 | URL_04 |
| COOKIE_1 | 2015-04-10 10:50:01 | URL_05 |
| COOKIE_2 | 2015-04-10 10:00:02 | URL_22 |
| COOKIE_2 | 2015-04-10 10:00:00 | URL_11 |
| COOKIE_2 | 2015-04-10 10:03:04 | URL_33 |
| COOKIE_2 | 2015-04-10 10:50:05 | URL_66 |
| COOKIE_2 | 2015-04-10 11:00:00 | URL_77 |
| COOKIE_2 | 2015-04-10 10:10:00 | URL_44 |
| COOKIE_2 | 2015-04-10 10:50:01 | URL_55 |
+-----------------------+-----------------------+---------------+
SELECT * FROM dept_w;
+-------------------+------------------+----------------+
| dept_w.dept_name | dept_w.emp_name | dept_w.salary |
+-------------------+------------------+----------------+
| D1 | USER1 | 1000 |
| D1 | USER2 | 2000 |
| D1 | USER3 | 3000 |
| D2 | USER4 | 4000 |
| D2 | USER5 | 5000 |
+-------------------+------------------+----------------+
3. 窗口函数
3.1 SUM()
作用:求和
需求1:求cookie1的总pv数
SELECT
cookie_name,
create_date,
pv,
SUM(pv) OVER(PARTITION BY cookie_name) AS pv_sum
FROM cookie_1;
+--------------+--------------+-----+---------+
| cookie_name | create_date | pv | pv_sum |
+--------------+--------------+-----+---------+
| cookie1 | 2015-04-16 | 4 | 26 |
| cookie1 | 2015-04-15 | 4 | 26 |
| cookie1 | 2015-04-14 | 2 | 26 |
| cookie1 | 2015-04-13 | 3 | 26 |
| cookie1 | 2015-04-12 | 7 | 26 |
| cookie1 | 2015-04-11 | 5 | 26 |
| cookie1 | 2015-04-10 | 1 | 26 |
+--------------+--------------+-----+---------+
SELECT
cookie_name,
create_date,
pv,
SUM(pv) OVER(PARTITION BY cookie_name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS pv_sum
FROM cookie_1;
+--------------+--------------+-----+---------+
| cookie_name | create_date | pv | pv_sum |
+--------------+--------------+-----+---------+
| cookie1 | 2015-04-16 | 4 | 26 |
| cookie1 | 2015-04-15 | 4 | 26 |
| cookie1 | 2015-04-14 | 2 | 26 |
| cookie1 | 2015-04-13 | 3 | 26 |
| cookie1 | 2015-04-12 | 7 | 26 |
| cookie1 | 2015-04-11 | 5 | 26 |
| cookie1 | 2015-04-10 | 1 | 26 |
+--------------+--------------+-----+---------+
需求2:按照create_date升序排序,求cookie1截止到当天的pv累加值
SELECT
cookie_name,
create_date,
pv,
SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv_sum
FROM cookie_1;
+--------------+--------------+-----+---------+
| cookie_name | create_date | pv | pv_sum |
+--------------+--------------+-----+---------+
| cookie1 | 2015-04-10 | 1 | 1 | -- 1
| cookie1 | 2015-04-11 | 5 | 6 | -- 1 + 5 = 6
| cookie1 | 2015-04-12 | 7 | 13 | -- 1 + 5 + 7 = 13
| cookie1 | 2015-04-13 | 3 | 16 | -- 1 + 5 + 7 + 3 = 16
| cookie1 | 2015-04-14 | 2 | 18 | -- 1 + 5 + 7 + 3 + 2 = 18
| cookie1 | 2015-04-15 | 4 | 22 | -- 1 + 5 + 7 + 3 + 2 + 4 = 22
| cookie1 | 2015-04-16 | 4 | 26 | -- 1 + 5 + 7 + 3 + 2 + 4 + 4 = 26
+--------------+--------------+-----+---------+
SELECT
cookie_name,
create_date,
pv,
SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date) AS pv_sum
FROM cookie_1;
+--------------+--------------+-----+---------+
| cookie_name | create_date | pv | pv_sum |
+--------------+--------------+-----+---------+
| cookie1 | 2015-04-10 | 1 | 1 |
| cookie1 | 2015-04-11 | 5 | 6 |
| cookie1 | 2015-04-12 | 7 | 13 |
| cookie1 | 2015-04-13 | 3 | 16 |
| cookie1 | 2015-04-14 | 2 | 18 |
| cookie1 | 2015-04-15 | 4 | 22 |
| cookie1 | 2015-04-16 | 4 | 26 |
+--------------+--------------+-----+---------+
总结:
- 排序规则默认为
ASC - 默认的计算范围为
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
需求3:滚动求cookie1最近3天(包括当天)的pv累加值
SELECT
cookie_name,
create_date,
pv,
SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date ASC ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS pv_sum
FROM cookie_1;
+--------------+--------------+-----+---------+
| cookie_name | create_date | pv | pv_sum |
+--------------+--------------+-----+---------+
| cookie1 | 2015-04-10 | 1 | 1 | -- 1
| cookie1 | 2015-04-11 | 5 | 6 | -- 1 + 5 = 6
| cookie1 | 2015-04-12 | 7 | 13 | -- 1 + 5 + 7 = 13
| cookie1 | 2015-04-13 | 3 | 15 | -- 5 + 7 + 3 = 15
| cookie1 | 2015-04-14 | 2 | 12 | -- 7 + 3 + 2 = 12
| cookie1 | 2015-04-15 | 4 | 9 | -- 3 + 2 + 4 = 9
| cookie1 | 2015-04-16 | 4 | 10 | -- 2 + 4 + 4 = 10
+--------------+--------------+-----+---------+
需求4:滚动求cookie1当天到后2天(包括当天)的pv累加值
SELECT
cookie_name,
create_date,
pv,
SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date ASC ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING) AS pv_sum
FROM cookie_1;
+--------------+--------------+-----+---------+
| cookie_name | create_date | pv | pv_sum |
+--------------+--------------+-----+---------+
| cookie1 | 2015-04-10 | 1 | 13 | -- 1 + 5 + 7 = 13
| cookie1 | 2015-04-11 | 5 | 15 | -- 5 + 7 + 3 = 15
| cookie1 | 2015-04-12 | 7 | 12 | -- 7 + 3 + 2 = 12
| cookie1 | 2015-04-13 | 3 | 9 | -- 3 + 2 + 4 = 9
| cookie1 | 2015-04-14 | 2 | 10 | -- 2 + 4 + 4 = 10
| cookie1 | 2015-04-15 | 4 | 8 | -- 4 + 4 = 8
| cookie1 | 2015-04-16 | 4 | 4 | -- 4
+--------------+--------------+-----+---------+
需求5:统计cookie1当天到最后一天的pv累加值
SELECT
cookie_name,
create_date,
pv,
SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date ASC ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv_sum
FROM cookie_1;
+--------------+--------------+-----+---------+
| cookie_name | create_date | pv | pv_sum |
+--------------+--------------+-----+---------+
| cookie1 | 2015-04-10 | 1 | 26 | -- 1 + 5 + 7 + 3 + 2 + 4 + 4 = 26
| cookie1 | 2015-04-11 | 5 | 25 | -- 5 + 7 + 3 + 2 + 4 + 4 = 25
| cookie1 | 2015-04-12 | 7 | 20 | -- 7 + 3 + 2 + 4 + 4 = 20
| cookie1 | 2015-04-13 | 3 | 13 | -- 3 + 2 + 4 + 4 = 13
| cookie1 | 2015-04-14 | 2 | 10 | -- 2 + 4 + 4 = 10
| cookie1 | 2015-04-15 | 4 | 8 | -- 4 + 4 = 8
| cookie1 | 2015-04-16 | 4 | 4 | -- 4
+--------------+--------------+-----+---------+
3.2 AVG()
作用:求平均值
SELECT
cookie_name,
create_date,
pv,
AVG(pv) OVER(PARTITION BY cookie_name ORDER BY create_date) AS avg_pv
FROM cookie_1;
+--------------+--------------+-----+---------------------+
| cookie_name | create_date | pv | avg_sum |
+--------------+--------------+-----+---------------------+
| cookie1 | 2015-04-10 | 1 | 1.0 | -- 1 / 1
| cookie1 | 2015-04-11 | 5 | 3.0 | -- (1 + 5) / 2 = 6 / 2
| cookie1 | 2015-04-12 | 7 | 4.333333333333333 | -- (1 + 5 + 7) / 3 = 13 / 3
| cookie1 | 2015-04-13 | 3 | 4.0 | -- (1 + 5 + 7 + 3) / 4 = 16 / 4
| cookie1 | 2015-04-14 | 2 | 3.6 | -- (1 + 5 + 7 + 3 + 2) / 5 = 18 / 5
| cookie1 | 2015-04-15 | 4 | 3.6666666666666665 | -- (1 + 5 + 7 + 3 + 2 + 4) / 6 = 22 / 6
| cookie1 | 2015-04-16 | 4 | 3.7142857142857144 | -- (1 + 5 + 7 + 3 + 2 + 4 + 4) / 7 = 26 / 7
+--------------+--------------+-----+---------------------+
3.3 MIN()
作用:求最小值
SELECT
cookie_name,
create_date,
pv,
MIN(pv) OVER(PARTITION BY cookie_name ORDER BY create_date) AS min_pv
FROM cookie_1;
+--------------+--------------+-----+---------+
| cookie_name | create_date | pv | min_pv |
+--------------+--------------+-----+---------+
| cookie1 | 2015-04-10 | 1 | 1 | -- min(1) = 1
| cookie1 | 2015-04-11 | 5 | 1 | -- min(1, 5) = 1
| cookie1 | 2015-04-12 | 7 | 1 | -- min(1, 5, 7) = 1
| cookie1 | 2015-04-13 | 3 | 1 | -- min(1, 5, 7, 3) = 1
| cookie1 | 2015-04-14 | 2 | 1 | -- min(1, 5, 7, 3, 2) = 1
| cookie1 | 2015-04-15 | 4 | 1 | -- min(1, 5, 7, 3, 2, 4) = 1
| cookie1 | 2015-04-16 | 4 | 1 | -- min(1, 5, 7, 3, 2, 4, 4) = 1
+--------------+--------------+-----+---------+
3.4 MAX()
作用:求最大值
SELECT
cookie_name,
create_date,
pv,
MAX(pv) OVER(PARTITION BY cookie_name ORDER BY create_date) AS max_pv
FROM cookie_1;
+--------------+--------------+-----+---------+
| cookie_name | create_date | pv | max_pv |
+--------------+--------------+-----+---------+
| cookie1 | 2015-04-10 | 1 | 1 | -- max(1) = 1
| cookie1 | 2015-04-11 | 5 | 5 | -- max(1, 5) = 5
| cookie1 | 2015-04-12 | 7 | 7 | -- max(1, 5, 7) = 7
| cookie1 | 2015-04-13 | 3 | 7 | -- max(1, 5, 7, 3) = 7
| cookie1 | 2015-04-14 | 2 | 7 | -- max(1, 5, 7, 3, 2) = 7
| cookie1 | 2015-04-15 | 4 | 7 | -- max(1, 5, 7, 3, 2, 4) = 7
| cookie1 | 2015-04-16 | 4 | 7 | -- max(1, 5, 7, 3, 2, 4, 4) = 7
+--------------+--------------+-----+---------+
3.5 ROW_NUMBER()
组内排名,生成行号,排名不重复
需求:求每个cookie排名前三的pv
-- 首先分组后进行排名,生成 row_number
SELECT
cookie_name,
create_date,
pv,
ROW_NUMBER() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS row_number
FROM cookie_2;
+--------------+--------------+-----+-------------+
| cookie_name | create_date | pv | row_number |
+--------------+--------------+-----+-------------+
| cookie1 | 2015-04-12 | 7 | 1 |
| cookie1 | 2015-04-11 | 5 | 2 |
| cookie1 | 2015-04-16 | 4 | 3 |
| cookie1 | 2015-04-15 | 4 | 4 |
| cookie1 | 2015-04-13 | 3 | 5 |
| cookie1 | 2015-04-14 | 2 | 6 |
| cookie1 | 2015-04-10 | 1 | 7 |
| cookie2 | 2015-04-15 | 9 | 1 |
| cookie2 | 2015-04-16 | 7 | 2 |
| cookie2 | 2015-04-13 | 6 | 3 |
| cookie2 | 2015-04-12 | 5 | 4 |
| cookie2 | 2015-04-11 | 3 | 5 |
| cookie2 | 2015-04-14 | 3 | 6 |
| cookie2 | 2015-04-10 | 2 | 7 |
+--------------+--------------+-----+-------------+
-- 然后过滤出排名前三的pv
SELECT * FROM (
SELECT
cookie_name,
create_date,
pv,
ROW_NUMBER() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS row_number
FROM cookie_2
) AS t
WHERE t.row_number <= 3;
+----------------+----------------+-------+---------------+
| t.cookie_name | t.create_date | t.pv | t.row_number |
+----------------+----------------+-------+---------------+
| cookie1 | 2015-04-12 | 7 | 1 |
| cookie1 | 2015-04-11 | 5 | 2 |
| cookie1 | 2015-04-16 | 4 | 3 |
| cookie2 | 2015-04-15 | 9 | 1 |
| cookie2 | 2015-04-16 | 7 | 2 |
| cookie2 | 2015-04-13 | 6 | 3 |
+----------------+----------------+-------+---------------+
注意:
- 对于cookie1,2015-04-16 的pv为4,排名为3,2015-04-15的pv也为4,排名为4
- 对于cookie2,2015-04-11 的pv为3,排名为5,2015-04-14的pv也为3,排名为6
- 可见,row_number的特点是,排名依次递增,不会有重复,相同的值排名会不同
3.6 RANK()
组内排名,生成行号,如果比较的值一样,则排名相同,名次不连续
SELECT
cookie_name,
create_date,
pv,
RANK() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS rank
FROM cookie_2;
+--------------+--------------+-----+-------+
| cookie_name | create_date | pv | rank |
+--------------+--------------+-----+-------+
| cookie1 | 2015-04-12 | 7 | 1 |
| cookie1 | 2015-04-11 | 5 | 2 |
| cookie1 | 2015-04-16 | 4 | 3 |
| cookie1 | 2015-04-15 | 4 | 3 |
| cookie1 | 2015-04-13 | 3 | 5 |
| cookie1 | 2015-04-14 | 2 | 6 |
| cookie1 | 2015-04-10 | 1 | 7 |
| cookie2 | 2015-04-15 | 9 | 1 |
| cookie2 | 2015-04-16 | 7 | 2 |
| cookie2 | 2015-04-13 | 6 | 3 |
| cookie2 | 2015-04-12 | 5 | 4 |
| cookie2 | 2015-04-11 | 3 | 5 |
| cookie2 | 2015-04-14 | 3 | 5 |
| cookie2 | 2015-04-10 | 2 | 7 |
+--------------+--------------+-----+-------+
注意:
- 对于cookie1,2015-04-16 的pv为4,排名为3,2015-04-15的pv也为4,排名也为3,下一位的排名为5
- 对于cookie2,2015-04-11 的pv为3,排名为5,2015-04-14的pv也为3,排名也为5,下一位的排名为7
- 可见,rank的特点是,相同的比较值排名就一样,名次不连续
3.7 DENSE_RANK()
组内排名,生成行号,如果比较的值一样,则排名相同,名次连续
SELECT
cookie_name,
create_date,
pv,
DENSE_RANK() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS rank
FROM cookie_2;
+--------------+--------------+-----+-------+
| cookie_name | create_date | pv | rank |
+--------------+--------------+-----+-------+
| cookie1 | 2015-04-12 | 7 | 1 |
| cookie1 | 2015-04-11 | 5 | 2 |
| cookie1 | 2015-04-16 | 4 | 3 |
| cookie1 | 2015-04-15 | 4 | 3 |
| cookie1 | 2015-04-13 | 3 | 4 |
| cookie1 | 2015-04-14 | 2 | 5 |
| cookie1 | 2015-04-10 | 1 | 6 |
| cookie2 | 2015-04-15 | 9 | 1 |
| cookie2 | 2015-04-16 | 7 | 2 |
| cookie2 | 2015-04-13 | 6 | 3 |
| cookie2 | 2015-04-12 | 5 | 4 |
| cookie2 | 2015-04-11 | 3 | 5 |
| cookie2 | 2015-04-14 | 3 | 5 |
| cookie2 | 2015-04-10 | 2 | 6 |
+--------------+--------------+-----+-------+
注意:
- 对于cookie1,2015-04-16 的pv为4,排名为3,2015-04-15的pv也为4,排名也为3,下一位的排名为4
- 对于cookie2,2015-04-11 的pv为3,排名为5,2015-04-14的pv也为3,排名也为5,下一位的排名为6
- 可见,rank的特点是,相同的比较值排名就一样,名次连续
ROW_NUMBER()、RANK()、DENSE_RANK()的区别总结:
| 学生 | 分数 | row_number | rank | dense_rank |
|---|---|---|---|---|
| 张三 | 100 | 1 | 1 | 1 |
| 李四 | 100 | 2 | 1 | 1 |
| 王五 | 100 | 3 | 1 | 1 |
| 赵六 | 98 | 4 | 4 | 2 |
3.8 PERCENT_RANK()
分组排序后,(当前行的RANK值 - 1) / (组内总行数 -1)
SELECT
cookie_name,
create_date,
pv,
RANK() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS rank,
PERCENT_RANK() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS percent_rank
FROM cookie_2;
+--------------+--------------+-----+-------+----------------------+
| cookie_name | create_date | pv | rank | percent_rank |
+--------------+--------------+-----+-------+----------------------+
| cookie1 | 2015-04-12 | 7 | 1 | 0.0 | -- (1-1)/(7-1)
| cookie1 | 2015-04-11 | 5 | 2 | 0.16666666666666666 | -- (2-1)/(7-1)
| cookie1 | 2015-04-16 | 4 | 3 | 0.3333333333333333 | -- (3-1)/(7-1)
| cookie1 | 2015-04-15 | 4 | 3 | 0.3333333333333333 | -- (3-1)/(7-1)
| cookie1 | 2015-04-13 | 3 | 5 | 0.6666666666666666 | -- (5-1)/(7-1)
| cookie1 | 2015-04-14 | 2 | 6 | 0.8333333333333334 | -- (6-1)/(7-1)
| cookie1 | 2015-04-10 | 1 | 7 | 1.0 | -- (7-1)/(7-1)
| cookie2 | 2015-04-15 | 9 | 1 | 0.0 | -- (1-1)/(7-1)
| cookie2 | 2015-04-16 | 7 | 2 | 0.16666666666666666 | -- (2-1)/(7-1)
| cookie2 | 2015-04-13 | 6 | 3 | 0.3333333333333333 | -- (3-1)/(7-1)
| cookie2 | 2015-04-12 | 5 | 4 | 0.5 | -- (4-1)/(7-1)
| cookie2 | 2015-04-11 | 3 | 5 | 0.6666666666666666 | -- (5-1)/(7-1)
| cookie2 | 2015-04-14 | 3 | 5 | 0.6666666666666666 | -- (5-1)/(7-1)
| cookie2 | 2015-04-10 | 2 | 7 | 1.0 | -- (7-1)/(7-1)
+--------------+--------------+-----+-------+----------------------+
注意:
- PERCENT_RANK()函数不能设置
ROWS BETWEEN ... AND ...子句,除非是ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
3.9 NTILE()
将分组数据按照顺序分成n片,返回切片值,如果切片不均匀,给第一个切片增加数据
SELECT
cookie_name,
create_date,
pv,
NTILE(2) OVER(PARTITION BY cookie_name ORDER BY create_date) AS rank
FROM cookie_2;
+--------------+--------------+-----+-------+
| cookie_name | create_date | pv | rank |
+--------------+--------------+-----+-------+
| cookie1 | 2015-04-10 | 1 | 1 |
| cookie1 | 2015-04-11 | 5 | 1 |
| cookie1 | 2015-04-12 | 7 | 1 |
| cookie1 | 2015-04-13 | 3 | 1 |
| cookie1 | 2015-04-14 | 2 | 2 |
| cookie1 | 2015-04-15 | 4 | 2 |
| cookie1 | 2015-04-16 | 4 | 2 |
| cookie2 | 2015-04-10 | 2 | 1 |
| cookie2 | 2015-04-11 | 3 | 1 |
| cookie2 | 2015-04-12 | 5 | 1 |
| cookie2 | 2015-04-13 | 6 | 1 |
| cookie2 | 2015-04-14 | 3 | 2 |
| cookie2 | 2015-04-15 | 9 | 2 |
| cookie2 | 2015-04-16 | 7 | 2 |
+--------------+--------------+-----+-------+
注意:
- 每个分组有7条数据,如果无法均匀分配,多余的数据给切片1
- NTILE()函数不能设置
ROWS BETWEEN ... AND ...子句,除非是ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
3.10 LAG()
函数作用:分组后,向上取第n行取指定一列的数据到当前行
函数定义:LAG(col, n, DEFAULT)
- col:指定一列
- 设置往上取第n行的数据,默认为1
- 默认值,如果往上取第n行没有数据,则给指定的默认值,默认给NULL
测试:
SELECT
cookie_name,
create_time,
url,
LAG(create_time, 2, '1970-01-01 08:00:00') OVER(PARTITION BY cookie_name ORDER BY create_time) AS last_2_create_time
FROM cookie_3;
+--------------+----------------------+---------+----------------------+
| cookie_name | create_time | url | last_2_create_time |
+--------------+----------------------+---------+----------------------+
| COOKIE_1 | 2015-04-10 10:00:00 | URL_01 | 1970-01-01 08:00:00 |
| COOKIE_1 | 2015-04-10 10:00:02 | URL_02 | 1970-01-01 08:00:00 |
| COOKIE_1 | 2015-04-10 10:03:04 | URL_03 | 2015-04-10 10:00:00 |
| COOKIE_1 | 2015-04-10 10:10:00 | URL_04 | 2015-04-10 10:00:02 |
| COOKIE_1 | 2015-04-10 10:50:01 | URL_05 | 2015-04-10 10:03:04 |
| COOKIE_1 | 2015-04-10 10:50:05 | URL_06 | 2015-04-10 10:10:00 |
| COOKIE_1 | 2015-04-10 11:00:00 | URL_07 | 2015-04-10 10:50:01 |
| COOKIE_2 | 2015-04-10 10:00:00 | URL_11 | 1970-01-01 08:00:00 |
| COOKIE_2 | 2015-04-10 10:00:02 | URL_22 | 1970-01-01 08:00:00 |
| COOKIE_2 | 2015-04-10 10:03:04 | URL_33 | 2015-04-10 10:00:00 |
| COOKIE_2 | 2015-04-10 10:10:00 | URL_44 | 2015-04-10 10:00:02 |
| COOKIE_2 | 2015-04-10 10:50:01 | URL_55 | 2015-04-10 10:03:04 |
| COOKIE_2 | 2015-04-10 10:50:05 | URL_66 | 2015-04-10 10:10:00 |
| COOKIE_2 | 2015-04-10 11:00:00 | URL_77 | 2015-04-10 10:50:01 |
+--------------+----------------------+---------+----------------------+
SELECT
cookie_name,
create_time,
url,
LAG(create_time) OVER(PARTITION BY cookie_name ORDER BY create_time) AS last_create_time
FROM cookie_3;
+--------------+----------------------+---------+----------------------+
| cookie_name | create_time | url | last_create_time |
+--------------+----------------------+---------+----------------------+
| COOKIE_1 | 2015-04-10 10:00:00 | URL_01 | NULL |
| COOKIE_1 | 2015-04-10 10:00:02 | URL_02 | 2015-04-10 10:00:00 |
| COOKIE_1 | 2015-04-10 10:03:04 | URL_03 | 2015-04-10 10:00:02 |
| COOKIE_1 | 2015-04-10 10:10:00 | URL_04 | 2015-04-10 10:03:04 |
| COOKIE_1 | 2015-04-10 10:50:01 | URL_05 | 2015-04-10 10:10:00 |
| COOKIE_1 | 2015-04-10 10:50:05 | URL_06 | 2015-04-10 10:50:01 |
| COOKIE_1 | 2015-04-10 11:00:00 | URL_07 | 2015-04-10 10:50:05 |
| COOKIE_2 | 2015-04-10 10:00:00 | URL_11 | NULL |
| COOKIE_2 | 2015-04-10 10:00:02 | URL_22 | 2015-04-10 10:00:00 |
| COOKIE_2 | 2015-04-10 10:03:04 | URL_33 | 2015-04-10 10:00:02 |
| COOKIE_2 | 2015-04-10 10:10:00 | URL_44 | 2015-04-10 10:03:04 |
| COOKIE_2 | 2015-04-10 10:50:01 | URL_55 | 2015-04-10 10:10:00 |
| COOKIE_2 | 2015-04-10 10:50:05 | URL_66 | 2015-04-10 10:50:01 |
| COOKIE_2 | 2015-04-10 11:00:00 | URL_77 | 2015-04-10 10:50:05 |
+--------------+----------------------+---------+----------------------+
3.11 LEAD()
函数作用:分组后,向下取第n行取指定一列的数据到当前行
函数定义:LAG(col, n, DEFAULT)
- col:指定一列
- 设置往下取第n行的数据,默认为1
- 默认值,如果往下取第n行没有数据,则给指定的默认值,默认给NULL
SELECT
cookie_name,
create_time,
url,
LEAD(create_time) OVER(PARTITION BY cookie_name ORDER BY create_time) AS next_create_time
FROM cookie_3;
+--------------+----------------------+---------+----------------------+
| cookie_name | create_time | url | next_create_time |
+--------------+----------------------+---------+----------------------+
| COOKIE_1 | 2015-04-10 10:00:00 | URL_01 | 2015-04-10 10:00:02 |
| COOKIE_1 | 2015-04-10 10:00:02 | URL_02 | 2015-04-10 10:03:04 |
| COOKIE_1 | 2015-04-10 10:03:04 | URL_03 | 2015-04-10 10:10:00 |
| COOKIE_1 | 2015-04-10 10:10:00 | URL_04 | 2015-04-10 10:50:01 |
| COOKIE_1 | 2015-04-10 10:50:01 | URL_05 | 2015-04-10 10:50:05 |
| COOKIE_1 | 2015-04-10 10:50:05 | URL_06 | 2015-04-10 11:00:00 |
| COOKIE_1 | 2015-04-10 11:00:00 | URL_07 | NULL |
| COOKIE_2 | 2015-04-10 10:00:00 | URL_11 | 2015-04-10 10:00:02 |
| COOKIE_2 | 2015-04-10 10:00:02 | URL_22 | 2015-04-10 10:03:04 |
| COOKIE_2 | 2015-04-10 10:03:04 | URL_33 | 2015-04-10 10:10:00 |
| COOKIE_2 | 2015-04-10 10:10:00 | URL_44 | 2015-04-10 10:50:01 |
| COOKIE_2 | 2015-04-10 10:50:01 | URL_55 | 2015-04-10 10:50:05 |
| COOKIE_2 | 2015-04-10 10:50:05 | URL_66 | 2015-04-10 11:00:00 |
| COOKIE_2 | 2015-04-10 11:00:00 | URL_77 | NULL |
+--------------+----------------------+---------+----------------------+
3.12 FIRST_VALUE()
分组排序后取截止到当前行的第一行指定列的数据
SELECT
cookie_name,
create_time,
url,
FIRST_VALUE(url) OVER(PARTITION BY cookie_name ORDER BY create_time) AS first_url
FROM cookie_3;
+--------------+----------------------+---------+------------+
| cookie_name | create_time | url | first_url |
+--------------+----------------------+---------+------------+
| COOKIE_1 | 2015-04-10 10:00:00 | URL_01 | URL_01 |
| COOKIE_1 | 2015-04-10 10:00:02 | URL_02 | URL_01 |
| COOKIE_1 | 2015-04-10 10:03:04 | URL_03 | URL_01 |
| COOKIE_1 | 2015-04-10 10:10:00 | URL_04 | URL_01 |
| COOKIE_1 | 2015-04-10 10:50:01 | URL_05 | URL_01 |
| COOKIE_1 | 2015-04-10 10:50:05 | URL_06 | URL_01 |
| COOKIE_1 | 2015-04-10 11:00:00 | URL_07 | URL_01 |
| COOKIE_2 | 2015-04-10 10:00:00 | URL_11 | URL_11 |
| COOKIE_2 | 2015-04-10 10:00:02 | URL_22 | URL_11 |
| COOKIE_2 | 2015-04-10 10:03:04 | URL_33 | URL_11 |
| COOKIE_2 | 2015-04-10 10:10:00 | URL_44 | URL_11 |
| COOKIE_2 | 2015-04-10 10:50:01 | URL_55 | URL_11 |
| COOKIE_2 | 2015-04-10 10:50:05 | URL_66 | URL_11 |
| COOKIE_2 | 2015-04-10 11:00:00 | URL_77 | URL_11 |
+--------------+----------------------+---------+------------+
3.13 LAST_VALUE()
分组排序后取截止到当前行的最后一行指定列的数据
SELECT
cookie_name,
create_time,
url,
LAST_VALUE(url) OVER(PARTITION BY cookie_name ORDER BY create_time) AS last_url
FROM cookie_3;
+--------------+----------------------+---------+-----------+
| cookie_name | create_time | url | last_url |
+--------------+----------------------+---------+-----------+
| COOKIE_1 | 2015-04-10 10:00:00 | URL_01 | URL_01 |
| COOKIE_1 | 2015-04-10 10:00:02 | URL_02 | URL_02 |
| COOKIE_1 | 2015-04-10 10:03:04 | URL_03 | URL_03 |
| COOKIE_1 | 2015-04-10 10:10:00 | URL_04 | URL_04 |
| COOKIE_1 | 2015-04-10 10:50:01 | URL_05 | URL_05 |
| COOKIE_1 | 2015-04-10 10:50:05 | URL_06 | URL_06 |
| COOKIE_1 | 2015-04-10 11:00:00 | URL_07 | URL_07 |
| COOKIE_2 | 2015-04-10 10:00:00 | URL_11 | URL_11 |
| COOKIE_2 | 2015-04-10 10:00:02 | URL_22 | URL_22 |
| COOKIE_2 | 2015-04-10 10:03:04 | URL_33 | URL_33 |
| COOKIE_2 | 2015-04-10 10:10:00 | URL_44 | URL_44 |
| COOKIE_2 | 2015-04-10 10:50:01 | URL_55 | URL_55 |
| COOKIE_2 | 2015-04-10 10:50:05 | URL_66 | URL_66 |
| COOKIE_2 | 2015-04-10 11:00:00 | URL_77 | URL_77 |
+--------------+----------------------+---------+-----------+
3.14 CUME_DIST()
分组排序后,求小于等于或大于等于当前行排序列的值的比例
SELECT
dept_name,
emp_name,
salary,
CUME_DIST() OVER(PARTITION BY dept_name ORDER BY salary) AS cume
FROM dept_w;
+------------+-----------+---------+---------------------+
| dept_name | emp_name | salary | cume |
+------------+-----------+---------+---------------------+
| D1 | USER1 | 1000 | 0.3333333333333333 | -- 在D1部门内,薪水小于等于1000的比例为1/3
| D1 | USER2 | 2000 | 0.6666666666666666 | -- 在D1部门内,薪水小于等于2000的比例为2/3
| D1 | USER3 | 3000 | 1.0 | -- 在D1部门内,薪水小于等于3000的比例为3/3
| D2 | USER4 | 4000 | 0.5 | -- 在D2部门内,薪水小于等于4000的比例为1/2
| D2 | USER5 | 5000 | 1.0 | -- 在D2部门内,薪水小于等于5000的比例为2/2
+------------+-----------+---------+---------------------+
SELECT
dept_name,
emp_name,
salary,
CUME_DIST() OVER(PARTITION BY dept_name ORDER BY salary DESC) AS cume
FROM dept_w;
+------------+-----------+---------+---------------------+
| dept_name | emp_name | salary | cume |
+------------+-----------+---------+---------------------+
| D1 | USER3 | 3000 | 0.3333333333333333 |
| D1 | USER2 | 2000 | 0.6666666666666666 |
| D1 | USER1 | 1000 | 1.0 |
| D2 | USER5 | 5000 | 0.5 |
| D2 | USER4 | 4000 | 1.0 |
+------------+-----------+---------+---------------------+
-- 不分组
SELECT
dept_name,
emp_name,
salary,
CUME_DIST() OVER(ORDER BY salary) AS cume
FROM dept_w;
+------------+-----------+---------+-------+
| dept_name | emp_name | salary | cume |
+------------+-----------+---------+-------+
| D1 | USER1 | 1000 | 0.2 | -- 在全部数据中,薪水小于等于1000的比例为1/5
| D1 | USER2 | 2000 | 0.4 | -- 在全部数据中,薪水小于等于2000的比例为2/5
| D1 | USER3 | 3000 | 0.6 | -- 在全部数据中,薪水小于等于3000的比例为3/5
| D2 | USER4 | 4000 | 0.8 | -- 在全部数据中,薪水小于等于4000的比例为4/5
| D2 | USER5 | 5000 | 1.0 | -- 在全部数据中,薪水小于等于5000的比例为5/5
+------------+-----------+---------+-------+
SELECT
dept_name,
emp_name,
salary,
CUME_DIST() OVER(ORDER BY salary DESC) AS cume
FROM dept_w;
+------------+-----------+---------+-------+
| dept_name | emp_name | salary | cume |
+------------+-----------+---------+-------+
| D2 | USER5 | 5000 | 0.2 |
| D2 | USER4 | 4000 | 0.4 |
| D1 | USER3 | 3000 | 0.6 |
| D1 | USER2 | 2000 | 0.8 |
| D1 | USER1 | 1000 | 1.0 |
+------------+-----------+---------+-------+
注意:
- 使用哪一列排序,就是针对那一列进行比例统计
- 升序,则统计组内小于等于该列值的比例
- 将许,则统计组内大于等于该列值的比例
- 可以不分组统计
- 不能设置
ROWS BETWEEN ... AND ...子句,除非是ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING