Hive内置函数大全-11-窗口函数

187 阅读22分钟

1. 窗口函数格式

SELECT 
  col1[, col2, ...]
  WINDOW_FUNCTION() OVER(WINDOW) [AS col_w]
FROM table;

WINDOW_FUNCTION包括:

  • SUM
  • MIN
  • MAX
  • AVG
  • NTILE
  • ROW_NUMBER
  • RANK
  • DENSE_RANK
  • PERCENT_RANK
  • LAG
  • LEAD
  • FIRST_VALUE
  • LAST_VALUE
  • CUME_DIST

WINDOW的定义:

[PARTITION BY col1 [ORDER BY col2 [ASC|DESC]] [ROWS BETWEEN A AND B]]

A和B用于指定计算行的范围,可以是:

  • CURRENT ROW:当前行
  • n PRECEDING:往前n行
  • n FOLLOWING:往后n行
  • UNBOUNDED:起点
    • UNBOUNDED PRECEDING:从前面开始的起点
    • UNBOUNDED PRECEDING:到后面的终点

2. 测试数据

CREATE TABLE cookie_1 (
  cookie_name STRING,
  create_date STRING,
  pv INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';

CREATE TABLE cookie_2 (
  cookie_name STRING,
  create_date STRING,
  pv INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';

CREATE TABLE cookie_3 (
  cookie_name STRING,
  create_time STRING,
  url STRING
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';

CREATE TABLE dept_w (
  dept_name STRING COMMENT '部门名称',
  emp_name STRING COMMENT '员工姓名',
  salary INT COMMENT '工资'
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';

INSERT INTO cookie_1 VALUES 
('cookie1', '2015-04-10', 1), 
('cookie1', '2015-04-11', 5),
('cookie1', '2015-04-12', 7),
('cookie1', '2015-04-13', 3),
('cookie1', '2015-04-14', 2),
('cookie1', '2015-04-15', 4),
('cookie1', '2015-04-16', 4);

INSERT INTO cookie_2 VALUES 
('cookie1', '2015-04-10', 1),
('cookie1', '2015-04-11', 5),
('cookie1', '2015-04-12', 7),
('cookie1', '2015-04-13', 3),
('cookie1', '2015-04-14', 2),
('cookie1', '2015-04-15', 4),
('cookie1', '2015-04-16', 4),
('cookie2', '2015-04-10', 2),
('cookie2', '2015-04-11', 3),
('cookie2', '2015-04-12', 5),
('cookie2', '2015-04-13', 6),
('cookie2', '2015-04-14', 3),
('cookie2', '2015-04-15', 9),
('cookie2', '2015-04-16', 7);

INSERT INTO cookie_3 VALUES 
('COOKIE_1', '2015-04-10 10:00:02', 'URL_02'),
('COOKIE_1', '2015-04-10 10:00:00', 'URL_01'),
('COOKIE_1', '2015-04-10 10:03:04', 'URL_03'),
('COOKIE_1', '2015-04-10 10:50:05', 'URL_06'),
('COOKIE_1', '2015-04-10 11:00:00', 'URL_07'),
('COOKIE_1', '2015-04-10 10:10:00', 'URL_04'),
('COOKIE_1', '2015-04-10 10:50:01', 'URL_05'),
('COOKIE_2', '2015-04-10 10:00:02', 'URL_22'),
('COOKIE_2', '2015-04-10 10:00:00', 'URL_11'),
('COOKIE_2', '2015-04-10 10:03:04', 'URL_33'),
('COOKIE_2', '2015-04-10 10:50:05', 'URL_66'),
('COOKIE_2', '2015-04-10 11:00:00', 'URL_77'),
('COOKIE_2', '2015-04-10 10:10:00', 'URL_44'),
('COOKIE_2', '2015-04-10 10:50:01', 'URL_55');

INSERT INTO dept_w VALUES 
('D1', 'USER1', 1000),
('D1', 'USER2', 2000),
('D1', 'USER3', 3000),
('D2', 'USER4', 4000),
('D2', 'USER5', 5000);

SELECT * FROM cookie_1;

+-----------------------+-----------------------+--------------+
| cookie_1.cookie_name  | cookie_1.create_date  | cookie_1.pv  |
+-----------------------+-----------------------+--------------+
| cookie1               | 2015-04-10            | 1            |
| cookie1               | 2015-04-11            | 5            |
| cookie1               | 2015-04-12            | 7            |
| cookie1               | 2015-04-13            | 3            |
| cookie1               | 2015-04-14            | 2            |
| cookie1               | 2015-04-15            | 4            |
| cookie1               | 2015-04-16            | 4            |
+-----------------------+-----------------------+--------------+

SELECT * FROM cookie_2;

+-----------------------+-----------------------+--------------+
| cookie_2.cookie_name  | cookie_2.create_date  | cookie_2.pv  |
+-----------------------+-----------------------+--------------+
| cookie1               | 2015-04-10            | 1            |
| cookie1               | 2015-04-11            | 5            |
| cookie1               | 2015-04-12            | 7            |
| cookie1               | 2015-04-13            | 3            |
| cookie1               | 2015-04-14            | 2            |
| cookie1               | 2015-04-15            | 4            |
| cookie1               | 2015-04-16            | 4            |
| cookie2               | 2015-04-10            | 2            |
| cookie2               | 2015-04-11            | 3            |
| cookie2               | 2015-04-12            | 5            |
| cookie2               | 2015-04-13            | 6            |
| cookie2               | 2015-04-14            | 3            |
| cookie2               | 2015-04-15            | 9            |
| cookie2               | 2015-04-16            | 7            |
+-----------------------+-----------------------+--------------+


SELECT * FROM cookie_3;

+-----------------------+-----------------------+---------------+
| cookie_3.cookie_name  | cookie_3.create_time  | cookie_3.url  |
+-----------------------+-----------------------+---------------+
| COOKIE_1              | 2015-04-10 10:00:02   | URL_02        |
| COOKIE_1              | 2015-04-10 10:00:00   | URL_01        |
| COOKIE_1              | 2015-04-10 10:03:04   | URL_03        |
| COOKIE_1              | 2015-04-10 10:50:05   | URL_06        |
| COOKIE_1              | 2015-04-10 11:00:00   | URL_07        |
| COOKIE_1              | 2015-04-10 10:10:00   | URL_04        |
| COOKIE_1              | 2015-04-10 10:50:01   | URL_05        |
| COOKIE_2              | 2015-04-10 10:00:02   | URL_22        |
| COOKIE_2              | 2015-04-10 10:00:00   | URL_11        |
| COOKIE_2              | 2015-04-10 10:03:04   | URL_33        |
| COOKIE_2              | 2015-04-10 10:50:05   | URL_66        |
| COOKIE_2              | 2015-04-10 11:00:00   | URL_77        |
| COOKIE_2              | 2015-04-10 10:10:00   | URL_44        |
| COOKIE_2              | 2015-04-10 10:50:01   | URL_55        |
+-----------------------+-----------------------+---------------+

SELECT * FROM dept_w;

+-------------------+------------------+----------------+
| dept_w.dept_name  | dept_w.emp_name  | dept_w.salary  |
+-------------------+------------------+----------------+
| D1                | USER1            | 1000           |
| D1                | USER2            | 2000           |
| D1                | USER3            | 3000           |
| D2                | USER4            | 4000           |
| D2                | USER5            | 5000           |
+-------------------+------------------+----------------+

3. 窗口函数

3.1 SUM()

作用:求和

需求1:求cookie1的总pv数

SELECT 
  cookie_name, 
  create_date, 
  pv,
  SUM(pv) OVER(PARTITION BY cookie_name) AS pv_sum
FROM cookie_1;

+--------------+--------------+-----+---------+
| cookie_name  | create_date  | pv  | pv_sum  |
+--------------+--------------+-----+---------+
| cookie1      | 2015-04-16   | 4   | 26      |
| cookie1      | 2015-04-15   | 4   | 26      |
| cookie1      | 2015-04-14   | 2   | 26      |
| cookie1      | 2015-04-13   | 3   | 26      |
| cookie1      | 2015-04-12   | 7   | 26      |
| cookie1      | 2015-04-11   | 5   | 26      |
| cookie1      | 2015-04-10   | 1   | 26      |
+--------------+--------------+-----+---------+

SELECT 
  cookie_name, 
  create_date, 
  pv,
  SUM(pv) OVER(PARTITION BY cookie_name ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS pv_sum
FROM cookie_1;

+--------------+--------------+-----+---------+
| cookie_name  | create_date  | pv  | pv_sum  |
+--------------+--------------+-----+---------+
| cookie1      | 2015-04-16   | 4   | 26      |
| cookie1      | 2015-04-15   | 4   | 26      |
| cookie1      | 2015-04-14   | 2   | 26      |
| cookie1      | 2015-04-13   | 3   | 26      |
| cookie1      | 2015-04-12   | 7   | 26      |
| cookie1      | 2015-04-11   | 5   | 26      |
| cookie1      | 2015-04-10   | 1   | 26      |
+--------------+--------------+-----+---------+

需求2:按照create_date升序排序,求cookie1截止到当天的pv累加值

SELECT 
  cookie_name, 
  create_date, 
  pv,
  SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date ASC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS pv_sum
FROM cookie_1;

+--------------+--------------+-----+---------+
| cookie_name  | create_date  | pv  | pv_sum  |
+--------------+--------------+-----+---------+
| cookie1      | 2015-04-10   | 1   | 1       | -- 1
| cookie1      | 2015-04-11   | 5   | 6       | -- 1 + 5 = 6
| cookie1      | 2015-04-12   | 7   | 13      | -- 1 + 5 + 7 = 13
| cookie1      | 2015-04-13   | 3   | 16      | -- 1 + 5 + 7 + 3 = 16 
| cookie1      | 2015-04-14   | 2   | 18      | -- 1 + 5 + 7 + 3 + 2 = 18 
| cookie1      | 2015-04-15   | 4   | 22      | -- 1 + 5 + 7 + 3 + 2 + 4 = 22 
| cookie1      | 2015-04-16   | 4   | 26      | -- 1 + 5 + 7 + 3 + 2 + 4 + 4 = 26 
+--------------+--------------+-----+---------+

SELECT 
  cookie_name, 
  create_date, 
  pv,
  SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date) AS pv_sum
FROM cookie_1;

+--------------+--------------+-----+---------+
| cookie_name  | create_date  | pv  | pv_sum  |
+--------------+--------------+-----+---------+
| cookie1      | 2015-04-10   | 1   | 1       |
| cookie1      | 2015-04-11   | 5   | 6       |
| cookie1      | 2015-04-12   | 7   | 13      |
| cookie1      | 2015-04-13   | 3   | 16      |
| cookie1      | 2015-04-14   | 2   | 18      |
| cookie1      | 2015-04-15   | 4   | 22      |
| cookie1      | 2015-04-16   | 4   | 26      |
+--------------+--------------+-----+---------+

总结:

  • 排序规则默认为ASC
  • 默认的计算范围为ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW

需求3:滚动求cookie1最近3天(包括当天)的pv累加值

SELECT 
  cookie_name, 
  create_date, 
  pv,
  SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date ASC ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS pv_sum
FROM cookie_1;

+--------------+--------------+-----+---------+
| cookie_name  | create_date  | pv  | pv_sum  |
+--------------+--------------+-----+---------+
| cookie1      | 2015-04-10   | 1   | 1       | -- 1
| cookie1      | 2015-04-11   | 5   | 6       | -- 1 + 5 = 6
| cookie1      | 2015-04-12   | 7   | 13      | -- 1 + 5 + 7 = 13
| cookie1      | 2015-04-13   | 3   | 15      | -- 5 + 7 + 3 = 15
| cookie1      | 2015-04-14   | 2   | 12      | -- 7 + 3 + 2 = 12
| cookie1      | 2015-04-15   | 4   | 9       | -- 3 + 2 + 4 = 9
| cookie1      | 2015-04-16   | 4   | 10      | -- 2 + 4 + 4 = 10
+--------------+--------------+-----+---------+

需求4:滚动求cookie1当天到后2天(包括当天)的pv累加值

SELECT 
  cookie_name, 
  create_date, 
  pv,
  SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date ASC ROWS BETWEEN CURRENT ROW AND 2 FOLLOWING) AS pv_sum
FROM cookie_1;

+--------------+--------------+-----+---------+
| cookie_name  | create_date  | pv  | pv_sum  |
+--------------+--------------+-----+---------+
| cookie1      | 2015-04-10   | 1   | 13      | -- 1 + 5 + 7 = 13
| cookie1      | 2015-04-11   | 5   | 15      | -- 5 + 7 + 3 = 15
| cookie1      | 2015-04-12   | 7   | 12      | -- 7 + 3 + 2 = 12
| cookie1      | 2015-04-13   | 3   | 9       | -- 3 + 2 + 4 = 9
| cookie1      | 2015-04-14   | 2   | 10      | -- 2 + 4 + 4 = 10
| cookie1      | 2015-04-15   | 4   | 8       | -- 4 + 4 = 8
| cookie1      | 2015-04-16   | 4   | 4       | -- 4
+--------------+--------------+-----+---------+

需求5:统计cookie1当天到最后一天的pv累加值

SELECT 
  cookie_name, 
  create_date, 
  pv,
  SUM(pv) OVER(PARTITION BY cookie_name ORDER BY create_date ASC ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS pv_sum
FROM cookie_1;

+--------------+--------------+-----+---------+
| cookie_name  | create_date  | pv  | pv_sum  |
+--------------+--------------+-----+---------+
| cookie1      | 2015-04-10   | 1   | 26      | -- 1 + 5 + 7 + 3 + 2 + 4 + 4 = 26
| cookie1      | 2015-04-11   | 5   | 25      | -- 5 + 7 + 3 + 2 + 4 + 4 = 25
| cookie1      | 2015-04-12   | 7   | 20      | -- 7 + 3 + 2 + 4 + 4 = 20
| cookie1      | 2015-04-13   | 3   | 13      | -- 3 + 2 + 4 + 4 = 13
| cookie1      | 2015-04-14   | 2   | 10      | -- 2 + 4 + 4 = 10
| cookie1      | 2015-04-15   | 4   | 8       | -- 4 + 4 = 8
| cookie1      | 2015-04-16   | 4   | 4       | -- 4
+--------------+--------------+-----+---------+

3.2 AVG()

作用:求平均值

SELECT 
  cookie_name, 
  create_date, 
  pv,
  AVG(pv) OVER(PARTITION BY cookie_name ORDER BY create_date) AS avg_pv
FROM cookie_1;

+--------------+--------------+-----+---------------------+
| cookie_name  | create_date  | pv  |       avg_sum       |
+--------------+--------------+-----+---------------------+
| cookie1      | 2015-04-10   | 1   | 1.0                 | -- 1 / 1
| cookie1      | 2015-04-11   | 5   | 3.0                 | -- (1 + 5) / 2 = 6 / 2 
| cookie1      | 2015-04-12   | 7   | 4.333333333333333   | -- (1 + 5 + 7) / 3 = 13 / 3
| cookie1      | 2015-04-13   | 3   | 4.0                 | -- (1 + 5 + 7 + 3) / 4 = 16 / 4
| cookie1      | 2015-04-14   | 2   | 3.6                 | -- (1 + 5 + 7 + 3 + 2) / 5 = 18 / 5
| cookie1      | 2015-04-15   | 4   | 3.6666666666666665  | -- (1 + 5 + 7 + 3 + 2 + 4) / 6 = 22 / 6
| cookie1      | 2015-04-16   | 4   | 3.7142857142857144  | -- (1 + 5 + 7 + 3 + 2 + 4 + 4) / 7 = 26 / 7
+--------------+--------------+-----+---------------------+

3.3 MIN()

作用:求最小值

SELECT 
  cookie_name, 
  create_date, 
  pv,
  MIN(pv) OVER(PARTITION BY cookie_name ORDER BY create_date) AS min_pv
FROM cookie_1;

+--------------+--------------+-----+---------+
| cookie_name  | create_date  | pv  | min_pv  |
+--------------+--------------+-----+---------+
| cookie1      | 2015-04-10   | 1   | 1       | -- min(1) = 1
| cookie1      | 2015-04-11   | 5   | 1       | -- min(1, 5) = 1
| cookie1      | 2015-04-12   | 7   | 1       | -- min(1, 5, 7) = 1
| cookie1      | 2015-04-13   | 3   | 1       | -- min(1, 5, 7, 3) = 1
| cookie1      | 2015-04-14   | 2   | 1       | -- min(1, 5, 7, 3, 2) = 1
| cookie1      | 2015-04-15   | 4   | 1       | -- min(1, 5, 7, 3, 2, 4) = 1
| cookie1      | 2015-04-16   | 4   | 1       | -- min(1, 5, 7, 3, 2, 4, 4) = 1
+--------------+--------------+-----+---------+

3.4 MAX()

作用:求最大值

SELECT 
  cookie_name, 
  create_date, 
  pv,
  MAX(pv) OVER(PARTITION BY cookie_name ORDER BY create_date) AS max_pv
FROM cookie_1;

+--------------+--------------+-----+---------+
| cookie_name  | create_date  | pv  | max_pv  |
+--------------+--------------+-----+---------+
| cookie1      | 2015-04-10   | 1   | 1       | -- max(1) = 1
| cookie1      | 2015-04-11   | 5   | 5       | -- max(1, 5) = 5
| cookie1      | 2015-04-12   | 7   | 7       | -- max(1, 5, 7) = 7
| cookie1      | 2015-04-13   | 3   | 7       | -- max(1, 5, 7, 3) = 7
| cookie1      | 2015-04-14   | 2   | 7       | -- max(1, 5, 7, 3, 2) = 7
| cookie1      | 2015-04-15   | 4   | 7       | -- max(1, 5, 7, 3, 2, 4) = 7
| cookie1      | 2015-04-16   | 4   | 7       | -- max(1, 5, 7, 3, 2, 4, 4) = 7
+--------------+--------------+-----+---------+

3.5 ROW_NUMBER()

组内排名,生成行号,排名不重复

需求:求每个cookie排名前三的pv

-- 首先分组后进行排名,生成 row_number
SELECT 
  cookie_name,
  create_date,
  pv,
  ROW_NUMBER() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS row_number
FROM cookie_2;

+--------------+--------------+-----+-------------+
| cookie_name  | create_date  | pv  | row_number  |
+--------------+--------------+-----+-------------+
| cookie1      | 2015-04-12   | 7   | 1           |
| cookie1      | 2015-04-11   | 5   | 2           |
| cookie1      | 2015-04-16   | 4   | 3           |
| cookie1      | 2015-04-15   | 4   | 4           |
| cookie1      | 2015-04-13   | 3   | 5           |
| cookie1      | 2015-04-14   | 2   | 6           |
| cookie1      | 2015-04-10   | 1   | 7           |
| cookie2      | 2015-04-15   | 9   | 1           |
| cookie2      | 2015-04-16   | 7   | 2           |
| cookie2      | 2015-04-13   | 6   | 3           |
| cookie2      | 2015-04-12   | 5   | 4           |
| cookie2      | 2015-04-11   | 3   | 5           |
| cookie2      | 2015-04-14   | 3   | 6           |
| cookie2      | 2015-04-10   | 2   | 7           |
+--------------+--------------+-----+-------------+

-- 然后过滤出排名前三的pv
SELECT * FROM (
  SELECT 
    cookie_name,
    create_date,
    pv,
    ROW_NUMBER() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS row_number
  FROM cookie_2
) AS t
WHERE t.row_number <= 3;

+----------------+----------------+-------+---------------+
| t.cookie_name  | t.create_date  | t.pv  | t.row_number  |
+----------------+----------------+-------+---------------+
| cookie1        | 2015-04-12     | 7     | 1             |
| cookie1        | 2015-04-11     | 5     | 2             |
| cookie1        | 2015-04-16     | 4     | 3             |
| cookie2        | 2015-04-15     | 9     | 1             |
| cookie2        | 2015-04-16     | 7     | 2             |
| cookie2        | 2015-04-13     | 6     | 3             |
+----------------+----------------+-------+---------------+

注意:

  • 对于cookie1,2015-04-16 的pv为4,排名为3,2015-04-15的pv也为4,排名为4
  • 对于cookie2,2015-04-11 的pv为3,排名为5,2015-04-14的pv也为3,排名为6
  • 可见,row_number的特点是,排名依次递增,不会有重复,相同的值排名会不同

3.6 RANK()

组内排名,生成行号,如果比较的值一样,则排名相同,名次不连续

SELECT 
  cookie_name,
  create_date,
  pv,
  RANK() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS rank
FROM cookie_2;

+--------------+--------------+-----+-------+
| cookie_name  | create_date  | pv  | rank  |
+--------------+--------------+-----+-------+
| cookie1      | 2015-04-12   | 7   | 1     |
| cookie1      | 2015-04-11   | 5   | 2     |
| cookie1      | 2015-04-16   | 4   | 3     |
| cookie1      | 2015-04-15   | 4   | 3     |
| cookie1      | 2015-04-13   | 3   | 5     |
| cookie1      | 2015-04-14   | 2   | 6     |
| cookie1      | 2015-04-10   | 1   | 7     |
| cookie2      | 2015-04-15   | 9   | 1     |
| cookie2      | 2015-04-16   | 7   | 2     |
| cookie2      | 2015-04-13   | 6   | 3     |
| cookie2      | 2015-04-12   | 5   | 4     |
| cookie2      | 2015-04-11   | 3   | 5     |
| cookie2      | 2015-04-14   | 3   | 5     |
| cookie2      | 2015-04-10   | 2   | 7     |
+--------------+--------------+-----+-------+

注意:

  • 对于cookie1,2015-04-16 的pv为4,排名为3,2015-04-15的pv也为4,排名也为3,下一位的排名为5
  • 对于cookie2,2015-04-11 的pv为3,排名为5,2015-04-14的pv也为3,排名也为5,下一位的排名为7
  • 可见,rank的特点是,相同的比较值排名就一样,名次不连续

3.7 DENSE_RANK()

组内排名,生成行号,如果比较的值一样,则排名相同,名次连续

SELECT 
  cookie_name,
  create_date,
  pv,
  DENSE_RANK() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS rank
FROM cookie_2;

+--------------+--------------+-----+-------+
| cookie_name  | create_date  | pv  | rank  |
+--------------+--------------+-----+-------+
| cookie1      | 2015-04-12   | 7   | 1     |
| cookie1      | 2015-04-11   | 5   | 2     |
| cookie1      | 2015-04-16   | 4   | 3     |
| cookie1      | 2015-04-15   | 4   | 3     |
| cookie1      | 2015-04-13   | 3   | 4     |
| cookie1      | 2015-04-14   | 2   | 5     |
| cookie1      | 2015-04-10   | 1   | 6     |
| cookie2      | 2015-04-15   | 9   | 1     |
| cookie2      | 2015-04-16   | 7   | 2     |
| cookie2      | 2015-04-13   | 6   | 3     |
| cookie2      | 2015-04-12   | 5   | 4     |
| cookie2      | 2015-04-11   | 3   | 5     |
| cookie2      | 2015-04-14   | 3   | 5     |
| cookie2      | 2015-04-10   | 2   | 6     |
+--------------+--------------+-----+-------+

注意:

  • 对于cookie1,2015-04-16 的pv为4,排名为3,2015-04-15的pv也为4,排名也为3,下一位的排名为4
  • 对于cookie2,2015-04-11 的pv为3,排名为5,2015-04-14的pv也为3,排名也为5,下一位的排名为6
  • 可见,rank的特点是,相同的比较值排名就一样,名次连续

ROW_NUMBER()、RANK()、DENSE_RANK()的区别总结:

学生分数row_numberrankdense_rank
张三100111
李四100211
王五100311
赵六98442

3.8 PERCENT_RANK()

分组排序后,(当前行的RANK值 - 1) / (组内总行数 -1)

SELECT 
  cookie_name,
  create_date,
  pv,
  RANK() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS rank,
  PERCENT_RANK() OVER(PARTITION BY cookie_name ORDER BY pv DESC) AS percent_rank
FROM cookie_2;

+--------------+--------------+-----+-------+----------------------+
| cookie_name  | create_date  | pv  | rank  |     percent_rank     |
+--------------+--------------+-----+-------+----------------------+
| cookie1      | 2015-04-12   | 7   | 1     | 0.0                  | -- (1-1)/(7-1) 
| cookie1      | 2015-04-11   | 5   | 2     | 0.16666666666666666  | -- (2-1)/(7-1) 
| cookie1      | 2015-04-16   | 4   | 3     | 0.3333333333333333   | -- (3-1)/(7-1) 
| cookie1      | 2015-04-15   | 4   | 3     | 0.3333333333333333   | -- (3-1)/(7-1) 
| cookie1      | 2015-04-13   | 3   | 5     | 0.6666666666666666   | -- (5-1)/(7-1) 
| cookie1      | 2015-04-14   | 2   | 6     | 0.8333333333333334   | -- (6-1)/(7-1) 
| cookie1      | 2015-04-10   | 1   | 7     | 1.0                  | -- (7-1)/(7-1)
| cookie2      | 2015-04-15   | 9   | 1     | 0.0                  | -- (1-1)/(7-1)
| cookie2      | 2015-04-16   | 7   | 2     | 0.16666666666666666  | -- (2-1)/(7-1) 
| cookie2      | 2015-04-13   | 6   | 3     | 0.3333333333333333   | -- (3-1)/(7-1) 
| cookie2      | 2015-04-12   | 5   | 4     | 0.5                  | -- (4-1)/(7-1) 
| cookie2      | 2015-04-11   | 3   | 5     | 0.6666666666666666   | -- (5-1)/(7-1) 
| cookie2      | 2015-04-14   | 3   | 5     | 0.6666666666666666   | -- (5-1)/(7-1) 
| cookie2      | 2015-04-10   | 2   | 7     | 1.0                  | -- (7-1)/(7-1) 
+--------------+--------------+-----+-------+----------------------+

注意:

  • PERCENT_RANK()函数不能设置ROWS BETWEEN ... AND ... 子句,除非是ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING

3.9 NTILE()

将分组数据按照顺序分成n片,返回切片值,如果切片不均匀,给第一个切片增加数据

SELECT 
  cookie_name,
  create_date,
  pv,
  NTILE(2) OVER(PARTITION BY cookie_name ORDER BY create_date) AS rank
FROM cookie_2;

+--------------+--------------+-----+-------+
| cookie_name  | create_date  | pv  | rank  |
+--------------+--------------+-----+-------+
| cookie1      | 2015-04-10   | 1   | 1     |
| cookie1      | 2015-04-11   | 5   | 1     |
| cookie1      | 2015-04-12   | 7   | 1     |
| cookie1      | 2015-04-13   | 3   | 1     |
| cookie1      | 2015-04-14   | 2   | 2     |
| cookie1      | 2015-04-15   | 4   | 2     |
| cookie1      | 2015-04-16   | 4   | 2     |
| cookie2      | 2015-04-10   | 2   | 1     |
| cookie2      | 2015-04-11   | 3   | 1     |
| cookie2      | 2015-04-12   | 5   | 1     |
| cookie2      | 2015-04-13   | 6   | 1     |
| cookie2      | 2015-04-14   | 3   | 2     |
| cookie2      | 2015-04-15   | 9   | 2     |
| cookie2      | 2015-04-16   | 7   | 2     |
+--------------+--------------+-----+-------+

注意:

  • 每个分组有7条数据,如果无法均匀分配,多余的数据给切片1
  • NTILE()函数不能设置ROWS BETWEEN ... AND ...子句,除非是ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING

3.10 LAG()

函数作用:分组后,向上取第n行取指定一列的数据到当前行

函数定义:LAG(col, n, DEFAULT)

  • col:指定一列
  • 设置往上取第n行的数据,默认为1
  • 默认值,如果往上取第n行没有数据,则给指定的默认值,默认给NULL

测试:

SELECT 
  cookie_name,
  create_time,
  url,
  LAG(create_time, 2, '1970-01-01 08:00:00') OVER(PARTITION BY cookie_name ORDER BY create_time) AS last_2_create_time
FROM cookie_3;

+--------------+----------------------+---------+----------------------+
| cookie_name  |     create_time      |   url   |  last_2_create_time  |
+--------------+----------------------+---------+----------------------+
| COOKIE_1     | 2015-04-10 10:00:00  | URL_01  | 1970-01-01 08:00:00  |
| COOKIE_1     | 2015-04-10 10:00:02  | URL_02  | 1970-01-01 08:00:00  |
| COOKIE_1     | 2015-04-10 10:03:04  | URL_03  | 2015-04-10 10:00:00  |
| COOKIE_1     | 2015-04-10 10:10:00  | URL_04  | 2015-04-10 10:00:02  |
| COOKIE_1     | 2015-04-10 10:50:01  | URL_05  | 2015-04-10 10:03:04  |
| COOKIE_1     | 2015-04-10 10:50:05  | URL_06  | 2015-04-10 10:10:00  |
| COOKIE_1     | 2015-04-10 11:00:00  | URL_07  | 2015-04-10 10:50:01  |
| COOKIE_2     | 2015-04-10 10:00:00  | URL_11  | 1970-01-01 08:00:00  |
| COOKIE_2     | 2015-04-10 10:00:02  | URL_22  | 1970-01-01 08:00:00  |
| COOKIE_2     | 2015-04-10 10:03:04  | URL_33  | 2015-04-10 10:00:00  |
| COOKIE_2     | 2015-04-10 10:10:00  | URL_44  | 2015-04-10 10:00:02  |
| COOKIE_2     | 2015-04-10 10:50:01  | URL_55  | 2015-04-10 10:03:04  |
| COOKIE_2     | 2015-04-10 10:50:05  | URL_66  | 2015-04-10 10:10:00  |
| COOKIE_2     | 2015-04-10 11:00:00  | URL_77  | 2015-04-10 10:50:01  |
+--------------+----------------------+---------+----------------------+

SELECT 
  cookie_name,
  create_time,
  url,
  LAG(create_time) OVER(PARTITION BY cookie_name ORDER BY create_time) AS last_create_time
FROM cookie_3;

+--------------+----------------------+---------+----------------------+
| cookie_name  |     create_time      |   url   |   last_create_time   |
+--------------+----------------------+---------+----------------------+
| COOKIE_1     | 2015-04-10 10:00:00  | URL_01  | NULL                 |
| COOKIE_1     | 2015-04-10 10:00:02  | URL_02  | 2015-04-10 10:00:00  |
| COOKIE_1     | 2015-04-10 10:03:04  | URL_03  | 2015-04-10 10:00:02  |
| COOKIE_1     | 2015-04-10 10:10:00  | URL_04  | 2015-04-10 10:03:04  |
| COOKIE_1     | 2015-04-10 10:50:01  | URL_05  | 2015-04-10 10:10:00  |
| COOKIE_1     | 2015-04-10 10:50:05  | URL_06  | 2015-04-10 10:50:01  |
| COOKIE_1     | 2015-04-10 11:00:00  | URL_07  | 2015-04-10 10:50:05  |
| COOKIE_2     | 2015-04-10 10:00:00  | URL_11  | NULL                 |
| COOKIE_2     | 2015-04-10 10:00:02  | URL_22  | 2015-04-10 10:00:00  |
| COOKIE_2     | 2015-04-10 10:03:04  | URL_33  | 2015-04-10 10:00:02  |
| COOKIE_2     | 2015-04-10 10:10:00  | URL_44  | 2015-04-10 10:03:04  |
| COOKIE_2     | 2015-04-10 10:50:01  | URL_55  | 2015-04-10 10:10:00  |
| COOKIE_2     | 2015-04-10 10:50:05  | URL_66  | 2015-04-10 10:50:01  |
| COOKIE_2     | 2015-04-10 11:00:00  | URL_77  | 2015-04-10 10:50:05  |
+--------------+----------------------+---------+----------------------+

3.11 LEAD()

函数作用:分组后,向下取第n行取指定一列的数据到当前行

函数定义:LAG(col, n, DEFAULT)

  • col:指定一列
  • 设置往下取第n行的数据,默认为1
  • 默认值,如果往下取第n行没有数据,则给指定的默认值,默认给NULL
SELECT 
  cookie_name,
  create_time,
  url,
  LEAD(create_time) OVER(PARTITION BY cookie_name ORDER BY create_time) AS next_create_time
FROM cookie_3;

+--------------+----------------------+---------+----------------------+
| cookie_name  |     create_time      |   url   |   next_create_time   |
+--------------+----------------------+---------+----------------------+
| COOKIE_1     | 2015-04-10 10:00:00  | URL_01  | 2015-04-10 10:00:02  |
| COOKIE_1     | 2015-04-10 10:00:02  | URL_02  | 2015-04-10 10:03:04  |
| COOKIE_1     | 2015-04-10 10:03:04  | URL_03  | 2015-04-10 10:10:00  |
| COOKIE_1     | 2015-04-10 10:10:00  | URL_04  | 2015-04-10 10:50:01  |
| COOKIE_1     | 2015-04-10 10:50:01  | URL_05  | 2015-04-10 10:50:05  |
| COOKIE_1     | 2015-04-10 10:50:05  | URL_06  | 2015-04-10 11:00:00  |
| COOKIE_1     | 2015-04-10 11:00:00  | URL_07  | NULL                 |
| COOKIE_2     | 2015-04-10 10:00:00  | URL_11  | 2015-04-10 10:00:02  |
| COOKIE_2     | 2015-04-10 10:00:02  | URL_22  | 2015-04-10 10:03:04  |
| COOKIE_2     | 2015-04-10 10:03:04  | URL_33  | 2015-04-10 10:10:00  |
| COOKIE_2     | 2015-04-10 10:10:00  | URL_44  | 2015-04-10 10:50:01  |
| COOKIE_2     | 2015-04-10 10:50:01  | URL_55  | 2015-04-10 10:50:05  |
| COOKIE_2     | 2015-04-10 10:50:05  | URL_66  | 2015-04-10 11:00:00  |
| COOKIE_2     | 2015-04-10 11:00:00  | URL_77  | NULL                 |
+--------------+----------------------+---------+----------------------+

3.12 FIRST_VALUE()

分组排序后取截止到当前行的第一行指定列的数据

SELECT 
  cookie_name,
  create_time,
  url,
  FIRST_VALUE(url) OVER(PARTITION BY cookie_name ORDER BY create_time) AS first_url
FROM cookie_3;

+--------------+----------------------+---------+------------+
| cookie_name  |     create_time      |   url   | first_url  |
+--------------+----------------------+---------+------------+
| COOKIE_1     | 2015-04-10 10:00:00  | URL_01  | URL_01     |
| COOKIE_1     | 2015-04-10 10:00:02  | URL_02  | URL_01     |
| COOKIE_1     | 2015-04-10 10:03:04  | URL_03  | URL_01     |
| COOKIE_1     | 2015-04-10 10:10:00  | URL_04  | URL_01     |
| COOKIE_1     | 2015-04-10 10:50:01  | URL_05  | URL_01     |
| COOKIE_1     | 2015-04-10 10:50:05  | URL_06  | URL_01     |
| COOKIE_1     | 2015-04-10 11:00:00  | URL_07  | URL_01     |
| COOKIE_2     | 2015-04-10 10:00:00  | URL_11  | URL_11     |
| COOKIE_2     | 2015-04-10 10:00:02  | URL_22  | URL_11     |
| COOKIE_2     | 2015-04-10 10:03:04  | URL_33  | URL_11     |
| COOKIE_2     | 2015-04-10 10:10:00  | URL_44  | URL_11     |
| COOKIE_2     | 2015-04-10 10:50:01  | URL_55  | URL_11     |
| COOKIE_2     | 2015-04-10 10:50:05  | URL_66  | URL_11     |
| COOKIE_2     | 2015-04-10 11:00:00  | URL_77  | URL_11     |
+--------------+----------------------+---------+------------+

3.13 LAST_VALUE()

分组排序后取截止到当前行的最后一行指定列的数据

SELECT 
  cookie_name,
  create_time,
  url,
  LAST_VALUE(url) OVER(PARTITION BY cookie_name ORDER BY create_time) AS last_url
FROM cookie_3;

+--------------+----------------------+---------+-----------+
| cookie_name  |     create_time      |   url   | last_url  |
+--------------+----------------------+---------+-----------+
| COOKIE_1     | 2015-04-10 10:00:00  | URL_01  | URL_01    |
| COOKIE_1     | 2015-04-10 10:00:02  | URL_02  | URL_02    |
| COOKIE_1     | 2015-04-10 10:03:04  | URL_03  | URL_03    |
| COOKIE_1     | 2015-04-10 10:10:00  | URL_04  | URL_04    |
| COOKIE_1     | 2015-04-10 10:50:01  | URL_05  | URL_05    |
| COOKIE_1     | 2015-04-10 10:50:05  | URL_06  | URL_06    |
| COOKIE_1     | 2015-04-10 11:00:00  | URL_07  | URL_07    |
| COOKIE_2     | 2015-04-10 10:00:00  | URL_11  | URL_11    |
| COOKIE_2     | 2015-04-10 10:00:02  | URL_22  | URL_22    |
| COOKIE_2     | 2015-04-10 10:03:04  | URL_33  | URL_33    |
| COOKIE_2     | 2015-04-10 10:10:00  | URL_44  | URL_44    |
| COOKIE_2     | 2015-04-10 10:50:01  | URL_55  | URL_55    |
| COOKIE_2     | 2015-04-10 10:50:05  | URL_66  | URL_66    |
| COOKIE_2     | 2015-04-10 11:00:00  | URL_77  | URL_77    |
+--------------+----------------------+---------+-----------+

3.14 CUME_DIST()

分组排序后,求小于等于或大于等于当前行排序列的值的比例

SELECT 
  dept_name,
  emp_name,
  salary,
  CUME_DIST() OVER(PARTITION BY dept_name ORDER BY salary) AS cume
FROM dept_w;

+------------+-----------+---------+---------------------+
| dept_name  | emp_name  | salary  |        cume         |
+------------+-----------+---------+---------------------+
| D1         | USER1     | 1000    | 0.3333333333333333  | -- 在D1部门内,薪水小于等于1000的比例为1/3
| D1         | USER2     | 2000    | 0.6666666666666666  | -- 在D1部门内,薪水小于等于2000的比例为2/3
| D1         | USER3     | 3000    | 1.0                 | -- 在D1部门内,薪水小于等于3000的比例为3/3
| D2         | USER4     | 4000    | 0.5                 | -- 在D2部门内,薪水小于等于4000的比例为1/2
| D2         | USER5     | 5000    | 1.0                 | -- 在D2部门内,薪水小于等于5000的比例为2/2
+------------+-----------+---------+---------------------+

SELECT 
  dept_name,
  emp_name,
  salary,
  CUME_DIST() OVER(PARTITION BY dept_name ORDER BY salary DESC) AS cume
FROM dept_w;

+------------+-----------+---------+---------------------+
| dept_name  | emp_name  | salary  |        cume         |
+------------+-----------+---------+---------------------+
| D1         | USER3     | 3000    | 0.3333333333333333  |
| D1         | USER2     | 2000    | 0.6666666666666666  |
| D1         | USER1     | 1000    | 1.0                 |
| D2         | USER5     | 5000    | 0.5                 |
| D2         | USER4     | 4000    | 1.0                 |
+------------+-----------+---------+---------------------+

-- 不分组
SELECT 
  dept_name,
  emp_name,
  salary,
  CUME_DIST() OVER(ORDER BY salary) AS cume
FROM dept_w;

+------------+-----------+---------+-------+
| dept_name  | emp_name  | salary  | cume  |
+------------+-----------+---------+-------+
| D1         | USER1     | 1000    | 0.2   | -- 在全部数据中,薪水小于等于1000的比例为1/5
| D1         | USER2     | 2000    | 0.4   | -- 在全部数据中,薪水小于等于2000的比例为2/5
| D1         | USER3     | 3000    | 0.6   | -- 在全部数据中,薪水小于等于3000的比例为3/5
| D2         | USER4     | 4000    | 0.8   | -- 在全部数据中,薪水小于等于4000的比例为4/5
| D2         | USER5     | 5000    | 1.0   | -- 在全部数据中,薪水小于等于5000的比例为5/5
+------------+-----------+---------+-------+

SELECT 
  dept_name,
  emp_name,
  salary,
  CUME_DIST() OVER(ORDER BY salary DESC) AS cume
FROM dept_w;

+------------+-----------+---------+-------+
| dept_name  | emp_name  | salary  | cume  |
+------------+-----------+---------+-------+
| D2         | USER5     | 5000    | 0.2   |
| D2         | USER4     | 4000    | 0.4   |
| D1         | USER3     | 3000    | 0.6   |
| D1         | USER2     | 2000    | 0.8   |
| D1         | USER1     | 1000    | 1.0   |
+------------+-----------+---------+-------+

注意:

  • 使用哪一列排序,就是针对那一列进行比例统计
  • 升序,则统计组内小于等于该列值的比例
  • 将许,则统计组内大于等于该列值的比例
  • 可以不分组统计
  • 不能设置ROWS BETWEEN ... AND ...子句,除非是ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING