Hive内置函数大全-08-多维分析

147 阅读2分钟

GROUPING SETS

准备测试数据:

SET hive.exec.mode.local.auto=true;

CREATE TABLE test_grouping(
  deptno INT,
  name STRING,
  gender INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';

INSERT INTO test_grouping VALUES 
(10, 'TOM', 0), 
(10, 'TONY', 1),
(10, 'KATE', 0),
(20, 'JACK', 1),
(20, 'JACKSON', 1),
(20, 'BOB', 0),
(20, 'ALLEN', 1);

需求:统计每个部门各有多少人、男女各多少人、每个部门男女各多少人

实现:

-- 普通实现
SELECT deptno, NULL AS gender, COUNT(*) AS count FROM test_grouping GROUP BY deptno
UNION
SELECT NULL AS deptno, gender, COUNT(*) AS count FROM test_grouping GROUP BY gender
UNION
SELECT deptno, gender, COUNT(*) AS count FROM test_grouping GROUP BY deptno, gender;
+-------------+-------------+------------+
| _u2.deptno  | _u2.gender  | _u2.count  |
+-------------+-------------+------------+
| NULL        | 0           | 3          |
| NULL        | 1           | 4          |
| 10          | NULL        | 3          |
| 10          | 0           | 2          |
| 10          | 1           | 1          |
| 20          | NULL        | 4          |
| 20          | 0           | 1          |
| 20          | 1           | 3          |
+-------------+-------------+------------+

-- GROUPING SETS
SELECT deptno, gender, COUNT(*) AS count 
FROM test_grouping 
GROUP BY deptno, gender 
GROUPING SETS(deptno, gender, (deptno, gender));

+---------+---------+--------+
| deptno  | gender  | count  |
+---------+---------+--------+
| NULL    | 0       | 3      |
| NULL    | 1       | 4      |
| 10      | NULL    | 3      |
| 10      | 0       | 2      |
| 10      | 1       | 1      |
| 20      | NULL    | 4      |
| 20      | 0       | 1      |
| 20      | 1       | 3      |
+---------+---------+--------+

GROUPING__ID

与GROUPING SETS一起使用,显示每个分组的ID,注意GROUPING和ID之间是两个下划线。

SELECT 
  GROUPING__ID,
  deptno, 
  gender, 
  COUNT(*) AS count 
FROM test_grouping 
GROUP BY deptno, gender 
GROUPING SETS(deptno, gender, (deptno, gender))
ORDER BY GROUPING__ID;;

+---------------+---------+---------+--------+
| grouping__id  | deptno  | gender  | count  |
+---------------+---------+---------+--------+
| 0             | 20      | 1       | 3      |
| 0             | 20      | 0       | 1      |
| 0             | 10      | 1       | 1      |
| 0             | 10      | 0       | 2      |
| 1             | 20      | NULL    | 4      |
| 1             | 10      | NULL    | 3      |
| 2             | NULL    | 1       | 4      |
| 2             | NULL    | 0       | 3      |
+---------------+---------+---------+--------+

CUBE

根据GROUP BY的所有字段的排列组合进行多维分析

SELECT 
  GROUPING__ID,
  deptno, 
  gender, 
  COUNT(*) AS count 
FROM test_grouping 
GROUP BY deptno, gender 
WITH CUBE
ORDER BY GROUPING__ID;;

+---------------+---------+---------+--------+
| grouping__id  | deptno  | gender  | count  |
+---------------+---------+---------+--------+
| 0             | 20      | 1       | 3      |
| 0             | 20      | 0       | 1      |
| 0             | 10      | 1       | 1      |
| 0             | 10      | 0       | 2      |
| 1             | 20      | NULL    | 4      |
| 1             | 10      | NULL    | 3      |
| 2             | NULL    | 1       | 4      |
| 2             | NULL    | 0       | 3      |
| 3             | NULL    | NULL    | 7      |
+---------------+---------+---------+--------+

说明:GROUP BY a, b, c WITH CUBE相当于GROUP BY a, b, c GROUPING SETS(a, b, c, (a, b), (a, c), (b, c), (a, b, c)) + 不分组的全局统计

ROLLUP

CUBE的子集

SELECT 
  GROUPING__ID,
  deptno, 
  gender, 
  COUNT(*) AS count 
FROM test_grouping 
GROUP BY deptno, gender 
WITH ROLLUP
ORDER BY GROUPING__ID;;

+---------------+---------+---------+--------+
| grouping__id  | deptno  | gender  | count  |
+---------------+---------+---------+--------+
| 0             | 20      | 1       | 3      |
| 0             | 20      | 0       | 1      |
| 0             | 10      | 1       | 1      |
| 0             | 10      | 0       | 2      |
| 1             | 20      | NULL    | 4      |
| 1             | 10      | NULL    | 3      |
| 3             | NULL    | NULL    | 7      |
+---------------+---------+---------+--------+

说明:ROLLUP只统计GROUP BY的第一个字段有关的维度,GROUP BY a, b, c WITH ROLLUP相当于GROUP BY a, b, c GROUPING SETS(a, (a, b), (a, c), (a, b, c)) + 不分组的全局统计