GROUPING SETS
准备测试数据:
SET hive.exec.mode.local.auto=true;
CREATE TABLE test_grouping(
deptno INT,
name STRING,
gender INT
)
ROW FORMAT DELIMITED FIELDS TERMINATED by ',';
INSERT INTO test_grouping VALUES
(10, 'TOM', 0),
(10, 'TONY', 1),
(10, 'KATE', 0),
(20, 'JACK', 1),
(20, 'JACKSON', 1),
(20, 'BOB', 0),
(20, 'ALLEN', 1);
需求:统计每个部门各有多少人、男女各多少人、每个部门男女各多少人
实现:
-- 普通实现
SELECT deptno, NULL AS gender, COUNT(*) AS count FROM test_grouping GROUP BY deptno
UNION
SELECT NULL AS deptno, gender, COUNT(*) AS count FROM test_grouping GROUP BY gender
UNION
SELECT deptno, gender, COUNT(*) AS count FROM test_grouping GROUP BY deptno, gender;
+-------------+-------------+------------+
| _u2.deptno | _u2.gender | _u2.count |
+-------------+-------------+------------+
| NULL | 0 | 3 |
| NULL | 1 | 4 |
| 10 | NULL | 3 |
| 10 | 0 | 2 |
| 10 | 1 | 1 |
| 20 | NULL | 4 |
| 20 | 0 | 1 |
| 20 | 1 | 3 |
+-------------+-------------+------------+
-- GROUPING SETS
SELECT deptno, gender, COUNT(*) AS count
FROM test_grouping
GROUP BY deptno, gender
GROUPING SETS(deptno, gender, (deptno, gender));
+---------+---------+--------+
| deptno | gender | count |
+---------+---------+--------+
| NULL | 0 | 3 |
| NULL | 1 | 4 |
| 10 | NULL | 3 |
| 10 | 0 | 2 |
| 10 | 1 | 1 |
| 20 | NULL | 4 |
| 20 | 0 | 1 |
| 20 | 1 | 3 |
+---------+---------+--------+
GROUPING__ID
与GROUPING SETS一起使用,显示每个分组的ID,注意GROUPING和ID之间是两个下划线。
SELECT
GROUPING__ID,
deptno,
gender,
COUNT(*) AS count
FROM test_grouping
GROUP BY deptno, gender
GROUPING SETS(deptno, gender, (deptno, gender))
ORDER BY GROUPING__ID;;
+---------------+---------+---------+--------+
| grouping__id | deptno | gender | count |
+---------------+---------+---------+--------+
| 0 | 20 | 1 | 3 |
| 0 | 20 | 0 | 1 |
| 0 | 10 | 1 | 1 |
| 0 | 10 | 0 | 2 |
| 1 | 20 | NULL | 4 |
| 1 | 10 | NULL | 3 |
| 2 | NULL | 1 | 4 |
| 2 | NULL | 0 | 3 |
+---------------+---------+---------+--------+
CUBE
根据GROUP BY的所有字段的排列组合进行多维分析
SELECT
GROUPING__ID,
deptno,
gender,
COUNT(*) AS count
FROM test_grouping
GROUP BY deptno, gender
WITH CUBE
ORDER BY GROUPING__ID;;
+---------------+---------+---------+--------+
| grouping__id | deptno | gender | count |
+---------------+---------+---------+--------+
| 0 | 20 | 1 | 3 |
| 0 | 20 | 0 | 1 |
| 0 | 10 | 1 | 1 |
| 0 | 10 | 0 | 2 |
| 1 | 20 | NULL | 4 |
| 1 | 10 | NULL | 3 |
| 2 | NULL | 1 | 4 |
| 2 | NULL | 0 | 3 |
| 3 | NULL | NULL | 7 |
+---------------+---------+---------+--------+
说明:GROUP BY a, b, c WITH CUBE相当于GROUP BY a, b, c GROUPING SETS(a, b, c, (a, b), (a, c), (b, c), (a, b, c)) + 不分组的全局统计
ROLLUP
CUBE的子集
SELECT
GROUPING__ID,
deptno,
gender,
COUNT(*) AS count
FROM test_grouping
GROUP BY deptno, gender
WITH ROLLUP
ORDER BY GROUPING__ID;;
+---------------+---------+---------+--------+
| grouping__id | deptno | gender | count |
+---------------+---------+---------+--------+
| 0 | 20 | 1 | 3 |
| 0 | 20 | 0 | 1 |
| 0 | 10 | 1 | 1 |
| 0 | 10 | 0 | 2 |
| 1 | 20 | NULL | 4 |
| 1 | 10 | NULL | 3 |
| 3 | NULL | NULL | 7 |
+---------------+---------+---------+--------+
说明:ROLLUP只统计GROUP BY的第一个字段有关的维度,GROUP BY a, b, c WITH ROLLUP相当于GROUP BY a, b, c GROUPING SETS(a, (a, b), (a, c), (a, b, c)) + 不分组的全局统计