这是我参与8月更文挑战的第31天,活动详情查看:8月更文挑战
DQL -- Data Query Language 数据查询语言
select 语法:
SELECT [ALL | DISTINCT] select_expr, select_expr, ...
FROM table_reference
[WHERE where_condition]
[GROUP BY col_list]
[ORDER BY col_list]
[CLUSTER BY col_list | [DISTRIBUTE BY col_list] [SORT BY
col_list]]
[LIMIT [offset,] rows]
SQL 语句书写注意事项:
SQL语句对大小写不敏感SQL语句可以写一行(简单SQL)也可以写多行(复杂SQL)- 关键字不能缩写, 也不能分行
- 各子句一般要分行
- 使用缩进格式, 提高SQL语句的可读性(重要)
创建表,加载数据
-- 测试数据 /home/hadoop/data/emp.dat
7369,SMITH,CLERK,7902,2010-12-17,800,,20
7499,ALLEN,SALESMAN,7698,2011-02-20,1600,300,30
7521,WARD,SALESMAN,7698,2011-02-22,1250,500,30
7566,JONES,MANAGER,7839,2011-04-02,2975,,20
7654,MARTIN,SALESMAN,7698,2011-09-28,1250,1400,30
7698,BLAKE,MANAGER,7839,2011-05-01,2850,,30
7782,CLARK,MANAGER,7839,2011-06-09,2450,,10
7788,SCOTT,ANALYST,7566,2017-07-13,3000,,20
7839,KING,PRESIDENT,,2011-11-07,5000,,10
7844,TURNER,SALESMAN,7698,2011-09-08,1500,0,30
7876,ADAMS,CLERK,7788,2017-07-13,1100,,20
7900,JAMES,CLERK,7698,2011-12-03,950,,30
7902,FORD,ANALYST,7566,2011-12-03,3000,,20
7934,MILLER,CLERK,7782,2012-01-23,1300,,10
-- 建表并加载数据
CREATE TABLE emp (
empno int,
ename string,
job string,
mgr int,
hiredate DATE,
sal int,
comm int,
deptno int
)row format delimited fields terminated by ",";
-- 加载数据
hive (mydb)> LOAD DATA LOCAL INPATH '/home/hadoop/data/emp.dat' INTO TABLE emp;
Loading data to table mydb.emp
OK
Time taken: 0.316 seconds
一、基本查询
-- 省略from子句的查询
hive (mydb)> select 8*888;
OK
_c0
7104
Time taken: 0.097 seconds, Fetched: 1 row(s)
hive (mydb)> select current_date;
OK
_c0
2020-08-17
Time taken: 0.091 seconds, Fetched: 1 row(s)
-- 使用列别名
hive (mydb)> select 8*888 product;
OK
product
7104
Time taken: 0.071 seconds, Fetched: 1 row(s)
hive (mydb)> select current_date as currdate;
OK
currdate
2020-08-17
Time taken: 0.068 seconds, Fetched: 1 row(s)
-- 全表查询
hive (mydb)> select * from emp;
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7654 MARTIN SALESMAN 7698 2011-09-28 1250 1400 30
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7782 CLARK MANAGER 7839 2011-06-09 2450 NULL 10
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7839 KING PRESIDENT NULL 2011-11-07 5000 NULL 10
7844 TURNER SALESMAN 7698 2011-09-08 1500 0 30
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7900 JAMES CLERK 7698 2011-12-03 950 NULL 30
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
7934 MILLER CLERK 7782 2012-01-23 1300 NULL 10
Time taken: 0.116 seconds, Fetched: 14 row(s)
-- 选择特定列查询
hive (mydb)> select ename, sal, comm from emp;
OK
ename sal comm
SMITH 800 NULL
ALLEN 1600 300
WARD 1250 500
JONES 2975 NULL
MARTIN 1250 1400
BLAKE 2850 NULL
CLARK 2450 NULL
SCOTT 3000 NULL
KING 5000 NULL
TURNER 1500 0
ADAMS 1100 NULL
JAMES 950 NULL
FORD 3000 NULL
MILLER 1300 NULL
Time taken: 0.129 seconds, Fetched: 14 row(s)
-- 使用函数
hive (mydb)> select count(*) from emp;
OK
_c0
14
Time taken: 1.402 seconds, Fetched: 1 row(s)
-- count(colname) 按字段进行count,不统计NULL
hive (mydb)> select sum(sal) from emp;
hive (mydb)> select max(sal) from emp;
hive (mydb)> select min(sal) from emp;
hive (mydb)> select avg(sal) from emp;
-- 使用limit子句限制返回的行数
hive (mydb)> select * from emp limit 3;
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
Time taken: 0.124 seconds, Fetched: 3 row(s)
二、where 子句
WHERE 子句紧随 FROM 子句, 使用 WHERE 子句,过滤不满足条件的数据;
WHERE 子句中不能使用列的别名;
hive (mydb)> select * from emp where sal > 2000;
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7782 CLARK MANAGER 7839 2011-06-09 2450 NULL 10
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7839 KING PRESIDENT NULL 2011-11-07 5000 NULL 10
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
Time taken: 0.177 seconds, Fetched: 6 row(s)
- 比较运算符
通常情况下 NULL 参与运算,返回值为 NULL;
NULL <=> NULL的结果为true
- 逻辑运算符
-- 比较运算符,null参与运算
hive (mydb)> select null == null;
OK
_c0
NULL
Time taken: 0.09 seconds, Fetched: 1 row(s)
hive (mydb)> select null = null;
OK
_c0
NULL
Time taken: 0.11 seconds, Fetched: 1 row(s)
hive (mydb)> select null <=> null;
OK
_c0
true
Time taken: 0.061 seconds, Fetched: 1 row(s)
-- 使用 is null 判空
hive (mydb)> select * from emp where comm is null;
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7782 CLARK MANAGER 7839 2011-06-09 2450 NULL 10
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7839 KING PRESIDENT NULL 2011-11-07 5000 NULL 10
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7900 JAMES CLERK 7698 2011-12-03 950 NULL 30
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
7934 MILLER CLERK 7782 2012-01-23 1300 NULL 10
Time taken: 0.274 seconds, Fetched: 10 row(s)
-- 使用 in
hive (mydb)> select * from emp where deptno in (20, 30);
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7654 MARTIN SALESMAN 7698 2011-09-28 1250 1400 30
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7844 TURNER SALESMAN 7698 2011-09-08 1500 0 30
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7900 JAMES CLERK 7698 2011-12-03 950 NULL 30
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
Time taken: 0.184 seconds, Fetched: 11 row(s)
-- 使用 between ... and ...
hive (mydb)> select * from emp where sal between 1000 and 2000;
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
7654 MARTIN SALESMAN 7698 2011-09-28 1250 1400 30
7844 TURNER SALESMAN 7698 2011-09-08 1500 0 30
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7934 MILLER CLERK 7782 2012-01-23 1300 NULL 10
Time taken: 0.124 seconds, Fetched: 6 row(s)
-- 使用 like
hive (mydb)> select ename, sal from emp where ename like '%L%';
OK
ename sal
ALLEN 1600
BLAKE 2850
CLARK 2450
MILLER 1300
Time taken: 0.181 seconds, Fetched: 4 row(s)
-- 使用 rlike。正则表达式,名字以A或S开头
hive (mydb)> select ename, sal from emp where ename rlike '^(A|S).*';
OK
ename sal
SMITH 800
ALLEN 1600
SCOTT 3000
ADAMS 1100
Time taken: 0.142 seconds, Fetched: 4 row(s)
三、group by 子句
GROUP BY 语句通常与聚组函数一起使用, 按照一个或多个列对数据进行分组, 对每个组进行聚合操作。
-- 计算emp表每个部门的平均工资
select deptno, avg(sal)
from emp
group by deptno;
deptno _c1
10 2916.6666666666665
20 2175.0
30 1566.6666666666667
-- 计算emp每个部门中每个岗位的最高薪水
select deptno, job, max(sal)
from emp
group by deptno, job;
deptno job _c2
20 ANALYST 3000
10 CLERK 1300
20 CLERK 1100
30 CLERK 950
10 MANAGER 2450
20 MANAGER 2975
30 MANAGER 2850
10 PRESIDENT 5000
30 SALESMAN 1600
-
where子句针对表中的数据发挥作用;having针对查询结果(聚组以后的结果)发挥作用 -
where子句不能有分组函数;having子句可以有分组函数 -
having只用于group by分组统计之后
-- 求每个部门的平均薪水大于2000的部门
select deptno, avg(sal)
from emp
group by deptno
having avg(sal) > 2000;
deptno _c1
10 2916.6666666666665
20 2175.0
四、表连接
Hive 支持通常的 SQL JOIN 语句。
默认情况下, 仅支持等值连接, 不支持非等值连接。
JOIN 语句中经常会使用表的别名。使用别名可以简化SQL语句的编写, 使用表名前缀可以提高SQL的解析效率。
连接查询操作分为两大类:
-
内连接
[inner] join -
外连接
(outer join)
- 左外连接。
left [outer] join, 左表的数据全部显示- 右外连接。
right [outer] join, 右表的数据全部显示- 全外连接。
full [outer] join, 两张表的数据都显示
如图:
(1)案例演示
- 数据准备
-- 准备数据 /home/hadoop/data/u1.txt , /home/hadoop/data/u2.txt
u1.txt数据:
1,a
2,b
3,c4,d
5,e
6,f
u2.txt数据:
4,d
5,e
6,f
7,g
8,h
9,i
create table if not exists u1(
id int,
name string
)
row format delimited fields terminated by ',';
create table if not exists u2(
id int,
name string
)
row format delimited fields terminated by ',';
load data local inpath '/home/hadoop/data/u1.txt' into table u1;
load data local inpath '/home/hadoop/data/u2.txt' into table u2;
- 查询
-- 内连接
hive (mydb)> select * from u1 join u2 on u1.id = u2.id;
OK
u1.id u1.name u2.id u2.name
5 e 5 e
6 f 6 f
Time taken: 12.058 seconds, Fetched: 2 row(s)
-- 左外连接
hive (mydb)> select * from u1 left join u2 on u1.id = u2.id;
OK
u1.id u1.name u2.id u2.name
1 a NULL NULL
2 b NULL NULL
3 c4 NULL NULL
5 e 5 e
6 f 6 f
Time taken: 11.871 seconds, Fetched: 5 row(s)
-- 右外连接
hive (mydb)> select * from u1 right join u2 on u1.id = u2.id;
OK
u1.id u1.name u2.id u2.name
NULL NULL 4 d
5 e 5 e
6 f 6 f
NULL NULL 7 g
NULL NULL 8 h
NULL NULL 9 i
Time taken: 11.587 seconds, Fetched: 6 row(s)
-- 全外连接
hive (mydb)> select * from u1 full join u2 on u1.id = u2.id;
OK
u1.id u1.name u2.id u2.name
1 a NULL NULL
2 b NULL NULL
3 c4 NULL NULL
NULL NULL 4 d
5 e 5 e
6 f 6 f
NULL NULL 7 g
NULL NULL 8 h
NULL NULL 9 i
Time taken: 1.401 seconds, Fetched: 9 row(s)
- 多表连接
连接 n 张表, 至少需要 n-1 个连接条件。
例如: 连接四张表, 至少需要三个连接条件。
多表连接查询, 查询老师对应的课程, 以及对应的分数, 对应的学生:
select *
from techer t
left join course c on t.t_id = c.t_id
left join score s on s.c_id = c.c_id
left join student stu on s.s_id = stu.s_id;
Hive 总是按照从左到右的顺序执行, Hive 会对每对 JOIN 连接对象启动一个 MapReduce 任务。
上面的例子中:
-
首先启动一个
MapReduce job对表 t 和表 c 进行连接操作; -
再启动一个
MapReduce job将第一个MapReduce job的输出和表 s 进行连接操作; -
再继续直到全部操作;
- 笛卡尔积
满足以下条件将会产生笛卡尔集:
- 没有连接条件
- 连接条件无效
- 所有表中的所有行互相连接
如果表 A、B 分别有 M、N 条数据, 其笛卡尔积的结果将有 M*N 条数据;
缺省条件下 hive 不支持笛卡尔积运算;
hive (mydb)> select * from u1, u2;
FAILED: SemanticException Cartesian products are disabled for safety reasons. If you know what you are doing, please sethive.strict.checks.cartesian.product to false and that hive.mapred.mode is not set to 'strict' to proceed. Note that if you may get errors or incorrect results if you make a mistake while using some of the unsafe features.
hive (mydb)> set hive.strict.checks.cartesian.product=false;
hive (mydb)> select * from u1, u2;
五、排序子句
排序小结:
-
order by: 执行全局排序, 效率低。生产环境中慎用 -
sort by: 使数据局部有序(在reduce内部有序) -
distribute by: 按照指定的条件将数据分组, 常与sort by联用, 使数据局部有序。 -
cluster by: 当distribute by与sort by是同一个字段时, 可使用cluster by简化语法
(1)全局排序
-
order by子句出现在select语句的结尾; -
order by子句对最终的结果进行排序; -
默认使用升序(
ASC); -
可以使用
DESC, 跟在字段名之后表示降序; -
ORDER BY执行全局排序, 只有一个reduce;
-- 普通排序
hive (mydb)> select * from emp order by deptno;
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7934 MILLER CLERK 7782 2012-01-23 1300 NULL 10
7839 KING PRESIDENT NULL 2011-11-07 5000 NULL 10
7782 CLARK MANAGER 7839 2011-06-09 2450 NULL 10
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
7844 TURNER SALESMAN 7698 2011-09-08 1500 0 30
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7654 MARTIN SALESMAN 7698 2011-09-28 1250 1400 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
7900 JAMES CLERK 7698 2011-12-03 950 NULL 30
Time taken: 5.329 seconds, Fetched: 14 row(s)
-- 按别名排序
select empno, ename, job, mgr, sal + nvl(comm, 0) salcomm, deptno
from emp
order by salcomm desc;
OK
empno ename job mgr salcomm deptno
7839 KING PRESIDENT NULL 5000 10
7902 FORD ANALYST 7566 3000 20
7788 SCOTT ANALYST 7566 3000 20
7566 JONES MANAGER 7839 2975 20
7698 BLAKE MANAGER 7839 2850 30
7654 MARTIN SALESMAN 7698 2650 30
7782 CLARK MANAGER 7839 2450 10
7499 ALLEN SALESMAN 7698 1900 30
7521 WARD SALESMAN 7698 1750 30
7844 TURNER SALESMAN 7698 1500 30
7934 MILLER CLERK 7782 1300 10
7876 ADAMS CLERK 7788 1100 20
7900 JAMES CLERK 7698 950 30
7369 SMITH CLERK 7902 800 20
Time taken: 1.989 seconds, Fetched: 14 row(s)
-- 多列排序
select empno, ename, job, mgr, sal + nvl(comm, 0) salcomm, deptno
from emp
order by deptno, salcomm desc;
OK
empno ename job mgr salcomm deptno
7839 KING PRESIDENT NULL 5000 10
7782 CLARK MANAGER 7839 2450 10
7934 MILLER CLERK 7782 1300 10
7788 SCOTT ANALYST 7566 3000 20
7902 FORD ANALYST 7566 3000 20
7566 JONES MANAGER 7839 2975 20
7876 ADAMS CLERK 7788 1100 20
7369 SMITH CLERK 7902 800 20
7698 BLAKE MANAGER 7839 2850 30
7654 MARTIN SALESMAN 7698 2650 30
7499 ALLEN SALESMAN 7698 1900 30
7521 WARD SALESMAN 7698 1750 30
7844 TURNER SALESMAN 7698 1500 30
7900 JAMES CLERK 7698 950 30
Time taken: 1.56 seconds, Fetched: 14 row(s)
-- 排序字段要出现在select子句中。以下语句无法执行(因为select子句中缺少deptno):
hive (mydb)> select empno, ename, job, mgr, sal + nvl(comm, 0) salcomm
> from emp
> order by deptno, salcomm desc;
FAILED: SemanticException [Error 10004]: Line 3:9 Invalid table alias or column reference 'deptno': (possible column names are: empno, ename, job, mgr, salcomm)
(2)每个MR 内部排序 (sort by)
-
对于大规模数据而言
order by效率低; -
在很多业务场景, 我们并不需要全局有序的数据, 此时可以使用
sort by; -
sort by为每个reduce产生一个排序文件, 在reduce内部进行排序, 得到局部有序的结果;
-- 默认 hive,设置参数如下
hive (mydb)> set mapreduce.job.reduces;
mapreduce.job.reduces=-1
-- 设置reduce个数
set mapreduce.job.reduces=2;
-- 按照工资降序查看员工信息
-- 查看查询,可以看到有两组排序
hive (mydb)> select * from emp sort by sal desc;
Cannot run job locally: Number of reducers (= 2) is more than 1
OK
emp.empno emp.ename emp.job emp.mgr emp.hiredate emp.sal emp.comm emp.deptno
7902 FORD ANALYST 7566 2011-12-03 3000 NULL 20
7788 SCOTT ANALYST 7566 2017-07-13 3000 NULL 20
7566 JONES MANAGER 7839 2011-04-02 2975 NULL 20
7844 TURNER SALESMAN 7698 2011-09-08 1500 0 30
7521 WARD SALESMAN 7698 2011-02-22 1250 500 30
7654 MARTIN SALESMAN 7698 2011-09-28 1250 1400 30
7876 ADAMS CLERK 7788 2017-07-13 1100 NULL 20
7900 JAMES CLERK 7698 2011-12-03 950 NULL 30
7369 SMITH CLERK 7902 2010-12-17 800 NULL 20
7839 KING PRESIDENT NULL 2011-11-07 5000 NULL 10
7698 BLAKE MANAGER 7839 2011-05-01 2850 NULL 30
7782 CLARK MANAGER 7839 2011-06-09 2450 NULL 10
7499 ALLEN SALESMAN 7698 2011-02-20 1600 300 30
7934 MILLER CLERK 7782 2012-01-23 1300 NULL 10
Time taken: 26.569 seconds, Fetched: 14 row(s)
-- 将查询结果导入到文件中(按照工资降序)。生成两个输出文件,每个文件内部数据按工资降序排列
insert overwrite local directory '/home/hadoop/output/sortsal'
select * from emp sort by sal desc;
[root@linux123 sortsal]# pwd
/home/hadoop/output/sortsal
[root@linux123 sortsal]# ll
total 8
-rw-r--r--. 1 root root 411 Aug 18 01:10 000000_0
-rw-r--r--. 1 root root 230 Aug 18 01:10 000001_0
(3)分区排序(distribute by)
distribute by 将特定的行发送到特定的 reducer 中, 便于后继的聚合 与 排序操作;
distribute by 类似于 MR 中的分区操作, 可以结合 sort by 操作, 使分区数据有序;
distribute by 要写在 sort by 之前;
-- 启动 2 个 reducer task; 先按 deptno 分区, 在分区内按 sal+comm 排序
hive (mydb)> set mapreduce.job.reduces = 2;
-- 将结果输出到文件,观察输出结果
insert overwrite local directory '/home/hadoop/output/distBy'
select empno, ename, job, deptno, sal + nvl(comm, 0) salcomm
from emp
distribute by deptno
sort by salcomm desc;
-- 查看结果
[root@linux123 distBy]# pwd
/home/hadoop/output/distBy
[root@linux123 distBy]# ll
total 4
-rw-r--r--. 1 root root 374 Aug 18 17:30 000000_0
-rw-r--r--. 1 root root 0 Aug 18 17:30 000001_0
-- 上例中, 数据被分到了统一区, 看不出分区的结果
-- 将数据分到3个区中,每个分区都有数据
set mapreduce.job.reduces=3;
insert overwrite local directory '/home/hadoop/output/distBy1'
select empno, ename, job, deptno, sal + nvl(comm, 0) salcomm
from emp
distribute by deptno
sort by salcomm desc;
-- 查看结果
[root@linux123 distBy1]# ll
total 12
-rw-r--r--. 1 root root 164 Aug 18 17:33 000000_0
-rw-r--r--. 1 root root 81 Aug 18 17:33 000001_0
-rw-r--r--. 1 root root 129 Aug 18 17:33 000002_0
[root@linux123 distBy1]# cat -A 000000_0
7698^ABLAKE^AMANAGER^A30^A2850$
7654^AMARTIN^ASALESMAN^A30^A2650$
7499^AALLEN^ASALESMAN^A30^A1900$
7521^AWARD^ASALESMAN^A30^A1750$
7844^ATURNER^ASALESMAN^A30^A1500$
7900^AJAMES^ACLERK^A30^A950$
(4)cluster by
当 distribute by 与 sort by 是同一个字段时, 可使用 cluster by 简化语法;
cluster by 只能是剩下, 不能指定排序规则;
-- 语法上是等价的,升序是等价的,降序又不一样了。
select * from emp distribute by deptno sort by deptno;
select * from emp cluster by deptno;