DDL(Data Definition Language)
数据库
创建
create database [if not exists] database_name
[comment database_comment]
[location hdfs_path]
[with dbproperties (property_name=value, ...)];
查询
show databases [like 'identifier_with_wildcards'];
修改
alter database database_name set dbproperties (property_name=property_value, ...);
alter database database_name set location hdfs_path;
alter database database_name set owner user user_name;
删除数据库
drop database [if exists] database_name [restrict|cascade];
切换数据库
use database_name;
表
创建表,普通
create [temporary] [external] table [if not exists] [db_name.]table_name
[(col_name data_type [comment col_comment], ...)]
[comment table_name]
[partitioned by (col_name data_type [comment col_comment], ...)]
[clustered by (col_name, col_name, ...)]
[sorted by (col_name [ASC|DESC], ...) into num_buckets buckets]
[row format rowformat]
[stored as file_format]
[location hdfs_path]
[tableproperties (property_name=property_value, ...)]
1. temporary
2. external
3. partitioned by
4. clustered by ... sorted by ... into ... buckets
5. row format
ROW FORMAT DELIMITED
[FIELDS TERMINATED BY char]
[COLLECTION ITEMS TERMINATED BY char]
[MAP KEYS TERMINATED BY char]
[LINES TERMINATED BY char]
[NULL DEFINED AS char]
ROW FORMAT SERDE serde_name [WITH
SERDEPROPERTIES(property_name=property_value,property_name=property_value, ...)]
6. stored as
7. location
8. tblproperties
Hive | 说明 | 定义 |
---|---|---|
tinyint | 1byte有符号整数 | |
smallint | 2byte有符号整数 | |
int | 4byte有符号整数 | |
bigint | 8byte有符号整数 | |
boolean | 布尔类型,true或者false | |
float | 单精度浮点数 | |
double | 双精度浮点数 | |
decimal | 十进制精准数字类型 | decimal(16,2) |
varchar | 字符序列,需指定最大长度,最大长度的范围是[1,65535] | varchar(32) |
string | 字符串,无需指定最大长度 | |
timestamp | 时间类型 | |
binary | 二进制数据 | |
类型 | 说明 | 定义 | 取值 |
---|---|---|---|
array | 数组是一组相同类型的值的集合 | array< string > | arr[0] |
map | map是一组相同类型的键-值对集合 | map< string, int > | map['key'] |
struct | 结构体由多个属性组成,每个属性都有自己的属性名和数据类型 | struct<id:int, name:string> | struct.id |
创建表,CTAS建表
- 该语法允许用户利用select查询语句返回的结果,直接建表,表的结构和查询语句的结构保持一致,且保证包含select查询语句返回的内容。(不能创建外部表)
CREATE TEMPORARY TABLE IF NOT EXISTS table_name
[COMMENT table_comment]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)]
[AS select_statement]
create table like创建表
- 该语法允许用户复刻一张已经存在的表结构,与上述的CTAS语法不同,该语法创建出来的表中不包含数据。
CREATE [TEMPORARY] [EXTERNAL] TABLE [IF NOT EXISTS] [db_name.]table_name
[LIKE exist_table_name]
[ROW FORMAT row_format]
[STORED AS file_format]
[LOCATION hdfs_path]
[TBLPROPERTIES (property_name=property_value, ...)]
查看表
语法
show tables [in database_name] [like '*'];
describe [extended | formatted] [db_name.]table_name
desc formatted stu;
修改表
alter table table_name rename to new_table_name
alter table table_name add columns (col_name data_type [comment col_comment], ...)
alter table table_name change [column] col_old_name col_new_name column_type [comment
col_comment] [first|after column_name]
set hive.metastore.disallow.incompatible.col.type.change=false;
alter table table_name replace columns (col_name data_type [comment col_comment], ...)
删除表(对外部表无效)
drop table [if exists] table_name;
清空表(对外部表无效)
truncate [table] table_name;
DML(Data Manipulation language)数据操作
Load(Load语句将文件导入Hive表中)
load data [local] inpath 'filepath' [overwrite] into table tablename [partition (partcol1=val1,
partcol2=val2, ...)]
local: 表示将文件从本地加载到Hive表中,否则从HDFS加载数据到HIVE表。此处的本地,若使用的是HiveClient
模式,则本地就是指HiveCli本地,若使用的是jdbc连接,则本地指的是hiveserver2的服务器。
overwrite: 表示覆盖表中已有数据,否则表示追加
partition: 表示上传文件到指定分区,若目标是分区表,需指定分区。
Insert
将查询结果写到表中
insert (into | overwrite) table tablename [partition (partcol1=val1, partcol2=val2, ...)]
select_statement;
给定values插入表中
insert into table student1 values(1, 'wangwu'), (2, 'zhaoliu');
将查询结果写入目标路径
insert overwrite [local] directory directory [row format row_format] [stored as file_format]
select_statement
insert overwrite local directory '/opt/module/datas/student' ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.JsonSerDe'
select id,name from student;
Export&Import
- Export导出语句可将表的数据和元数据信息一并导出到指定的HDFS路径,Import可将Export导出的内容导入Hive,表的数据和元数据信息都会恢复。Export和Import可用于两个Hive实例之间的数据迁移。
export table tablename to 'export_target_path'
import [external] table new_or_original_tablename from 'source_path' [location 'import_target_path']
查询
语法
select [all | distinct] select_expr, select_expr, ...
from table_reference
[where where_condition]
[group by col_list]
[having col_list]
[order by col_list]
[cluster by col_list | [distribute by col_list] [sort by col_list]]
[limit number]
join
select
loc,
count(*) cnt
from emp join dept
on emp.deptno = dept.deptno
group by loc;
内连接
- inner join 默认连接方式, 只对两张表中共有的数据进行连接
select
loc,
count(*) cnt
from emp inner join dept
on emp.deptno = dept.deptno
group by loc;
左外连接
- left join, 返回的数据包含左表的全部行,以及右表中能与左表关联的数据, 左表中与右表不关联的数据补null值。
select
loc,
count(*) cnt
from emp e
left join dept d
on e.deptno = d.deptno
group by loc;
右外连接
- right join, 返回的数据包含右表的全部行,以及左表中能与右表关联的数据,右表中不与左表中关联的数据补null值。
select
loc,
count(*) cnt
from emp e
right join dept d
on e.deptno = d.deptno
group by loc;
满外连接
- full join, 返回左右表中的全部行,其中不关联的数据补null值
select
loc,
count(*) cnt
from emp e
full join dept d
on e.deptno = d.deptno
group by loc;
笛卡尔积
- 产生条件1:省略连接条件
- 产生条件2:连接条件无效
- 产生条件3:所有表中的所有行互相连接
select
empno,
dname
from emp, dept;
联合(union & union all)
- union 和union all都是上下拼接sql的结果,这点和join有区别,join做的是左右连接,union和union all是上下拼接。union去重,union all不去重。
- union和union all在上下拼接sql结果时有两个要求。
- (1) 两个sql的结果,列的个数必须相同。
- (2) 两个sql的结果,上下所对应列的类型必须一致。
- 此外,union 只能连接两个select 语句,不能直接用于表连接,连接后的字段名以第一个select中的定义为基准。
select
empno empid, ename, job, sal, deptno
from emp
where deptno = 40
union
select
empno, ename, job, sal, deptno
from emp
where deptno = 40
select
*
from emp
where deptno in (20, 40);
排序
全局排序(Order by)
- Order by 全局排序,只有一个reduce
- asc (ascend) 升序,默认
- desc (descend) 降序
- order by 会在map端执行,也会在reduce端执行,由于order by是全局排序,因此只能有一个reduce任务,当数据量大时,会造成reduce端数据倾斜,因此,常与limit联合使用
select
*
from emp
order by sal desc, deptno
limit 10;
每个Reduce内部排序(Sort by) Hive特有
- Sort By:对于大规模的数据集order by的效率非常低。在很多情况下,并不需要全局排序,此时可以使用Sort by。
- Sort by为每个reduce产生一个排序文件。每个Reduce内部进行排序,对全局结果集来说不是排序。
set mapreduce.job.reduces = 3;
insert overwrite local directory '/usr/local/datas/empdata'
select
*
from emp sort by deptno;
分区(Distribute By) Hive特有,用于transform
- Distribute By:在有些情况下,我们需要控制某个特定行应该到哪个Reducer,通常是为了进行后续的聚集操作。distribute by子句可以做这件事。distribute by类似MapReduce中partition(自定义分区),进行分区,结合sort by使用。
- 对于distribute by进行测试,一定要分配多reduce进行处理,否则无法看到distribute by的效果。
set mapreduce.job.reduces = 3;
insert overwrite local directory '/usr/local/datas/empdata_distribute'
select
*
from emp
distribute by job
sort by sal desc;
分区排序(Cluster By) Hive特有, 用于transform
- 根据某一个字段进行分区以及区内排序,等价于针对同一字段使用 sort by 以及distribute by
代码练习
多表联合
select
t4.id id,
coalesce(t1.score, 0) score1,
coalesce(t2.score, 0) score2,
coalesce(t3.score, 0) score3,
t4.cnt cnt,
t4.avgscore avgscore
from
(
select
stu_id id,
score score
from score_info where course_id=(
select course_info.course_id from course_info where course_name='语文'
)
) t1
full join (
select
stu_id id,
score score
from score_info where course_id=(
select course_info.course_id from course_info where course_name='数学'
)
) t2
on t1.id = t2.id
full join (
select
stu_id id,
score score
from score_info where course_id=(
select course_info.course_id from course_info where course_name='英语'
)
) t3
on nvl(t1.id, t2.id) = t3.id
join (
select
stu_id id,
count(*) cnt,
avg(score) avgscore
from score_info
group by stu_id
) t4
on coalesce(t1.id, t2.id, t3.id) = t4.id
order by t4.avgscore desc;
函数
show functions;
desc function upper;
desc function extended upper;
单行函数
- 单行函数的特点是一进一出,即输入一行,输出一行。
- 单行函数按照功能可分为如下几类:日期函数、字符串函数、集合函数、数学函数、流程控制函数等。
数值函数
select round(3.3, 2)
select round(-1.5)
select ceil(0.5)
select floor(0.5)
字符串函数
select substring(str, pos[, len])
replace(string A, string B, string C)
select replace('atguigu', 'a', 'A')
regexp_replace(string A, string B, string C)
select 'dfsaaaaa' regexp 'dfsa+'
select repeat('123', 3)
select split('a-b-c-d', '-')
select nvl(null, 1);
select concat('beijing', '-', 'shanghai', '-', 'shenzhen')
- concat_ws:以指定分割符拼接字符串或者字符串数组
select concat_ws('-', 'beijing', 'shanghai', 'shenzhen');
- get_json_object: 解析json字符串
select get_json_object('[{"name":"大海海","sex":"男","age":"25"},{"name":"小宋
宋","sex":"男","age":"47"}]','$.[0].name');
select get_json_object('[{"name":"大海海","sex":"男","age":"25"},{"name":"小宋
宋","sex":"男","age":"47"}]','$.[0]');
日期函数
- unix_timestamp: 返回当前或指定时间的时间戳(UTC,格林威治时间)
select unix_timestamp('2022/08/08 08-08-08', 'yyyy/MM/dd HH-mm-ss');
- from_unixtime: 转换UNIX时间戳(从1970-01-01 00:00:00 UTC到指定时间)到当前时区的时间格式
select from_unixtime(1659946088);
select current_date;
- current_timestamp:当前的日期加时间,并精确到毫秒
select current_timestamp;
select month('2022-08-08 08:08:08')
select months_between(current_date(), '2021-09-09')
select day('2022-08-08 08:08:08')
select hour('2022-08-08 08:08:08')
- datediff: 两个日期相差的天数(结束日期减去开始日期的天数)
select datediff('2021-08-08','2022-10-09')
select date_add('2022-08-08',2);
select date_sub('2022-08-08',2);
- date_format: 将标准日期解析成指定格式字符串
select date_format('2022-08-08','yyyy年-MM月-dd日');
流程控制函数
select
stu_id,
course_id,
case
when score >= 90 then 'A'
when score >= 80 then 'B'
when score >= 70 then 'C'
when score >= 60 then 'D'
else '不及格'
end
from score_info;
select
stu_id,
course_id,
case score
when 90 then 'A'
when 80 then 'B'
when 70 then 'C'
when 60 then 'D'
else '不及格'
end
from score_info
select if(10 > 5,'正确','错误');
集合函数(处理复杂数据类型)
select size(friends) from test;
select map('xiaohaihai', 1, 'dahaihai', 2);
select map_keys(map('xiaohai', 1, 'dahai', 2));
select map_values(map('xiaohai', 1, 'dahai', 2));
select array('1','2','3','4');
- array_contains: 判断array中是否包含某个元素
select array_contains(array('a', 'b', 'c', 'd'), 'a');
select sort_array(array('a', 'd', 'c'));
select struct('name', 'age', 'weight');
{"col1":"name","col2":"age","col3":"weight"}
- named_struct声明struct的属性和值
select named_struct('name', 'xiaosong', 'age', 18, 'weight', 80);
{"name":"xiaosong","age":18,"weight":80}
高级聚合函数
- collect_list 收集指定列元素并形成list集合,结果不去重
- collect_set 收集指定列元素并形成set集合,结果去重
select
collect_list(job),
collect_set(job)
from employee;
炸裂函数(UDTF) User-Define-Table-Generating Functions
- 输入一个数组,可将数据转换为多行元素返回(as item 是指定列名)
- 语法:select explode(array("a", "b", "c")) as item;
- 输入一个map,返回多行两列,一列是key,一列是val (as (key, value) 用于指定列名,默认 key value)
- 语法:select explode(map("a", 1, "b", 2, "c", 3)) as (key, value);
- 输入一个数组,返回多行,两列,一列是索引,一列是数组值
- 语法:select posexplode(array("a", "b", "c")) as (pos, item);
- inline(array< struct<f1:T1, ..., fn:Tn>> a)
- 输入一个结构体数组,返回多行多列,每行是一个结构体,每列是结构体的一个元素值
- 语法:select inline(array(
named_struct("id", 1, "name", "zs"),
named_struct("id", 2, "name", "ls"),
named_struct("id", 3, "name", "ww")
)) as (id, name)
- 定义:Lateral View 通常与UDTF配合使用。Lateral View可以将UDTF应用到源表中的每行数据,将每行数据转换为一行或多行,并将源表中每行的输出结果与该行关联起来,形成一个虚表。
select
id,
name,
hobbies,
hobby
from person lateral view explode(hobbies) tmp as hobby;
窗口函数
概述
- 窗口函数,能为每行数据划分一个窗口,然后对窗口范围内的数据进行计算,最后将计算结果返回给该行数据。
- 窗口函数的语法中主要包括"窗口"和"函数"两部分。其中"窗口"用于定义计算范围,"函数"用于定义计算逻辑
- 基本语法如下
select
order_id,
order_date,
amount,
函数(amount) over (窗口范围) total_amount
from order_info;
partition by [column] order by [column] range | rows between l and r
unbounded preceding
[num] preceding
current row
[num] following
[num] preceding
current row
[num] following
unbounded following
current row
[num] following
unbounded following
[num] following
unbounded following
- over()中的三部分内容 partition by 、order by 、 (rows | range) between ... and... 均可省略不写
range between unbounded preceding and current row
rows between unbounded preceding and unbounded following
常见窗口函数
聚合函数
跨行取值函数
select
order_id,
user_id,
order_date,
amount,
lag(order_date, 1, '1970-01-01') over (partition by user_id order by order_date) last_date,
lead(order_date, 1, '9999-12-31') over (partition by user_id order by order_date) next_date
from order_info;
select
order_id,
user_id,
order_date,
amount,
first_value(order_date, false) over (partition by user_id order by order_date) first_date,
last_value(order_date, false) over (partition by user_id order by order_date) last_date
from order_info;
排名函数
select
stu_id,
course,
score,
rank() over(partition by course order by score desc) rk,
dense_rank() over(partition by course order by score desc) dense_rk,
row_number() over(partition by course order by score desc) rn
from score_info;
几种去重方法(面试题)
select
distinct
id,
name
from table;
select
id, name
from table
group by id, name;
select
id, name
from (
select
id, name, row_number() over (partition by id, name) rn
from table
) t1
where t1.rn = 1;