Advanced Hive Functions
- explode (UDTF)
- NBA championship teams
create table the_nba_championship(
    team_name string,
    champion_year array<string>
) row format delimited
fields terminated by ','
collection items terminated by '|';
-- use explode to split champion_year apart
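As a sketch (assuming the table has been loaded, e.g. with rows like Chicago Bulls,1991|1992|1993), explode turns each array element into its own row; note that a bare explode cannot be selected alongside regular columns in the same query:

select explode(champion_year) as year from the_nba_championship;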
- lateral view
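lateral view pairs each original row with the rows its UDTF generates, which is how team_name and the exploded years can be selected together. A minimal sketch against the table above:

select a.team_name, b.year
from the_nba_championship a
lateral view explode(champion_year) b as year;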
- Row/column transformations (concat)
Comparison laid out in columns
select count(case when gender='男' then id end) as `男`,
       count(case when gender='女' then id end) as `女`
from students;

男   女
30   40

Comparison laid out in rows
select gender, count(id) as count from student group by gender;

gender  count
男      30
女      40

- Row-to-column
create table row2col2(
    col1 string,
    col2 string,
    col3 int
) row format delimited fields terminated by '\t';
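A common way to do the row-to-column transform, assuming the col3 values that share the same (col1, col2) should be collapsed into one comma-separated string (a sketch, not the only solution):

-- collect_list gathers the group's values; concat_ws joins them into one string
select col1, col2,
       concat_ws(',', collect_list(cast(col3 as string))) as col3
from row2col2
group by col1, col2;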
- Column-to-row

create table col2row2(
    col1 string,
    col2 string,
    col3 string
) row format delimited fields terminated by '\t';
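Going the other way, assuming col3 holds comma-separated values, split turns the string into an array and explode fans it back out into rows (a sketch):

select col1, col2, lv.col3
from col2row2
lateral view explode(split(col3, ',')) lv as col3;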
JSON data processing
- Single field
create table tb_json_test1 (
    json string
);
-- pull one field out of the JSON string at a time
select get_json_object(json, '$.deviceType') as deviceType from tb_json_test1;
- Multiple fields
-- json_tuple extracts several fields in a single call
select json_tuple(json, 'device', 'deviceType', 'signal', 'time') from tb_json_test1;
- SerDe
create table tb_json_test2 (
    device string,
    deviceType string,
    signal double,
    `time` string
) ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE;
-- Extension: XML data processing
desc function extended xpath;
SELECT xpath('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>', 'a/b/text()');
Window functions
Aggregate, analytic, and ranking functions can all be combined with over(partition by col1... order by col1... rows <frame>)
- Create the tables
-- create the tables and load the data
create table website_pv_info(
cookieid string,
createtime string, --day
pv int
) row format delimited
fields terminated by ',';
create table website_url_info (
cookieid string,
createtime string, -- visit time
url string -- visited page
) row format delimited
fields terminated by ',';
- Total pv per user: sum + group by, an ordinary aggregation
select cookieid,sum(pv) from website_pv_info group by cookieid;
- Total pv for the whole site: all visits by all users added together
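The query for this one is a plain full-table aggregate; with an empty over() the same grand total can also be attached to every row (a sketch):

select sum(pv) from website_pv_info;
select cookieid, pv, sum(pv) over () as total_pv from website_pv_info;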
- Total pv per user, window version
select cookieid,sum(pv) over(partition by cookieid) from website_pv_info;
- Cumulative pv per user up to each day
-- "rows between unbounded preceding and current row" is equivalent to "rows unbounded preceding"
-- with order by, the window has a default frame: unbounded preceding up to the current row
select cookieid,sum(pv) over(partition by cookieid order by createtime rows between unbounded preceding and current row ) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime rows unbounded preceding) from website_pv_info;
select cookieid,sum(pv) over(partition by cookieid order by createtime ) from website_pv_info;
- Custom window frames
-- frame boundaries: unbounded preceding, N preceding, current row, N following, unbounded following
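A sketch of a custom frame, here a sliding window covering the current row and the three rows before it:

select cookieid, createtime, pv,
       sum(pv) over (partition by cookieid order by createtime
                     rows between 3 preceding and current row) as pv_last4
from website_pv_info;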
- Find each user's top-3 days by pv (ties not considered)
-- ranking: rank() leaves gaps after ties, dense_rank() produces consecutive ranks, row_number() numbers the rows
select *,rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,dense_rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
select *,row_number() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- ntile(n) divides the ordered partition into n roughly equal buckets
select *,ntile(3) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
with tb1 as ( select *,dense_rank() over (partition by cookieid order by pv desc ) as rank_num from website_pv_info )
from tb1
select *
where rank_num <= 3;
- For each user, the days in the top third by pv
with tb1 as (select *,ntile(3) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info)
from tb1
select *
where rank_num = 1;
- Analytic functions
-- take a value from a preceding row
select *,lag(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- take a value from a following row
select *,lead(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- take the first value in the window
select *,first_value(pv) over (partition by cookieid order by pv desc ) as rank_num from website_pv_info;
-- take the last value in the window; mind the frame: with order by the default frame stops at the current row, so it must be extended to unbounded following
select *,last_value(pv) over (partition by cookieid order by pv desc rows between unbounded preceding and unbounded following) as rank_num from website_pv_info;
-- a window function may nest over an aggregate: per-user totals alongside the grand total
select cookieid, sum(pv), sum(sum(pv)) over () from website_pv_info group by cookieid;
Data compression
- Compression algorithms
- Map-side compression
1) Enable compression of Hive intermediate transfer data
set hive.exec.compress.intermediate=true;
2) Enable map output compression in MapReduce
set mapreduce.map.output.compress=true;
3) Set the codec for MapReduce map output compression
set mapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
- Reduce-side (final output) compression
1) Enable compression of Hive's final output
set hive.exec.compress.output=true;
2) Enable final output compression in MapReduce
set mapreduce.output.fileoutputformat.compress=true;
3) Set the codec for MapReduce final output compression
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
4) Set final output compression to block-level
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
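These are session-level settings, so they apply to whatever runs after them. A sketch of using them together (website_pv_info_snappy is a hypothetical target table):

set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.SnappyCodec;
set mapreduce.output.fileoutputformat.compress.type=BLOCK;
-- hypothetical target table; its files are written Snappy-compressed
insert overwrite table website_pv_info_snappy select * from website_pv_info;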
Data storage formats
- Row-oriented vs. column-oriented storage
- TEXTFILE
- ORC
- PARQUET
- Format comparison
TEXTFILE
create table log_text (
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
ORC
create table log_orc(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS ORC;
PARQUET
create table log_parquet(
track_time string,
url string,
session_id string,
referer string,
ip string,
end_user_id string,
city_id string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS PARQUET;
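To actually compare the formats, load raw text into log_text and fill the ORC and Parquet tables from it (columnar tables cannot be loaded from a plain text file directly), then inspect the file sizes on HDFS. The local path and warehouse locations below are assumptions:

-- '/path/to/log.data' is a hypothetical local file
load data local inpath '/path/to/log.data' into table log_text;
insert into table log_orc select * from log_text;
insert into table log_parquet select * from log_text;
-- warehouse paths depend on the installation
dfs -du -h /user/hive/warehouse/log_text;
dfs -du -h /user/hive/warehouse/log_orc;
dfs -du -h /user/hive/warehouse/log_parquet;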