Hive does not have its own dedicated data storage format. By default it can load plain text files (TextFile) directly, and it also supports SequenceFile, RCFile, and others. cwiki.apache.org/confluence/…
- TextFile
- SequenceFile
- RCFile (since Hive 0.6)
- Avro (since Hive 0.9)
- ORC (since Hive 0.11)
- PARQUET (since Hive 0.13)
Create the source table
create external table stu_original(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
location '/stu_original';
Java code that generates the raw data
package com.strivelearn.hadoop.hdfs.compress;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Generates the raw test data file (roughly 2 GB of comma-separated lines).
 *
 * @author xys
 * @version GenerateData2.java, 2022-10-23
 */
public class GenerateData2 {
    public static void main(String[] args) throws IOException {
        String fileName = "/Users/strivelearn/Desktop/stu.data";
        System.out.println("start: generating ~2 GB file -> " + fileName);
        BufferedWriter bfw = new BufferedWriter(new FileWriter(fileName));
        int num = 0;
        // 50 million rows
        while (num < 50000000) {
            // Generated content:
            // 0,zhangsan0,suzhou0
            // 1,zhangsan1,suzhou1
            // ....
            bfw.write(num + ",zhangsan" + num + ",suzhou" + num);
            bfw.newLine();
            num++;
            if (num % 10000 == 0) {
                bfw.flush();
            }
        }
        // close the writer so any remaining buffered lines are written out
        bfw.close();
        System.out.println("end: ~2 GB file generated");
    }
}
Upload the raw data
hdfs dfs -put /root/data/stu.data /stu_original
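As an optional sanity check, the external table should now see every generated row:
select count(*) from stu_original;
-- expected: 50000000, the number of rows written by GenerateData2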
1. TextFile storage format
Hive's default storage format; data is stored row by row. Because it is the default, the stored as clause can be omitted (see the note after this list).
- Relatively high disk storage overhead and data parsing overhead.
- Supports compression, but with most codecs (e.g. Deflate or Gzip) a compressed TextFile cannot be split in Hive; Bzip2 is the splittable exception, which is what the two experiments below compare.
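Because TextFile is Hive's default, the stu_original table above did not declare a storage format; it is equivalent to adding the clause below before the location clause, in the same position where the later sections use stored as sequencefile, stored as rcfile, and so on:
stored as textfile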
1.1 Create a Hive table stored as TextFile with Deflate compression
create external table stu_textfile_deflate_compress(
id int,
name string,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
location '/stu_textfile_deflate_compress';
Before loading the data, set these session-level Hive configurations to enable compressed output with the Deflate codec:
set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.DeflateCodec;
Load the data
set mapreduce.job.reduces=1;
insert into stu_textfile_deflate_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split
select id,count(*) from stu_textfile_deflate_compress group by id;
1.2 Create a Hive table stored as TextFile with Bzip2 compression
create external table stu_textfile_bzip2_compress(
id int,
name string,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
location '/stu_textfile_bzip2_compress';
Before loading the data, set these session-level Hive configurations to enable compressed output with the Bzip2 codec:
set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.BZip2Codec;
Load the data
set mapreduce.job.reduces=1;
insert into stu_textfile_bzip2_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split
select id,count(*) from stu_textfile_bzip2_compress group by id;
Hive's default split size (running set with just the property name prints its current value):
set mapred.max.split.size;
The default is 256 MB.
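To see the effect of splitting, you can shrink the maximum split size for the session and re-run a query on a splittable file (for example the uncompressed stu_original table); the number of map tasks should go up. A minimal sketch, where 134217728 (128 MB) is just an example value and mapreduce.input.fileinputformat.split.maxsize is the newer name for the same setting:
set mapred.max.split.size=134217728;
select id,count(*) from stu_original group by id;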
2. SequenceFile storage format (rarely used in real projects)
- A binary file format; internally data is stored as <key,value> pairs, row-oriented.
- Easy to use, splittable, and compressible.
- Supports NONE, RECORD, and BLOCK level compression (see the sketch below).
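The compression level is selected with io.seqfile.compression.type (used in section 2.2 below); a brief sketch of the three values:
set io.seqfile.compression.type=NONE;   -- records are written uncompressed
set io.seqfile.compression.type=RECORD; -- each value is compressed individually
set io.seqfile.compression.type=BLOCK;  -- keys and values are buffered into blocks and compressed together, usually the best ratio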
2.1 Create a Hive table stored as SequenceFile with no compression
create external table stu_seqfile_none_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as sequencefile
location '/stu_seqfile_none_compress';
Confirm the storage format of the created table
show create table stu_seqfile_none_compress;
Load the data
set mapreduce.job.reduces=1;
insert into stu_seqfile_none_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split
select id,count(*) from stu_seqfile_none_compress group by id;
2.2 Create a Hive table stored as SequenceFile with Deflate compression
create external table stu_seqfile_deflate_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as sequencefile
location '/stu_seqfile_deflate_compress';
Before loading the data, set these session-level Hive configurations:
set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.DeflateCodec;
set io.seqfile.compression.type=BLOCK;
Load the data
set mapreduce.job.reduces=1;
insert into stu_seqfile_deflate_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split
select id,count(*) from stu_seqfile_deflate_compress group by id;
3. RCFile storage format
- Data is first partitioned into row groups, and within each group it is stored by column. This combines the advantages of row storage and column storage (a hybrid row-columnar layout).
- Fast compression, splittable, and supports fast columnar access.
3.1 Create a Hive table stored as RCFile with no compression
create external table stu_rcfile_none_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as rcfile
location '/stu_rcfile_none_compress';
Load the data
set mapreduce.job.reduces=1;
insert into stu_rcfile_none_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split
select id,count(*) from stu_rcfile_none_compress group by id;
3.2 Create a Hive table stored as RCFile with Deflate compression
create external table stu_rcfile_deflate_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as rcfile
location '/stu_rcfile_deflate_compress';
Before loading the data, set these session-level Hive configurations:
set hive.exec.compress.output=true;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.DeflateCodec;
Load the data
set mapreduce.job.reduces=1;
insert into stu_rcfile_deflate_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split
select id,count(*) from stu_rcfile_deflate_compress group by id;
4. ORC storage format
The ORC format is an improved version of RCFile with significantly better performance.
When creating an ORC table, the compression codec can be specified directly in the CREATE TABLE statement:
create table t1(
id int,
name string
)stored as orc tblproperties("orc.compress"="NONE");
The available tblproperties keys are listed below; multiple properties are separated by commas.
| Key | Default | Notes |
|---|---|---|
| orc.compress | ZLIB | high level compression (one of NONE, ZLIB, SNAPPY) |
| orc.compress.size | 262,144 | number of bytes in each compression chunk |
| orc.stripe.size | 67,108,864 | number of bytes in each stripe |
| orc.row.index.stride | 10,000 | number of rows between index entries (must be >= 1000) |
| orc.create.index | true | whether to create row indexes |
| orc.bloom.filter.columns | "" | comma separated list of column names for which bloom filter should be created |
| orc.bloom.filter.fpp | 0.05 | false positive probability for bloom filter (must >0.0 and <1.0) |
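For example, several properties can be combined in a single tblproperties clause; an illustrative sketch (the table name t2 and the chosen values are only examples):
create table t2(
id int,
name string
)stored as orc tblproperties("orc.compress"="SNAPPY","orc.create.index"="true");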
4.1 Create a Hive table stored as ORC with no compression
create external table stu_orc_none_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as orc
location '/stu_orc_none_compress'
tblproperties("orc.compress"="NONE");
Load the data
set mapreduce.job.reduces=1;
insert into stu_orc_none_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split
select id,count(*) from stu_orc_none_compress group by id;
4.2 Create a Hive table stored as ORC with ZLIB compression
create external table stu_orc_zlib_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as orc
location '/stu_orc_zlib_compress'
tblproperties("orc.compress"="ZLIB");
Load the data
set mapreduce.job.reduces=1;
insert into stu_orc_zlib_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split
select id,count(*) from stu_orc_zlib_compress group by id;
4.3 Create a Hive table stored as ORC with Snappy compression
create external table stu_orc_snappy_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as orc
location '/stu_orc_snappy_compress'
tblproperties("orc.compress"="SNAPPY");
Load the data
set mapreduce.job.reduces=1;
insert into stu_orc_snappy_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split
select id,count(*) from stu_orc_snappy_compress group by id;
Compression codecs supported by ORC
public enum CompressionKind {
NONE, ZLIB, SNAPPY, LZO, LZ4
}
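The codec of an existing ORC table can also be changed afterwards with alter table; a sketch using the table from section 4.2 (this only affects files written after the change, existing files keep their original codec):
alter table stu_orc_zlib_compress set tblproperties("orc.compress"="SNAPPY");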
5. Parquet storage format
Parquet is a newer, language-independent columnar storage format that is not tied to any single data processing framework, so it works with many languages and components. Parquet can be used from compute engines such as Hive, Impala, and Spark.
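Besides creating an empty Parquet table and inserting into it as in the sections below, an existing table can be converted in one step with a CTAS statement; a minimal sketch (the table name stu_parquet_ctas is hypothetical):
create table stu_parquet_ctas stored as parquet
tblproperties("parquet.compression"="snappy")
as select id,name,city from stu_original;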
5.1 Create a Hive table stored as Parquet with no compression
create external table stu_parquet_none_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as parquet
location '/stu_parquet_none_compress'
tblproperties("parquet.compression"="uncompressed");
Load the data
set mapreduce.job.reduces=1;
insert into stu_parquet_none_compress select id,name,city from stu_original group by id,name,city;
Verify whether the file can be split (it cannot be split)
select id,count(*) from stu_parquet_none_compress group by id;
5.2 Create a Hive table stored as Parquet with Gzip compression
create external table stu_parquet_gzip_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as parquet
location '/stu_parquet_gzip_compress'
tblproperties("parquet.compression"="gzip");
5.3 Create a Hive table stored as Parquet with Snappy compression
create external table stu_parquet_snappy_compress(
id int,
name String,
city string
)row format delimited
fields terminated by ','
lines terminated by '\n'
stored as parquet
location '/stu_parquet_snappy_compress'
tblproperties("parquet.compression"="gzip");
6. Summary
Raw data file: 2 GB
| Compression | TextFile | SequenceFile | RCFile | ORC | PARQUET |
|---|---|---|---|---|---|
| Uncompressed | 2G | 2.7G | 1.6G | 1.38G | 2G |
| Deflate | 0.37G | 2.39G | 0.37G | 0.26G | 0.34G |
ORC comes out well: its own storage efficiency is very good and it is splittable, so when choosing a compression codec the main thing left to weigh is compression and decompression speed. Among the common codecs, LZO and Snappy (the better of the two) compress and decompress the fastest.
So the general recommendation is: use the ORC storage format with Snappy compression.
Sometimes, however, you also need to consider how widely a storage format is supported, that is, whether it works across multiple compute engines.
Parquet, in contrast, can be used by practically any project in the Hadoop ecosystem.