Hive创建hive表hive版的wordcount分区表partition分桶表bucket采样sampling四个b

创建hive表

# 创建库
CREATE DATABASE elaiza; 

# 创建表(内部表) 删除内部表的时候，不仅删除了元数据，还删除了HDFS上存的数据
create table news_noseg (sentence string) 
# 对源数据的一行数据进行 , 分割映射成对应的字段
row format delimited fields terminated by '\n';  

# 查看表结构
desc article

# 查询表数据(前10条)
select * from news_noseg limit 10;

# 本地-导入数
load data local inpath '/usr/local/src/data/hive_about/allfiles.txt' into table news_noseg;

hive版的wordcount

--分割转数组
select split('But it is used with a suitable irony',' ');  => 返回的是数组

--行转列
select explode(split('But it is used with a suitable irony',' ')); => 把数组每个元素单独列成一行

--应用到文章
select explode(split(sentence,' ')) as word from article limit 10;

--group by
select word, count(*) as cnt
from 
(
select 
explode(split(sentence,' ')) as word 
from article limit 10
) as tmp 
group by word;

--正则测试
select regexp_extract('12elaiza89', '[a-zA-Z]+', 0);

--小写转换
select lower('Elaiza');

--完整版
select 
regexp_extract(lower(word), '[a-zA-Z]+', 0) as word, count(*) as cnt 
from (select explode(split(sentence,' ')) as word from article limit 10) as tmp 
group by word;

分区表 partition

# partition是辅助查询，缩小查询范围：date(日期，按天进行分区), 渠道(三端app, m页面, pc)
# 分析条件 - 口径
# 创建partition分区表
create table art_dt(sentence string) 
partitioned by (dt string) 
row format delimited fields terminated by '\n';

# 插入数据（从其他表里查询数据插入到分区表里）
insert into table art_dt partition(dt='20191109') 
select * from article limit 100;

# 覆盖数据，删除目录在新建
insert overwrite table art_dt partition(dt='20191110') 
select * from article limit 100;

# 查看表的分区信息
show partitions art_dt;

分桶表bucket

# 设置
set hive.enforce.bucketing = true;

# 创建（注：会有32个reduce）
create table bucket_user (id int) clustered by (id) into 32 buckets;

# 创建一个用户临时表并导入测试数据
create table tmp_user (id int)
row format delimited fields terminated by '\n';

# 插入数据
load data local inpath '/usr/local/src/data/hive_about/hive_data_2/bucket.txt' into table tmp_user;

# 将用户临时表的数据插入到bucket分桶表里去
insert overwrite table bucket_user 
select * from tmp_user;

采样sampling

res % y = (x - 1)
select * from bucket_user tablesample(bucket 1 out of 32 on id); => 32
select * from bucket_user tablesample(bucket 2 out of 32 on id); => 1
select * from bucket_user tablesample(bucket 1 out of 16 on id); => 32, 16
select * from bucket_user tablesample(bucket 2 out of 16 on id); => 1, 17
注：bucket x out of y
# y必须是table总bucket数的倍数或者因子

四个by

--order by      # 全局排序，不管设置几个reduce系统默认都会只会使用一个reduce来跑
--sort by       # 区内排序（和distribute结合用，reduce为多个）
--distribute by # 分区排序
--custer by     # 当distribute by 和 sort by 后面的字段是一样的时候用custer by 代替
set mapreduce.job.reduces = 2

# 插入数据
insert overwrite local directory "/usr/local/src/data/hive_about/sort_by"
select * from test sort by user_id;

# 先按照user_id 分桶，每个桶再根据product_count排序
insert overwrite local directory "/usr/local/src/data/hive_about/dis_by"
select * from test distribute by user_id sort by product_count;

组内排序

rank（）就是排序 相同的排序是一样的，但是下一个小的会跳着排序，比如 
等级  排序
23 1
23 1
22  3
dense_rank()相同的排序相同，下一个小的会紧挨着排序，比如
等级  排序
23  1
23 1
22  2 
这样总个数是相对减少的，适合求某些指标前几个等级的个数。
row_number()就很简单，顺序排序。比如
等级  排序
23  1
23  2
22  3 
这种排序 总个数是不变的，适合求某些值的前几名。

空字段赋值

nvl(null,0)

时间类

1.格式化: date_format('2019-11-11 23:32:22','yyyy-MM-dd HH:mm:ss')
2.date_add('2019-11-11',3)
3.date_format(regexp_replace('2019/1/23','/','-'),'yyyy-MM-dd');

三元表达式

1.case when 和 if(布尔表达式,为true的值,为false的值)

行转列

1.concat('hello','-','world')
2.concat_ws(分割符，str，str...)
3.collect_set(col) 将列去重汇总成数组
4.concat_ws('-',collect_set(col)) 结合使用

列转行侧写 lateral view

explode() 
# 将数组数据拆分成一行一行

# 源数据
movie       category
疑犯追踪	悬疑,爱情,动作,喜剧
别对我说慌	剧情,悬疑,动作
奥特曼	动作,动画

# 创建表和数据插入
create table movies (
movie string,
category array<string>
)
row  format delimited fields terminated by "\t"  # 行与行之间是由 tab 来分开的
collection items terminated by ",";  # 集合间的元素是有 , 分开的

# 插入数据
load data local inpath '/usr/local/src/data/hive_about/hive_data_2/movies.txt' into table movies;

# 查询数据
select
   movie,
   category_name
from
   movies lateral view explode(category) table_tmp as category_name;

# 转化成的数据个格式
movie	category_name
疑犯追踪	悬疑
疑犯追踪	爱情
疑犯追踪	动作
疑犯追踪	喜剧
别对我说慌	剧情
别对我说慌	悬疑
别对我说慌	动作
奥特曼	动作
奥特曼	动画

窗口函数

over()
# 指定分析函数工作的数据窗口大小，这个数据窗口大小可能会随着行的变化而变化
# 必须在聚合函数后面

# 表：business
select name , count(*) from business where substring(orderdate,1,7) = '2017-04' group by name;
jack 1
mark 4

select name , count(*) over() from business where substring(orderdate,1,7) = '2017-04' group by name;
jack 2
mark 2

# over()空的 是对整个数据集开窗
select name, orderdate, sum(cost) over(order by orderdate) from business;
jack	2017-01-01	10
tony	2017-01-02	25
tony	2017-01-04	54
jack	2017-01-05	100
tony	2017-01-07	150
jack	2017-01-08	205
jack	2017-02-03	228
jack	2017-04-06	270
mart	2017-04-08	332
mart	2017-04-09	400
mart	2017-04-11	475
mart	2017-04-13	569
neil	2017-05-10	581
neil	2017-06-12	661

# over(distribute by name)
select name, orderdate, cost, sum(cost) over(distribute by name) from business;
jack	2017-01-05	46	176
jack	2017-01-08	55	176
jack	2017-01-01	10	176
jack	2017-04-06	42	176
jack	2017-02-03	23	176
mart	2017-04-13	94	299
mart	2017-04-11	75	299
mart	2017-04-09	68	299
mart	2017-04-08	62	299
neil	2017-05-10	12	92
neil	2017-06-12	80	92
tony	2017-01-04	29	94
tony	2017-01-02	15	94
tony	2017-01-07	50	94

# over(distribute by name sort by orderdate)
select name, orderdate, cost, sum(cost) over(distribute by name sort by orderdate) from business;
jack	2017-01-01	10	10
jack	2017-01-05	46	56
jack	2017-01-08	55	111
jack	2017-02-03	23	134
jack	2017-04-06	42	176
mart	2017-04-08	62	62
mart	2017-04-09	68	130
mart	2017-04-11	75	205
mart	2017-04-13	94	299
neil	2017-05-10	12	12
neil	2017-06-12	80	92
tony	2017-01-02	15	15
tony	2017-01-04	29	44
tony	2017-01-07	50	94

# lag 和 lead
select name, orderdate, cost, lag(orderdate,1,'1900-01-01') over(distribute by name sort by orderdate) from business;
select name, orderdate, cost, lead(orderdate,1,'1900-01-01') over(distribute by name sort by orderdate) from business;
jack	2017-01-01	10	1900-01-01
jack	2017-01-05	46	2017-01-01
jack	2017-01-08	55	2017-01-05
jack	2017-02-03	23	2017-01-08
jack	2017-04-06	42	2017-02-03
mart	2017-04-08	62	1900-01-01
mart	2017-04-09	68	2017-04-08
mart	2017-04-11	75	2017-04-09
mart	2017-04-13	94	2017-04-11
neil	2017-05-10	12	1900-01-01
neil	2017-06-12	80	2017-05-10
tony	2017-01-02	15	1900-01-01
tony	2017-01-04	29	2017-01-02
tony	2017-01-07	50	2017-01-04

# over(rows between 2 preceding and current row)
select name, orderdate, cost, sum(cost) over(rows between 2 preceding and current row) from business;
mart	2017-04-13	94	94
neil	2017-06-12	80	174
mart	2017-04-11	75	249
neil	2017-05-10	12	167
mart	2017-04-09	68	155
mart	2017-04-08	62	142
jack	2017-01-08	55	185
tony	2017-01-07	50	167
jack	2017-04-06	42	147
jack	2017-01-05	46	138
tony	2017-01-04	29	117
jack	2017-02-03	23	98
tony	2017-01-02	15	67
jack	2017-01-01	10	48

# 应用场景，前20%的数据
select name, orderdate, cost from (select name, orderdate, cost, ntile(5) over(order by orderdate) pr from business)t where pr = 1;
select name, orderdate, cost, ntile(5) over() from business;
mart	2017-04-13	94	1
neil	2017-06-12	80	1
mart	2017-04-11	75	1
neil	2017-05-10	12	2
mart	2017-04-09	68	2
mart	2017-04-08	62	2
jack	2017-01-08	55	3
tony	2017-01-07	50	3
jack	2017-04-06	42	3
jack	2017-01-05	46	4
tony	2017-01-04	29	4
jack	2017-02-03	23	4
tony	2017-01-02	15	5
jack	2017-01-01	10	5

系统函数

show functions;
desc function extended upper;

hive的优化

一、解释多少个map和reduce的个数？
    1.block大小
    set dfs.block.size;
    2.设置split切的数据大小
    set mapred.max.split.size;
    3.map个数 = split个数
    map个数/split个数 = 大文件 / max(mapred.max.split.size, dfs.block.size);
    3.手动设置map任务数
    set mapred.map.tasks;
    4.设置reduce处理的数据大小
    set hive.exec.reducers.bytes.per.reducer;
    5.手动设置reduce个数
    set mapred.reduce.tasks;

二、什么时候只有一个reduce？(每个reduce都会生成哟个文件)
    1.没有group by
    2.order by 全局排序（可以使用distribute by 和 sort by 这是每个reduce里的数据排序）
    3.笛卡尔积
    
三、如何合并小文件，减少map数？
    1.使用hadoop archive命令把小文件进行归档
    2.配置map和reduce的任务数

四、优化方法
    使用group by 来代替distinct
    提前过滤空值方便join
    默认开启mapjoin 小表在前
    数据倾斜，前面加上随机数来，避免大量的数据被一个reduce来处理
    万能的方法：hive.groupby.skewindata=true

动态分区

开启动态分区参数设置
set hive.exec.dynamic.partition=true;
设置为非严格模式
set hive.exec.dynamic.partition.mode=nonstrict;
在所有执行MR的节点上，最大一共可以创建多少个动态分区
set hive.exec.max.dynamic.partitions=1000;
每个执行MR的节点上，最大可以创建多少个动态分区 (例如:一年365个day,那么就要设置大于365的值)
set hive.exec.max.dynamic.partitions.pernode=100
整个MRJob中，最大可以创建多少个HDFS文件

hive 文件存储格式

列式存储和行式存储
TextFile格式
Orc个格式
Parquet格式

练习

###练习题表的创建和数据的插入###
	--创建orders表

	create table orders (
	order_id string,
	user_id string,
	eval_set string,
	order_number string,
	order_dow string,
	order_hour_of_day string,
	days_since_prior_order string
	)
	row format delimited fields terminated by ',';
	
	--
	order_id 订单编号
	user_id 用户id
	eval_set 标识订单是否为历史数据
	order_number 用户购买订单序号
	order_dow 星期几
	days_since_prior_order 哪个小时产生的订单
    --

	--插入数据
	load data local inpath '/usr/local/src/data/original_data/orders.csv' into table orders;

	--创建商品和订单的关系表
	create table priors (
	order_id string,
	product_id string,
	add_to_cart_order string,
	reordered string
	)
	row format delimited fields terminated by ',';

	--
	order_id 订单id
	product_id 商品id
	add_to_cart_order 订单支付先后顺序的位置
	reordered 当前商品是否为重复下单的行为
	--

	--插入数据
	load data local inpath '/usr/local/src/data/order_products__prior.csv' into table priors;

1.每个用户有多少订单？
	insert overwrite local directory "/usr/local/src/data/hive_about/answer_01"
	select user_id, count(1) as order_count from orders group by user_id order by order_count;
	select user_id, Max(cast(order_number as int)) as order_count from orders group by user_id order by order_count;

2.每个用户购买的商品数量？
	insert overwrite local directory "/usr/local/src/data/hive_about/answer_02"
	select 
	C.user_id as user_id,
	sum(C.product_count) as product_count
	from (
	select 
	A.user_id,
	B.product_count 
	from orders A join (select order_id, count(1) as product_count from priors group by order_id) B 
	on A.order_id = B.order_id ) C
	group by C.user_id;

3.每个用户平均每个订单是多少商品？
	insert overwrite local directory "/usr/local/src/data/hive_about/answer_03"
	select 
	C.user_id as user_id,
	avg(C.product_count) as product_avg
	from (
	select 
	A.user_id,
	B.product_count 
	from orders A join (select order_id, count(1) as product_count from priors group by order_id) B 
	on A.order_id = B.order_id ) C
	group by C.user_id;

4.每个用户在一周中的购买订单的分布情况？
	insert overwrite local directory "/usr/local/src/data/hive_about/answer_04"
	select 
	user_id,
	count(case when order_dow = 0 then order_id end),
	count(case when order_dow = 1 then order_id end),
	count(case when order_dow = 2 then order_id end),
	count(case when order_dow = 3 then order_id end),
	count(case when order_dow = 4 then order_id end),
	count(case when order_dow = 5 then order_id end),
	count(case when order_dow = 6 then order_id end) 
	from orders
	group by user_id;
     
5.一个用户平均每个月购买多少商品？30天为一个月
	间隔天数累计除去30向上取整，注意只有一个订单的时候是个特殊情况，需要判断下，因为是null

6.每个用户最喜爱的三个product是什么？
	# 创建零时表
	create table user_product (user_id string, product_id string, product_count int)
	row format delimited fields terminated by '\t';
	# 零时数据插入到零时表
	insert overwrite table user_product 
	select 
	A.user_id,
	A.product_id,
	count(A.product_id) as product_count
	from (
	select 
	o.user_id,
	p.product_id
	from orders o join priors p on o.order_id = p.order_id ) A group by A.user_id, A.product_id 
	# 根据零时数据求Top
	insert overwrite local directory "/usr/local/src/data/hive_about/answer_06"
	select
	A.user_id,
	collect_set(case when rank = 1 then A.product_id end)[0],
	collect_set(case when rank = 2 then A.product_id end)[0],
	collect_set(case when rank = 3 then A.product_id end)[0]
	from 
	(
	select 
	user_id,
	product_id,
	product_count,
	row_number() over (partition by user_id order by product_count desc) rank
	from user_product
	group by  user_id, product_id, product_count
	) as A
	where A.rank <= 3
	group by A.user_id;


	--question6的测试数据
	99962	38944	26
	99962	39200	1
	99962	39794	4
	99962	4066	1
	99962	41030	3
	99962	41328	3
	99962	41544	1
	99962	41621	1
	99962	42509	10
	99962	42585	2
	99962	4392	1
	99962	44396	7
	99962	44422	3
	99962	44426	5
	99962	44532	3
	99962	44873	1
	99962	46820	1
	99962	47209	1
	99962	47223	11
	99962	47766	3
	99962	4855	12
	99962	48679	2
	99962	48745	5
	99962	5184	1
	99962	5250	1
	99962	6020	1
	99962	6541	1
	99982	10108	15
	99982	1025	2
	99982	11046	1
	99982	2091	2
	99982	21137	9
	99982	21195	1
	99982	25489	1
	99982	26497	2
	99982	28918	1
	99982	30391	2
	99982	35855	1
	99982	36070	1
	99982	39121	1
	99982	39275	7
	99982	43713	6
	99982	46709	1
	99982	48493	2
	99982	5025	5
	99982	5322	1
	99982	6664	2
	99982	7485	1
	99982	7746	1
	99982	7948	30
	99982	8424	12
	99983	39275	7
	99983	43713	6
	99983	46709	1
	99983	48493	2
	99983	5025	5
	99983	5322	1
	99983	6664	2
	99983	7485	1
	99983	7746	1
	99983	7948	2
	99984	8424	15
	99984	7746	1
	99984	7948	3
	99985	8424	3
	create table test (
	user_id string,
	product_id string,
	product_count int
	)
	row format delimited fields terminated by '\t';

	load data local inpath '/usr/local/src/data/hive_about/hive_data_2/test.txt' into table test;


	select
	A.user_id,
	collect_set(case when rank = 1 then A.product_id end)[0],
	collect_set(case when rank = 2 then A.product_id end)[0],
	collect_set(case when rank = 3 then A.product_id end)[0]
	from 
	(
	select 
	user_id,
	product_id,
	product_count,
	row_number() over (partition by user_id order by product_count desc) rank
	from test
	group by  user_id,product_id,product_count
	) as A
	where A.rank <= 3
	group by A.user_id;

7.蚂蚁金服题目
	# 创建表
	create table jd (user_id string, shop string)
	row format delimited fields terminated by "\t";
	# 导入数据
	load data local inpath '/usr/local/src/data/hive_about/hive_data_2/jd.txt' into table jd;
	# 数据源
	u1	a
	u2	b
	u1	b
	u1	a
	u3	c
	u4	b
	u1	a
	u2	c
	u5	b
	u4	b
	u6	c
	u2	c
	u1	a
	u2	a
	u2	a
	u3	a
	u5	a
	u5	a
	u5	a

	# 求每个店铺的UV(访客数)？
	-- 使用distinct的效率不高(不推荐)
	select shop, count(distinct user_id) from jd group by shop;

	-- 使用group by(推荐)
	select shop, count(1) from 
	(select shop, user_id from jd group by shop, user_id) tmp
	group by tmp.shop;

	# 每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数



8.对每个用户最喜爱购买的三个product的改编，每个用户最喜爱的top10%的商品
	如果一个用户购买一共购买了10个，返回top1，购买了20个返回top2
	如果一个用户购买了3个，3*10%=0.3 返回top1，也就是不够的至少一个 
	select
	user_id,
	product_id,
	product_count
	from (
	select 
	user_id,
	product_id,
	product_count,
	row_number() over (partition by user_id order by product_count desc) rank,
	count(1) over (partition by user_id) user_product_count
	from test) t1 where rank <= ceiling(t1.user_product_count * 0.1);
	99961	38944	26
	99961	42509	10
	99961	42509	10
	99961	42509	10
	99961	38944	26
	99961	38944	26
	99961	38944	26
	99962	4855	12
	99962	38944	26
	99962	47223	11
	99982	8424	12
	99982	10108	15
	99982	7948	30
	99983	39275	7
	99984	8424	15
	99985	8424	3

9.建分区表，orders表按照order_dow建立分区表orders_part，然后从hive查询orders动态插入orders_part表中
    1.创建分区表
	create table orders_part(
	order_id string,
	user_id string,
	eval_set string,
	order_number string,
	order_hour_of_day string,
	days_since_prior_order string)
	partitioned by (order_dow string)
	row format delimited fields terminated by ',';

	2.查询导入数据(分区的字段放到最后面)
	insert into table orders_part partition(order_dow) 
	select 
	order_id, 
	user_id, 
	eval_set,
	order_number,
	order_hour_of_day,
	days_since_prior_order,
	order_dow 
	from orders;

10.统计商品的销量（pv），购买的用户数（uv），reorder数，在同一个sql中展示出来
	即：输出形式
	| 商品id | 商品销量 | 购买用户数 |reordered数|
	| :-------- | --------:| :------: |
	| product_id | product_pv | product_uv | reordered_cnt|
	--创建测试表 test2
	select user_id, product_id, reordered from orders o left join priors p on o.order_id = p.order_id limit 100;
	88772	37053	0
	88772	3682	1
	88772	9422	0
	444	45638	0
	444	28849	1
	444	32052	1
	444	39911	1
	444	37687	1
	444	35939	1
	444	432	1
	444	27966	1
	444	27104	1
	444	13535	1
	444	2979	1
	444	26369	1
	444	11777	1
	444	18272	1
	444	28985	1
	444	49075	1
	444	22963	1
	444	42265	1
	444	30489	1
	444	16797	1
	444	48679	1
	444	30252	1
	444	44514	1
	444	38400	1
	444	25931	1
	444	2846	1
	444	44683	1
	444	21938	1
	444	45007	1
	444	47626	1
	444	2295	1
	444	24184	1
	444	49683	1
	444	13176	1
	144784	30489	0
	144784	25832	0
	144784	30450	1
	144784	31506	0
	144784	19154	0
	144784	20063	0
	144784	25948	0
	144784	14335	0
	144784	23734	0
	144784	11140	0
	144784	23622	0
	144784	28849	0
	144784	45819	0
	144784	10603	0
	144784	17949	0
	144784	11818	0
	144784	25340	1
	144784	10305	1
	144784	14333	1
	144784	37594	1
	157967	2481	0
	157967	42423	1
	41222	28204	1
	41222	20794	1
	41222	47766	1
	41222	16911	0
	41222	13874	0
	41222	15803	0
	41222	34466	0
	41222	11777	0
	41222	13176	1
	44785	45007	1
	44785	46676	1
	44785	7781	1
	44785	42615	1
	44785	11136	1
	44785	21454	1
	44785	25890	1
	44785	18564	1
	44785	23029	1
	44785	18818	1
	44785	11137	1
	44785	29432	1
	44785	44359	1
	44785	24852	1
	44785	40545	1
	44785	23765	1
	44785	36397	1
	164031	44570	1
	164031	8193	1
	164031	48679	0
	164031	8277	1
	164031	11440	0
	164031	42265	0
	164031	14992	1
	164031	49683	1
	164031	41220	1
	119023	4137	0
	119023	20899	0
	119023	2057	0
	119023	30016	0
	119023	9245	0
	119023	2452	0

select t.product_id, count(t.product_id),count(distinct(t.user_id)),sum(t.reordered) from 
(select user_id, product_id, reordered from orders o left join priors p on o.order_id = p.order_id limit 1000) t group by t.product_id;

11.每个用户购买的product商品去重后的集合数据（用字符串表示以‘，’分割）
	共两列即： user_id product_id1, product_id2, product_id3...
	select t.user_id, concat_ws(',', collect_set(t.product_id))
	from (select user_id,product_id from test2 group by user_id, product_id) t group by t.user_id;

练习题


第1题
我们有如下的用户访问数据
userId  visitDate   visitCount
u01 2017/1/21   5
u02 2017/1/23   6
u03 2017/1/22   8
u04 2017/1/20   3
u01 2017/1/23   6
u01 2017/2/21   8
U02 2017/1/23   6
U01 2017/2/22   4
要求使用SQL统计出每个用户的累积访问次数，如下表所示：
用户id    月份  小计  累积
u01 2017-01 11  11
u01 2017-02 12  23
u02 2017-01 12  12
u03 2017-01 8   8
u04 2017-01 3   3

解答
SELECT 
t2.userid,
t2.s_day,
t2.vc,
sum(t2.vc) over(PARTITION BY t2.userid order by t2.s_day ROWS BETWEEN UNBOUNDED PRECEDING and CURRENT ROW) ccc
from (
SELECT 
t1.userid,
t1.s_day,
sum(t1.visitcount) vc
from (
SELECT 
userid, 
from_unixtime(unix_timestamp(visitdate,"yyyy/mm/dd"),"yyyy-mm") s_day,
visitcount
from first ) t1
group by t1.userid,t1.s_day ) t2


第2题 京东
有50W个京东店铺，每个顾客访问任何一个店铺的任何一个商品时都会产生一条访问日志，访问日志存储的表名为Visit，访客的用户id为user_id，被访问的店铺名称为shop，请统计：
1）每个店铺的UV（访客数）
2）每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数

解答
1. select t1.shop, count(1) from (SELECT user_id, shop from Second_Visit group by user_id, shop) t1 group by t1.shop

2. 
select 
select 
t2.shop,
t2.user_id,
t2.vc
from (
select 
t1.shop,
t1.user_id,
t1.vc,
rank() over (PARTITION by t1.shop order by t1.vc) as rank
from (
    select user_id,shop,count(1) vc from Second_Visit group by user_id, shop
) t1
 ) t2  where t2.rank <= 3



第3题
已知一个表STG.ORDER，有如下字段:Date，Order_id，User_id，amount
请给出sql进行统计:数据样例:2017-01-01,10029028,1000003251,33.57
1）给出 2017年每个月的订单数、用户数、总成交金额。
2）给出2017年11月的新客数(指在11月才有第一笔订单)

解答
1.
select 
count(Order_id) order_count,
count(Distinct(User_id)) user_count,
sum(amount) as amount_sum,
subString(`Date`,1,)
from second_order where subString(`Date`,1,4) = '2017' group by substring(`Date`,1,7)
或
select 
count(Order_id) order_count,
count(Distinct(User_id)) user_count,
sum(amount) as amount_sum,
from_unixtime(unix_timestamp(`Date`,"yyyy-mm-dd"),"yyyy-mm")
from second_order where subString(`Date`,1,4) = '2017' group by from_unixtime(unix_timestamp(`Date`,"yyyy-mm-dd"),"yyyy-mm")

2.
select 
t1.user_id
from 
(select User_id, count(Order_id) order_count from second_order where subString(`Date`,1,7) = '2017-11' group by User_id) t1
where t1.order_count = 1



第4题
有一个5000万的用户文件(user_id，name，age)， forth_user
一个2亿记录的用户看电影的记录文件(user_id，url)，forth_log
根据年龄段观看电影的次数进行排序？关键词 电影的次数

解答
select 

t3.category,
sum(t3.con)
from (
select 
case when t2.age>=0 and t2.age<20 then '0-20'
     when t2.age>=20 and t2.age<40 then '0-20'
     when t2.age>=40 and t2.age<60 then '0-20'
     else '' end as category
,

t2.con
from (
select
    u1.age age ,
    t1.con con
from forth_user u1
join 
(
    select 
    user_id,
    count(*) con
from forth_log
group by user_id   
)t1
on u1.user_id=t1.user_id
) t2

) t3
group by t3.category


第5题
有日志如下，请写出代码求得所有用户和活跃用户的总数及平均年龄。
（活跃用户指连续两天都有访问记录的用户）
日期 用户 年龄
11,test_1,23
11,test_2,19
11,test_3,39
11,test_1,23
11,test_3,39
11,test_1,23
12,test_2,19
13,test_1,23

1.
select count(1), sum(age)/count(1) from (select user_id,age from fiveth group by user_id, age) t1

2.
select * from (
select 
t1.`date`,
t1.user_id,
t1.age,
t1.`date` - rank() over(partition by t1.user_id order by t1.`date`) cz
from (
    select `date`, user_id , age from fiveth group by `date`, user_id, age
) t1
) t2
group by t2.user_id , t2.age, t2.rank
having count(1) >


第6题
请用sql写出所有用户中在今年10月份第一次购买商品的金额，表ordertable字段
（购买用户：userid，金额：money，购买时间：paymenttime(格式：2017-10-01)，订单id：orderid）


select 
    paymenttime,
    userid,
    monty,
    orderid,
    rank() over(partition by userid order by paymenttime) row_con
from sixth


第7题
现有图书管理数据库的三个数据模型如下：
图书（数据表名：BOOK）
序号  字段名称    字段描述    字段类型
1   BOOK_ID 总编号 文本
2   SORT    分类号 文本
3   BOOK_NAME   书名  文本
4   WRITER  作者  文本
5   OUTPUT  出版单位    文本
6   PRICE   单价  数值（保留小数点后2位）
读者（数据表名：READER）
序号  字段名称    字段描述    字段类型
1   READER_ID   借书证号    文本
2   COMPANY 单位  文本
3   NAME    姓名  文本
4   SEX 性别  文本
5   GRADE   职称  文本
6   ADDR    地址  文本
借阅记录（数据表名：BORROW LOG）
序号  字段名称    字段描述    字段类型
1   READER_ID   借书证号    文本
2   BOOK_ID  总编号 文本
3   BORROW_DATE  借书日期    日期
（1）创建图书管理库的图书、读者和借阅三个基本表的表结构。请写出建表语句。
（2）找出姓李的读者姓名（NAME）和所在单位（COMPANY）。
（3）查找“高等教育出版社”的所有图书名称（BOOK_NAME）及单价（PRICE），结果按单价降序排序。
（4）查找价格介于10元和20元之间的图书种类(SORT）出版单位（OUTPUT）和单价（PRICE），结果按出版单位（OUTPUT）和单价（PRICE）升序排序。
（5）查找所有借了书的读者的姓名（NAME）及所在单位（COMPANY）。
（6）求”科学出版社”图书的最高单价、最低单价、平均单价。
（7）找出当前至少借阅了2本图书（大于等于2本）的读者姓名及其所在单位。
（8）考虑到数据安全的需要，需定时将“借阅记录”中数据进行备份，请使用一条SQL语句，在备份用户bak下创建与“借阅记录”表结构完全一致的数据表BORROW_LOG_BAK.井且将“借阅记录”中现有数据全部复制到BORROW_L0G_ BAK中。
（9）现在需要将原Oracle数据库中数据迁移至Hive仓库，请写出“图书”在Hive中的建表语句（Hive实现，提示：列分隔符|；数据表数据需要外部导入：分区分别以month＿part、day＿part 命名）
（10）Hive中有表A，现在需要将表A的月分区&emsp;201505&emsp;中&emsp;user＿id为20000的user＿dinner字段更新为bonc8920，其他用户user＿dinner字段数据不变，请列出更新的方法步骤。（Hive实现，提示：Hlive中无update语法，请通过其他办法进行数据更新）




第8题
有一个线上服务器访问日志格式如下（用sql答题）
时间                          接口                           ip地址
2016-11-09 11:22:05    /api/user/login                  110.23.5.33
2016-11-09 11:23:10    /api/user/detail                 57.3.2.16
2016-11-09 23:59:40    /api/user/login                  200.6.5.166
求11月9号下午14点（14-15点），访问api/user/login接口的top10的ip地址
select t1.interface, t1.ct from (
select interface, count(1) ct from eight_log where interface = '/api/user/login' and date_format(`date`,"yyyy-MM-dd HH") = "2016-11-09 14" group by interface 
) t1 order by t1.ct limit 10;


第9题
有一个充值日志表如下：

CREATE TABLE `credit log` 
(
    `dist_id` int（11）DEFAULT NULL COMMENT '区组id',
    `account` varchar（100）DEFAULT NULL COMMENT '账号',
    `money` int(11) DEFAULT NULL COMMENT '充值金额',
    `create_time` datetime DEFAULT NULL COMMENT '订单时间'
)ENGINE=InnoDB DEFAUILT CHARSET-utf8

请写出SQL语句，查询充值日志表2015年7月9号每个区组下充值额最大的账号，要求结果：
区组id，账号，金额，充值时间

select 
t1.dist_id,
t1.money,
t1.account,
t1.create_time
from (
select 
dist_id,
account,
rank() over(partition by dist_id order by money desc) rank,
money,
create_time

from nine_log where date_format(create_time,"yyyy-MM-dd") = "2015-07-09"
) t1

where t1.rank = 1



10 略


第11题
1）有三张表分别为会员表（member）销售表（sale）退货表（regoods）
（1）会员表有字段memberid（会员id，主键）credits（积分）；
（2）销售表有字段memberid（会员id，外键）购买金额（MNAccount）；
（3）退货表中有字段memberid（会员id，外键）退货金额（RMNAccount）；
2）业务说明：
（1）销售表中的销售记录可以是会员购买，也可是非会员购买。（即销售表中的memberid可以为空）
（2）销售表中的一个会员可以有多条购买记录
（3）退货表中的退货记录可以是会员，也可是非会员
 (4）一个会员可以有一条或多条退货记录
查询需求：分组查出销售表中所有会员购买金额，同时分组查出退货表中所有会员的退货金额，
把会员id相同的购买金额-退款金额得到的结果更新到表会员表中对应会员的积分字段（credits）


12
第12题 百度
现在有三个表student（学生表）、course(课程表)、score（成绩单），结构如下：
create table student
(
    id bigint comment "学号",
    name string comment "姓名",
    age bigint comment "年龄"
);
create table course
(
    cid string comment "课程号，001/002格式",
    cname string comment "课程名"
);
Create table score
(
    Id bigint comment "学号",
    cid string comment "课程号",
    score bigint comment "成绩"
) partitioned by(event_day string)

其中score中的id、cid，分别是student、course中对应的列请根据上面的表结构，回答下面的问题
1）请将本地文件（/home/users/test/20190301.csv）文件，加载到分区表score的20190301分区中，并覆盖之前的数据
2）查出平均成绩大于60分的学生的姓名、年龄、平均成绩
3）查出没有‘001’课程成绩的学生的姓名、年龄
4）查出有‘001’\’002’这两门课程下，成绩排名前3的学生的姓名、年龄
5）创建新的表score_20190317，并存入score表中20190317分区的数据
6）如果上面的score表中，uid存在数据倾斜，请进行优化，查出在20190101-20190317中，学生的姓名、年龄、课程、课程的平均成绩
7）描述一下union和union all的区别，以及在mysql和HQL中用法的不同之处？
8）简单描述一下lateral view语法在HQL中的应用场景，并写一个HQL实例

1.load data local inpath "/home/users/test/20190301.csv" overwrite into table score partition (event_day=20190301)

Hive

创建hive表

hive版的wordcount

分区表 partition

分桶表bucket

采样sampling

四个by

组内排序

空字段赋值

时间类

三元表达式

行转列

列转行 侧写 lateral view

窗口函数

系统函数

hive的优化

动态分区

hive 文件存储格式

练习

练习题

列转行侧写 lateral view