Create a Hive table
CREATE DATABASE elaiza;
create table news_noseg (sentence string)
row format delimited fields terminated by '\n';
desc news_noseg;
select * from news_noseg limit 10;
load data local inpath '/usr/local/src/data/hive_about/allfiles.txt' into table news_noseg;
Wordcount, the Hive way
--split a string into an array
select split('But it is used with a suitable irony',' '); => returns an array
--explode an array into rows
select explode(split('But it is used with a suitable irony',' ')); => each array element becomes its own row
--apply it to the loaded text
select explode(split(sentence,' ')) as word from news_noseg limit 10;
--group by
select word, count(*) as cnt
from
(
select
explode(split(sentence,' ')) as word
from news_noseg limit 10
) as tmp
group by word;
--regex test
select regexp_extract('12elaiza89', '[a-zA-Z]+', 0);
--lowercase conversion
select lower('Elaiza');
--full version (group by the extracted expression, so cleaned words aggregate correctly)
select
regexp_extract(lower(word), '[a-zA-Z]+', 0) as word, count(*) as cnt
from (select explode(split(sentence,' ')) as word from news_noseg limit 10) as tmp
group by regexp_extract(lower(word), '[a-zA-Z]+', 0);
Partitioned tables (partition)
create table art_dt(sentence string)
partitioned by (dt string)
row format delimited fields terminated by '\n';
insert into table art_dt partition(dt='20191109')
select * from news_noseg limit 100;
insert overwrite table art_dt partition(dt='20191110')
select * from news_noseg limit 100;
show partitions art_dt;
Bucketed tables (bucket)
set hive.enforce.bucketing = true;
create table bucket_user (id int) clustered by (id) into 32 buckets;
create table tmp_user (id int)
row format delimited fields terminated by '\n';
load data local inpath '/usr/local/src/data/hive_about/hive_data_2/bucket.txt' into table tmp_user;
insert overwrite table bucket_user
select * from tmp_user;
Sampling (tablesample)
-- a returned row satisfies: hash(id) % y == x - 1
select * from bucket_user tablesample(bucket 1 out of 32 on id); => 32
select * from bucket_user tablesample(bucket 2 out of 32 on id); => 1
select * from bucket_user tablesample(bucket 1 out of 16 on id); => 32, 16
select * from bucket_user tablesample(bucket 2 out of 16 on id); => 1, 17
Note: the pattern is tablesample(bucket x out of y on col)
The four BYs
--order by: one global sort (single reducer)
--sort by: sorts within each reducer
--distribute by: decides which reducer a row goes to
--cluster by: distribute by + sort by on the same column
set mapreduce.job.reduces = 2;
insert overwrite local directory "/usr/local/src/data/hive_about/sort_by"
select * from test sort by user_id;
insert overwrite local directory "/usr/local/src/data/hive_about/dis_by"
select * from test distribute by user_id sort by product_count;
Ranking within groups
rank(): ties get the same rank, and the next distinct value skips ahead, e.g.
value rank
23 1
23 1
22 3
dense_rank(): ties get the same rank, and the next distinct value follows immediately, e.g.
value rank
23 1
23 1
22 2
so the largest rank can be smaller than the row count, which suits counting how many rows fall into the top-N distinct levels.
row_number() is the simplest: strictly sequential numbering, e.g.
value rank
23 1
23 2
22 3
here the largest number equals the row count, which suits taking the top-N rows by value.
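A minimal sketch contrasting the three in one query; the table grades(grade int) is hypothetical:
select
grade,
rank()       over (order by grade desc) as rk,  -- ties share a rank, then skip
dense_rank() over (order by grade desc) as drk, -- ties share a rank, no gap
row_number() over (order by grade desc) as rn   -- strictly sequential
from grades;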
NULL substitution
nvl(null,0) -- nvl(expr, default) returns default when expr is null
Date functions
1. formatting: date_format('2019-11-11 23:32:22','yyyy-MM-dd HH:mm:ss')
2. date_add('2019-11-11',3)
3. date_format(regexp_replace('2019/1/23','/','-'),'yyyy-MM-dd');
Ternary expressions
1. case when, and if(condition, value_if_true, value_if_false)
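A quick sketch of both forms, assuming the business(name, orderdate, cost) table queried in the window-function examples below:
select name, if(cost > 50, 'high', 'low') as level from business;
select name, case when cost > 50 then 'high' else 'low' end as level from business;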
Rows to one column
1. concat('hello','-','world')
2. concat_ws(separator, str, str...)
3. collect_set(col) dedupes a column and gathers it into an array
4. concat_ws('-', collect_set(col)) combines the two (see the sketch below)
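A hedged sketch of the combination, again assuming the business table from the window-function examples below: one row per name, with that person's distinct order dates joined by '-':
select name, concat_ws('-', collect_set(orderdate)) as dates from business group by name;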
Columns to rows: lateral view
explode()
movie category
疑犯追踪 悬疑,爱情,动作,喜剧
别对我说慌 剧情,悬疑,动作
奥特曼 动作,动画
create table movies (
movie string,
category array<string>
)
row format delimited fields terminated by "\t"
collection items terminated by ",";
load data local inpath '/usr/local/src/data/hive_about/hive_data_2/movies.txt' into table movies;
select
movie,
category_name
from
movies lateral view explode(category) table_tmp as category_name;
movie category_name
疑犯追踪 悬疑
疑犯追踪 爱情
疑犯追踪 动作
疑犯追踪 喜剧
别对我说慌 剧情
别对我说慌 悬疑
别对我说慌 动作
奥特曼 动作
奥特曼 动画
Window functions
over()
select name , count(*) from business where substring(orderdate,1,7) = '2017-04' group by name;
jack 1
mart 4
select name, count(*) over() from business where substring(orderdate,1,7) = '2017-04' group by name;
jack 2
mart 2
select name, orderdate, sum(cost) over(order by orderdate) from business;
jack 2017-01-01 10
tony 2017-01-02 25
tony 2017-01-04 54
jack 2017-01-05 100
tony 2017-01-07 150
jack 2017-01-08 205
jack 2017-02-03 228
jack 2017-04-06 270
mart 2017-04-08 332
mart 2017-04-09 400
mart 2017-04-11 475
mart 2017-04-13 569
neil 2017-05-10 581
neil 2017-06-12 661
select name, orderdate, cost, sum(cost) over(distribute by name) from business;
jack 2017-01-05 46 176
jack 2017-01-08 55 176
jack 2017-01-01 10 176
jack 2017-04-06 42 176
jack 2017-02-03 23 176
mart 2017-04-13 94 299
mart 2017-04-11 75 299
mart 2017-04-09 68 299
mart 2017-04-08 62 299
neil 2017-05-10 12 92
neil 2017-06-12 80 92
tony 2017-01-04 29 94
tony 2017-01-02 15 94
tony 2017-01-07 50 94
select name, orderdate, cost, sum(cost) over(distribute by name sort by orderdate) from business;
jack 2017-01-01 10 10
jack 2017-01-05 46 56
jack 2017-01-08 55 111
jack 2017-02-03 23 134
jack 2017-04-06 42 176
mart 2017-04-08 62 62
mart 2017-04-09 68 130
mart 2017-04-11 75 205
mart 2017-04-13 94 299
neil 2017-05-10 12 12
neil 2017-06-12 80 92
tony 2017-01-02 15 15
tony 2017-01-04 29 44
tony 2017-01-07 50 94
select name, orderdate, cost, lag(orderdate,1,'1900-01-01') over(distribute by name sort by orderdate) from business;
select name, orderdate, cost, lead(orderdate,1,'1900-01-01') over(distribute by name sort by orderdate) from business;
jack 2017-01-01 10 1900-01-01
jack 2017-01-05 46 2017-01-01
jack 2017-01-08 55 2017-01-05
jack 2017-02-03 23 2017-01-08
jack 2017-04-06 42 2017-02-03
mart 2017-04-08 62 1900-01-01
mart 2017-04-09 68 2017-04-08
mart 2017-04-11 75 2017-04-09
mart 2017-04-13 94 2017-04-11
neil 2017-05-10 12 1900-01-01
neil 2017-06-12 80 2017-05-10
tony 2017-01-02 15 1900-01-01
tony 2017-01-04 29 2017-01-02
tony 2017-01-07 50 2017-01-04
select name, orderdate, cost, sum(cost) over(rows between 2 preceding and current row) from business;
mart 2017-04-13 94 94
neil 2017-06-12 80 174
mart 2017-04-11 75 249
neil 2017-05-10 12 167
mart 2017-04-09 68 155
mart 2017-04-08 62 142
jack 2017-01-08 55 185
tony 2017-01-07 50 167
jack 2017-04-06 42 147
jack 2017-01-05 46 138
tony 2017-01-04 29 117
jack 2017-02-03 23 98
tony 2017-01-02 15 67
jack 2017-01-01 10 48
select name, orderdate, cost from (select name, orderdate, cost, ntile(5) over(order by orderdate) pr from business)t where pr = 1;
select name, orderdate, cost, ntile(5) over() from business;
mart 2017-04-13 94 1
neil 2017-06-12 80 1
mart 2017-04-11 75 1
neil 2017-05-10 12 2
mart 2017-04-09 68 2
mart 2017-04-08 62 2
jack 2017-01-08 55 3
tony 2017-01-07 50 3
jack 2017-04-06 42 3
jack 2017-01-05 46 4
tony 2017-01-04 29 4
jack 2017-02-03 23 4
tony 2017-01-02 15 5
jack 2017-01-01 10 5
Built-in functions
show functions;
desc function extended upper;
Hive optimization
I. What determines the number of maps and reduces?
1. block size
set dfs.block.size;
2. the split size
set mapred.max.split.size;
3. number of maps = number of splits
number of splits = file size / max(mapred.max.split.size, dfs.block.size); -- e.g. a 1 GB file with 128 MB blocks and no larger split size gives 8 maps
4. set the map task count manually
set mapred.map.tasks;
5. data size handled per reduce
set hive.exec.reducers.bytes.per.reducer;
6. set the reduce count manually
set mapred.reduce.tasks;
II. When is there only one reduce? (each reduce writes one output file)
1. no group by
2. order by (a global sort; use distribute by plus sort by instead, which sorts within each reduce)
3. a Cartesian product
III. How to merge small files and cut the map count?
1. archive small files with the hadoop archive command
2. tune the map and reduce task counts (the usual merge switches are sketched below)
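The merge-related settings commonly used here, as a hedged aside (standard Hive parameters):
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat; -- combine small input files into one split
set hive.merge.mapfiles=true;    -- merge small files produced by map-only jobs
set hive.merge.mapredfiles=true; -- merge small files produced by full MR jobs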
IV. General optimizations
use group by instead of distinct
filter out null keys before a join
map join is on by default; put the small table first
for data skew, prefix the hot key with a random number so one reduce doesn't take all the rows (see the sketch below)
the catch-all switch: hive.groupby.skewindata=true
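A hedged sketch of the random-prefix trick for a skewed count; the table skewed_log(key string) is hypothetical:
select split(salted, '_')[1] as key, sum(cnt) as cnt  -- stage 2: strip the prefix, re-aggregate
from (
  select salted, count(*) as cnt                      -- stage 1: aggregate by salted key
  from (
    select concat(cast(floor(rand() * 10) as string), '_', key) as salted
    from skewed_log
  ) a
  group by salted
) b
group by split(salted, '_')[1];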
Dynamic partitions
enable dynamic partitioning
set hive.exec.dynamic.partition=true;
use non-strict mode
set hive.exec.dynamic.partition.mode=nonstrict;
max dynamic partitions that may be created across all MR nodes
set hive.exec.max.dynamic.partitions=1000;
max dynamic partitions per MR node (e.g. a year has 365 days, so set a value above 365)
set hive.exec.max.dynamic.partitions.pernode=100;
max HDFS files the whole MR job may create
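presumably controlled by hive.exec.max.created.files, e.g.:
set hive.exec.max.created.files=100000;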
Hive file storage formats
columnar vs row-oriented storage
TextFile format
ORC format
Parquet format
Exercises
--create the orders table
create table orders (
order_id string,
user_id string,
eval_set string,
order_number string,
order_dow string,
order_hour_of_day string,
days_since_prior_order string
)
row format delimited fields terminated by ',';
--
order_id: order id
user_id: user id
eval_set: flags whether the order is historical data
order_number: the order's sequence number for that user
order_dow: day of week
order_hour_of_day: hour of day the order was placed
days_since_prior_order: days since the user's previous order
--
--load the data
load data local inpath '/usr/local/src/data/original_data/orders.csv' into table orders;
--create the order-to-product relation table
create table priors (
order_id string,
product_id string,
add_to_cart_order string,
reordered string
)
row format delimited fields terminated by ',';
--
order_id: order id
product_id: product id
add_to_cart_order: position in which the product was added to the cart
reordered: whether this product is a repeat purchase for the user
--
--load the data
load data local inpath '/usr/local/src/data/order_products__prior.csv' into table priors;
1. How many orders does each user have?
insert overwrite local directory "/usr/local/src/data/hive_about/answer_01"
select user_id, count(1) as order_count from orders group by user_id order by order_count;
select user_id, Max(cast(order_number as int)) as order_count from orders group by user_id order by order_count;
2. How many products has each user bought?
insert overwrite local directory "/usr/local/src/data/hive_about/answer_02"
select
C.user_id as user_id,
sum(C.product_count) as product_count
from (
select
A.user_id,
B.product_count
from orders A join (select order_id, count(1) as product_count from priors group by order_id) B
on A.order_id = B.order_id ) C
group by C.user_id;
3. How many products per order, on average, for each user?
insert overwrite local directory "/usr/local/src/data/hive_about/answer_03"
select
C.user_id as user_id,
avg(C.product_count) as product_avg
from (
select
A.user_id,
B.product_count
from orders A join (select order_id, count(1) as product_count from priors group by order_id) B
on A.order_id = B.order_id ) C
group by C.user_id;
4. How are each user's orders distributed across the days of the week?
insert overwrite local directory "/usr/local/src/data/hive_about/answer_04"
select
user_id,
count(case when order_dow = 0 then order_id end),
count(case when order_dow = 1 then order_id end),
count(case when order_dow = 2 then order_id end),
count(case when order_dow = 3 then order_id end),
count(case when order_dow = 4 then order_id end),
count(case when order_dow = 5 then order_id end),
count(case when order_dow = 6 then order_id end)
from orders
group by user_id;
5. On average, how many products does a user buy per month? (30 days count as one month)
Sum the days_since_prior_order gaps, divide by 30, and round up; note that a user with only one order is a special case that needs a guard, because the gap is null.
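A hedged sketch of one reading of the question: products per user divided by months active, where months = ceil(total gap days / 30), floored at 1 for single-order users (all-null gaps):
select
  c.user_id,
  c.product_count / greatest(ceil(nvl(d.total_days, 0) / 30), 1) as products_per_month
from
  (select o.user_id, count(1) as product_count
   from orders o join priors p on o.order_id = p.order_id
   group by o.user_id) c
join
  (select user_id, sum(cast(days_since_prior_order as double)) as total_days
   from orders group by user_id) d
on c.user_id = d.user_id;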
6. What are each user's three favourite products?
create table user_product (user_id string, product_id string, product_count int)
row format delimited fields terminated by '\t';
insert overwrite table user_product
select
A.user_id,
A.product_id,
count(A.product_id) as product_count
from (
select
o.user_id,
p.product_id
from orders o join priors p on o.order_id = p.order_id ) A group by A.user_id, A.product_id;
insert overwrite local directory "/usr/local/src/data/hive_about/answer_06"
select
A.user_id,
collect_set(case when rank = 1 then A.product_id end)[0],
collect_set(case when rank = 2 then A.product_id end)[0],
collect_set(case when rank = 3 then A.product_id end)[0]
from
(
select
user_id,
product_id,
product_count,
row_number() over (partition by user_id order by product_count desc) rank
from user_product
group by user_id, product_id, product_count
) as A
where A.rank <= 3
group by A.user_id;
--test data for question 6
99962 38944 26
99962 39200 1
99962 39794 4
99962 4066 1
99962 41030 3
99962 41328 3
99962 41544 1
99962 41621 1
99962 42509 10
99962 42585 2
99962 4392 1
99962 44396 7
99962 44422 3
99962 44426 5
99962 44532 3
99962 44873 1
99962 46820 1
99962 47209 1
99962 47223 11
99962 47766 3
99962 4855 12
99962 48679 2
99962 48745 5
99962 5184 1
99962 5250 1
99962 6020 1
99962 6541 1
99982 10108 15
99982 1025 2
99982 11046 1
99982 2091 2
99982 21137 9
99982 21195 1
99982 25489 1
99982 26497 2
99982 28918 1
99982 30391 2
99982 35855 1
99982 36070 1
99982 39121 1
99982 39275 7
99982 43713 6
99982 46709 1
99982 48493 2
99982 5025 5
99982 5322 1
99982 6664 2
99982 7485 1
99982 7746 1
99982 7948 30
99982 8424 12
99983 39275 7
99983 43713 6
99983 46709 1
99983 48493 2
99983 5025 5
99983 5322 1
99983 6664 2
99983 7485 1
99983 7746 1
99983 7948 2
99984 8424 15
99984 7746 1
99984 7948 3
99985 8424 3
create table test (
user_id string,
product_id string,
product_count int
)
row format delimited fields terminated by '\t';
load data local inpath '/usr/local/src/data/hive_about/hive_data_2/test.txt' into table test;
select
A.user_id,
collect_set(case when rank = 1 then A.product_id end)[0],
collect_set(case when rank = 2 then A.product_id end)[0],
collect_set(case when rank = 3 then A.product_id end)[0]
from
(
select
user_id,
product_id,
product_count,
row_number() over (partition by user_id order by product_count desc) rank
from test
group by user_id,product_id,product_count
) as A
where A.rank <= 3
group by A.user_id;
7. An Ant Financial question
create table jd (user_id string, shop string)
row format delimited fields terminated by "\t";
load data local inpath '/usr/local/src/data/hive_about/hive_data_2/jd.txt' into table jd;
u1 a
u2 b
u1 b
u1 a
u3 c
u4 b
u1 a
u2 c
u5 b
u4 b
u6 c
u2 c
u1 a
u2 a
u2 a
u3 a
u5 a
u5 a
u5 a
-- count(distinct) is inefficient (not recommended)
select shop, count(distinct user_id) from jd group by shop;
-- dedupe with group by instead (recommended)
select shop, count(1) from
(select shop, user_id from jd group by shop, user_id) tmp
group by tmp.shop;
8. A variant of question 6: each user's top 10% favourite products.
If a user bought 10 distinct products in total, return the top 1; 20 products, the top 2.
If a user bought 3, then 3 * 10% = 0.3, so still return the top 1; i.e. always at least one.
select
user_id,
product_id,
product_count
from (
select
user_id,
product_id,
product_count,
row_number() over (partition by user_id order by product_count desc) rank,
count(1) over (partition by user_id) user_product_count
from test) t1 where rank <= ceiling(t1.user_product_count * 0.1);
99961 38944 26
99961 42509 10
99961 42509 10
99961 42509 10
99961 38944 26
99961 38944 26
99961 38944 26
99962 4855 12
99962 38944 26
99962 47223 11
99982 8424 12
99982 10108 15
99982 7948 30
99983 39275 7
99984 8424 15
99985 8424 3
9. Partitioned-table exercise: partition orders by order_dow into a table orders_part, then query orders and dynamically insert into orders_part.
1. create the partitioned table
create table orders_part(
order_id string,
user_id string,
eval_set string,
order_number string,
order_hour_of_day string,
days_since_prior_order string)
partitioned by (order_dow string)
row format delimited fields terminated by ',';
2. insert via query (the partition column goes last)
insert into table orders_part partition(order_dow)
select
order_id,
user_id,
eval_set,
order_number,
order_hour_of_day,
days_since_prior_order,
order_dow
from orders;
10. For each product, compute the sales count (pv), the number of buyers (uv), and the reorder count, all in one SQL.
Output shape:
| product id | sales (pv) | buyers (uv) | reorders |
| :-------- | --------: | :------: | :------: |
| product_id | product_pv | product_uv | reordered_cnt |
--build the test table test2 from this query
select user_id, product_id, reordered from orders o left join priors p on o.order_id = p.order_id limit 100;
88772 37053 0
88772 3682 1
88772 9422 0
444 45638 0
444 28849 1
444 32052 1
444 39911 1
444 37687 1
444 35939 1
444 432 1
444 27966 1
444 27104 1
444 13535 1
444 2979 1
444 26369 1
444 11777 1
444 18272 1
444 28985 1
444 49075 1
444 22963 1
444 42265 1
444 30489 1
444 16797 1
444 48679 1
444 30252 1
444 44514 1
444 38400 1
444 25931 1
444 2846 1
444 44683 1
444 21938 1
444 45007 1
444 47626 1
444 2295 1
444 24184 1
444 49683 1
444 13176 1
144784 30489 0
144784 25832 0
144784 30450 1
144784 31506 0
144784 19154 0
144784 20063 0
144784 25948 0
144784 14335 0
144784 23734 0
144784 11140 0
144784 23622 0
144784 28849 0
144784 45819 0
144784 10603 0
144784 17949 0
144784 11818 0
144784 25340 1
144784 10305 1
144784 14333 1
144784 37594 1
157967 2481 0
157967 42423 1
41222 28204 1
41222 20794 1
41222 47766 1
41222 16911 0
41222 13874 0
41222 15803 0
41222 34466 0
41222 11777 0
41222 13176 1
44785 45007 1
44785 46676 1
44785 7781 1
44785 42615 1
44785 11136 1
44785 21454 1
44785 25890 1
44785 18564 1
44785 23029 1
44785 18818 1
44785 11137 1
44785 29432 1
44785 44359 1
44785 24852 1
44785 40545 1
44785 23765 1
44785 36397 1
164031 44570 1
164031 8193 1
164031 48679 0
164031 8277 1
164031 11440 0
164031 42265 0
164031 14992 1
164031 49683 1
164031 41220 1
119023 4137 0
119023 20899 0
119023 2057 0
119023 30016 0
119023 9245 0
119023 2452 0
select t.product_id, count(t.product_id),count(distinct(t.user_id)),sum(t.reordered) from
(select user_id, product_id, reordered from orders o left join priors p on o.order_id = p.order_id limit 1000) t group by t.product_id;
11. For each user, the deduplicated set of products bought, as one string separated by ','.
Two columns, i.e.: user_id product_id1,product_id2,product_id3...
select t.user_id, concat_ws(',', collect_set(t.product_id))
from (select user_id,product_id from test2 group by user_id, product_id) t group by t.user_id;
Practice questions
Question 1
Given the following user visit data:
userId visitDate visitCount
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
U02 2017/1/23 6
U01 2017/2/22 4
Use SQL to compute each user's cumulative visit count, as in this table:
userId month subtotal cumulative
u01 2017-01 11 11
u01 2017-02 12 23
u02 2017-01 12 12
u03 2017-01 8 8
u04 2017-01 3 3
Solution
SELECT
t2.userid,
t2.s_day,
t2.vc,
sum(t2.vc) over(PARTITION BY t2.userid order by t2.s_day ROWS BETWEEN UNBOUNDED PRECEDING and CURRENT ROW) ccc
from (
SELECT
t1.userid,
t1.s_day,
sum(t1.visitcount) vc
from (
SELECT
lower(userid) userid, -- the data mixes u01 and U01; normalize case
from_unixtime(unix_timestamp(visitdate,"yyyy/MM/dd"),"yyyy-MM") s_day, -- MM, not mm (mm is minutes)
visitcount
from first ) t1
group by t1.userid,t1.s_day ) t2;
Question 2 (JD.com)
There are 500K JD shops. Each time a customer visits any product in any shop, an access log row is produced. The log table is Visit, with visitor id user_id and shop name shop. Compute:
1) each shop's UV (distinct visitors)
2) each shop's top-3 visitors by visit count, outputting shop name, visitor id, visit count
Solution
1. select t1.shop, count(1) from (SELECT user_id, shop from Second_Visit group by user_id, shop) t1 group by t1.shop;
2.
select
t2.shop,
t2.user_id,
t2.vc
from (
select
t1.shop,
t1.user_id,
t1.vc,
rank() over (PARTITION by t1.shop order by t1.vc desc) as rank -- desc: "top 3" means the highest counts
from (
select user_id,shop,count(1) vc from Second_Visit group by user_id, shop
) t1
) t2 where t2.rank <= 3;
Question 3
Given a table STG.ORDER with columns: Date, Order_id, User_id, amount.
Write SQL for the stats below; sample row: 2017-01-01,10029028,1000003251,33.57
1) For each month of 2017: order count, user count, total transaction amount.
2) New customers in November 2017 (users whose first-ever order is in November).
Solution
1.
select
count(Order_id) order_count,
count(distinct User_id) user_count,
sum(amount) as amount_sum,
substring(`Date`,1,7) as `month`
from second_order where substring(`Date`,1,4) = '2017' group by substring(`Date`,1,7);
or
select
count(Order_id) order_count,
count(distinct User_id) user_count,
sum(amount) as amount_sum,
from_unixtime(unix_timestamp(`Date`,"yyyy-MM-dd"),"yyyy-MM") -- MM, not mm
from second_order where substring(`Date`,1,4) = '2017' group by from_unixtime(unix_timestamp(`Date`,"yyyy-MM-dd"),"yyyy-MM");
2.
-- a user is new in November iff their earliest order month is 2017-11
select User_id
from second_order
group by User_id
having min(substring(`Date`,1,7)) = '2017-11';
Question 4
Given a 50-million-row user file (user_id, name, age): forth_user,
and a 200-million-row movie-watch log (user_id, url): forth_log.
Rank age brackets by number of movies watched. Key phrase: number of watches.
Solution
select
t3.category,
sum(t3.con) as total
from (
select
case when t2.age>=0 and t2.age<20 then '0-20'
when t2.age>=20 and t2.age<40 then '20-40'
when t2.age>=40 and t2.age<60 then '40-60'
else '60+' end as category
,
t2.con
from (
select
u1.age age ,
t1.con con
from forth_user u1
join
(
select
user_id,
count(*) con
from forth_log
group by user_id
)t1
on u1.user_id=t1.user_id
) t2
) t3
group by t3.category
order by total desc;
Question 5
Given the log below, write code to compute the total count and average age of all users, and of active users
(an active user has visit records on two consecutive days).
date,user,age
11,test_1,23
11,test_2,19
11,test_3,39
11,test_1,23
11,test_3,39
11,test_1,23
12,test_2,19
13,test_1,23
1.
select count(1), sum(age)/count(1) from (select user_id,age from fiveth group by user_id, age) t1;
2.
select
t2.user_id,
t2.age
from (
select
t1.`date`,
t1.user_id,
t1.age,
t1.`date` - rank() over(partition by t1.user_id order by t1.`date`) cz -- equal cz values mark a run of consecutive days
from (
select `date`, user_id , age from fiveth group by `date`, user_id, age
) t1
) t2
group by t2.user_id, t2.age, t2.cz
having count(1) > 1;
-- then wrap this result with count(1) and sum(age)/count(1), as in 1., to get the totals
Question 6
Write SQL for each user's first purchase amount in October this year. Table ordertable has:
buyer userid, amount money, purchase time paymenttime (format: 2017-10-01), order id orderid.
select userid, money, paymenttime, orderid
from (
select
paymenttime,
userid,
money,
orderid,
rank() over(partition by userid order by paymenttime) row_con
from sixth
where substring(paymenttime,1,7) = '2017-10' -- keep only October orders
) t
where row_con = 1; -- the earliest October order per user
Question 7
A library management database has three data models:
Books (table: BOOK)
No. Field Description Type
1 BOOK_ID catalogue number text
2 SORT classification number text
3 BOOK_NAME title text
4 WRITER author text
5 OUTPUT publisher text
6 PRICE unit price numeric (2 decimal places)
Readers (table: READER)
No. Field Description Type
1 READER_ID library card number text
2 COMPANY workplace text
3 NAME name text
4 SEX sex text
5 GRADE professional title text
6 ADDR address text
Borrow records (table: BORROW_LOG)
No. Field Description Type
1 READER_ID library card number text
2 BOOK_ID catalogue number text
3 BORROW_DATE borrow date date
(1) Write the DDL for the BOOK, READER, and BORROW_LOG tables.
(2) Find the names (NAME) and workplaces (COMPANY) of readers surnamed 李.
(3) Find all book titles (BOOK_NAME) and prices (PRICE) from 高等教育出版社, ordered by price descending.
(4) Find the category (SORT), publisher (OUTPUT), and price (PRICE) of books priced between 10 and 20 yuan, ordered by publisher and price, ascending.
(5) Find the names (NAME) and workplaces (COMPANY) of all readers who have borrowed books.
(6) Find the highest, lowest, and average unit price of books from 科学出版社.
(7) Find the names and workplaces of readers who currently have at least 2 books on loan.
(8) For data-safety backups, use one SQL statement to create table BORROW_LOG_BAK under backup user bak with exactly the structure of BORROW_LOG, and copy all existing borrow records into it.
(9) The data is being migrated from Oracle to Hive; write the Hive DDL for BOOK (column delimiter |; data loaded externally; partitions named month_part and day_part).
(10) Hive has a table A; in its month partition 201505, update user_dinner to bonc8920 where user_id is 20000, leaving other users' user_dinner unchanged. List the steps (Hive has no UPDATE syntax; use another route; a sketch follows).
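A hedged sketch for (10), rewriting the partition via insert overwrite; it assumes A's non-partition columns are just user_id and user_dinner:
insert overwrite table A partition (month_part = '201505')
select
  user_id,
  case when user_id = 20000 then 'bonc8920' else user_dinner end as user_dinner -- only this user changes
from A
where month_part = '201505';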
Question 8
An online server's access log looks like this (answer in SQL):
time interface ip
2016-11-09 11:22:05 /api/user/login 110.23.5.33
2016-11-09 11:23:10 /api/user/detail 57.3.2.16
2016-11-09 23:59:40 /api/user/login 200.6.5.166
Find the top-10 ip addresses hitting /api/user/login between 14:00 and 15:00 on Nov 9.
select t1.ip, t1.ct from (
select ip, count(1) ct from eight_log where interface = '/api/user/login' and date_format(`date`,"yyyy-MM-dd HH") = "2016-11-09 14" group by ip -- group by ip (assumed column name), since we rank ips
) t1 order by t1.ct desc limit 10;
Question 9
Given a top-up log table:
CREATE TABLE `credit_log`
(
`dist_id` int(11) DEFAULT NULL COMMENT 'district group id',
`account` varchar(100) DEFAULT NULL COMMENT 'account',
`money` int(11) DEFAULT NULL COMMENT 'top-up amount',
`create_time` datetime DEFAULT NULL COMMENT 'order time'
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
Write SQL to find, for 2015-07-09, the account with the largest top-up in each district group, outputting:
dist_id, account, money, create_time
select
t1.dist_id,
t1.account,
t1.money,
t1.create_time
from (
select
dist_id,
account,
rank() over(partition by dist_id order by money desc) rank,
money,
create_time
from nine_log where date_format(create_time,"yyyy-MM-dd") = "2015-07-09"
) t1
where t1.rank = 1;
Question 10: omitted
Question 11
1) Three tables: members (member), sales (sale), and returns (regoods):
(1) member has memberid (member id, primary key) and credits (points);
(2) sale has memberid (member id, foreign key) and purchase amount (MNAccount);
(3) regoods has memberid (member id, foreign key) and refund amount (RMNAccount);
2) Business rules:
(1) sale rows may come from members or non-members (so sale.memberid may be empty)
(2) one member may have multiple purchase rows
(3) regoods rows may come from members or non-members
(4) one member may have one or more return rows
Requirement: aggregate each member's purchase amount from sale and refund amount from regoods,
then write purchase minus refund into the member table's credits field for matching memberids (a sketch follows).
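A hedged Hive-style sketch (no UPDATE available, so the member table is rebuilt; assumes its columns are exactly memberid and credits, and that empty memberids are excluded):
insert overwrite table member
select
  m.memberid,
  nvl(s.mn, 0) - nvl(r.rmn, 0) as credits -- purchase minus refund, nulls treated as 0
from member m
left join (select memberid, sum(MNAccount) mn from sale where memberid is not null and memberid <> '' group by memberid) s
  on m.memberid = s.memberid
left join (select memberid, sum(RMNAccount) rmn from regoods where memberid is not null and memberid <> '' group by memberid) r
  on m.memberid = r.memberid;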
Question 12 (Baidu)
Given three tables, student, course, and score, with the structures below:
create table student
(
id bigint comment "student id",
name string comment "name",
age bigint comment "age"
);
create table course
(
cid string comment "course id, 001/002 format",
cname string comment "course name"
);
create table score
(
id bigint comment "student id",
cid string comment "course id",
score bigint comment "score"
) partitioned by(event_day string);
score.id and score.cid reference the corresponding columns of student and course. Given the structures above, answer:
1) Load the local file /home/users/test/20190301.csv into partition 20190301 of score, overwriting the previous data
2) Find the name, age, and average score of students whose average score is above 60
3) Find the name and age of students who have no score for course '001'
4) For courses '001' and '002', find the names and ages of the top-3 students by score
5) Create a new table score_20190317 and store the data of score's 20190317 partition in it
6) If score's uid is skewed, optimize: over 20190101-20190317, find student name, age, course, and the course's average score
7) Describe the difference between union and union all, and how their usage differs between MySQL and HQL
8) Describe where lateral view fits in HQL and give an example (a sketch follows the answer to 1)
1. load data local inpath "/home/users/test/20190301.csv" overwrite into table score partition (event_day='20190301');
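For 8), a hedged sketch in the spirit of the movies example earlier: lateral view pairs with a table-generating function such as explode() to flatten an array (or map) column into rows, e.g.:
select movie, category_name
from movies lateral view explode(category) tmp as category_name; -- one row per (movie, category)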