hive窗口函数练习

242 阅读4分钟

数据准备

```sql
create table  if not exists temp_ljy as

-- insert overwrite table  temp_ljy 
select 'jack' as name,  '2017-01-01'   as orderdate, 10 as cost
union all               
select 'tony' as name,  '2017-01-02'   as orderdate, 15  as cost
union all               
select 'jack' as name,  '2017-02-03'   as orderdate,  23  as cost
union all               
select 'tony' as name,  '2017-01-04'   as orderdate,  29  as cost
union all               
select 'jack' as name,  '2017-01-05'   as orderdate,  46  as cost
union all               
select 'jack' as name,  '2017-04-06'   as orderdate,  42  as cost
union all               
select 'tony' as name,  '2017-01-07'   as orderdate,  50  as cost
union all               
select 'jack' as name,  '2017-01-08'   as orderdate,  55  as cost
union all               
select 'mart' as name,  '2017-04-08'   as orderdate,  62  as cost
union all               
select 'mart' as name,  '2017-04-09'   as orderdate,  68  as cost
union all               
select 'neil' as name,  '2017-05-10'   as orderdate,  12  as cost
union all               
select 'mart' as name,  '2017-04-11'   as orderdate,   75 as cost
union all               
select 'neil' as name,  '2017-06-12'   as orderdate,   80 as cost
union all               
select 'mart' as name,  '2017-04-13'   as orderdate,   90 as cost
;
```

count、sum、MIN、MAX、AVG

select 
   name,orderdate,cost,
   -- 所有行相加
   sum(cost) over() as c,
   -- 按name分组,组内相加
   sum(cost) over(partition by name) as c1,
   -- (默认起点到当前行相加)按name分组,orderdate排序,组内相加
   sum(cost) over(partition by name order by orderdate) as c2,
   -- 起点到当前行的
   sum(cost) over(partition by name order by orderdate rows between unbounded preceding and current row) as c3,
   -- 当前行+前面2行
   sum(cost) over(partition by name order by orderdate rows between 2 preceding and current row) as c4,
   -- 当前行+后面2行
   sum(cost) over(partition by name order by orderdate rows between current row and 2 following) as c5,
   -- 前面2行+当前行+后面2行
   sum(cost) over(partition by name order by orderdate rows between 2 preceding and 2 following) as c6
from 
   temp_ljy;

first_value与last_value

select 
   name,orderdate,cost,
   row_number() over(partition by name order by cost) c1,
   -- 正序时:当前行到第一个值之间,第一个值
   first_value(cost) over(partition by name order by cost) c2,
   -- 正序时:当前行到左后一个值之间,最后一个值
   last_value(cost) over(partition by name order by cost) c3,
   -- 倒序时:当前行到第一个值之间,第一个值
   first_value(cost) over(partition by name order by cost desc)c4,
   -- 倒序时:当前行到最后一个值之间,最后一个值
   last_value(cost) over(partition by name order by cost desc) c5

from
   temp_ljy;

lead与lag(位移函数)

select 
   name,orderdate,cost,
   -- 分组内当前行,往后第一行的值(不包含当前行)
   lead(cost) over(partition by name order by cost) c1,
   -- 分组内当前行,往后第二行的值(不包含当前行)
   lead(cost,2) over(partition by name order by cost)c2 ,
   -- 分组内当前行,往后第二行的值(不包含当前行),如果为null,则用9999代替
   lead(cost,2,9999) over (partition by name order by cost) c3,
   -- 分组当前行,往前第一行的值(不包括当前行)
   lag(cost) over (partition by name order by cost) c4,
   -- 分组当前行,往前第二行(不包括当前行)
   lag(cost,2) over (partition by name order by cost) c5,
   lag(cost,2,-1) over(partition by name order by cost) c6
from
   temp_ljy;

RANK、ROW_NUMBER、DESCE_RANK排序函数

select
name,orderdate,cost,c,
--自然序号排序,不跳数,不重复'
ROW_NUMBER()over(partition by name order by c)c1,
-- 排序相同,中间会跳数,总数不变
RANK()over(partition by name order by c)c2,
-- 排序相同,中间不会跳数,总数会减少
DENSE_RANK()over(partition by name order by c)c3
from(
select name,orderdate,cost,date_format(orderdate,'yyyyMM') c
from temp_ljy)t1;

NTILE分组函数

-- NTILE
select name,orderdate,cost,
-- 将组内数据分成1片
ntile(1) over(partition by name order by orderdate) c1,
-- 将组内数据分成2片
ntile(2) over(partition by name order by orderdate) c2,
-- 将组内数据分成3片
ntile(3) over(partition by name order by orderdate) c3,
-- 将组内数据分成4片
ntile(4) over(partition by name order by orderdate) c4
from
    temp_ljy;
-- 注意:如果切片不均匀,默认增加第一个切片的分布
-- 例如:求20%的数据(按时间排序)
select 
    *
from
    (select 
        name,orderdate,cost,
        -- 查询20%时间的订单
        ntile(5) over(order by orderdate) c
    from
        temp_ljy
    )T
where c=1;

cume_dist和percent_rank百分比函数

--cume_dist
select name,orderdate,cost,
-- 不分组,所有数据为一组,当前行占总行数的比例,
--第一行:1/14=0.07142857142857142
--第二行:2/14=0.14285714285714285
cume_dist() over(order by orderdate),
-- 组内,计算当前行的行数/组内总行数
cume_dist() over(partition by name order by orderdate)
from temp_ljy;


--percent_rank
select name,orderdate,cost,
-- 按name分组,组内的行数
sum(1) over(partition by name),
-- 所有数据,按时间排序,排名
rank() over(order by orderdate),
-- (排名-1)/(总行数-1)
-- 第一行:排名1,(1-1)/(14-1) = 0
-- 第二行:排名4,(4-1)/(14-1) = 0.23076
-- 第三行:排名6,(3-1)/(14-1) = 0.3846
percent_rank() over(order by orderdate),
-- (组内当前行-1)/(当前组总行-1)
-- 第1行:(1-1)/(5-1)=0
-- 第2行:(2-1)/(5-1)=0.25
-- 第3行:(3-1)/(5-1)=0.5
PERCENT_RANK()over(partition by name order by orderdate)
from temp_ljy;

本文源自blog.csdn.net/hyunbar/art…