hive sql

230 阅读5分钟

1. 使用SQL统计出每个用户的累积访问次数


我们有如下的用户访问数据
    userId  visitDate   visitCount
    u01 2017/1/21   5
    u02 2017/1/23   6
    u03 2017/1/22   8
    u04 2017/1/20   3
    u01 2017/1/23   6
    u01 2017/2/21   8
    U02 2017/1/23   6
    U01 2017/2/22   4
要求使用SQL统计出每个用户的累积访问次数,如下表所示:
    用户id    月份  小计  累积
    u01 2017-01 11  11
    u01 2017-02 12  23
    u02 2017-01 12  12
    u03 2017-01 8   8
    u04 2017-01 3   3
CREATE TABLE test_sql.test1 ( 
        userId string, 
        visitDate string,
        visitCount INT )
    ROW format delimited FIELDS TERMINATED BY "\t";
    INSERT INTO TABLE test_sql.test1
    VALUES
        ( 'u01', '2017/1/21', 5 ),
        ( 'u02', '2017/1/23', 6 ),
        ( 'u03', '2017/1/22', 8 ),
        ( 'u04', '2017/1/20', 3 ),
        ( 'u01', '2017/1/23', 6 ),
        ( 'u01', '2017/2/21', 8 ),
        ( 'u02', '2017/1/23', 6 ),
        ( 'u01', '2017/2/22', 4 );
        
SELECT t2.userid,
       t2.visitmonth,
       subtotal_visit_cnt,
       sum(subtotal_visit_cnt) over (partition BY userid ORDER BY visitmonth) AS total_visit_cnt
FROM
  (SELECT userid,
          visitmonth,
          sum(visitcount) AS subtotal_visit_cnt
   FROM
     (SELECT userid,
             date_format(regexp_replace(visitdate,'/','-'),'yyyy-MM') AS visitmonth,
             visitcount
      FROM test_sql.test1) t1
   GROUP BY userid,
            visitmonth)t2
ORDER BY t2.userid,
         t2.visitmonth;

2. 尚硅谷hive

2.1 数据结构

image.png image.png

2.2 创建表
create table gulivideo_ori(

videoId string,
   uploader string,
   age int,
   category array<string>,
   length int,
   views int,
   rate float,
   ratings int,
   comments int,
   relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
//创建原始数据表: gulivideo_user_ori
create table gulivideo_user_ori(
   uploader string,
   videos int,
   friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
2.3 创建带压缩格式的表
create table gulivideo_orc(
   videoId string,
   uploader string,
   age int,
   category array<string>,
   length int,
   views int,
   rate float,
   ratings int,
   comments int,
   relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
(2)gulivideo_user_orc
(3)向 ori 表插入数据 (4)向 orc 表插入数据
create table gulivideo_user_orc(
   uploader string,
   videos int,
   friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");
2.4 sql
2.4.1 统计视频观看数top10
SELECT
    videoId,
    views 
FROM
    gulivideo_orc
ORDER BY
    views DESC
LIMIT 10;
2.4.2 统计视频类别热度 Top10
SELECT
    t1.category_name ,
    COUNT(t1.videoId) hot
FROM 
    ( 
    SELECT
        videoId,
        category_name
    FROM
        gulivideo_orc
    lateral VIEW explode(category) gulivideo_orc_tmp AS category_name
    ) t1
GROUP BY
    t1.category_name
ORDER BY 
    hot
DESC
LIMIT 10;
2.4.3 统计出视频观看数最高的 20 个视频的所属类别以及类别包含Top20 视频的个数
SELECT 
    t2.category_name,
    COUNT(t2.videoId) video_sum
FROM 
    (
    SELECT
        t1.videoId,
        category_name
    FROM 
        (
        SELECT 
            videoId, 
            views ,
            category 
	FROM 
            gulivideo_orc
	ORDER BY 
            views 
            DESC 
	LIMIT 20 
	) t1
	lateral VIEW explode(t1.category) t1_tmp AS category_name
	) t2
GROUP BY t2.category_name;
2.4.4 统计视频观看数 Top50 所关联视频的所属类别排序
SELECT
    t6.category_name,
    t6.video_sum,
    rank() over(ORDER BY t6.video_sum DESC ) rk
FROM
    (
    SELECT
        t5.category_name,
        COUNT(t5.relatedid_id) video_sum
    FROM
        (
        SELECT
            t4.relatedid_id,
            category_name
        FROM
            (
            SELECT 
                t2.relatedid_id ,
                t3.category 
            FROM 
                (
                SELECT 
                relatedid_id
                FROM 
                    (
                    SELECT 
                        videoId, 
                        views,
                        relatedid 
                    FROM 
                        gulivideo_orc
                    ORDER BY
                        views 
                    DESC 
                    LIMIT 50
                    )t1
                lateral VIEW explode(t1.relatedid) t1_tmp AS relatedid_id
                )t2 
            JOIN 
                gulivideo_orc t3 
            ON 
                t2.relatedid_id = t3.videoId 
            ) t4 
        lateral VIEW explode(t4.category) t4_tmp AS category_name
        ) t5
    GROUP BY
        t5.category_name
    ORDER BY 
        video_sum
    DESC 
    ) t6;
2.4.5 统计每个类别中的视频热度 Top10,以 Music 为例
SELECT
    t1.videoId,
    t1.category_name,
    t1.views
FROM
    (	
    SELECT
    	videoId,
    	views,
    	category_name
    FROM	
        gulivideo_orc
    lateral VIEW explode(category) gulivideo_orc_tmp AS category_name
    ) t1
where t1.category_name = "Music"
ORDER BY
    t1.views
DESC
LIMIT
    10;
2.4.6 统计每个类别视频观看数 Top10
SELECT
    t2.videoId,
    t2.views,
    t2.category_name,
    t2.rk
FROM 
    (
    SELECT 
        t1.videoId,
        t1.views,
        t1.category_name,
        rank() over(PARTITION BY t1.category_name ORDER BY t1.views DESC ) rk
    FROM 
        (
        SELECT
            videoId,
            views,
            category_name
        FROM 
    	    gulivideo_orc
        lateral VIEW explode(category) gulivideo_orc_tmp AS category_name
        )t1
    )t2
WHERE t2.rk <=10;
2.4.7 统计上传视频最多的用户 Top10以及他们上传的视频观看次数在前 20 的视频
SELECT
 t2.videoId,
 t2.views,
 t2.uploader
FROM
    (
    SELECT 
        uploader,
        videos
    FROM 
	    gulivideo_user_orc 
    ORDER BY 
        videos
    DESC
    LIMIT 10 
    ) t1
JOIN gulivideo_orc t2 
ON t1.uploader = t2.uploader
ORDER BY 
 t2.views 
DESC
LIMIT 20;

3. 练习

3.1 数据结构
name,orderdate,cost

jack,2017-01-01,10

tony,2017-01-02,15

jack,2017-02-03,23

tony,2017-01-04,29

jack,2017-01-05,46

jack,2017-04-06,42

tony,2017-01-07,50

jack,2017-01-08,55

mart,2017-04-08,62

mart,2017-04-09,68

neil,2017-05-10,12

mart,2017-04-11,75

neil,2017-06-12,80

mart,2017-04-13,94
3.2 建表
create table business(
name string,
orderdate string,
cost int
) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',';
load data local inpath "/opt/xkxin/business.txt" into table 
business;
3.3 需求
3.3.1 查询在 2017 年 4 月份购买过的顾客及总人数
SELECT 
    name,
    count(*) over () 
from 
    business
where substring(orderdate,1,7) = '2017-04'
group by 
    name;

1626684126(1).png

3.3.2 查询顾客的购买明细及月购买总额
SELECT 
    name,
    orderdate,
    cost,
    sum(cost) over(partition by month(orderdate))
from 
    business;

image.png

3.3.3 将每个顾客的 cost 按照日期进行累加
select name,orderdate,cost,

sum(cost) over() as sample1,--所有行相加

sum(cost) over(partition by name) as sample2,--按 name 分组,组内数据相加

sum(cost) over(partition by name order by orderdate) as sample3,--按 name

分组,组内数据累加

sum(cost) over(partition by name order by orderdate rows between

UNBOUNDED PRECEDING and current row ) as sample4 ,--和 sample3 一样,由起点到

当前行的聚合

sum(cost) over(partition by name order by orderdate rows between 1

PRECEDING and current row) as sample5, --当前行和前面一行做聚合

sum(cost) over(partition by name order by orderdate rows between 1

PRECEDING AND 1 FOLLOWING ) as sample6,--当前行和前边一行及后面一行

sum(cost) over(partition by name order by orderdate rows between current

row and UNBOUNDED FOLLOWING ) as sample7 --当前行及后面所有行

from business;
rows 必须跟在 order by 子句之后,对排序的结果进行限制,使用固定的行数来限制分

区中的数据行数量

sample3 结果:

1626684484(1).png

3.3.4 查看顾客上次的购买时间
SELECT 
    name,
    orderdate,
    cost,
    lag(orderdate,1,'1900-01-01') over(partition by name order by orderdate ) as time1, 
    lag(orderdate,2) over (partition by name order by orderdate) as time2
from business;

image.png

3.3.5 查询前 20%时间的订单信息
SELECT 
    * 
from 
    (
    select 
        name,
        orderdate,
        cost, 
        ntile(5) over(order by orderdate) sorted
    from 
        business
) t
where 
    sorted = 1;