ODS层——原始数据存储
创建user_info表
CREATE TABLE user_info(
user_id int,
age_between varchar(20),
sex int,
user_level int,
reg_time date
)
-- 定义分隔符
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
-- 装载数据
load data inpath '/user/J15546620791/User_info.csv' into table user_info;
查询结果:
BUG:此时的数据第一行格式不对,具体数据变成了表头, age_between列变成了乱码,regtime列变成了null。
解决办法: 重新装载数据,可此时user/J155xxxxxxxx/目录下已经没有user_info文件了,它被移动到HDFS上了,为了不重新上传浪费时间,我们直接去apps/xxx/xxx目录下去寻找此文件,然后拷贝到原文件夹下。(实操找不到那个文件目录,重新上传)
drop table user_info;
CREATE TABLE user_info(
user_id int,
age_between varchar(20),
sex int,
user_level int,
reg_time date
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
-- 去掉第一行数据,告诉第一行是列头
TBLPROPERTIES ('skip.header.line.count' = '1');
load data inpath '/user/J15546620791/User_info.csv' overwrite into table user_info;
select * from user_info limit 10;
-- 解决hive中文输出乱码的问题
alter table user_info set serdeproperties('serialization.encoding'='GBK');
-- 修改表结构,更改列名,列类型
alter table user_info change column reg_time reg_time varchar(50);
select * from user_info limit 10;
查询结果:
此时格式正确。
将reg_time更改成日期格式:
unix_timestamp:将字段改成时间戳
from_unixtime:改成字符串格式
to_date:转成日期格式
select *,to_date(from_unixtime(unix_timestamp(reg_time,'yyyy/mm/dd'))) as reg from user_info limit 10;
创建user_action表
create table user_action(
user_id int,
sku_id string,
action_time Timestamp,
model_id int,
type int,
cate int,
brand int
)
row format delimited fields terminated by ',';
tblproperties("skip.header.line.count"="1");
load data inpath '/user/J15546620791/user_Action_201602.csv' into table user_action;
load data inpath '/user/J15546620791/user_Action_201603.csv' into table user_action;
load data inpath '/user/J15546620791/user_Action_201604.csv' into table user_action;
DWD层——数据清洗
第一步:创建新数据库create database DWS_XHS
第二步:创建新的数据表
创建新的user_info表
create table user_info as
select user_id,
age_between,
case when sex = 1 then '女' when sex = 0 then '男' else '保密' end,
user_level,
to_date(from_unixtime(uxin_timestamp(reg_time))) as reg_time,
from ods_xhs.user_info;
创建新的user_action表
create table User_action as
select user_id, sku_id ,
action_time,
type,
cate,
brand
from ods_shop.user_action;
DWS层——轻度汇总
第一步:创建数据库create database DWS_XHS
第二步:创建新的数据表
-
统计各用户每天各行为次数
create table user_date_action as select user_id,to_date(action_time) as action_date, sum(if(type=1,1,0)) as browes, sum(if(type=2,1,0)) as addCart sum(if(type=3,1,0)) as delCart, sum(if(type=4,1,0)) as orders, sum(if(type=5,1,0)) as addFriend, sum(if(type=6,1,0)) as click from DWD_XHS.user_action group by user_id,to_date(action_time)
-
统计每人活跃日期
create table user_date as select user_id,action_date from user_date_action group by user_id,action_date
-
用户信息表直接复制
create table user_info as select * from DWD_XHS.user_info
APP层——用户状态变化
第一步:创建新数据库
create database APP_XHS
第二步:统计用户状态变化,创建表
注:统计用户状态主要参考三个日期:注册日期,最后活跃日期,统计日期
-- 要统计的历史日期和所有用户的迪卡尔积表
create table user_cross as
select user_id,reg_time,action_date,
-- 用户最大活跃日期
(select max(action_date) from DWS_XHS.user_date where user_id = users.user_id and action_date <= riqi.action_date) as max_date
from (select action_date from dws_shop.user_date group by action_date) as riqi,
(select user_id,reg_time from dws_shop.user_info group by user_id,reg_time) as users
-- 统计user_date_cross中的各用户状态
create table user_status as
select *
case when stats_time = reg_time then 'NewUser' -- 注册用户
when day(max_date - reg_time <=7 or max_date is null) and day(stats_time - reg_time <= 14) then 'inaction' -- 未激活用户
when day(max_date - reg_time >7) and day(stats_time - max_date <=7) then 'action' -- 激活用户
when day(max_date - reg_time >7) and day(stats_time - max_date >14) and day(stats_time - max_time <=21) or (day(stats_time - reg_time >14) and day(max_date - reg_time <=7) or max_date is null)) then 'sleep' --睡眠用户
when (day(max_action-reg_time)>7 and day(stats_date-max_action)>21 ) or (day(stats_date-reg_time)>28 and (day(max_action-reg_time)<=7 or max_action is null )) then "lose_user" -- 流失用户
end as user_typed
from user_date_cross
-- 统计每天用户状态数
select action_date,user_typed,count(*) as user_count
from user_status
group by action_date,user_typed
第三步:每天新统计数据 (如添加2016年5月1号数据)
insert into user_status
select *,
case when stats_date = reg_time then "new_user"
when (day(max_action-reg_time)<=7 or max_action is null) and day(stats_date-reg_time)<=14 then "inaction_user"
when day(max_action-reg_time)>7 and day(stats_date-max_action)<=7 then "action_user"
when (day(max_action-reg_time)>7 and day(stats_date-max_action)>7 and day(stats_date-max_action)<=21) or (day(stats_date-reg_time)>14 and (day(max_action-reg_time)<=7 or max_action is null )) then "sleep_user"
when (day(max_action-reg_time)>7 and day(stats_date-max_action)>21 ) or (day(stats_date-reg_time)>28 and (day(max_action-reg_time)<=7 or max_action is null )) then "lose_user"
end as user_typed
from (
select to_date('2016-5-1') as stats_date,user_id,reg_time,
(select max(action_date) from dws_shop.user_date where user_id =
user_info.user_id and action_date < to_date('2016-5-1') ) max_action
from dws_shop.user_info ) as Sub