一、数据来源
阿里云,时间区间为2017-11-25到2017-12-03,大小为3.5G,包含5个字段。
二、字段定义
| 列名称 | 说明 |
|---|---|
| 用户ID | 整数类型,序列化后的用户ID |
| 商品ID | 整数类型,序列话后的商品ID |
| 商品类目ID | 整数类型,序列话后的商品所属类目ID |
| 行为类型 | 字符串,枚举类型,包括('pv','buy','cart','fav') |
| 时间戳 | 行为时间发生的时间戳 |
用户的四种行为类型:
| 行为类型 | 说明 |
|---|---|
| pv | 商品详情页pv,等价于点击 |
| buy | 商品购买 |
| cart | 将商品加入购物车 |
| fav | 收藏商品 |
三、数据处理
因为Spark部署在阿里云服务器的私网下的,但我搜了下想要直连私网要vpn,且好贵。。因此接下来的sql处理都是在spark-sql的交互界面下进行。
-- 建表
CREATE EXTERNAL TABLE user_behavior (
`user_id` string comment '用户ID',
`item_id` string comment '商品ID',
`category_id` string comment '商品类目ID',
`behavior_type` string comment '行为类型,枚举类型,包括(pv, buy, cart, fav)',
`timestamp` int comment '行为时间戳',
`datetime` string comment '行为时间')
row format delimited
fields terminated by ','
lines terminated by '\n'
LOCATION '/data/aliyun';
查询前10条数据
select * from user_behavior_raw limit 10;
CREATE TABLE user_behavior
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION '/data/aliyun'
AS
SELECT
user_id,
item_id,
category_id,
behavior_type,
timestamp,
from_unixtime(timestamp, 'yyyy-MM-dd HH:mm:ss') as datetime,
from_unixtime(timestamp, 'yyyy-MM-dd') as dt
FROM user_behavior_raw;
查看数据总数
select count(*) from user_behavior;
按日期查询
SELECT
dt, -- 日期字段
COUNT(*) AS daily_record_count -- 每日数据总条数
FROM
user_behavior
GROUP BY
dt
ORDER BY
dt;
可以看到还是存在很多异常数据的,这些日期与阿里云锁描述的2017年11月25日至2017年12月3日之间不符。
四、数据清洗
进行如下四个步骤:去重,删除异常值
-- 数据去重
INSERT OVERWRITE TABLE user_behavior
SELECT
user_id,
item_id,
category_id,
behavior_type,
timestamp,
datetime,
dt
FROM
user_behavior
GROUP BY
user_id,
item_id,
category_id,
behavior_type,
timestamp,
datetime,
dt;
-- 数据清洗,去掉时间异常的数据
INSERT OVERWRITE TABLE user_behavior
SELECT
user_id,
item_id,
category_id,
behavior_type,
timestamp,
datetime,
dt
FROM
user_behavior
WHERE
cast(datetime as date) between '2017-11-25' and '2017-12-03';
清洗完成后再看数据分布:
-- 按日期统计每天的数据量
SELECT
dt,
COUNT(*) AS daily_record_count -- 统计每天的总记录数
FROM user_behavior_with_dt -- 替换为你的表名(如 user_behavior_partitioned)
GROUP BY dt
ORDER BY dt; -- 按日期排序
可见每日的数据量差别不大,创建分区表缓解压力
INSERT INTO user_behavior_partitioned PARTITION (dt)
SELECT
user_id,
item_id,
category_id,
behavior_type,
timestamp,
datetime,
to_date(datetime) AS dt -- 分区字段值
FROM user_behavior;
五、数据分析
1、日均访问量,日均用户量
--日均访问量,日均用户量
select cast(datetime as date) as day,
sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,
count(distinct user_id) as uv
from user_behavior_partitioned
group by cast(datetime as date)
order by day;
2、记录每个用户的购物情况,加工存储到user_behavior_count中
-- 创建外部表,指定 Parquet 列式存储
CREATE EXTERNAL TABLE user_behavior_count (
user_id string,
pv int,
fav int,
cart int,
buy int
)
USING parquet
LOCATION '/data/user_behavior_count';
INSERT INTO user_behavior_count
SELECT
user_id,
sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,
sum(case when behavior_type = 'fav' then 1 else 0 end) as fav,
sum(case when behavior_type = 'cart' then 1 else 0 end) as cart,
sum(case when behavior_type = 'buy' then 1 else 0 end) as buy
FROM user_behavior_partitioned
GROUP BY user_id;
查询复购率,即产生两次或两次以上购买的用户占购买用户的比例
select sum(case when buy > 1 then 1 else 0 end) / sum(case when buy > 0 then 1 else 0 end)
from user_behavior_count;
3、用户行为转换率
--点击/(加购物车+收藏)/购买 , 各环节转化率
select a.pv,
a.fav,
a.cart,
a.fav + a.cart as `fav+cart`,
a.buy,
round((a.fav + a.cart) / a.pv, 4) as pv2favcart,
round(a.buy / (a.fav + a.cart), 4) as favcart2buy,
round(a.buy / a.pv, 4) as pv2buy
from(
select sum(pv) as pv, --点击数
sum(fav) as fav, --收藏数
sum(cart) as cart, --加购物车数
sum(buy) as buy --购买数
from user_behavior_count
) as a;
4、用户行为习惯
-- 一天的活跃时段分布
select hour(datetime) as hour,
sum(case when behavior_type = 'pv' then 1 else 0 end) as pv, --点击数
sum(case when behavior_type = 'fav' then 1 else 0 end) as fav, --收藏数
sum(case when behavior_type = 'cart' then 1 else 0 end) as cart, --加购物车数
sum(case when behavior_type = 'buy' then 1 else 0 end) as buy --购买数
from user_behavior
group by hour(datetime)
order by hour;
--一周用户的活跃分布
select pmod(datediff(datetime, '1920-01-01') - 3, 7) as weekday,
sum(case when behavior_type = 'pv' then 1 else 0 end) as pv, --点击数
sum(case when behavior_type = 'fav' then 1 else 0 end) as fav, --收藏数
sum(case when behavior_type = 'cart' then 1 else 0 end) as cart, --加购物车数
sum(case when behavior_type = 'buy' then 1 else 0 end) as buy --购买数
from user_behavior
where date(datetime) between '2017-11-27' and '2017-12-03'
group by pmod(datediff(datetime, '1920-01-01') - 3, 7)
order by weekday;
基于RFM模型找出有价值的用户
RFM模型基于三个要素衡量客户价值和客户创利能力:
- R-Recency(最近一次购买时间)
- F-Frequency(消费频率)
- M-Money(消费金额)
--R(最近一次购买时间),越高越说明用户活跃
select user_id, datediff('2017-12-04',max(datetime)) as R,
dense_rank() over(order by datediff('2017-12-04', max(datetime))) as R_rank
from user_behavior_partitioned
where
behavior_type = 'buy' -- 只筛选购买行为
and dt between '2017-11-25' and '2017-12-03'
group by user_id
limit 10;
select user_id,
count(1) as F,
dense_rank() over(order by count(1) desc) as F_rank
from user_behavior_partitioned
where behavior_type = 'buy'
group by user_id
limit 10;
对用户按照排名进行分组打分,共分为5组,前0/5
with cte as(
select user_id,
datediff('2017-12-04', max(datetime)) as R,
dense_rank() over(order by datediff('2017-12-04', max(datetime))) as R_rank,
count(1) as F,
dense_rank() over(order by count(1) desc) as F_rank
from user_behavior
where behavior_type = 'buy'
group by user_id)
select user_id, R, R_rank, R_score, F, F_rank, F_score, R_score + F_score AS score
from(
select *,
case ntile(5) over(order by R_rank) when 1 then 5
when 2 then 4
when 3 then 3
when 4 then 2
when 5 then 1
end as R_score,
case ntile(5) over(order by F_rank) when 1 then 5
when 2 then 4
when 3 then 3
when 4 then 2
when 5 then 1
end as F_score
from cte
) as a
order by score desc
limit 20;
--销量最高的商品
select item_id ,
sum(case when behavior_type = 'pv' then 1 else 0 end) as pv, --点击数
sum(case when behavior_type = 'fav' then 1 else 0 end) as fav, --收藏数
sum(case when behavior_type = 'cart' then 1 else 0 end) as cart, --加购物车数
sum(case when behavior_type = 'buy' then 1 else 0 end) as buy --购买数
from user_behavior
group by item_id
order by buy desc
limit 10;
--销量最高的商品大类
select category_id ,
sum(case when behavior_type = 'pv' then 1 else 0 end) as pv, --点击数
sum(case when behavior_type = 'fav' then 1 else 0 end) as fav, --收藏数
sum(case when behavior_type = 'cart' then 1 else 0 end) as cart, --加购物车数
sum(case when behavior_type = 'buy' then 1 else 0 end) as buy --购买数
from user_behavior
group by category_id
order by buy desc
limit 10;
因为商品维度较少,缺少分析价值。