大数据实战

23 阅读6分钟

一、数据来源

阿里云,时间区间为2017-11-25到2017-12-03,大小为3.5G,包含5个字段。

二、字段定义

列名称说明
用户ID整数类型,序列化后的用户ID
商品ID整数类型,序列话后的商品ID
商品类目ID整数类型,序列话后的商品所属类目ID
行为类型字符串,枚举类型,包括('pv','buy','cart','fav')
时间戳行为时间发生的时间戳

用户的四种行为类型:

行为类型说明
pv商品详情页pv,等价于点击
buy商品购买
cart将商品加入购物车
fav收藏商品

三、数据处理

因为Spark部署在阿里云服务器的私网下的,但我搜了下想要直连私网要vpn,且好贵。。因此接下来的sql处理都是在spark-sql的交互界面下进行。

-- 建表
CREATE EXTERNAL TABLE user_behavior (
`user_id` string comment '用户ID',
`item_id` string comment '商品ID',
`category_id` string comment '商品类目ID',
`behavior_type` string  comment '行为类型,枚举类型,包括(pv, buy, cart, fav)',
`timestamp` int comment '行为时间戳',
`datetime` string comment '行为时间')
row format delimited
fields terminated by ','
lines terminated by '\n'
LOCATION '/data/aliyun'; 

在这里插入图片描述

查询前10条数据

select * from user_behavior_raw limit 10;

在这里插入图片描述

CREATE TABLE user_behavior
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION '/data/aliyun'
AS
SELECT
  user_id,
  item_id,
  category_id,
  behavior_type,
  timestamp,
  from_unixtime(timestamp, 'yyyy-MM-dd HH:mm:ss') as datetime,
  from_unixtime(timestamp, 'yyyy-MM-dd') as dt
FROM user_behavior_raw;

image-20251007163358799 在这里插入图片描述

查看数据总数

select count(*) from user_behavior;

在这里插入图片描述

按日期查询

SELECT 
    dt,  -- 日期字段
    COUNT(*) AS daily_record_count  -- 每日数据总条数
FROM 
    user_behavior
GROUP BY 
    dt  
ORDER BY 
    dt;  

在这里插入图片描述

可以看到还是存在很多异常数据的,这些日期与阿里云锁描述的2017年11月25日至2017年12月3日之间不符。

四、数据清洗

进行如下四个步骤:去重,删除异常值

-- 数据去重
INSERT OVERWRITE TABLE user_behavior
SELECT 
    user_id,
    item_id,
    category_id,
    behavior_type,
    timestamp,
    datetime,
    dt 
FROM 
    user_behavior
GROUP BY 
    user_id,
    item_id,
    category_id,
    behavior_type,
    timestamp,
    datetime,
    dt;
​
-- 数据清洗,去掉时间异常的数据
INSERT OVERWRITE TABLE user_behavior
SELECT 
    user_id, 
    item_id, 
    category_id, 
    behavior_type, 
    timestamp, 
    datetime,
    dt
FROM 
    user_behavior
WHERE 
    cast(datetime as date) between '2017-11-25' and '2017-12-03';

清洗完成后再看数据分布:

-- 按日期统计每天的数据量
SELECT 
  dt, 
  COUNT(*) AS daily_record_count  -- 统计每天的总记录数
FROM user_behavior_with_dt  -- 替换为你的表名(如 user_behavior_partitioned)
GROUP BY dt
ORDER BY dt;  -- 按日期排序

在这里插入图片描述

可见每日的数据量差别不大,创建分区表缓解压力

INSERT INTO user_behavior_partitioned PARTITION (dt)
SELECT 
  user_id,
  item_id,
  category_id,
  behavior_type,
  timestamp,
  datetime,
  to_date(datetime) AS dt  -- 分区字段值
FROM user_behavior;

在这里插入图片描述 在这里插入图片描述

五、数据分析

1、日均访问量,日均用户量

--日均访问量,日均用户量
select cast(datetime as date) as day,
       sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,
       count(distinct user_id) as uv
from user_behavior_partitioned
group by cast(datetime as date)
order by day;

在这里插入图片描述

2、记录每个用户的购物情况,加工存储到user_behavior_count中

-- 创建外部表,指定 Parquet 列式存储
CREATE EXTERNAL TABLE user_behavior_count (
  user_id string,
  pv int,
  fav int,
  cart int,
  buy int
)
USING parquet
LOCATION '/data/user_behavior_count';

INSERT INTO user_behavior_count
SELECT 
  user_id,
  sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,
  sum(case when behavior_type = 'fav' then 1 else 0 end) as fav,
  sum(case when behavior_type = 'cart' then 1 else 0 end) as cart,
  sum(case when behavior_type = 'buy' then 1 else 0 end) as buy
FROM user_behavior_partitioned
GROUP BY user_id;

在这里插入图片描述

查询复购率,即产生两次或两次以上购买的用户占购买用户的比例

select sum(case when buy > 1 then 1 else 0 end) / sum(case when buy > 0 then 1 else 0 end)
from user_behavior_count;

在这里插入图片描述

3、用户行为转换率

--点击/(加购物车+收藏)/购买 , 各环节转化率
select a.pv,
       a.fav,
       a.cart,
       a.fav + a.cart as `fav+cart`,
       a.buy,
       round((a.fav + a.cart) / a.pv, 4) as pv2favcart,
       round(a.buy / (a.fav + a.cart), 4) as favcart2buy,
       round(a.buy / a.pv, 4) as pv2buy
from(
select sum(pv) as pv,   --点击数
       sum(fav) as fav,  --收藏数
       sum(cart) as cart,  --加购物车数
       sum(buy) as buy  --购买数
from user_behavior_count
) as a;

在这里插入图片描述

4、用户行为习惯

-- 一天的活跃时段分布
select hour(datetime) as hour,
       sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,   --点击数
       sum(case when behavior_type = 'fav' then 1 else 0 end) as fav,  --收藏数
       sum(case when behavior_type = 'cart' then 1 else 0 end) as cart,  --加购物车数
       sum(case when behavior_type = 'buy' then 1 else 0 end) as buy  --购买数
from user_behavior
group by hour(datetime)
order by hour;

在这里插入图片描述

--一周用户的活跃分布
select pmod(datediff(datetime, '1920-01-01') - 3, 7) as weekday,
       sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,   --点击数
       sum(case when behavior_type = 'fav' then 1 else 0 end) as fav,  --收藏数
       sum(case when behavior_type = 'cart' then 1 else 0 end) as cart,  --加购物车数
       sum(case when behavior_type = 'buy' then 1 else 0 end) as buy  --购买数
from user_behavior
where date(datetime) between '2017-11-27' and '2017-12-03'
group by pmod(datediff(datetime, '1920-01-01') - 3, 7)
order by weekday;

在这里插入图片描述

基于RFM模型找出有价值的用户

RFM模型基于三个要素衡量客户价值和客户创利能力:

  • R-Recency(最近一次购买时间)
  • F-Frequency(消费频率)
  • M-Money(消费金额)
--R(最近一次购买时间),越高越说明用户活跃
select user_id, datediff('2017-12-04',max(datetime)) as R,
dense_rank() over(order by datediff('2017-12-04', max(datetime))) as R_rank
from user_behavior_partitioned
where 
behavior_type = 'buy'  -- 只筛选购买行为
and dt between '2017-11-25' and '2017-12-03'
group by user_id
limit 10;


select user_id,
       count(1) as F,
       dense_rank() over(order by count(1) desc) as F_rank
from user_behavior_partitioned
where behavior_type = 'buy'
group by user_id
limit 10;

在这里插入图片描述 在这里插入图片描述

对用户按照排名进行分组打分,共分为5组,前0/5

with cte as(
select user_id,
       datediff('2017-12-04', max(datetime)) as R,
       dense_rank() over(order by datediff('2017-12-04', max(datetime))) as R_rank,
       count(1) as F,
       dense_rank() over(order by count(1) desc) as F_rank
from user_behavior
where behavior_type = 'buy'
group by user_id)

select user_id, R, R_rank, R_score, F, F_rank, F_score,  R_score + F_score AS score
from(
select *,
       case ntile(5) over(order by R_rank) when 1 then 5
                                           when 2 then 4
                                           when 3 then 3
                                           when 4 then 2
                                           when 5 then 1
       end as R_score,
       case ntile(5) over(order by F_rank) when 1 then 5
                                           when 2 then 4
                                           when 3 then 3
                                           when 4 then 2
                                           when 5 then 1
       end as F_score
from cte
) as a
order by score desc
limit 20;

在这里插入图片描述

--销量最高的商品
select item_id ,
       sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,   --点击数
       sum(case when behavior_type = 'fav' then 1 else 0 end) as fav,  --收藏数
       sum(case when behavior_type = 'cart' then 1 else 0 end) as cart,  --加购物车数
       sum(case when behavior_type = 'buy' then 1 else 0 end) as buy  --购买数
from user_behavior
group by item_id
order by buy desc
limit 10;

--销量最高的商品大类
select category_id ,
       sum(case when behavior_type = 'pv' then 1 else 0 end) as pv,   --点击数
       sum(case when behavior_type = 'fav' then 1 else 0 end) as fav,  --收藏数
       sum(case when behavior_type = 'cart' then 1 else 0 end) as cart,  --加购物车数
       sum(case when behavior_type = 'buy' then 1 else 0 end) as buy  --购买数
from user_behavior
group by category_id
order by buy desc
limit 10;

在这里插入图片描述 在这里插入图片描述

因为商品维度较少,缺少分析价值。