数据来源
阿里云天池数据集: User Behavior Data from Taobao for Recommendation 数据为2017年11月23-2017年12月3日淘宝网用户行为数据,包括用户ID,商品ID,商品类别ID,用户行为(浏览、购买、收藏、加购),时间四个字段。
初步处理
原数据超过1亿条,为便于处理,本案例选取其中一百万条进行操作。 保留一百万条数据,确定不存在空值和重复值,设定时间区间为11/27-12/3,共一周,删减后剩余78万条数据。
-- 创建数据库和数据表
create database taobao;
use taobao;
create table user_behavior (user_id int(9),
item_id int(9),
category_id int(9),
behavior_type varchar(5),
timestamps int(14) );
-- 增加编号列
USE taobao;
ALTER TABLE users ADD id int first;
ALTER TABLE users MODIFY id int primary key auto_increment;
-- 保留一百万条数据
-- 一亿条数据处理速度慢,删减值100万条
DELETE FROM users
WHERE id > 1000000;
-- 查询空值
SELECT * FROM users WHERE user_id IS NULL;
SELECT * FROM users WHERE item_id IS NULL;
SELECT * FROM users WHERE category_id IS NULL;
SELECT * FROM users WHERE behavior_type IS NULL;
SELECT * FROM users WHERE timestamps IS NULL;
-- 检查重复值
SELECT user_id, item_id, timestamps, id
FROM users
GROUP BY user_id, item_id, timestamps
HAVING count(*)>1;
-- 处理时间
-- 时间区间设置为一周:2017年11月27日-2017年12月3日,数据788887条
ALTER TABLE users ADD (datetimes TIMESTAMP(0), dates char(10), times char(8), hours char(2));
SET SQL_SAFE_UPDATES = false;
UPDATE users SET datetimes = from_unixtime(timestamps);
SELECT * FROM users LIMIT 5;
SET SQL_SAFE_UPDATES = false;
UPDATE users SET dates = substring(datetimes,1,10);
SET SQL_SAFE_UPDATES = false;
UPDATE users SET times = substring(datetimes,12,8);
SET SQL_SAFE_UPDATES = false;
UPDATE users SET hours = substring(datetimes,12,2);
DELETE FROM users
WHERE datetimes < '2017-11-27 00:00:00' OR datetimes > '2017-12-03 23:59:59';
浏览行为与购买行为分析
统计周内、日内访客数UV、访问量PV、平均访问量PV/UV;周内、日内购买量。 结果表示周内浏览与购买高峰为周末,日内浏览高峰为夜间9点到11点。网站可以集中在这些时段进行优惠活动和广告推送。
-- 周内浏览量
CREATE TABLE pv_weekly(dates char(10),
pv_day int(9),
uv_day int(9),
pvpu_day decimal);
INSERT INTO pv_weekly(dates,
pv_day,
uv_day,
pvpu_day)
SELECT dates, count(*) as 'pv', count(DISTINCT user_id) as 'uv', count(*)/count(DISTINCT user_id) as 'pvpu'
FROM users
WHERE behavior_type = 'pv'
GROUP BY dates;
-- 日内浏览
CREATE TABLE pv_daily(datetimes char(19),
pv int(9),
uv int(9),
pvpu decimal);
INSERT INTO pv_daily(datetimes,
pv,
uv,
pvpu)
SELECT datetimes, count(*) as 'pv', count(DISTINCT user_id) as 'uv', count(*)/count(DISTINCT user_id) as 'pvpu'
FROM users
WHERE behavior_type = 'pv'
GROUP BY datetimes;
-- 周内购买量
CREATE TABLE buyamount_weekly(dates char(10),
buy_day int(9));
INSERT INTO buyamount_weekly(dates,
buy_day)
SELECT dates, count(*) as 'buy'
FROM users
WHERE behavior_type = 'buy'
GROUP BY dates;
-- 日内购买量
CREATE TABLE buyamount_daily(datetimes char(19),
buy int(9));
INSERT INTO buyamount_daily(datetimes,
buy)
SELECT datetimes, count(*) as 'buy'
FROM users
WHERE behavior_type = 'buy'
GROUP BY datetimes;
-- 周内每日购买率
CREATE TABLE buy_weekly(
dates char(19),
buypc_daily decimal(10,5)
);
INSERT INTO buy_weekly(
dates,
buypc_daily
)
SELECT a.dates, a.buy/b.pv
FROM
(
SELECT dates, count(*) as 'buy'
FROM users
WHERE behavior_type = 'buy'
GROUP BY dates
) AS a,
(
SELECT dates, count(*) as 'pv'
FROM users
WHERE behavior_type = 'pv'
GROUP BY dates
) AS b
WHERE a.dates = b.dates;
-- 日购买率分析
CREATE TABLE buy_daily(
datetimes char(19),
buypc_time decimal(10,5)
);
INSERT INTO buy_daily(
datetimes,
buypc_time
)
SELECT a.datetimes, a.buy/b.pv
FROM
(
SELECT datetimes, count(*) as 'buy'
FROM users
WHERE behavior_type = 'buy'
GROUP BY datetimes
) AS a,
(
SELECT datetimes, count(*) as 'pv'
FROM users
WHERE behavior_type = 'pv'
GROUP BY datetimes
) AS b
WHERE a.datetimes = b.datetimes;
用户留存率
分析客户留存率对新客户更为有效。本次数据无法分辨新旧客户,仅进行留存率计算,并未进行留存率分析。
USE taobao;
CREATE TABLE retention_rate (
retention_0 float,
retention_1 float,
retention_3 float,
retention_5 float
);
INSERT INTO retention_rate (
retention_0,
retention_1,
retention_3,
retention_5
)
SELECT
count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) AS retention_0,
count(if(datediff(b.dates,a.dates)=1,b.user_id,NULL))/count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) AS retention_1,
count(if(datediff(b.dates,a.dates)=3,b.user_id,NULL))/count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) AS retention_3,
count(if(datediff(b.dates,a.dates)=5,b.user_id,NULL))/count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) AS retention_5
FROM
-- 1 11月27日 1 11月27日
-- 1 11月27日 1 11月28日
(SELECT user_id, dates
FROM users
GROUP BY user_id, dates
) as a,
(SELECT user_id, dates
FROM users
GROUP BY user_id, dates
) as b
WHERE a.dates = "2017-11-27"
AND a.user_id = b.user_id
AND a.dates <= b.dates;
用户行为路径
将(浏览,加购,收藏,购买)四种行为进行组合,组成共15中用户行为路径。结果显示87.47%的用户浏览后并未进行其他操作,说明网站推送并未成功提升用户消费行为,可以进一步提高商品推送精度,使推送更符合用户需求。
-- 提取用户对商品的行为
CREATE VIEW beha_num AS
SELECT user_id, item_id,
count(if(behavior_type='pv',behavior_type,NULL)) as 'pv',
count(if(behavior_type='cart',behavior_type,NULL)) as 'cart',
count(if(behavior_type='fav',behavior_type,NULL)) as 'fav',
count(if(behavior_type='buy',behavior_type,NULL)) as 'buy'
FROM users
GROUP BY user_id, item_id;
-- 定义用户行为
CREATE VIEW user_beha AS
SELECT user_id, item_id,
(case when pv>0 then 1 else 0 end) AS 'vpv',
(case when cart>0 then 1 else 0 end) AS 'vcart',
(case when fav>0 then 1 else 0 end) AS 'vfav',
(case when buy>0 then 1 else 0 end) AS 'vbuy'
FROM beha_num;
-- 定义行为路径
-- 1001 浏览后购买
-- 1101 浏览后架构后购买
CREATE VIEW beha_path AS
SELECT user_id, item_id,
concat(vpv,vcart,vfav,vbuy) as path_type
FROM user_beha;
-- 统计并保存行为路径
CREATE TABLE path_analyse(
path_type char(4),
path_count int);
INSERT INTO path_analyse
SELECT path_type, count(*)
FROM beha_path
GROUP BY path_type;
价值客户RFM模型
根据R(最近一次消费)、F(消费频率)、M(消费金额)
分类(R,F)
价值用户(1,1)
保持用户(0,1)
发展用户(1,0)
挽留用户(0,0)
-- 提取R,F数据
CREATE TABLE RFM(
user_id int,
F int,
R char(10)
);
INSERT INTO RFM
SELECT user_id, MAX(dates) as R, count(*) as F
FROM USERS
WHERE behavior_type = 'buy'
GROUP BY user_id
ORDER BY R,F DESC;
--打分
ALTER TABLE RFM ADD Rscore int;
SET SQL_SAFE_UPDATES = false;
UPDATE RFM
SET Rscore = case
WHEN R = '2017-12-03' then 5
WHEN R = '2017-12-02' then 4
WHEN R = '2017-12-01' then 3
WHEN R in ('2017-11-29','2017-11-30') then 2
ELSE 1
END;
ALTER TABLE RFM ADD Fscore int;
SET SQL_SAFE_UPDATES = false;
UPDATE RFM
SET Fscore = case
WHEN F BETWEEN 50 AND 57 THEN 5
WHEN F BETWEEN 40 AND 50 THEN 4
WHEN F BETWEEN 30 AND 40 THEN 3
WHEN F BETWEEN 15 AND 30 THEN 2
ELSE 1
END;
-- 分层
SET @Ravg = NULL;
SET @Favg = NULL;
SELECT avg(Rscore) INTO @Ravg FROM RFM;
SELECT avg(Fscore) INTO @Favg FROM RFM;
ALTER TABLE RFM ADD RFM_type;
SET SQL_SAFE_UPDATES = false;
UPDATE RFM
SET RFM_type = CASE
WHEN Rscore > @Ravg AND Fscore > @Favg then '价值用户'
WHEN Rscore < @Ravg AND Fscore > @Favg then '保持用户'
WHEN Rscore > @Ravg AND Fscore <> @Favg then '发展用户'
ELSE '挽留用户'
END;
复购情况
CREATE VIEW buycount AS
SELECT user_id, item_id, count(*) as num
FROM users
WHERE behavior_type = 'buy'
GROUP BY user_id, item_id;
CREATE TABLE buyagain(
buynum int,
usernum int
);
INSERT INTO buyagain(
buynum,
usernum
)
SELECT NUM, COUNT(user_id)
FROM buycount
GROUP BY num;
Tableau
使用Tableau对处理结果进行可视化。