淘宝用户行为分析

225 阅读7分钟

前一段时间在实习中做的一个小项目:

项目思路展示

image.png

image.png

image.png

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import numba as nb
from pyecharts import options as opts
from pyecharts.charts import Funnel
from pyecharts.charts import Line, Timeline,Bar,Pie
from pyecharts.globals import ThemeType
from pyecharts.faker import Faker

image.png

# Read the raw behavior log from the local CSV export into a DataFrame.
# The bare name on the last line displays the frame in the notebook.
raw_csv_path = r"C:\Users\ljl20\Desktop\tianchi_mobile_recommend_train_user.csv"
original_data = pd.read_csv(raw_csv_path)
original_data

image.png

image.png

# Eyeball 30 randomly chosen rows (a different draw on every run).
original_data.sample(n=30)

image.png

image.png

image.png

然后查看数据是否重复。

def check_duplicates(data, columns):
    """Flag rows of *data* that repeat an earlier row on *columns*.

    Args:
        data: A pandas DataFrame.
        columns: Column label or list of labels that together identify a row.

    Returns:
        A boolean Series aligned with ``data``: True where the row duplicates
        an earlier row on the given columns, False for first occurrences.
    """
    # Deliberately not numba-jitted: nopython mode cannot type pandas objects,
    # so a jitted version fails on the first call. DataFrame.duplicated is
    # already a vectorized pandas operation.
    return data.duplicated(columns)

# Boolean Series: True for every row that repeats an earlier row on all six
# listed columns.
duplicate_key_columns = ['user_id', 'item_id', 'behavior_type', 'user_geohash', 'item_category', 'time']
original_data.duplicated(subset=duplicate_key_columns)

image.png

# Summary statistics of the raw frame, displayed in the notebook.
original_data.describe()

image.png

# Clean on an independent deep copy so the raw frame is never modified.
cleaned_data = original_data.copy(deep=True)
cleaned_data

image.png

image.png

# Convert the "time" column with pd.to_datetime so the .dt accessor can be
# used on it later; the second line displays the converted column.
cleaned_data["time"] = pd.to_datetime(cleaned_data["time"])
cleaned_data["time"]

image.png

image.png

# Split the timestamp into calendar parts, one new column per part.
# NOTE(review): 'date' holds only the day of the month (dt.day). If the log
# spans more than one calendar month, the same day number from different
# months is merged in every later group-by on 'date' — confirm the data range.
cleaned_data['time'] = pd.to_datetime(cleaned_data['time'])
timestamp_parts = cleaned_data['time'].dt
for new_column, part_name in (
    ('year', 'year'),
    ('month', 'month'),
    ('date', 'day'),
    ('hour', 'hour'),
):
    cleaned_data[new_column] = getattr(timestamp_parts, part_name)

image.png

# Preview of the frame with repeated rows removed (first occurrence kept).
# The result is only displayed — it is not assigned back to cleaned_data.
dedup_key_columns = ['user_id', 'item_id', 'behavior_type', 'user_geohash', 'item_category', 'time']
cleaned_data.drop_duplicates(subset=dedup_key_columns, keep='first')

image.png

# The raw timestamp is no longer needed once year/month/date/hour exist.
# errors='ignore' keeps this cell re-runnable: a plain `del` raises KeyError
# the second time because the column is already gone.
cleaned_data.drop(columns=['time'], errors='ignore', inplace=True)
cleaned_data

# Number of behavior records (every behavior_type, not only purchases) per
# hour of day.
hours_counts = cleaned_data.groupby("hour")["behavior_type"].count()

# Category-axis labels are passed as strings so each point is matched to its
# label by name rather than by numeric position.
hour_labels = [str(hour) for hour in hours_counts.index]

line_chart = (
    Line()
    .add_xaxis(hour_labels)
    .add_yaxis("次数", hours_counts.tolist(), label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        # The title used to say "purchases" (购买) although the series counts
        # all behaviors; it now describes what is actually plotted.
        title_opts=opts.TitleOpts(title="每时用户行为次数"),
        xaxis_opts=opts.AxisOpts(name='hour'),
        yaxis_opts=opts.AxisOpts(name="次数"),
    )
)
hours_timeline_chart = Timeline(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
hours_timeline_chart.add(line_chart, "折线图")
hours_timeline_chart.render_notebook()

image.png

# Rows: 'date' values; columns: behavior_type codes; cells: number of records.
daily_counts = cleaned_data.pivot_table(index='date', columns='behavior_type', aggfunc='size', fill_value=0)

# Use string labels on the category axis. The chart pairs each x with its y,
# and a numeric x on a category axis is read as a position instead of a
# label, which shifts the points relative to the tick labels.
day_labels = [str(day) for day in daily_counts.index]

# Codes 2 / 3 / 4 are plotted as favorite / add-to-cart / payment series.
line_chart = (
    Line()
    .add_xaxis(day_labels)
    .add_yaxis("收藏", daily_counts[2].tolist())
    .add_yaxis("加购物车", daily_counts[3].tolist())
    .add_yaxis("支付", daily_counts[4].tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="每天收藏、加购物车、支付次数分析"),
        xaxis_opts=opts.AxisOpts(name='日期'),
        yaxis_opts=opts.AxisOpts(name="次数"),
    )
)
line_chart.render_notebook()

image.png

# Rows: 'date' values; columns: behavior_type codes; cells: number of records.
daily_counts = cleaned_data.pivot_table(index='date', columns='behavior_type', aggfunc='size', fill_value=0)

# Use string labels on the category axis. The chart pairs each x with its y,
# and a numeric x on a category axis is read as a position instead of a
# label, which shifts the points relative to the tick labels.
day_labels = [str(day) for day in daily_counts.index]

# NOTE(review): only behavior_type 1 is plotted although the title speaks of
# behavior totals in general — confirm which one is intended.
line_chart = (
    Line()
    .add_xaxis(day_labels)
    .add_yaxis("用户访问总次数", daily_counts[1].tolist(), label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="每天用户行为类型总次数"),
        xaxis_opts=opts.AxisOpts(name='日期'),
        yaxis_opts=opts.AxisOpts(name="次数"),
    )
)

line_chart.render_notebook()
# Per-day counts for each behavior code, taken from the daily pivot table.
collect_counts = daily_counts[2].tolist()
add_to_cart_counts = daily_counts[3].tolist()
click_counts = daily_counts[1].tolist()
payment_counts = daily_counts[4].tolist()

# A funnel stage needs ONE number per stage. Previously each stage was paired
# with its whole list of daily counts, which is not a valid stage value, so
# the daily counts are summed into a single total per stage here.
funnel_stages = [
    ["收藏", sum(collect_counts)],
    ["加购物车", sum(add_to_cart_counts)],
    ["点击", sum(click_counts)],
    ["支付", sum(payment_counts)],
]

funnel_chart = (
    Funnel()
    .add(
        "漏斗图",
        funnel_stages,
        label_opts=opts.LabelOpts(position="inside"),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="收藏、加购物车、点击和支付次数分析"),
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b} : {c}"),
    )
    .set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{b} : {c}"))
)
funnel_chart.render_notebook()
# Per-'date' record counts for behavior codes 1 and 4 only; a code that never
# occurs on a given date is filled with 0.
daily_counts = cleaned_data[cleaned_data['behavior_type'].isin([1, 4])].groupby('date')['behavior_type'].value_counts().unstack().fillna(0)

# Conversion rate = code-4 records divided by code-1 records on the same date.
conversion_rate = daily_counts[4] / daily_counts[1]

# Use string labels on the category axis. The chart pairs each x with its y,
# and a numeric x on a category axis is read as a position instead of a
# label, which shifts the points relative to the tick labels.
day_labels = [str(day) for day in daily_counts.index]

line_chart = (
    Line()
    .add_xaxis(day_labels)
    .add_yaxis("转化率", conversion_rate.tolist(), label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="每天用户转化率"),
        xaxis_opts=opts.AxisOpts(name='日期'),
        yaxis_opts=opts.AxisOpts(name="转化率"),
    )
)
line_chart.render_notebook()

image.png

# Twenty items with the most behavior_type == 4 records.
is_purchase_row = cleaned_data['behavior_type'] == 4
buy_counts = cleaned_data.loc[is_purchase_row, 'item_id'].value_counts()
top_twenty_buy_most = buy_counts.iloc[:20]
top_twenty_buy_most_df = pd.DataFrame(
    {
        'item_id': top_twenty_buy_most.index,
        'buy_count': top_twenty_buy_most.values,
    }
)
top_twenty_buy_most_df

# Twenty items with the most behavior_type == 1 records.
is_click_row = cleaned_data['behavior_type'] == 1
click_counts = cleaned_data.loc[is_click_row, 'item_id'].value_counts()
top_twenty_click_most = click_counts.iloc[:20]
top_twenty_click_most_df = pd.DataFrame(
    {
        'item_id': top_twenty_click_most.index,
        'click_count': top_twenty_click_most.values,
    }
)
top_twenty_click_most_df

# Stack the two tables vertically (click table first). Each table fills only
# its own count column, so the other count column is NaN for those rows.
combined_df = pd.concat([top_twenty_click_most_df, top_twenty_buy_most_df])
combined_df

image.png

image.png

cleaned_data['user_geohash']

# Work on a filtered view instead of dropping rows from cleaned_data in
# place: an in-place dropna here would silently remove every row without a
# geohash from the shared frame, and all later cells would then analyse only
# that subset.
user_geohash_without_nan = cleaned_data.dropna(subset=['user_geohash'])
user_geohash_without_nan['user_geohash']

# Twenty most frequent geohash values and their record counts.
user_geohash_most = user_geohash_without_nan['user_geohash'].value_counts()
top_twenty_user_geohash_most = user_geohash_most.head(20)
top_twenty_user_geohash_most

# Chart inputs are derived from the computed counts rather than typed in by
# hand, so both charts always reflect the data that was actually loaded.
# Plain str/int values keep the chart options serializable.
user_geohash = [str(geohash) for geohash in top_twenty_user_geohash_most.index]
count = [int(n) for n in top_twenty_user_geohash_most]

pie = (
    Pie()
    .add("", [list(z) for z in zip(user_geohash, count)])
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
pie.render_notebook()

bar = (
    Bar()
    .add_xaxis(user_geohash)
    .add_yaxis("", count)
    .reversal_axis()
    .set_global_opts(title_opts=opts.TitleOpts(title="Top 20 User Geohash"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{c}"))
)
bar.render_notebook()

image.png

cleaned_data['item_category']

# Twenty most frequent item categories and their record counts.
item_category_most = cleaned_data['item_category'].value_counts()
top_twenty_item_category_most = item_category_most.head(20)
top_twenty_item_category_most

# (category label, count) pairs built from the computed counts instead of a
# hand-copied table, so the charts cannot drift from the loaded data.
# Labels are strings and counts plain ints to keep the options serializable.
top_twenty_item_category_most_data = [
    (str(category), int(n))
    for category, n in top_twenty_item_category_most.items()
]

pie_category = (
    Pie()
    .add("", top_twenty_item_category_most_data)
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    .set_global_opts(legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="85%"),
                     title_opts=opts.TitleOpts(title="商品类别分布饼图"))
)
pie_category.render_notebook()

bar_category = (
    Bar()
    .add_xaxis([x[0] for x in top_twenty_item_category_most_data])
    .add_yaxis("", [x[1] for x in top_twenty_item_category_most_data])
    .reversal_axis()
    .set_global_opts(title_opts=opts.TitleOpts(title="商品类别分布条形图"))
)
bar_category.render_notebook()

image.png

# Purchase records only (behavior_type == 4), then purchases per user.
buy_customers = cleaned_data[cleaned_data['behavior_type'] == 4]
buy_customers
customer_purchase = buy_customers.groupby('user_id').size().reset_index(name='purchase_count')

# Split buyers at the 80th percentile of purchase count: users at or above
# the threshold form the "top 20%" group, everyone below it the rest. The two
# groups are complementary, so the threshold is computed once.
percentile = 80
threshold = np.percentile(customer_purchase['purchase_count'], percentile)
top_customers = customer_purchase[customer_purchase['purchase_count'] >= threshold]
bottom_customers = customer_purchase[customer_purchase['purchase_count'] < threshold]

total_purchase_twenty = top_customers['purchase_count'].sum()
total_purchase_twenty
total_purchase_eighty = bottom_customers['purchase_count'].sum()
total_purchase_eighty

# Share of all purchases made by each group, in percent.
total_purchase_all = total_purchase_twenty + total_purchase_eighty
percentage_eighty = float(total_purchase_eighty / total_purchase_all * 100)
percentage_eighty
percentage_twenty = float(total_purchase_twenty / total_purchase_all * 100)
percentage_twenty

pie_customers = (
    Pie()
    .add(
        "",
        # Each label is paired with its own group's share. Previously the two
        # values were swapped, so the "top 20%" slice showed the share of the
        # remaining customers and vice versa.
        [("前20%的核心客户", percentage_twenty), ("其他客户", percentage_eighty)],
        label_opts=opts.LabelOpts(
            formatter="{b|{b}}: {d}%\n{hr|}\n",
            rich={
                "b": {"fontSize": 14, "fontWeight": "bold"},
                "hr": {"borderColor": "#aaa", "borderWidth": 0.5, "width": "100%", "height": 0},
            },
        ),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="购买数量比例"),
        legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="85%"),
    )
    .set_series_opts(tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"))
)
pie_customers.render_notebook()

image.png

item_id_most = cleaned_data['item_id']
item_id_most

# Twenty items with the most records across ALL behavior types.
item_id_most = cleaned_data['item_id'].value_counts()
top_twenty_item_id_most = item_id_most.head(20)
top_twenty_item_id_most

# (item id, count) pairs built from the computed counts instead of a
# hand-copied table, so the chart cannot drift from the loaded data.
# Ids are strings and counts plain ints to keep the options serializable.
top_twenty_item_id_most_data = [
    (str(item_id), int(n))
    for item_id, n in top_twenty_item_id_most.items()
]

item_id_bar = (
    Bar()
    .add_xaxis([x[0] for x in top_twenty_item_id_most_data])
    .add_yaxis("商品数量", [x[1] for x in top_twenty_item_id_most_data])
    .reversal_axis()
    # The counts above are not filtered by behavior_type, so the title now
    # says "behavior records" instead of "clicks" (点击).
    .set_global_opts(title_opts=opts.TitleOpts(title="行为次数最多的前二十个商品"))
)
item_id_bar.render_notebook()
# Twenty items with the most purchase records (behavior_type == 4).
item_buy_most = cleaned_data[cleaned_data['behavior_type'] == 4]['item_id'].value_counts()
top_twenty_item_buy_most = item_buy_most.head(20)
top_twenty_item_buy_most

# (item id, purchase count) pairs built from the computed counts instead of a
# hand-copied table, so the chart cannot drift from the loaded data.
# Ids are strings and counts plain ints to keep the options serializable.
top_twenty_item_buy_most_data = [
    (str(item_id), int(n))
    for item_id, n in top_twenty_item_buy_most.items()
]

top_twenty_item_buy_most_data_bar = (
    Bar()
    .add_xaxis([x[0] for x in top_twenty_item_buy_most_data])
    .add_yaxis("商品数量", [x[1] for x in top_twenty_item_buy_most_data])
    .reversal_axis()
    .set_global_opts(title_opts=opts.TitleOpts(title="购买数量最多的前二十个商品"))
)
top_twenty_item_buy_most_data_bar.render_notebook()

image.png

image.png

image.png

image.png

image.png

image.png

image.png

image.png

image.png

image.png

image.png

image.png