前一段时间在实习中做的一个小项目:
项目思路展示
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import numba as nb
from pyecharts import options as opts
from pyecharts.charts import Funnel
from pyecharts.charts import Line, Timeline,Bar,Pie
from pyecharts.globals import ThemeType
from pyecharts.faker import Faker
# Load the raw user-behaviour log from a local CSV export.
# NOTE(review): the path is an absolute location on one machine; anyone else
# running this needs to point it at their own copy of the file.
original_data = pd.read_csv(
    filepath_or_buffer=r"C:\Users\ljl20\Desktop\tianchi_mobile_recommend_train_user.csv"
)
# Notebook-style previews: the whole frame, then 30 randomly sampled rows.
original_data
original_data.sample(n=30)
然后查看数据是否重复。
def check_duplicates(data, columns):
    """Return a boolean mask marking rows that repeat an earlier row.

    Args:
        data: A pandas DataFrame (or any object exposing ``duplicated``).
        columns: Column label, or list of labels, that defines row identity.

    Returns:
        Whatever ``data.duplicated(columns)`` returns; for a DataFrame this is
        a boolean Series that is True for every repeat of an earlier row.
    """
    # Deliberately not wrapped in numba's nopython JIT: numba cannot type a
    # pandas DataFrame, so the decorated version failed as soon as it was
    # called. The pandas call is already vectorised.
    return data.duplicated(columns)
# Preview which rows repeat an earlier row across all six raw columns.
original_data.duplicated(
    subset=['user_id', 'item_id', 'behavior_type', 'user_geohash', 'item_category', 'time']
)
# Summary statistics of the numeric columns.
original_data.describe()
# Work on an independent copy so the raw frame stays untouched.
cleaned_data = original_data.copy(deep=True)
cleaned_data
# Parse the timestamp column once, then derive the calendar fields that the
# later group-bys and charts rely on.
cleaned_data["time"] = pd.to_datetime(cleaned_data["time"])
cleaned_data["time"]
cleaned_data['year'] = cleaned_data['time'].dt.year
cleaned_data['month'] = cleaned_data['time'].dt.month
# Store the full calendar date as an ISO string rather than the day-of-month:
# a bare day number merges same-numbered days from different months into one
# bucket and sorts them non-chronologically. ISO strings sort chronologically
# and can be used directly as chart axis labels.
cleaned_data['date'] = cleaned_data['time'].dt.strftime('%Y-%m-%d')
cleaned_data['hour'] = cleaned_data['time'].dt.hour
# NOTE(review): this only previews the de-duplicated rows; the result is not
# assigned, so cleaned_data still contains the duplicates. Confirm whether
# they were meant to be dropped.
cleaned_data[~cleaned_data.duplicated(['user_id','item_id','behavior_type','user_geohash','item_category','time'])]
# The raw timestamp is no longer needed once the derived columns exist.
del cleaned_data['time']
cleaned_data
# Number of logged rows (all behaviour types) per hour of day.
# NOTE(review): the chart title below speaks of purchases, but no
# behavior_type filter is applied here; confirm which one is intended.
hours_counts = cleaned_data.groupby("hour")["behavior_type"].count()
line_chart = (
    Line()
    # Category-axis labels are passed as strings so ECharts matches points by
    # label; a numeric x on a category axis is read as a position instead.
    .add_xaxis([str(hour) for hour in hours_counts.index])
    .add_yaxis("次数", hours_counts.tolist(), label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="每时用户购买次数"),
        xaxis_opts=opts.AxisOpts(name='hour'),
        yaxis_opts=opts.AxisOpts(name="次数"),
    )
)
# Wrap the single line chart in a timeline container and render it inline.
hours_timeline_chart = Timeline(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
hours_timeline_chart.add(line_chart, "折线图")
hours_timeline_chart.render_notebook()
# Rows: value of the 'date' column; columns: behaviour code; cells: row count.
daily_counts = cleaned_data.pivot_table(index='date', columns='behavior_type', aggfunc='size', fill_value=0)
line_chart = (
    Line()
    # Labels are passed as strings: a numeric x on a category axis is read by
    # ECharts as a position, which misplaces points when the labels do not
    # start at 0.
    .add_xaxis([str(day) for day in daily_counts.index])
    # Behaviour codes 2 / 3 / 4 are plotted as favourite / add-to-cart / pay.
    .add_yaxis("收藏", daily_counts[2].tolist())
    .add_yaxis("加购物车", daily_counts[3].tolist())
    .add_yaxis("支付", daily_counts[4].tolist())
    .set_global_opts(
        title_opts=opts.TitleOpts(title="每天收藏、加购物车、支付次数分析"),
        xaxis_opts=opts.AxisOpts(name='日期'),
        yaxis_opts=opts.AxisOpts(name="次数"),
    )
)
line_chart.render_notebook()
# Rows: value of the 'date' column; columns: behaviour code; cells: row count.
daily_counts = cleaned_data.pivot_table(index='date', columns='behavior_type', aggfunc='size', fill_value=0)
line_chart = (
    Line()
    # Labels are passed as strings: a numeric x on a category axis is read by
    # ECharts as a position, which misplaces points when the labels do not
    # start at 0.
    .add_xaxis([str(day) for day in daily_counts.index])
    # Only behaviour code 1 is plotted here.
    # NOTE(review): the series name and title speak of totals; confirm whether
    # a sum over all behaviour codes was intended.
    .add_yaxis("用户访问总次数", daily_counts[1].tolist(), label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="每天用户行为类型总次数"),
        xaxis_opts=opts.AxisOpts(name='日期'),
        yaxis_opts=opts.AxisOpts(name="次数"),
    )
)
line_chart.render_notebook()
# Per-day counts for each behaviour code, kept as plain lists.
collect_counts = daily_counts[2].tolist()
add_to_cart_counts = daily_counts[3].tolist()
click_counts = daily_counts[1].tolist()
payment_counts = daily_counts[4].tolist()
# A funnel stage takes one number, so each stage is the total over all days;
# previously the whole per-day list was passed as the stage value.
funnel_stage_totals = [
    sum(collect_counts),
    sum(add_to_cart_counts),
    sum(click_counts),
    sum(payment_counts),
]
funnel_chart = (
    Funnel()
    .add(
        "漏斗图",
        [list(z) for z in zip(["收藏", "加购物车", "点击", "支付"], funnel_stage_totals)],
        label_opts=opts.LabelOpts(position="inside"),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="收藏、加购物车、点击和支付次数分析"),
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b} : {c}"),
    )
    .set_series_opts(tooltip_opts=opts.TooltipOpts(formatter="{b} : {c}"))
)
funnel_chart.render_notebook()
# Per-day counts of behaviour codes 1 and 4 only; combinations with no rows
# become 0.
daily_counts = cleaned_data[cleaned_data['behavior_type'].isin([1, 4])].groupby('date')['behavior_type'].value_counts().unstack().fillna(0)
# Conversion rate = code-4 rows divided by code-1 rows for the same day.
conversion_rate = daily_counts[4] / daily_counts[1]
line_chart = (
    Line()
    # Labels are passed as strings: a numeric x on a category axis is read by
    # ECharts as a position, which misplaces points when the labels do not
    # start at 0.
    .add_xaxis([str(day) for day in daily_counts.index])
    .add_yaxis("转化率", conversion_rate.tolist(), label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="每天用户转化率"),
        xaxis_opts=opts.AxisOpts(name='日期'),
        yaxis_opts=opts.AxisOpts(name="转化率"),
    )
)
line_chart.render_notebook()
# Twenty items with the most behaviour-code-4 rows, as a two-column table.
buy_counts = cleaned_data.loc[cleaned_data['behavior_type'] == 4, 'item_id'].value_counts()
top_twenty_buy_most = buy_counts.iloc[:20]
top_twenty_buy_most_df = top_twenty_buy_most.rename_axis('item_id').reset_index(name='buy_count')
top_twenty_buy_most_df
# Same ranking for behaviour code 1.
click_counts = cleaned_data.loc[cleaned_data['behavior_type'] == 1, 'item_id'].value_counts()
top_twenty_click_most = click_counts.iloc[:20]
top_twenty_click_most_df = top_twenty_click_most.rename_axis('item_id').reset_index(name='click_count')
top_twenty_click_most_df
# Stack the two tables row-wise; each keeps its own count column, so the
# other table's count column is NaN on those rows.
combined_df = pd.concat([top_twenty_click_most_df, top_twenty_buy_most_df], axis="index")
combined_df
cleaned_data['user_geohash']
# NOTE(review): this drops rows in place, so every later step that reads
# cleaned_data only sees rows that have a geohash. Confirm that is intended.
cleaned_data.dropna(subset=['user_geohash'], inplace=True)
user_geohash_without_nan = cleaned_data.dropna(subset=['user_geohash'])
user_geohash_without_nan['user_geohash']
# Rank geohash values by row count and keep the twenty most frequent.
user_geohash_most = user_geohash_without_nan['user_geohash'].value_counts()
top_twenty_user_geohash_most = user_geohash_most.head(20)
top_twenty_user_geohash_most
# Feed the chart from the ranking just computed instead of pasted literals,
# so the chart cannot drift out of sync with the data.
user_geohash = [str(code) for code in top_twenty_user_geohash_most.index]
count = top_twenty_user_geohash_most.tolist()
pie = (
    Pie()
    .add("", [list(z) for z in zip(user_geohash, count)])
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
)
pie.render_notebook()
# Take labels and counts from the computed top-20 ranking rather than from a
# second pasted copy of the numbers.
user_geohash = [str(code) for code in top_twenty_user_geohash_most.index]
count = top_twenty_user_geohash_most.tolist()
bar = (
    Bar()
    .add_xaxis(user_geohash)
    .add_yaxis("", count)
    # Swap the axes so the bars are drawn horizontally.
    .reversal_axis()
    .set_global_opts(title_opts=opts.TitleOpts(title="Top 20 User Geohash"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{c}"))
)
bar.render_notebook()
cleaned_data['item_category']
# Rank item categories by row count and keep the twenty most frequent.
item_category_most = cleaned_data['item_category'].value_counts()
top_twenty_item_category_most = item_category_most.head(20)
top_twenty_item_category_most
# (label, count) pairs built from the ranking above instead of pasted
# literals; labels are strings and counts plain ints so they serialise cleanly.
top_twenty_item_category_most_data = [
    (str(category), int(total))
    for category, total in top_twenty_item_category_most.items()
]
pie_category = (
    Pie()
    .add("", top_twenty_item_category_most_data)
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    .set_global_opts(
        legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="85%"),
        title_opts=opts.TitleOpts(title="商品类别分布饼图"),
    )
)
pie_category.render_notebook()
# Horizontal bar chart of the same (label, count) pairs used for the pie.
bar_category = (
    Bar()
    .add_xaxis([label for label, _ in top_twenty_item_category_most_data])
    .add_yaxis("", [total for _, total in top_twenty_item_category_most_data])
    .reversal_axis()
    .set_global_opts(title_opts=opts.TitleOpts(title="商品类别分布条形图"))
)
bar_category.render_notebook()
# Rows with behaviour code 4, then the number of such rows per user.
buy_customers = cleaned_data.loc[cleaned_data['behavior_type'] == 4]
buy_customers
customer_purchase = (
    buy_customers
    .groupby('user_id')
    .size()
    .reset_index(name='purchase_count')
)
# Users at or above the 80th percentile of purchase_count form the top group.
percentile = 20
threshold = np.percentile(customer_purchase['purchase_count'], 100 - percentile)
top_customers = customer_purchase.loc[customer_purchase['purchase_count'] >= threshold]
total_purchase_twenty = top_customers['purchase_count'].sum()
total_purchase_twenty
# Same per-user table again; users strictly below the 80th percentile form
# the remaining group.
buy_customers = cleaned_data.loc[cleaned_data['behavior_type'] == 4]
customer_purchase = (
    buy_customers
    .groupby('user_id')
    .size()
    .reset_index(name='purchase_count')
)
percentile = 80
threshold = np.percentile(customer_purchase['purchase_count'], percentile)
bottom_customers = customer_purchase.loc[customer_purchase['purchase_count'] < threshold]
total_purchase_eighty = bottom_customers['purchase_count'].sum()
total_purchase_eighty
# Each group's share of all purchases, in percent.
percentage_eighty = total_purchase_eighty / (total_purchase_twenty + total_purchase_eighty) * 100
percentage_eighty
percentage_twenty = total_purchase_twenty / (total_purchase_twenty + total_purchase_eighty) * 100
percentage_twenty
# Pie chart of each customer group's share of purchases.
pie_customers = (
    Pie()
    .add(
        "",
        # percentage_twenty is the share bought by the top group and
        # percentage_eighty the share bought by everyone else; the two values
        # were previously attached to the opposite labels.
        [("前20%的核心客户", percentage_twenty), ("其他客户", percentage_eighty)],
        label_opts=opts.LabelOpts(
            formatter="{b|{b}}: {d}%\n{hr|}\n",
            rich={
                "b": {"fontSize": 14, "fontWeight": "bold"},
                "hr": {"borderColor": "#aaa", "borderWidth": 0.5, "width": "100%", "height": 0},
            },
        ),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title="购买数量比例"),
        legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="85%"),
    )
    .set_series_opts(tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"))
)
pie_customers.render_notebook()
item_id_most = cleaned_data['item_id']
item_id_most
# Rank items by how many rows mention them and keep the twenty most frequent.
# NOTE(review): no behavior_type filter is applied, so this counts every
# behaviour, while the chart title below speaks of clicks; confirm intent.
item_id_most = cleaned_data['item_id'].value_counts()
top_twenty_item_id_most = item_id_most.head(20)
top_twenty_item_id_most
# (label, count) pairs built from the ranking above instead of pasted
# literals; labels are strings and counts plain ints so they serialise cleanly.
top_twenty_item_id_most_data = [
    (str(item), int(total))
    for item, total in top_twenty_item_id_most.items()
]
item_id_bar = (
    Bar()
    .add_xaxis([x[0] for x in top_twenty_item_id_most_data])
    .add_yaxis("商品数量", [x[1] for x in top_twenty_item_id_most_data])
    # Swap the axes so the bars are drawn horizontally.
    .reversal_axis()
    .set_global_opts(title_opts=opts.TitleOpts(title="点击数量最多的前二十个商品"))
)
item_id_bar.render_notebook()
# Rank items by number of behaviour-code-4 rows and keep the top twenty.
item_buy_most = cleaned_data[cleaned_data['behavior_type'] == 4]['item_id'].value_counts()
top_twenty_item_buy_most = item_buy_most.head(20)
top_twenty_item_buy_most
# (label, count) pairs built from the ranking above instead of pasted
# literals; labels are strings and counts plain ints so they serialise cleanly.
top_twenty_item_buy_most_data = [
    (str(item), int(total))
    for item, total in top_twenty_item_buy_most.items()
]
top_twenty_item_buy_most_data_bar = (
    Bar()
    .add_xaxis([x[0] for x in top_twenty_item_buy_most_data])
    .add_yaxis("商品数量", [x[1] for x in top_twenty_item_buy_most_data])
    # Swap the axes so the bars are drawn horizontally.
    .reversal_axis()
    .set_global_opts(title_opts=opts.TitleOpts(title="购买数量最多的前二十个商品"))
)
top_twenty_item_buy_most_data_bar.render_notebook()