1 导入库
In [1]:
import pandas as pd
import numpy as np
# Visualization-related libraries
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly_express as px
import plotly.tools as tls
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from itertools import cycle
pd.options.display.max_columns = 50
plt.style.use('bmh')
# Fixed list of theme colours, and an endless iterator over them for plots.
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])
2 数据基本信息
2.1 读取数据
In [2]:
# Load the four raw M5 Forecasting files from the working directory.
calendar = pd.read_csv("calendar.csv")            # per-day metadata: dates, events, SNAP flags
stv = pd.read_csv("sales_train_validation.csv")   # daily unit sales per item/store (d_1..d_1913)
sell = pd.read_csv("sell_prices.csv")             # weekly sell price per item/store
ss = pd.read_csv("sample_submission.csv")         # submission template (28 forecast days)
2.2 基本信息
In [3]:
calendar.head() # first 5 rows
查看数据的基本信息:
In [4]:
calendar.shape # (rows, columns)
Out[4]:
(1969, 14)
In [5]:
calendar.isnull().sum() # missing values per column (only the event columns have NaNs)
Out[5]:
date 0
wm_yr_wk 0
weekday 0
wday 0
month 0
year 0
d 0
event_name_1 1807
event_type_1 1807
event_name_2 1964
event_type_2 1964
snap_CA 0
snap_TX 0
snap_WI 0
dtype: int64
In [6]:
calendar.dtypes # column data types
Out[6]:
date object
wm_yr_wk int64
weekday object
wday int64
month int64
year int64
d object
event_name_1 object
event_type_1 object
event_name_2 object
event_type_2 object
snap_CA int64
snap_TX int64
snap_WI int64
dtype: object
info()函数输出数据信息,包含字段名称、非空数量Non-Null Count、字段类型:
In [7]:
calendar.info() # column names, Non-Null Count, dtypes in one view
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 date 1969 non-null object
1 wm_yr_wk 1969 non-null int64
2 weekday 1969 non-null object
3 wday 1969 non-null int64
4 month 1969 non-null int64
5 year 1969 non-null int64
6 d 1969 non-null object
7 event_name_1 162 non-null object
8 event_type_1 162 non-null object
9 event_name_2 5 non-null object
10 event_type_2 5 non-null object
11 snap_CA 1969 non-null int64
12 snap_TX 1969 non-null int64
13 snap_WI 1969 non-null int64
dtypes: int64(7), object(7)
memory usage: 215.5+ KB
In [8]:
stv.head() # first 5 sales series: id columns plus d_1..d_1913 daily sales
Out[8]:
| id | item_id | dept_id | cat_id | store_id | state_id | d_1 | d_2 | d_3 | d_4 | d_5 | d_6 | d_7 | d_8 | d_9 | d_10 | d_11 | d_12 | d_13 | d_14 | d_15 | d_16 | d_17 | d_18 | d_19 | ... | d_1889 | d_1890 | d_1891 | d_1892 | d_1893 | d_1894 | d_1895 | d_1896 | d_1897 | d_1898 | d_1899 | d_1900 | d_1901 | d_1902 | d_1903 | d_1904 | d_1905 | d_1906 | d_1907 | d_1908 | d_1909 | d_1910 | d_1911 | d_1912 | d_1913 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HOBBIES_1_001_CA_1_validation | HOBBIES_1_001 | HOBBIES_1 | HOBBIES | CA_1 | CA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 4 | 2 | 3 | 0 | 1 | 2 | 0 | 0 | 0 | 1 | 1 | 3 | 0 | 1 | 1 | 1 | 3 | 0 | 1 | 1 |
| 1 | HOBBIES_1_002_CA_1_validation | HOBBIES_1_002 | HOBBIES_1 | HOBBIES | CA_1 | CA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 2 | HOBBIES_1_003_CA_1_validation | HOBBIES_1_003 | HOBBIES_1 | HOBBIES | CA_1 | CA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 2 | 1 | 2 | 1 | 1 | 1 | 0 | 1 | 1 | 1 |
| 3 | HOBBIES_1_004_CA_1_validation | HOBBIES_1_004 | HOBBIES_1 | HOBBIES | CA_1 | CA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 3 | 1 | 2 | 1 | 3 | 1 | 0 | 2 | 5 | 4 | 2 | 0 | 3 | 0 | 1 | 0 | 5 | 4 | 1 | 0 | 1 | 3 | 7 | 2 |
| 4 | HOBBIES_1_005_CA_1_validation | HOBBIES_1_005 | HOBBIES_1 | HOBBIES | CA_1 | CA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 4 | 0 | 1 | 4 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 2 | 0 | 1 | 1 | 2 | 1 | 1 | 0 | 1 | 1 | 2 | 2 | 2 | 4 |
5 rows × 1919 columns
In [9]:
stv.shape # 30,490 series x 1,919 columns
Out[9]:
(30490, 1919)
In [10]:
stv["id"].value_counts() # every id is unique (count == 1 for all 30,490 rows)
Out[10]:
id
HOBBIES_1_001_CA_1_validation 1
FOODS_2_202_TX_3_validation 1
FOODS_2_214_TX_3_validation 1
FOODS_2_213_TX_3_validation 1
FOODS_2_212_TX_3_validation 1
..
HOUSEHOLD_1_457_CA_4_validation 1
HOUSEHOLD_1_456_CA_4_validation 1
HOUSEHOLD_1_455_CA_4_validation 1
HOUSEHOLD_1_454_CA_4_validation 1
FOODS_3_827_WI_3_validation 1
Name: count, Length: 30490, dtype: int64
In [11]:
stv["item_id"].value_counts() # each item appears 10 times — once per store
Out[11]:
item_id
HOBBIES_1_001 10
FOODS_2_210 10
FOODS_2_201 10
FOODS_2_202 10
FOODS_2_203 10
..
HOUSEHOLD_1_464 10
HOUSEHOLD_1_465 10
HOUSEHOLD_1_466 10
HOUSEHOLD_1_467 10
FOODS_3_827 10
Name: count, Length: 3049, dtype: int64
In [12]:
stv.dtypes # id columns are object, all d_* sales columns are int64
Out[12]:
id object
item_id object
dept_id object
cat_id object
store_id object
...
d_1909 int64
d_1910 int64
d_1911 int64
d_1912 int64
d_1913 int64
Length: 1919, dtype: object
In [13]:
sell.head() # weekly price records: store, item, week id, price
Out[13]:
| store_id | item_id | wm_yr_wk | sell_price | |
|---|---|---|---|---|
| 0 | CA_1 | HOBBIES_1_001 | 11325 | 9.58 |
| 1 | CA_1 | HOBBIES_1_001 | 11326 | 9.58 |
| 2 | CA_1 | HOBBIES_1_001 | 11327 | 8.26 |
| 3 | CA_1 | HOBBIES_1_001 | 11328 | 8.26 |
| 4 | CA_1 | HOBBIES_1_001 | 11329 | 8.26 |
不同的item_id数量统计:
In [14]:
sell["item_id"].value_counts() # price records per item — varies, so items enter the catalog at different times
Out[14]:
item_id
FOODS_3_587 2820
HOUSEHOLD_1_177 2820
HOUSEHOLD_2_283 2820
HOBBIES_1_337 2820
FOODS_1_032 2820
...
HOUSEHOLD_1_308 642
HOUSEHOLD_1_159 623
HOUSEHOLD_1_242 600
FOODS_3_296 592
FOODS_2_379 530
Name: count, Length: 3049, dtype: int64
In [15]:
sell.shape # ~6.8M weekly price rows
Out[15]:
(6841121, 4)
In [16]:
sell.dtypes # sell_price is the only float column
Out[16]:
store_id object
item_id object
wm_yr_wk int64
sell_price float64
dtype: object
In [17]:
ss.head() # submission template: one row per id, F1..F28 forecast slots
Out[17]:
| id | F1 | F2 | F3 | F4 | F5 | F6 | F7 | F8 | F9 | F10 | F11 | F12 | F13 | F14 | F15 | F16 | F17 | F18 | F19 | F20 | F21 | F22 | F23 | F24 | F25 | F26 | F27 | F28 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | HOBBIES_1_001_CA_1_validation | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | HOBBIES_1_002_CA_1_validation | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | HOBBIES_1_003_CA_1_validation | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | HOBBIES_1_004_CA_1_validation | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | HOBBIES_1_005_CA_1_validation | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
In [18]:
ss.shape # 60,980 rows = 30,490 ids x (validation + evaluation)
Out[18]:
(60980, 29)
3 数据探索性分析EDA
3.1 单个item_id数据可视化
In [19]:
stv.columns # 6 id columns followed by the 1,913 daily sales columns
Out[19]:
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1', 'd_2', 'd_3', 'd_4', ... 'd_1904', 'd_1905', 'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910', 'd_1911', 'd_1912', 'd_1913'],
dtype='object', length=1919)
In [20]:
# Daily sales columns are named d_1 .. d_1913. Use startswith so that only
# true day columns match — a plain substring test ("d_" in c) would also
# pick up any future id column that happened to contain "d_".
d_cols = [c for c in stv.columns if c.startswith("d_")]
d_cols[:10]
Out[20]:
['d_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6', 'd_7', 'd_8', 'd_9', 'd_10']
In [21]:
len(d_cols) # 1,913 training days
Out[21]:
1913
In [22]:
stv.loc[stv["id"] == "FOODS_3_090_CA_3_validation"].set_index("id") # set the row index to id
Out[22]:
| item_id | dept_id | cat_id | store_id | state_id | d_1 | d_2 | d_3 | d_4 | d_5 | d_6 | d_7 | d_8 | d_9 | d_10 | d_11 | d_12 | d_13 | d_14 | d_15 | d_16 | d_17 | d_18 | d_19 | d_20 | ... | d_1889 | d_1890 | d_1891 | d_1892 | d_1893 | d_1894 | d_1895 | d_1896 | d_1897 | d_1898 | d_1899 | d_1900 | d_1901 | d_1902 | d_1903 | d_1904 | d_1905 | d_1906 | d_1907 | d_1908 | d_1909 | d_1910 | d_1911 | d_1912 | d_1913 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||||||||||||||||||||||||||||||||
| FOODS_3_090_CA_3_validation | FOODS_3_090 | FOODS_3 | FOODS | CA_3 | CA | 108 | 132 | 102 | 120 | 106 | 123 | 279 | 175 | 186 | 120 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 130 | 156 | 133 | 114 | 80 | 75 | 113 | 81 | 107 | 160 | 124 | 107 | 84 | 88 | 90 | 129 | 160 | 204 | 86 | 100 | 88 | 77 | 141 | 139 | 130 |
1 rows × 1918 columns
基于一个id号进行数据展示:
In [23]:
# 选择一个id号进行展示
stv.loc[stv["id"] == "FOODS_3_090_CA_3_validation"].set_index("id")[d_cols].T.plot(figsize=(15,5),
title="FOODS_3_090_CA_3",
color=next(color_cycle))
plt.legend('')
plt.show()
3.2 数据合并可视化
In [24]:
calendar[["d","date","event_name_1","event_name_2","event_type_1","event_type_2","snap_CA"]].head() # the calendar columns we will join on / inspect
Out[24]:
| d | date | event_name_1 | event_name_2 | event_type_1 | event_type_2 | snap_CA | |
|---|---|---|---|---|---|---|---|
| 0 | d_1 | 2011-01-29 | NaN | NaN | NaN | NaN | 0 |
| 1 | d_2 | 2011-01-30 | NaN | NaN | NaN | NaN | 0 |
| 2 | d_3 | 2011-01-31 | NaN | NaN | NaN | NaN | 0 |
| 3 | d_4 | 2011-02-01 | NaN | NaN | NaN | NaN | 1 |
| 4 | d_5 | 2011-02-02 | NaN | NaN | NaN | NaN | 1 |
In [25]:
stv.loc[stv["id"] == "FOODS_3_090_CA_3_validation"] # the raw row for this id (row index 8412)
Out[25]:
| id | item_id | dept_id | cat_id | store_id | state_id | d_1 | d_2 | d_3 | d_4 | d_5 | d_6 | d_7 | d_8 | d_9 | d_10 | d_11 | d_12 | d_13 | d_14 | d_15 | d_16 | d_17 | d_18 | d_19 | ... | d_1889 | d_1890 | d_1891 | d_1892 | d_1893 | d_1894 | d_1895 | d_1896 | d_1897 | d_1898 | d_1899 | d_1900 | d_1901 | d_1902 | d_1903 | d_1904 | d_1905 | d_1906 | d_1907 | d_1908 | d_1909 | d_1910 | d_1911 | d_1912 | d_1913 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8412 | FOODS_3_090_CA_3_validation | FOODS_3_090 | FOODS_3 | FOODS | CA_3 | CA | 108 | 132 | 102 | 120 | 106 | 123 | 279 | 175 | 186 | 120 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 130 | 156 | 133 | 114 | 80 | 75 | 113 | 81 | 107 | 160 | 124 | 107 | 84 | 88 | 90 | 129 | 160 | 204 | 86 | 100 | 88 | 77 | 141 | 139 | 130 |
1 rows × 1919 columns
In [26]:
# Keep only the daily sales and transpose: one row per day, one column (labelled 8412).
test = stv.loc[stv["id"] == "FOODS_3_090_CA_3_validation"][d_cols].T
test
Out[26]:
| 8412 | |
|---|---|
| d_1 | 108 |
| d_2 | 132 |
| d_3 | 102 |
| d_4 | 120 |
| d_5 | 106 |
| ... | ... |
| d_1909 | 88 |
| d_1910 | 77 |
| d_1911 | 141 |
| d_1912 | 139 |
| d_1913 | 130 |
1913 rows × 1 columns
In [27]:
# 字段的重新命名
# Rename the single sales column (labelled with the stv row index, 8412 for
# this id). Deriving the key from test.columns keeps this robust — a rename
# with a stale hard-coded index label would silently do nothing.
test = test.rename(columns={test.columns[0]: "FOODS_3_090_CA_3"})
将d_1、d_2...d_1913变成列属性:
In [28]:
# Move the d_1..d_1913 labels out of the index into a "d" column for merging.
test = test.reset_index().rename(columns={"index":"d"})
test
Out[28]:
| d | FOODS_3_090_CA_3 | |
|---|---|---|
| 0 | d_1 | 108 |
| 1 | d_2 | 132 |
| 2 | d_3 | 102 |
| 3 | d_4 | 120 |
| 4 | d_5 | 106 |
| ... | ... | ... |
| 1908 | d_1909 | 88 |
| 1909 | d_1910 | 77 |
| 1910 | d_1911 | 141 |
| 1911 | d_1912 | 139 |
| 1912 | d_1913 | 130 |
1913 rows × 2 columns
查看两个DataFrame数据的字段信息:
In [29]:
# Both frames share the "d" column — that is the merge key below.
print(test.columns)
print(calendar.columns)
Index(['d', 'FOODS_3_090_CA_3'], dtype='object')
Index(['date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'd',
'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
'snap_CA', 'snap_TX', 'snap_WI'],
dtype='object')
根据相同的字段d进行数据的合并:
In [30]:
# Left-join the calendar on the shared "d" column; validate='1:1' asserts
# the key is unique on both sides.
test = test.merge(calendar, how="left", validate='1:1')
test.head()
Out[30]:
| d | FOODS_3_090_CA_3 | date | wm_yr_wk | weekday | wday | month | year | event_name_1 | event_type_1 | event_name_2 | event_type_2 | snap_CA | snap_TX | snap_WI | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | d_1 | 108 | 2011-01-29 | 11101 | Saturday | 1 | 1 | 2011 | NaN | NaN | NaN | NaN | 0 | 0 | 0 |
| 1 | d_2 | 132 | 2011-01-30 | 11101 | Sunday | 2 | 1 | 2011 | NaN | NaN | NaN | NaN | 0 | 0 | 0 |
| 2 | d_3 | 102 | 2011-01-31 | 11101 | Monday | 3 | 1 | 2011 | NaN | NaN | NaN | NaN | 0 | 0 | 0 |
| 3 | d_4 | 120 | 2011-02-01 | 11101 | Tuesday | 4 | 2 | 2011 | NaN | NaN | NaN | NaN | 1 | 1 | 0 |
| 4 | d_5 | 106 | 2011-02-02 | 11101 | Wednesday | 5 | 2 | 2011 | NaN | NaN | NaN | NaN | 1 | 0 | 1 |
In [31]:
# Same series, now plotted against real calendar dates instead of d_* labels.
sales_by_date = test.set_index("date")["FOODS_3_090_CA_3"]
sales_by_date.plot(figsize=(15,5),
                   color=next(color_cycle),
                   title="FOODS_3_090_CA_3 sales by actual sale dates")
plt.show()
完整过程:
In [32]:
# Repeat the whole pipeline for another id: FOODS_3_587_CA_1_validation
test2 = stv.loc[stv["id"] == "FOODS_3_587_CA_1_validation"][d_cols].T  # extract and transpose
# Positional rename (row-index label is 2811 for this id) — robust against a
# changed index, unlike a hard-coded {2811: ...} mapping which fails silently.
test2 = test2.rename(columns={test2.columns[0]: "FOODS_3_587_CA_1"})
test2 = test2.reset_index().rename(columns={"index":"d"})  # index -> "d" column
test2 = test2.merge(calendar, how="left", validate="1:1")  # join calendar on "d" (validate added for consistency)
# Plot
test2.set_index("date")["FOODS_3_587_CA_1"].plot(figsize=(15,5),
                                                 color=next(color_cycle),
                                                 title="FOODS_3_587_CA_1 sales by actual sale dates"
                                                 )
plt.show()
另一个案例:
In [33]:
# Same pipeline for a third id (plotting left disabled, as in the original).
test3 = stv.loc[stv['id'] == 'HOUSEHOLD_1_118_CA_3_validation'][d_cols].T
# Positional rename (row-index label is 6776 for this id) — avoids a silent
# no-op if the hard-coded index label ever drifts.
test3 = test3.rename(columns={test3.columns[0]: 'HOUSEHOLD_1_118_CA_3'})
test3 = test3.reset_index().rename(columns={'index': 'd'})
test3 = test3.merge(calendar, how='left', validate='1:1')
# # Plot (disabled)
# test3.set_index("date")["HOUSEHOLD_1_118_CA_3"].plot(figsize=(15,5),
#                                                      color=next(color_cycle),
#                                                      title="HOUSEHOLD_1_118_CA_3 sales by actual sale dates"
#                                                      )
# plt.show()
3.3 基于时间变量可视化
涉及到的时间变量:Day of week、Month、Year
In [34]:
tests = ["FOODS_3_090_CA_3", "FOODS_3_587_CA_1", "HOUSEHOLD_1_118_CA_3"]
test_df = [test, test2, test3]
for i in [0, 1, 2]:
    # One row of three panels per item: average sales by weekday / month / year.
    fig, axes = plt.subplots(1, 3, figsize=(16, 3))
    panels = [
        ("wday",  "Average Sale: Day of week", color_pal[0]),
        ("month", "Average Sale: Month",       color_pal[1]),
        ("year",  "Average Sale: Year",        color_pal[2]),
    ]
    for ax, (group_col, panel_title, panel_color) in zip(axes, panels):
        # Aggregate (groupby + mean) then draw as a line on the matching axes.
        test_df[i].groupby(group_col)[tests[i]].mean().plot(kind="line",
                                                            title=panel_title,
                                                            lw=5,
                                                            color=panel_color,
                                                            ax=ax)
    fig.suptitle(f'Trends for item: {tests[i]}', size=20, y=1.1)
    plt.tight_layout()
    plt.show()
如何理解前面的聚合统计:以第一个test_df[0] = test为例说明:
In [35]:
test.groupby("wday")["FOODS_3_090_CA_3"].mean() # the aggregation behind panel 1 above: mean sales per weekday
Out[35]:
wday
1 175.091241
2 143.956204
3 118.219780
4 105.241758
5 107.490842
6 117.087912
7 149.333333
Name: FOODS_3_090_CA_3, dtype: float64
3.4 不同item_id可视化
绘制20种不同item_id的销售情况:
In [36]:
# Preview (disabled): sample 20 random series, index by id, keep the d_*
# columns, then transpose — done for real in the cells below.
# stv.sample(20, random_state=42).set_index("id")[d_cols].T
In [37]:
calendar.set_index("d")["date"] # the d_* -> date mapping used to relabel sales rows
Out[37]:
d
d_1 2011-01-29
d_2 2011-01-30
d_3 2011-01-31
d_4 2011-02-01
d_5 2011-02-02
...
d_1965 2016-06-15
d_1966 2016-06-16
d_1967 2016-06-17
d_1968 2016-06-18
d_1969 2016-06-19
Name: date, Length: 1969, dtype: object
In [38]:
# 20 random series, transposed to days-as-rows, joined to real calendar dates.
sampled_t = stv.sample(20, random_state=42).set_index("id")[d_cols].T
day_dates = calendar.set_index("d")["date"]
twenty_tests = sampled_t.merge(day_dates,
                               left_index=True,
                               right_index=True,
                               validate="1:1").set_index("date")
In [39]:
fig, axs = plt.subplots(10, 2, figsize=(15, 20))  # 10x2 grid, 15x20 inches
# Pair each sampled series with one flattened axes slot, in order.
for ax, item in zip(axs.flatten(), twenty_tests.columns):
    twenty_tests[item].plot(title=item,
                            color=next(color_cycle),  # next theme colour for this panel
                            ax=ax)
plt.tight_layout()
plt.show()
3.5 基于不同cat_id结合时间的可视化
不同的类型cat_id包含:
- Hobbies
- Household
- Foods
具体有哪些不同的类型:
In [40]:
stv["cat_id"].unique() # the three categories, in order of first appearance
Out[40]:
array(['HOBBIES', 'HOUSEHOLD', 'FOODS'], dtype=object)
不同类型的数量对比:
In [41]:
stv["cat_id"].value_counts() # series count per category, descending
Out[41]:
cat_id
FOODS 14370
HOUSEHOLD 10470
HOBBIES 5650
Name: count, dtype: int64
基于matplotlib的实现:
In [42]:
# Horizontal bar chart of series counts per category ("Category" typo fixed in the title).
stv["cat_id"].value_counts().plot(kind="barh", figsize=(15,5),title="Count of Items by Category")
plt.show()
基于plotly_express的实现:
In [43]:
# Category counts as a plotly bar chart.
# BUG FIX: the original paired x = cat_id.unique() (appearance order:
# HOBBIES, HOUSEHOLD, FOODS) with y = cat_id.value_counts() (descending
# order: FOODS, HOUSEHOLD, HOBBIES), so every bar carried the wrong label.
# Deriving both axes from the same Series keeps labels and values aligned.
cat_counts = stv["cat_id"].value_counts()
data = [go.Bar(
            x = cat_counts.index,             # category names
            y = cat_counts.values,            # matching counts
            marker = dict(colorscale = "Jet", # colour settings
                          color = cat_counts.values),
            text = cat_counts.values          # value displayed on each bar
)]
# Title set through the Layout
layout = go.Layout(
    title = "不同类型数量统计"
)
fig = go.Figure(data=data, layout=layout)
fig.show()
绘制水平柱状图:
In [44]:
# Horizontal variant — same alignment fix: the original mixed unique()
# (appearance order) with value_counts() (descending order), mislabelling
# the bars. Build both axes from one value_counts Series.
cat_counts = stv["cat_id"].value_counts()
data = [go.Bar(
            x = cat_counts.values,            # counts along the x axis
            y = cat_counts.index,             # category names along the y axis
            marker = dict(colorscale = "Jet", # colour settings
                          color = cat_counts.values),
            orientation = 'h',
            text = cat_counts.values          # value displayed on each bar
)]
# Title set through the Layout
layout = go.Layout(
    title = "不同类型数量统计"
)
fig = go.Figure(data=data, layout=layout)
fig.show()
将calendar的索引设置为d,提取date字段信息:
In [45]:
calendar.set_index("d")["date"] # d_* -> date mapping, reused for the full merge below
Out[45]:
d
d_1 2011-01-29
d_2 2011-01-30
d_3 2011-01-31
d_4 2011-02-01
d_5 2011-02-02
...
d_1965 2016-06-15
d_1966 2016-06-16
d_1967 2016-06-17
d_1968 2016-06-18
d_1969 2016-06-19
Name: date, Length: 1969, dtype: object
转置之后再和上面的数据进行合并:
In [46]:
# All 30,490 series transposed (one row per day) and re-indexed by real dates.
all_series_t = stv.set_index("id")[d_cols].T
past_sales = all_series_t.merge(calendar.set_index("d")["date"],
                                left_index=True,
                                right_index=True,
                                validate="1:1").set_index("date")
In [47]:
stv["cat_id"].unique() # category prefixes used to slice past_sales columns below
Out[47]:
array(['HOBBIES', 'HOUSEHOLD', 'FOODS'], dtype=object)
以不同的种类cat_id作为前缀:
In [48]:
past_sales.columns # one column per series id; the id embeds the cat_id prefix
Out[48]:
Index(['HOBBIES_1_001_CA_1_validation', 'HOBBIES_1_002_CA_1_validation', 'HOBBIES_1_003_CA_1_validation', 'HOBBIES_1_004_CA_1_validation', 'HOBBIES_1_005_CA_1_validation', 'HOBBIES_1_006_CA_1_validation', 'HOBBIES_1_007_CA_1_validation', 'HOBBIES_1_008_CA_1_validation', 'HOBBIES_1_009_CA_1_validation', 'HOBBIES_1_010_CA_1_validation', ... 'FOODS_3_818_WI_3_validation', 'FOODS_3_819_WI_3_validation', 'FOODS_3_820_WI_3_validation', 'FOODS_3_821_WI_3_validation', 'FOODS_3_822_WI_3_validation', 'FOODS_3_823_WI_3_validation', 'FOODS_3_824_WI_3_validation', 'FOODS_3_825_WI_3_validation', 'FOODS_3_826_WI_3_validation', 'FOODS_3_827_WI_3_validation'],
dtype='object', length=30490)
In [49]:
for category in stv["cat_id"].unique():
    # Columns whose id carries this category prefix.
    category_cols = [c for c in past_sales.columns if category in c]
    # Total daily sales across the whole category, one line per category.
    past_sales[category_cols].sum(axis=1).plot(figsize=(15,5),
                                               alpha=0.8,
                                               title="Total Sales by Item Type")
plt.legend(stv["cat_id"].unique())  # legend in the same category order as the lines
plt.show()
3.6 新品推出可视化
In [50]:
# Clip sales to {0, 1}: 0 = item did not sell that day, 1 = sold at least once.
past_sales_clipped = past_sales.clip(0, 1)
for category in stv["cat_id"].unique():
    category_cols = [c for c in past_sales.columns if category in c]
    # Share of the category's items with >= 1 sale each day, as a percentage.
    pct_selling = past_sales_clipped[category_cols].mean(axis=1) * 100
    pct_selling.plot(figsize=(15, 5),
                     alpha=0.8,
                     title="Inventory Sale Percentage by Date",
                     style=".")
plt.ylabel('Inventory with at least 1 sale')
plt.legend(stv["cat_id"].unique())
plt.show()
3.7 基于Store的销量可视化
In [51]:
store_list = sell["store_id"].unique()
for store in store_list:
    store_cols = [c for c in past_sales.columns if store in c]
    # 90-day rolling mean of the store's total daily sales.
    rolling_total = past_sales[store_cols].sum(axis=1).rolling(90).mean()
    rolling_total.plot(figsize=(15,5),
                       alpha=0.8,
                       title="Rolling 90 Day Average Total Sales(10 Stores)")
plt.legend(store_list)
plt.show()
不同store_list的信息:
In [52]:
fig, axes = plt.subplots(5, 2, figsize=(15, 10), sharex=True)
# One panel per store, paired with the flattened axes grid in order.
for ax, store in zip(axes.flatten(), store_list):
    store_cols = [c for c in past_sales.columns if store in c]  # this store's series
    # 7-day rolling mean smooths out the strong weekly cycle.
    past_sales[store_cols].sum(axis=1).rolling(7).mean().plot(alpha=1,
                                                              ax=ax,
                                                              title=store,
                                                              lw=3,
                                                              color=next(color_cycle))
plt.suptitle("Weekly Sale Trends by Store ID")
plt.tight_layout()
plt.show()
3.8 Sales Heatmap Calendar
In [53]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from datetime import datetime
from dateutil.relativedelta import relativedelta
In [54]:
def calmap(ax, year, data):
    """Draw a calendar heatmap for one year onto ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    year : int — calendar year used for month outlines and the title.
    data : 2-D float array of shape (7, 53): weekday (Mon=0, bottom row)
        by week-of-year column. Values are mapped onto a fixed [-1, 1]
        colour scale, so callers should pass standardized data.

    Notes
    -----
    Days before Jan 1 and after Dec 31 are blanked with NaN. ``data`` is
    copied first so the caller's array is never mutated (the original
    wrote NaNs straight into the argument).
    """
    data = np.asarray(data, dtype=float).copy()  # work on a copy, not the caller's array
    ax.tick_params("x", length=0, labelsize="medium", which="major")
    ax.tick_params("y", length=0, labelsize="x-small", which="major")
    # Month outlines and centred month labels.
    xticks, labels = [], []
    start = datetime(year, 1, 1).weekday()  # weekday of Jan 1 (Mon=0) — shifts day-of-year to week columns
    for month in range(1, 13):
        first = datetime(year, month, 1)
        last = first + relativedelta(months=1, days=-1)  # last day of this month
        y0 = first.weekday()
        y1 = last.weekday()
        # %j = day of year; integer-divide by 7 to get the week column.
        x0 = (int(first.strftime("%j")) + start - 1) // 7
        x1 = (int(last.strftime("%j")) + start - 1) // 7  # fixed stray "+ +" (unary plus) typo
        # Polygon tracing the month boundary on the (week, weekday) grid.
        P = [(x0, y0), (x0, 7), (x1, 7),
             (x1, y1 + 1), (x1 + 1, y1 + 1),
             (x1 + 1, 0), (x0 + 1, 0), (x0 + 1, y0)]
        xticks.append(x0 + (x1 - x0 + 1) / 2)
        labels.append(first.strftime("%b"))
        poly = Polygon(P,
                       edgecolor="black",
                       facecolor="None",
                       linewidth=1,
                       zorder=20,
                       clip_on=False
                       )
        ax.add_artist(poly)
    ax.set_xticks(xticks)
    ax.set_xticklabels(labels)
    ax.set_yticks(0.5 + np.arange(7))
    ax.set_yticklabels(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"])
    ax.set_title("{}".format(year), weight="semibold")
    # Blank the cells before Jan 1 in the first week column...
    valid = datetime(year, 1, 1).weekday()
    data[:valid, 0] = np.nan
    # ...and the cells after Dec 31 (x1 still holds December's last week column).
    valid = datetime(year, 12, 31).weekday()
    data[valid + 1:, x1] = np.nan
    ax.imshow(data,
              extent=[0, 53, 0, 7],
              zorder=10,
              vmin=-1,
              vmax=1,
              cmap="RdYlBu_r",
              origin="lower",
              alpha=0.75)
In [55]:
# Lowest- and highest-selling dates across all series.
# Sort once and reuse (the original re-computed the full sum + sort twice),
# and FIX: the second message said "lowest" while actually reporting the max.
daily_totals = past_sales.sum(axis=1).sort_values()
print('The lowest sale date was:',
      daily_totals.index[0],
      'with',
      daily_totals.values[0],
      'sales')
print('The highest sale date was:',
      daily_totals.index[-1],
      'with',
      daily_totals.values[-1],
      'sales')
The lowest sale date was: 2012-12-25 with 11 sales
The lowest sale date was: 2016-03-06 with 57218 sales
In [56]:
from sklearn.preprocessing import StandardScaler
# NOTE(review): this rebinds `ss`, which has held the sample_submission
# DataFrame since In [2] — the submission frame is unreachable from here on.
# Consider renaming this to `scaler` (and updating the calmap loop below).
ss = StandardScaler()
In [57]:
# 将索引改成时间类型的数据
past_sales.index = pd.to_datetime(past_sales.index)
In [58]:
# One calendar heatmap per year (2013-2015) for each category. Each 371-day
# window starts on the Monday on/before Jan 1 so it reshapes cleanly into
# 53 weeks x 7 days. (Refactored: the original repeated the same block
# three times with only the year and start date changed.)
year_windows = [(2013, "31-Dec-2012"), (2014, "30-Dec-2013"), (2015, "29-Dec-2014")]
for i in stv["cat_id"].unique():
    fig, axes = plt.subplots(3, 1, figsize=(20, 8))
    items_col = [c for c in past_sales.columns if i in c]
    for ax, (year, window_start) in zip(axes, year_windows):
        window = pd.date_range(window_start, periods=371)
        year_sales = past_sales.loc[past_sales.index.isin(window)][items_col].mean(axis=1)
        # Standardize so the fixed [-1, 1] colour scale in calmap is meaningful.
        vals = np.hstack(ss.fit_transform(year_sales.values.reshape(-1, 1)))
        calmap(ax, year, vals.reshape(53, 7).T)
    plt.suptitle(i, fontsize=30, x=0.4, y=1.01)
    plt.tight_layout()
    plt.show()
3.9 Sale Prices
In [59]:
sell.head() # weekly price records again, now for the price analysis
Out[59]:
| store_id | item_id | wm_yr_wk | sell_price | |
|---|---|---|---|---|
| 0 | CA_1 | HOBBIES_1_001 | 11325 | 9.58 |
| 1 | CA_1 | HOBBIES_1_001 | 11326 | 9.58 |
| 2 | CA_1 | HOBBIES_1_001 | 11327 | 8.26 |
| 3 | CA_1 | HOBBIES_1_001 | 11328 | 8.26 |
| 4 | CA_1 | HOBBIES_1_001 | 11329 | 8.26 |
In [60]:
sell["item_id"].value_counts() # same check as In [14]: price-record counts vary by item
Out[60]:
item_id
FOODS_3_587 2820
HOUSEHOLD_1_177 2820
HOUSEHOLD_2_283 2820
HOBBIES_1_337 2820
FOODS_1_032 2820
...
HOUSEHOLD_1_308 642
HOUSEHOLD_1_159 623
HOUSEHOLD_1_242 600
FOODS_3_296 592
FOODS_2_379 530
Name: count, Length: 3049, dtype: int64
In [61]:
sell.query("item_id == 'FOODS_3_090'").groupby("store_id") # lazy GroupBy object — iterated over in the next cell
Out[61]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E00D57A0D0>
In [62]:
fig,ax = plt.subplots(figsize=(15,5)) # create the figure fig and axes ax
stores = [] # collect each store's name for the legend
for store, d in sell.query("item_id == 'FOODS_3_090'").groupby("store_id"): # one price series per store, all drawn on the same axes
    # NOTE(review): `legend=store` passes a (truthy) string where pandas
    # expects a bool — the real legend is set from `stores` below anyway.
    d.plot(x="wm_yr_wk",
           y="sell_price",
           style=".",
           color=next(color_cycle),
           figsize=(15,5),
           title="FOODS_3_090 sale price over time",
           ax=ax,
           legend=store # legend setting (see note above)
           )
    stores.append(store)
plt.legend(stores) # add the legend
plt.show()
FOODS、HOBBIES、HOUSEHOLD三种不同类型的价格分布:
In [63]:
sell["Category"] = sell["item_id"].str.split("_", expand=True)[0] # first token of item_id is the category
sell
Out[63]:
| store_id | item_id | wm_yr_wk | sell_price | Category | |
|---|---|---|---|---|---|
| 0 | CA_1 | HOBBIES_1_001 | 11325 | 9.58 | HOBBIES |
| 1 | CA_1 | HOBBIES_1_001 | 11326 | 9.58 | HOBBIES |
| 2 | CA_1 | HOBBIES_1_001 | 11327 | 8.26 | HOBBIES |
| 3 | CA_1 | HOBBIES_1_001 | 11328 | 8.26 | HOBBIES |
| 4 | CA_1 | HOBBIES_1_001 | 11329 | 8.26 | HOBBIES |
| ... | ... | ... | ... | ... | ... |
| 6841116 | WI_3 | FOODS_3_827 | 11617 | 1.00 | FOODS |
| 6841117 | WI_3 | FOODS_3_827 | 11618 | 1.00 | FOODS |
| 6841118 | WI_3 | FOODS_3_827 | 11619 | 1.00 | FOODS |
| 6841119 | WI_3 | FOODS_3_827 | 11620 | 1.00 | FOODS |
| 6841120 | WI_3 | FOODS_3_827 | 11621 | 1.00 | FOODS |
6841121 rows × 5 columns
In [64]:
# Price distribution per category, on a log1p scale.
fig, axs = plt.subplots(1, 3, figsize=(15, 4))
# enumerate replaces the original's manually-incremented counter, and
# np.log1p applied to the whole Series is vectorized (the original used the
# much slower element-wise Series.apply(np.log1p) — same values either way).
for i, (cat, d) in enumerate(sell.groupby("Category")):
    ax = np.log1p(d["sell_price"]).plot(kind="hist",
                                        bins=20,
                                        title=f"Distribution of {cat} prices",
                                        ax=axs[i],
                                        color=next(color_cycle))
    ax.set_xlabel("Log(price)")
plt.tight_layout()
plt.show()