销售数据多维度可视化分析

131 阅读20分钟

1 导入库

In [1]:

import pandas as pd
import numpy as np

# 可视化相关库
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly_express as px
import plotly.tools as tls
from matplotlib import pyplot as plt
%matplotlib inline

import seaborn as sns
from itertools import cycle

pd.options.display.max_columns = 50
plt.style.use('bmh')
color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])

2 数据基本信息

2.1 读取数据

In [2]:

calendar = pd.read_csv("calendar.csv")
stv = pd.read_csv("sales_train_validation.csv")

sell = pd.read_csv("sell_prices.csv")
ss = pd.read_csv("sample_submission.csv")

2.2 基本信息

In [3]:

calendar.head()  #  前5行数据

查看数据的基本信息:

In [4]:

calendar.shape  # 数据行列数

Out[4]:

(1969, 14)

In [5]:

calendar.isnull().sum()  # 数据缺失值

Out[5]:

date               0
wm_yr_wk           0
weekday            0
wday               0
month              0
year               0
d                  0
event_name_1    1807
event_type_1    1807
event_name_2    1964
event_type_2    1964
snap_CA            0
snap_TX            0
snap_WI            0
dtype: int64

In [6]:

calendar.dtypes  # 数据字段类型

Out[6]:

date            object
wm_yr_wk         int64
weekday         object
wday             int64
month            int64
year             int64
d               object
event_name_1    object
event_type_1    object
event_name_2    object
event_type_2    object
snap_CA          int64
snap_TX          int64
snap_WI          int64
dtype: object

info()函数输出数据信息,包含字段名称、非空数量NoN-Null Count、字段类型:

In [7]:

calendar.info()  # 字段名称、非空数量NoN-Null Count、字段类型
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1969 entries, 0 to 1968
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          1969 non-null   object
 1   wm_yr_wk      1969 non-null   int64 
 2   weekday       1969 non-null   object
 3   wday          1969 non-null   int64 
 4   month         1969 non-null   int64 
 5   year          1969 non-null   int64 
 6   d             1969 non-null   object
 7   event_name_1  162 non-null    object
 8   event_type_1  162 non-null    object
 9   event_name_2  5 non-null      object
 10  event_type_2  5 non-null      object
 11  snap_CA       1969 non-null   int64 
 12  snap_TX       1969 non-null   int64 
 13  snap_WI       1969 non-null   int64 
dtypes: int64(7), object(7)
memory usage: 215.5+ KB

In [8]:

stv.head()

Out[8]:

iditem_iddept_idcat_idstore_idstate_idd_1d_2d_3d_4d_5d_6d_7d_8d_9d_10d_11d_12d_13d_14d_15d_16d_17d_18d_19...d_1889d_1890d_1891d_1892d_1893d_1894d_1895d_1896d_1897d_1898d_1899d_1900d_1901d_1902d_1903d_1904d_1905d_1906d_1907d_1908d_1909d_1910d_1911d_1912d_1913
0HOBBIES_1_001_CA_1_validationHOBBIES_1_001HOBBIES_1HOBBIESCA_1CA0000000000000000000...0001042301200011301113011
1HOBBIES_1_002_CA_1_validationHOBBIES_1_002HOBBIES_1HOBBIESCA_1CA0000000000000000000...0000000000000000000010000
2HOBBIES_1_003_CA_1_validationHOBBIES_1_003HOBBIES_1HOBBIESCA_1CA0000000000000000000...0001000100000122121110111
3HOBBIES_1_004_CA_1_validationHOBBIES_1_004HOBBIES_1HOBBIESCA_1CA0000000000000000000...0312131025420301054101372
4HOBBIES_1_005_CA_1_validationHOBBIES_1_005HOBBIES_1HOBBIESCA_1CA0000000000000000000...4014010101120112110112224

5 rows × 1919 columns

In [9]:

stv.shape

Out[9]:

(30490, 1919)

In [10]:

stv["id"].value_counts()

Out[10]:

id
HOBBIES_1_001_CA_1_validation      1
FOODS_2_202_TX_3_validation        1
FOODS_2_214_TX_3_validation        1
FOODS_2_213_TX_3_validation        1
FOODS_2_212_TX_3_validation        1
                                  ..
HOUSEHOLD_1_457_CA_4_validation    1
HOUSEHOLD_1_456_CA_4_validation    1
HOUSEHOLD_1_455_CA_4_validation    1
HOUSEHOLD_1_454_CA_4_validation    1
FOODS_3_827_WI_3_validation        1
Name: count, Length: 30490, dtype: int64

In [11]:

stv["item_id"].value_counts()

Out[11]:

item_id
HOBBIES_1_001      10
FOODS_2_210        10
FOODS_2_201        10
FOODS_2_202        10
FOODS_2_203        10
                   ..
HOUSEHOLD_1_464    10
HOUSEHOLD_1_465    10
HOUSEHOLD_1_466    10
HOUSEHOLD_1_467    10
FOODS_3_827        10
Name: count, Length: 3049, dtype: int64

In [12]:

stv.dtypes

Out[12]:

id          object
item_id     object
dept_id     object
cat_id      object
store_id    object
             ...  
d_1909       int64
d_1910       int64
d_1911       int64
d_1912       int64
d_1913       int64
Length: 1919, dtype: object

In [13]:

sell.head()

Out[13]:

store_iditem_idwm_yr_wksell_price
0CA_1HOBBIES_1_001113259.58
1CA_1HOBBIES_1_001113269.58
2CA_1HOBBIES_1_001113278.26
3CA_1HOBBIES_1_001113288.26
4CA_1HOBBIES_1_001113298.26

不同的item_id数量统计:

In [14]:

sell["item_id"].value_counts()

Out[14]:

item_id
FOODS_3_587        2820
HOUSEHOLD_1_177    2820
HOUSEHOLD_2_283    2820
HOBBIES_1_337      2820
FOODS_1_032        2820
                   ... 
HOUSEHOLD_1_308     642
HOUSEHOLD_1_159     623
HOUSEHOLD_1_242     600
FOODS_3_296         592
FOODS_2_379         530
Name: count, Length: 3049, dtype: int64

In [15]:

sell.shape

Out[15]:

(6841121, 4)

In [16]:

sell.dtypes

Out[16]:

store_id       object
item_id        object
wm_yr_wk        int64
sell_price    float64
dtype: object

In [17]:

ss.head()

Out[17]:

idF1F2F3F4F5F6F7F8F9F10F11F12F13F14F15F16F17F18F19F20F21F22F23F24F25F26F27F28
0HOBBIES_1_001_CA_1_validation0000000000000000000000000000
1HOBBIES_1_002_CA_1_validation0000000000000000000000000000
2HOBBIES_1_003_CA_1_validation0000000000000000000000000000
3HOBBIES_1_004_CA_1_validation0000000000000000000000000000
4HOBBIES_1_005_CA_1_validation0000000000000000000000000000

In [18]:

ss.shape

Out[18]:

(60980, 29)

3 数据探索性分析EDA

3.1 单个item_id数据可视化

In [19]:

stv.columns

Out[19]:

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',       'd_2', 'd_3', 'd_4',       ...       'd_1904', 'd_1905', 'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910',       'd_1911', 'd_1912', 'd_1913'],
      dtype='object', length=1919)

In [20]:

d_cols = [c for c in stv.columns if "d_" in c]
d_cols[:10]

Out[20]:

['d_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6', 'd_7', 'd_8', 'd_9', 'd_10']

In [21]:

len(d_cols)

Out[21]:

1913

In [22]:

stv.loc[stv["id"] == "FOODS_3_090_CA_3_validation"].set_index("id")   # 设置行索引为id

Out[22]:

item_iddept_idcat_idstore_idstate_idd_1d_2d_3d_4d_5d_6d_7d_8d_9d_10d_11d_12d_13d_14d_15d_16d_17d_18d_19d_20...d_1889d_1890d_1891d_1892d_1893d_1894d_1895d_1896d_1897d_1898d_1899d_1900d_1901d_1902d_1903d_1904d_1905d_1906d_1907d_1908d_1909d_1910d_1911d_1912d_1913
id
FOODS_3_090_CA_3_validationFOODS_3_090FOODS_3FOODSCA_3CA1081321021201061232791751861200000000000...130156133114807511381107160124107848890129160204861008877141139130

1 rows × 1918 columns

基于一个id号进行数据展示:

In [23]:

# 选择一个id号进行展示

stv.loc[stv["id"] == "FOODS_3_090_CA_3_validation"].set_index("id")[d_cols].T.plot(figsize=(15,5),
                                                                                  title="FOODS_3_090_CA_3",
                                                                                  color=next(color_cycle))

plt.legend('')
plt.show()

3.2 数据合并可视化

In [24]:

calendar[["d","date","event_name_1","event_name_2","event_type_1","event_type_2","snap_CA"]].head()

Out[24]:

ddateevent_name_1event_name_2event_type_1event_type_2snap_CA
0d_12011-01-29NaNNaNNaNNaN0
1d_22011-01-30NaNNaNNaNNaN0
2d_32011-01-31NaNNaNNaNNaN0
3d_42011-02-01NaNNaNNaNNaN1
4d_52011-02-02NaNNaNNaNNaN1

In [25]:

stv.loc[stv["id"] == "FOODS_3_090_CA_3_validation"]

Out[25]:

iditem_iddept_idcat_idstore_idstate_idd_1d_2d_3d_4d_5d_6d_7d_8d_9d_10d_11d_12d_13d_14d_15d_16d_17d_18d_19...d_1889d_1890d_1891d_1892d_1893d_1894d_1895d_1896d_1897d_1898d_1899d_1900d_1901d_1902d_1903d_1904d_1905d_1906d_1907d_1908d_1909d_1910d_1911d_1912d_1913
8412FOODS_3_090_CA_3_validationFOODS_3_090FOODS_3FOODSCA_3CA108132102120106123279175186120000000000...130156133114807511381107160124107848890129160204861008877141139130

1 rows × 1919 columns

In [26]:

test = stv.loc[stv["id"] == "FOODS_3_090_CA_3_validation"][d_cols].T          
test

Out[26]:

8412
d_1108
d_2132
d_3102
d_4120
d_5106
......
d_190988
d_191077
d_1911141
d_1912139
d_1913130

1913 rows × 1 columns

In [27]:

# 字段的重新命名
test.rename(columns={8412:"FOODS_3_090_CA_3"},inplace=True)

d_1、d_2...d_1913变成列属性:

In [28]:

test = test.reset_index().rename(columns={"index":"d"})
test 

Out[28]:

dFOODS_3_090_CA_3
0d_1108
1d_2132
2d_3102
3d_4120
4d_5106
.........
1908d_190988
1909d_191077
1910d_1911141
1911d_1912139
1912d_1913130

1913 rows × 2 columns

查看两个DataFrame数据的字段信息:

In [29]:

print(test.columns)
print(calendar.columns) 
Index(['d', 'FOODS_3_090_CA_3'], dtype='object')
Index(['date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'd',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI'],
      dtype='object')

根据相同的字段d进行数据的合并

In [30]:

test = test.merge(calendar, how="left", validate='1:1')
test.head()

Out[30]:

dFOODS_3_090_CA_3datewm_yr_wkweekdaywdaymonthyearevent_name_1event_type_1event_name_2event_type_2snap_CAsnap_TXsnap_WI
0d_11082011-01-2911101Saturday112011NaNNaNNaNNaN000
1d_21322011-01-3011101Sunday212011NaNNaNNaNNaN000
2d_31022011-01-3111101Monday312011NaNNaNNaNNaN000
3d_41202011-02-0111101Tuesday422011NaNNaNNaNNaN110
4d_51062011-02-0211101Wednesday522011NaNNaNNaNNaN101

In [31]:

test.set_index("date")["FOODS_3_090_CA_3"].plot(figsize=(15,5),
                                                color=next(color_cycle),
                                                title="FOODS_3_090_CA_3 sales by actual sale dates"
                                               )

plt.show()

完整过程:

In [32]:

# # 换成另一个id-FOODS_3_587_CA_1_validation

test2 = stv.loc[stv["id"] == "FOODS_3_587_CA_1_validation"][d_cols].T  # 提取信息并转置 
test2.rename(columns={2811:"FOODS_3_587_CA_1"},inplace=True)  # 字段重命名
test2 = test2.reset_index().rename(columns={"index":"d"})  # 将行索引转成列字段
test2 = test2.merge(calendar, how="left")   # 根据d字段合并数据

# 绘图
test2.set_index("date")["FOODS_3_587_CA_1"].plot(figsize=(15,5),
                                                color=next(color_cycle),
                                                title="FOODS_3_587_CA_1 sales by actual sale dates"
                                               )

plt.show()

另一个案例:

In [33]:

test3 = stv.loc[stv['id'] == 'HOUSEHOLD_1_118_CA_3_validation'][d_cols].T
test3 = test3.rename(columns={6776:'HOUSEHOLD_1_118_CA_3'})
test3 = test3.reset_index().rename(columns={'index': 'd'})
test3 = test3.merge(calendar, how='left', validate='1:1')

# # 绘图
# test3.set_index("date")["HOUSEHOLD_1_118_CA_3"].plot(figsize=(15,5),
#                                                 color=next(color_cycle),
#                                                 title="HOUSEHOLD_1_118_CA_3 sales by actual sale dates"
#                                                )

# plt.show()

3.3 基于时间变量可视化

涉及到的时间变量:Day of week、Month、Year

In [34]:

tests = ["FOODS_3_090_CA_3", "FOODS_3_587_CA_1", "HOUSEHOLD_1_118_CA_3"]
test_df = [test, test2, test3]

for i in [0,1,2]:
    #  绘制3个子图
    fig, (ax1, ax2, ax3) = plt.subplots(1,3,figsize=(16,3)) 
    
    # 子图1:基于wday统计
    # 前面部分是聚合统计 + plot的line绘图部分
    test_df[i].groupby("wday")[tests[i]].mean().plot(kind="line", 
                                                     title="Average Sale: Day of week", 
                                                     lw=5, 
                                                     color=color_pal[0],
                                                     ax=ax1)
    # 子图2:基于month统计                                                 
    test_df[i].groupby("month")[tests[i]].mean().plot(kind="line", 
                                                     title="Average Sale: Month", 
                                                     lw=5, 
                                                     color=color_pal[1],
                                                     ax=ax2)
    # 子图3:基于year统计                                                
    test_df[i].groupby("year")[tests[i]].mean().plot(kind="line", 
                                                     title="Average Sale: Year", 
                                                     lw=5, 
                                                     color=color_pal[2],
                                                     ax=ax3)
    
    fig.suptitle(f'Trends for item: {tests[i]}',size=20,y=1.1)
    
    plt.tight_layout()
    plt.show()

如何理解前面的聚合统计:以第一个test_df[0] = test为例说明:

In [35]:

test.groupby("wday")["FOODS_3_090_CA_3"].mean()

Out[35]:

wday
1    175.091241
2    143.956204
3    118.219780
4    105.241758
5    107.490842
6    117.087912
7    149.333333
Name: FOODS_3_090_CA_3, dtype: float64

3.4 不同item_id可视化

绘制20种不同item_id的销售情况:

In [36]:

#  随机选择20个数据,设置id为index,取出d_cols相关的数据,最后再转置
# stv.sample(20, random_state=42).set_index("id")[d_cols].T

In [37]:

calendar.set_index("d")["date"]  # 取出date部分数据

Out[37]:

d
d_1       2011-01-29
d_2       2011-01-30
d_3       2011-01-31
d_4       2011-02-01
d_5       2011-02-02
             ...    
d_1965    2016-06-15
d_1966    2016-06-16
d_1967    2016-06-17
d_1968    2016-06-18
d_1969    2016-06-19
Name: date, Length: 1969, dtype: object

In [38]:

twenty_tests = stv.sample(20, random_state=42).set_index("id")[d_cols].T.merge(calendar.set_index("d")["date"],
                                                                              left_index=True,
                                                                              right_index=True,
                                                                              validate="1:1").set_index("date")

In [39]:

fig, axs = plt.subplots(10,2,figsize=(15, 20))  # 10行2列的子图;大小是15-20

axs = axs.flatten()  #  将二维数组展平成一维数组

ax_idx = 0

for item in twenty_tests.columns:  # 遍历每个列名进行绘图
    twenty_tests[item].plot(title=item,
                           color=next(color_cycle), # 从color_cycle中获取下一个颜色,用于绘制当前图形
                           ax=axs[ax_idx])
    ax_idx += 1
    
plt.tight_layout()
plt.show()

3.5 基于不同cat_id结合时间的可视化

不同的类型cat_id包含:

  • Hobbies
  • Household
  • Foods

具体有哪些不同的类型:

In [40]:

stv["cat_id"].unique()  

Out[40]:

array(['HOBBIES', 'HOUSEHOLD', 'FOODS'], dtype=object)

不同类型的数量对比:

In [41]:

stv["cat_id"].value_counts()

Out[41]:

cat_id
FOODS        14370
HOUSEHOLD    10470
HOBBIES       5650
Name: count, dtype: int64

基于matplotlib的实现:

In [42]:

stv["cat_id"].value_counts().plot(kind="barh", figsize=(15,5),title="Count of Items by Catebory")

plt.show()

基于plotly_express的实现:

In [43]:

data = [go.Bar(
    x = stv.cat_id.unique(),  # x-y轴
    y = stv.cat_id.value_counts(),
    marker = dict(colorscale = "Jet",  # 颜色设置
                  color = stv.cat_id.value_counts().values),
    text = stv.cat_id.value_counts()  # 柱子显示数据;同x
)]

# Layout中的标题设置
layout = go.Layout(
    title = "不同类型数量统计"
)

fig = go.Figure(data=data, layout=layout)

fig.show()

绘制水平柱状图:

In [44]:

data = [go.Bar(
    x = stv.cat_id.value_counts(), # x-y轴
    y = stv.cat_id.unique(),  
    marker = dict(colorscale = "Jet",  # 颜色设置
                  color = stv.cat_id.value_counts().values),
    orientation = 'h',
    text = stv.cat_id.value_counts()  # 柱子显示数据;同x
)]

# Layout中的标题设置
layout = go.Layout(
    title = "不同类型数量统计"
)

fig = go.Figure(data=data, layout=layout)

fig.show()

将calendar的索引设置为d,提取date字段信息:

In [45]:

calendar.set_index("d")["date"]

Out[45]:

d
d_1       2011-01-29
d_2       2011-01-30
d_3       2011-01-31
d_4       2011-02-01
d_5       2011-02-02
             ...    
d_1965    2016-06-15
d_1966    2016-06-16
d_1967    2016-06-17
d_1968    2016-06-18
d_1969    2016-06-19
Name: date, Length: 1969, dtype: object

转置之后再和上面的数据进行合并:

In [46]:

past_sales = stv.set_index("id")[d_cols].T.merge(calendar.set_index("d")["date"],
                                                 left_index=True,
                                                 right_index=True,
                                                 validate="1:1",
                                                ).set_index("date")

In [47]:

stv["cat_id"].unique() 

Out[47]:

array(['HOBBIES', 'HOUSEHOLD', 'FOODS'], dtype=object)

以不同的种类cat_id作为前缀:

In [48]:

past_sales.columns  

Out[48]:

Index(['HOBBIES_1_001_CA_1_validation', 'HOBBIES_1_002_CA_1_validation',       'HOBBIES_1_003_CA_1_validation', 'HOBBIES_1_004_CA_1_validation',       'HOBBIES_1_005_CA_1_validation', 'HOBBIES_1_006_CA_1_validation',       'HOBBIES_1_007_CA_1_validation', 'HOBBIES_1_008_CA_1_validation',       'HOBBIES_1_009_CA_1_validation', 'HOBBIES_1_010_CA_1_validation',       ...       'FOODS_3_818_WI_3_validation', 'FOODS_3_819_WI_3_validation',       'FOODS_3_820_WI_3_validation', 'FOODS_3_821_WI_3_validation',       'FOODS_3_822_WI_3_validation', 'FOODS_3_823_WI_3_validation',       'FOODS_3_824_WI_3_validation', 'FOODS_3_825_WI_3_validation',       'FOODS_3_826_WI_3_validation', 'FOODS_3_827_WI_3_validation'],
      dtype='object', length=30490)

In [49]:

for i in stv["cat_id"].unique():
    items_col = [c for c in past_sales.columns if i in c]  # 筛选出带有cat_id作为前缀的columns
    past_sales[items_col].sum(axis=1).plot(figsize=(15,5),  # 大小
                                           alpha=0.8,  # 透明度
                                           title="Total Sales by Item Type"  # 标题
                                          )

plt.legend(stv["cat_id"].unique())  # 图例显示
plt.show()

3.6 新品推出可视化

In [50]:

past_sales_clipped = past_sales.clip(0,1) # 0----not selling  >0-----selling

for i in stv["cat_id"].unique():
    items_col = [c for c in past_sales.columns if i in c]
    (past_sales_clipped[items_col].mean(axis=1)*100).plot(figsize = (15, 5), 
                                                          alpha = 0.8, 
                                                          title = "Inventory Sale Percentage by Date",
                                                          style = "."
                                                         )
    
plt.ylabel('Inventory with at least 1 sale')
plt.legend(stv["cat_id"].unique())
plt.show()

3.7 基于Store的销量可视化

In [51]:

store_list = sell["store_id"].unique()
for s in store_list:
    store_items = [c for c in past_sales.columns if s in c]
    # 滑动窗口为90
    past_sales[store_items].sum(axis=1).rolling(90).mean().plot(figsize=(15,5),
                                                                alpha=0.8,
                                                                title="Rolling 90 Day Average Total Sales(10 Stores)"
                                                               )    
plt.legend(store_list)
plt.show()

不同store_list的信息:

In [52]:

fig, axes = plt.subplots(5,2,figsize=(15,10),sharex=True)

axes = axes.flatten()

ax_idx = 0

for s in store_list:
    store_items = [c for c in past_sales.columns if s in c]  # 筛选出带有store_list的columns
    past_sales[store_items].sum(axis=1).rolling(7).mean().plot(alpha=1,
                                                               ax=axes[ax_idx],
                                                               title=s,
                                                               lw=3,
                                                               color=next(color_cycle)
                                                              )
    ax_idx += 1
    
plt.suptitle("Weekly Sale Trends by Store ID")
plt.tight_layout()
plt.show()

3.8 Sales Heatmap Calendar

In [53]:

import numpy as np

import matplotlib.pyplot as plt
from matplotlib.patches import Polygon

from datetime import datetime
from dateutil.relativedelta import relativedelta

In [54]:

def calmap(ax, year, data):
    ax.tick_params("x", length=0, labelsize="medium", which="major")
    ax.tick_params("y", length=0, labelsize="x-small",which="major")
    
    # 基于月
    xticks, labels = [], []
    start = datetime(year, 1, 1).weekday()
    for month in range(1,13):
        first = datetime(year, month, 1)
        last = first + relativedelta(months=1, days=-1)
        
        y0 = first.weekday()
        y1 = last.weekday()
        
        x0 = (int(first.strftime("%j")) + start - 1) // 7
        x1 = (int(last.strftime("%j")) + + start - 1) // 7
        
        P = [(x0,y0), (x0,7), (x1,7), 
             (x1, y1+1),(x1+1, y1+1), 
             (x1+1,0),(x0+1,0),(x0+1,y0)]
        
        xticks.append(x0 + (x1-x0+1) / 2)
        
        labels.append(first.strftime("%b"))
        poly = Polygon(P, 
                       edgecolor="black",
                       facecolor="None",
                       linewidth=1,
                       zorder=20,
                       clip_on=False
                      )
        ax.add_artist(poly)
        
    ax.set_xticks(xticks)
    ax.set_xticklabels(labels)
    ax.set_yticks(0.5 + np.arange(7))
    ax.set_yticklabels(["Mon","Tue", "Wed","Thu", "Fri", "Sat", "Sun"])
    ax.set_title("{}".format(year), weight="semibold")
    
    
    valid = datetime(year, 1, 1).weekday()
    data[:valid, 0] = np.nan
    valid = datetime(year,12,31).weekday()
    data[valid+1:, x1] = np.nan
    
    ax.imshow(data, 
              extent=[0,53,0,7],
              zorder=10,
              vmin=-1,
              vmax=1,
              cmap="RdYlBu_r",
              origin="lower",
              alpha=0.75)
    

In [55]:

print('The lowest sale date was:', 
      past_sales.sum(axis=1).sort_values().index[0],
      'with', 
      past_sales.sum(axis=1).sort_values().values[0],
      'sales')

print('The lowest sale date was:',
      past_sales.sum(axis=1).sort_values(ascending=False).index[0],
      'with', 
      past_sales.sum(axis=1).sort_values(ascending=False).values[0], 
      'sales')
The lowest sale date was: 2012-12-25 with 11 sales
The lowest sale date was: 2016-03-06 with 57218 sales

In [56]:

from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

In [57]:

# 将索引改成时间类型的数据

past_sales.index = pd.to_datetime(past_sales.index)

In [58]:

for i in stv["cat_id"].unique():
    fig, axes = plt.subplots(3,1,figsize=(20,8))
    items_col = [c for c in past_sales.columns if i in c]
    
    sales2013 = past_sales.loc[past_sales.index.isin(pd.date_range("31-Dec-2012",periods=371))][items_col].mean(axis=1)
    vals = np.hstack(ss.fit_transform(sales2013.values.reshape(-1,1)))
    calmap(axes[0], 2013, vals.reshape(53,7).T)
    
    sales2014 = past_sales.loc[past_sales.index.isin(pd.date_range("30-Dec-2013",periods=371))][items_col].mean(axis=1)
    vals = np.hstack(ss.fit_transform(sales2014.values.reshape(-1,1)))
    calmap(axes[1], 2014, vals.reshape(53,7).T)
    
    sales2015 = past_sales.loc[past_sales.index.isin(pd.date_range("29-Dec-2014",periods=371))][items_col].mean(axis=1)
    vals = np.hstack(ss.fit_transform(sales2015.values.reshape(-1,1)))
    calmap(axes[2], 2015, vals.reshape(53,7).T)
    
    plt.suptitle(i, fontsize=30, x=0.4, y=1.01)
    plt.tight_layout()
    plt.show()

3.9 Sale Prices

In [59]:

sell.head()

Out[59]:

store_iditem_idwm_yr_wksell_price
0CA_1HOBBIES_1_001113259.58
1CA_1HOBBIES_1_001113269.58
2CA_1HOBBIES_1_001113278.26
3CA_1HOBBIES_1_001113288.26
4CA_1HOBBIES_1_001113298.26

In [60]:

sell["item_id"].value_counts()

Out[60]:

item_id
FOODS_3_587        2820
HOUSEHOLD_1_177    2820
HOUSEHOLD_2_283    2820
HOBBIES_1_337      2820
FOODS_1_032        2820
                   ... 
HOUSEHOLD_1_308     642
HOUSEHOLD_1_159     623
HOUSEHOLD_1_242     600
FOODS_3_296         592
FOODS_2_379         530
Name: count, Length: 3049, dtype: int64

In [61]:

sell.query("item_id == 'FOODS_3_090'").groupby("store_id")

Out[61]:

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E00D57A0D0>

In [62]:

fig,ax = plt.subplots(figsize=(15,5)) # 创建图形fig 和 坐标轴ax

stores = []  # 存储每个商店的名字

for store, d in sell.query("item_id == 'FOODS_3_090'").groupby("store_id"):  # 筛选出指定数据,再根据store_id分组的数据进行绘图
    d.plot(x="wm_yr_wk", 
           y="sell_price",
           style=".",
           color=next(color_cycle),
           figsize=(15,5),
           title="FOODS_3_090 sale price over time",
           ax=ax,
           legend=store  # 图例设置
          )
    stores.append(store) 
    
plt.legend(stores)  # 添加图例
plt.show()  

FOODS、HOBBIES、HOUSEHOLD三种不同类型的价格分布:

In [63]:

sell["Category"] = sell["item_id"].str.split("_", expand=True)[0]  # 展开之后取第一个数据
sell

Out[63]:

store_iditem_idwm_yr_wksell_priceCategory
0CA_1HOBBIES_1_001113259.58HOBBIES
1CA_1HOBBIES_1_001113269.58HOBBIES
2CA_1HOBBIES_1_001113278.26HOBBIES
3CA_1HOBBIES_1_001113288.26HOBBIES
4CA_1HOBBIES_1_001113298.26HOBBIES
..................
6841116WI_3FOODS_3_827116171.00FOODS
6841117WI_3FOODS_3_827116181.00FOODS
6841118WI_3FOODS_3_827116191.00FOODS
6841119WI_3FOODS_3_827116201.00FOODS
6841120WI_3FOODS_3_827116211.00FOODS

6841121 rows × 5 columns

In [64]:

fig, axs = plt.subplots(1,3,figsize=(15,4))

i = 0
for cat, d in sell.groupby("Category"):
    ax = d["sell_price"].apply(np.log1p).plot(kind="hist",
                                              bins=20,
                                              title=f"Distribution of {cat} prices",
                                              ax=axs[i],
                                              color=next(color_cycle))
    
    ax.set_xlabel("Log(price)")
    i += 1
    
plt.tight_layout()
plt.show()

项目地址:www.kaggle.com/code/robiks…