用户聚类分群

63 阅读18分钟

公众号:尤而小屋
作者:Peter
编辑:Peter

大家好,我是Peter~

今天给大家分享一篇关于聚类的建模文章,主要内容包含:

  • 数据基本信息
  • 原数据特征处理
  • 数据探索性分析EDA
  • 建模之数据预处理:数据编码、数据标准化、数据降维
  • 实施聚类:肘图法确定k值、实施聚类、聚类结果可视化
  • 聚类效果评估:从不同角度查看不同的簇群

1 导入库

In [1]:

import numpy as np
import pandas as pd
import datetime

# 可视化
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
import plotly_express as px
import plotly.graph_objects as go

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics

# 警告处理
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")
np.random.seed(42)

2 数据信息

2.1 导入数据

In [2]:

df = pd.read_csv("marketing_campaign.csv",sep="\t")
df.head()

Out[2]:

IDYear_BirthEducationMarital_StatusIncomeKidhomeTeenhomeDt_CustomerRecencyMntWines...NumWebVisitsMonthAcceptedCmp3AcceptedCmp4AcceptedCmp5AcceptedCmp1AcceptedCmp2ComplainZ_CostContactZ_RevenueResponse
055241957GraduationSingle58138.00004-09-201258635...70000003111
121741954GraduationSingle46344.01108-03-20143811...50000003110
241411965GraduationTogether71613.00021-08-201326426...40000003110
361821984GraduationTogether26646.01010-02-20142611...60000003110
453241981PhDMarried58293.01019-01-201494173...50000003110

5 rows × 29 columns

整体数据量:

In [3]:

df.shape

Out[3]:

(2240, 29)

In [4]:

df.columns

Out[4]:

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
      dtype='object')

In [5]:

df.dtypes  # 字段的数据类型

Out[5]:

ID                       int64
Year_Birth               int64
Education               object
Marital_Status          object
Income                 float64
Kidhome                  int64
Teenhome                 int64
Dt_Customer             object
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases        int64
NumWebPurchases          int64
NumCatalogPurchases      int64
NumStorePurchases        int64
NumWebVisitsMonth        int64
AcceptedCmp3             int64
AcceptedCmp4             int64
AcceptedCmp5             int64
AcceptedCmp1             int64
AcceptedCmp2             int64
Complain                 int64
Z_CostContact            int64
Z_Revenue                int64
Response                 int64
dtype: object

In [6]:

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   int64  
 16  NumWebPurchases      2240 non-null   int64  
 17  NumCatalogPurchases  2240 non-null   int64  
 18  NumStorePurchases    2240 non-null   int64  
 19  NumWebVisitsMonth    2240 non-null   int64  
 20  AcceptedCmp3         2240 non-null   int64  
 21  AcceptedCmp4         2240 non-null   int64  
 22  AcceptedCmp5         2240 non-null   int64  
 23  AcceptedCmp1         2240 non-null   int64  
 24  AcceptedCmp2         2240 non-null   int64  
 25  Complain             2240 non-null   int64  
 26  Z_CostContact        2240 non-null   int64  
 27  Z_Revenue            2240 non-null   int64  
 28  Response             2240 non-null   int64  
dtypes: float64(1), int64(25), object(3)
memory usage: 507.6+ KB

2.2 数据缺失值情况

In [7]:

df.isnull().sum()

Out[7]:

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64

字段缺失值比例:

In [8]:

df.isnull().sum() / len(df)  # 缺失值比例

Out[8]:

ID                     0.000000
Year_Birth             0.000000
Education              0.000000
Marital_Status         0.000000
Income                 0.010714
Kidhome                0.000000
Teenhome               0.000000
Dt_Customer            0.000000
Recency                0.000000
MntWines               0.000000
MntFruits              0.000000
MntMeatProducts        0.000000
MntFishProducts        0.000000
MntSweetProducts       0.000000
MntGoldProds           0.000000
NumDealsPurchases      0.000000
NumWebPurchases        0.000000
NumCatalogPurchases    0.000000
NumStorePurchases      0.000000
NumWebVisitsMonth      0.000000
AcceptedCmp3           0.000000
AcceptedCmp4           0.000000
AcceptedCmp5           0.000000
AcceptedCmp1           0.000000
AcceptedCmp2           0.000000
Complain               0.000000
Z_CostContact          0.000000
Z_Revenue              0.000000
Response               0.000000
dtype: float64

缺失值的比例非常小,我们考虑直接删除:

In [9]:

df.dropna(inplace=True)

df.isnull().sum() # 删除缺失值后的统计情况

Out[9]:

ID                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Z_CostContact          0
Z_Revenue              0
Response               0
dtype: int64

3 数据特征处理

3.1 时间字段Dt_Customer

关于时间字段处理

In [10]:

df["Dt_Customer"].value_counts()

Out[10]:

Dt_Customer
31-08-2012    12
12-09-2012    11
14-02-2013    11
12-05-2014    11
20-08-2013    10
              ..
05-08-2012     1
18-11-2012     1
09-05-2014     1
26-06-2013     1
09-01-2014     1
Name: count, Length: 662, dtype: int64

In [11]:

df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], dayfirst=True)  # 转成时间相关字段,需要给定参数dayfirst

df["Dt_Customer"]

Out[11]:

0      2012-09-04
1      2014-03-08
2      2013-08-21
3      2014-02-10
4      2014-01-19
          ...    
2235   2013-06-13
2236   2014-06-10
2237   2014-01-25
2238   2014-01-24
2239   2012-10-15
Name: Dt_Customer, Length: 2216, dtype: datetime64[ns]

In [12]:

df.sort_values("Year_Birth",ascending=False)

Out[12]:

IDYear_BirthEducationMarital_StatusIncomeKidhomeTeenhomeDt_CustomerRecencyMntWines...NumWebVisitsMonthAcceptedCmp3AcceptedCmp4AcceptedCmp5AcceptedCmp1AcceptedCmp2ComplainZ_CostContactZ_RevenueResponse
46990919962n CycleMarried7500.0002012-11-09243...90000003111
11701931996BasicMarried14421.0002014-02-17810...51000003110
747105481995GraduationSingle71163.0002014-03-0930283...10000003110
2213366119952n CycleSingle80617.0002012-10-1242594...20000003110
69683151995GraduationSingle34824.0002014-03-26654...60000003110
..................................................................
42469321941PhDMarried93027.0002013-04-13771285...20010003110
195066631940PhDSingle51141.0002013-07-0896144...50000003110
192782919002n CycleDivorced36640.0102013-09-269915...50000013110
33911501899PhDTogether83532.0002013-09-2636755...10010003110
2391100418932n CycleSingle60182.0012014-05-17238...40000003110

2216 rows × 29 columns

时间日期中的最值:

In [13]:

dates = df["Dt_Customer"].tolist()
dates[:5]

Out[13]:

[Timestamp('2012-09-04 00:00:00'),
 Timestamp('2014-03-08 00:00:00'),
 Timestamp('2013-08-21 00:00:00'),
 Timestamp('2014-02-10 00:00:00'),
 Timestamp('2014-01-19 00:00:00')]

In [14]:

max(dates)

Out[14]:

Timestamp('2014-06-29 00:00:00')

In [15]:

min(dates)

Out[15]:

Timestamp('2012-07-30 00:00:00')

3.2 新特征Customer_For

创建一个特征("Customer_For"),表示客户开始在商店购物的日期与最后记录日期之间相差的天数。

In [16]:

days = []

max_day = max(dates)

for i in dates:
    delta = max_day - i  # 和最近日期相差天数
    days.append(delta.days)  # 提取天信息
    
days[:10]

Out[16]:

[663, 113, 312, 139, 161, 293, 593, 417, 388, 108]

In [17]:

df["Customer_For"] = days

强制转成数值型数据:

In [18]:

df["Customer_For"] = pd.to_numeric(df["Customer_For"], errors="coerce")  

3.3 Marital_Status & Education

In [19]:

df["Marital_Status"].value_counts()

Out[19]:

Marital_Status
Married     857
Together    573
Single      471
Divorced    232
Widow        76
Alone         3
Absurd        2
YOLO          2
Name: count, dtype: int64

In [20]:

df["Education"].value_counts()

Out[20]:

Education
Graduation    1116
PhD            481
Master         365
2n Cycle       200
Basic           54
Name: count, dtype: int64

3.4 特征衍生

In [21]:

# 年龄:用当前年 - 出生年

df["Age"] = 2024 - df["Year_Birth"]

In [22]:

df[df["Age"] == 131]

Out[22]:

IDYear_BirthEducationMarital_StatusIncomeKidhomeTeenhomeDt_CustomerRecencyMntWines...AcceptedCmp4AcceptedCmp5AcceptedCmp1AcceptedCmp2ComplainZ_CostContactZ_RevenueResponseCustomer_ForAge
2391100418932n CycleSingle60182.0012014-05-17238...00000311043131

1 rows × 31 columns

In [23]:

# 所有的花费:将多个字段合并为一个字段Spent

df["Spent"] = df["MntWines"] + df["MntFruits"]+ df["MntMeatProducts"]+ df["MntFishProducts"]+ df["MntSweetProducts"]+ df["MntGoldProds"]

In [24]:

# 生活状态Living_with

df["Living_With"] = df["Marital_Status"].replace({"Married":"Partner",  # 共同生活
                                                  "Together":"Partner", 
                                                  "Absurd":"Alone",  # 独自一人
                                                  "Widow":"Alone",
                                                  "YOLO":"Alone",
                                                  "Divorced":"Alone",
                                                  "Single":"Alone"}
                                                )

In [25]:

# 小孩个数

df["Children"] = df["Kidhome"] + df["Teenhome"]

In [26]:

# 家庭成员总数

df["Family_Size"] = df["Living_With"].replace({"Alone": 1, "Partner": 2}) + df["Children"]

In [27]:

# 是否为父母

df["Is_Parent"]  = np.where(df["Children"] > 0, 1, 0)

In [28]:

# 受教育程度

df["Education"] = df["Education"].replace({"Basic":"Undergrauate",
                                           "2n Cycle":"Undergrauate",
                                           "Graducation":"Graduate",
                                           "Master":"Postgraduate",
                                           "PhD":"Postgraduate"
                                          })

3.5 字段重命名rename

In [29]:

df = df.rename(columns={"MntWines": "Wines",
                        "MntFruits":"Fruits",
                        "MntMeatProducts":"Meat",
                        "MntFishProducts":"Fish",
                        "MntSweetProducts":"Sweets",
                        "MntGoldProds":"Gold"})

3.6 删除无效字段drop

In [30]:

to_drop = ["Marital_Status", "Dt_Customer", "Z_CostContact", "Z_Revenue", "Year_Birth", "ID"]

In [31]:

df.drop(to_drop, axis=1, inplace=True)

In [32]:

df.describe()

Out[32]:

IncomeKidhomeTeenhomeRecencyWinesFruitsMeatFishSweetsGold...AcceptedCmp1AcceptedCmp2ComplainResponseCustomer_ForAgeSpentChildrenFamily_SizeIs_Parent
count2216.0000002216.0000002216.0000002216.0000002216.0000002216.0000002216.0000002216.0000002216.0000002216.000000...2216.0000002216.0000002216.0000002216.0000002216.0000002216.0000002216.0000002216.0000002216.0000002216.000000
mean52247.2513540.4417870.50541549.012635305.09160626.356047166.99593937.63763527.02888143.965253...0.0640790.0135380.0094770.150271353.52120955.179603607.0753610.9472022.5925090.714350
std25173.0766610.5368960.54418128.948352337.32792039.793917224.28327354.75208241.07204651.815414...0.2449500.1155880.0969070.357417202.43466711.985554602.9004760.7490620.9057220.451825
min1730.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.00000028.0000005.0000000.0000001.0000000.000000
25%35303.0000000.0000000.00000024.00000024.0000002.00000016.0000003.0000001.0000009.000000...0.0000000.0000000.0000000.000000180.00000047.00000069.0000000.0000002.0000000.000000
50%51381.5000000.0000000.00000049.000000174.5000008.00000068.00000012.0000008.00000024.500000...0.0000000.0000000.0000000.000000355.50000054.000000396.5000001.0000003.0000001.000000
75%68522.0000001.0000001.00000074.000000505.00000033.000000232.25000050.00000033.00000056.000000...0.0000000.0000000.0000000.000000529.00000065.0000001048.0000001.0000003.0000001.000000
max666666.0000002.0000002.00000099.0000001493.000000199.0000001725.000000259.000000262.000000321.000000...1.0000001.0000001.0000001.000000699.000000131.0000002525.0000003.0000005.0000001.000000

8 rows × 28 columns

4 数据EDA

4.1 两两特征关系pairplot

In [33]:

sns.set(rc={"axes.facecolor": "#FFF9ED", "figure.facecolor": "#FFF9ED"})
# pallet = ["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"]

To_Plot = [ "Income", "Recency", "Customer_For", "Age", "Spent", "Is_Parent"]

# 取消palette
# sns.pairplot(df[To_Plot], hue= "Is_Parent",palette= (["#682F2F","#F3AB60"]))
plt.figure(figsize=(10, 6))
sns.pairplot(df[To_Plot], hue= "Is_Parent")
plt.show()

可以看到在Age和Income中存在很多的异常值:

In [34]:

fig = px.box(df["Age"])

fig.show()

可以看到Age字段的上四分位数是:84

In [35]:

fig = px.box(df, y=["Income"], points="outliers",)  

fig.show()

在收入Income字段中,我们取临界值为600000:

In [36]:

df = df[df["Age"] < 90]
df = df[df["Income"] < 600000]  # 两次数据过滤

新数据的总长度为:

In [37]:

len(df)

Out[37]:

2212

4.2 相关系数

使用df中包含两个字符类型的字段,在新版本的pandas中需要带上参数numeric_only=True

In [38]:

corr = df.corr(numeric_only=True)
corr.head()

Out[38]:

IncomeKidhomeTeenhomeRecencyWinesFruitsMeatFishSweetsGold...AcceptedCmp1AcceptedCmp2ComplainResponseCustomer_ForAgeSpentChildrenFamily_SizeIs_Parent
Income1.000000-0.5145230.0345650.0079650.6882090.5073540.6922790.5200400.5235990.388299...0.3275240.104036-0.0279000.161387-0.0237600.1999770.792740-0.343529-0.286638-0.403132
Kidhome-0.5145231.000000-0.0390660.010623-0.497203-0.373258-0.439031-0.388643-0.377843-0.354922...-0.174261-0.0819110.037067-0.077901-0.055281-0.237497-0.5579490.6880810.5832500.520355
Teenhome0.034565-0.0390661.0000000.0143920.003945-0.175905-0.261134-0.205235-0.163107-0.018579...-0.145198-0.0156330.007746-0.1544020.0185570.361932-0.1379640.6981990.5944810.587993
Recency0.0079650.0106230.0143921.0000000.015981-0.0052570.0229140.0007880.0252440.018148...-0.021147-0.0014290.005713-0.2001140.0256810.0156940.0204790.0180620.0147170.002189
Wines0.688209-0.4972030.0039450.0159811.0000000.3858440.5680810.3969150.3895830.391461...0.3516100.206309-0.0364200.2463200.1678520.1646150.892996-0.353356-0.296702-0.341994

5 rows × 28 columns

In [39]:

cmap = colors.ListedColormap(["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#1F8A78", "#F3AB60"])

plt.figure(figsize=(20, 16))

sns.heatmap(corr, annot=True, cmap=cmap, center=0)

plt.show()

5 数据预处理

5.1 数据编码LabelEncoder

针对字符类型数据的编码处理:

In [40]:

df.columns

Out[40]:

Index(['Education', 'Income', 'Kidhome', 'Teenhome', 'Recency', 'Wines',       'Fruits', 'Meat', 'Fish', 'Sweets', 'Gold', 'NumDealsPurchases',       'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',       'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',       'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response', 'Customer_For',       'Age', 'Spent', 'Living_With', 'Children', 'Family_Size', 'Is_Parent'],
      dtype='object')

确定属于object类型的字段:

In [41]:

object_columns = df.columns[df.dtypes == "object"].tolist()
object_columns

Out[41]:

['Education', 'Living_With']

In [42]:

df["Education"].value_counts()

Out[42]:

Education
Graduation      1115
Postgraduate     845
Undergrauate     252
Name: count, dtype: int64

In [43]:

df["Living_With"].value_counts()

Out[43]:

Living_With
Partner    1428
Alone       784
Name: count, dtype: int64

对上面两个字段的类型编码工作:

In [44]:

le = LabelEncoder()

for col in object_columns:
    df[col] = df[[col]].apply(le.fit_transform)

此时字段全部是数值相关的字段类型:

In [45]:

pd.value_counts(df.dtypes)

Out[45]:

int64      26
int32       3
float64     1
Name: count, dtype: int64

5.2 数据标准化StandardScaler

In [46]:

# 副本

df1 = df.copy()  

In [47]:

#  待删除字段
cols_del = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1','AcceptedCmp2', 'Complain', 'Response']

对副本的操作:

In [48]:

df1.drop(cols_del, axis=1, inplace=True)

In [49]:

ss = StandardScaler()
ss.fit(df1)

Out[49]:

StandardScaler

StandardScaler()

字段经过缩放后的数据:

In [50]:

ss_df = pd.DataFrame(ss.transform(df1), columns=df1.columns)
ss_df

Out[50]:

EducationIncomeKidhomeTeenhomeRecencyWinesFruitsMeatFishSweets...NumCatalogPurchasesNumStorePurchasesNumWebVisitsMonthCustomer_ForAgeSpentLiving_WithChildrenFamily_SizeIs_Parent
0-0.8935860.287105-0.822754-0.9296990.3103530.9776601.5520411.6902932.4534721.483713...2.503607-0.5558140.6921811.5277211.0183521.676245-1.349603-1.264598-1.758359-1.581139
1-0.893586-0.2608821.0400210.908097-0.380813-0.872618-0.637461-0.718230-0.651004-0.634019...-0.571340-1.171160-0.132545-1.1890111.274785-0.963297-1.3496031.4045720.4490700.632456
2-0.8935860.913196-0.822754-0.929699-0.7955140.3579350.570540-0.1785421.339513-0.147184...-0.2296791.290224-0.544908-0.2060480.3345300.2801100.740959-1.264598-0.654644-1.581139
3-0.893586-1.1761141.040021-0.929699-0.795514-0.872618-0.561961-0.655787-0.504911-0.585335...-0.913000-0.5558140.279818-1.060584-1.289547-0.9201350.7409590.0699870.4490700.632456
40.5716570.2943071.040021-0.9296991.554453-0.3922570.419540-0.2186840.152508-0.001133...0.1119820.059532-0.132545-0.951915-1.033114-0.3075620.7409590.0699870.4490700.632456
..................................................................
2207-0.8935860.430444-0.8227540.908097-0.1043471.1970840.4195400.0667700.0794612.213965...0.111982-0.555814-0.1325450.1347780.1635751.2180610.7409590.0699870.4490700.632456
22080.5716570.5601232.9027960.9080970.2412370.298631-0.662628-0.611184-0.687527-0.658360...-0.229679-0.2481410.692181-1.6533261.958607-0.2710400.7409592.7391582.6564990.632456
2209-0.8935860.233347-0.822754-0.9296991.4507781.7871560.5453730.222878-0.103155-0.366260...0.1119822.2132420.279818-0.981552-1.0331141.052052-1.349603-1.264598-1.758359-1.581139
22100.5716570.803172-0.8227540.908097-1.4175640.3638660.0923730.2094980.7734030.071892...0.7953031.290224-0.957271-0.9766121.1038300.3913360.7409590.0699870.4490700.632456
22110.5716570.0422901.0400210.908097-0.311697-0.656159-0.587128-0.472917-0.651004-0.634019...-0.571340-0.5558140.6921811.3252011.274785-0.7225840.7409591.4045721.5527840.632456

2212 rows × 23 columns

5.3 数据降维PCA

5.3.1 实施降维

In [51]:

pca = PCA(n_components=3)
pca.fit(ss_df)  # 对标准化后的数据进行降维

Out[51]:

PCA

PCA(n_components=3)

经过降维后的数据:

In [52]:

PCA_ds = pd.DataFrame(pca.transform(ss_df), columns=(["col1","col2","col3"]))
PCA_ds

Out[52]:

col1col2col3
04.986336-0.1615022.445704
1-2.8741680.022701-1.530784
22.615763-0.731408-0.264243
3-2.654568-1.455875-0.398126
4-0.6560150.177848-0.141472
............
22072.3229452.4375130.495596
2208-3.1009994.014083-1.415940
22092.666497-1.8937060.556943
22101.4873491.651778-1.760027
2211-2.7334691.688075-0.212150

2212 rows × 3 columns

In [53]:

PCA_ds.describe().T

Out[53]:

countmeanstdmin25%50%75%max
col12212.0-3.212219e-172.878602-5.978123-2.539470-0.7815952.3863807.452915
col22212.0-1.284887e-171.709469-4.194757-1.323932-0.1737161.2349236.168185
col32212.04.577411e-171.231685-3.625184-0.853556-0.0512920.8638416.746845

5.3.2 降维可视化

对3个主成分的可视化:

In [54]:

x = PCA_ds["col1"]
y = PCA_ds["col2"]
z = PCA_ds["col3"]

In [55]:

fig = plt.figure(figsize=(10,8))

ax = fig.add_subplot(111, projection="3d")
ax.scatter(x,y,z, c="maroon", marker="o" )
ax.set_title("3D Figure Of Data In The Reduced Dimension")

plt.show()

6 聚类Clustering

6.1 肘图确定k值

In [56]:

Elbow_M = KElbowVisualizer(KMeans(), k=10)

Elbow_M.fit(PCA_ds)
Elbow_M.show()

可以看到在k=4时候是最好的。

6.2 实施聚类

实施凝聚层次聚类AgglomerativeClustering

In [57]:

# k=4

AC = AgglomerativeClustering(n_clusters=4)
y_pred = AC.fit_predict(PCA_ds)  # 预测值
y_pred

Out[57]:

array([2, 3, 2, ..., 2, 0, 3], dtype=int64)

给降维后的数据贴上聚类的结果y_pred

In [58]:

PCA_ds["Clusters"] = y_pred

PCA_ds["Clusters"] = PCA_ds["Clusters"].astype("object")  # 将聚类结果字段转成字符型

给原数据也贴上聚类的结果y_pred

In [59]:

df["Clusters"] = y_pred  

df["Clusters"] = df["Clusters"].astype("object")  # 将聚类结果字段转成字符型

6.3 聚类效果可视化

In [60]:

x = PCA_ds["col1"]
y = PCA_ds["col2"]
z = PCA_ds["col3"]

In [61]:

fig = plt.figure(figsize=(10,8))

ax = fig.add_subplot(111, projection="3d")
ax.scatter(x,y,z,s=40, c=PCA_ds["Clusters"], marker="o",cmap=cmap)
ax.set_title("3D Figure of Clusters")

plt.show()

7 聚类效果评估

7.1 不同簇群统计countplot

如何显示柱子上方的数据:

In [62]:

# pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60"]
# pl = sns.countplot(x=df["Clusters"], palette= pal)

ax = sns.countplot(x=df["Clusters"])

# 在柱子上方显示数据
ax.bar_label(ax.containers[0])

plt.title("Distribution Of The Clusters")
plt.show()

7.2 基于簇群多指标关系分布scatterplot

In [63]:

# # pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60"]

# pl = sns.scatterplot(data = df,  # 数据
#                      x="Spent", # x-y轴信息
#                      y="Income",
#                      hue="Clusters",  # 分类字段
#                      #palette= pal
#                     )

# pl.set_title("Cluster's Distribution Based On Income & Spent")
# plt.legend()
# plt.show()

In [64]:

df["Clusters"] = df["Clusters"].astype("object")

In [65]:

fig = px.scatter(df,x="Spent", y="Income",color="Clusters")

fig.show()

  • Group0:高Spent + 平均Income
  • Group1:低SPent + 低Income
  • Group2:高Spent + 高Income
  • Group3:低Spent + 平均Income

7.3 基于簇群的单个指标分布(swarmplot+boxenplot)

  • swarmplot()可以自己实现对数据分类的展现,也可以作为箱形图或小提琴图的一种补充,用来显示所有结果以及基本分布情况。
  • boxenplot是为更大的数据集绘制增强的箱型图。这种风格的绘图最初被命名为“信值图”,因为它显示了大量被定义为“置信区间”的分位数。它类似于绘制分布的非参数表示的箱形图,其中所有特征对应于实际观察的数值点。通过绘制更多分位数,它提供了有关分布形状的更多信息,特别是尾部数据的分布。

In [66]:

plt.figure()

ax = sns.swarmplot(x=df["Clusters"],y=df["Spent"],color="#CBEDDD",alpha=0.5)
ax = sns.boxenplot(x=df["Clusters"],y=df["Spent"])

plt.show()

基于plotly的实现:

In [67]:

fig = px.violin(df,
                x="Clusters",
                y="Spent",
                color="Clusters",
                violinmode="group",
                box=True
               )

fig.show()

7.4 整体情况All_AcceptedCmp

整体的AcceptedCmp表现情况:

In [68]:

df["All_AcceptedCmp"] = df["AcceptedCmp1"]+ df["AcceptedCmp2"]+ df["AcceptedCmp3"]+ df["AcceptedCmp4"]+ df["AcceptedCmp5"]
df.head()

Out[68]:

EducationIncomeKidhomeTeenhomeRecencyWinesFruitsMeatFishSweets...ResponseCustomer_ForAgeSpentLiving_WithChildrenFamily_SizeIs_ParentClustersAll_AcceptedCmp
0058138.000586358854617288...1663671617001020
1046344.01138111621...01137027023130
2071613.000264264912711121...031259776102020
3026646.0102611420103...01394053113110
4158293.01094173431184627...016143422113130

5 rows × 32 columns

In [69]:

df["All_AcceptedCmp"].value_counts()

Out[69]:

All_AcceptedCmp
0    1754
1     322
2      81
3      44
4      11
Name: count, dtype: int64

In [70]:

pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60"]

plt.figure()
# 写法1:基于df数据使用countplot统计
pl = sns.countplot(x=df["All_AcceptedCmp"], hue=df["Clusters"],  palette= pal)
pl.set_title("Count Of Promotion Accepted")
pl.set_xlabel("Number Of Total Accepted Promotions")

plt.show()

另一种写法:

In [71]:

pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60"]

plt.figure()
# 写法2
sns.countplot(x=df["All_AcceptedCmp"], hue=df["Clusters"],  palette= pal)
plt.title("Count Of Promotion Accepted")
plt.xlabel("Number Of Total Accepted Promotions")

plt.show()

统计不同簇群下的All_AcceptedCmp情况:

In [72]:

AcceptedCmp_by_Clusters = df.groupby(["Clusters","All_AcceptedCmp"]).size().reset_index()
AcceptedCmp_by_Clusters.head()

Out[72]:

ClustersAll_AcceptedCmp0
000427
10194
20225
3038
4042

In [73]:

AcceptedCmp_by_Clusters.columns = ["Clusters","All_AcceptedCmp","Count"]

AcceptedCmp_by_Clusters["Count"] = AcceptedCmp_by_Clusters["Count"].astype("object")
AcceptedCmp_by_Clusters

Out[73]:

ClustersAll_AcceptedCmpCount
000427
10194
20225
3038
4042
510545
61152
7121
820280
921119
102251
112336
12249
1330502
143157
15324

7.5 促销活动NumDealsPurchases

In [74]:

plt.figure()

pl=sns.boxenplot(y=df["NumDealsPurchases"],x=df["Clusters"])
pl.set_title("Number of Deals Purchased")
plt.show()

另一种写法:

In [75]:

# 写法2

plt.figure()
sns.boxenplot(y=df["NumDealsPurchases"],x=df["Clusters"])
plt.title("Number of Deals Purchased")
plt.show()

基于plotly的实现:

In [76]:

fig = px.box(df,x="Clusters",y="NumDealsPurchases",color="Clusters")

fig.show()

7.6 不同簇群下的人群特点

In [77]:

Personal = ["Kidhome","Teenhome","Customer_For", "Age", "Children", "Family_Size", "Is_Parent", "Education","Living_With"]

In [78]:

# 写法1
for col in Personal:
    plt.figure()
    sns.jointplot(x=df[col], y=df["Spent"], hue=df["Clusters"],kind="kde",palette=pal)
    plt.show()

另一种写法:基于seaborn的jointplot,需要自定义

参考博文:blog.csdn.net/qq_44785318…

In [79]:

import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import gridspec
 
class SeabornFig2Grid():
    """
    自定义类SeabornFig2Grid
    """
    def __init__(self, seaborngrid, fig,  subplot_spec):
        self.fig = fig
        self.sg = seaborngrid
        self.subplot = subplot_spec
        if isinstance(self.sg, sns.axisgrid.FacetGrid) or isinstance(self.sg, sns.axisgrid.PairGrid):
            self._movegrid()
        elif isinstance(self.sg, sns.axisgrid.JointGrid):
            self._movejointgrid()
        self._finalize()
     
    def _movegrid(self):
        """Move PairGrid or Facetgrid"""
        self._resize()
        n = self.sg.axes.shape[0]
        m = self.sg.axes.shape[1]
        self.subgrid = gridspec.GridSpecFromSubplotSpec(n,m, subplot_spec=self.subplot)
        for i in range(n):
            for j in range(m):
                self._moveaxes(self.sg.axes[i,j], self.subgrid[i,j])
     
    def _movejointgrid(self):
        """ Move Jointgrid """
        h= self.sg.ax_joint.get_position().height
        h2= self.sg.ax_marg_x.get_position().height
        r = int(np.round(h/h2)) 
        self._resize()
        self.subgrid = gridspec.GridSpecFromSubplotSpec(r+1,r+1, subplot_spec=self.subplot)
 
        self._moveaxes(self.sg.ax_joint, self.subgrid[1:, :-1])
        self._moveaxes(self.sg.ax_marg_x, self.subgrid[0, :-1])
        self._moveaxes(self.sg.ax_marg_y, self.subgrid[1:, -1])
 
    def _moveaxes(self, ax, gs):
        ax.remove()
        ax.figure=self.fig
        self.fig.axes.append(ax)
        self.fig.add_axes(ax)
        ax._subplotspec = gs
        ax.set_position(gs.get_position(self.fig))
        ax.set_subplotspec(gs)
     
    def _finalize(self):
        plt.close(self.sg.fig)
        self.fig.canvas.mpl_connect("resize_event", self._resize)
        self.fig.canvas.draw()
     
    def _resize(self, evt=None):
        self.sg.fig.set_size_inches(self.fig.get_size_inches())

In [80]:

fig, axes = plt.subplots(3, 3, figsize=(12,14))

g0 = sns.jointplot(ax=axes[0,0], data=df, x="Kidhome",  y="Spent", hue="Clusters",kind="kde", palette=pal)
g1 = sns.jointplot(ax=axes[0,1], data=df, x="Teenhome", y="Spent", hue="Clusters",kind="kde", palette=pal)
g2 = sns.jointplot(ax=axes[0,2], data=df, x="Customer_For",  y="Spent", hue="Clusters",kind="kde", palette=pal)
g3 = sns.jointplot(ax=axes[1,0], data=df, x="Age", y="Spent", hue="Clusters",kind="kde", palette=pal)
g4 = sns.jointplot(ax=axes[1,1], data=df, x="Children",  y="Spent", hue="Clusters",kind="kde", palette=pal)
g5 = sns.jointplot(ax=axes[1,2], data=df, x="Family_Size", y="Spent", hue="Clusters",kind="kde", palette=pal)
g6 = sns.jointplot(ax=axes[2,0], data=df, x="Is_Parent",  y="Spent", hue="Clusters",kind="kde", palette=pal)
g7 = sns.jointplot(ax=axes[2,1], data=df, x="Education",  y="Spent", hue="Clusters",kind="kde", palette=pal)
g8 = sns.jointplot(ax=axes[2,2], data=df, x="Living_With", y="Spent", hue="Clusters",kind="kde", palette=pal)

gs = gridspec.GridSpec(3,3)
mg0 = SeabornFig2Grid(g0, fig, gs[0])
mg1 = SeabornFig2Grid(g1, fig, gs[1])
mg2 = SeabornFig2Grid(g2, fig, gs[2])
mg0 = SeabornFig2Grid(g3, fig, gs[3])
mg1 = SeabornFig2Grid(g4, fig, gs[4])
mg2 = SeabornFig2Grid(g5, fig, gs[5])
mg0 = SeabornFig2Grid(g6, fig, gs[6])
mg1 = SeabornFig2Grid(g7, fig, gs[7])
mg2 = SeabornFig2Grid(g8, fig, gs[8])

plt.show()

完整图形的显示