公众号:尤而小屋
作者:Peter
编辑:Peter
大家好,我是Peter~
今天给大家分享一篇关于聚类的建模文章,主要内容包含:
- 数据基本信息
- 原数据特征处理
- 数据探索性分析EDA
- 建模之数据预处理:数据编码、数据标准化、数据降维
- 实施聚类:肘图法确定k值、实施聚类、聚类结果可视化
- 聚类效果评估:从不同角度查看不同的簇群
1 导入库
In [1]:
import numpy as np
import pandas as pd
import datetime
# 可视化
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
import plotly_express as px
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn import metrics
# 警告处理
import warnings
import sys
if not sys.warnoptions:
warnings.simplefilter("ignore")
np.random.seed(42)
2 数据信息
2.1 导入数据
In [2]:
df = pd.read_csv("marketing_campaign.csv",sep="\t")
df.head()
Out[2]:
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5524 | 1957 | Graduation | Single | 58138.0 | 0 | 0 | 04-09-2012 | 58 | 635 | ... | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
1 | 2174 | 1954 | Graduation | Single | 46344.0 | 1 | 1 | 08-03-2014 | 38 | 11 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
2 | 4141 | 1965 | Graduation | Together | 71613.0 | 0 | 0 | 21-08-2013 | 26 | 426 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
3 | 6182 | 1984 | Graduation | Together | 26646.0 | 1 | 0 | 10-02-2014 | 26 | 11 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
4 | 5324 | 1981 | PhD | Married | 58293.0 | 1 | 0 | 19-01-2014 | 94 | 173 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
5 rows × 29 columns
整体数据量:
In [3]:
df.shape
Out[3]:
(2240, 29)
In [4]:
df.columns
Out[4]:
Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response'],
dtype='object')
In [5]:
df.dtypes # 字段的数据类型
Out[5]:
ID int64
Year_Birth int64
Education object
Marital_Status object
Income float64
Kidhome int64
Teenhome int64
Dt_Customer object
Recency int64
MntWines int64
MntFruits int64
MntMeatProducts int64
MntFishProducts int64
MntSweetProducts int64
MntGoldProds int64
NumDealsPurchases int64
NumWebPurchases int64
NumCatalogPurchases int64
NumStorePurchases int64
NumWebVisitsMonth int64
AcceptedCmp3 int64
AcceptedCmp4 int64
AcceptedCmp5 int64
AcceptedCmp1 int64
AcceptedCmp2 int64
Complain int64
Z_CostContact int64
Z_Revenue int64
Response int64
dtype: object
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 2240 non-null int64
1 Year_Birth 2240 non-null int64
2 Education 2240 non-null object
3 Marital_Status 2240 non-null object
4 Income 2216 non-null float64
5 Kidhome 2240 non-null int64
6 Teenhome 2240 non-null int64
7 Dt_Customer 2240 non-null object
8 Recency 2240 non-null int64
9 MntWines 2240 non-null int64
10 MntFruits 2240 non-null int64
11 MntMeatProducts 2240 non-null int64
12 MntFishProducts 2240 non-null int64
13 MntSweetProducts 2240 non-null int64
14 MntGoldProds 2240 non-null int64
15 NumDealsPurchases 2240 non-null int64
16 NumWebPurchases 2240 non-null int64
17 NumCatalogPurchases 2240 non-null int64
18 NumStorePurchases 2240 non-null int64
19 NumWebVisitsMonth 2240 non-null int64
20 AcceptedCmp3 2240 non-null int64
21 AcceptedCmp4 2240 non-null int64
22 AcceptedCmp5 2240 non-null int64
23 AcceptedCmp1 2240 non-null int64
24 AcceptedCmp2 2240 non-null int64
25 Complain 2240 non-null int64
26 Z_CostContact 2240 non-null int64
27 Z_Revenue 2240 non-null int64
28 Response 2240 non-null int64
dtypes: float64(1), int64(25), object(3)
memory usage: 507.6+ KB
2.2 数据缺失值情况
In [7]:
df.isnull().sum()
Out[7]:
ID 0
Year_Birth 0
Education 0
Marital_Status 0
Income 24
Kidhome 0
Teenhome 0
Dt_Customer 0
Recency 0
MntWines 0
MntFruits 0
MntMeatProducts 0
MntFishProducts 0
MntSweetProducts 0
MntGoldProds 0
NumDealsPurchases 0
NumWebPurchases 0
NumCatalogPurchases 0
NumStorePurchases 0
NumWebVisitsMonth 0
AcceptedCmp3 0
AcceptedCmp4 0
AcceptedCmp5 0
AcceptedCmp1 0
AcceptedCmp2 0
Complain 0
Z_CostContact 0
Z_Revenue 0
Response 0
dtype: int64
字段缺失值比例:
In [8]:
df.isnull().sum() / len(df) # 缺失值比例
Out[8]:
ID 0.000000
Year_Birth 0.000000
Education 0.000000
Marital_Status 0.000000
Income 0.010714
Kidhome 0.000000
Teenhome 0.000000
Dt_Customer 0.000000
Recency 0.000000
MntWines 0.000000
MntFruits 0.000000
MntMeatProducts 0.000000
MntFishProducts 0.000000
MntSweetProducts 0.000000
MntGoldProds 0.000000
NumDealsPurchases 0.000000
NumWebPurchases 0.000000
NumCatalogPurchases 0.000000
NumStorePurchases 0.000000
NumWebVisitsMonth 0.000000
AcceptedCmp3 0.000000
AcceptedCmp4 0.000000
AcceptedCmp5 0.000000
AcceptedCmp1 0.000000
AcceptedCmp2 0.000000
Complain 0.000000
Z_CostContact 0.000000
Z_Revenue 0.000000
Response 0.000000
dtype: float64
缺失值的比例非常小,我们考虑直接删除:
In [9]:
df.dropna(inplace=True)
df.isnull().sum() # 删除缺失值后的统计情况
Out[9]:
ID 0
Year_Birth 0
Education 0
Marital_Status 0
Income 0
Kidhome 0
Teenhome 0
Dt_Customer 0
Recency 0
MntWines 0
MntFruits 0
MntMeatProducts 0
MntFishProducts 0
MntSweetProducts 0
MntGoldProds 0
NumDealsPurchases 0
NumWebPurchases 0
NumCatalogPurchases 0
NumStorePurchases 0
NumWebVisitsMonth 0
AcceptedCmp3 0
AcceptedCmp4 0
AcceptedCmp5 0
AcceptedCmp1 0
AcceptedCmp2 0
Complain 0
Z_CostContact 0
Z_Revenue 0
Response 0
dtype: int64
3 数据特征处理
3.1 时间字段Dt_Customer
关于时间字段处理
In [10]:
df["Dt_Customer"].value_counts()
Out[10]:
Dt_Customer
31-08-2012 12
12-09-2012 11
14-02-2013 11
12-05-2014 11
20-08-2013 10
..
05-08-2012 1
18-11-2012 1
09-05-2014 1
26-06-2013 1
09-01-2014 1
Name: count, Length: 662, dtype: int64
In [11]:
df["Dt_Customer"] = pd.to_datetime(df["Dt_Customer"], dayfirst=True) # 转成时间相关字段,需要给定参数dayfirst
df["Dt_Customer"]
Out[11]:
0 2012-09-04
1 2014-03-08
2 2013-08-21
3 2014-02-10
4 2014-01-19
...
2235 2013-06-13
2236 2014-06-10
2237 2014-01-25
2238 2014-01-24
2239 2012-10-15
Name: Dt_Customer, Length: 2216, dtype: datetime64[ns]
In [12]:
df.sort_values("Year_Birth",ascending=False)
Out[12]:
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | NumWebVisitsMonth | AcceptedCmp3 | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
46 | 9909 | 1996 | 2n Cycle | Married | 7500.0 | 0 | 0 | 2012-11-09 | 24 | 3 | ... | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 |
1170 | 193 | 1996 | Basic | Married | 14421.0 | 0 | 0 | 2014-02-17 | 81 | 0 | ... | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
747 | 10548 | 1995 | Graduation | Single | 71163.0 | 0 | 0 | 2014-03-09 | 30 | 283 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
2213 | 3661 | 1995 | 2n Cycle | Single | 80617.0 | 0 | 0 | 2012-10-12 | 42 | 594 | ... | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
696 | 8315 | 1995 | Graduation | Single | 34824.0 | 0 | 0 | 2014-03-26 | 65 | 4 | ... | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
424 | 6932 | 1941 | PhD | Married | 93027.0 | 0 | 0 | 2013-04-13 | 77 | 1285 | ... | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 3 | 11 | 0 |
1950 | 6663 | 1940 | PhD | Single | 51141.0 | 0 | 0 | 2013-07-08 | 96 | 144 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
192 | 7829 | 1900 | 2n Cycle | Divorced | 36640.0 | 1 | 0 | 2013-09-26 | 99 | 15 | ... | 5 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 11 | 0 |
339 | 1150 | 1899 | PhD | Together | 83532.0 | 0 | 0 | 2013-09-26 | 36 | 755 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 3 | 11 | 0 |
239 | 11004 | 1893 | 2n Cycle | Single | 60182.0 | 0 | 1 | 2014-05-17 | 23 | 8 | ... | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 |
2216 rows × 29 columns
时间日期中的最值:
In [13]:
dates = df["Dt_Customer"].tolist()
dates[:5]
Out[13]:
[Timestamp('2012-09-04 00:00:00'),
Timestamp('2014-03-08 00:00:00'),
Timestamp('2013-08-21 00:00:00'),
Timestamp('2014-02-10 00:00:00'),
Timestamp('2014-01-19 00:00:00')]
In [14]:
max(dates)
Out[14]:
Timestamp('2014-06-29 00:00:00')
In [15]:
min(dates)
Out[15]:
Timestamp('2012-07-30 00:00:00')
3.2 新特征Customer_For
创建一个特征("Customer_For"),表示客户开始在商店购物的日期与最后记录日期之间相差的天数。
In [16]:
days = []
max_day = max(dates)
for i in dates:
delta = max_day - i # 和最近日期相差天数
days.append(delta.days) # 提取天信息
days[:10]
Out[16]:
[663, 113, 312, 139, 161, 293, 593, 417, 388, 108]
In [17]:
df["Customer_For"] = days
强制转成数值型数据:
In [18]:
df["Customer_For"] = pd.to_numeric(df["Customer_For"], errors="coerce")
3.3 Marital_Status & Education
In [19]:
df["Marital_Status"].value_counts()
Out[19]:
Marital_Status
Married 857
Together 573
Single 471
Divorced 232
Widow 76
Alone 3
Absurd 2
YOLO 2
Name: count, dtype: int64
In [20]:
df["Education"].value_counts()
Out[20]:
Education
Graduation 1116
PhD 481
Master 365
2n Cycle 200
Basic 54
Name: count, dtype: int64
3.4 特征衍生
In [21]:
# 年龄:用当前年 - 出生年
df["Age"] = 2024 - df["Year_Birth"]
In [22]:
df[df["Age"] == 131]
Out[22]:
ID | Year_Birth | Education | Marital_Status | Income | Kidhome | Teenhome | Dt_Customer | Recency | MntWines | ... | AcceptedCmp4 | AcceptedCmp5 | AcceptedCmp1 | AcceptedCmp2 | Complain | Z_CostContact | Z_Revenue | Response | Customer_For | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
239 | 11004 | 1893 | 2n Cycle | Single | 60182.0 | 0 | 1 | 2014-05-17 | 23 | 8 | ... | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 43 | 131 |
1 rows × 31 columns
In [23]:
# 所有的花费:将多个字段合并为一个字段Spent
df["Spent"] = df["MntWines"] + df["MntFruits"]+ df["MntMeatProducts"]+ df["MntFishProducts"]+ df["MntSweetProducts"]+ df["MntGoldProds"]
In [24]:
# 生活状态Living_with
df["Living_With"] = df["Marital_Status"].replace({"Married":"Partner", # 共同生活
"Together":"Partner",
"Absurd":"Alone", # 独自一人
"Widow":"Alone",
"YOLO":"Alone",
"Divorced":"Alone",
"Single":"Alone"}
)
In [25]:
# 小孩个数
df["Children"] = df["Kidhome"] + df["Teenhome"]
In [26]:
# 家庭成员总数
df["Family_Size"] = df["Living_With"].replace({"Alone": 1, "Partner": 2}) + df["Children"]
In [27]:
# 是否为父母
df["Is_Parent"] = np.where(df["Children"] > 0, 1, 0)
In [28]:
# 受教育程度
df["Education"] = df["Education"].replace({"Basic":"Undergrauate",
"2n Cycle":"Undergrauate",
"Graducation":"Graduate",
"Master":"Postgraduate",
"PhD":"Postgraduate"
})
3.5 字段重命名rename
In [29]:
df = df.rename(columns={"MntWines": "Wines",
"MntFruits":"Fruits",
"MntMeatProducts":"Meat",
"MntFishProducts":"Fish",
"MntSweetProducts":"Sweets",
"MntGoldProds":"Gold"})
3.6 删除无效字段drop
In [30]:
to_drop = ["Marital_Status", "Dt_Customer", "Z_CostContact", "Z_Revenue", "Year_Birth", "ID"]
In [31]:
df.drop(to_drop, axis=1, inplace=True)
In [32]:
df.describe()
Out[32]:
Income | Kidhome | Teenhome | Recency | Wines | Fruits | Meat | Fish | Sweets | Gold | ... | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Customer_For | Age | Spent | Children | Family_Size | Is_Parent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | ... | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 | 2216.000000 |
mean | 52247.251354 | 0.441787 | 0.505415 | 49.012635 | 305.091606 | 26.356047 | 166.995939 | 37.637635 | 27.028881 | 43.965253 | ... | 0.064079 | 0.013538 | 0.009477 | 0.150271 | 353.521209 | 55.179603 | 607.075361 | 0.947202 | 2.592509 | 0.714350 |
std | 25173.076661 | 0.536896 | 0.544181 | 28.948352 | 337.327920 | 39.793917 | 224.283273 | 54.752082 | 41.072046 | 51.815414 | ... | 0.244950 | 0.115588 | 0.096907 | 0.357417 | 202.434667 | 11.985554 | 602.900476 | 0.749062 | 0.905722 | 0.451825 |
min | 1730.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 28.000000 | 5.000000 | 0.000000 | 1.000000 | 0.000000 |
25% | 35303.000000 | 0.000000 | 0.000000 | 24.000000 | 24.000000 | 2.000000 | 16.000000 | 3.000000 | 1.000000 | 9.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 180.000000 | 47.000000 | 69.000000 | 0.000000 | 2.000000 | 0.000000 |
50% | 51381.500000 | 0.000000 | 0.000000 | 49.000000 | 174.500000 | 8.000000 | 68.000000 | 12.000000 | 8.000000 | 24.500000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 355.500000 | 54.000000 | 396.500000 | 1.000000 | 3.000000 | 1.000000 |
75% | 68522.000000 | 1.000000 | 1.000000 | 74.000000 | 505.000000 | 33.000000 | 232.250000 | 50.000000 | 33.000000 | 56.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 529.000000 | 65.000000 | 1048.000000 | 1.000000 | 3.000000 | 1.000000 |
max | 666666.000000 | 2.000000 | 2.000000 | 99.000000 | 1493.000000 | 199.000000 | 1725.000000 | 259.000000 | 262.000000 | 321.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 699.000000 | 131.000000 | 2525.000000 | 3.000000 | 5.000000 | 1.000000 |
8 rows × 28 columns
4 数据EDA
4.1 两两特征关系pairplot
In [33]:
sns.set(rc={"axes.facecolor": "#FFF9ED", "figure.facecolor": "#FFF9ED"})
# pallet = ["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#9F8A78", "#F3AB60"]
To_Plot = [ "Income", "Recency", "Customer_For", "Age", "Spent", "Is_Parent"]
# 取消palette
# sns.pairplot(df[To_Plot], hue= "Is_Parent",palette= (["#682F2F","#F3AB60"]))
plt.figure(figsize=(10, 6))
sns.pairplot(df[To_Plot], hue= "Is_Parent")
plt.show()
可以看到在Age和Income中存在很多的异常值:
In [34]:
fig = px.box(df["Age"])
fig.show()
可以看到Age字段的上四分位数是:84
In [35]:
fig = px.box(df, y=["Income"], points="outliers",)
fig.show()
在收入Income字段中,我们取临界值为600000:
In [36]:
df = df[df["Age"] < 90]
df = df[df["Income"] < 600000] # 两次数据过滤
新数据的总长度为:
In [37]:
len(df)
Out[37]:
2212
4.2 相关系数
使用df中包含两个字符类型的字段,在新版本的pandas中需要带上参数numeric_only=True
:
In [38]:
corr = df.corr(numeric_only=True)
corr.head()
Out[38]:
Income | Kidhome | Teenhome | Recency | Wines | Fruits | Meat | Fish | Sweets | Gold | ... | AcceptedCmp1 | AcceptedCmp2 | Complain | Response | Customer_For | Age | Spent | Children | Family_Size | Is_Parent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Income | 1.000000 | -0.514523 | 0.034565 | 0.007965 | 0.688209 | 0.507354 | 0.692279 | 0.520040 | 0.523599 | 0.388299 | ... | 0.327524 | 0.104036 | -0.027900 | 0.161387 | -0.023760 | 0.199977 | 0.792740 | -0.343529 | -0.286638 | -0.403132 |
Kidhome | -0.514523 | 1.000000 | -0.039066 | 0.010623 | -0.497203 | -0.373258 | -0.439031 | -0.388643 | -0.377843 | -0.354922 | ... | -0.174261 | -0.081911 | 0.037067 | -0.077901 | -0.055281 | -0.237497 | -0.557949 | 0.688081 | 0.583250 | 0.520355 |
Teenhome | 0.034565 | -0.039066 | 1.000000 | 0.014392 | 0.003945 | -0.175905 | -0.261134 | -0.205235 | -0.163107 | -0.018579 | ... | -0.145198 | -0.015633 | 0.007746 | -0.154402 | 0.018557 | 0.361932 | -0.137964 | 0.698199 | 0.594481 | 0.587993 |
Recency | 0.007965 | 0.010623 | 0.014392 | 1.000000 | 0.015981 | -0.005257 | 0.022914 | 0.000788 | 0.025244 | 0.018148 | ... | -0.021147 | -0.001429 | 0.005713 | -0.200114 | 0.025681 | 0.015694 | 0.020479 | 0.018062 | 0.014717 | 0.002189 |
Wines | 0.688209 | -0.497203 | 0.003945 | 0.015981 | 1.000000 | 0.385844 | 0.568081 | 0.396915 | 0.389583 | 0.391461 | ... | 0.351610 | 0.206309 | -0.036420 | 0.246320 | 0.167852 | 0.164615 | 0.892996 | -0.353356 | -0.296702 | -0.341994 |
5 rows × 28 columns
In [39]:
cmap = colors.ListedColormap(["#682F2F", "#9E726F", "#D6B2B1", "#B9C0C9", "#1F8A78", "#F3AB60"])
plt.figure(figsize=(20, 16))
sns.heatmap(corr, annot=True, cmap=cmap, center=0)
plt.show()
5 数据预处理
5.1 数据编码LabelEncoder
针对字符类型数据的编码处理:
In [40]:
df.columns
Out[40]:
Index(['Education', 'Income', 'Kidhome', 'Teenhome', 'Recency', 'Wines', 'Fruits', 'Meat', 'Fish', 'Sweets', 'Gold', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Response', 'Customer_For', 'Age', 'Spent', 'Living_With', 'Children', 'Family_Size', 'Is_Parent'],
dtype='object')
确定属于object类型的字段:
In [41]:
object_columns = df.columns[df.dtypes == "object"].tolist()
object_columns
Out[41]:
['Education', 'Living_With']
In [42]:
df["Education"].value_counts()
Out[42]:
Education
Graduation 1115
Postgraduate 845
Undergrauate 252
Name: count, dtype: int64
In [43]:
df["Living_With"].value_counts()
Out[43]:
Living_With
Partner 1428
Alone 784
Name: count, dtype: int64
对上面两个字段的类型编码工作:
In [44]:
le = LabelEncoder()
for col in object_columns:
df[col] = df[[col]].apply(le.fit_transform)
此时字段全部是数值相关的字段类型:
In [45]:
pd.value_counts(df.dtypes)
Out[45]:
int64 26
int32 3
float64 1
Name: count, dtype: int64
5.2 数据标准化StandardScaler
In [46]:
# 副本
df1 = df.copy()
In [47]:
# 待删除字段
cols_del = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1','AcceptedCmp2', 'Complain', 'Response']
对副本的操作:
In [48]:
df1.drop(cols_del, axis=1, inplace=True)
In [49]:
ss = StandardScaler()
ss.fit(df1)
Out[49]:
StandardScaler
StandardScaler()
字段经过缩放后的数据:
In [50]:
ss_df = pd.DataFrame(ss.transform(df1), columns=df1.columns)
ss_df
Out[50]:
Education | Income | Kidhome | Teenhome | Recency | Wines | Fruits | Meat | Fish | Sweets | ... | NumCatalogPurchases | NumStorePurchases | NumWebVisitsMonth | Customer_For | Age | Spent | Living_With | Children | Family_Size | Is_Parent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -0.893586 | 0.287105 | -0.822754 | -0.929699 | 0.310353 | 0.977660 | 1.552041 | 1.690293 | 2.453472 | 1.483713 | ... | 2.503607 | -0.555814 | 0.692181 | 1.527721 | 1.018352 | 1.676245 | -1.349603 | -1.264598 | -1.758359 | -1.581139 |
1 | -0.893586 | -0.260882 | 1.040021 | 0.908097 | -0.380813 | -0.872618 | -0.637461 | -0.718230 | -0.651004 | -0.634019 | ... | -0.571340 | -1.171160 | -0.132545 | -1.189011 | 1.274785 | -0.963297 | -1.349603 | 1.404572 | 0.449070 | 0.632456 |
2 | -0.893586 | 0.913196 | -0.822754 | -0.929699 | -0.795514 | 0.357935 | 0.570540 | -0.178542 | 1.339513 | -0.147184 | ... | -0.229679 | 1.290224 | -0.544908 | -0.206048 | 0.334530 | 0.280110 | 0.740959 | -1.264598 | -0.654644 | -1.581139 |
3 | -0.893586 | -1.176114 | 1.040021 | -0.929699 | -0.795514 | -0.872618 | -0.561961 | -0.655787 | -0.504911 | -0.585335 | ... | -0.913000 | -0.555814 | 0.279818 | -1.060584 | -1.289547 | -0.920135 | 0.740959 | 0.069987 | 0.449070 | 0.632456 |
4 | 0.571657 | 0.294307 | 1.040021 | -0.929699 | 1.554453 | -0.392257 | 0.419540 | -0.218684 | 0.152508 | -0.001133 | ... | 0.111982 | 0.059532 | -0.132545 | -0.951915 | -1.033114 | -0.307562 | 0.740959 | 0.069987 | 0.449070 | 0.632456 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2207 | -0.893586 | 0.430444 | -0.822754 | 0.908097 | -0.104347 | 1.197084 | 0.419540 | 0.066770 | 0.079461 | 2.213965 | ... | 0.111982 | -0.555814 | -0.132545 | 0.134778 | 0.163575 | 1.218061 | 0.740959 | 0.069987 | 0.449070 | 0.632456 |
2208 | 0.571657 | 0.560123 | 2.902796 | 0.908097 | 0.241237 | 0.298631 | -0.662628 | -0.611184 | -0.687527 | -0.658360 | ... | -0.229679 | -0.248141 | 0.692181 | -1.653326 | 1.958607 | -0.271040 | 0.740959 | 2.739158 | 2.656499 | 0.632456 |
2209 | -0.893586 | 0.233347 | -0.822754 | -0.929699 | 1.450778 | 1.787156 | 0.545373 | 0.222878 | -0.103155 | -0.366260 | ... | 0.111982 | 2.213242 | 0.279818 | -0.981552 | -1.033114 | 1.052052 | -1.349603 | -1.264598 | -1.758359 | -1.581139 |
2210 | 0.571657 | 0.803172 | -0.822754 | 0.908097 | -1.417564 | 0.363866 | 0.092373 | 0.209498 | 0.773403 | 0.071892 | ... | 0.795303 | 1.290224 | -0.957271 | -0.976612 | 1.103830 | 0.391336 | 0.740959 | 0.069987 | 0.449070 | 0.632456 |
2211 | 0.571657 | 0.042290 | 1.040021 | 0.908097 | -0.311697 | -0.656159 | -0.587128 | -0.472917 | -0.651004 | -0.634019 | ... | -0.571340 | -0.555814 | 0.692181 | 1.325201 | 1.274785 | -0.722584 | 0.740959 | 1.404572 | 1.552784 | 0.632456 |
2212 rows × 23 columns
5.3 数据降维PCA
5.3.1 实施降维
In [51]:
pca = PCA(n_components=3)
pca.fit(ss_df) # 对标准化后的数据进行降维
Out[51]:
PCA
PCA(n_components=3)
经过降维后的数据:
In [52]:
PCA_ds = pd.DataFrame(pca.transform(ss_df), columns=(["col1","col2","col3"]))
PCA_ds
Out[52]:
col1 | col2 | col3 | |
---|---|---|---|
0 | 4.986336 | -0.161502 | 2.445704 |
1 | -2.874168 | 0.022701 | -1.530784 |
2 | 2.615763 | -0.731408 | -0.264243 |
3 | -2.654568 | -1.455875 | -0.398126 |
4 | -0.656015 | 0.177848 | -0.141472 |
... | ... | ... | ... |
2207 | 2.322945 | 2.437513 | 0.495596 |
2208 | -3.100999 | 4.014083 | -1.415940 |
2209 | 2.666497 | -1.893706 | 0.556943 |
2210 | 1.487349 | 1.651778 | -1.760027 |
2211 | -2.733469 | 1.688075 | -0.212150 |
2212 rows × 3 columns
In [53]:
PCA_ds.describe().T
Out[53]:
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
col1 | 2212.0 | -3.212219e-17 | 2.878602 | -5.978123 | -2.539470 | -0.781595 | 2.386380 | 7.452915 |
col2 | 2212.0 | -1.284887e-17 | 1.709469 | -4.194757 | -1.323932 | -0.173716 | 1.234923 | 6.168185 |
col3 | 2212.0 | 4.577411e-17 | 1.231685 | -3.625184 | -0.853556 | -0.051292 | 0.863841 | 6.746845 |
5.3.2 降维可视化
对3个主成分的可视化:
In [54]:
x = PCA_ds["col1"]
y = PCA_ds["col2"]
z = PCA_ds["col3"]
In [55]:
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x,y,z, c="maroon", marker="o" )
ax.set_title("3D Figure Of Data In The Reduced Dimension")
plt.show()
6 聚类Clustering
6.1 肘图确定k值
In [56]:
Elbow_M = KElbowVisualizer(KMeans(), k=10)
Elbow_M.fit(PCA_ds)
Elbow_M.show()
可以看到在k=4时候是最好的。
6.2 实施聚类
实施凝聚层次聚类AgglomerativeClustering
In [57]:
# k=4
AC = AgglomerativeClustering(n_clusters=4)
y_pred = AC.fit_predict(PCA_ds) # 预测值
y_pred
Out[57]:
array([2, 3, 2, ..., 2, 0, 3], dtype=int64)
给降维后的数据贴上聚类的结果y_pred
:
In [58]:
PCA_ds["Clusters"] = y_pred
PCA_ds["Clusters"] = PCA_ds["Clusters"].astype("object") # 将聚类结果字段转成字符型
给原数据也贴上聚类的结果y_pred
:
In [59]:
df["Clusters"] = y_pred
df["Clusters"] = df["Clusters"].astype("object") # 将聚类结果字段转成字符型
6.3 聚类效果可视化
In [60]:
x = PCA_ds["col1"]
y = PCA_ds["col2"]
z = PCA_ds["col3"]
In [61]:
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x,y,z,s=40, c=PCA_ds["Clusters"], marker="o",cmap=cmap)
ax.set_title("3D Figure of Clusters")
plt.show()
7 聚类效果评估
7.1 不同簇群统计countplot
如何显示柱子上方的数据:
In [62]:
# pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60"]
# pl = sns.countplot(x=df["Clusters"], palette= pal)
ax = sns.countplot(x=df["Clusters"])
# 在柱子上方显示数据
ax.bar_label(ax.containers[0])
plt.title("Distribution Of The Clusters")
plt.show()
7.2 基于簇群多指标关系分布scatterplot
In [63]:
# # pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60"]
# pl = sns.scatterplot(data = df, # 数据
# x="Spent", # x-y轴信息
# y="Income",
# hue="Clusters", # 分类字段
# #palette= pal
# )
# pl.set_title("Cluster's Distribution Based On Income & Spent")
# plt.legend()
# plt.show()
In [64]:
df["Clusters"] = df["Clusters"].astype("object")
In [65]:
fig = px.scatter(df,x="Spent", y="Income",color="Clusters")
fig.show()
- Group0:高Spent + 平均Income
- Group1:低SPent + 低Income
- Group2:高Spent + 高Income
- Group3:低Spent + 平均Income
7.3 基于簇群的单个指标分布(swarmplot+boxenplot)
- swarmplot()可以自己实现对数据分类的展现,也可以作为箱形图或小提琴图的一种补充,用来显示所有结果以及基本分布情况。
- boxenplot是为更大的数据集绘制增强的箱型图。这种风格的绘图最初被命名为“信值图”,因为它显示了大量被定义为“置信区间”的分位数。它类似于绘制分布的非参数表示的箱形图,其中所有特征对应于实际观察的数值点。通过绘制更多分位数,它提供了有关分布形状的更多信息,特别是尾部数据的分布。
In [66]:
plt.figure()
ax = sns.swarmplot(x=df["Clusters"],y=df["Spent"],color="#CBEDDD",alpha=0.5)
ax = sns.boxenplot(x=df["Clusters"],y=df["Spent"])
plt.show()
基于plotly的实现:
In [67]:
fig = px.violin(df,
x="Clusters",
y="Spent",
color="Clusters",
violinmode="group",
box=True
)
fig.show()
7.4 整体情况All_AcceptedCmp
整体的AcceptedCmp表现情况:
In [68]:
df["All_AcceptedCmp"] = df["AcceptedCmp1"]+ df["AcceptedCmp2"]+ df["AcceptedCmp3"]+ df["AcceptedCmp4"]+ df["AcceptedCmp5"]
df.head()
Out[68]:
Education | Income | Kidhome | Teenhome | Recency | Wines | Fruits | Meat | Fish | Sweets | ... | Response | Customer_For | Age | Spent | Living_With | Children | Family_Size | Is_Parent | Clusters | All_AcceptedCmp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 58138.0 | 0 | 0 | 58 | 635 | 88 | 546 | 172 | 88 | ... | 1 | 663 | 67 | 1617 | 0 | 0 | 1 | 0 | 2 | 0 |
1 | 0 | 46344.0 | 1 | 1 | 38 | 11 | 1 | 6 | 2 | 1 | ... | 0 | 113 | 70 | 27 | 0 | 2 | 3 | 1 | 3 | 0 |
2 | 0 | 71613.0 | 0 | 0 | 26 | 426 | 49 | 127 | 111 | 21 | ... | 0 | 312 | 59 | 776 | 1 | 0 | 2 | 0 | 2 | 0 |
3 | 0 | 26646.0 | 1 | 0 | 26 | 11 | 4 | 20 | 10 | 3 | ... | 0 | 139 | 40 | 53 | 1 | 1 | 3 | 1 | 1 | 0 |
4 | 1 | 58293.0 | 1 | 0 | 94 | 173 | 43 | 118 | 46 | 27 | ... | 0 | 161 | 43 | 422 | 1 | 1 | 3 | 1 | 3 | 0 |
5 rows × 32 columns
In [69]:
df["All_AcceptedCmp"].value_counts()
Out[69]:
All_AcceptedCmp
0 1754
1 322
2 81
3 44
4 11
Name: count, dtype: int64
In [70]:
pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60"]
plt.figure()
# 写法1:基于df数据使用countplot统计
pl = sns.countplot(x=df["All_AcceptedCmp"], hue=df["Clusters"], palette= pal)
pl.set_title("Count Of Promotion Accepted")
pl.set_xlabel("Number Of Total Accepted Promotions")
plt.show()
另一种写法:
In [71]:
pal = ["#682F2F","#B9C0C9", "#9F8A78","#F3AB60"]
plt.figure()
# 写法2
sns.countplot(x=df["All_AcceptedCmp"], hue=df["Clusters"], palette= pal)
plt.title("Count Of Promotion Accepted")
plt.xlabel("Number Of Total Accepted Promotions")
plt.show()
统计不同簇群下的All_AcceptedCmp情况:
In [72]:
AcceptedCmp_by_Clusters = df.groupby(["Clusters","All_AcceptedCmp"]).size().reset_index()
AcceptedCmp_by_Clusters.head()
Out[72]:
Clusters | All_AcceptedCmp | 0 | |
---|---|---|---|
0 | 0 | 0 | 427 |
1 | 0 | 1 | 94 |
2 | 0 | 2 | 25 |
3 | 0 | 3 | 8 |
4 | 0 | 4 | 2 |
In [73]:
AcceptedCmp_by_Clusters.columns = ["Clusters","All_AcceptedCmp","Count"]
AcceptedCmp_by_Clusters["Count"] = AcceptedCmp_by_Clusters["Count"].astype("object")
AcceptedCmp_by_Clusters
Out[73]:
Clusters | All_AcceptedCmp | Count | |
---|---|---|---|
0 | 0 | 0 | 427 |
1 | 0 | 1 | 94 |
2 | 0 | 2 | 25 |
3 | 0 | 3 | 8 |
4 | 0 | 4 | 2 |
5 | 1 | 0 | 545 |
6 | 1 | 1 | 52 |
7 | 1 | 2 | 1 |
8 | 2 | 0 | 280 |
9 | 2 | 1 | 119 |
10 | 2 | 2 | 51 |
11 | 2 | 3 | 36 |
12 | 2 | 4 | 9 |
13 | 3 | 0 | 502 |
14 | 3 | 1 | 57 |
15 | 3 | 2 | 4 |
7.5 促销活动NumDealsPurchases
In [74]:
plt.figure()
pl=sns.boxenplot(y=df["NumDealsPurchases"],x=df["Clusters"])
pl.set_title("Number of Deals Purchased")
plt.show()
另一种写法:
In [75]:
# 写法2
plt.figure()
sns.boxenplot(y=df["NumDealsPurchases"],x=df["Clusters"])
plt.title("Number of Deals Purchased")
plt.show()
基于plotly的实现:
In [76]:
fig = px.box(df,x="Clusters",y="NumDealsPurchases",color="Clusters")
fig.show()
7.6 不同簇群下的人群特点
In [77]:
Personal = ["Kidhome","Teenhome","Customer_For", "Age", "Children", "Family_Size", "Is_Parent", "Education","Living_With"]
In [78]:
# 写法1
for col in Personal:
plt.figure()
sns.jointplot(x=df[col], y=df["Spent"], hue=df["Clusters"],kind="kde",palette=pal)
plt.show()
另一种写法:基于seaborn的jointplot,需要自定义
In [79]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import gridspec
class SeabornFig2Grid():
"""
自定义类SeabornFig2Grid
"""
def __init__(self, seaborngrid, fig, subplot_spec):
self.fig = fig
self.sg = seaborngrid
self.subplot = subplot_spec
if isinstance(self.sg, sns.axisgrid.FacetGrid) or isinstance(self.sg, sns.axisgrid.PairGrid):
self._movegrid()
elif isinstance(self.sg, sns.axisgrid.JointGrid):
self._movejointgrid()
self._finalize()
def _movegrid(self):
"""Move PairGrid or Facetgrid"""
self._resize()
n = self.sg.axes.shape[0]
m = self.sg.axes.shape[1]
self.subgrid = gridspec.GridSpecFromSubplotSpec(n,m, subplot_spec=self.subplot)
for i in range(n):
for j in range(m):
self._moveaxes(self.sg.axes[i,j], self.subgrid[i,j])
def _movejointgrid(self):
""" Move Jointgrid """
h= self.sg.ax_joint.get_position().height
h2= self.sg.ax_marg_x.get_position().height
r = int(np.round(h/h2))
self._resize()
self.subgrid = gridspec.GridSpecFromSubplotSpec(r+1,r+1, subplot_spec=self.subplot)
self._moveaxes(self.sg.ax_joint, self.subgrid[1:, :-1])
self._moveaxes(self.sg.ax_marg_x, self.subgrid[0, :-1])
self._moveaxes(self.sg.ax_marg_y, self.subgrid[1:, -1])
def _moveaxes(self, ax, gs):
ax.remove()
ax.figure=self.fig
self.fig.axes.append(ax)
self.fig.add_axes(ax)
ax._subplotspec = gs
ax.set_position(gs.get_position(self.fig))
ax.set_subplotspec(gs)
def _finalize(self):
plt.close(self.sg.fig)
self.fig.canvas.mpl_connect("resize_event", self._resize)
self.fig.canvas.draw()
def _resize(self, evt=None):
self.sg.fig.set_size_inches(self.fig.get_size_inches())
In [80]:
fig, axes = plt.subplots(3, 3, figsize=(12,14))
g0 = sns.jointplot(ax=axes[0,0], data=df, x="Kidhome", y="Spent", hue="Clusters",kind="kde", palette=pal)
g1 = sns.jointplot(ax=axes[0,1], data=df, x="Teenhome", y="Spent", hue="Clusters",kind="kde", palette=pal)
g2 = sns.jointplot(ax=axes[0,2], data=df, x="Customer_For", y="Spent", hue="Clusters",kind="kde", palette=pal)
g3 = sns.jointplot(ax=axes[1,0], data=df, x="Age", y="Spent", hue="Clusters",kind="kde", palette=pal)
g4 = sns.jointplot(ax=axes[1,1], data=df, x="Children", y="Spent", hue="Clusters",kind="kde", palette=pal)
g5 = sns.jointplot(ax=axes[1,2], data=df, x="Family_Size", y="Spent", hue="Clusters",kind="kde", palette=pal)
g6 = sns.jointplot(ax=axes[2,0], data=df, x="Is_Parent", y="Spent", hue="Clusters",kind="kde", palette=pal)
g7 = sns.jointplot(ax=axes[2,1], data=df, x="Education", y="Spent", hue="Clusters",kind="kde", palette=pal)
g8 = sns.jointplot(ax=axes[2,2], data=df, x="Living_With", y="Spent", hue="Clusters",kind="kde", palette=pal)
gs = gridspec.GridSpec(3,3)
mg0 = SeabornFig2Grid(g0, fig, gs[0])
mg1 = SeabornFig2Grid(g1, fig, gs[1])
mg2 = SeabornFig2Grid(g2, fig, gs[2])
mg0 = SeabornFig2Grid(g3, fig, gs[3])
mg1 = SeabornFig2Grid(g4, fig, gs[4])
mg2 = SeabornFig2Grid(g5, fig, gs[5])
mg0 = SeabornFig2Grid(g6, fig, gs[6])
mg1 = SeabornFig2Grid(g7, fig, gs[7])
mg2 = SeabornFig2Grid(g8, fig, gs[8])
plt.show()
完整图形的显示