# 机器学习(十五) K-means 算法

·  阅读 254

## 1 简介

K-means 又称 K-平均算法。简单来讲，K-平均聚类算法的目的就是：把 n 个点划分到 k 个聚类中，使得每个点都属于离它最近的均值（即聚类中心）所对应的聚类。

## 3 代码实例

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@Author:yanqiang
@File: k-means.py
@Time: 2019/1/17 14:08
@Software: PyCharm
@Description:
"""
# 导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: initialisation -- the 19 two-dimensional sample points to cluster.
df = pd.DataFrame({
    'x': [12, 20, 28, 18, 29, 33, 24, 45, 45, 52, 51, 52, 55, 53, 55, 61, 64, 69, 72],
    'y': [39, 36, 30, 52, 54, 46, 55, 59, 63, 70, 66, 63, 58, 23, 14, 8, 19, 7, 24],
})
# Fixed seed so the randomly drawn starting centroids are reproducible.
np.random.seed(300)

# Draw k starting centroids at random integer coordinates in [0, 80).
# The x coordinate of each centroid is drawn before its y coordinate.
k = 3
centroids = {
    i + 1: [np.random.randint(0, 80), np.random.randint(0, 80)]
    for i in range(k)
}  # centroids[label] = [x, y], labels are 1..k

# Plot the raw points in black and each starting centroid in its own colour.
fig = plt.figure(figsize=(5, 5))
plt.scatter(df['x'], df['y'], color='k')
colors = {1: 'b', 2: 'r', 3: 'g'}  # cluster label -> matplotlib colour code
for i in centroids:
    plt.scatter(*centroids[i], color=colors[i])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()

``````# 步骤2：Assignment 分配
# 将点归类到与聚类中心距离最短的类别
def assignment(df, centroids, color_map=None):
    """Assign every point to its nearest centroid.

    For each centroid a ``distance_from_<label>`` column holding the Euclidean
    distance of every row to that centroid is written into ``df``.  Two more
    columns are then (re)written: ``closest`` (label of the nearest centroid)
    and ``color`` (plot colour of that label).

    Args:
        df: DataFrame with numeric ``x`` and ``y`` columns.  It is modified in
            place.
        centroids: Mapping of cluster label -> ``[x, y]`` centroid position.
        color_map: Mapping of cluster label -> colour.  When omitted, the
            module-level ``colors`` dict is used.

    Returns:
        The same ``df`` object, with the columns described above added.

    Raises:
        KeyError: If a cluster label has no entry in the colour mapping.
    """
    if color_map is None:
        color_map = colors  # module-level palette shared with the plots

    # Remember which label each distance column belongs to.  Looking the label
    # up directly avoids parsing it back out of the column name (the previous
    # ``str.lstrip('distance_from_')`` strips a *character set*, not a prefix,
    # and only worked for purely numeric labels).
    column_to_label = {}
    for label, centre in centroids.items():
        column = 'distance_from_{}'.format(label)
        # Euclidean distance: sqrt((x1 - x2)^2 + (y1 - y2)^2)
        df[column] = np.sqrt(
            (df['x'] - centre[0]) ** 2
            + (df['y'] - centre[1]) ** 2
        )
        column_to_label[column] = label

    # idxmin(axis=1) yields, per row, the name of the column holding the
    # smallest distance; translate that column name back to its label.
    nearest_column = df.loc[:, list(column_to_label)].idxmin(axis=1)
    df['closest'] = nearest_column.map(column_to_label)
    df['color'] = df['closest'].map(lambda label: color_map[label])
    return df

# First assignment pass: label every point with its nearest starting centroid.
df = assignment(df, centroids)

# Plot the points in the colour of their assigned cluster, plus the centroids.
fig = plt.figure(figsize=(5, 5))
plt.scatter(df['x'], df['y'], color=df['color'], alpha=0.5, edgecolors='k')
for i in centroids:
    plt.scatter(*centroids[i], color=colors[i])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()

``````# 步骤3 ：Update 更新
# 重新计算每个聚类的质心
import copy

# Snapshot the centroid positions before they are recomputed.  A deep copy is
# required because update() overwrites the inner [x, y] lists in place.
old_centroids = copy.deepcopy(centroids)

def update(k, data=None):
    """Move each centroid to the mean position of the points assigned to it.

    Args:
        k: Mapping of cluster label -> ``[x, y]`` centroid position.  The inner
            lists are overwritten in place.  (The parameter name is kept for
            backward compatibility; it is the centroid mapping, not the
            number of clusters.)
        data: DataFrame with ``x``, ``y`` and ``closest`` columns.  When
            omitted, the module-level ``df`` is used.

    Returns:
        The same mapping object ``k``, with updated positions.
    """
    if data is None:
        data = df  # module-level point table maintained by assignment()

    # Iterate over the mapping that was passed in, not the global ``centroids``.
    for label in k:
        members = data[data['closest'] == label]
        if members.empty:
            # A cluster without points has no mean (it would become NaN and
            # could never attract points again); keep its previous position.
            continue
        k[label][0] = np.mean(members['x'])
        k[label][1] = np.mean(members['y'])
    return k

# Recompute every centroid as the mean of the points currently assigned to it.
centroids = update(centroids)

# Plot the points, the moved centroids, and an arrow showing each move.
fig = plt.figure(figsize=(5, 5))
ax = plt.axes()
plt.scatter(df['x'], df['y'], color=df['color'], alpha=0.5, edgecolors='k')
for i in centroids:
    plt.scatter(*centroids[i], color=colors[i])
plt.xlim(0, 80)
plt.ylim(0, 80)
for i in old_centroids:
    old_x = old_centroids[i][0]
    old_y = old_centroids[i][1]
    # Arrow shaft spans 75% of the displacement from old to new position.
    dx = (centroids[i][0] - old_centroids[i][0]) * 0.75
    dy = (centroids[i][1] - old_centroids[i][1]) * 0.75
    # The displacement was previously computed but never drawn.
    ax.arrow(old_x, old_y, dx, dy, head_width=2, head_length=3,
             fc=colors[i], ec=colors[i])
plt.show()

``````# 步骤4 重新分配 Repeat Assignment
# 重新将点归类到与新聚类中心距离最短的类别

# Re-label every point against the updated centroid positions.
df = assignment(df, centroids)

# Plot the new assignment.
fig = plt.figure(figsize=(5, 5))
plt.scatter(df['x'], df['y'], color=df['color'], alpha=0.5, edgecolors='k')
for i in centroids:
    plt.scatter(*centroids[i], color=colors[i])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()

``````# 步骤5 一直重复上面的步骤，直到每个的点所属的聚类中心不再更新
# Alternate update/assignment until an iteration changes no point's cluster.
while True:
    # Copy the labels first: assignment() rewrites the 'closest' column.
    closest_centroids = df['closest'].copy(deep=True)
    centroids = update(centroids)
    df = assignment(df, centroids)
    if closest_centroids.equals(df['closest']):
        # Converged: save the final table (coordinates, distances, labels).
        df.to_csv('cal_process.csv')
        break

# Plot the converged clustering: points by cluster colour, plus the centroids.
fig = plt.figure(figsize=(5, 5))
plt.scatter(df['x'], df['y'], color=df['color'], alpha=0.5, edgecolors='k')
for i in centroids:
    plt.scatter(*centroids[i], color=colors[i])
plt.xlim(0, 80)
plt.ylim(0, 80)
plt.show()