Python K-Means 聚类算法

229 阅读1分钟

参数说明：lines 为二维数据点列表，形如 [[a, b], [c, d]]；k 为聚类个数；m 为聚类方法（取值见下）。

m = 1（Cluter_ways.kMeans）：数据量较小时使用的聚类方法，对应 KMeans

m = 2（Cluter_ways.MiniBatchKMeans）：数据量较大时使用的聚类方法，对应 MiniBatchKMeans

import time
from collections import Counter
from enum import Enum

import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans

class Cluter_ways(Enum):
    """Selector for the clustering estimator to use.

    The numeric values match the method codes described in the accompanying
    notes (1 for the small-data method, 2 for the large-data method).
    """

    # Small-data method: standard KMeans.
    kMeans = 1
    # Large-data method: MiniBatchKMeans.
    MiniBatchKMeans = 2

def kMeans(self, lines, km):
    """Fit ``km`` on ``lines`` and return each centroid with a size score appended.

    Args:
        self: Not used by this function; kept so it can still be bound and
            called as a method.
        lines: 2-D sequence of points, e.g. ``[[a, b], [c, d]]``.
        km: Estimator exposing ``fit``, ``labels_`` and ``cluster_centers_``.

    Returns:
        A list with one entry per centroid: the centroid coordinates followed
        by ``round((count + smallest) * 100 / (largest + smallest), 2)``,
        where ``count`` is the number of points labelled with that cluster and
        ``smallest`` / ``largest`` are the sizes of the smallest / largest
        non-empty clusters.
    """
    # Use a plain ndarray rather than np.mat: the np.mat alias was removed in
    # NumPy 2.0 and recent scikit-learn versions reject np.matrix input.
    # atleast_2d keeps np.mat's handling of a single flat point as one row.
    points = np.atleast_2d(np.asarray(lines))
    km.fit(points)

    # Number of points assigned to each cluster label.
    sizes = Counter(int(label) for label in km.labels_)
    smallest = min(sizes.values(), default=0)
    largest = max(sizes.values(), default=0)
    span = largest + smallest

    centroids = np.asarray(km.cluster_centers_).tolist()
    for index, centroid in enumerate(centroids):
        # Counter returns 0 for a cluster that received no points, so an
        # empty cluster gets the lowest score instead of raising.
        if span:
            score = round((sizes[index] + smallest) * 100.0 / span, 2)
        else:
            score = 0.0
        centroid.append(score)
    return centroids

def k_ms(self, lines, k):
    """Cluster ``lines`` into ``k`` groups using a ``KMeans`` estimator.

    Builds ``KMeans(n_clusters=k)`` and hands it to ``self.kMeans``, whose
    result is returned unchanged.
    """
    estimator = KMeans(n_clusters=k)
    return self.kMeans(lines, estimator)

def Minik_ms(self, lines, k, batch_size=10000, reassignment_ratio=0.005):
    """Cluster ``lines`` into ``k`` groups using a ``MiniBatchKMeans`` estimator.

    Args:
        lines: 2-D sequence of points, passed through to ``self.kMeans``.
        k: Number of clusters (``n_clusters``).
        batch_size: Forwarded to ``MiniBatchKMeans``; defaults to the value
            that was previously hard-coded here.
        reassignment_ratio: Forwarded to ``MiniBatchKMeans``; defaults to the
            value that was previously hard-coded here.

    Returns:
        Whatever ``self.kMeans`` returns for the constructed estimator.
    """
    estimator = MiniBatchKMeans(
        n_clusters=k,
        batch_size=batch_size,
        reassignment_ratio=reassignment_ratio,
    )
    return self.kMeans(lines, estimator)

def cluster(self, lines, k, m):
    """Cluster ``lines`` with the method selected by ``m``.

    Prints ``time.time()`` before and after the work so the two timestamps
    can be compared.

    Args:
        lines: 2-D sequence of points, e.g. ``[[a, b], [c, d]]``.
        k: Number of clusters.
        m: A ``Cluter_ways`` member or its numeric value (1 or 2).

    Returns:
        The result of ``self.k_ms`` or ``self.Minik_ms`` for a recognised
        method; ``lines`` unchanged when ``m`` is not a recognised method.
    """
    print(time.time())

    # Accept both enum members and their raw values: a plain Enum member
    # never compares equal to an int, so m=1 / m=2 would otherwise fall
    # through to the pass-through branch.
    try:
        method = Cluter_ways(m)
    except ValueError:
        method = None

    if method is Cluter_ways.kMeans:
        print('少量数据聚合')
        all_data = self.k_ms(lines, k)
    elif method is Cluter_ways.MiniBatchKMeans:
        print('大量数据聚合')
        all_data = self.Minik_ms(lines, k)
    else:
        all_data = lines

    print(time.time())
    return all_data