前言
本文从两个方面讲解KNN算法的实战:
<1>完全自己编写代码
<2>通过sklearn这个扩展包实现
<1>完全自己编写的代码
import numpy as np
import operator
def createDataSet():
    """Build the small hard-coded training set used by the demo.

    Returns:
        A ``(dataset, labels)`` tuple: ``dataset`` is a 4x2 NumPy array with
        one sample per row, and ``labels`` is a list of four class names where
        ``labels[i]`` is the class of ``dataset[i]``.
    """
    dataset = np.array([
        [1, 101],
        [5, 89],
        [108, 5],
        [115, 8],
    ])
    # '爱情片' = romance film, '动作片' = action film.
    labels = ['爱情片', '爱情片', '动作片', '动作片']
    return dataset, labels
def classify(input_data, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``input_data`` and every row
    of ``dataset``.

    Args:
        input_data: Feature vector of the sample to classify; it must have
            as many entries as ``dataset`` has columns.
        dataset: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of class labels, ``labels[i]`` being the class of
            ``dataset[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that received the most votes. On a tie, the label that was
        encountered first while walking the neighbours nearest-first wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    dataset_size = dataset.shape[0]
    if not 1 <= k <= dataset_size:
        raise ValueError(
            'k must be between 1 and %d, got %r' % (dataset_size, k)
        )

    # Broadcasting subtracts every training row from the sample in one step,
    # so the sample does not have to be replicated to the dataset's height.
    diff = np.asarray(input_data) - dataset
    distances = np.sqrt((diff ** 2).sum(axis=1))

    # Indices of the training samples, ordered from nearest to farthest.
    nearest = distances.argsort()

    # Tally one vote per neighbour; every tally starts from zero.
    votes = {}
    for index in nearest[:k]:
        label = labels[index]
        votes[label] = votes.get(label, 0) + 1

    # max() returns the first label holding the highest count, and the dict
    # keeps insertion order, so ties go to the label seen nearest-first.
    return max(votes, key=votes.get)
if __name__ == '__main__':
    # Demo: classify one unseen sample against the built-in training set.
    dataset, labels = createDataSet()
    test = [101, 20]
    # The classifier defined in this file is `classify`; the name
    # `classify0` does not exist here and would raise NameError.
    test_class = classify(test, dataset, labels, 3)
    print(test_class)
<2>使用sklearn扩展包
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
# Seed NumPy's global RNG so the shuffled split below is reproducible.
np.random.seed(0)

# Load the iris data: feature matrix and matching class targets.
iris = datasets.load_iris()
iris_x, iris_y = iris.data, iris.target

# Shuffle the sample indices, then hold out the last 10 shuffled samples
# as the test set and train on everything before them.
indices = np.random.permutation(len(iris_x))
train_idx, test_idx = indices[:-10], indices[-10:]
iris_x_train, iris_y_train = iris_x[train_idx], iris_y[train_idx]
iris_x_test, iris_y_test = iris_x[test_idx], iris_y[test_idx]
n_neighbors=5
int型参数,
指定KNN算法中有多少个最近邻样本具有投票权,默认值为5
weights='uniform'
str型参数,即每个拥有投票权的样本按什么比重投票,
'uniform'表示等比重投票,
'distance'表示按距离的反比投票,即距离越小,权重越大,话语权也越大。该选项适用于数据分布不均衡的情况
默认值为'uniform'
# Build the classifier with its default hyper-parameters (none are passed).
knn = KNeighborsClassifier()
knn.fit(iris_x_train, iris_y_train)

# Predicted class and per-class probability for every test sample.
iris_y_predict = knn.predict(iris_x_test)
iris_y_predict_proba = knn.predict_proba(iris_x_test)

# kneighbors() expects a 2-D array of query samples, so slice with [-1:]
# to keep the last test sample as a single-row matrix instead of a 1-D
# vector. return_distance=False yields only the neighbour indices.
neighborpoint = knn.kneighbors(
    iris_x_test[-1:], n_neighbors=5, return_distance=False
)

# Mean accuracy on the held-out test samples.
score = knn.score(iris_x_test, iris_y_test, sample_weight=None)

print('iris_y_predict = ')
print(iris_y_predict)
print('iris_y_test = ')
print(iris_y_test)
print('Accuracy:', score)
print('neighborpoint of last test sample:', neighborpoint)
print('probility:', iris_y_predict_proba)
iris_y_predict =
[1 2 1 0 0 0 2 1 2 0]
iris_y_test =
[1 1 1 0 0 0 2 1 2 0]
Accuracy: 0.9
neighborpoint of last test sample: [[ 75 41 96 78 123]]
probility: [[ 0. 1. 0. ]
[ 0. 0.4 0.6]
[ 0. 1. 0. ]
[ 1. 0. 0. ]
[ 1. 0. 0. ]
[ 1. 0. 0. ]
[ 0. 0. 1. ]
[ 0. 1. 0. ]
[ 0. 0. 1. ]
[ 1. 0. 0. ]]