import numpy as np
import pandas as pd
from collections import Counter
def euclidean_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Args:
        vec1: Sequence or array of numbers (list, tuple, or ndarray).
        vec2: Sequence or array of numbers, broadcast-compatible with vec1.

    Returns:
        The square root of the sum of squared element-wise differences,
        as a NumPy float.

    Raises:
        ValueError: If the two inputs cannot be broadcast together.
    """
    # Coerce both sides to float arrays so that plain lists/tuples and
    # object-dtype rows subtract element-wise, and so that squaring cannot
    # overflow a small integer dtype.
    diff = np.asarray(vec1, dtype=float) - np.asarray(vec2, dtype=float)
    return np.sqrt(np.sum(np.square(diff)))
# Labelled training set, keyed by movie title. Each value lists three shot
# counts followed by the genre label, in the same order as the column names
# later assigned to the training DataFrame.
train_data = {
    '宝贝当家': [45, 2, 9, '喜剧片'],
    '美人鱼': [21, 17, 5, '喜剧片'],
    '澳门风云3': [54, 9, 11, '喜剧片'],
    '功夫熊猫3': [39, 0, 31, '喜剧片'],
    '谍影重重': [5, 2, 57, '动作片'],
    '叶问3': [3, 2, 65, '动作片'],
    '我的特工爷爷': [6, 4, 21, '动作片'],
    '奔爱': [7, 46, 4, '爱情片'],
    '夜孔雀': [9, 39, 8, '爱情片'],
    '代理情人': [9, 38, 2, '爱情片'],
    '新步步惊心': [8, 34, 17, '爱情片'],
    '伦敦陷落': [2, 3, 55, '动作片'],
}

# Unlabelled sample to classify: title -> the three shot counts (no genre).
test_data = {
    '唐人街探案': [23, 3, 17],
}
# Build the training table: one row per movie, three shot-count features
# followed by the genre label.
train_df = pd.DataFrame(train_data).T
train_df.columns = ['搞笑镜头', '拥抱镜头', '打斗镜头', '电影类型']

K = 5  # number of nearest neighbours that take part in the vote
movie = '唐人街探案'

# Each training row mixes numbers with a string label, so take an explicit
# float copy of the feature columns for arithmetic. train_df itself is left
# untouched so the printed table keeps its original formatting.
train_features = train_df.values[:, :-1].astype(float)

# The query vector does not change between rows; build it once.
test_X = np.array(test_data[movie])

distance_list = [
    euclidean_distance(train_X, test_X) for train_X in train_features
]
distance_df = pd.DataFrame({"欧式距离": distance_list}, index=train_df.index)

# Rank the training movies from nearest to farthest. A stable sort keeps
# movies at exactly the same distance in their training order, so the K
# neighbours picked below are deterministic.
result = pd.concat([train_df, distance_df], axis=1).sort_values(
    by="欧式距离", kind="mergesort"
)
print(result)

# Majority vote over the genres of the K nearest movies; on a tied count the
# genre that appears first among the neighbours (i.e. the nearer one) wins.
d = Counter(result.head(K)['电影类型'])
print(movie, max(d, key=d.get))
最后两行代码体现了K近邻分类的基本思想:先找出和《唐人街探案》距离最近的K部电影,然后在这K部电影的“电影类型”中进行多数表决。代码的运行结果为:唐人街探案 喜剧片。