基于KNN的电影题材分类

阅读量 162 · 阅读时长约 1 分钟
import numpy as np
import pandas as pd
from collections import Counter


def euclidean_distance(vec1, vec2):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Both inputs are converted to float arrays before subtracting, so plain
    lists/tuples and object-dtype rows are accepted, and integer inputs
    cannot wrap around during the subtraction.

    Args:
        vec1: Array-like of numbers.
        vec2: Array-like of numbers, compatible in shape with ``vec1``.

    Returns:
        The distance as a NumPy float.

    Raises:
        ValueError: If an element cannot be converted to float, or the
            shapes cannot be combined.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    return np.sqrt(np.sum(np.square(a - b)))


# Training samples: movie title -> [funny shots, hugging shots, fight shots, genre].
train_data = {
    '宝贝当家': [45, 2, 9, '喜剧片'],
    '美人鱼': [21, 17, 5, '喜剧片'],
    '澳门风云3': [54, 9, 11, '喜剧片'],
    '功夫熊猫3': [39, 0, 31, '喜剧片'],
    '谍影重重': [5, 2, 57, '动作片'],
    '叶问3': [3, 2, 65, '动作片'],
    '我的特工爷爷': [6, 4, 21, '动作片'],
    '奔爱': [7, 46, 4, '爱情片'],
    '夜孔雀': [9, 39, 8, '爱情片'],
    '代理情人': [9, 38, 2, '爱情片'],
    '新步步惊心': [8, 34, 17, '爱情片'],
    '伦敦陷落': [2, 3, 55, '动作片'],
}

# Movie to classify: title -> the same three shot counts, without a genre.
test_data = {'唐人街探案': [23, 3, 17]}

# Label the four list positions up front, then transpose so that every movie
# becomes one row and the three features plus the genre become the columns.
train_df = pd.DataFrame(
    train_data,
    index=['搞笑镜头', '拥抱镜头', '打斗镜头', '电影类型'],
).T

# Number of nearest neighbours that take part in the vote.
K = 5
movie = '唐人街探案'

# Feature vector of the movie being classified; built once for all rows.
test_X = np.array(test_data[movie])

# The last column holds the genre label, so it is left out of the distance.
distance_list = [
    euclidean_distance(train_X, test_X)
    for train_X in train_df.values[:, :-1]
]

# Attach the distances to the training table and order it nearest-first.
distance_df = pd.DataFrame({"欧式距离": distance_list}, index=train_df.index)
with_distance = pd.concat([train_df, distance_df], axis=1)
result = with_distance.sort_values(by="欧式距离")
print(result)

# Majority vote over the genres of the K closest movies.
nearest_genres = result.head(K)['电影类型']
d = Counter(nearest_genres)
predicted = max(d, key=d.get)
print(movie, predicted)

最后两行代码体现了K近邻分类的基本思想：先找出与《唐人街探案》距离最近的 K 个电影，然后在这 K 个电影的“电影类型”中进行多数表决。代码的运行结果为：唐人街探案 喜剧片