Latent Factor Model
最近要入坑推荐系统的开发,特意了解一些推荐算法,并实现个demo,做下记录。
是什么?
隐语义模型(Latent Factor Model,简称 LFM),属于协同过滤领域。
item CF: 根据目标用户喜欢的物品,寻找和这些物品相似的物品
user CF: 计算和目标用户兴趣相似的用户,根据计算出来的用户喜欢的物品给目标用户推荐物品
LFM: 对所有的物品进行分类,再根据用户的兴趣分类给用户推荐该分类中的物品
怎么办?
数据集下载:https://files.grouplens.org/datasets/movielens/
此处下载 ml-1m.zip,约 100 万行评分数据
读取ratings.dat文件
import pandas as pd
# ratings.dat separates its fields with the multi-character token '::'.
# Separators longer than one character are only supported by the Python
# parsing engine, so it is requested explicitly instead of relying on pandas'
# warning-emitting fallback from the C engine.
df = pd.read_csv(
    'ratings.dat',
    sep='::',
    header=None,
    names=['uid', 'mid', 'rating', 'timestamp'],
    engine='python',
)
正负样本均衡、负采样
# Per-user candidate lists: pos_dict holds (mid, 1) for ratings >= 4,
# neg_dict holds (mid, mean rating of that movie) for everything else.
pos_dict, neg_dict, train_data = {}, {}, []

# Mean rating of every movie, computed once up front. Filtering the whole
# DataFrame again for each low rating would make this loop quadratic.
avg_rating = df.groupby('mid')['rating'].mean().to_dict()

for uid, mid, rating in zip(df['uid'], df['mid'], df['rating']):
    # Every user gets an entry in both dicts, even if one list stays empty.
    pos_list = pos_dict.setdefault(uid, [])
    neg_list = neg_dict.setdefault(uid, [])
    if rating >= 4:
        pos_list.append((mid, 1))  # positive sample
    else:
        # Negative candidate, tagged with the movie's mean rating so that the
        # best-rated movies can be selected first when sampling below.
        neg_list.append((mid, round(avg_rating[mid], 3)))

# Balance positive and negative samples per user (negative sampling).
for uid in pos_dict:
    data_num = min(len(pos_dict[uid]), len(neg_dict.get(uid, [])))
    if data_num > 0:
        # Each append adds ONE list of (uid, mid, label) triples, so
        # train_data is a list of per-user groups rather than a flat list.
        train_data.append([(uid, x[0], x[1]) for x in pos_dict[uid]][:data_num])
        # Keep the data_num negatives whose movies have the highest mean rating.
        sorted_neg_list = sorted(neg_dict[uid], key=lambda d: d[1], reverse=True)[:data_num]
        train_data.append([(uid, x[0], 0) for x in sorted_neg_list])
模型
import numpy as np
import operator
def give_recom_result(user_vec, item_vec, uid, fix_num=10):
    """Rank all items for one user by cosine similarity of latent vectors.

    Args:
        user_vec: dict mapping user id -> latent vector.
        item_vec: dict mapping item id -> latent vector.
        uid: id of the user to recommend for.
        fix_num: maximum number of recommendations to return (default 10).
    Returns:
        list of (item_id, score) tuples sorted by descending score, with the
        score rounded to 3 decimals; an empty list when uid has no vector.
    """
    if uid not in user_vec:
        return []
    user_vector = user_vec[uid]
    # The user's norm is the same for every item, so compute it only once.
    user_norm = np.linalg.norm(user_vector)
    record = {
        item_id: np.dot(user_vector, item_vector)
        / (user_norm * np.linalg.norm(item_vector))
        for item_id, item_vector in item_vec.items()
    }
    ranked = sorted(record.items(), key=operator.itemgetter(1), reverse=True)
    return [(item_id, round(score, 3)) for item_id, score in ranked[:fix_num]]
def model_predict(user_vector, item_vector):
    """Return the cosine similarity between a user and an item vector.

    Args:
        user_vector: latent vector of the user.
        item_vector: latent vector of the item.
    Returns:
        dot(user, item) / (|user| * |item|), or 0.0 when either vector has
        zero length (the cosine is undefined there and would otherwise be nan).
    """
    norm_product = np.linalg.norm(user_vector) * np.linalg.norm(item_vector)
    if norm_product == 0:
        # Avoid 0/0 -> nan, which would spread into every later update.
        return 0.0
    return np.dot(user_vector, item_vector) / norm_product
def init_model(vector_len):
    """Create a new latent vector of length vector_len.

    The entries are drawn from the standard normal distribution using
    NumPy's global random state.
    """
    shape = (vector_len,)
    return np.random.standard_normal(size=shape)
def lfm_train(train_data, F, alpha, beta, step):
    """Train the latent factor model with stochastic gradient descent.

    Args:
        train_data: iterable of (uid, item_id, label) samples
        F: user vector len , item vector len
        alpha: regularization factor
        beta: learning rate (multiplied by 0.9 after every pass over the data)
        step: iteration num
    Returns:
        tuple (user_vec, item_vec):
            user_vec: dict, key userid, value latent vector
            item_vec: dict, key itemid, value latent vector
    """
    user_vec = {}
    item_vec = {}
    for _ in range(step):
        for uid, item_id, label in train_data:
            # Vectors are created lazily the first time an id is seen.
            if uid not in user_vec:
                user_vec[uid] = init_model(F)
            if item_id not in item_vec:
                # Must be keyed by item_id; keying it by uid left the item
                # without a vector and broke the lookup just below.
                item_vec[item_id] = init_model(F)
            user_vector = user_vec[uid]
            item_vector = item_vec[item_id]
            delta = label - model_predict(user_vector, item_vector)
            # In-place updates over all F components at once. As in the
            # per-component loop, the item update sees the already-updated
            # user vector.
            user_vector += beta * (delta * item_vector - alpha * user_vector)
            item_vector += beta * (delta * user_vector - alpha * item_vector)
        # Decay the learning rate after each full pass.
        beta = beta * 0.9
    return user_vec, item_vec
进行训练
# Training
# lfm_train unpacks every sample as a single (uid, mid, label) triple, while
# train_data is a list of lists of such triples, so flatten it first.
train_data_1 = [sample for group in train_data for sample in group]
u_v, i_v = lfm_train(train_data_1, 50, 0.01, 0.1, 50)
# Test: recommendations for the user with uid == 2. The result is kept in
# recom_list because the genre analysis further down iterates over it.
recom_list = give_recom_result(u_v, i_v, 2)
recom_list
发现了推荐如下(mid,score)的电影↓
[(3433, 0.498), (1921, 0.487), (2332, 0.486), (5241, 0.485), (1117, 0.478), (1499, 0.465), (3368, 0.463), (5655, 0.461), (686, 0.449), (903, 0.447)]
看看uid==2的这个用户的情况
from collections import Counter
# Load the movie metadata.
# '::' is a multi-character separator, so the Python parsing engine is needed.
# NOTE(review): the MovieLens 1M movies.dat is, as far as known, Latin-1
# encoded (some titles are not valid UTF-8) — confirm against the file.
df_movie = pd.read_csv(
    'movies.dat',
    sep='::',
    header=None,
    names=['movie_id', 'movie_title', 'movie_type'],
    engine='python',
    encoding='ISO-8859-1',
)
# movie_id -> '|'-separated genre string, built once for O(1) lookups.
movie_type_by_id = dict(zip(df_movie['movie_id'], df_movie['movie_type']))

# NOTE(review): uid_2_mid_list is presumably the ids of the movies that the
# user with uid == 2 rated/liked — confirm it is assigned before this runs.
movie_type_list = []
for mid in uid_2_mid_list:
    movie_type = movie_type_by_id.get(mid)
    if movie_type is None:
        # Movie id missing from movies.dat: nothing to count.
        continue
    # split('|') also covers single-genre movies (returns a one-element list).
    movie_type_list.extend(movie_type.split('|'))
c = Counter(movie_type_list)
# Genres ordered from most to least frequent. Sorting the Counter's keys with
# key=lambda x: x[1] would order them by the second letter of the genre name.
[genre for genre, _ in c.most_common()]
用户喜欢的结果如下↓
['Mystery', 'Drama', 'Crime', 'Romance', 'Comedy', 'Horror', 'Film-Noir', 'Thriller', 'Western', 'Adventure', 'Action', 'Sci-Fi', 'War', 'Fantasy']
Mystery、Drama、Crime、Romance 是他比较喜欢的类型。(注意:上面这份列表的顺序恰好是按类型名称的第二个字母降序排列的,并不反映各类型出现的次数,这一结论需要用 Counter 的计数重新核对。)
再看看推荐的列表:
# recom_list: presumably the (mid, score) pairs returned by
# give_recom_result for uid == 2 — confirm it is assigned before this runs.
# movie_id -> '|'-separated genre string, built once for O(1) lookups.
movie_type_by_id = dict(zip(df_movie['movie_id'], df_movie['movie_type']))

recom_type_list = []
for r in recom_list:
    movie_type = movie_type_by_id.get(r[0])
    if movie_type is None:
        # Recommended id missing from movies.dat: skip it. An explicit None
        # check replaces taking the truth value of a NumPy array.
        continue
    # split('|') also covers single-genre movies (returns a one-element list).
    recom_type_list.extend(movie_type.split('|'))
# Genres ordered from most to least frequent among the recommendations.
# Sorting with key=lambda x: x[1] would order them by the second letter of
# the genre name rather than by count.
[genre for genre, _ in Counter(recom_type_list).most_common()]
推荐给用户的结果如下↓
['Mystery', 'Drama', 'Crime', 'Romance', 'Thriller', 'Western', 'Adventure', 'Action', 'Sci-Fi']
可以看到,模型推荐的电影中也包含 Mystery、Drama、Crime、Romance 类型。(同样,这份列表的顺序是按类型名称的第二个字母降序排列的,并不代表这些类型出现得最多,需要结合各类型的计数来判断推荐效果。)