The Python recommendation library Surprise
To build our recommender we will use the Python library Surprise (Simple Python RecommendatIon System Engine), one of the scikit family of packages (many of you will have used scikit-learn, scikit-image, and the like).
It is simple to use and supports a range of recommendation algorithms:
- baseline algorithms
- neighborhood methods (collaborative filtering)
- matrix factorization-based methods (SVD, PMF, SVD++, NMF)
| Algorithm class | Description |
|---|---|
| random_pred.NormalPredictor | Algorithm predicting a random rating based on the distribution of the training set, which is assumed to be normal. |
| baseline_only.BaselineOnly | Algorithm predicting the baseline estimate for given user and item. |
| knns.KNNBasic | A basic collaborative filtering algorithm. |
| knns.KNNWithMeans | A basic collaborative filtering algorithm, taking into account the mean ratings of each user. |
| knns.KNNBaseline | A basic collaborative filtering algorithm taking into account a baseline rating. |
| matrix_factorization.SVD | The famous SVD algorithm, as popularized by Simon Funk during the Netflix Prize. |
| matrix_factorization.SVDpp | The SVD++ algorithm, an extension of SVD taking into account implicit ratings. |
| matrix_factorization.NMF | A collaborative filtering algorithm based on Non-negative Matrix Factorization. |
| slope_one.SlopeOne | A simple yet accurate collaborative filtering algorithm. |
| co_clustering.CoClustering | A collaborative filtering algorithm based on co-clustering. |
The neighborhood-based methods (collaborative filtering) can be configured with different similarity metrics.
| Similarity metric | Description |
|---|---|
| cosine | Compute the cosine similarity between all pairs of users (or items). |
| msd | Compute the Mean Squared Difference similarity between all pairs of users (or items). |
| pearson | Compute the Pearson correlation coefficient between all pairs of users (or items). |
| pearson_baseline | Compute the (shrunk) Pearson correlation coefficient between all pairs of users (or items) using baselines for centering instead of means. |
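The metric is selected through the sim_options dict passed to any of the KNN-style algorithms; a minimal sketch that builds an item-item model with cosine similarity:

from surprise import KNNBasic
# 'name' picks the similarity from the table above;
# 'user_based' switches between user-user and item-item similarities
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)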
Different evaluation metrics are also supported:
| Evaluation metric | Description |
|---|---|
| rmse | Compute RMSE (Root Mean Squared Error). |
| mae | Compute MAE (Mean Absolute Error). |
| fcp | Compute FCP (Fraction of Concordant Pairs). |
Usage example
Basic usage is as follows:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')
# Use the famous SVD algorithm.
algo = SVD()
# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
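Besides cross_validate, you can split the data yourself and score the resulting predictions with the accuracy module, which implements the metrics from the table above. A minimal sketch:

from surprise import SVD, Dataset, accuracy
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
# Hold out 25% of the ratings as a test set
trainset, testset = train_test_split(data, test_size=0.25)
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)
# Score the held-out predictions with each metric
accuracy.rmse(predictions)
accuracy.mae(predictions)
accuracy.fcp(predictions)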
Loading your own dataset
import os
from surprise import Reader, Dataset
from surprise.model_selection import KFold

# Path to the rating file
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
# Tell the reader what format each line of the file has
reader = Reader(line_format='user item rating timestamp', sep='\t')
# Load the data
data = Dataset.load_from_file(file_path, reader=reader)
# Split into 5 folds for cross-validation
# (Dataset.split was removed in recent Surprise versions; use model_selection.KFold)
kf = KFold(n_splits=5)
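The folds can then be iterated over explicitly; a sketch assuming the data and kf objects defined above:

from surprise import SVD, accuracy

algo = SVD()
for trainset, testset in kf.split(data):
    # Fit on the training fold and score on the held-out fold
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)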
Tuning the algorithm (for a better-performing recommender)
Under the hood these algorithms are trained with SGD and similar optimizers, so there are hyperparameters that affect the final result. Just as in sklearn, we can use grid search with cross-validation (GridSearchCV) to pick the best parameters. A simple example:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV
# Define the grid of parameters to search over
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
# Grid search with cross-validation
# (the old GridSearch + evaluate API was replaced by GridSearchCV in newer Surprise)
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'fcp'], cv=3)
# Find the best parameters on the dataset
data = Dataset.load_builtin('ml-100k')
grid_search.fit(data)
# Best RMSE score
print(grid_search.best_score['rmse'])
# >>> 0.96117566386
# Parameters giving the best RMSE
print(grid_search.best_params['rmse'])
# >>> {'reg_all': 0.4, 'lr_all': 0.005, 'n_epochs': 10}
# Best FCP score
print(grid_search.best_score['fcp'])
# >>> 0.702279736531
# Parameters giving the best FCP
print(grid_search.best_params['fcp'])
# >>> {'reg_all': 0.6, 'lr_all': 0.005, 'n_epochs': 10}
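GridSearchCV also keeps the best estimator per measure, so the tuned model can be refit on the full dataset directly:

# Take the algorithm instance that scored best on RMSE and refit it
algo = grid_search.best_estimator['rmse']
algo.fit(data.build_full_trainset())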
Training models on our own dataset
First, load the data
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple surprise
# Preview the first lines of the data file
!head /jhub/students/data/course13/project_music_recommendation/popular_music_suprise_format.txt
import os
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, KFold, cross_validate

BASEPATH = '/jhub/students/data/course13/project_music_recommendation/'
# Path to the data file
file_path = os.path.expanduser(BASEPATH + 'popular_music_suprise_format.txt')
# Specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# To cross-validate manually, split into 5 folds with:
# kf = KFold(n_splits=5)
# Inspect the loaded dataset object
music_data
# Peek at the first 5 raw ratings: (user, item, rating, timestamp)
music_data.raw_ratings[:5]
Modeling with different recommendation algorithms and comparing them
### Using NormalPredictor
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
algo = NormalPredictor()
# trainset, testset = train_test_split(music_data, test_size=0.001)
# algo.fit(trainset)
# predictions = algo.test(testset)
# accuracy.rmse(predictions)
# algo.fit(music_data)
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
### Using BaselineOnly
from surprise import BaselineOnly
algo = BaselineOnly()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5)
print(perf)
### Using basic collaborative filtering (KNNBasic)
from surprise import KNNBasic
algo = KNNBasic()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5)
### Using collaborative filtering with user means (KNNWithMeans)
from surprise import KNNWithMeans
algo = KNNWithMeans()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5)
### Using collaborative filtering with baseline ratings (KNNBaseline)
from surprise import KNNBaseline
algo = KNNBaseline()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5)
### Using SVD
from surprise import SVD
algo = SVD()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5)
### Using SVD++
from surprise import SVDpp
algo = SVDpp()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5)
### Using NMF
from surprise import NMF
algo = NMF()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'], cv=5)
print(perf)
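To put the numbers side by side, the runs above can be folded into one loop; a minimal sketch that prints each model's mean test RMSE (cv=3 to keep it quick; SVDpp is left out because it is much slower to train):

from surprise import (NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                      KNNBaseline, SVD, NMF)
from surprise.model_selection import cross_validate

for algo_class in [NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans,
                   KNNBaseline, SVD, NMF]:
    perf = cross_validate(algo_class(), music_data, measures=['RMSE'], cv=3)
    # cross_validate returns per-fold scores under keys such as 'test_rmse'
    print(algo_class.__name__, perf['test_rmse'].mean())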
Building and storing models
Building a model with collaborative filtering and making predictions
The MovieLens example
# Any of the algorithms discussed above can be plugged in here
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
# Load the built-in MovieLens dataset
data = Dataset.load_builtin('ml-100k')
# Try SVD matrix factorization
algo = SVD()
# Test its performance with 3-fold cross-validation
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3)
# Print the results
print(perf)
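As for the "storing" part of this section's title: Surprise ships a dump module (a thin pickle wrapper) for saving and reloading a trained algorithm. A minimal sketch, with './svd_model.dump' as a hypothetical file path:

from surprise import dump
# Save the trained algorithm to disk (the path here is just an example)
dump.dump('./svd_model.dump', algo=algo)
# dump.load returns a (predictions, algo) pair
_, loaded_algo = dump.load('./svd_model.dump')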
The example below juggles three ways of referring to a movie:
- the movie name
- the raw movie id (the id used in the raw data file)
- the inner id (inner_id) that Surprise assigns internally
"""
以下的程序段告诉大家如何在协同过滤算法建模以后,根据一个item取回相似度最高的item,主要是用到algo.get_neighbors()这个函数
"""
from __future__ import (absolute_import, division, print_function,
unicode_literals)
import os
import io
from surprise import KNNBaseline
from surprise import Dataset
def read_item_names():
    """
    Build the raw-movie-id -> movie-name and movie-name -> raw-movie-id
    mappings from the u.item file of ml-100k.
    """
    file_name = (os.path.expanduser('~') +
                 '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            # Field 0 is the raw movie id, field 1 is the movie title
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid
# First, fit an algorithm that computes the pairwise item similarities
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)
# Get the raw-id -> name and name -> raw-id mappings
rid_to_name, name_to_rid = read_item_names()
# Retrieve the inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
for rid in toy_story_neighbors)
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
print(movie)
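The same fitted model can also score a single user/movie pair; a quick sketch, where '196' is just one raw user id from ml-100k:

# predict() takes raw (string) ids and returns a Prediction object
pred = algo.predict('196', toy_story_raw_id)
print(pred.est)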
The music recommendation example
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io
from surprise import Reader, Dataset
from surprise import KNNBaseline
import _pickle as pickle
# Rebuild the playlist-id -> playlist-name mapping
id_name_dic = pickle.load(open(BASEPATH + "popular_playlist.pkl", "rb"))
print("Loaded the playlist-id -> playlist-name mapping...")
# Rebuild the playlist-name -> playlist-id mapping
name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("Loaded the playlist-name -> playlist-id mapping...")
file_path = os.path.expanduser(BASEPATH+'popular_music_suprise_format.txt')
# Specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# Compute song-to-song similarities
print("Building the trainset...")
trainset = music_data.build_full_trainset()
#sim_options = {'name': 'pearson_baseline', 'user_based': False}
# Inspect the trainset: number of songs, number of playlists, and the inner item ids
trainset.n_items
trainset.n_users
trainset.all_items()
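all_items() yields Surprise's inner ids; to see which songs they stand for, a few can be mapped back to raw ids:

# Convert the first few inner item ids back to raw song ids
[trainset.to_raw_iid(inner_id) for inner_id in list(trainset.all_items())[:5]]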
Three ways of referring to a playlist appear below:
- current_playlist => the playlist name
- playlist_id => the playlist id (the id assigned by NetEase)
- playlist_inner_id => the inner id (Surprise re-indexes all playlist ids as consecutive integers starting from 0)
import _pickle as pickle
# Load the full playlist dict and pick one playlist id as an example
playlist_dic = pickle.load(open(BASEPATH + "playlist.pkl", "rb"))
playlist_id = list(playlist_dic.keys())[1542]
print(playlist_id)
print("开始训练模型...")
#sim_options = {'user_based': False}
#algo = KNNBaseline(sim_options=sim_options)
algo = KNNBaseline()
algo.fit(trainset)
# print(name_id_dic)
# print(len(list(name_id_dic.keys())))
current_playlist = list(name_id_dic.keys())[1]
# 取出近邻
playlist_id = name_id_dic[current_playlist]
print(playlist_id)
print(playlist_id)
playlist_inner_id = algo.trainset.to_inner_uid("b'"+playlist_id)
print(playlist_inner_id)
# Retrieve the 10 nearest neighbors of the playlist
playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)
# Convert the neighbors' inner ids back to raw playlist ids, then to names
playlist_neighbors = (algo.trainset.to_raw_uid(inner_id)
                      for inner_id in playlist_neighbors)
# Strip the bytes-repr wrapper (b'...') from each raw id before the dict lookup
playlist_neighbors = (id_name_dic[playlist_id.split("'")[1]]
                      for playlist_id in playlist_neighbors)
print("The 10 playlists closest to playlist", current_playlist, "are:\n")
for playlist in playlist_neighbors:
    print(playlist)
# Note: this step can be memory-intensive
Making predictions with SVD matrix factorization
### Using SVD++
from surprise import SVDpp
from surprise import Reader, Dataset
import os

file_path = os.path.expanduser(BASEPATH + 'popular_music_suprise_format.txt')
# Specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# Build the full trainset and fit the model
algo = SVDpp()
trainset = music_data.build_full_trainset()
algo.fit(trainset)
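With the model fit on the full trainset, individual (playlist, song) scores can be read off with predict(); a minimal sketch that reuses ids from the loaded data rather than hard-coding any:

# Take a raw playlist id and a raw song id straight from the loaded ratings
uid, iid = music_data.raw_ratings[0][0], music_data.raw_ratings[0][1]
# predict() works on raw ids and returns a Prediction with the estimate in .est
pred = algo.predict(uid, iid)
print(pred.est)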