# 足够详细、足够简单的 Python 版推荐系统入门级—理论篇(下)｜8月更文挑战

### 代码实现

import numpy as np
import pandas as pd

#### 导入数据集

# Column names for the tab-separated MovieLens rating files (they have no header row).
r_cols = ['user_id','movie_id','rating','unix_timestamp']

# Load the training split; ua.base is the counterpart of the ua.test file read further down.
ratings_train = pd.read_csv('data/ml-100k/ua.base',sep='\t',names=r_cols,encoding='latin-1')

ratings_train.head()

user_id movie_id rating unix_timestamp
0 1 1 5 874965758
1 1 2 3 876893171
2 1 3 4 878542960
3 1 4 3 876893119
4 1 5 3 889751712
# Load the test split with the same column names as the training split.
ratings_test = pd.read_csv('data/ml-100k/ua.test',sep='\t',names=r_cols,encoding='latin-1')

ratings_test.shape,ratings_train.shape

((9430, 4), (90570, 4))

ratings_test.head()

user_id movie_id rating unix_timestamp
0 1 20 4 887431883
1 1 33 4 878542699
2 1 61 4 878542420
3 1 117 3 874965739
4 1 155 2 878542201

#### 生成评分矩阵

# Reshape the long (user_id, movie_id, rating) table into a wide table:
# one row per user_id, one column per movie_id, cell = rating.
# User/movie pairs without a rating become 0.
rating_df = ratings_train.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

# The same table as a plain NumPy array (row/column labels are dropped).
data_matrix = rating_df.to_numpy()

data_matrix.shape

(943, 1680)

##### 计算用户到用户、项目到项目的相似性

from sklearn.metrics.pairwise import pairwise_distances
# Pairwise cosine scores between rows (users) and between columns (items).
# NOTE(review): pairwise_distances(metric='cosine') returns the cosine
# *distance* (1 - cosine similarity), so larger values mean LESS similar,
# despite the variable names. Confirm whether `1 - pairwise_distances(...)`
# was intended before reusing these as similarity weights.
user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

##### 实现预测方法

def predict(ratings, similarity, type='user'):
    """Predict a full rating matrix as a similarity-weighted average.

    Args:
        ratings: 2-D array of ratings; each row is treated as one user
            (the per-row mean is used as that user's mean rating).
        similarity: square 2-D array of pairwise weights. It must be
            row-by-row (users) for ``type='user'`` and column-by-column
            (items) for ``type='item'``.
        type: ``'user'`` or ``'item'``. The name shadows the builtin but is
            kept because callers pass it as a keyword argument.

    Returns:
        Array with the same shape as ``ratings`` holding the predictions.

    Raises:
        ValueError: if ``type`` is neither ``'user'`` nor ``'item'``
            (previously this surfaced as an UnboundLocalError).
    """
    if type == 'user':
        # Column vector of per-user means so it broadcasts across items.
        mean_user_rating = ratings.mean(axis=1)[:, np.newaxis]
        # Work on deviations from each user's mean, then add the mean back.
        ratings_diff = ratings - mean_user_rating
        # Normalise each row by the total absolute weight of that row.
        norm = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        return mean_user_rating + similarity.dot(ratings_diff) / norm
    if type == 'item':
        # Row vector of per-item weight totals, broadcast across users.
        norm = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        return ratings.dot(similarity) / norm
    raise ValueError("type must be 'user' or 'item', got %r" % (type,))

# Predicted rating matrices built from the user-user and item-item matrices.
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

user_prediction

array([[ 1.81349209,  0.70700463,  0.61698708, ...,  0.39276591,
0.39226752,  0.39200766],
[ 1.49898583,  0.34098239,  0.18310294, ..., -0.08555358,
-0.08404725, -0.08377072],
[ 1.51740786,  0.29296796,  0.15029285, ..., -0.12609608,
-0.12431009, -0.12410346],
...,
[ 1.36707183,  0.23452991,  0.09185339, ..., -0.17162167,
-0.17056838, -0.17063272],
[ 1.54450965,  0.36817316,  0.25677137, ..., -0.01115452,
-0.01046112, -0.01008175],
[ 1.59370125,  0.45494826,  0.37321426, ...,  0.14561441,
0.14514214,  0.14528961]])

### 矩阵分解(MF)

$x^2 -1 =0 \rightarrow (x-1)(x+1) = 0$

$R = P \Sigma Q^T$
• M 用户的数量
• N 项目(电影)的数量
• K 隐含空间特征数量
• $R_{M \times N}$ 用户-电影的评分矩阵
• $P_{M \times K}$ 用户特征矩阵表示每个用户隐含特征
• $Q_{N \times K}$ 项目特征矩阵表示每个项目隐含特征
• $\Sigma_{K \times K}$ 对角特征权重矩阵，表示每个隐含特征的权重

$r_{ui} = \sum_{k=1}^K p_{uk}\sigma_k q_{ik}$

$R = P\Sigma Q^T\\ RQ = P\Sigma Q^TQ\\ Q^TQ = I\\ RQ=P\Sigma\\ RQ\Sigma^{-1}=P$
• 在等号两边都右乘矩阵 $Q$
• 因为 $Q$ 矩阵是正交矩阵，得到 $Q^TQ = I$，所以有 $RQ = P\Sigma$
• 从而得出 $P = RQ\Sigma^{-1}$

$e_{ui}^2 = (r_{ui} - \hat{r}_{ui})^2 = (r_{ui} - \sum_{k=1}^K p_{uk}\sigma_k q_{ki})^2$
• $e_{ui}$ 是误差值，u 表示用户下标而 i 表示物品的下标
• $r_{ui}$ 是实际 u 用户对 i 项目的评分
• $\hat{r}_{ui}$ 是通过矩阵分解对 u 用户对 i 商品的预测值
$\frac{\partial (e_{ui}^2)}{\partial p_{uk}} = -2(r_{ui} - \hat{r}_{ui})q_{ki} = -2e_{ui}q_{ki}\\ \frac{\partial (e_{ui}^2)}{\partial q_{ki}} = -2(r_{ui} - \hat{r}_{ui})p_{uk} = -2e_{ui}p_{uk}\\$

$p^{\prime}_{uk} = p_{uk} - \alpha\frac{\partial (e_{ui}^2)}{\partial p_{uk}} = p_{uk} + 2\alpha e_{ui}q_{ki}\\ q^{\prime}_{ki} = q_{ki} - \alpha\frac{\partial (e_{ui}^2)}{\partial q_{ki}} = q_{ki} + 2\alpha e_{ui}p_{uk}\\$

class MF():
    """Matrix factorization with user/item biases, trained by SGD.

    A rating is modelled as ``b + b_u[i] + b_i[j] + P[i] . Q[j]``, where
    ``b`` is the global mean of the observed ratings. Zero entries of ``R``
    are treated as "not rated" and are skipped during training and
    error computation.
    """

    def __init__(self, R, K, alpha, beta, iterations):
        """Store the rating matrix and the hyper-parameters.

        Args:
            R: 2-D user-by-item rating array; 0 means "no rating".
            K: number of latent features.
            alpha: step size applied to every SGD update.
            beta: regularization weight that shrinks parameters in each update.
            iterations: number of passes over the observed ratings.
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        """Initialise the parameters and run SGD for ``iterations`` epochs.

        Returns:
            List of ``(epoch_index, error)`` tuples, one per epoch, where
            ``error`` is the value returned by :meth:`mse`.
        """
        # Small random latent factors for users (P) and items (Q).
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Bias terms: per-user, per-item, and the global mean of observed ratings.
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        observed = self.R[np.where(self.R != 0)]
        # Avoid a NaN global bias (and a NumPy warning) when nothing is rated.
        self.b = np.mean(observed) if observed.size else 0.0

        # Training samples: (user index, item index, rating) for every rated cell.
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]

        # One SGD pass per iteration over freshly shuffled samples.
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse = self.mse()
            training_process.append((i, mse))
            if (i+1) % 20 == 0:
                print("Iteration: %d ; error = %.4f" % (i+1, mse))

        return training_process

    def mse(self):
        """Return the training error over the non-zero entries of ``R``.

        Despite the name, this is the square root of the *summed* squared
        error (not divided by the number of ratings); the value is kept
        as-is so reported training curves do not change.
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        residual = self.R[xs, ys] - predicted[xs, ys]
        return np.sqrt(np.sum(residual ** 2))

    def sgd(self):
        """Run one stochastic-gradient pass over ``self.samples``."""
        for i, j, r in self.samples:
            prediction = self.get_rating(i, j)
            e = (r - prediction)

            # Regularized bias updates.
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Snapshot the user factors so the item update uses the same
            # P[i] the error was computed with, not the freshly updated one.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i, :])
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

    def get_rating(self, i, j):
        """Return the predicted rating of user ``i`` for item ``j``."""
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction

    def full_matrix(self):
        """Return the full user-by-item matrix of predicted ratings."""
        # Use this instance's parameters (not a module-level `mf` object);
        # biases are broadcast as a column (users) and a row (items).
        return (
            self.b
            + self.b_u[:, np.newaxis]
            + self.b_i[np.newaxis, :]
            + self.P.dot(self.Q.T)
        )

### 评估推荐系统引擎

| 真实标签 \ 预测值 | T | F |
| --- | --- | --- |
| T | TP | FN |
| F | FP | TN |
• TP(True Positive) 真正类 测试集真实标签为 T，预测值也为 T 的总数
• FN(False Negative) 漏报 测试集真实标签为 T，预测值却为 F 的总数
• FP(False Positive) 误报 测试集真实标签为 F，预测值却为 T 的总数
• TN(True Negative) 真负类 测试集真实标签为 F，预测值也为 F 的总数

#### 准确率

$N_{pred} = TP + TN$ $N_{total} = TP + TN + FP + FN$

$Accuracy = \frac{N_{pred}}{N_{total}} = \frac{TP + TN}{TP + TN + FP + FN}$

#### 精准度(Precision)

$Precision = \frac{TP}{TP + FP}$

#### 召回率(recall)

$Recall = \frac{TP}{TP+FN}$

假设有 20 个任务，其中 10 个被按时完成，10 个逾期完成

• 第 1 种预测情况

• 第 2 种预测情况

$Recall = \frac{tp}{tp + fn}$