推荐算法实践:ItemCF计算相似度与召回率

101 阅读2分钟

1. 数据加载与预处理

  • train_user_items.json:存储用户与物品的交互记录,格式为{user_id: [(item_id, rating, timestamp), ...]}
  • user_x_N:先统计每个用户的交互次数N,再将其转换为权重(1 / log(1 + N)),用于降低活跃用户(交互次数多的用户)对物品相似度的影响。
  • item_x_users:统计每个物品被哪些用户交互过,格式为{item_id: {user_id1, user_id2, ...}}
import math
import multiprocessing
import json
from collections import defaultdict
# item id -> set of user ids that interacted with the item.
item_x_users = defaultdict(set)
# user id -> set of item ids the user interacted with (read by swing_simscore).
user_x_items = defaultdict(set)
# user id -> interaction count; overwritten below with the user's weight.
user_x_N = defaultdict(int)

with open('train_user_items.json','r') as f:
    train_user_items = json.load(f)

# 1. Build the item -> users and user -> items indexes and count the
#    interactions of every user.
for userid, items in train_user_items.items():
    user_x_N[userid] = len(items)
    for movieid, rating, timestamp in items:
        item_x_users[movieid].add(userid)
        user_x_items[userid].add(movieid)

# Turn each count N into the weight 1 / log(1 + N), so users with many
# interactions contribute less to an item-item similarity.
for userid, N in user_x_N.items():
    # log(1 + 0) == 0 would raise ZeroDivisionError. A user without any
    # interaction never appears in item_x_users, so the weight is unused.
    user_x_N[userid] = 1 / math.log(1 + N) if N > 0 else 0.0

# Fixed ordering of the items; the similarity workers address items by index.
all_items = list(item_x_users.keys())

2. 物品相似度计算

  • 相似度公式
$$\text{sim}(i,j) = \frac{\sum_{u \in U_{i,j}} \frac{1}{\log(1 + N_u)}}{\sqrt{|U_i| \cdot |U_j|}}$$
  • $U_{i,j}$:同时交互过物品 $i$ 和 $j$ 的用户集合。
  • $N_u$:用户 $u$ 的交互次数。
  • $|U_i|$、$|U_j|$:分别表示物品 $i$ 和 $j$ 的用户数量。

使用multiprocessing.Pool对每个物品与其他物品的相似度进行并行计算。

def iuf_simscore(i):
    """Score the item at index ``i`` against every later item in ``all_items``.

    The similarity of two items is the sum of the per-user weights stored in
    ``user_x_N`` over the users who interacted with both items, divided by
    ``sqrt(|U_i| * |U_j|)`` where ``U_x`` is the user set of item ``x``.

    Only indices ``j > i`` are visited, so each unordered item pair is
    produced by exactly one call.

    Args:
        i: Index into the module-level ``all_items`` list.

    Returns:
        A list of ``(item_i, item_j, score)`` tuples, one for each later item
        that shares at least one user with item ``i``.
    """
    sim = []
    itemi = all_items[i]
    itemi_users = item_x_users[itemi]
    ni = len(itemi_users)
    for j in range(i + 1, len(all_items)):
        itemj = all_items[j]
        itemj_users = item_x_users[itemj]
        common_users = itemi_users & itemj_users
        if not common_users:
            # No shared user means zero similarity. Emitting such pairs only
            # inflates the result and pads top-k lists with unrelated items.
            continue
        score = sum(user_x_N[user] for user in common_users)
        # A non-empty intersection implies both user sets are non-empty, so
        # the denominator cannot be zero here.
        sim.append((itemi, itemj, score / math.sqrt(ni * len(itemj_users))))
    return sim

if __name__ == "__main__":
    num_producers = 16
    topk = 100
    item_x_item_score = defaultdict(dict)

    # Each task scores one item against all later items. Every returned pair
    # is stored in both directions so each item lists the other.
    with multiprocessing.Pool(processes=num_producers) as pool:
        item_indices = list(range(len(all_items)))
        results = pool.imap(iuf_simscore, item_indices)
        for pair_scores in results:
            for itemi, itemj, score in pair_scores:
                item_x_item_score[itemi][itemj] = score
                item_x_item_score[itemj][itemi] = score

    # 4. Keep, for every item, its topk most similar items with their scores,
    #    ordered from highest to lowest score.
    for item in list(item_x_item_score):
        ranked = sorted(
            item_x_item_score[item].items(),
            key=lambda pair: pair[1],
            reverse=True,
        )
        item_x_item_score[item] = ranked[:topk]

    with open('itemcf_simscore.json', 'w') as f:
        json.dump(item_x_item_score, f)

3. 计算召回率

import json
from collections import defaultdict

def _load_json(path):
    """Parse the JSON document stored at ``path`` and return the result."""
    with open(path,'r') as f:
        return json.load(f)


# Similar-item lists, read from the file the similarity step writes.
sim_scores = _load_json('itemcf_simscore.json')
# Interaction records used to build recommendations.
train_user_items = _load_json('train_user_items.json')
# Interaction records the recommendations are evaluated against.
test_user_items = _load_json('test_user_items.json')

def get_item_by_user_item(ui, top_K=100):
    """Recommend up to ``top_K`` items the user ``ui`` has not interacted with.

    Every item in the user's training history votes for its similar items
    (looked up in ``sim_scores``) with ``rating * similarity``; the votes are
    summed per candidate item.

    Args:
        ui: User id, a key of the module-level ``train_user_items``.
        top_K: Maximum number of recommendations to return.

    Returns:
        A list of ``(item, score)`` tuples sorted by descending score, without
        any item from the user's training history. Empty for an unknown user.
    """
    history = train_user_items.get(ui, [])

    # Accumulate the weighted votes of every historical item.
    item_score = defaultdict(float)
    for item, rate, _ in history:
        for sim_item, sim_score in sim_scores.get(item, []):
            item_score[sim_item] += rate * sim_score

    # Drop what the user already interacted with, then rank the rest.
    interacted_items = {entry[0] for entry in history}
    ranked = [
        (candidate, total)
        for candidate, total in item_score.items()
        if candidate not in interacted_items
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_K]


# Compute the average recall over the test users.
users = set(test_user_items.keys())
total_recall = 0.0
# Number of users that actually have test items. Recall is undefined for an
# empty test set, so such users must not count in the denominator.
evaluated_users = 0
# Sorted iteration keeps the floating-point summation order, and therefore
# the printed value, identical from run to run.
for user in sorted(users):
    test_items = {item[0] for item in test_user_items.get(user, [])}
    if not test_items:
        continue
    evaluated_users += 1
    recommended_items = {item[0] for item in get_item_by_user_item(user)}
    # A user without recommendations still counts, with a recall of 0.
    total_recall += len(recommended_items & test_items) / len(test_items)

average_recall = total_recall / evaluated_users if evaluated_users else 0.0
print(f"Average Recall: {average_recall}")

最终召回率为0.31938770604518185

当然也可以实现 Swing i2i 的相似度计算方法(其中 user_x_items 为每个用户交互过的物品集合,格式为{user_id: {item_id1, item_id2, ...}})

def swing_simscore(i):
    """Swing-style score of the item at index ``i`` against every later item.

    For an item pair, every unordered pair of users who interacted with both
    items adds ``1 / (1 + overlap)``, where ``overlap`` is the number of items
    the two users have in common.

    NOTE: reads the module-level ``user_x_items`` mapping (user id -> set of
    item ids), which must exist alongside ``item_x_users`` and ``all_items``.

    Args:
        i: Index into the module-level ``all_items`` list.

    Returns:
        A list with one ``(item_i, item_j, score)`` tuple for each later item
        that shares at least two users with item ``i``.
    """
    # Imported locally: the file's visible import lines do not provide it.
    from itertools import combinations

    sim = []
    itemi = all_items[i]
    itemi_users = item_x_users[itemi]
    for j in range(i + 1, len(all_items)):
        itemj = all_items[j]
        common_users = itemi_users & item_x_users[itemj]
        score = 0
        for ui, uj in combinations(common_users, 2):
            overlap = len(user_x_items[ui] & user_x_items[uj])
            score += 1 / (1 + overlap)
        # Emit one tuple per item pair, after all user pairs are summed.
        # Pairs with fewer than two shared users have no score and are skipped.
        if score > 0:
            sim.append((itemi, itemj, score))
    return sim