1. 数据加载与预处理
train_user_items.json:存储用户与物品的交互记录,格式为 {user_id: [[item_id, rating, timestamp], ...]}。user_x_N:先统计每个用户的交互次数 N,再将其转换为权重 1 / log(1 + N),用于降低活跃用户(交互次数多的用户)对相似度的影响。item_x_users:统计每个物品被哪些用户交互过,格式为 {item_id: {user_id1, user_id2, ...}}。
import math
import multiprocessing
import json
from collections import defaultdict
# Lookup tables consumed by the similarity computation:
#   item_x_users: item id -> set of user ids that interacted with the item
#   user_x_N:     user id -> interaction count, replaced further down by the
#                 weight 1 / log(1 + N)
item_x_users = defaultdict(set)
user_x_N = defaultdict(int)
with open('train_user_items.json', 'r') as f:
    train_user_items = json.load(f)

# 1. Derive each user's interaction count and each item's set of users.
for userid, items in train_user_items.items():
    user_x_N[userid] = len(items)
    # Each record unpacks into three fields; only the item id is used here.
    for movieid, rating, timestamp in items:
        item_x_users[movieid].add(userid)

# Convert raw counts into weights: the more interactions a user has, the
# smaller that user's contribution to a similarity score.
for userid, N in user_x_N.items():
    # log(1 + 0) == 0, so an empty history would raise ZeroDivisionError.
    # Such a user appears in no item's user set, so the weight is never read
    # and 0.0 is a safe placeholder.
    user_x_N[userid] = 1 / math.log(1 + N) if N > 0 else 0.0

all_items = list(item_x_users.keys())
2. 物品相似度计算
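- 相似度公式:$$\mathrm{sim}(i, j) = \frac{\sum_{u \in U_i \cap U_j} \frac{1}{\log(1 + N(u))}}{\sqrt{|U_i| \cdot |U_j|}}$$
- $U_i \cap U_j$:同时交互过物品 $i$ 和 $j$ 的用户集合。
- $N(u)$:用户 $u$ 的交互次数。
- $|U_i|$ 和 $|U_j|$:分别表示物品 $i$ 和 $j$ 的用户数量。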
使用multiprocessing.Pool对每个物品与其他物品的相似度进行并行计算。
def iuf_simscore(i):
    """Score the item at position ``i`` against every later item.

    For each item that follows position ``i`` in ``all_items``, the score is
    the sum of the ``user_x_N`` weights of the users found in both items'
    user sets, divided by ``sqrt(ni * nj)`` where ``ni`` and ``nj`` are the
    sizes of the two user sets.

    Args:
        i: Index into ``all_items`` of the anchor item.

    Returns:
        A list of ``(anchor_item, other_item, score)`` tuples, one per later
        item. Pairs that share no users are included with a zero score.
    """
    anchor = all_items[i]
    anchor_users = item_x_users[anchor]
    ni = len(anchor_users)

    pairs = []
    for other in all_items[i + 1:]:
        other_users = item_x_users[other]
        nj = len(other_users)
        # Users who interacted with both items.
        shared = anchor_users & other_users
        score = sum(user_x_N[user] for user in shared)
        if ni and nj:
            score /= math.sqrt(ni * nj)
        pairs.append((anchor, other, score))
    return pairs
if __name__ == "__main__":
    num_producers = 16  # number of worker processes in the pool
    topk = 100          # number of highest-scoring neighbours kept per item
    item_x_item_score = defaultdict(dict)

    with multiprocessing.Pool(processes=num_producers) as pool:
        # imap yields results lazily, so they must be consumed inside the
        # with block: leaving the block terminates the worker processes.
        for sim_scores in pool.imap(iuf_simscore, range(len(all_items))):
            for itemi, itemj, score in sim_scores:
                # Each pair is computed once (j > i); store it both ways.
                item_x_item_score[itemi][itemj] = score
                item_x_item_score[itemj][itemi] = score

    # 4. For every item keep only its topk most similar items, stored as a
    #    list of (item, score) pairs sorted by descending score.
    for item, scores in item_x_item_score.items():
        ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        # Rebinding the value of an existing key is safe while iterating.
        item_x_item_score[item] = ranked[:topk]

    with open('itemcf_simscore.json', 'w') as f:
        json.dump(item_x_item_score, f)
3. 计算召回率
import json
from collections import defaultdict
def _load_json(path):
    """Read the file at ``path`` and return its parsed JSON content."""
    with open(path, 'r') as f:
        return json.load(f)


# Inputs for the evaluation: the stored per-item similarity lists plus the
# train and test interaction records.
sim_scores = _load_json('itemcf_simscore.json')
train_user_items = _load_json('train_user_items.json')
test_user_items = _load_json('test_user_items.json')
def get_item_by_user_item(ui, top_K=100):
    """Return the ``top_K`` highest-scoring unseen items for user ``ui``.

    Every item in the user's training history contributes
    ``rating * similarity`` to each entry of its list in ``sim_scores``.
    Items the user already interacted with are removed before ranking.

    Args:
        ui: User id, looked up in ``train_user_items``.
        top_K: Maximum number of results to return.

    Returns:
        A list of ``(item, score)`` tuples sorted by descending score; empty
        when the user has no training history.
    """
    seen = set()
    accumulated = defaultdict(float)

    for item, rate, _ in train_user_items.get(ui, []):
        seen.add(item)
        # Spread this item's rating over the entries of its similarity list.
        for neighbour, similarity in sim_scores.get(item, []):
            accumulated[neighbour] += rate * similarity

    candidates = [
        (item, score) for item, score in accumulated.items() if item not in seen
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:top_K]
# Compute the recall: per-user recall is |recommended ∩ test| / |test|,
# summed over users and divided by the number of users in the test data.
users = set(test_user_items)
total_recall = 0.0
for user in users:
    test_items = {record[0] for record in test_user_items[user]}
    recommended_items = {rec[0] for rec in get_item_by_user_item(user)}
    # Users with nothing to predict or nothing recommended add no recall,
    # but they still count in the denominator below.
    if test_items and recommended_items:
        recall = len(recommended_items & test_items) / len(test_items)
        total_recall += recall

if users:
    average_recall = total_recall / len(users)
else:
    average_recall = 0.0
print(f"Average Recall: {average_recall}")
最终召回率为0.31938770604518185
当然,也可以改用 Swing 算法来计算物品之间(i2i)的相似度:
def swing_simscore(i):
    """Compute Swing-style scores between item ``i`` and every later item.

    For each item that follows position ``i`` in ``all_items``, every
    unordered pair of users found in both items' user sets contributes
    ``1 / (1 + overlap)``, where ``overlap`` is the number of items the two
    users have in common.

    Args:
        i: Index into ``all_items`` of the anchor item.

    Returns:
        A list of ``(anchor_item, other_item, score)`` tuples, one per later
        item. Pairs with fewer than two shared users get a score of 0.
    """
    # Function-scope import: without it the call to `combinations` below
    # raises NameError, since the name is not imported elsewhere.
    from itertools import combinations

    # NOTE(review): `user_x_items` is not defined in the visible part of this
    # file. It is used here as a mapping user id -> set of item ids and must
    # exist at module level before this function is called -- confirm.
    sim = []
    itemi = all_items[i]
    itemi_users = item_x_users[itemi]
    for j in range(i + 1, len(all_items)):
        itemj = all_items[j]
        # Users who interacted with both items.
        common_users = itemi_users & item_x_users[itemj]
        score = 0
        for ui, uj in combinations(common_users, 2):
            overlap = len(user_x_items[ui] & user_x_items[uj])
            # The more items two users share, the less this pair contributes.
            score += 1 / (1 + overlap)
        sim.append((itemi, itemj, score))
    return sim