推荐算法实践:ml-1m划分数据集

84 阅读1分钟

划分逻辑非常简单:

  1. 按照用户分组
  2. 对每个用户交互的item按照时间戳排序
  3. 将排好序的item按照比例(本文为 8:2)划分为训练集和测试集
import json
from collections import defaultdict
# Group rating records by user: userid -> list of (itemid, rate, timestamp).
user_items = defaultdict(list)

with open("ml-1m/ratings.dat", "r") as ratings_file:
    for record in ratings_file:
        # Each line carries four "::"-separated fields.
        fields = record.strip().split("::")
        userid, itemid, rate, timestamp = fields
        # Ids stay as strings; rating and timestamp are converted to int.
        interaction = (itemid, int(rate), int(timestamp))
        user_items[userid].append(interaction)

# Chronological per-user split: the earliest `train_rate` share of each user's
# interactions goes to the training set, the most recent remainder to the test set.
train_user_items = defaultdict(list)
test_user_items = defaultdict(list)
train_rate = 0.8
for userid, items in user_items.items():
    # Sort oldest -> newest (x[2] is the timestamp). A descending sort would
    # place the newest interactions in the training slice, so anything fit on
    # it would be evaluated on interactions older than its training data.
    items = sorted(items, key=lambda x: x[2])
    # int() truncates, so a user with very few interactions can end up with
    # an empty training list and everything in the test list.
    train_size = int(len(items) * train_rate)
    train_user_items[userid] = items[:train_size]
    test_user_items[userid] = items[train_size:]

# Write each split to its own JSON file, training split first.
# Note: json serializes the (itemid, rate, timestamp) tuples as JSON arrays.
for out_path, split in (
    ("train_user_items.json", train_user_items),
    ("test_user_items.json", test_user_items),
):
    with open(out_path, 'w') as out_file:
        json.dump(split, out_file)