划分逻辑非常简单：
- 按照用户分组
- 对每个用户交互过的 item 按照时间戳排序
- 将排好序的 item 按照比例划分为训练集和测试集
import json
from collections import defaultdict
def parse_ratings(lines):
    """Group rating records by user.

    Args:
        lines: Iterable of text lines, each formatted as
            ``userid::itemid::rate::timestamp``.

    Returns:
        A ``defaultdict(list)`` mapping each userid (str) to a list of
        ``(itemid, rate, timestamp)`` tuples in input order, where ``rate``
        and ``timestamp`` are converted to ``int``.

    Raises:
        ValueError: If a non-blank line does not have exactly four
            ``::``-separated fields, or rate/timestamp is not an integer.
    """
    grouped = defaultdict(list)
    for line in lines:
        record = line.strip()
        if not record:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        userid, itemid, rate, timestamp = record.split("::")
        grouped[userid].append((itemid, int(rate), int(timestamp)))
    return grouped


def split_by_time(user_items, train_rate):
    """Split every user's interactions chronologically into train and test.

    For each user the interactions are sorted by timestamp in ascending
    order; the earliest ``int(len(items) * train_rate)`` go to the training
    set and the remaining (most recent) ones go to the test set, so the test
    interactions never precede the training interactions in time.

    Args:
        user_items: Mapping of userid to a list of
            ``(itemid, rate, timestamp)`` tuples.
        train_rate: Fraction of each user's interactions used for training;
            must be within ``[0, 1]``.

    Returns:
        A ``(train, test)`` pair of ``defaultdict(list)`` objects keyed by
        userid.

    Raises:
        ValueError: If ``train_rate`` is outside ``[0, 1]``.
    """
    if not 0 <= train_rate <= 1:
        raise ValueError(f"train_rate must be within [0, 1], got {train_rate!r}")

    train = defaultdict(list)
    test = defaultdict(list)
    for userid, items in user_items.items():
        # Oldest first: training on newer data than the test data would leak
        # future information into the model. sorted() is stable, so records
        # with equal timestamps keep their input order.
        ordered = sorted(items, key=lambda record: record[2])
        train_size = int(len(ordered) * train_rate)
        train[userid] = ordered[:train_size]
        test[userid] = ordered[train_size:]
    return train, test


if __name__ == "__main__":
    with open("ml-1m/ratings.dat", "r", encoding="utf-8") as lines:
        user_items = parse_ratings(lines)

    train_rate = 0.8
    train_user_items, test_user_items = split_by_time(user_items, train_rate)

    # Tuples are serialized as JSON arrays: [itemid, rate, timestamp].
    with open("train_user_items.json", "w", encoding="utf-8") as f:
        json.dump(train_user_items, f)
    with open("test_user_items.json", "w", encoding="utf-8") as f:
        json.dump(test_user_items, f)