原数据的格式:
处理数据思路:将原数据取出来,并按照比例分成 train、dev、test 三个部分,再将每一份数据以字典的形式存储到 json 文件中。
import json
import numpy as np
from config import HP
# Fractions of the shuffled dataset assigned to each split. The test split
# receives whatever is left once train and dev have been carved out, so
# rounding never drops a line.
trainset_ratio = 0.7
devset_ratio = 0.2
testset_ratio = 0.1


def _lines_to_dict(lines):
    """Map the text before each line's first space to the text after it.

    A line without any space becomes a key with an empty-string value.
    If a key occurs more than once, the last occurrence wins.
    """
    pairs = {}
    for line in lines:
        # partition() avoids the find() == -1 pitfall, which used to chop the
        # last character off the key and store the whole line as the value.
        key, _, value = line.partition(' ')
        pairs[key] = value
    return pairs


def _dump_json(mapping, path):
    """Write *mapping* to *path* as indented JSON and close the file."""
    # json.dump escapes non-ASCII by default, so the bytes written are plain
    # ASCII; the explicit encoding just removes the platform dependency.
    with open(path, mode='w', encoding='utf-8') as json_file:
        json.dump(mapping, json_file, indent=2)


# Seed before shuffling so the split is reproducible for a given HP.seed.
np.random.seed(HP.seed)

# The context manager closes the input file; the previous code rebound the
# handle's name to a list and then failed on `.close()`.
with open(HP.data_path, encoding="utf-8") as data_file:
    dataset = data_file.read().splitlines()
np.random.shuffle(dataset)

n_items = len(dataset)
trainset_num = int(trainset_ratio * n_items)
devset_num = int(devset_ratio * n_items)
testset_num = n_items - trainset_num - devset_num

dev_end = trainset_num + devset_num
traindata = dataset[:trainset_num]
valdata = dataset[trainset_num:dev_end]
testdata = dataset[dev_end:]

data1 = _lines_to_dict(traindata)
data2 = _lines_to_dict(valdata)
data3 = _lines_to_dict(testdata)

_dump_json(data1, './data/data_train.json')
_dump_json(data2, './data/data_val.json')
_dump_json(data3, './data/data_test.json')
处理之后的格式:
![%P2CE89}C)Y]P4Y3D2$8WJT.png](p1-juejin.byteimg.com/tos-cn-i-k3…?)