处理cmudict数据

310 阅读1分钟

原数据的格式:

@6{FWE{H%52(JWUZO6S.png

处理数据思路:将原数据取出来,并按照比例分成train, dev, test三个部分,将每一份数据按照字典存储到json文件中

import json

import numpy as np
from config import HP

# Fractions of the shuffled dataset assigned to each split. The test split
# simply takes whatever is left after train and dev, so testset_ratio is
# informational and is not used in the arithmetic below.
trainset_ratio = 0.7
devset_ratio = 0.2
testset_ratio = 0.1


def _lines_to_dict(lines):
    """Build a dict from lines of the form ``<key> <value>``.

    Each line is cut at its first space: the text before it becomes the key
    and everything after it becomes the value. If a key occurs more than
    once, the last occurrence wins.

    NOTE(review): for a line that contains no space, ``str.find`` returns -1,
    so the key is the line minus its last character and the value is the
    whole line. This is kept as-is so the generated files do not change;
    confirm whether such lines can occur in the input.
    """
    entries = {}
    for line in lines:
        num = line.find(' ')
        entries[line[:num]] = line[num + 1:]
    return entries


def _dump_json(data, path):
    """Write *data* to *path* as indented JSON, closing the file afterwards."""
    with open(path, mode='w') as json_file:
        json.dump(data, json_file, indent=2)


# Seed before shuffling so the split is reproducible for a given HP.seed.
np.random.seed(HP.seed)
# The context manager closes the input file as soon as it has been read; the
# handle no longer shares a name with any other variable in the script.
with open(HP.data_path, encoding="utf-8") as data_file:
    dataset = data_file.read().splitlines()
np.random.shuffle(dataset)

n_items = len(dataset)

trainset_num = int(trainset_ratio * n_items)
devset_num = int(devset_ratio * n_items)
# Remainder after the two truncated counts, so the three sizes sum to n_items.
testset_num = n_items - trainset_num - devset_num

traindata = dataset[:trainset_num]
valdata = dataset[trainset_num:trainset_num + devset_num]
testdata = dataset[trainset_num + devset_num:]

data1 = _lines_to_dict(traindata)
data2 = _lines_to_dict(valdata)
data3 = _lines_to_dict(testdata)

_dump_json(data1, './data/data_train.json')
_dump_json(data2, './data/data_val.json')
_dump_json(data3, './data/data_test.json')

处理之后的格式:

![%P2CE89}C)Y]P4Y3D2$8WJT.png](p1-juejin.byteimg.com/tos-cn-i-k3…?)