网络训练数据准备---数据标签在numpy和tensor之间的转换

539 阅读4分钟

本文已参与「新人创作礼」活动,一起开启掘金创作之路。

网络训练数据准备---数据标签在numpy和tensor之间的转换

1.从txt文档中读取训练数据和测试数据

 fp = open('E:\桌面\test_new_3\data\train.txt', 'r', encoding='utf-8')
 string = fp.read()  # string是一行字符串,该字符串包含文件所有内容
 fp.close()
 row_list = string.splitlines()  # splitlines默认参数是‘\n’
 data_list = [[float(i) for i in row.strip().split(" ")] for row in row_list]
 shuffle(data_list) ##打乱数据
 data_train = np.array(data_list)     # data=data.to_numpy()
 fp = open('E:\桌面\test_new_3\data\test.txt', 'r', encoding='utf-8')
 string = fp.read()  # string是一行字符串,该字符串包含文件所有内容
 fp.close()
 row_list = string.splitlines()  # splitlines默认参数是‘\n’
 data_list = [[float(i) for i in row.strip().split(" ")] for row in row_list]
 shuffle(data_list) ##打乱数据
 data_test = np.array(data_list)     # data=data.to_numpy()
 ​

2.从txt文件中读取标签数据

 fp = open('E:\桌面\test_new_3\label\1004.txt', 'r', encoding='utf-8')
 line = fp.readline()
 l = line.split(",")
 label1 = [[float(i)] for i in l]
 label1 = np.array(label1)
 fp = open('E:\桌面\test_new_3\label\355.txt', 'r', encoding='utf-8')
 line = fp.readline()
 l = line.split(",")
 label2 = [[float(i)] for i in l]
 label2 = np.array(label2)
 # label1 = np.loadtxt('./284.txt', dtype=np.float32, delimiter=' ')
 # print("原始红外矩阵大小为:", label1)
 # label2 = np.loadtxt('./71.txt', dtype=np.float32, delimiter=' ')
 # print("原始红外矩阵大小为:", label2)
 train_datas = data_train
 train_labels = label1
 test_datas = data_test
 test_labels = label2

3.将训练数据和测试数据转化为tensor形式,为后面训练做准备,因为训练数据在pytorch要求一般为tensor格式

 ##这几步的作用就是将数据转化为tensor的格式,为后面训练做准备(因为训练的时候数据要是tensor格式)
 ##同时,train_datas = train_datas / 1.0 ,这里的作用就是将其强制转化为浮点类型
 train_datas = train_datas / 1.0
 train_datas = torch.from_numpy(train_datas).float()
 test_datas = test_datas / 1.0
 test_datas = torch.from_numpy(test_datas).float()

4.将标签转化为float型(此处是因为做的是回归任务),如果想要做分类任务的话需要将标签转化为long型!

 train_labels = torch.from_numpy(train_labels).float()
 test_labels = torch.from_numpy(test_labels).float()

5.将已经转化为tensor格式的(train_datas, train_labels)和(test_datas, test_labels)转化为Data.DataLoader可以使用的格式

 torch_dataset_train = Data.TensorDataset(train_datas, train_labels)
 torch_dataset_test = Data.TensorDataset(test_datas, test_labels)
 print(train_datas.size())                 # (60000, 28, 28)
 print(train_labels.size())
 print(test_datas.size())
 print(test_labels.size())

6.利用Data.DataLoader来对数据集进行迭代

 train_loader = Data.DataLoader(dataset=torch_dataset_train, batch_size=BATCH_SIZE, shuffle=True)

7.训练流程

 a = []                                  # history of periodic test "accuracies"
 for epoch in range(EPOCH):
     for step, (b_x, b_y) in enumerate(train_loader):   # one shuffled mini-batch per step
         b_x = b_x.view(-1, 3)           # ensure shape (batch, 3)
         output = mlp(b_x)               # network prediction
         loss = loss_func(output, b_y)   # regression loss (loss_func is nn.MSELoss in the full script)
         optimizer.zero_grad()           # clear gradients from the previous step
         loss.backward()                 # backpropagation, compute gradients
         optimizer.step()                # apply the parameter update

         if step % 50 == 0:
             # NOTE(review): the model stays in train mode here, so BatchNorm
             # uses test-batch statistics; consider mlp.eval()/mlp.train().
             test_output = mlp(test_x.view(-1, 3))
             # "Accuracy": fraction of test predictions within 0.1 of the target.
             accuracy = (((abs(test_output.detach().numpy()-test_y.numpy()) < 0.1)).sum())/len(test_y.numpy())
             print('Epoch: ', round(epoch), '| train loss: %.4f' % loss.data.numpy(), '| test accuracy:%.2f' % accuracy)
             a.append(accuracy)

8.全部代码

 ​
 # library
 # standard library
 import os
 from random import shuffle
 # third-party library
 import torch
 import torch.nn as nn
 import torch.utils.data as Data
 import torchvision
 import numpy as np
 # import matplotlib.pyplot as plt

 # torch.manual_seed(1)    # uncomment for reproducible runs

 # Hyper Parameters
 EPOCH = 300                # number of full passes over the training data
 BATCH_SIZE = 50            # mini-batch size used by the DataLoader
 LR = 0.0005              # learning rate for the Adam optimizer
 fp = open('E:\桌面\test_new_3\data\train.txt', 'r', encoding='utf-8')
 string = fp.read()  # string是一行字符串,该字符串包含文件所有内容
 fp.close()
 row_list = string.splitlines()  # splitlines默认参数是‘\n’
 data_list = [[float(i) for i in row.strip().split(" ")] for row in row_list]
 shuffle(data_list) ##打乱数据
 data_train = np.array(data_list)     # data=data.to_numpy()
 fp = open('E:\桌面\test_new_3\data\test.txt', 'r', encoding='utf-8')
 string = fp.read()  # string是一行字符串,该字符串包含文件所有内容
 fp.close()
 row_list = string.splitlines()  # splitlines默认参数是‘\n’
 data_list = [[float(i) for i in row.strip().split(" ")] for row in row_list]
 shuffle(data_list) ##打乱数据
 data_test = np.array(data_list)     # data=data.to_numpy()
 ##划分数据
 ​
 fp = open('E:\桌面\test_new_3\label\1004.txt', 'r', encoding='utf-8')
 line = fp.readline()
 l = line.split(",")
 label1 = [[float(i)] for i in l]
 label1 = np.array(label1)
 fp = open('E:\桌面\test_new_3\label\355.txt', 'r', encoding='utf-8')
 line = fp.readline()
 l = line.split(",")
 label2 = [[float(i)] for i in l]
 label2 = np.array(label2)
 # label1 = np.loadtxt('./284.txt', dtype=np.float32, delimiter=' ')
 # print("原始红外矩阵大小为:", label1)
 # label2 = np.loadtxt('./71.txt', dtype=np.float32, delimiter=' ')
 # print("原始红外矩阵大小为:", label2)
 train_datas = data_train
 train_labels = label1
 test_datas = data_test
 test_labels = label2
 # print(train_datas)
 # print(train_labels)
 # print(test_datas)
 # print(test_labels)
 ​
 ##这几步的作用就是将数据转化为tensor的格式,为后面训练做准备(因为训练的时候数据要是tensor格式)
 ##同时,train_datas = train_datas / 1.0 ,这里的作用就是将其强制转化为浮点类型
 ​
 train_datas = train_datas / 1.0
 train_datas = torch.from_numpy(train_datas).float()
 test_datas = test_datas / 1.0
 test_datas = torch.from_numpy(test_datas).float()
 ​
 ##------------------------------------------------
 #将标签转化为float型(此处是因为做的是回归任务),如果想要做分类任务的话需要将标签转化为long型!
 ##------------------------------------------------
 train_labels = torch.from_numpy(train_labels).float()
 test_labels = torch.from_numpy(test_labels).float()
 ​
 # train_datas=torch.tensor(train_datas)
 # train_labels=torch.tensor(train_labels)
 # test_datas=torch.tensor(test_datas)
 # test_labels=torch.tensor(test_labels)
 ##将已经转化为tensor格式的(train_datas, train_labels)和(test_datas, test_labels)转化为Data.DataLoader可以使用的格式
 torch_dataset_train = Data.TensorDataset(train_datas, train_labels)
 torch_dataset_test = Data.TensorDataset(test_datas, test_labels)
 print(train_datas.size())                 # (60000, 28, 28)
 print(train_labels.size())
 print(test_datas.size())
 print(test_labels.size())
 ​
 ##Data.DataLoader来对数据集进行迭代
 train_loader = Data.DataLoader(dataset=torch_dataset_train, batch_size=BATCH_SIZE, shuffle=True)
 ​
 # pick 2000 samples to speed up testing
 # test_data = torch_dataset_test
 test_x = test_datas   # shape from (2000, 28, 28) to (2000, 1, 28, 28), value in range(0,1)
 test_y = test_labels
 class MLP(nn.Module):
     def __init__(self):
         super(MLP, self).__init__()
         self.mlp = nn.Sequential(
             nn.Linear(3, 5), nn.BatchNorm1d(5), nn.Tanh(), nn.Linear(5, 5), nn.BatchNorm1d(5),
             nn.Linear(5, 1)
         )
     def forward(self, x):
         output = self.mlp(x)
         return output    # return x for visualization
 ​
 """
 无激活层
 """
 # class MLP(nn.Module):
 #     def __init__(self):
 #         super(MLP, self).__init__()
 #         self.mlp = nn.Sequential(
 #             nn.Linear(5, 5),
 #             nn.Linear(5, 1)
 #         )
 #     def forward(self, x):
 #         output = self.mlp(x)
 #         return output    # return x for visualization
 ​
 mlp = MLP()
 print(mlp)  # net architecture
 optimizer = torch.optim.Adam(mlp.parameters(), lr=LR)   # optimize all logistic parameters
 # optimizer = torch.optim.SGD(mlp.parameters(), lr=LR, momentum=0.9)   # optimize all logistic parameters
 loss_func = nn.MSELoss()                       # the target label is not one-hotted
 # plt.ion()
 # training and testing
 a = []                                  # history of periodic test "accuracies"
 for epoch in range(EPOCH):
     for step, (b_x, b_y) in enumerate(train_loader):   # one shuffled mini-batch per step
         b_x = b_x.view(-1, 3)           # ensure shape (batch, 3)
         output = mlp(b_x)               # network prediction
         loss = loss_func(output, b_y)   # MSE regression loss
         optimizer.zero_grad()           # clear gradients from the previous step
         loss.backward()                 # backpropagation, compute gradients
         optimizer.step()                # apply the Adam update

         if step % 50 == 0:
             # NOTE(review): the model stays in train mode here, so BatchNorm
             # uses test-batch statistics; consider mlp.eval()/mlp.train().
             test_output = mlp(test_x.view(-1, 3))
             # "Accuracy": fraction of test predictions within 0.1 of the target.
             accuracy = (((abs(test_output.detach().numpy()-test_y.numpy()) < 0.1)).sum())/len(test_y.numpy())
             print('Epoch: ', round(epoch), '| train loss: %.4f' % loss.data.numpy(), '| test accuracy:%.2f' % accuracy)
             a.append(accuracy)
 # plt.ioff()
 ​
 # Predict on the first 355 test samples and compare against the labels.
 test_output = mlp(test_x[:355].view(-1, 3))
 pred_y = test_output.data.numpy()
 pred_y = pred_y.flatten()          # (355, 1) -> (355,) for printing/plotting
 print(pred_y, 'prediction number')
 print(test_y[:355].numpy(), 'real number')
 v=a[-1]                            # last recorded test accuracy, shown in the plot table
 print(a)
 ​
 ​
 import matplotlib.pyplot as plt
 plt.rc("font", family='KaiTi')
 plt.figure()
 f, axes = plt.subplots(1, 1)
 x = np.arange(1, 356)
 # axes.plot(x , pred_y)
 axes.scatter(x, pred_y, c='r', s=3, marker = 'o')
 plt.axhline(36.7, c ='g')
 axes.set_xlabel("位置点位")
 axes.set_ylabel("预测值")
 axes.set_title("矫正网络预测结果")
 axes.set_ylim((36, 37))
 plt.savefig("result.png")
 plt.legend(['真实值36.7℃', '预测值'], loc='upper left')
 ​
 row_labels = ['准确率:']
 col_labels = ['数值']
 table_vals = [['{:.2f}%'.format(v*100)]]
 row_colors = ['gold']
 my_table = plt.table(cellText=table_vals, colWidths=[0.1] * 5,
                              rowLabels=row_labels, rowColours=row_colors, loc='best')
 plt.show()
 ​
