# Wheeler's questioning method
# Q1: How do you turn a .csv file into a PyTorch Dataset?
# Q2: How do you use a DataLoader to process the dataset?
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
# Question 1: turn the CSV file into a PyTorch Dataset
class WineDataset(Dataset):
    """Samples loaded from a CSV file and exposed as a PyTorch ``Dataset``.

    The file is read as comma-separated float32 values with one header row
    skipped. Column 0 is used as the label and the remaining columns as the
    features.
    """

    def __init__(self, path: str = './data/wine/wine.csv') -> None:
        """Load the whole CSV file into memory as tensors.

        Args:
            path: Location of the CSV file. Defaults to the path this
                script originally hard-coded.
        """
        super().__init__()
        # ndmin=2 keeps the array two-dimensional even when the file holds
        # a single data row, so the column slicing below always works.
        xy = np.loadtxt(path, delimiter=',', dtype=np.float32,
                        skiprows=1, ndmin=2)
        self.x = torch.from_numpy(xy[:, 1:])   # features: (n_samples, n_features)
        self.y = torch.from_numpy(xy[:, [0]])  # labels: (n_samples, 1)
        self.n_samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair at ``index`` (``dataset[index]``)."""
        return self.x[index], self.y[index]

    def __len__(self) -> int:
        """Return the number of samples (``len(dataset)``)."""
        return self.n_samples
# Instantiate the dataset; the commented lines show how to inspect one sample.
dataset = WineDataset()
# print(type(dataset))
# <class '__main__.WineDataset'>
# first_data = dataset[0]
# print(type(first_data))
# print(first_data)
# <class 'tuple'>
# (tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
# 3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
# 1.0650e+03]), tensor([1.]))
# features, labels = first_data  # unpack the elements of the tuple

# Question 2: wrap the dataset in a DataLoader to get shuffled mini-batches.
dataloader = DataLoader(dataset=dataset, batch_size=4, shuffle=True)
# num_workers=2 can speed up loading.
# It raised an error on the author's machine during testing, so it is not set.
dataiter = iter(dataloader)
# Use the built-in next(): newer PyTorch releases no longer provide the
# Python-2-style .next() method on DataLoader iterators.
data = next(dataiter)
# features, labels = data
# print(features,labels)
# tensor([[1.2720e+01, 1.8100e+00, 2.2000e+00, 1.8800e+01, 8.6000e+01, 2.2000e+00,
# 2.5300e+00, 2.6000e-01, 1.7700e+00, 3.9000e+00, 1.1600e+00, 3.1400e+00,
# 7.1400e+02],
# [1.3490e+01, 3.5900e+00, 2.1900e+00, 1.9500e+01, 8.8000e+01, 1.6200e+00,
# 4.8000e-01, 5.8000e-01, 8.8000e-01, 5.7000e+00, 8.1000e-01, 1.8200e+00,
# 5.8000e+02],
# [1.3510e+01, 1.8000e+00, 2.6500e+00, 1.9000e+01, 1.1000e+02, 2.3500e+00,
# 2.5300e+00, 2.9000e-01, 1.5400e+00, 4.2000e+00, 1.1000e+00, 2.8700e+00,
# 1.0950e+03],
# [1.4130e+01, 4.1000e+00, 2.7400e+00, 2.4500e+01, 9.6000e+01, 2.0500e+00,
# 7.6000e-01, 5.6000e-01, 1.3500e+00, 9.2000e+00, 6.1000e-01, 1.6000e+00,
# 5.6000e+02]]) tensor([[2.],
# [3.],
# [1.],
# [3.]])
# Training loop skeleton: iterate over the loader for a fixed number of epochs.
num_epochs = 2
total_samples = len(dataset)
# Iterations per epoch = number of batches = ceil(samples / batch size).
# math.ceil rounds up so that a final, smaller batch is counted as well.
n_iterations = math.ceil(total_samples / dataloader.batch_size)
n_iteraionts = n_iterations  # alias kept for the original (misspelled) name
print(total_samples, n_iterations)
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        # The forward pass, backward pass and weight update would go here.
        if (i + 1) % 5 == 0:
            print(f'epoch {epoch+1}/{num_epochs}, step {i+1}/{n_iterations}, inputs {inputs.shape}')
# PyTorch (via torchvision) ships with some built-in datasets, e.g.:
# torchvision.datasets.MNIST()
# Others include Fashion-MNIST, CIFAR and COCO.