PyTorch 深度学习笔记 第二章 分类问题

77 阅读4分钟

去做著名的例子泰坦尼克号

www.kaggle.com/competition…

1.官方教程

官方的教程是使用 sklearn 的决策树

import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Load the Kaggle Titanic train/test splits.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# The survival flag is the prediction target.
y = train_data["Survived"]

# One-hot encode a small, hand-picked feature subset for both splits.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# Fit a small random forest and score the held-out test passengers.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Emit the submission file in the format Kaggle expects.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

2.csv 单元格含逗号

第一个坑是,给出的 csv 中,人名里面是含分隔符逗号的,所以直接用 np.loadtxt 读会出错

image.png

但是直接用 excel 打开又会发现可以正确识别

image.png

所以看上去 excel 会做一些额外的处理:当读出来的 csv 字段不对劲时会自动矫正一下,而 np.loadtxt 不会这么做

pandas 能够正确读出来,难道 pandas 也是有特殊处理……

反正最方便的方法就是换一个分隔符

3.np.loadtxt 返回 (n,)

只读一列的 np.loadtxt 会返回 (n,)

所以要记得 reshape

4.初步尝试

照抄 blog.csdn.net/bit452/arti…

import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# 将训练数据集进行批量处理
# prepare dataset
class TrainDataset(Dataset):
    """Wraps feature/label ndarrays so a DataLoader can serve mini-batches."""

    def __init__(self, data, label):
        # torch.from_numpy shares memory with the source arrays.
        self.x_data = torch.from_numpy(data)
        self.y_data = torch.from_numpy(label)
        # Sample count = number of rows in the feature matrix.
        self.len = data.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # One sample is the (features, label) pair at the given row.
        return self.x_data[index], self.y_data[index]

# design model using class
class Model(torch.nn.Module):
    """Two-layer sigmoid MLP: 4 input features -> 2 hidden units -> 1 probability."""

    def __init__(self):
        super(Model, self).__init__()
        self.linear1 = torch.nn.Linear(4, 2)
        self.linear2 = torch.nn.Linear(2, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        # Sigmoid after each layer; the output is a survival probability in [0, 1].
        hidden = self.sigmoid(self.linear1(x))
        return self.sigmoid(self.linear2(hidden))

# Read the data: train.csv for training, test.csv for final prediction.
# NOTE(review): delimiter is ';' — the files were presumably re-saved with a
# semicolon separator because passenger names contain commas (see section 2);
# confirm the usecols indices match the edited files.
X_train = np.loadtxt("train.csv", delimiter=";", usecols=(2, 4, 6, 7), skiprows=1, dtype=np.float32)
Y_train = np.loadtxt("train.csv", delimiter=";", usecols=(1,), skiprows=1, dtype=np.float32)
X_predict = np.loadtxt("test.csv", delimiter=";", usecols=(1, 3, 5, 6), skiprows=1, dtype=np.float32)

# Single-column loadtxt returns shape (n,); BCELoss needs a (n, 1) column.
Y_train = Y_train.reshape(-1, 1)

# Hold out 30% of the labelled rows for accuracy evaluation.
X_train_split, X_test_split, Y_train_split, Y_test_split = train_test_split(X_train, Y_train, test_size=0.3)

# Tensors for the held-out evaluation split.
X_test_split_tensor = torch.from_numpy(X_test_split)
Y_test_split_tensor = torch.from_numpy(Y_test_split)

# Wrap the training split in a Dataset and batch it with a DataLoader.
train_dataset = TrainDataset(X_train_split, Y_train_split)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=2)  # num_workers: loader worker processes

# Build the model.
model = Model()

# Binary cross-entropy loss (mean over the batch) with plain SGD.
criterion = torch.nn.BCELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# training cycle forward, backward, update
def train(epoch):
    """Run one epoch over train_loader and print the mean mini-batch loss.

    Uses the module-level model, criterion, optimizer and train_loader.
    `epoch` is currently unused; it is kept for interface compatibility.
    """
    train_loss = 0.0
    count = 0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        y_pred = model(inputs)

        loss = criterion(y_pred, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # BUG FIX: count batches, not the last batch index. `count = i`
        # divided by (batches - 1) and raised ZeroDivisionError when the
        # loader produced a single batch.
        count = i + 1

    print("train loss:", train_loss / count, end=',')


def test():
    """Evaluate and print the current model's accuracy on the held-out split."""
    with torch.no_grad():
        probs = model(X_test_split_tensor)
        # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
        predicted = torch.where(probs >= 0.5, torch.tensor([1.0]), torch.tensor([0.0]))
        correct = torch.eq(predicted, Y_test_split_tensor).sum().item()
        print("test acc:", correct / Y_test_split_tensor.size(0))


if __name__ == '__main__':
    # Train for a fixed 10 epochs, evaluating on the held-out split each time.
    for epoch_idx in range(10):
        train(epoch_idx)
        test()

Log:

train loss: 0.7130194086777536,test acc: 0.6455223880597015
train loss: 0.7137780534593683,test acc: 0.6455223880597015
train loss: 0.7117264082557276,test acc: 0.6455223880597015
train loss: 0.7123606706920423,test acc: 0.6455223880597015
train loss: 0.7120833271428159,test acc: 0.6455223880597015
train loss: 0.713776381392228,test acc: 0.6455223880597015
train loss: 0.7099782197098983,test acc: 0.6455223880597015
train loss: 0.7080307383286325,test acc: 0.6455223880597015
train loss: 0.7121636177364149,test acc: 0.6455223880597015
train loss: 0.7110078240695753,test acc: 0.6455223880597015

这里可以看到正确率一下就收敛了

但是多次执行的话,可能还需要一段时间才能收敛

train loss: 0.8309515212711535,test acc: 0.3917910447761194
train loss: 0.8128885156229922,test acc: 0.3917910447761194
train loss: 0.7958379419226396,test acc: 0.3917910447761194
train loss: 0.7815723544672916,test acc: 0.3917910447761194
train loss: 0.7667004685652884,test acc: 0.3917910447761194
train loss: 0.7559730504688463,test acc: 0.3917910447761194
train loss: 0.7456844919606259,test acc: 0.4216417910447761
train loss: 0.7366465361494767,test acc: 0.5074626865671642
train loss: 0.7300329616195277,test acc: 0.6716417910447762
train loss: 0.7232479923649838,test acc: 0.6343283582089553

5.资料预处理

显然这里需要做一个字符串到数字的转换

这里我是手动改了输入文件,把性别手动改成了 0 和 1,之后才发现有 LabelEncoder

LabelEncoder 只是把字符串映射到连续整数,这时这些整数之间存在大小关系。因此还需要将它们转化成不相关的整数

因此要使用 OneHotEncoder 来转化,转化时数据表会多出几列

但是 pd.get_dummies 一步到位

6.仿照

import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Build a new Title column holding the honorific parsed out of Name
# (format "Last, Title. First"); uncommon titles are collapsed into 'Rare'.

train_data_title = [i.split(',')[1].split('.')[0].strip() for i in train_data['Name']]
train_data['Title'] = pd.Series(train_data_title)
train_data['Title'].value_counts()  # NOTE(review): result discarded — inspection leftover
train_data['Title'] = train_data['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 'Rare')

# Same extraction for the test set.
# NOTE(review): the variable train_data_title is reused for the *test* names here.
train_data_title = [i.split(',')[1].split('.')[0].strip() for i in test_data['Name']]
test_data['Title'] = pd.Series(train_data_title)
test_data['Title'].value_counts()  # result discarded
test_data['Title'] = test_data['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 'Rare')

# Family size = siblings/spouses + parents/children + the passenger themself.
train_data['FamilyS'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['FamilyS'] = test_data['SibSp'] + test_data['Parch'] + 1
# 根据家庭人数把 FamilyS 映射为家庭规模类别

def family(x):
    """Map a family-size count to a categorical bucket name."""
    if x < 2:
        return 'Single'
    if x == 2:
        return 'Couple'
    return 'InterM' if x <= 4 else 'Large'


# Replace the numeric family size with its categorical bucket.
train_data['FamilyS'] = train_data['FamilyS'].apply(family)
test_data['FamilyS'] = test_data['FamilyS'].apply(family)

# Fill missing cells: mode for Embarked, median for Age/Fare.

train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

# Drop unused columns (axis=1 drops columns, despite the original "rows" wording).

train_data = train_data.drop(['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket'], axis=1)
test_data_passengers = test_data['PassengerId']  # kept aside for the submission file
test_data = test_data.drop(['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket'], axis=1)

# Extract the target column and one-hot encode the feature columns.

Y_train = train_data["Survived"]

features = ["Pclass", "Sex", "Age", "Fare", "Embarked", "Title", "FamilyS"]
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# DataFrame -> ndarray (float32 for the network).
Y_train = Y_train.to_numpy(dtype=np.float32)
Y_train = Y_train.reshape(-1, 1)  # (n,) -> (n, 1) for BCELoss

X_train = X_train.to_numpy(dtype=np.float32)
X_test = X_test.to_numpy(dtype=np.float32)

test_data_passengers = test_data_passengers.to_numpy(dtype=np.float32)
#test_data_passengers = test_data_passengers.reshape(-1, 1)

# Number of one-hot feature columns; used as the network input width.

features_len = X_train.shape[1]

# 将训练数据集进行批量处理
# prepare dataset
class TrainDataset(Dataset):
    """Wraps feature/label ndarrays so a DataLoader can serve mini-batches."""

    def __init__(self, data, label):
        # torch.from_numpy shares memory with the source arrays.
        self.x_data = torch.from_numpy(data)
        self.y_data = torch.from_numpy(label)
        # Sample count = number of rows in the feature matrix.
        self.len = data.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # One sample is the (features, label) pair at the given row.
        return self.x_data[index], self.y_data[index]

# design model using class
class Model(torch.nn.Module):
    """Sigmoid MLP: features_len inputs -> 270 hidden units -> 1 probability."""

    def __init__(self):
        super(Model, self).__init__()
        # features_len is the module-level one-hot feature count.
        self.linear1 = torch.nn.Linear(features_len, 270)
        self.linear2 = torch.nn.Linear(270, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        hidden = self.sigmoid(self.linear1(x))
        return self.sigmoid(self.linear2(hidden))


# Batch the full training set (no held-out split in this version).
train_dataset = TrainDataset(X_train, Y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=2)

# Build the model.
model = Model()

# Binary cross-entropy loss (mean over the batch) with plain SGD.
criterion = torch.nn.BCELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Maximum number of training epochs.
max_epoch = 50

# Early-stop threshold on the mean epoch loss.
error = 1e-6

# Per-epoch loss history.
loss_list = np.zeros((max_epoch, 1))

# training cycle forward, backward, update
def train(epoch):
    """Run one epoch over train_loader and record the mean mini-batch loss.

    Stores the epoch's mean loss in loss_list[epoch] and returns True when
    it falls below the module-level threshold `error`, so the caller can
    stop training early.
    """
    train_loss = 0.0
    count = 0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        y_pred = model(inputs)

        loss = criterion(y_pred, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        # BUG FIX: count batches, not the last batch index. `count = i`
        # divided by (batches - 1) and raised ZeroDivisionError when the
        # loader produced a single batch.
        count = i + 1

    mean_loss = train_loss / count
    loss_list[epoch] = mean_loss

    return mean_loss < error

if __name__ == '__main__':

    # Train until max_epoch or until the mean loss drops below `error`.
    for epoch in range(max_epoch):
        if train(epoch):
            print(loss_list[epoch])
            break

    # Predict survival probabilities for the test set and threshold at 0.5.
    with torch.no_grad():
        probabilities = model(torch.from_numpy(X_test))
        hard_labels = torch.where(probabilities >= 0.5, torch.tensor([1.0]), torch.tensor([0.0]))

    # Flatten the (n, 1) tensor to an (n,) ndarray for the DataFrame column.
    hard_labels = hard_labels.numpy().reshape(-1)

    # Kaggle expects integer ids and labels.
    test_data_passengers = test_data_passengers.astype(int)
    hard_labels = hard_labels.astype(int)

    # Write the submission file.
    output = pd.DataFrame({'PassengerId': test_data_passengers, 'Survived': hard_labels})
    output.to_csv('submission.csv', index=False)
    print("Your submission was successfully saved!")

7.数据类型

nn.CrossEntropyLoss() 需要 long 类型,pytorch 的模型需要 float32 类型,所以 x 和 y 需要转化成不同的类型,一个用 torch.FloatTensor 一个用 torch.LongTensor

8.改进

我看了别人都是用 relu,然后输出两个特征,一个是 0 的概率一个是 1 的概率

虽然和只输出一个是差不多的,但是为了试试别人的算法还是这么写了……

import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Build a new Title column holding the honorific parsed out of Name
# (format "Last, Title. First"); uncommon titles are collapsed into 'Rare'.

train_data_title = [i.split(',')[1].split('.')[0].strip() for i in train_data['Name']]
train_data['Title'] = pd.Series(train_data_title)
train_data['Title'].value_counts()  # NOTE(review): result discarded — inspection leftover
train_data['Title'] = train_data['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 'Rare')

# Same extraction for the test set.
# NOTE(review): the variable train_data_title is reused for the *test* names here.
train_data_title = [i.split(',')[1].split('.')[0].strip() for i in test_data['Name']]
test_data['Title'] = pd.Series(train_data_title)
test_data['Title'].value_counts()  # result discarded
test_data['Title'] = test_data['Title'].replace(['Lady', 'the Countess', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'Ms', 'Mme', 'Mlle'], 'Rare')

# Family size = siblings/spouses + parents/children + the passenger themself.
train_data['FamilyS'] = train_data['SibSp'] + train_data['Parch'] + 1
test_data['FamilyS'] = test_data['SibSp'] + test_data['Parch'] + 1

# 根据家庭人数把 FamilyS 映射为家庭规模类别

def family(x):
    """Bucket a family-size count into a categorical label."""
    if x == 2:
        return 'Couple'
    if x < 2:
        return 'Single'
    return 'Large' if x > 4 else 'InterM'


# Replace the numeric family size with its categorical bucket.
train_data['FamilyS'] = train_data['FamilyS'].apply(family)
test_data['FamilyS'] = test_data['FamilyS'].apply(family)

# Fill missing cells: mode for Embarked, median for Age/Fare.

train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)
train_data['Age'].fillna(train_data['Age'].median(), inplace=True)
test_data['Age'].fillna(test_data['Age'].median(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)

# Drop unused columns (axis=1 drops columns, despite the original "rows" wording).

train_data = train_data.drop(['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket'], axis=1)
test_data_passengers = test_data['PassengerId']  # kept aside for the submission file
test_data = test_data.drop(['PassengerId', 'Cabin', 'Name', 'SibSp', 'Parch', 'Ticket'], axis=1)

# Extract the target column and one-hot encode the feature columns.

Y_train = train_data["Survived"]

features = ["Pclass", "Sex", "Age", "Fare", "Embarked", "Title", "FamilyS"]
X_train = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

# DataFrame -> ndarray. Labels stay integer here: CrossEntropyLoss needs
# long class indices (TrainDataset converts with LongTensor below).
Y_train = Y_train.to_numpy()

X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

test_data_passengers = test_data_passengers.to_numpy()

# Number of one-hot feature columns; used as the network input width.

features_len = X_train.shape[1]

# 将训练数据集进行批量处理
# prepare dataset
class TrainDataset(Dataset):
    """Serves (float features, long label) pairs for DataLoader batching."""

    def __init__(self, data, label):
        # CrossEntropyLoss expects integer class indices, hence LongTensor.
        self.x_data = torch.FloatTensor(data)
        self.y_data = torch.LongTensor(label)
        self.len = data.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

# design model using class
class Model(torch.nn.Module):
    """ReLU MLP with dropout: features_len -> 512 -> 512 -> 2 class logits."""

    def __init__(self):
        super(Model, self).__init__()
        # features_len is the module-level one-hot feature count.
        self.fc1 = nn.Linear(features_len, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        # Two hidden layers with ReLU + dropout; raw logits are returned
        # (CrossEntropyLoss applies the softmax internally).
        h = self.dropout(F.relu(self.fc1(x)))
        h = self.dropout(F.relu(self.fc2(h)))
        return self.fc3(h)


# Batch the full training set.
train_dataset = TrainDataset(X_train, Y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=2)

# Build the model.
model = Model()

# Cross-entropy over the two output logits, optimized with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Maximum number of training epochs.
max_epoch = 500

if __name__ == '__main__':

    # Track the best (lowest) epoch loss seen so far.
    train_loss = 0
    train_loss_min = np.inf  # BUG FIX: np.Inf was removed in NumPy 2.0

    # Training loop: one pass over the DataLoader per epoch; checkpoint on
    # every improvement of the mean epoch loss.
    for epoch in range(max_epoch):
        train_loss = 0.0
        count = 0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data

            optimizer.zero_grad()

            output = model(inputs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            # BUG FIX: count batches, not the last batch index. `count = i`
            # divided by (batches - 1) and raised ZeroDivisionError when the
            # loader produced a single batch.
            count = i + 1

        train_loss = train_loss / count
        if train_loss <= train_loss_min:
            # BUG FIX: the message claimed "Validation loss" but this is the
            # training loss — no validation split exists in this script.
            print(
                "Training loss decreased ({:6f} ===> {:6f}). Saving the model...".format(train_loss_min, train_loss))
            torch.save(model.state_dict(), "model.pt")
            train_loss_min = train_loss

    # Prediction.
    # BUG FIX: restore the best checkpoint (it was saved but never reloaded)
    # and switch to eval mode so Dropout is disabled at inference time —
    # otherwise predictions are stochastic.
    model.load_state_dict(torch.load("model.pt"))
    model.eval()
    with torch.no_grad():
        y_pred = model(torch.FloatTensor(X_test))
        _, y_pred_label = torch.max(y_pred, 1)

    # torch.max over dim 1 yields an (n,) LongTensor of class indices.
    y_pred_label = y_pred_label.numpy()
    y_pred_label = y_pred_label.reshape(y_pred_label.shape[0],)

    # Kaggle expects integer ids and labels.
    test_data_passengers = test_data_passengers.astype(int)
    y_pred_label = y_pred_label.astype(int)

    # Write the submission file.
    output = pd.DataFrame({'PassengerId': test_data_passengers, 'Survived': y_pred_label})
    output.to_csv('submission.csv', index=False)
    print("Your submission was successfully saved!")

最终的正确率是 0.74641

跟官方的解决方案 0.77511 还有点差距,但不大