数据处理|预备知识|动手学深度学习

109 阅读2分钟

创建包含更多行和列的原始数据集，然后完成以下两步：

  1. 删除缺失值最多的列。
  2. 将预处理后的数据集转换为张量格式。
import os

# If pandas is not installed, uncomment the following line to install it
# !pip install pandas
import pandas as pd
import torch

# --- Step 0: create a small raw dataset with several missing entries -------
os.makedirs(os.path.join('..', 'data'), exist_ok=True)
data_file = os.path.join('..', 'data', 'house_tiny.csv')
with open(data_file, 'w', encoding='utf-8') as f:
    f.writelines((
        'NumRooms,Alley,Age,Price\n',  # column names
        'NA,Pave,NA,127500\n',  # every following row is one data sample
        '2,NA,NA,106000\n',
        '4,NA,NA,178100\n',
        'NA,NA,NA,140000\n',
        '3,Pave,NA,120000\n',
        '1,Pave,NA,130000\n',
        '5,Pave,NA,110000\n',
        'NA,NA,NA,100000\n',
    ))

# read_csv parses the literal string "NA" as a missing value (NaN) by default
df = pd.read_csv(data_file)
print(f"Init df: \n{df}")

dash_line = '=' * 20

# --- Step 1: drop the column with the most missing values ------------------
# Count the missing values in each column
missing_values = df.isnull().sum()
print(dash_line)
print(f"Missing_values: \n{missing_values}")

# Label of the column with the highest missing count
column_to_drop = missing_values.idxmax()
print(dash_line)
print(f"Column_to_drop: \n{column_to_drop}")

# Drop that column; `columns=` already selects the axis, so no `axis` argument
df_dropped = df.drop(columns=column_to_drop)
print(dash_line)
print(f"df_dropped: \n{df_dropped}")

# --- Step 2: convert the preprocessed dataset to a tensor ------------------
# Fill numeric gaps with the column mean. numeric_only=True is required
# because the frame still contains the string column "Alley"; pandas >= 2.0
# raises TypeError when asked for the mean of a non-numeric column.
df_dropped = df_dropped.fillna(df_dropped.mean(numeric_only=True))
# One-hot encode the categorical column, with NaN as its own category.
# dtype=int keeps the indicators as 0/1: pandas >= 2.0 defaults to bool, and a
# float/int/bool frame converts to an object array that torch.tensor rejects.
df_dropped = pd.get_dummies(df_dropped, dummy_na=True, dtype=int)
print(dash_line)
print(f"df_appended: \n{df_dropped}")

# Convert to a float64 NumPy array explicitly, then to a torch tensor
T = torch.tensor(df_dropped.to_numpy(dtype='float64'))
print(dash_line)
print(f"Tensor: \n{T}")
运行结果：

Init df: 
   NumRooms Alley  Age   Price
0       NaN  Pave  NaN  127500
1       2.0   NaN  NaN  106000
2       4.0   NaN  NaN  178100
3       NaN   NaN  NaN  140000
4       3.0  Pave  NaN  120000
5       1.0  Pave  NaN  130000
6       5.0  Pave  NaN  110000
7       NaN   NaN  NaN  100000
====================
Missing_values: 
NumRooms    3
Alley       4
Age         8
Price       0
dtype: int64
====================
Column_to_drop: 
Age
====================
df_dropped: 
   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000
4       3.0  Pave  120000
5       1.0  Pave  130000
6       5.0  Pave  110000
7       NaN   NaN  100000
====================
df_appended: 
   NumRooms   Price  Alley_Pave  Alley_nan
0       3.0  127500           1          0
1       2.0  106000           0          1
2       4.0  178100           0          1
3       3.0  140000           0          1
4       3.0  120000           1          0
5       1.0  130000           1          0
6       5.0  110000           1          0
7       3.0  100000           0          1
====================
Tensor: 
tensor([[3.0000e+00, 1.2750e+05, 1.0000e+00, 0.0000e+00],
        [2.0000e+00, 1.0600e+05, 0.0000e+00, 1.0000e+00],
        [4.0000e+00, 1.7810e+05, 0.0000e+00, 1.0000e+00],
        [3.0000e+00, 1.4000e+05, 0.0000e+00, 1.0000e+00],
        [3.0000e+00, 1.2000e+05, 1.0000e+00, 0.0000e+00],
        [1.0000e+00, 1.3000e+05, 1.0000e+00, 0.0000e+00],
        [5.0000e+00, 1.1000e+05, 1.0000e+00, 0.0000e+00],
        [3.0000e+00, 1.0000e+05, 0.0000e+00, 1.0000e+00]], dtype=torch.float64)