创建包含更多行和列的原始数据集。
- 删除缺失值最多的列。
- 将预处理后的数据集转换为张量格式。
# Build a small housing dataset with missing values, drop the column that has
# the most missing entries, impute / one-hot encode what is left, and convert
# the result to a torch tensor.
import os

import pandas as pd
import torch

# --- 1. Create the raw dataset -------------------------------------------
os.makedirs(os.path.join('..', 'data'), exist_ok=True)
data_file = os.path.join('..', 'data', 'house_tiny.csv')
with open(data_file, 'w') as f:
    # 'NA' is parsed by pandas.read_csv as a missing value by default.
    f.writelines((
        'NumRooms,Alley,Age,Price\n',
        'NA,Pave,NA,127500\n',
        '2,NA,NA,106000\n',
        '4,NA,NA,178100\n',
        'NA,NA,NA,140000\n',
        '3,Pave,NA,120000\n',
        '1,Pave,NA,130000\n',
        '5,Pave,NA,110000\n',
        'NA,NA,NA,100000\n',
    ))

df = pd.read_csv(data_file)
print(f"Init df: \n{df}")

dash_line = '=' * 20

# --- 2. Drop the column with the most missing values ----------------------
missing_values = df.isnull().sum()  # per-column count of missing entries
print(dash_line)
print(f"Missing_values: \n{missing_values}")

# idxmax returns the label of the first column holding the maximum count.
column_to_drop = missing_values.idxmax()
print(dash_line)
print(f"Column_to_drop: \n{column_to_drop}")

# `columns=` already identifies the axis, so no separate `axis` is needed.
df_dropped = df.drop(columns=column_to_drop)
print(dash_line)
print(f"df_dropped: \n{df_dropped}")

# --- 3. Impute, encode and convert to a tensor ----------------------------
# Only numeric columns have a mean; numeric_only=True keeps the string column
# (Alley) out of the computation, which would otherwise raise on pandas >= 2.0.
df_dropped = df_dropped.fillna(df_dropped.mean(numeric_only=True))

# One-hot encode the remaining non-numeric column, treating NaN as its own
# category. dtype=int keeps the indicators as 0/1 on every pandas version
# (the default became bool in pandas 2.0, which breaks the tensor conversion).
df_dropped = pd.get_dummies(df_dropped, dummy_na=True, dtype=int)
print(dash_line)
print(f"df_appended: \n{df_dropped}")

# Request a float array explicitly so torch never receives an object array.
T = torch.tensor(df_dropped.to_numpy(dtype=float))
print(dash_line)
print(f"Tensor: \n{T}")
Init df:
NumRooms Alley Age Price
0 NaN Pave NaN 127500
1 2.0 NaN NaN 106000
2 4.0 NaN NaN 178100
3 NaN NaN NaN 140000
4 3.0 Pave NaN 120000
5 1.0 Pave NaN 130000
6 5.0 Pave NaN 110000
7 NaN NaN NaN 100000
====================
Missing_values:
NumRooms 3
Alley 4
Age 8
Price 0
dtype: int64
====================
Column_to_drop:
Age
====================
df_dropped:
NumRooms Alley Price
0 NaN Pave 127500
1 2.0 NaN 106000
2 4.0 NaN 178100
3 NaN NaN 140000
4 3.0 Pave 120000
5 1.0 Pave 130000
6 5.0 Pave 110000
7 NaN NaN 100000
====================
df_appended:
NumRooms Price Alley_Pave Alley_nan
0 3.0 127500 1 0
1 2.0 106000 0 1
2 4.0 178100 0 1
3 3.0 140000 0 1
4 3.0 120000 1 0
5 1.0 130000 1 0
6 5.0 110000 1 0
7 3.0 100000 0 1
====================
Tensor:
tensor([[3.0000e+00, 1.2750e+05, 1.0000e+00, 0.0000e+00],
[2.0000e+00, 1.0600e+05, 0.0000e+00, 1.0000e+00],
[4.0000e+00, 1.7810e+05, 0.0000e+00, 1.0000e+00],
[3.0000e+00, 1.4000e+05, 0.0000e+00, 1.0000e+00],
[3.0000e+00, 1.2000e+05, 1.0000e+00, 0.0000e+00],
[1.0000e+00, 1.3000e+05, 1.0000e+00, 0.0000e+00],
[5.0000e+00, 1.1000e+05, 1.0000e+00, 0.0000e+00],
[3.0000e+00, 1.0000e+05, 0.0000e+00, 1.0000e+00]], dtype=torch.float64)