# Kaggle 上刷题:Predict the Introverts from the Extroverts
原题链接:www.kaggle.com/competition…
题目简介:根据训练数据中的各个特征建立分类模型,并利用训练好的模型预测测试数据中每条样本的 Personality(Extrovert 或 Introvert)。
数据格式:
train data:
第一步:先查看 train data 中各个特征与目标变量(Personality)之间相关性的热力图(其中用到的 clean_data 定义见第二步):
# Load the training set once at module level; display() and train_model()
# read it through the global name `data`.
data = pd.read_csv("train.csv")
def display():
    """Show a correlation heatmap of the cleaned training data.

    The module-level ``data`` frame is cleaned, the ``Personality`` target is
    mapped to 1 (Extrovert) / 0 (Introvert), the ``id`` column is dropped, and
    the pairwise correlations of the remaining columns are drawn with seaborn.
    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Hand clean_data a copy so the shared `data` frame is never modified,
    # regardless of whether clean_data works in place. Cleaning the same frame
    # twice would re-map already-numeric 1/0 values and produce NaN.
    d = clean_data(data.copy())
    d['Personality'] = d['Personality'].map({'Extrovert': 1, 'Introvert': 0})
    # `id` is only a row identifier, so keep it out of the correlation matrix.
    d = d.drop(['id'], axis=1)
    plt.figure(figsize=(15, 10))
    sns.heatmap(d.corr(), cmap="YlGnBu", annot=True, fmt=".2f", linewidths=0.5)
    plt.show()
第二步:对数据进行清洗
def clean_data(df):
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Cleaning steps:
      1. Missing values in the numeric columns are replaced by that column's
         median (computed on *df* itself).
      2. Missing values in ``Stage_fear`` and ``Drained_after_socializing``
         are treated as ``'No'``.
      3. Those two columns are then encoded as ``'Yes' -> 1`` / ``'No' -> 0``.

    Args:
        df: DataFrame containing at least the seven columns named below.

    Returns:
        A new DataFrame with the same columns as *df*, cleaned as described.
    """
    numeric_cols = ['Time_spent_Alone', 'Social_event_attendance', 'Going_outside',
                    'Friends_circle_size', 'Post_frequency']
    yes_no_cols = ['Stage_fear', 'Drained_after_socializing']

    # Work on a copy: mutating the caller's frame makes a second clean_data()
    # call on the same object map 1/0 through {'Yes': 1, 'No': 0}, which turns
    # every value into NaN.
    cleaned = df.copy()

    # 1. Numeric columns: fill gaps with the column median.
    for col in numeric_cols:
        cleaned[col] = cleaned[col].fillna(cleaned[col].median())

    # 2 + 3. Yes/No columns: default missing answers to 'No', then encode.
    for col in yes_no_cols:
        cleaned[col] = cleaned[col].fillna('No').map({'Yes': 1, 'No': 0})

    return cleaned
第三步:使用清洗后的数据进行模型训练
def train_model():
    """Train, persist and evaluate a logistic-regression personality classifier.

    Steps:
      1. Clean a copy of the module-level ``data`` frame with ``clean_data``.
      2. Label-encode the ``Personality`` target.
      3. Hold out 20% of the rows as a test split (``random_state=42``).
      4. Fit ``LogisticRegression(max_iter=1000, random_state=42)``.
      5. Save the model to ``personality_model.pkl`` and the label encoder to
         ``label_encoder.pkl``.
      6. Print train/test accuracy, a classification report and the model
         coefficients ordered by absolute value.

    Nothing is returned; results are printed and written to disk.
    """
    # Hand clean_data a copy so the shared `data` frame is never modified,
    # regardless of whether clean_data works in place. Re-cleaning an already
    # cleaned frame would turn the Yes/No columns into NaN and break fitting.
    df = clean_data(data.copy())
    print(df.head())

    # Encode the target labels as integers; `le` is saved so predictions can
    # be translated back to the original label strings later.
    le = LabelEncoder()
    df['Personality'] = le.fit_transform(df['Personality'])

    # Features are every column except the target.
    # NOTE(review): this still includes the `id` column, which predict() also
    # feeds to the model. Dropping it would have to be done in both places.
    X = df.drop(['Personality'], axis=1)
    y = df['Personality']

    # Train / test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Features are used unscaled; no standardisation step is applied.
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    # Accuracies recorded in the original write-up for reference:
    #   train 0.9688912882110804, test 0.9686909581646423
    # A StackingClassifier (DecisionTreeClassifier + LogisticRegression base
    # estimators, LogisticRegression final estimator) was sketched as an
    # alternative model in the original but left disabled.

    # Persist the fitted model and the label encoder for load_model().
    joblib.dump(model, 'personality_model.pkl')
    joblib.dump(le, 'label_encoder.pkl')

    # Evaluate on both splits.
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    print("训练集准确率:", accuracy_score(y_train, train_preds))
    print("测试集准确率:", accuracy_score(y_test, test_preds))
    print("\n分类报告:\n", classification_report(y_test, test_preds, target_names=le.classes_))

    # Rank features by coefficient magnitude; the helper column used for
    # sorting is dropped again before printing.
    features = X.columns
    coefficients = pd.DataFrame({'特征': features, '系数': model.coef_[0]})
    coefficients['绝对值'] = np.abs(coefficients['系数'])
    coefficients = coefficients.sort_values('绝对值', ascending=False)
    print("\n特征重要性:")
    print(coefficients.drop('绝对值', axis=1))
第四步:加载模型,并进行预测
def load_model():
    """Read the persisted classifier and label encoder back from disk.

    Returns:
        A ``(model, le)`` tuple loaded, in that order, from
        ``personality_model.pkl`` and ``label_encoder.pkl``.
    """
    artifact_paths = ('personality_model.pkl', 'label_encoder.pkl')
    classifier, encoder = (joblib.load(path) for path in artifact_paths)
    return classifier, encoder
def predict():
    """Predict personalities for ``test.csv`` and save them to CSV.

    Loads the persisted model and label encoder via ``load_model``, cleans
    ``test.csv`` with ``clean_data``, predicts a label for every row and
    writes the ``id`` / ``Personality_Predicted`` pairs to
    ``personality_predictions.csv`` (without the index). Nothing is returned.
    """
    model, le = load_model()

    # Read and clean the input data.
    input_df = pd.read_csv("test.csv")
    input_df = clean_data(input_df)
    print(input_df.head())

    # When the estimator recorded its training column names, select exactly
    # those columns in the same order. Extra or reordered columns in the input
    # then cannot cause a feature-name mismatch. Otherwise fall back to
    # passing the whole frame.
    fitted_cols = getattr(model, 'feature_names_in_', None)
    features = input_df if fitted_cols is None else input_df[list(fitted_cols)]

    predictions = model.predict(features)
    print(predictions)

    # Translate the integer classes back to the original label strings.
    input_df['Personality_Predicted'] = le.inverse_transform(predictions)
    results = input_df[['id', 'Personality_Predicted']]
    results.to_csv('personality_predictions.csv', index=False)
最终得出的各项指标为:
训练集准确率: 0.9687563263378096
测试集准确率: 0.968421052631579
分类报告:
              precision    recall  f1-score   support

   Extrovert       0.98      0.98      0.98      2753
   Introvert       0.95      0.93      0.94       952

    accuracy                           0.97      3705
   macro avg       0.96      0.96      0.96      3705
weighted avg       0.97      0.97      0.97      3705