1. Load the data:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Read the heart-disease dataset and separate the features from the target column
df_heart = pd.read_csv("heart.csv")
X = df_heart.drop(columns=["target"])
y = df_heart["target"].values  # keep the target as a 1-D array so the estimators fit without shape warnings
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale every feature to [0, 1]; fit the scaler on the training set only
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X_train.shape, X_test.shape)
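Before fitting any model it is worth checking how balanced the two classes are, since accuracy alone is misleading on skewed data. A minimal check, assuming the label column is named target as above:
# Share of each class in the target column
print(df_heart["target"].value_counts(normalize=True))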
2. Classification with the K-nearest neighbors algorithm:
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier

# K-nearest neighbors classifier: try K = 1..14 and record accuracy and F1 for each
f1_score_list = []
acc_score_list = []
for i in range(1, 15):
    KNN = KNeighborsClassifier(n_neighbors=i, algorithm='auto', n_jobs=-1)
    KNN.fit(X_train, y_train)
    acc_score_list.append(KNN.score(X_test, y_test))
    y_pred = KNN.predict(X_test)
    f1_score_list.append(f1_score(y_test, y_pred))
With the accuracy and F1 score saved for every K value, the optimal K can be found by plotting the two curves:
index = np.arange(1, 15, 1)
print(max(f1_score_list))
plt.plot(index, acc_score_list, label='Accuracy', marker='o', c='blue', linestyle='solid')
plt.plot(index, f1_score_list, label='F1 Score', marker='o', c='red', linestyle='dashed')
plt.xlabel('K Value')
plt.ylabel('Score')
plt.legend()
plt.grid(False)  # pass a boolean, not the string 'False'
plt.show()
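Besides reading the best K off the plot, it can also be selected programmatically. A minimal sketch that picks the K with the highest F1 score on the test set and refits the classifier (the names best_k and best_knn are illustrative, not part of the original code):
# Index 0 of the score lists corresponds to K = 1
best_k = int(np.argmax(f1_score_list)) + 1
best_knn = KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1)
best_knn.fit(X_train, y_train)
print(best_k, best_knn.score(X_test, y_test))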
3. Support vector machine
from sklearn.svm import SVC

# Support vector machine with the default RBF kernel
svm = SVC(random_state=1)
svm.fit(X_train, y_train)
svm_y_pred = svm.predict(X_test)
print(svm.score(X_test, y_test) * 100)
print(f1_score(y_test, svm_y_pred) * 100)
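The default SVC uses an RBF kernel with C=1.0; its two main hyperparameters, C and gamma, can be tuned with a grid search just like the random forest below. A minimal sketch with an illustrative grid (the specific values are assumptions, not taken from the original):
from sklearn.model_selection import GridSearchCV

# Small illustrative grid over the RBF-kernel hyperparameters
svm_param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.01, 0.1]}
svm_gs = GridSearchCV(SVC(random_state=1), svm_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
svm_gs.fit(X_train, y_train)
print(svm_gs.best_params_, svm_gs.best_score_)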
4. Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_y_pred = nb.predict(X_test)
print(nb.score(X_test, y_test) * 100)
print(f1_score(y_test, nb_y_pred) * 100)
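Accuracy and F1 can hide where a model goes wrong; a confusion matrix and a per-class report make the errors visible. A minimal sketch, applied here to the naive Bayes predictions as an example:
from sklearn.metrics import classification_report, confusion_matrix

# Raw error counts and per-class precision/recall/F1 for the naive Bayes model
print(confusion_matrix(y_test, nb_y_pred))
print(classification_report(y_test, nb_y_pred))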
5. Random forest
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
rf_y_pred = rf.predict(X_test)
print(rf.score(X_test, y_test) * 100)
print(f1_score(y_test, rf_y_pred) * 100)
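A fitted random forest also exposes impurity-based feature importances, which show which columns of heart.csv drive the predictions. A minimal sketch (X is assumed to still be the original feature DataFrame, so its column names line up with the scaled training matrix):
# Rank features by the importance the forest assigns to them
importances = pd.Series(rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False))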
6. Optimize the random forest hyperparameters with grid search
# Grid-search hyperparameter optimization with stratified 10-fold cross-validation
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

kfholder = StratifiedKFold(n_splits=10)
rf_param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [False],
    'criterion': ['gini'],
    'max_features': ['sqrt', 'log2']  # 'auto' is no longer accepted by recent scikit-learn; for classifiers it meant 'sqrt'
}
rf_gs = GridSearchCV(rf, rf_param_grid, cv=kfholder, scoring='accuracy', n_jobs=10, verbose=1)
rf_gs.fit(X_train, y_train)
y_hat_rf_gs = rf_gs.predict(X_test)
print(accuracy_score(y_test, y_hat_rf_gs) * 100)
print(f1_score(y_test, y_hat_rf_gs) * 100)
# Print the best parameters
print(rf_gs.best_params_)
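Because GridSearchCV refits the best parameter combination on the whole training set by default (refit=True), the tuned model is available directly as rf_gs.best_estimator_ and can be reused without another fit. A minimal usage sketch:
best_rf = rf_gs.best_estimator_
print(best_rf.score(X_test, y_test) * 100)  # test accuracy of the refitted best model
print(rf_gs.best_score_)                    # mean cross-validated accuracy of the best parameters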