1. Data splitting
from sklearn.model_selection import train_test_split
label = train_data['type'].values
train = train_data.drop(['type'], axis=1)
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)  # fix the seed so the split is reproducible
print(train_X.shape)
print(valid_X.shape)
print(train_y.shape)
print(valid_y.shape)
# (259, 10)
# (112, 10)
# (259,)
# (112,)
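The split above is a plain random 70/30 cut; for a 3-class problem it can also be worth stratifying, so both sets keep the class proportions of the full data. A minimal sketch, reusing the same train and label as above:

from sklearn.model_selection import train_test_split

# Stratified variant: each class appears in train/valid in the same
# proportion as in the full dataset.
train_X, valid_X, train_y, valid_y = train_test_split(
    train, label, test_size=0.3, random_state=10, stratify=label)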
2. Classification models
Logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
seed = 10
lr = LogisticRegression(random_state=seed,
                        C=30,
                        class_weight='balanced',
                        solver='newton-cg',
                        multi_class='multinomial')
lr = lr.fit(train_X, train_y)
lr_pred = lr.predict(valid_X)
print('confusion_matrix--:')
print(confusion_matrix(valid_y, lr_pred))
print('Precision---------:', precision_score(valid_y, lr_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, lr_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, lr_pred, average='micro'))
Output:
confusion_matrix--:
[[24 5 6]
[ 5 33 0]
[ 8 2 29]]
Precision---------: 0.7678571428571429
Recall------------: 0.7678571428571429
F1_score----------: 0.7678571428571429
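One thing worth knowing about these numbers: with average='micro' on a multiclass task, precision, recall, and F1 all reduce to plain accuracy (total correct over total samples), which is why the three values are always identical in this post. A quick sanity check against the confusion matrix above:

import numpy as np

cm = confusion_matrix(valid_y, lr_pred)
# micro precision = micro recall = micro F1 = accuracy = trace / total
print(np.trace(cm) / cm.sum())   # 86 / 112 = 0.7678571428571429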
KNN model
from sklearn.neighbors import KNeighborsClassifier
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(train_X, train_y)
knn_pred = knn.predict(valid_X)
print('confusion_matrix--:')
print(confusion_matrix(valid_y, knn_pred))
print('Precision---------:', precision_score(valid_y, knn_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, knn_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, knn_pred, average='micro'))
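KNN is distance-based, so features on larger scales dominate the neighbor search. A hedged variant that standardizes the inputs first (same split as above; whether it helps depends on this dataset's feature scales):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize each feature, then run KNN in the scaled space.
knn_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
knn_scaled.fit(train_X, train_y)
print(f1_score(valid_y, knn_scaled.predict(valid_X), average='micro'))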
SVM model
from sklearn.svm import SVC
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
svc = SVC(C=3, kernel='rbf', random_state=10)
svc = svc.fit(train_X, train_y)
svc_pred = svc.predict(valid_X)
print('confusion_matrix--:')
print(confusion_matrix(valid_y, svc_pred))
print('Precision---------:', precision_score(valid_y, svc_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, svc_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, svc_pred, average='micro'))
confusion_matrix--:
[[24 5 6]
 [ 7 31 0]
 [ 8 3 28]]
Precision---------: 0.7410714285714286
Recall------------: 0.7410714285714286
F1_score----------: 0.7410714285714286
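The C=3 / RBF combination here is hand-picked; a small grid search over C and gamma is one standard way to tune it. A sketch on the training split (the grid values are illustrative, not tuned for this data):

from sklearn.model_selection import GridSearchCV

# 5-fold CV over a small C/gamma grid; refits the best combination.
param_grid = {'C': [0.3, 1, 3, 10, 30], 'gamma': ['scale', 0.01, 0.1]}
gs = GridSearchCV(SVC(kernel='rbf', random_state=10), param_grid, cv=5)
gs.fit(train_X, train_y)
print(gs.best_params_, gs.best_score_)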
Random forest model
from sklearn.ensemble import RandomForestClassifier as RFC
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
rfc = RFC(n_estimators=150, max_depth=4, random_state=10)
rfc = rfc.fit(train_X, train_y)
rfc_pred = rfc.predict(valid_X)
print('confusion_matrix--:')
print(confusion_matrix(valid_y, rfc_pred))
print('Precision---------:', precision_score(valid_y, rfc_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, rfc_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, rfc_pred, average='micro'))
confusion_matrix--:
[[25 5 5]
 [ 7 31 0]
 [ 8 2 29]]
Precision---------: 0.7589285714285714
Recall------------: 0.7589285714285714
F1_score----------: 0.7589285714285714
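A fitted random forest also exposes impurity-based feature importances, useful for seeing which of the 10 columns drive the predictions. A short sketch (assuming train is still the feature DataFrame):

import pandas as pd

# One importance value per input column; the values sum to 1.
imp = pd.Series(rfc.feature_importances_, index=train.columns)
print(imp.sort_values(ascending=False))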
LightGBM model
import lightgbm as lgb
import numpy as np
params = {'boosting_type': 'gbdt',
          'num_leaves': 60,
          'min_data_in_leaf': 30,
          'objective': 'multiclass',
          'num_class': 3,
          'max_depth': -1,
          'learning_rate': 0.06,
          'min_sum_hessian_in_leaf': 6,
          'feature_fraction': 0.9,
          'bagging_freq': 1,
          'bagging_fraction': 0.8,
          'bagging_seed': 11,
          'lambda_l1': 0.4,
          'lambda_l2': 0.5,
          'verbosity': -1,
          'metric': 'multi_logloss',
          'random_state': 2022,
          }
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
tr_data = lgb.Dataset(train_X, label=train_y)
val_data = lgb.Dataset(valid_X, label=valid_y)
num_round = 1000
bst = lgb.train(params,
                tr_data,
                num_round,
                valid_sets=[tr_data, val_data],
                # callbacks replace the deprecated verbose_eval / early_stopping_rounds args
                callbacks=[lgb.log_evaluation(100),
                           lgb.early_stopping(200)])
y_pred = bst.predict(valid_X, num_iteration=bst.best_iteration)
lgb_pred = np.argmax(y_pred, axis=1)   # class with the highest predicted probability
print('confusion_matrix--:')
print(confusion_matrix(valid_y, lgb_pred))
print('Precision---------:', precision_score(valid_y, lgb_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, lgb_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, lgb_pred, average='micro'))
Training until validation scores don't improve for 200 rounds
[100] training's multi_logloss: 0.388456 valid_1's multi_logloss: 0.598804
[200] training's multi_logloss: 0.316302 valid_1's multi_logloss: 0.614812
Early stopping, best iteration is:
[82] training's multi_logloss: 0.408648 valid_1's multi_logloss: 0.589863
confusion_matrix--:
[[26 5 4]
[ 6 32 0]
[11 2 26]]
Precision---------: 0.75
Recall------------: 0.75
F1_score----------: 0.75
NN-TensorFlow model
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers
seed = 10
print('seed----------------:', seed)
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
X_train_NN = (train_X.values).astype('float32')
y_train_NN = train_y.astype('int32')
X_valid_NN = (valid_X.values).astype('float32')
y_valid_NN = valid_y.astype('int32')
one_hot_train_y = to_categorical(train_y)
one_hot_valid_y = to_categorical(valid_y)
np.random.seed(seed)
tf.random.set_seed(seed)
n_features = X_train_NN.shape[1]
b_size = 500    # larger than the 259 training rows, so each epoch runs as a single batch
max_epochs = 200
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(n_features,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
print(model.summary())
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
h = model.fit(X_train_NN,
              one_hot_train_y,
              batch_size=b_size,
              epochs=max_epochs,
              shuffle=True,
              verbose=1)
nn_pred = model.predict(X_valid_NN)
NN_pred = np.argmax(nn_pred, axis=1)   # index of the highest-probability class per sample
print('confusion_matrix--:')
print(confusion_matrix(y_valid_NN, NN_pred))
print('Precision---------:', precision_score(y_valid_NN, NN_pred, average='micro'))
print('Recall------------:', recall_score(y_valid_NN, NN_pred, average='micro'))
print('F1_score----------:', f1_score(y_valid_NN, NN_pred, average='micro'))
seed----------------: 10
Model: "sequential_103"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense_344 (Dense)            (None, 64)                704
dense_345 (Dense)            (None, 64)                4160
dense_346 (Dense)            (None, 3)                 195
=================================================================
Total params: 5,059
Trainable params: 5,059
Non-trainable params: 0
_________________________________________________________________
confusion_matrix--:
[[26 3 6]
 [ 5 33 0]
 [ 6 0 33]]
Precision---------: 0.8214285714285714
Recall------------: 0.8214285714285714
F1_score----------: 0.8214285714285714
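The fit call above never looks at the validation set while training, so there is no early-stopping signal. An optional variant that monitors validation loss with a Keras callback (the patience value is an arbitrary choice here):

from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss stops improving and roll back to the best weights.
es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
h = model.fit(X_train_NN, one_hot_train_y,
              validation_data=(X_valid_NN, one_hot_valid_y),
              batch_size=b_size, epochs=max_epochs,
              shuffle=True, verbose=0, callbacks=[es])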
NN-sklearn model
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
seed = 10
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=seed)
mlp = MLPClassifier(hidden_layer_sizes=(128, 30),
                    activation='relu',
                    solver='adam',
                    max_iter=200,
                    random_state=seed)
X_train_NN = (train_X.values).astype('float32')
y_train_NN = train_y.astype('int32')
X_valid_NN = (valid_X.values).astype('float32')
y_valid_NN = valid_y.astype('int32')
mlp = mlp.fit(X_train_NN, y_train_NN)
y_pred = mlp.predict(X_valid_NN)
print('confusion_matrix--:')
print(confusion_matrix(y_valid_NN, y_pred))
print('Precision---------:', precision_score(y_valid_NN, y_pred, average='micro'))
print('Recall------------:', recall_score(y_valid_NN, y_pred, average='micro'))
print('F1_score----------:', f1_score(y_valid_NN, y_pred, average='micro'))
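Like KNN and SVM, MLPClassifier is sensitive to feature scale, and the scikit-learn docs recommend standardizing its inputs. A sketch of the same model behind a scaler (whether it beats the raw-feature run depends on the data):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize inputs before the MLP; usually helps adam converge.
mlp_scaled = make_pipeline(StandardScaler(),
                           MLPClassifier(hidden_layer_sizes=(128, 30),
                                         max_iter=200, random_state=seed))
mlp_scaled.fit(X_train_NN, y_train_NN)
print(f1_score(y_valid_NN, mlp_scaled.predict(X_valid_NN), average='micro'))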
3. Automatically finding the best model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
import lightgbm as lgb
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers
seed = 10
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=seed)
def LR():
    lr = LogisticRegression(random_state=seed,
                            C=30,
                            class_weight='balanced',
                            solver='newton-cg',
                            multi_class='multinomial')
    lr = lr.fit(train_X, train_y)
    lr_pred = lr.predict(valid_X)
    return [precision_score(valid_y, lr_pred, average='micro'),
            recall_score(valid_y, lr_pred, average='micro'),
            f1_score(valid_y, lr_pred, average='micro')]
def KNN():
    knn = KNeighborsClassifier(n_neighbors=3)
    knn = knn.fit(train_X, train_y)
    knn_pred = knn.predict(valid_X)
    return [precision_score(valid_y, knn_pred, average='micro'),
            recall_score(valid_y, knn_pred, average='micro'),
            f1_score(valid_y, knn_pred, average='micro')]
def SVM():
    svc = SVC(C=3, kernel='rbf', random_state=seed)
    svc = svc.fit(train_X, train_y)
    svc_pred = svc.predict(valid_X)
    return [precision_score(valid_y, svc_pred, average='micro'),
            recall_score(valid_y, svc_pred, average='micro'),
            f1_score(valid_y, svc_pred, average='micro')]
def rfc():
    rf = RFC(n_estimators=150, max_depth=4, random_state=seed)
    rf = rf.fit(train_X, train_y)
    rfc_pred = rf.predict(valid_X)
    return [precision_score(valid_y, rfc_pred, average='micro'),
            recall_score(valid_y, rfc_pred, average='micro'),
            f1_score(valid_y, rfc_pred, average='micro')]
def lgb_model():
    params = {'boosting_type': 'gbdt',
              'num_leaves': 60,
              'min_data_in_leaf': 30,
              'objective': 'multiclass',
              'num_class': 3,
              'max_depth': -1,
              'learning_rate': 0.06,
              'min_sum_hessian_in_leaf': 6,
              'feature_fraction': 0.9,
              'bagging_freq': 1,
              'bagging_fraction': 0.8,
              'bagging_seed': 11,
              'lambda_l1': 0.4,
              'lambda_l2': 0.5,
              'verbosity': -1,
              'metric': 'multi_logloss',
              'random_state': 2022,
              }
    tr_data = lgb.Dataset(train_X, label=train_y)
    val_data = lgb.Dataset(valid_X, label=valid_y)
    num_round = 1000
    bst = lgb.train(params,
                    tr_data,
                    num_round,
                    valid_sets=[tr_data, val_data],
                    callbacks=[lgb.log_evaluation(100),
                               lgb.early_stopping(200)])
    y_pred = bst.predict(valid_X, num_iteration=bst.best_iteration)
    lgb_pred = np.argmax(y_pred, axis=1)
    return [precision_score(valid_y, lgb_pred, average='micro'),
            recall_score(valid_y, lgb_pred, average='micro'),
            f1_score(valid_y, lgb_pred, average='micro')]
def NN():
    X_train_NN = (train_X.values).astype('float32')
    y_train_NN = train_y.astype('int32')
    X_valid_NN = (valid_X.values).astype('float32')
    y_valid_NN = valid_y.astype('int32')
    one_hot_train_y = to_categorical(train_y)
    one_hot_valid_y = to_categorical(valid_y)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    n_features = X_train_NN.shape[1]
    b_size = 500
    max_epochs = 200
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(n_features,)))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(3, activation='softmax'))
    print(model.summary())
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    h = model.fit(X_train_NN,
                  one_hot_train_y,
                  batch_size=b_size,
                  epochs=max_epochs,
                  shuffle=True,
                  verbose=1)
    nn_pred = model.predict(X_valid_NN)
    NN_pred = np.argmax(nn_pred, axis=1)
    return [precision_score(valid_y, NN_pred, average='micro'),
            recall_score(valid_y, NN_pred, average='micro'),
            f1_score(valid_y, NN_pred, average='micro')]
model_dict = {'0': 'LR()',
              '1': 'KNN()',
              '2': 'SVM()',
              '3': 'rfc()',
              '4': 'lgb_model()',
              '5': 'NN()'}
MODELS = [LR(), KNN(), SVM(), rfc(), lgb_model(), NN()]
Pre = []
Recall = []
F1_score = []
for scores in MODELS:          # each entry is [precision, recall, f1]
    Pre.append(scores[0])
    Recall.append(scores[1])
    F1_score.append(scores[2])
Pre_index = np.argmax(Pre)
Recall_index = np.argmax(Recall)
F1_score_index = np.argmax(F1_score)
print('Best model by Pre------------:', model_dict[str(Pre_index)])
print('Best model by Recall---------:', model_dict[str(Recall_index)])
print('Best model by F1_score-------:', model_dict[str(F1_score_index)])
Training until validation scores don't improve for 200 rounds
[100] training's multi_logloss: 0.388456 valid_1's multi_logloss: 0.598804
[200] training's multi_logloss: 0.316302 valid_1's multi_logloss: 0.614812
Early stopping, best iteration is:
[82] training's multi_logloss: 0.408648 valid_1's multi_logloss: 0.589863
Model: "sequential_108"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
dense_359 (Dense)            (None, 64)                704
dense_360 (Dense)            (None, 64)                4160
dense_361 (Dense)            (None, 3)                 195
=================================================================
Total params: 5,059
Trainable params: 5,059
Non-trainable params: 0
_________________________________________________________________
Best model by Pre------------: NN()
Best model by Recall---------: NN()
Best model by F1_score-------: NN()
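The string-keyed model_dict is easy to get out of sync with the MODELS list. A tidier refactor of the same selection logic keeps names and functions paired in one dict (note that each fn() call retrains its model):

# Hypothetical refactor: one dict maps a display name to its training function.
model_fns = {'LR': LR, 'KNN': KNN, 'SVM': SVM,
             'RF': rfc, 'LightGBM': lgb_model, 'NN': NN}
scores = {name: fn() for name, fn in model_fns.items()}   # [precision, recall, f1]
for i, metric in enumerate(['Precision', 'Recall', 'F1_score']):
    best = max(scores, key=lambda name: scores[name][i])
    print(f'Best model by {metric}: {best}')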