三分类问题--ML模型

265 阅读4分钟

1. 数据切分

from sklearn.model_selection import train_test_split

# Separate the target column from the feature matrix.
label = train_data['type'].values
train = train_data.drop(['type'], axis=1)

# Hold out 30% of the rows for validation.
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3)

for part in (train_X, valid_X, train_y, valid_y):
    print(part.shape)
# (259, 10)
# (112, 10)
# (259,)
# (112,)

2. 分类模型

Logistic model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Multinomial logistic regression; balanced class weights compensate
# for any class imbalance in the training split.
seed = 10
lr = LogisticRegression(random_state=seed,
                        C=30,
                        class_weight='balanced',
                        solver='newton-cg',
                        multi_class='multinomial')
lr.fit(train_X, train_y)
lr_pred = lr.predict(valid_X)

# Micro-averaged metrics on the validation split.
print('confusion_matrix--:')
print(confusion_matrix(valid_y, lr_pred))
for name, metric in (('Precision---------:', precision_score),
                     ('Recall------------:', recall_score),
                     ('F1_score----------:', f1_score)):
    print(name, metric(valid_y, lr_pred, average='micro'))

输出:
confusion_matrix--:
[[24  5  6]
 [ 5 33  0]
 [ 8  2 29]]
Precision---------: 0.7678571428571429
Recall------------: 0.7678571428571429
F1_score----------: 0.7678571428571429

KNN model

from sklearn.neighbors import KNeighborsClassifier

# Re-split with a fixed random_state so results are reproducible.
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)

# BUG FIX: the original immediately overwrote `knn` with
# `knn = lr.fit(train_X, train_y)` (the logistic-regression model from the
# previous section), so `knn_pred` was actually produced by LR — which is
# why the reported "KNN" metrics were identical to the LR ones.
# The stray line is removed so KNN itself is evaluated.
knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(train_X, train_y)
knn_pred = knn.predict(valid_X)

print('confusion_matrix--:')
print(confusion_matrix(valid_y, knn_pred))
print('Precision---------:', precision_score(valid_y, knn_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, knn_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, knn_pred, average='micro'))
confusion_matrix--:
[[24  5  6]
 [ 5 33  0]
 [ 8  2 29]]
Precision---------: 0.7678571428571429
Recall------------: 0.7678571428571429
F1_score----------: 0.7678571428571429

SVM model

from sklearn.svm import SVC

# Fixed-seed split, then fit an RBF-kernel support vector classifier.
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
svc = SVC(C=3, kernel='rbf', random_state=10)
svc.fit(train_X, train_y)
svc_pred = svc.predict(valid_X)

# Micro-averaged validation metrics.
print('confusion_matrix--:')
print(confusion_matrix(valid_y, svc_pred))
for name, metric in (('Precision---------:', precision_score),
                     ('Recall------------:', recall_score),
                     ('F1_score----------:', f1_score)):
    print(name, metric(valid_y, svc_pred, average='micro'))
confusion_matrix--:
[[24  5  6]
 [ 7 31  0]
 [ 8  3 28]]
Precision---------: 0.7410714285714286
Recall------------: 0.7410714285714286
F1_score----------: 0.7410714285714286

随机森林

from sklearn.ensemble import RandomForestClassifier as RFC

# Fixed-seed split, then a shallow random forest (150 trees, depth 4).
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
rfc = RFC(n_estimators=150, max_depth=4, random_state=10)
rfc.fit(train_X, train_y)
rfc_pred = rfc.predict(valid_X)

# Micro-averaged validation metrics.
print('confusion_matrix--:')
print(confusion_matrix(valid_y, rfc_pred))
for name, metric in (('Precision---------:', precision_score),
                     ('Recall------------:', recall_score),
                     ('F1_score----------:', f1_score)):
    print(name, metric(valid_y, rfc_pred, average='micro'))
confusion_matrix--:
[[25  5  5]
 [ 7 31  0]
 [ 8  2 29]]
Precision---------: 0.7589285714285714
Recall------------: 0.7589285714285714
F1_score----------: 0.7589285714285714

LightGBM model

import lightgbm as lgb

# Multiclass LightGBM configuration (3 classes, multi_logloss metric).
params = {'boosting_type':'gbdt',
          'num_leaves': 60, 
          'min_data_in_leaf': 30,
          'objective': 'multiclass',
          'num_class': 3,
          'max_depth': -1,
          'learning_rate': 0.06,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.4, 
          "lambda_l2": 0.5,
          "verbosity": -1,
          'metric': 'multi_logloss',
          "random_state": 2022, 
          }
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)
tr_data = lgb.Dataset(train_X, label=train_y)
val_data = lgb.Dataset(valid_X, label=valid_y)
num_round = 1000
# FIX: the original assigned the trained Booster to `lgb`, shadowing the
# lightgbm module; use a distinct name so `lgb` stays the module.
booster = lgb.train(params,
                    tr_data,
                    num_round,
                    valid_sets=[tr_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds=200)
y_pred = booster.predict(valid_X, num_iteration=booster.best_iteration)
# FIX: the original fused the print statement onto this line
# (`... for x in y_pred]print(...)`), a syntax error. Pick the arg-max
# class from the per-class probability rows.
lgb_pred = [list(x).index(max(x)) for x in y_pred]

print('confusion_matrix--:')
print(confusion_matrix(valid_y, lgb_pred))
print('Precision---------:', precision_score(valid_y, lgb_pred, average='micro'))
print('Recall------------:', recall_score(valid_y, lgb_pred, average='micro'))
print('F1_score----------:', f1_score(valid_y, lgb_pred, average='micro'))
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Training until validation scores don't improve for 200 rounds
[100]   training's multi_logloss: 0.388456  valid_1's multi_logloss: 0.598804
[200]   training's multi_logloss: 0.316302  valid_1's multi_logloss: 0.614812
Early stopping, best iteration is:
[82]    training's multi_logloss: 0.408648  valid_1's multi_logloss: 0.589863
confusion_matrix--:
[[26  5  4]
 [ 6 32  0]
 [11  2 26]]
Precision---------: 0.75
Recall------------: 0.75
F1_score----------: 0.75
D:\Anaconda\lib\site-packages\lightgbm\engine.py:181: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.
  _log_warning("'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. "
D:\Anaconda\lib\site-packages\lightgbm\engine.py:239: UserWarning: 'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.
  _log_warning("'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. "

NN-tensorflow

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers

seed = 10
print('seed----------------:', seed)
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=10)

# Keras expects float32 feature matrices; labels become one-hot vectors.
X_train_NN = (train_X.values).astype('float32')
y_train_NN = train_y.astype('int32')
X_valid_NN = (valid_X.values).astype('float32')
y_valid_NN = valid_y.astype('int32')

one_hot_train_y = to_categorical(train_y)
one_hot_valid_y = to_categorical(valid_y)

#----------------------------------------- build the network
# Fix both numpy and TF seeds for reproducible weight init / shuffling.
# NOTE(review): assumes numpy is imported as `np` earlier in the file — confirm.
np.random.seed(seed)
tf.random.set_seed(seed)

input_shape = X_train_NN.shape[1]
b_size = 500
max_epochs = 200
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
print(model.summary())
#------------------------------------------ train the model
model.compile(optimizer='rmsprop', 
              loss="categorical_crossentropy", 
              metrics=['accuracy'])

h = model.fit(X_train_NN, 
              one_hot_train_y, 
              batch_size=b_size, 
              epochs=max_epochs, 
              shuffle=True, 
              verbose=1)
nn_pred = model.predict(X_valid_NN)

# Arg-max over the softmax outputs gives the predicted class index.
NN_pred = [int(np.argmax(v)) for v in nn_pred]

print('confusion_matrix--:')
# FIX: sklearn's confusion_matrix takes (y_true, y_pred); the original
# passed (NN_pred, y_valid_NN), printing the transposed matrix and
# disagreeing with every other snippet in this article.
print(confusion_matrix(y_valid_NN, NN_pred))
print('Precision---------:', precision_score(y_valid_NN, NN_pred, average='micro'))
print('Recall------------:', recall_score(y_valid_NN, NN_pred, average='micro'))
print('F1_score----------:', f1_score(y_valid_NN, NN_pred, average='micro'))
seed----------------: 10
Model: "sequential_103"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_344 (Dense)           (None, 64)                704       
                                                                 
 dense_345 (Dense)           (None, 64)                4160      
                                                                 
 dense_346 (Dense)           (None, 3)                 195       
                                                                 
=================================================================
Total params: 5,059
Trainable params: 5,059
Non-trainable params: 0
_________________________________________________________________
​
confusion_matrix--:
[[26  5  6]
 [ 3 33  0]
 [ 6  0 33]]
Precision---------: 0.8214285714285714
Recall------------: 0.8214285714285714
F1_score----------: 0.8214285714285714

NN-sklearn库

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

# Two-hidden-layer MLP (128 and 30 units) trained with Adam.
seed = 10
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=seed)
mlp = MLPClassifier(hidden_layer_sizes=(128, 30), 
                    activation='relu', 
                    solver='adam', 
                    max_iter=200, 
                    random_state=seed)

X_train_NN = (train_X.values).astype('float32')
y_train_NN = train_y.astype('int32')
X_valid_NN = (valid_X.values).astype('float32')
y_valid_NN = valid_y.astype('int32')

mlp = mlp.fit(X_train_NN, y_train_NN)
y_pred = mlp.predict(X_valid_NN) 

print('confusion_matrix--:')
# FIX: sklearn's confusion_matrix takes (y_true, y_pred); the original
# passed them swapped, printing the transposed matrix.
print(confusion_matrix(y_valid_NN, y_pred))
print('Precision---------:', precision_score(y_valid_NN, y_pred, average='micro'))
print('Recall------------:', recall_score(y_valid_NN, y_pred, average='micro'))
print('F1_score----------:', f1_score(y_valid_NN, y_pred, average='micro'))

3. 自动化寻找最佳模型

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RFC
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import models
from tensorflow.keras import layers
​
​
# Single shared hold-out split: every model function below reads these
# module-level arrays (train_X, valid_X, train_y, valid_y).
seed = 10
train_X, valid_X, train_y, valid_y = train_test_split(train, label, test_size=0.3, random_state=seed)
​
def LR():
    """Fit a multinomial logistic regression on the shared split and
    return micro-averaged [precision, recall, f1] on the validation set."""
    clf = LogisticRegression(random_state=10,
                             C=30,
                             class_weight='balanced',
                             solver='newton-cg',
                             multi_class='multinomial')
    clf.fit(train_X, train_y)
    pred = clf.predict(valid_X)

    return [metric(valid_y, pred, average='micro')
            for metric in (precision_score, recall_score, f1_score)]
​
def KNN():
    """Fit a 3-nearest-neighbours classifier on the shared split and
    return micro-averaged [precision, recall, f1] on the validation set."""
    # BUG FIX: the original re-assigned `knn = lr.fit(train_X, train_y)`
    # right after fitting KNN — `lr` is a module-level LogisticRegression
    # from an earlier section — so this function actually returned the LR
    # scores. The stray line is removed so KNN itself is evaluated.
    knn = KNeighborsClassifier(n_neighbors=3)
    knn = knn.fit(train_X, train_y)
    knn_pred = knn.predict(valid_X)

    return [precision_score(valid_y, knn_pred, average='micro'),
            recall_score(valid_y, knn_pred, average='micro'), 
            f1_score(valid_y, knn_pred, average='micro')]
    
def SVM():
    """Fit an RBF-kernel SVC on the shared split and return
    micro-averaged [precision, recall, f1] on the validation set."""
    model = SVC(C=3, kernel='rbf', random_state=seed)
    model.fit(train_X, train_y)
    pred = model.predict(valid_X)

    return [metric(valid_y, pred, average='micro')
            for metric in (precision_score, recall_score, f1_score)]
​
def rfc():
    """Fit a random forest (150 trees, max depth 4) on the shared split
    and return micro-averaged [precision, recall, f1] on validation."""
    # Local renamed so it no longer shadows the function's own name.
    forest = RFC(n_estimators=150, max_depth=4, random_state=seed)
    forest.fit(train_X, train_y)
    pred = forest.predict(valid_X)

    return [metric(valid_y, pred, average='micro')
            for metric in (precision_score, recall_score, f1_score)]
    
def lgb_model():
    """Train a LightGBM multiclass booster with early stopping and return
    micro-averaged [precision, recall, f1] on the validation split."""
    params = {'boosting_type':'gbdt',
          'num_leaves': 60, 
          'min_data_in_leaf': 30,
          'objective': 'multiclass',
          'num_class': 3,
          'max_depth': -1,
          'learning_rate': 0.06,
          "min_sum_hessian_in_leaf": 6,
          "boosting": "gbdt",
          "feature_fraction": 0.9,
          "bagging_freq": 1,
          "bagging_fraction": 0.8,
          "bagging_seed": 11,
          "lambda_l1": 0.4, 
          "lambda_l2": 0.5,
          "verbosity": -1,              
          'metric': 'multi_logloss',    
          "random_state": 2022, 
          }
    tr_data = lgb.Dataset(train_X, label=train_y)
    val_data = lgb.Dataset(valid_X, label=valid_y)
    num_round = 1000
    # Up to num_round iterations; stop when the validation loss has not
    # improved for 200 rounds. Local renamed so it no longer shadows the
    # function's own name.
    booster = lgb.train(params,
                        tr_data,
                        num_round,
                        valid_sets=[tr_data, val_data],
                        verbose_eval=100,
                        early_stopping_rounds=200)
    proba = booster.predict(valid_X, num_iteration=booster.best_iteration)
    # Convert each row of class probabilities to its arg-max class index.
    lgb_pred = [list(row).index(max(row)) for row in proba]

    return [metric(valid_y, lgb_pred, average='micro')
            for metric in (precision_score, recall_score, f1_score)]
    
    
def NN():
    """Train a small Keras MLP (64-64-softmax3) on the shared split and
    return micro-averaged [precision, recall, f1] on the validation set."""
    X_tr = (train_X.values).astype('float32')
    y_tr = train_y.astype('int32')
    X_val = (valid_X.values).astype('float32')
    y_val = valid_y.astype('int32')

    oh_train = to_categorical(train_y)
    oh_valid = to_categorical(valid_y)

    #----------------------------------------- build the network
    # Fix both numpy and TF seeds before any weight initialisation.
    np.random.seed(seed)
    tf.random.set_seed(seed)

    n_features = X_tr.shape[1]
    batch = 500
    epochs = 200

    net = models.Sequential()
    net.add(layers.Dense(64, activation='relu', input_shape=(10,)))
    net.add(layers.Dense(64, activation='relu'))
    net.add(layers.Dense(3, activation='softmax'))
    print(net.summary())
    #------------------------------------------ train the model
    net.compile(optimizer='rmsprop', 
                loss="categorical_crossentropy", 
                metrics=['accuracy'])

    history = net.fit(X_tr, 
                      oh_train, 
                      batch_size=batch, 
                      epochs=epochs, 
                      shuffle=True, 
                      verbose=1)
    proba = net.predict(X_val)

    # Arg-max over the softmax outputs gives the predicted class index.
    preds = [np.argmax(row) for row in proba]

    return [metric(valid_y, preds, average='micro')
            for metric in (precision_score, recall_score, f1_score)]
# Index -> display name for the report at the end.
model_dict = {'0':'LR()', 
              '1':'KNN()', 
              '2':'SVM()', 
              '3':'rfc()', 
              '4':'lgb_model()', 
              '5':'NN()'}
# Evaluate every candidate once; each entry is [precision, recall, f1].
MODELS = [LR(), KNN(), SVM(), rfc(), lgb_model(), NN()]
# Transpose the score rows into one column per metric.
Pre, Recall, F1_score = (list(col) for col in zip(*MODELS))
Pre_index = np.argmax(Pre)
Recall_index = np.argmax(Recall)
F1_score_index = np.argmax(F1_score)

print('Pre最高的模型是---------:', model_dict[str(Pre_index)])
print('Recall最高的模型是---------:', model_dict[str(Recall_index)])
print('F1_score最高的模型是-------:', model_dict[str(F1_score_index)])
[LightGBM] [Warning] boosting is set=gbdt, boosting_type=gbdt will be ignored. Current value: boosting=gbdt
Training until validation scores don't improve for 200 rounds
[100]   training's multi_logloss: 0.388456  valid_1's multi_logloss: 0.598804
[200]   training's multi_logloss: 0.316302  valid_1's multi_logloss: 0.614812
Early stopping, best iteration is:
[82]    training's multi_logloss: 0.408648  valid_1's multi_logloss: 0.589863
Model: "sequential_108"
D:\Anaconda\lib\site-packages\lightgbm\engine.py:181: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.
  _log_warning("'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. "
D:\Anaconda\lib\site-packages\lightgbm\engine.py:239: UserWarning: 'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.
  _log_warning("'verbose_eval' argument is deprecated and will be removed in a future release of LightGBM. "
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 dense_359 (Dense)           (None, 64)                704       
                                                                 
 dense_360 (Dense)           (None, 64)                4160      
                                                                 
 dense_361 (Dense)           (None, 3)                 195       
                                                                 
=================================================================
Total params: 5,059
Trainable params: 5,059
Non-trainable params: 0
_________________________________________________________________
​
Pre最高的模型是---------: NN()
Recall最高的模型是---------: NN()
F1_score最高的模型是-------: NN()