集成学习

111 阅读2分钟

集成学习思想: 组合多个分类器, 让它们共同完成同一个预测任务, 通常能取得比单个分类器更好的效果。

比较不同的分类器

import time
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

def train(X, y, model):
    """Fit *model* on the training data (mutates the model in place)."""
    model.fit(X, y)
    
def predict(X, model):
    """Return *model*'s predictions for the samples in X."""
    return model.predict(X)
  
def get_acc(y_pred, y_true):
    """Return the fraction of predictions equal to the true labels.

    The original implementation only worked when *y_pred* was a numpy
    array (plain ``list == list`` yields a single bool with no ``.mean``);
    converting both inputs with ``np.asarray`` accepts any equal-length
    sequences while preserving the numpy-array behavior.
    """
    import numpy as np  # local import: top-of-file imports do not include numpy
    return float(np.mean(np.asarray(y_pred) == np.asarray(y_true)))
 
def generateDatas(n_samples, makeFunc):
    """Build a synthetic dataset via *makeFunc* and split it 80/20.

    Returns (X_train, X_test, y_train, y_test); random_state is pinned
    so repeated runs produce the same split.
    """
    features, labels = makeFunc(n_samples=n_samples)
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, labels, test_size=0.2, random_state=0)
    return X_tr, X_te, y_tr, y_te
  
# Voting: different algorithms, same data. 'hard' voting predicts the
# majority class among the base classifiers.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

clf1 = LogisticRegression(random_state=1, multi_class='multinomial')
clf2 = RandomForestClassifier(random_state=1, n_estimators=50)
clf3 = GaussianNB()
_voters = [('lr', clf1), ('rf', clf2), ('gnb', clf3)]
eclf1 = VotingClassifier(estimators=_voters, voting='hard')


#Bagging思想 (算法相同, 数据不同)
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

#stacking 双阶段思想, 先使用多个分类器预测, 然后使用一个最终分类器处理结果
#"Stacked generalization consists in stacking the output of individual estimator and use a classifier to compute the final prediction."

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

# Level-0 estimators for stacking: a small random forest plus a linear
# SVM behind a standardizing pipeline (SVMs are scale-sensitive).
_rf = RandomForestClassifier(n_estimators=10, random_state=42)
_svm = make_pipeline(StandardScaler(), LinearSVC(dual="auto", random_state=42))
estimators = [('rf', _rf), ('svr', _svm)]

# Boosting: train learners sequentially, re-weighting the samples that
# earlier learners misclassified (like keeping a "wrong-answer notebook").
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100, algorithm="SAMME", random_state=0)
# NOTE: the original file contained pasted interactive-session lines here
# (">>> clf.fit(X, y)" and its echoed repr) which are a SyntaxError and
# reference X, y that are undefined at this point. Usage, given data:
#   clf.fit(X, y)
#   clf.predict([[0, 0, 0, 0]])

#随机森林
from sklearn.ensemble import RandomForestClassifier
#XGBoost, lightgbm
from lightgbm import LGBMClassifier


# Benchmark fit/predict wall time and accuracy for each model at several
# dataset sizes. (The ensemble entries are commented out because they are
# much slower; re-enable them to include ensembles in the comparison.)
amount_of_datas = [1000, 10000, 100000]
models = {"KNN": KNeighborsClassifier(),
          "DTC": DecisionTreeClassifier(),
          #"voting": VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard'),
          #"Bagging": BaggingClassifier(estimator=SVC()),
          #"Stacking": StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()),
          "RandomForest": RandomForestClassifier(max_depth=2, random_state=0),
          "lightgbm": LGBMClassifier()
          }

for n_samples in amount_of_datas:
    X_train, X_test, y_train, y_test = generateDatas(n_samples, make_classification)
    for name, model in models.items():
        # perf_counter is monotonic and higher-resolution than time.time,
        # so it is the right clock for measuring elapsed intervals.
        start = time.perf_counter()
        train(X_train, y_train, model)
        print(f"训练耗时:{time.perf_counter() - start:.4f}秒")
        start = time.perf_counter()
        y_pred = predict(X_test, model)
        print(f"预测耗时:{time.perf_counter() - start:.4f}秒")
        acc = get_acc(y_pred, y_test)
        print(f"{n_samples}个样本 {name} 的acc:{acc}")