# Ensemble learning: have multiple classifiers cooperate on a single task.
# This script compares different (ensemble) classifiers.
import time

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
def train(X, y, model):
    """Fit *model* on features X and labels y (mutates the estimator in place)."""
    model.fit(X, y)
def predict(X, model):
    """Return the fitted estimator's predictions for the feature matrix X."""
    return model.predict(X)
def get_acc(y_pred, y_true):
    """Return the classification accuracy: fraction of positions where
    y_pred equals y_true.

    The original relied on `(y_pred == y_true).mean()`, which only works
    when both arguments are already numpy arrays (plain lists compare as
    a single bool).  Coercing via np.asarray accepts any array-like while
    remaining backward-compatible for numpy-array callers.
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    return float((y_pred == y_true).mean())
def generateDatas(n_samples, makeFunc):
    """Build a synthetic dataset of *n_samples* rows with *makeFunc* and
    return an 80/20 train/test split (fixed seed for reproducibility).

    Returns (X_train, X_test, y_train, y_test).
    """
    features, labels = makeFunc(n_samples=n_samples)
    tr_X, te_X, tr_y, te_y = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )
    return tr_X, te_X, tr_y, te_y
# Voting: different algorithms trained on the same data; the ensemble
# predicts by majority vote across members ('hard' voting).
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
# Bagging: the same algorithm trained on different (bootstrap-resampled) subsets of the data.
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
# Stacking: a two-stage scheme — several base classifiers predict first, then a final classifier combines their outputs.
#"Stacked generalization consists in stacking the output of individual estimator and use a classifier to compute the final prediction."
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),('svr', make_pipeline(StandardScaler(),LinearSVC(dual="auto", random_state=42)))]
# Boosting: sequential ensemble — later learners re-weight (focus on) the
# samples that earlier learners misclassified, like reviewing an error notebook.
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with the SAMME algorithm and 100 weighted weak learners.
clf = AdaBoostClassifier(n_estimators=100, algorithm="SAMME", random_state=0)
# Example usage (pasted from the scikit-learn docs; left as comments because
# the raw `>>>` REPL lines were a SyntaxError and X/y are undefined here):
#   clf.fit(X, y)
#   clf.predict([[0, 0, 0, 0]])
# Random forest
from sklearn.ensemble import RandomForestClassifier
#XGBoost, lightgbm
from lightgbm import LGBMClassifier
# Dataset sizes used for the benchmark below.
amount_of_datas = [1000, 10000, 100000]
# Models under comparison, keyed by display name.  The heavier ensemble
# variants are kept but disabled (commented out) to shorten the run.
models = {"KNN":KNeighborsClassifier(),
"DTC":DecisionTreeClassifier(),
#"voting":VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard'),
#"Bagging":BaggingClassifier(estimator=SVC()),
#"Stacking":StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()),
"RandomForest":RandomForestClassifier(max_depth=2, random_state=0),
"lightgbm":LGBMClassifier()
}
# Benchmark: for each dataset size, time the training and prediction of
# every model and report its accuracy on the held-out split.
for ele in amount_of_datas:
    X_train, X_test, y_train, y_test = generateDatas(ele, make_classification)
    for name, model in models.items():
        # perf_counter is monotonic and high-resolution; time.time() is
        # neither, so it can mis-measure short intervals.
        start_time = time.perf_counter()
        train(X_train, y_train, model)
        end_time = time.perf_counter()
        print(f"训练耗时:{end_time-start_time:.4f}秒")
        start_time = time.perf_counter()
        y_pred = predict(X_test, model)
        end_time = time.perf_counter()
        print(f"预测耗时:{end_time-start_time:.4f}秒")
        acc = get_acc(y_pred, y_test)
        print(f"{ele}个样本 {name} 的acc:{acc}")