一、方法介绍
Logistic回归是一种概率型非线性回归模型,也是研究二值型输出分类的一种多变量分析方法。它在二分类的观察结果 y 与若干影响因素之间建立关系,从而估计在给定因素条件下某个结果发生的概率,并据此进行分类。
- Sigmoid函数:$\sigma(z) = \dfrac{1}{1 + e^{-z}}$,其中 $z = w^{T}x$;其输出落在 (0, 1) 区间内,可解释为样本属于正类的概率。
二、算法介绍:梯度上升法
三、Python代码实现
# Quick demo: fit a logistic-regression classifier on the iris data set and
# evaluate it on the very samples it was trained on.
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)

# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression().fit(X, y)

clf.predict(X[:10, :])       # predicted labels for the first ten samples
clf.predict_proba(X[:2, :])  # per-class probabilities for the first two samples
clf.score(X, y)              # accuracy: predictions compared with the true labels

# Predict the full data set once and reuse the result for both summaries.
y_pred = clf.predict(X)
print(metrics.classification_report(y, y_pred))  # classification report
print(metrics.confusion_matrix(y, y_pred))       # confusion matrix
- 网格搜索参数优化
# Horse-colic mortality prediction: load the train/test files and fit a
# baseline logistic-regression model.
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# The training file is read without a header row into 22 integer-named
# columns: columns 0-20 are used as features and column 21 as the label.
# Forward slashes keep the relative paths valid on Windows, Linux and macOS
# (the previous "data\\..." form only resolved on Windows).
train_data = pd.read_csv("data/horseColicTraining.txt", sep=' ', names=range(22))
trainingSet = train_data.iloc[:, 0:21]   # features
# Take the label column as a 1-D Series; a (n, 1) DataFrame slice makes
# scikit-learn emit a column-vector conversion warning on fit().
trainingLabels = train_data.iloc[:, 21]  # label

# The test file is assumed to share the training file's header-less
# 22-column layout (TODO confirm against the data). Without names=..., pandas
# would consume the first test sample as a header row, dropping that sample
# and giving the test columns labels that differ from the training columns.
test_data = pd.read_csv("data/horseColicTest.txt", sep=' ', names=range(22))
testSet = test_data.iloc[:, 0:21]
testLabels = test_data.iloc[:, 21]

# Baseline model with default hyper-parameters, scored on the held-out test set.
classifier = LogisticRegression().fit(trainingSet, trainingLabels)
classifier.score(testSet, testLabels)
# Hyper-parameter tuning with a 5-fold cross-validated grid search.
from sklearn.model_selection import GridSearchCV

# Coarse grid tried first, kept for reference (an "unset" class weight is
# spelled None, not ''):
# params={'penalty':['l2','l1'],'C':[0.001,0.01,0.1,1,10,100],'class_weight':[None,'balanced']}
# Fine grid around the best C found by the coarse pass.
params = {'penalty': ['l2', 'l1'], 'C': [0.0108, 0.0109, 0.011, 0.012, 0.013]}

# The default 'lbfgs' solver rejects penalty='l1', which made every l1
# candidate in the grid fail to fit. 'liblinear' supports both l1 and l2.
LR = LogisticRegression(solver='liblinear')
grid_search = GridSearchCV(LR, param_grid=params, cv=5)
grid_search.fit(trainingSet, trainingLabels)
grid_search.best_params_

# With the default refit=True, best_estimator_ has already been refit on the
# whole training set using the best parameters. It must NOT be fitted on the
# test set: doing so leaks the test labels into the model and makes every
# test-set score below meaningless.
LR_ = grid_search.best_estimator_
LR_.score(testSet, testLabels)
grid_search.score(testSet, testLabels)  # accuracy on the held-out test set
grid_search.predict(testSet)
#ROC曲线绘制
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
%matplotlib inline
false_positive_rate, true_positive_rate, thresholds = roc_curve(testLabels, predict_proba[:,1])
roc_auc = auc(false_positive_rate, true_positive_rate)#求面积
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b',label='AUC = %0.2f'% roc_auc)
plt.scatter(false_positive_rate, true_positive_rate,c='r')
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()