前言
Titanic生存预测分析是十分经典的数据分析入门项目,数据集不算很大,但具有一定规模的数据能够帮助我们在学习与实践过程中更为方便地进行不同模型的尝试与融合,接下来,开始进行本次数据分析之旅。
模块导入、Jupyter环境配置及数据集加载
本文中数据集在Kaggle官网竞赛页面下载获得。
在jupyter notebook中导入python相关模块。
import pandas as pd,numpy as np, seaborn as sns
from matplotlib import pyplot as plt
import pylab as plot
%matplotlib inline
pd.options.display.max_columns =100
#设置图像的属性
params = {
'axes.labelsize': "large",
'xtick.labelsize': 'x-large',
'legend.fontsize': 20,
'figure.dpi': 150,
'figure.figsize': [25, 7]
}
plot.rcParams.update(params)
#拦截警告
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore',category=DeprecationWarning)
加载数据集
# Load the Titanic training set (path kept exactly as used in this project).
data = pd.read_csv('../git_hub_manager/tantlic/train.csv')
探索性分析(EDA)
# Preview the first five rows of the training data.
data.head(5)
_______________________________________________________
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
其中,特征名称分别代表:
- PassengerId :乘客编号(每位乘客的唯一标识)
- Survived :预测目标值,1代表活着,0代表死亡
- Pclass :等级类别,有1,2,3级
- Name: 姓名
- Sex:性别
- Age: 年龄
- SibSp :兄弟姐妹和配偶的数量
- Parch :父母和孩子的数量
- Ticket:票号码
- Fare :票价
- Cabin :船舱号码
- Embarked:登船地点,有S,Q,C
观察整体数据,发现Age缺失了177条数据,数值型数据的整体分布可以通过describe()方法查看。
# Summary statistics of the numeric columns (note Age count is 714 of 891).
data.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
通过年龄的中位数填充。
# Impute missing ages with the median age of the training data.
median_age = data['Age'].median()
data['Age'] = data['Age'].fillna(median_age)
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.361582 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 13.019697 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 22.000000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 35.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
分析性别对生存的影响,男性死亡几率比女性更大一些。
# Survival by sex: stacked bar chart of survived vs. died counts.
data['Died'] = 1 - data['Survived']
counts_by_sex = data.groupby('Sex')[['Survived', 'Died']].sum()
counts_by_sex.plot(kind='bar', figsize=(30, 10), stacked=True,
                   color=['g', 'r'])
分析年龄对生存的影响。
- 在不同年龄范围中,女性生存率均高于男性。
- 20-40岁范围中的男性死亡数较多,而年轻男性比年老者更易生存。
- 女性年龄对生存的影响较小。
# Split violin plot: age distribution per sex, colored by survival outcome.
sns.violinplot(x='Sex',y='Age',hue='Survived',data=data,split = True,
palette ={0:'black',1:'g'})
分析票价对生存的影响。
- 较低票价的乘客中死亡数较多。
- 高票价的乘客生存率较高。
# Fare distribution stacked by outcome — low fares skew toward deaths.
figure = plt.figure(figsize=(30, 10))
survived_fares = data[data['Survived'] == 1]['Fare']
died_fares = data[data['Survived'] == 0]['Fare']
plt.hist([survived_fares, died_fares],
         stacked=True,
         color=['black', 'r'],
         bins=50,
         label=['Survived', 'Died'])
plt.xlabel('Fare')
plt.ylabel('Number of passagers')
plt.legend()
分析年龄、票价对生存的影响。其中,圆的面积大小代表票价的高低。
- 10岁以下的孩子大多得救,生存率明显较高
- 20岁以上的成年人中,票价越高生存几率越大,最大的两个红圈代表票价最高
# Age vs. fare scatter; marker area is proportional to the fare paid.
plt.figure(figsize=(30, 10))
ax = plt.subplot()
# Draw survivors (red) first, then the deceased (blue) — same order as before.
for outcome, colour in ((1, 'r'), (0, 'b')):
    subset = data[data['Survived'] == outcome]
    ax.scatter(subset['Age'], subset['Fare'],
               color=colour, s=subset['Fare'])
分析登船地点对生存的影响。
- S和C处支付更高票价登船的人的生存几率更大。
# Split violin plot: fare distribution per embarkation port, by outcome.
fig = plt.figure(figsize=(30,10))
sns.violinplot(x='Embarked',y='Fare',hue='Survived',data=data,split=True,
palette={0:'black',1:'g'})
特征工程
定义一个显示数据处理完成的函数。
def status(feature):
    """Print a short confirmation that *feature* finished processing."""
    print('Processing', feature, ':ok')
将训练集和测试集合在一起。
def get_combined_data(train_path='../git_hub_manager/tantlic/train.csv',
                      test_path='../git_hub_manager/tantlic/test.csv'):
    """Load the train/test CSVs and return them stacked as one DataFrame.

    The target column 'Survived' and 'PassengerId' are dropped so the
    result holds only the feature columns shared by both sets. Paths are
    parameterized (defaults preserve the original behavior).
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    # Drop the training target; keyword `columns=` replaces the bare
    # positional axis argument that pandas 2.0 no longer accepts.
    train = train.drop(columns=['Survived'])
    # DataFrame.append was removed in pandas 2.0 — use pd.concat, with
    # ignore_index replacing the original reset_index/drop('index') dance.
    combined = pd.concat([train, test], ignore_index=True)
    combined = combined.drop(columns=['PassengerId'])
    return combined
合并后的数据集。
# Build the merged train+test feature frame and display it.
combined = get_combined_data()
combined
Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ...
1304 3 Spector, Mr. Woolf male NaN 0 0 A.5. 3236 8.0500 NaN S
1305 1 Oliva y Ocana, Dona. Fermina female 39.0 0 0 PC 17758 108.9000 C105 C
1306 3 Saether, Mr. Simon Sivertsen male 38.5 0 0 SOTON/O.Q. 3101262 7.2500 NaN S
1307 3 Ware, Mr. Frederick male NaN 0 0 359309 8.0500 NaN S
1308 3 Peter, Master. Michael J male NaN 1 1 2668 22.3583 NaN C
1309 rows × 10 columns
# Sanity check: 1309 rows (891 train + 418 test), 10 feature columns.
combined.shape
(1309, 10)
独热编码
处理称谓。
# Collect the distinct titles embedded in passenger names — the token
# between the comma and the first period (e.g. "Braund, Mr. Owen" -> "Mr").
titles = {name.split(',')[1].split('.')[0].strip() for name in data['Name']}
titles
{'Capt','Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the Countess'}
# Map raw titles onto a few broad categories to reduce cardinality.
# Bug fix: 'Dona' occurs only in the test set ("Oliva y Ocana, Dona.
# Fermina") and was missing from the original mapping, leaving that
# passenger's Title as NaN after the map.
Title_Dictionary = {
    'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer',
    'Dr': 'Officer', 'Rev': 'Officer',
    'Don': 'Royalty', 'Dona': 'Royalty', 'Jonkheer': 'Royalty',
    'Lady': 'Royalty', 'Sir': 'Royalty', 'the Countess': 'Royalty',
    'Master': 'Master',
    'Miss': 'Miss', 'Mlle': 'Miss',
    'Mme': 'Mrs', 'Mrs': 'Mrs', 'Ms': 'Mrs',
    'Mr': 'Mr',
}
def get_titles():
    """Derive a categorical Title column from the Name column."""
    raw_title = combined['Name'].map(
        lambda name: name.split(',')[1].split('.')[0].strip())
    combined['Title'] = raw_title.map(Title_Dictionary)
    status('Title')
    return combined
处理之后的数据集中增加了Title标签列。
# Add the Title column (prints a confirmation when done).
combined = get_titles()
Processing Title :ok
# Preview: the frame now carries the new Title column.
combined.head()
Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title
0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S Mr
1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C Mrs
2 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S Miss
3 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S Mrs
4 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S Mr
处理年龄,在训练集中缺少了177个数据,测试集中缺少了86个数据。
# Count missing ages in the train slice (first 891 rows) — 177 here.
combined.iloc[:891].Age.isnull().sum()
177
# Count missing ages in the test slice (rows 891+) — 86 here.
combined.iloc[891:].Age.isnull().sum()
86
# Median age per (Sex, Pclass, Title) group on the train slice, for age
# imputation. Selecting 'Age' before .median() avoids aggregating the
# non-numeric columns (Name, Ticket, ...), which raises a TypeError in
# modern pandas; the resulting columns are identical to the original.
group_median_train = (combined.iloc[:891]
                      .groupby(['Sex', 'Pclass', 'Title'])['Age']
                      .median()
                      .reset_index())
处理票价,在训练集上用票价的平均值来填充空值。
def modify_fare():
    """Fill the single missing Fare with the train-slice mean fare.

    Bug fix: the original called ``combined.Fare.fillna(...)`` without
    assigning the result (and without inplace), so the missing value was
    never actually filled.
    """
    global combined
    combined['Fare'] = combined['Fare'].fillna(combined.iloc[:891]['Fare'].mean())
    status('fare')  # typo fix: was 'fara'
    return combined
combined = modify_fare()
Processing fara :ok
处理登船地点的数据。
# Embarked has missing values — count them (the run below shows 2).
combined.Embarked.isnull().sum()
2
# 'S' is the most common port (914 of 1309 rows), so it will be used to
# fill the two missing values.
(combined['Embarked'] =='S').sum(),(combined['Embarked'] =='C').sum(),(combined['Embarked'] =='Q').sum()
(914, 270, 123)
# One-hot encode Embarked into Embarked_C/Embarked_Q/Embarked_S after
# filling the two missing values with the most common port 'S'.
def modify_embarked():
    """Fill missing Embarked with 'S' and dummy-encode the column."""
    global combined
    # Assignment instead of chained `inplace=True`: under pandas
    # copy-on-write the chained form no longer updates the DataFrame.
    combined['Embarked'] = combined['Embarked'].fillna('S')
    embarked_dummies = pd.get_dummies(combined['Embarked'], prefix='Embarked')
    combined = pd.concat([combined, embarked_dummies], axis=1)
    combined.drop('Embarked', axis=1, inplace=True)
    status('embarked')
    return combined
combined = modify_embarked()
Processing embarked :ok
处理船舱位置,发现有1014个NAN值。
# Count missing Cabin values (1014 of 1309 are NaN).
combined.Cabin.isnull().sum()
————————————————————————————————————————————————————————
1014
获取船舱位置首字母,发现测试集的字母均在训练集中。
# Train-slice cabin initials, with 'N' standing in for NaN.
# Bug fix: `combined1` was never defined anywhere — create it as an
# explicit copy so the real `combined` keeps its NaNs for the later 'U'
# fill in modify_Cabin.
combined1 = combined.copy()
combined1['Cabin'] = combined1['Cabin'].fillna('N')
train1_set = {cabin[0] for cabin in combined1.iloc[:891]['Cabin']}
train1_set
————————————————————————————————————————————————————————
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'N', 'T'}
# Test-slice cabin initials — confirms the test set introduces no cabin
# letter beyond those seen in training (plus 'N' for NaN). Works on a
# fresh copy of `combined` (the original `combined1` was undefined).
combined1 = combined.copy()
combined1['Cabin'] = combined1['Cabin'].fillna('N')
test1_set = {cabin[0] for cabin in combined1.iloc[891:]['Cabin']}
test1_set
————————————————————————————————————————————————————————
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'N'}
# Fill missing cabins with 'U' (unknown), keep only the deck letter, and
# one-hot encode it into Cabin_* columns.
def modify_Cabin():
    """Reduce Cabin to its deck letter ('U' when unknown) and dummy-encode."""
    global combined
    # Assignment instead of chained `inplace=True`: the chained form does
    # not update the DataFrame under pandas copy-on-write.
    combined['Cabin'] = combined['Cabin'].fillna('U')
    combined['Cabin'] = combined['Cabin'].map(lambda e: e[0])
    cabin_dummies = pd.get_dummies(combined['Cabin'], prefix='Cabin')
    combined = pd.concat([combined, cabin_dummies], axis=1)
    combined.drop('Cabin', axis=1, inplace=True)
    status('cabin')
    return combined
# Apply the transformation.
combined = modify_Cabin()
Processing cabin :ok
# Preview the frame after the Title/Embarked/Cabin dummy encoding.
combined.head()
Pclass Sex Age SibSp Parch Ticket Fare Title_Master Title_Miss Title_Mr Title_Mrs Title_Officer Title_Royalty Embarked_C Embarked_Q Embarked_S Cabin_A Cabin_B Cabin_C Cabin_D Cabin_E Cabin_F Cabin_G Cabin_T Cabin_U
0 3 male 22.0 1 0 A/5 21171 7.2500 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1
1 1 female 38.0 1 0 PC 17599 71.2833 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0
2 3 female 26.0 0 0 STON/O2. 3101282 7.9250 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1
3 1 female 35.0 1 0 113803 53.1000 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0
4 3 male 35.0 0 0 373450 8.0500 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1
处理性别。 很幸运的是,性别并没有空值。
# Sex has no missing values (result is 0).
combined.Sex.isnull().sum()
————————————————————————————————————————————————————————
0
将male和female分别映射到1和0。
def modify_Sex():
    """Encode Sex numerically: male -> 1, female -> 0."""
    global combined
    combined['Sex'] = combined.Sex.map({'male': 1, 'female': 0})
    status('sex')
    return combined
# Bug fix: the original defined this function but never called it, so Sex
# stayed a string column and the later model fitting would fail.
combined = modify_Sex()
处理乘客等级。
# One-hot encode passenger class (1/2/3) into Pclass_1..Pclass_3 columns.
def modify_Pclass():
    """Dummy-encode Pclass and drop the raw column."""
    global combined
    class_dummies = pd.get_dummies(combined['Pclass'], prefix='Pclass')
    combined = pd.concat([combined, class_dummies], axis=1)
    combined.drop('Pclass', axis=1, inplace=True)
    status('pclass')
    return combined
combined = modify_Pclass()
————————————————————————————————————————————————————————
Processing pclass :ok
处理船票号码。
# Pull the alphabetic prefix out of a ticket number.
def preperform_Ticket(ticket):
    """Return the ticket's letter prefix, or 'xxx' for all-digit tickets.

    Dots and slashes are stripped first, e.g. 'A/5 21171' -> 'A5' and
    '113803' -> 'xxx'.
    """
    compact = ticket.replace('.', '').replace('/', '')
    prefixes = [tok.strip() for tok in compact.split()
                if not tok.strip().isdigit()]
    return prefixes[0] if prefixes else 'xxx'
# One-hot encode ticket prefixes into Ticket_* columns, dropping the raw
# Ticket column afterwards.
def modify_Ticket():
    """Replace Ticket with dummy-encoded prefix columns."""
    global combined
    combined['Ticket'] = combined['Ticket'].map(preperform_Ticket)
    prefix_dummies = pd.get_dummies(combined['Ticket'], prefix='Ticket')
    combined = pd.concat([combined, prefix_dummies], axis=1)
    combined.drop('Ticket', axis=1, inplace=True)
    status('ticket')
    return combined
combined = modify_Ticket()
————————————————————————————————————————————————————————
Processing ticket :ok
处理家庭,根据家庭成员数量分为单人,小家庭(2至4人的家庭)和大家庭(5人及以上的家庭)。
def modify_Family():
    """Derive family-size flags: singleton, small (2-4), big (5+)."""
    global combined
    # Total family size: siblings/spouses + parents/children + self.
    combined['Family_size'] = combined['Parch'] + combined['SibSp'] + 1
    size = combined['Family_size']
    combined['Singleone'] = (size == 1).astype(int)
    combined['BigFamily'] = (size > 4).astype(int)
    combined['SmallFamily'] = ((size > 1) & (size <= 4)).astype(int)
    status('family')
    return combined
combined = modify_Family()
————————————————————————————————————————————————————————
Processing family :ok
模型训练
导入需要的模块
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
# Bug fix: `sklearn.ensemble.gradient_boosting` is a private module path
# that was removed in scikit-learn 0.24 — import from the public package.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
获取处理后的训练集,测试集和预测目标值。
def get_train_test():
    """Split `combined` back into train/test slices and reload targets.

    Returns (train, test, targets) where targets is the Survived column
    of the original training CSV as a numpy array.
    """
    # `combined` is only read here, so no `global` declaration is needed.
    train = combined.iloc[:891]
    test = combined.iloc[891:]
    survived = pd.read_csv('../git_hub_manager/tantlic/train.csv',
                           usecols=['Survived'])
    targets = survived['Survived'].values
    return train, test, targets
train, test, targets = get_train_test()
使用随机森林分类器训练得到训练集与预测目标值的模型。
# Fit a quick random forest and rank the features by learned importance.
clf = RandomForestClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(train, targets)
# Pair each column name with its importance score.
features = pd.DataFrame()
features['feature'] = train.columns
features['importance'] = clf.feature_importances_
features.sort_values(by=['importance'], inplace=True, ascending=True)
features.set_index('feature', inplace=True)
# Horizontal bar chart, least to most important.
features.plot(kind='barh', figsize=(20, 20))
可以看到Title_Mr,Age,Fare,Sex等特征与生存结果的关联性较强。
上面已经分析了数据集中特征的重要性分布,因此可以使用这个分布模型进一步减少数据集特征数量,从而优化数据集。
特征选择
# Keep only features above the default importance threshold, reusing the
# already-fitted forest (prefit=True).
model =SelectFromModel(clf,prefit=True)
# Reduce the training matrix to the selected features.
train_clf = model.transform(train)
train_clf.shape
————————————————————————————————————————————————————————
(891, 14) #特征数量大大减少
# Apply the same feature selection to the test set.
test_clf =model.transform(test)
test_clf.shape
————————————————————————————————————————————————————————
(418, 14)
基础模型
先使用不同的基础模型进行训练。
# Logistic regression baseline.
logreg = LogisticRegression()
# Logistic regression with cross-validated regularisation strength.
logreg_cv =LogisticRegressionCV()
# Random forest baseline.
rf = RandomForestClassifier()
# Gradient boosting baseline.
gboost = GradientBoostingClassifier()
models = [logreg,logreg_cv,rf,gboost]
定义一个评分函数,使用5折交叉验证的方法对不同模型进行评分。
def evaluate_score(clf, x, y, scoring='accuracy'):
    """Return the mean score of *clf* over a 5-fold cross-validation."""
    fold_scores = cross_val_score(clf, x, y, cv=5, scoring=scoring)
    return fold_scores.mean()
# Score every baseline model with 5-fold CV. The trailing separator print
# restores the '******' lines visible in the recorded output, which the
# transcribed code had lost.
for model in models:
    print(f'Cross-Validation of: {model.__class__}')
    score = evaluate_score(clf=model, x=train_clf, y=targets,
                           scoring='accuracy')
    print(f'CV score = {score}')
    print('******')
————————————————————————————————————————————————————————
Cross-Validation of: <class 'sklearn.linear_model._logistic.LogisticRegression'>
CV score = 0.8170422446801833
******
Cross-Validation of: <class 'sklearn.linear_model._logistic.LogisticRegressionCV'>
CV score = 0.8260310087251271
******
Cross-Validation of: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
CV score = 0.8193019898311468
******
Cross-Validation of: <class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
CV score = 0.8338961772644531
******
调节超参数
利用网格搜索调节超参数,优化模型。
# Hyper-parameter grid for the random forest. 'auto' was dropped from the
# max_features candidates: it was removed in scikit-learn 1.3 and, for
# classifiers, was an alias of 'sqrt' anyway, so the search space is
# effectively unchanged.
parameter_grid = {
    'max_depth': [4, 6, 8],
    'n_estimators': [50, 10],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 3, 10],
    'min_samples_leaf': [1, 3, 10],
    'bootstrap': [True, False],
}
forest = RandomForestClassifier()
# Stratified 5-fold CV keeps the class balance in every fold.
cross_validation = StratifiedKFold(n_splits=5)
# Exhaustive grid search scored by accuracy.
grid_search = GridSearchCV(forest, scoring='accuracy',
                           param_grid=parameter_grid,
                           cv=cross_validation, verbose=1)
model = grid_search.fit(train, targets)
parameters = grid_search.best_params_
print(f'Best score : {grid_search.best_score_}')
print(f'Best parameters : {parameters}')
# Refit a fresh forest with the winning parameter combination.
model = RandomForestClassifier(**parameters)
model.fit(train, targets)
模型预测,输出结果
通过调节超参数构建的模型,输出结果文件。
# Predict survival on the test set and write the Kaggle submission file.
output = model.predict(test).astype(int)
# Re-read the raw test CSV only to recover the PassengerId column.
test1 = pd.read_csv('../git_hub_manager/tantlic/test.csv')
df_output = pd.DataFrame({'PassengerId': test1['PassengerId'],
                          'Survived': output})
# Two-column submission: PassengerId, Survived.
df_output.to_csv('../2021_project/gridsearch_rf2021.csv', index=False)