Preface
In the era of big data, many fields need to mine value from the data they generate. Analyzing that data can describe the current state of a phenomenon and, to some degree, predict its future trajectory. This article takes the medical domain as its setting and analyzes heartbeat-signal data to predict which class each signal belongs to.
1.1 Jupyter Setup, Imports, and Dataset Loading
Import the required modules.
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.exceptions import ConvergenceWarning
import sklearn
import pandas_profiling
Suppress warnings.
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
To avoid garbled Chinese characters in plots, set a Chinese font for matplotlib and seaborn.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False
sns.set(font='SimHei')
Set the display options for Jupyter.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 30)
Load the data.
df_train = pd.read_csv('train.csv', encoding='utf-8')
df_test = pd.read_csv('testA.csv', encoding='utf-8')
1.2 Exploratory Analysis
1.2.1 Dataset Preview
pd.concat([df_train.head(5), df_train.tail(5)])  # preview the first and last five rows
————————————————————————————————————————————————————————
id heartbeat_signals label
0 0 0.9912297987616655,0.9435330436439665,0.764677... 0.0
1 1 0.9714822034884503,0.9289687459588268,0.572932... 0.0
2 2 1.0,0.9591487564065292,0.7013782792997189,0.23... 2.0
3 3 0.9757952826275774,0.9340884687738161,0.659636... 0.0
4 4 0.0,0.055816398940721094,0.26129357194994196,0... 2.0
99995 99995 1.0,0.677705342021188,0.22239242747868546,0.25... 0.0
99996 99996 0.9268571578157265,0.9063471198026871,0.636993... 2.0
99997 99997 0.9258351628306013,0.5873839035878395,0.633226... 3.0
99998 99998 1.0,0.9947621698382489,0.8297017704865509,0.45... 2.0
99999 99999 0.9259994004527861,0.916476635326053,0.4042900... 0.0
- Preview summary statistics
df_train.describe()
————————————————————————————————————————————————————————
id label
count 100000.000000 100000.000000
mean 49999.500000 0.856960
std 28867.657797 1.217084
min 0.000000 0.000000
25% 24999.750000 0.000000
50% 49999.500000 0.000000
75% 74999.250000 2.000000
max 99999.000000 3.000000
- Preview the data types
- The signal data is stored as str and will likely need to be converted to float before training
df_train.info()
————————————————————————————————————————————————————————
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 id 100000 non-null int64
1 heartbeat_signals 100000 non-null object
2 label 100000 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.3+ MB
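- A quick sanity check (a sketch): parsing the first signal string confirms that each sample holds 205 float-valued points
sig = np.array(df_train.loc[0, 'heartbeat_signals'].split(','), dtype=float)
sig.shape  # (205,)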
- Preview the shapes of the training and test sets
df_train.shape,df_test.shape
————————————————————————————————————————————————————————
((100000, 3), (20000, 2))
- The data is in good shape: there are no missing values (the organizers have already filled gaps with 0)
missing_pct = df_train.isnull().sum() * 100 / len(df_train)  # percentage of null values per column
missing = pd.DataFrame({
'name': df_train.columns,
'missing_pct': missing_pct,
})
missing.sort_values(by='missing_pct', ascending=False).head()
————————————————————————————————————————————————————————
name missing_pct
id id 0.0
heartbeat_signals heartbeat_signals 0.0
label label 0.0
1.2.2 Class Counts and Distribution of Heartbeat Signals
- Class 0 accounts for the largest share at 64.3%; class 1 is the rarest at only 6.4%
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
sns.countplot(x='label', data=df_train, ax=ax[0],
              palette=['m', 'c', 'pink', 'orange'])
df_train['label'].value_counts().plot.pie(autopct='%1.1f%%',ax=ax[1],
colors=['m','c','pink','orange'])
ax[0].set_ylabel('')
ax[0].set_xlabel('label')
ax[1].set_ylabel('')
ax[1].set_xlabel('label')
plt.show()
1.2.3 Signal Trends by Class
- The signal curves of the four classes follow different shapes
- The signals strengthen gradually, and each exhibits a peak
# n is the number of example heartbeat signals to plot per class
def get_figure(n):
    fig, ax = plt.subplots(2, 2, figsize=(15, 8))
    axs = [ax[i][j] for i in range(2) for j in range(2)]
    for i in range(4):
        labels_ds = []  # reset the legend labels for each class
        for j in range(n):
            df_train[df_train['label'] == i][:n].iloc[j, :-1].plot(ax=axs[i])
            labels_ds.append(f'Signal {j+1}')
        axs[i].legend(labels=labels_ds)
        axs[i].set_title(f'Class {i}', fontsize=20)
get_figure(4)
1.2.4 Average Trend per Class
- The four classes show distinct average trajectories: class 3 changes most sharply overall, while class 0 is comparatively flat.
def get_figure(n):
    labels_ds = []
    plt.figure()
    for i in range(n):
        df_train[df_train['label'] == i].iloc[:, :-1].mean().plot()
        labels_ds.append(f'Class {i}')
    plt.legend(labels=labels_ds)
    plt.title('Average', fontsize=20)
get_figure(4)
1.3 Feature Engineering
1.3.1 Feature Extraction from the Raw Data
- Since heartbeat_signals is a str column it is hard to analyze directly, so it is split into separate numeric features. The test set is appended to the training set so both are transformed together.
targets = df_train.label
df_train.drop(['label'], axis=1, inplace=True)
combined = pd.concat([df_train, df_test])
- Build the new features with a transformation function
def transform_dataset():
    global combined
    hbs = 'heartbeat_signals'
    # split the comma-separated signal string into one column per time step
    rebuilt = combined[hbs].str.split(',', expand=True)
    rebuilt.columns = [f'hbs_{i}' for i in range(rebuilt.shape[1])]
    combined.drop(hbs, axis=1, inplace=True)
    combined = pd.concat([combined, rebuilt], axis=1)
    combined.drop('id', axis=1, inplace=True)
    combined = combined.astype(float)
    return combined
combined = transform_dataset()
- Preview the transformed dataset
combined.head()
————————————————————————————————————————————————————————
hbs_0 hbs_1 hbs_2 hbs_3 hbs_4 hbs_5 hbs_6 hbs_7 hbs_8 hbs_9 hbs_10 hbs_11 hbs_12 hbs_13 hbs_14 ... hbs_190 hbs_191 hbs_192 hbs_193 hbs_194 hbs_195 hbs_196 hbs_197 hbs_198 hbs_199 hbs_200 hbs_201 hbs_202 hbs_203 hbs_204
0 0.991230 0.943533 0.764677 0.618571 0.379632 0.190822 0.040237 0.025995 0.031709 0.065524 0.125531 0.146747 0.167656 0.193374 0.226135 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.971482 0.928969 0.572933 0.178457 0.122962 0.132360 0.094392 0.089575 0.030481 0.040499 0.020392 0.027965 0.035499 0.015321 0.045483 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1.000000 0.959149 0.701378 0.231778 0.000000 0.080698 0.128376 0.187448 0.280826 0.328261 0.320463 0.322416 0.324367 0.322416 0.332144 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.975795 0.934088 0.659637 0.249921 0.237116 0.281445 0.249921 0.249921 0.241397 0.230670 0.224196 0.228515 0.232822 0.234970 0.226357 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.000000 0.055816 0.261294 0.359847 0.433143 0.453698 0.499004 0.542796 0.616904 0.676696 0.737882 0.755473 0.772850 0.774175 0.786173 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 rows × 205 columns
1.3.2 Time-Series Feature Processing (described separately in this section)
- Goal: reshape the data so that each sample's signal values are unrolled into the long time-series format that tsfresh expects
- First split each sample's signal string into separate columns
train_hbs = df_train["heartbeat_signals"].str.split(",", expand=True)
- Stack the column index into the innermost level of the row index
- After stacking, reset the index, restore the generated level_0 column as the row index, and clear the index name
- Rename the columns: {"level_1": "time", 0: "heartbeat_signals"}
- Convert the data type to float
train_hbs=train_hbs.stack()
train_hbs = train_hbs.reset_index()
train_hbs = train_hbs.set_index("level_0")
train_hbs.index.name = None
train_hbs.rename(columns={"level_1":"time", 0:"heartbeat_signals"}, inplace=True)
train_hbs["heartbeat_signals"] = train_hbs["heartbeat_signals"].astype(float)
train_hbs
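- A quick shape check (a sketch): the long format should contain one row per (sample, time step) pair
train_hbs.shape  # expected (20500000, 2): 100000 samples x 205 time steps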
- Attach the target label to the new training set
targets = df_train['label']
df_train.drop(['label', 'heartbeat_signals'], axis=1, inplace=True)
df_train = df_train.join(train_hbs)
- Import the time-series feature-extraction modules
import tsfresh as tsf
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
- Run feature extraction
- This step demands a lot of memory and compute: with 16 GB of RAM the run hit a memory error at 60%, and on Kaggle it ran out of memory at just 20%. This section therefore focuses on the approach; a concrete fix is left for later.
df_features =extract_features(df_train, column_id='id', column_sort='time')
df_features
————————————————————————————————————————————————————————
Feature Extraction: 60%|█████████████████████████████████████▏ | 18/30 [43:52<13:09, 65.83s/it]Exception in thread Thread-8:
Traceback (most recent call last):
File "E:\AI_home\Anaconda3\lib\threading.py", line 926, in _bootstrap_inner
self.run()
File "E:\AI_home\Anaconda3\lib\threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "E:\AI_home\Anaconda3\lib\multiprocessing\pool.py", line 470, in _handle_results
task = get()
File "E:\AI_home\Anaconda3\lib\multiprocessing\connection.py", line 250, in recv
buf = self._recv_bytes()
File "E:\AI_home\Anaconda3\lib\multiprocessing\connection.py", line 318, in _recv_bytes
return self._get_more_data(ov, maxsize)
File "E:\AI_home\Anaconda3\lib\multiprocessing\connection.py", line 340, in _get_more_data
ov, err = _winapi.ReadFile(self._handle, left, overlapped=True)
MemoryError
Feature Extraction: 60%|████████████████████████████████████▌ | 18/30 [58:00<38:40, 193.36s/it]
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-7-250d996ac0cf> in <module>
----> 1 df_features =extract_features(df_train, column_id='id', column_sort='time')
2 df_features
# Run on Kaggle
df_features =extract_features(df_train, column_id='id', column_sort='time')
df_features
Feature Extraction: 20%|██ | 2/10 [38:37<2:07:36, 957.02s/it]
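- A possible workaround (a sketch, not what was run above): restrict tsfresh to its minimal feature set and lower the worker count so peak memory stays bounded; the chunksize value is a hypothetical starting point to tune against available RAM
from tsfresh.feature_extraction import MinimalFCParameters
df_features = extract_features(df_train,
                               column_id='id',
                               column_sort='time',
                               default_fc_parameters=MinimalFCParameters(),  # far fewer features than the default set
                               n_jobs=2,        # fewer parallel workers -> lower peak memory
                               chunksize=5000)  # hypothetical value; tune to available RAM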
- Handle missing values
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import select_features
impute(df_features)
- Feature selection
train_features_filtered = select_features(df_features, targets)
1.3.3 One-Hot Encoding
- Finally, one-hot encode the target label
def dummies_coder():
    global df_train
    name = 'label'
    df_dummies = pd.get_dummies(df_train[name], prefix=name)
    df_train = pd.concat([df_train, df_dummies], axis=1)
    df_train.drop(name, axis=1, inplace=True)
    return df_train
1.4 Model Training
- Import the required modules
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import f1_score, make_scorer  # used below but missing from the original imports
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.svm import SVC
import lightgbm as lgb
- Split back into training and test sets
df_train = combined[:100000].copy()  # .copy() so the downcasting below does not act on a view
df_test = combined[100000:].copy()
- Downcast the data types to reduce memory usage (note that float16 trades some precision for memory savings)
def reduce_mem_usage(df):
start_mem = df.memory_usage().sum() / 1024**2
print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
for col in df.columns:
col_type = df[col].dtype
if col_type != object:
c_min = df[col].min()
c_max = df[col].max()
if str(col_type)[:3] == 'int':
if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
df[col] = df[col].astype(np.int8)
elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
df[col] = df[col].astype(np.int16)
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
df[col] = df[col].astype(np.int32)
elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
df[col] = df[col].astype(np.int64)
else:
if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
df[col] = df[col].astype(np.float16)
elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
df[col] = df[col].astype(np.float32)
else:
df[col] = df[col].astype(np.float64)
else:
df[col] = df[col].astype('category')
end_mem = df.memory_usage().sum() / 1024**2
print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
return df
df_train = reduce_mem_usage(df_train)
————————————————————————————————————————————————————————
Memory usage of dataframe is 157.17 MB
Memory usage after optimization is: 39.86 MB
Decreased by 74.6%
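- The same downcasting can optionally be applied to the test set as well (a sketch):
df_test = reduce_mem_usage(df_test)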
- Split into training and validation sets
x_train, x_val, y_train, y_val = train_test_split(df_train, targets, test_size=0.2, random_state=42)
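- An optional refinement (a sketch): since class 0 dominates at ~64% and class 1 is only ~6%, a stratified split keeps the validation class ratios representative
x_train, x_val, y_train, y_val = train_test_split(
    df_train, targets, test_size=0.2, random_state=42, stratify=targets)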
1.4.1 Baseline Models and Scores
- Create the classifiers
lgrcv = LogisticRegressionCV()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
xgbc = xgb.XGBClassifier()  # renamed so the xgb module is not shadowed
lgbm = lgb.LGBMClassifier()
models = [lgrcv, rf, knn, dt, xgbc, lgbm]
- Train and evaluate the models
for model in models:
    model = model.fit(x_train, y_train)
    predict_train = model.predict(x_train)
    predict_val = model.predict(x_val)
    print(model)
    # print('train Accuracy:', metrics.accuracy_score(y_train, predict_train))
    print('val Accuracy:', metrics.accuracy_score(y_val, predict_val))
    # print('train f1-score :', metrics.f1_score(y_train, predict_train, average='macro'))
    print('val f1-score :', metrics.f1_score(y_val, predict_val, average='macro'))
    # print('train mean_squared_error :', metrics.mean_squared_error(y_train, predict_train))
    print('val mean_squared_error :', metrics.mean_squared_error(y_val, predict_val))
    a = model.predict_proba(x_val)
    # one-vs-rest ROC using class 1's predicted probability
    fpr, tpr, thresholds = metrics.roc_curve(y_val, y_score=[i[1] for i in a], pos_label=1)
    print('auc:', metrics.auc(fpr, tpr))
    print('**********************************')
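- Note that the roc_curve call above only scores class 1 one-vs-rest; a sketch of a full four-class AUC using sklearn's roc_auc_score:
proba_val = model.predict_proba(x_val)
print('macro OvR auc:', metrics.roc_auc_score(y_val, proba_val, multi_class='ovr', average='macro'))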
- Model evaluation results
LogisticRegressionCV()
val Accuracy: 0.8738
val f1-score : 0.7797043643993938
val mean_squared_error : 0.52295
auc: 0.8751658941402964
**********************************
RandomForestClassifier()
val Accuracy: 0.9806
val f1-score : 0.9486853009647026
val mean_squared_error : 0.06155
auc: 0.9920398427752294
**********************************
KNeighborsClassifier()
val Accuracy: 0.9783
val f1-score : 0.9486774842272341
val mean_squared_error : 0.0706
auc: 0.9699227365057602
**********************************
DecisionTreeClassifier()
val Accuracy: 0.9568
val f1-score : 0.9094243474402218
val mean_squared_error : 0.1598
auc: 0.8794967552617907
**********************************
XGBClassifier(objective='multi:softprob')
val Accuracy: 0.95
val f1-score : 0.8916767743199139
val mean_squared_error : 0.2072
auc: 0.9635107593719082
**********************************
LGBMClassifier()
val Accuracy: 0.9826
val f1-score : 0.9552452452921629
val mean_squared_error : 0.05355
auc: 0.9909607298485397
1.4.2 Hyperparameter Tuning
- LightGBM performs best, so it is selected for further parameter optimization and training
- Convert the data to LightGBM's Dataset format
train_matrix = lgb.Dataset(x_train, label=y_train)
val_matrix = lgb.Dataset(x_val, label=y_val)
- Define an f1_score evaluation function
def val_f1_score(preds, lgtrain):
    # lightgbm passes multiclass predictions as a flat, class-major array
    preds = np.argmax(preds.reshape(4, -1), axis=0)
    label = lgtrain.get_label()
    result_f1_score = f1_score(label, preds, average='macro')
    return 'f1_score', result_f1_score, True
- Set the initial parameters
params = {
"learning_rate": 0.1,
"boosting": 'gbdt',
"lambda_l2": 0.1,
"max_depth": -1,
"num_leaves": 128,
"bagging_fraction": 0.8,
"feature_fraction": 0.8,
"metric": None,
"objective": "multiclass",
"num_class": 4,
"nthread": 10,
"verbose": -1,
}
- Train the model
model = lgb.train(params,
train_set=train_matrix,
valid_sets=val_matrix,
num_boost_round=2000,
verbose_eval=50,
early_stopping_rounds=200,
feval=val_f1_score)
————————————————————————————————————————————————————————
Training until validation scores don't improve for 200 rounds
[50] valid_0's multi_logloss: 0.0472522 valid_0's f1_score: 0.962766
[100] valid_0's multi_logloss: 0.0414565 valid_0's f1_score: 0.969621
[150] valid_0's multi_logloss: 0.0432075 valid_0's f1_score: 0.971069
[200] valid_0's multi_logloss: 0.0446821 valid_0's f1_score: 0.971485
[250] valid_0's multi_logloss: 0.0460871 valid_0's f1_score: 0.971569
Early stopping, best iteration is:
[99] valid_0's multi_logloss: 0.0414366 valid_0's f1_score: 0.969254
- With these parameters set, the f1_score improves
val_pre = model.predict(x_val, num_iteration=model.best_iteration)
preds = np.argmax(val_pre, axis=1)
score = f1_score(y_true=y_val, y_pred=preds, average='macro')
score
————————————————————————————————————————————————————————
0.9692538553684411
- Tune the parameters with grid search
model_lgb = lgb.LGBMClassifier(random_state=2021)
params_dic = dict(learning_rate=[0.01, 0.1, 1], n_estimators=[20, 50, 120, 300],
                  num_leaves=[10, 30], max_depth=[-1, 4, 10])
f1_scorer = make_scorer(f1_score, average='micro')  # renamed so f1_score itself is not shadowed
grid_search = GridSearchCV(model_lgb, cv=5,
                           param_grid=params_dic,
                           scoring=f1_scorer)
grid_search.fit(x_train, y_train)
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')
————————————————————————————————————————————————————————
Best parameters: {'learning_rate': 0.1, 'max_depth': -1, 'n_estimators': 300, 'num_leaves': 30}
Best score: 0.9863375000000001
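- GridSearchCV refits the best configuration on the full training data by default, so the tuned model can also be reused directly (a sketch; the next section re-declares it explicitly with the same values):
best_model = grid_search.best_estimator_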
1.4.3 Computing abs-sum
- Train the final model and predict on df_test
model_final = lgb.LGBMClassifier(random_state=2021, learning_rate=0.1,
                                 n_estimators=300, max_depth=-1, num_leaves=30)
model_final.fit(x_train, y_train)
pre_test = model_final.predict(df_test)
proba_test = model_final.predict_proba(df_test)
- One-hot encode the predicted labels into four classes
df_pred = pd.DataFrame(pre_test, columns=['label'])
def dummies_coder():
    global df_pred
    name = 'label'
    df_dummies = pd.get_dummies(df_pred[name], prefix=name)
    df_pred = pd.concat([df_pred, df_dummies], axis=1)
    df_pred.drop(name, axis=1, inplace=True)
    return df_pred
df_pred = dummies_coder()
pred_arr = np.array(df_pred)
- Compute abs-sum
def abs_sum():
    global pred_arr, proba_test
    # total absolute difference between predicted probabilities and the one-hot predictions
    return sum(sum(abs(proba_test - pred_arr)))
result = abs_sum()
result
————————————————————————————————————————————————————————
310.1228671266683
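- An equivalent computation (a sketch): build the one-hot matrix with np.eye and compute the metric in one vectorized step, skipping the get_dummies detour
onehot = np.eye(4)[pre_test.astype(int)]
np.abs(proba_test - onehot).sum()  # same abs-sum value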
1.4.4 Model Prediction and Output of the Result File
- Generate the prediction-probability file and submit it
df_proba = pd.DataFrame(proba_test, columns=['1', '2', '3', '4'])
df_result = pd.DataFrame()
abc = pd.read_csv('sample_submit.csv', encoding='utf-8')
df_result['id'] = abc['id']
df_result['label_0']=df_proba['1']
df_result['label_1']=df_proba['2']
df_result['label_2']=df_proba['3']
df_result['label_3']=df_proba['4']
df_result
df_result.to_csv('411_submit.csv', index=False)