import pandas as pd
import numpy as np
def compute_h(p1, p2):
    """Return the Shannon entropy, in bits, of a two-outcome distribution.

    Computes ``-(p1*log2(p1) + p2*log2(p2))``. A probability of exactly 0
    contributes 0 (the usual ``0 * log2(0) = 0`` convention) instead of
    producing NaN, so a pure split such as ``(1.0, 0.0)`` has entropy 0.

    Args:
        p1: Probability of the first outcome (scalar or array-like).
        p2: Probability of the second outcome (scalar or array-like).

    Returns:
        The entropy in bits; a NumPy float for scalar inputs.
    """

    def _plogp(p):
        # Swap zeros for 1 inside the log only: log2(1) == 0, so the term
        # becomes 0 * 0 == 0 rather than 0 * -inf == nan.
        p = np.asarray(p, dtype=float)
        return p * np.log2(np.where(p == 0, 1.0, p))

    return -(_plogp(p1) + _plogp(p2))
def load_data_pre():
    """Load ``water.xlsx`` and print the information gain of each feature.

    Reads the spreadsheet from the relative path ``water.xlsx``, computes the
    entropy of the '好瓜' label column (rounded to 3 decimals), then prints
    the gain returned by ``get_gain`` for each feature column in turn.
    """
    frame = pd.read_excel('water.xlsx')
    label = frame['好瓜']

    total = label.count()
    positives = frame[label == '是']['编号'].count()
    negatives = frame[label == '否']['编号'].count()

    # Entropy of the label column Y.
    base_entropy = round(compute_h(positives / total, negatives / total), 3)

    # Columns noted for this sheet:
    # ['编号' '色泽' '根蒂' '敲声' '纹理' '脐部' '触感' '好瓜']
    # Each entry pairs a feature column with its exact output template.
    reports = (
        ('色泽', '色泽的信息增益: %.3f'),
        ('根蒂', '根蒂的信息增益: %.3f'),
        ('敲声', '敲声的信息增益: %.3f'),
        ('纹理', '纹理的信息增益:%.3f'),
        ('脐部', '脐部的信息增益:%.3f'),
        ('触感', '触感的增益信息:%.3f'),
    )
    for feature, template in reports:
        gain = get_gain(frame, base_entropy, feature, '好瓜')
        print(template % gain)
def get_gain(df, H, col_x, col_y):
    """Return the information gain of ``col_x`` with respect to ``col_y``.

    The gain is ``H`` minus the conditional entropy of ``col_y`` given
    ``col_x``, where each ``col_x`` value is weighted by its share of rows.

    Args:
        df: DataFrame containing ``col_x``, ``col_y`` and a '编号' column;
            rows are counted through the non-null '编号' values.
        H: Baseline entropy from which the conditional entropy is subtracted.
        col_x: Name of the feature column.
        col_y: Name of the label column.

    Returns:
        The information gain as a float.
    """
    # Row counts per (feature value, label) pair and per feature value.
    joint = df.groupby([col_x, col_y])['编号'].count().reset_index(name='count')
    marginal = df.groupby(col_x)['编号'].count().reset_index(name='count')

    # After the merge, 'count_x' is the pair count and 'count_y' is the
    # total for that feature value.
    merged = pd.merge(joint, marginal, on=col_x, how='outer')

    # Conditional probability of the label given the feature value.
    merged['rate'] = merged['count_x'] / merged['count_y']
    # Per-pair entropy term p * log2(p); negated below.
    merged['s'] = merged['rate'] * np.log2(merged['rate'])

    # Select columns with a list: tuple-style selection on a GroupBy is
    # rejected by current pandas releases.
    per_value = merged.groupby(col_x)[['s', 'count_x']].sum().reset_index()

    # Weight each feature value's entropy by its fraction of all rows.
    weights = per_value['count_x'] / per_value['count_x'].sum()
    conditional_entropy = -(per_value['s'] * weights).sum()

    return H - conditional_entropy
# Script entry point: run the report only when executed directly.
if __name__ == '__main__':
    load_data_pre()
# Input data file: water.xlsx