python计算信息增益

346 阅读1分钟
import pandas as pd
import numpy as np


def compute_h(p1, p2):
    """Return the binary Shannon entropy (base 2) of the distribution (p1, p2).

    Computes ``-(p1*log2(p1) + p2*log2(p2))``. A probability of exactly 0
    contributes 0 to the sum (the usual ``0 * log 0 = 0`` convention), so a
    pure distribution such as ``(1, 0)`` yields 0 instead of nan.

    Args:
        p1: Probability of the first outcome (scalar or array-like).
        p2: Probability of the second outcome (scalar or array-like).

    Returns:
        The entropy in bits. Negative or nan inputs still propagate nan.
    """
    H = 0.0
    for p in (p1, p2):
        p = np.asarray(p, dtype=float)
        # log2(0) is -inf and 0 * -inf is nan; substitute 1 so the term is
        # 0 * log2(1) == 0 for zero probabilities only.
        safe_p = np.where(p == 0, 1.0, p)
        H = H - p * np.log2(safe_p)

    return H




def load_data_pre():
    """Load 'water.xlsx' and print the information gain of each feature.

    Reads the spreadsheet, computes the entropy of the '好瓜' label column
    (rounded to 3 decimals), then prints the information gain of each
    feature column with respect to that label, one line per feature.
    """
    label_col = '好瓜'
    melons = pd.read_excel('water.xlsx')

    # Class counts: all labelled rows, then the '是' and '否' subsets.
    total = melons[label_col].count()
    positives = melons[melons[label_col] == '是']['编号'].count()
    negatives = melons[melons[label_col] == '否']['编号'].count()

    # Entropy of the label column.
    base_entropy = round(compute_h(positives / total, negatives / total), 3)

    # Columns referenced from the sheet:
    # ['编号' '色泽' '根蒂' '敲声' '纹理' '脐部' '触感' '好瓜']
    # Each entry pairs a feature column with the exact line format to print.
    reports = (
        ('色泽', '色泽的信息增益: %.3f'),
        ('根蒂', '根蒂的信息增益: %.3f'),
        ('敲声', '敲声的信息增益: %.3f'),
        ('纹理', '纹理的信息增益:%.3f'),
        ('脐部', '脐部的信息增益:%.3f'),
        ('触感', '触感的增益信息:%.3f'),
    )
    for feature, template in reports:
        gain = get_gain(melons, base_entropy, feature, label_col)
        print(template % gain)


def get_gain(df, H, col_x, col_y):
    """Return the information gain of feature ``col_x`` w.r.t. label ``col_y``.

    The gain is ``H`` minus the conditional entropy of ``col_y`` given
    ``col_x``, where each feature value's entropy is weighted by the share
    of rows having that value.

    Args:
        df: DataFrame containing ``col_x``, ``col_y`` and an '编号' column;
            row counts are taken as the non-null count of '编号'.
        H: Entropy of the label column, as computed by the caller.
        col_x: Name of the feature column.
        col_y: Name of the label column.

    Returns:
        The information gain as a float.
    """
    # Rows per (feature value, label) pair, and rows per feature value.
    joint = df.groupby([col_x, col_y])['编号'].count().reset_index(name='count')
    marginal = df.groupby(col_x)['编号'].count().reset_index(name='count')

    # After the merge 'count_x' is the joint count, 'count_y' the per-value total.
    merged = pd.merge(joint, marginal, on=col_x, how='outer')

    # P(label | feature value) and its p * log2(p) term.
    merged['rate'] = merged['count_x'] / merged['count_y']
    merged['s'] = merged['rate'] * np.log2(merged['rate'])

    # Select columns with a list: tuple-style selection on a GroupBy
    # (['s', 'count_x'] without the extra brackets) raises in pandas >= 2.0.
    per_value = merged.groupby(col_x)[['s', 'count_x']].sum().reset_index()
    weights = per_value['count_x'] / per_value['count_x'].sum()
    conditional_entropy = -(per_value['s'] * weights)

    return H - conditional_entropy.sum()






# Script entry point: run the information-gain report only when this file is
# executed directly, not when it is imported.
if __name__=='__main__':

    load_data_pre()

water.xlsx