Computing Entropy, the Gini Index, and the Misclassification Rate for Decision Trees in Python


A decision tree is a classic classification algorithm: it builds a hierarchical tree by splitting the data on attribute values, level by level. A key question when growing the tree is which attribute to split on first; this is usually decided by the gain of an impurity measure, i.e. how much the measure drops after a split. Three commonly used measures are listed below:
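
For a candidate split, the gain is the drop in the chosen measure $I$ from the parent node to the weighted average over its children; this standard formulation is what `_chooseBestFeatureToSplit` computes in the code below:

$$\text{Gain} = I(\text{parent}) - \sum_{v} \frac{N_v}{N}\, I(\text{child}_v)$$

where $N_v$ is the number of samples falling into child $v$ and $N$ the number at the parent.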

(1) Entropy

$$\text{Entropy}(t) = -\sum_{j} p(j \mid t)\,\log_2 p(j \mid t)$$

where $p(j \mid t)$ is the relative frequency of class $j$ at node $t$.
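
For example, for the test labels used later, y = ['yes','yes','no','no','no'], we have $p(\text{yes}) = 2/5$ and $p(\text{no}) = 3/5$, so $\text{Entropy} = -\tfrac{2}{5}\log_2\tfrac{2}{5} - \tfrac{3}{5}\log_2\tfrac{3}{5} \approx 0.971$.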

(2) Gini index

$$\text{Gini}(t) = 1 - \sum_{j} \big[\, p(j \mid t) \,\big]^2$$
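
With the same example labels: $\text{Gini} = 1 - (2/5)^2 - (3/5)^2 = 1 - 0.16 - 0.36 = 0.48$.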

(3) Misclassification rate

$$\text{Error}(t) = 1 - \max_{j}\, p(j \mid t)$$
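
With the same example labels: $\text{Error} = 1 - \max(2/5,\; 3/5) = 0.4$.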

Based on the definitions of the three measures above, complete the corresponding code in DecisionTree.py, use each of the three measures to classify the given test data, and debug until the script runs and produces results.

(1) Implement the entropy calculation function according to its definition.
(2) Implement the Gini index calculation function according to its definition.
(3) Implement the misclassification rate calculation function according to its definition.

# -*- coding: utf-8 -*-


import numpy as np

class DecisionTree:
    
    def __init__(self, chooseMethod):
        self._tree = None
        self._chooseMethod = chooseMethod
    
    # Compute the entropy of a label vector
    def _calcEntropy(self,y):
        #------------------- entropy calculation function ---------------------------------
        num = y.shape[0]
        labelCounts = {}
        for label in y:
            if label not in labelCounts:   # count occurrences of each label ('yes'/'no')
                labelCounts[label] = 0
            labelCounts[label] += 1
        # Entropy = -sum(p * log2(p)) over all class labels
        entropy = 0.0
        for key in labelCounts:
            prob = float(labelCounts[key]) / num
            entropy -= prob * np.log2(prob)
        return entropy
    
    # Compute the Gini index of a label vector
    def _calcGini(self,y):
        #------------------- Gini index calculation function ---------------------------------
        num = y.shape[0]
        labelCounts = {}
        for label in y:
            if label not in labelCounts:   # count occurrences of each label ('yes'/'no')
                labelCounts[label] = 0
            labelCounts[label] += 1
        # Gini = 1 - sum(p^2) over all class labels
        p_sum = 0.0
        for key in labelCounts:
            prob = float(labelCounts[key]) / num
            p_sum += prob ** 2
        gini = 1 - p_sum
        return gini

    # Compute the misclassification rate of a label vector
    def _calcErrorRate(self,y):
        #------------------- misclassification rate calculation function ---------------------------------
        num = y.shape[0]
        labelCounts = {}
        for label in y:
            if label not in labelCounts:   # count occurrences of each label ('yes'/'no')
                labelCounts[label] = 0
            labelCounts[label] += 1
        # Error = 1 - max(p) over all class labels
        prob_list = []
        for key in labelCounts:
            prob = float(labelCounts[key]) / num
            prob_list.append(prob)
        error = 1 - max(prob_list)
        return error

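    # Note: the three measures above share the same label-frequency loop; a
    # hypothetical refactor (not part of the original assignment) could compute
    # the probabilities once with numpy, e.g.:
    #     _, counts = np.unique(y, return_counts=True)
    #     probs = counts / counts.sum()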
    def _splitDataSet(self,X,y,index,value):        
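        # Keep the rows where feature `index` equals `value`, and drop that
        # feature column from the returned X.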
        ret = []
        featVec = X[:,index]
        X = X[:,[i for i in range(X.shape[1]) if i!=index]]
        for i in range(len(featVec)):
            if featVec[i]==value:
                ret.append(i)
        return X[ret,:],y[ret]
    
    
    def _chooseBestFeatureToSplit(self,X,y):       
        numFeatures = X.shape[1]
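        # _chooseMethod holds a plain (unbound) function object such as
        # DecisionTree._calcEntropy, so self is passed explicitly here.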
        oldIndex = self._chooseMethod(self, y)
        bestGain = 0.0
        bestFeatureIndex = -1        
        for i in range(numFeatures):        
            featList = X[:,i]
            uniqueVals = set(featList)
            newIndex = 0.0
            for value in uniqueVals:
                sub_X,sub_y = self._splitDataSet(X,y,i,value)
                prob = len(sub_y)/float(len(y))
                newIndex += prob * self._chooseMethod(self, sub_y)              
            indexGain = oldIndex - newIndex
            if (indexGain > bestGain):
                bestGain = indexGain
                bestFeatureIndex = i
        return bestFeatureIndex
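    # Caveat: if no candidate split strictly improves the measure (which can
    # happen, especially with the misclassification rate), bestFeatureIndex
    # stays -1 and _createTree will index the *last* remaining feature; a more
    # defensive version might fall back to _majorityCnt in that case.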
          
    def _majorityCnt(self,labelList):
        # Return the most frequent label (used when no features are left)
        labelCount = {}
        for vote in labelList:
            if vote not in labelCount:
                labelCount[vote] = 0
            labelCount[vote] += 1
        sortedClassCount = sorted(labelCount.items(), key=lambda x: x[1], reverse=True)
        return sortedClassCount[0][0]
    
    
    def _createTree(self,X,y,featureIndex):
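        # Recursively grow the tree; featureIndex maps the remaining columns
        # of X back to their original feature names ('x0', 'x1', ...).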
        labelList = list(y)        
        if labelList.count(labelList[0]) == len(labelList): 
            return labelList[0]        
        if len(featureIndex) == 0:
            return self._majorityCnt(labelList)        
        
        bestFeatIndex = self._chooseBestFeatureToSplit(X,y) 
        bestFeatStr = featureIndex[bestFeatIndex]
        featureIndex = list(featureIndex)
        featureIndex.remove(bestFeatStr)
        featureIndex = tuple(featureIndex)        
        myTree = {bestFeatStr:{}}
        featValues = X[:,bestFeatIndex]
        uniqueVals = set(featValues)
        for value in uniqueVals:            
            sub_X,sub_y = self._splitDataSet(X,y, bestFeatIndex, value)
            myTree[bestFeatStr][value] = self._createTree(sub_X,sub_y,featureIndex)
        return myTree  
    
    def fit(self,X,y):
        if not (isinstance(X,np.ndarray) and isinstance(y,np.ndarray)):
            try:
                X = np.array(X)
                y = np.array(y)
            except Exception:
                raise TypeError("numpy.ndarray required for X,y")
        
        featureIndex = tuple(['x'+str(i) for i in range(X.shape[1])])
        self._tree = self._createTree(X,y,featureIndex)
        return self    

    def predict(self,X):
        if not isinstance(X,np.ndarray):
            try:
                X = np.array(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")
        
        def _classify(tree,sample):
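            # Walk the nested dict: each internal node is keyed by a feature
            # name 'xN'; strip the leading 'x' to recover the column index.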
            featIndex = list(tree.keys())[0]
            secondDict = tree[featIndex]
            key = sample[int(featIndex[1:])]
            valueOfkey = secondDict[key]
            if isinstance(valueOfkey, dict): 
                label = _classify(valueOfkey,sample)
            else: label = valueOfkey
            return label
            
        if len(X.shape)==1:
            return _classify(self._tree,X)
        else:   
            results = []
            for i in range(X.shape[0]):
                results.append(_classify(self._tree,X[i]))
            return np.array(results)

X = [[1, 2, 0, 1, 0],
     [0, 1, 1, 0, 1],
     [1, 0, 0, 0, 1],
     [2, 1, 1, 0, 1],
     [1, 1, 0, 1, 1]]
y = ['yes','yes','no','no','no']


clf = DecisionTree(chooseMethod = DecisionTree._calcEntropy)
clf.fit(X,y)
print(clf.predict(X))   

clf = DecisionTree(chooseMethod = DecisionTree._calcGini)
clf.fit(X,y)
print(clf.predict(X))   

clf = DecisionTree(chooseMethod = DecisionTree._calcErrorRate)
clf.fit(X,y)
print(clf.predict(X))
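
# Example: classifying a single new sample (a sketch; every feature value must
# have appeared in the training data, otherwise _classify raises a KeyError).
# [1, 0, 0, 0, 1] is the third training row, so a tree that fits the training
# data exactly should return 'no'.
print(clf.predict([1, 0, 0, 0, 1]))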