Decision trees: computing entropy, the Gini index, and the misclassification error
A decision tree is a classic classification algorithm that builds a hierarchical tree by repeatedly splitting the data on attribute values. A key question in the algorithm is which attribute to split on first; this is usually decided by the gain of an impurity measure. The three commonly used measures are listed below:
(1) Entropy

Entropy(t) = − Σ_j p(j|t) · log2 p(j|t)

where p(j|t) is the relative frequency of class j at node t.

(2) Gini index

Gini(t) = 1 − Σ_j [p(j|t)]²

(3) Misclassification error

Error(t) = 1 − max_j p(j|t)
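As a quick illustration (a worked example added here, not taken from the original text): for a node that contains two samples of one class and three of the other, so p = (0.4, 0.6),

Entropy = −(0.4·log2 0.4 + 0.6·log2 0.6) ≈ 0.971
Gini = 1 − (0.4² + 0.6²) = 0.48
Error = 1 − max(0.4, 0.6) = 0.4

All three measures are 0 for a pure node and, in the two-class case, reach their maximum at the uniform distribution (0.5, 0.5).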
Using the definitions of the three measures above, complete the relevant code in DecisionTree.py and classify the given test data with each of the three measures; debug the program until it runs and produces results.
(1) Implement the entropy calculation function according to the definition of entropy.
(2) Implement the Gini index calculation function according to its definition.
(3) Implement the misclassification error calculation function according to its definition.
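For reference, the splitting criterion that _chooseBestFeatureToSplit implements below is the impurity gain: the impurity of the parent node minus the sample-weighted impurity of the child nodes produced by the split,

Gain = I(parent) − Σ_k (N_k / N) · I(child_k)

where I is any one of the three measures above, N is the number of samples at the parent node, and N_k is the number of samples routed to child k. At each node, the feature with the largest gain is chosen.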
# -*- coding: utf-8 -*-
import numpy as np


class DecisionTree:
    def __init__(self, chooseMethod):
        self._tree = None
        self._chooseMethod = chooseMethod
    # entropy of a label array
    def _calcEntropy(self, y):
        # ------- complete the entropy calculation here -------
        num = y.shape[0]
        labelCounts = {}
        for label in y:  # count the occurrences of each label (e.g. 'yes' / 'no')
            if label not in labelCounts.keys():
                labelCounts[label] = 0
            labelCounts[label] += 1
        # entropy = -sum(p * log2(p)) over all classes
        entropy = 0.0
        for key in labelCounts:
            prob = float(labelCounts[key]) / num
            entropy -= prob * np.log2(prob)
        return entropy
    # Gini index of a label array
    def _calcGini(self, y):
        # ------- complete the Gini index calculation here -------
        num = y.shape[0]
        labelCounts = {}
        for label in y:  # count the occurrences of each label
            if label not in labelCounts.keys():
                labelCounts[label] = 0
            labelCounts[label] += 1
        # Gini = 1 - sum(p^2) over all classes
        p_sum = 0.0
        for key in labelCounts:
            prob = float(labelCounts[key]) / num
            p_sum += prob ** 2
        gini = 1 - p_sum
        return gini
    # misclassification error of a label array
    def _calcErrorRate(self, y):
        # ------- complete the misclassification error calculation here -------
        num = y.shape[0]
        labelCounts = {}
        for label in y:  # count the occurrences of each label
            if label not in labelCounts.keys():
                labelCounts[label] = 0
            labelCounts[label] += 1
        # error = 1 - max(p) over all classes
        prob_list = []
        for key in labelCounts:
            prob = float(labelCounts[key]) / num
            prob_list.append(prob)
        error = 1 - max(prob_list)
        return error
    # return the samples whose feature `index` equals `value`, with that column removed
    def _splitDataSet(self, X, y, index, value):
        ret = []
        featVec = X[:, index]
        X = X[:, [i for i in range(X.shape[1]) if i != index]]
        for i in range(len(featVec)):
            if featVec[i] == value:
                ret.append(i)
        return X[ret, :], y[ret]
    # choose the feature whose split gives the largest impurity gain
    def _chooseBestFeatureToSplit(self, X, y):
        numFeatures = X.shape[1]
        oldIndex = self._chooseMethod(self, y)  # impurity of the parent node
        bestGain = 0.0
        bestFeatureIndex = -1
        for i in range(numFeatures):
            featList = X[:, i]
            uniqueVals = set(featList)
            newIndex = 0.0
            for value in uniqueVals:  # sample-weighted impurity of the child nodes
                sub_X, sub_y = self._splitDataSet(X, y, i, value)
                prob = len(sub_y) / float(len(y))
                newIndex += prob * self._chooseMethod(self, sub_y)
            indexGain = oldIndex - newIndex
            if indexGain > bestGain:
                bestGain = indexGain
                bestFeatureIndex = i
        return bestFeatureIndex
    # return the most frequent label (used when no features are left to split on)
    def _majorityCnt(self, labelList):
        labelCount = {}
        for vote in labelList:
            if vote not in labelCount.keys():
                labelCount[vote] = 0
            labelCount[vote] += 1
        # iteritems() is Python 2 only; items() works in Python 3
        sortedClassCount = sorted(labelCount.items(), key=lambda x: x[1], reverse=True)
        return sortedClassCount[0][0]
    # recursively build the tree as nested dicts: {feature_name: {feature_value: subtree}}
    def _createTree(self, X, y, featureIndex):
        labelList = list(y)
        # stop when all samples share the same label
        if labelList.count(labelList[0]) == len(labelList):
            return labelList[0]
        # stop when no features are left; fall back to the majority label
        if len(featureIndex) == 0:
            return self._majorityCnt(labelList)
        bestFeatIndex = self._chooseBestFeatureToSplit(X, y)
        bestFeatStr = featureIndex[bestFeatIndex]
        featureIndex = list(featureIndex)
        featureIndex.remove(bestFeatStr)
        featureIndex = tuple(featureIndex)
        myTree = {bestFeatStr: {}}
        featValues = X[:, bestFeatIndex]
        uniqueVals = set(featValues)
        for value in uniqueVals:
            sub_X, sub_y = self._splitDataSet(X, y, bestFeatIndex, value)
            myTree[bestFeatStr][value] = self._createTree(sub_X, sub_y, featureIndex)
        return myTree
    def fit(self, X, y):
        # accept any array-like input; convert to numpy arrays
        if isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
            pass
        else:
            try:
                X = np.array(X)
                y = np.array(y)
            except Exception:
                raise TypeError("numpy.ndarray required for X,y")
        # feature names 'x0', 'x1', ... record the original column indices
        featureIndex = tuple(['x' + str(i) for i in range(X.shape[1])])
        self._tree = self._createTree(X, y, featureIndex)
        return self
    def predict(self, X):
        if isinstance(X, np.ndarray):
            pass
        else:
            try:
                X = np.array(X)
            except Exception:
                raise TypeError("numpy.ndarray required for X")

        # walk the nested dict until a leaf (a class label) is reached
        def _classify(tree, sample):
            featIndex = list(tree.keys())[0]      # e.g. 'x2'
            secondDict = tree[featIndex]
            key = sample[int(featIndex[1:])]      # the sample's value for that feature
            valueOfkey = secondDict[key]
            if isinstance(valueOfkey, dict):
                label = _classify(valueOfkey, sample)
            else:
                label = valueOfkey
            return label

        if len(X.shape) == 1:                     # a single sample
            return _classify(self._tree, X)
        else:                                     # a matrix of samples
            results = []
            for i in range(X.shape[0]):
                results.append(_classify(self._tree, X[i]))
            return np.array(results)
# test data: 5 samples with 5 discrete features each
X = [[1, 2, 0, 1, 0],
     [0, 1, 1, 0, 1],
     [1, 0, 0, 0, 1],
     [2, 1, 1, 0, 1],
     [1, 1, 0, 1, 1]]
y = ['yes', 'yes', 'no', 'no', 'no']

# build a tree with each of the three measures and classify the training data
clf = DecisionTree(chooseMethod=DecisionTree._calcEntropy)
clf.fit(X, y)
print(clf.predict(X))

clf = DecisionTree(chooseMethod=DecisionTree._calcGini)
clf.fit(X, y)
print(clf.predict(X))

clf = DecisionTree(chooseMethod=DecisionTree._calcErrorRate)
clf.fit(X, y)
print(clf.predict(X))
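The three impurity functions can also be checked in isolation. The following lines are a minimal sketch added for verification (not part of the original exercise; they assume the class definition above is in the same file). The labels contain two 'yes' and three 'no', so the expected values follow directly from the worked example given after the definitions.

# verify the three measures on the training labels (2 x 'yes', 3 x 'no'); illustrative check, not in the original
y_arr = np.array(y)
checker = DecisionTree(chooseMethod=DecisionTree._calcEntropy)
print(checker._calcEntropy(y_arr))     # expected ≈ 0.971
print(checker._calcGini(y_arr))        # expected 0.48
print(checker._calcErrorRate(y_arr))   # expected 0.4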