
AIpi-machine/decesiontree

This repository has not declared an open-source license (LICENSE); before use, check the project description and its upstream code dependencies.
decesiontree.py 4.15 KB
吴官骏 committed on 2021-12-21 01:45 · decesiontree
import operator
from math import log

import pandas as pd
def majorityCnt(classList):
    # Return the class that occurs most often in classList.
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
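# Illustration (not part of the original script): with three 'R' votes and
# two 'M' votes, the majority class is 'R':
#   majorityCnt(['R', 'R', 'R', 'M', 'M'])  # -> 'R'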
def calcShannonEnt(dataSet):  # compute the Shannon entropy of the data
    numEntries = len(dataSet)                         # number of rows
    labelCounts = dataSet.iloc[:, -1].value_counts()  # count each class in the last column (the class label)
    shannonEnt = 0.0
    for count in labelCounts:
        prob = count / numEntries       # probability of one class
        shannonEnt -= prob * log(prob, 2)  # accumulate -p * log2(p) over the classes
    return shannonEnt
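# Worked example (not in the original script): for five rows whose labels are
# three 'R' and two 'M',
#   H = -(3/5)*log2(3/5) - (2/5)*log2(2/5) ≈ 0.971
#   toy = pd.DataFrame({'f0': [1, 2, 3, 4, 5], 'label': list('RRRMM')})
#   calcShannonEnt(toy)  # -> 0.9709...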
def splitDataSet(dataSet, axis, value):
    # Split the data on one feature (column name `axis`): return the rows
    # where the feature is >= value, then the rows where it is < value.
    mask = dataSet[axis] >= value
    return dataSet[mask], dataSet[~mask]
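# Illustration (not in the original script): splitting the toy frame above
# at f0 >= 3 puts three rows in the first frame and two in the second:
#   hi, lo = splitDataSet(toy, 'f0', 3)
#   len(hi), len(lo)  # -> (3, 2)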
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet.columns) - 1  # the last column is the class label, not a feature
    baseEntropy = calcShannonEnt(dataSet)   # entropy of the unsplit data
    bestInfoGain = -1
    bestFeature = 0
    flag = 0
    count = 0
    for i in range(numFeatures):        # try every feature in turn
        featList = dataSet.iloc[:, i]
        uniqueVals = set(featList)      # every distinct value of this feature is a candidate threshold
        for value in uniqueVals:
            newEntropy = 0
            subDataSet, subDataSet2 = splitDataSet(dataSet, dataSet.columns[i], value)
            if len(subDataSet) == 0 or len(subDataSet2) == 0:
                continue                # the split left one side empty; skip it
            prob = len(subDataSet) / float(len(dataSet))
            prob1 = len(subDataSet2) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)    # weighted entropy after the split
            newEntropy += prob1 * calcShannonEnt(subDataSet2)
            infoGain = baseEntropy - newEntropy  # information gain: entropy before minus entropy after
            if infoGain > bestInfoGain:  # keep the split that reduces entropy the most
                bestInfoGain = infoGain
                bestFeature = i
                flag = value
            count += 1
    print(count, bestInfoGain)
    return bestFeature, flag
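# The quantity maximised above is the information gain of a binary split of
# data D into D1 (feature >= value) and D2 (feature < value):
#   Gain(D, f, value) = H(D) - |D1|/|D| * H(D1) - |D2|/|D| * H(D2)
# For the toy frame above, f0 >= 4 separates 'R' from 'M' perfectly, so the
# gain is the full 0.971 bits:
#   chooseBestFeatureToSplit(toy)  # -> (0, 4): feature index 0, threshold 4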
def dropDataSet(dataSet, axis, value):
    # Keep only the rows where the feature is < value (the complement of the
    # first frame returned by splitDataSet; unused in the script below).
    return dataSet[dataSet[axis] < value]
def TreeGenerate(dataSet, labels):
    classList = dataSet.iloc[:, -1].tolist()  # class column: 'R' or 'M'
    # If every row belongs to one class, return that class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # If only one feature (plus the label column) is left, return the majority class.
    if dataSet.shape[1] == 2:
        return majorityCnt(classList)
    bestFeat, flag = chooseBestFeatureToSplit(dataSet)  # best feature and its split threshold
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}, 'value': flag}
    # Split on the best feature first, then drop its column; temp (the
    # '>= value' branch) is discarded and only the '< value' branch is
    # expanded further, so the tree grows as a single chain.
    temp, dataSet = splitDataSet(dataSet, bestFeatLabel, flag)
    dataSet = dataSet.drop(columns=[bestFeatLabel])
    print(dataSet.shape)
    del labels[bestFeat]
    myTree[bestFeatLabel] = TreeGenerate(dataSet, labels)
    return myTree
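# Illustration (hypothetical shape, not actual output): because only the
# '< value' branch recurses, the result is a chain of nested dicts, e.g.
#   {'attr12': {'attr40': 'R', 'value': 0.18}, 'value': 0.35}
# where each 'value' key records the threshold tested at that node and the
# innermost entry is a leaf class; the attribute names here are made up.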
dataSet = pd.read_csv('sonar.csv')
dataSet = dataSet.head(5)  # use only the first 5 rows for a quick test run
pd.set_option('display.max_columns', 1000000)
pd.set_option('display.width', 10000)
labels = dataSet.columns.tolist()  # column names; the last one is the class label
Tree = TreeGenerate(dataSet, labels)
print(Tree)
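# Optional sanity check (not part of the original script; assumes
# scikit-learn is installed and that the last column of sonar.csv is the
# label): a depth-1 tree with the entropy criterion should pick a similar
# root feature to chooseBestFeatureToSplit. Thresholds will differ slightly,
# since scikit-learn splits at midpoints with '<=' while the code above
# splits at observed values with '>='.
#
#   from sklearn.tree import DecisionTreeClassifier
#   raw = pd.read_csv('sonar.csv')
#   X, y = raw.iloc[:, :-1], raw.iloc[:, -1]
#   clf = DecisionTreeClassifier(criterion='entropy', max_depth=1).fit(X, y)
#   print(clf.tree_.feature[0], clf.tree_.threshold[0])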