# (page-scrape artifact, not program text): "Code pull complete; the page will refresh automatically."
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from math import log
import operator
def calcShannonEnt(dataSet): # compute the Shannon entropy of the dataset
    """Return the Shannon entropy (in bits) of a dataset.

    The class label is taken from the last column of each row.

    Args:
        dataSet: 2-D numpy array; last column holds the class label.

    Returns:
        float: entropy in bits; 0 for an empty dataset.
    """
    numEntries = dataSet.shape[0]  # number of rows
    if numEntries == 0:
        return 0
    # Count every label dynamically instead of hard-coding {'R': 0, 'M': 0};
    # the original raised KeyError on any label other than 'R'/'M'.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # class label is the last field of the row
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries  # relative frequency of this class
        # A zero-probability class contributes 0 to the entropy; the original
        # returned 0 for the WHOLE sum here, which was only coincidentally
        # correct for exactly two classes.
        if prob > 0:
            shannonEnt -= prob * log(prob, 2)  # accumulate -p*log2(p)
    return shannonEnt
def createDataSet1(): # load the example dataset
    """Load the sonar dataset from 'sonar.csv'.

    Returns:
        tuple: (data as a 2-D numpy array, column names as a list).
    """
    # Widen pandas display limits so debug printing is not truncated.
    pd.set_option("display.max_columns", 1000000)
    pd.set_option('display.width', 10000)
    frame = pd.read_csv('sonar.csv')
    featureNames = frame.columns.tolist()  # feature names from the header row
    return frame.to_numpy(), featureNames
def classify(inputTree, featLabels, testVec):
    """Walk a decision tree and return the predicted class label.

    Args:
        inputTree: nested dict; the (first) key of each level names the
            feature to test, its value maps feature values to subtrees
            or leaf labels.
        featLabels: list of feature names, used to locate the feature's
            position in testVec.
        testVec: sequence of feature values for the sample to classify.

    Returns:
        The leaf value (class label) reached by the walk.
    """
    # BUG FIX: dict.keys() is a view in Python 3 and cannot be indexed;
    # next(iter(...)) fetches the first key portably.
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # position of this feature in testVec
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        # Internal node: keep descending.
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat  # leaf: this is the class label
    return classLabel
def splitDataSet(dataSet, axis, value): # binary split on one feature column
    """Split rows on a threshold for the feature at column `axis`.

    Rows with row[axis] >= value go to the first returned array, the
    remaining rows to the second.

    Args:
        dataSet: 2-D numpy array of samples.
        axis: column index of the feature to threshold on.
        value: threshold value.

    Returns:
        tuple: (rows >= value, rows < value) as 2-D numpy arrays; an
        empty side comes back with shape (0, n_columns).
    """
    numCols = dataSet.shape[1]
    aboveRows = []
    belowRows = []
    for row in dataSet:
        if row[axis] >= value:
            aboveRows.append(row)
        else:
            belowRows.append(row)

    def _stack(rows):
        # Single vstack at the end instead of growing an array per row.
        return np.vstack(rows) if rows else np.empty(shape=(0, numCols))

    return _stack(aboveRows), _stack(belowRows)
def chooseBestFeatureToSplit(dataSet): # choose the best feature/threshold to split on
    """Pick the (feature, threshold) pair with the largest information gain.

    Every distinct value of each feature is tried as a binary threshold
    (>= value vs < value) and the split that reduces entropy the most
    is kept.

    Args:
        dataSet: 2-D numpy array, class label in the last column.

    Returns:
        tuple: (best feature index, best threshold value); (-1, None)
        when no split improves on the base entropy.
    """
    numFeatures = dataSet.shape[1]
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    # Continuous features use a threshold split, unlike the discrete ID3 form.
    bestInfoGain = 0
    bestFeature = -1
    bestflag = None  # BUG FIX: was referenced unbound when no gain was found
    for i in range(numFeatures - 1):  # last column is the label, skip it
        uniqueVals = set(example[i] for example in dataSet)  # candidate thresholds
        for value in uniqueVals:
            subDataSetup, subDataSetdown = splitDataSet(dataSet, i, value)
            prob = len(subDataSetup) / float(len(dataSet))
            # Weighted entropy of the two halves of this candidate split.
            newEntropy = prob * calcShannonEnt(subDataSetup)
            newEntropy += (1 - prob) * calcShannonEnt(subDataSetdown)
            # BUG FIX: the gain must be evaluated PER threshold; the original
            # computed infoGain after this loop, so only the last threshold of
            # each feature was ever considered, and bestflag ended up holding
            # the last iterated value rather than the best one.
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
                bestflag = value
        print("bestInfoGain is:{}".format(bestInfoGain))
    return bestFeature, bestflag
def majorityCnt(classList): # majority vote, e.g. 2 male / 1 female -> male
    """Return the most frequent label in classList.

    Ties are broken in favour of the label seen first.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first key reaching the maximum count, which matches
    # the stable reverse-sort tie-breaking of the original.
    return max(tally, key=tally.get)
def purity(classList):
    """Return the minimum of count/running-total over the label counts.

    NOTE(review): the denominator is a RUNNING total accumulated while
    iterating the count dict, not the grand total, so the result depends
    on label insertion order. Behavior preserved exactly as written —
    confirm the intent before relying on it (this function is not called
    anywhere in this file). Returns 1000000000 for an empty list.
    """
    counts = {}
    for label in classList:
        counts[label] = counts.get(label, 0) + 1
    smallest = 1000000000
    runningTotal = 0
    for label in counts:
        runningTotal += counts[label]
        ratio = counts[label] / runningTotal
        if ratio < smallest:
            smallest = ratio
    return smallest
def createTree(dataSet, labels):
    """Recursively build a decision tree over a continuous-feature dataset.

    Args:
        dataSet: 2-D numpy array, class label in the last column.
        labels: list of feature names; mutated (used feature removed)
            as the tree grows.

    Returns:
        A class label (leaf) or a nested dict (internal node) keyed by
        the chosen feature name and its split threshold.
    """
    classList = [example[-1] for example in dataSet]  # class labels, e.g. 'R' or 'M'
    # All rows share one class: the branch is pure, return that class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only one feature column left (plus the label): fall back to majority vote.
    if dataSet.shape[1] == 2:
        return majorityCnt(classList)
    bestFeat, bestflag = chooseBestFeatureToSplit(dataSet)
    # Robustness: no split improved the entropy — stop with a majority vote
    # instead of indexing with -1 (the original would misbehave here).
    if bestFeat == -1:
        return majorityCnt(classList)
    subDataSetup, subDataSetdown = splitDataSet(dataSet, bestFeat, bestflag)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}, bestflag: {}}  # tree stored as a nested dict
    del(labels[bestFeat])
    # BUG FIX: the original assigned the column-deleted ">=" subset to a
    # misspelled name (subdataSetup) and then recursed on the UNdeleted array,
    # so the used feature column was never removed from that branch.
    subDataSetup = np.delete(subDataSetup, bestFeat, axis=1)
    subDataSetdown = np.delete(subDataSetdown, bestFeat, axis=1)
    # BUG FIX: each recursive call gets its OWN copy of the label list; the
    # original passed the same list object to both branches, so deletions
    # made while building one branch corrupted the other's feature indexing.
    myTree[bestflag] = createTree(subDataSetup, labels[:])
    myTree[bestFeatLabel] = createTree(subDataSetdown, labels[:])
    return myTree
if __name__ == '__main__':
    # Load the example dataset and print the learned decision-tree model.
    sonarData, featureNames = createDataSet1()
    tree = createTree(sonarData, featureNames)
    print(tree)
# (page-scrape artifact, not program text): the hosting site appended a content-moderation
# notice here — "Content that may be unsuitable for display is not shown on this page. If you
# confirm the content contains nothing inappropriate or unlawful, you may submit an appeal."