# (page-scrape artifact, not program text): "Code pull complete; the page will refresh automatically."
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from math import log
import operator
def calcShannonEnt(dataSet): # compute the Shannon entropy of the dataset
    """Return the Shannon entropy (in bits) of a dataset.

    The class label is taken from the last column of each row.

    Args:
        dataSet: 2-D numpy array; last column holds the class label.

    Returns:
        float: entropy in bits; 0 for an empty dataset.
    """
    numEntries = dataSet.shape[0]  # number of rows
    if numEntries == 0:
        return 0
    # Count every label dynamically instead of hard-coding {'R': 0, 'M': 0};
    # the original raised KeyError on any label other than 'R'/'M'.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # class label is the last field of the row
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries  # relative frequency of this class
        # A zero-probability class contributes 0 to the entropy; the original
        # returned 0 for the WHOLE sum here, which was only coincidentally
        # correct for exactly two classes.
        if prob > 0:
            shannonEnt -= prob * log(prob, 2)  # accumulate -p*log2(p)
    return shannonEnt
def createDataSet1(): # load the example dataset
    """Load the sonar dataset from 'sonar.csv'.

    Returns:
        tuple: (data as a 2-D numpy array, column names as a list).
    """
    # Widen pandas display limits so debug printing is not truncated.
    pd.set_option("display.max_columns", 1000000)
    pd.set_option('display.width', 10000)
    frame = pd.read_csv('sonar.csv')
    featureNames = frame.columns.tolist()  # feature names from the header row
    return frame.to_numpy(), featureNames
def classify(inputTree, featLabels, testVec):
    """Walk a decision tree and return the predicted class label.

    Args:
        inputTree: nested dict; the (first) key of each level names the
            feature to test, its value maps feature values to subtrees
            or leaf labels.
        featLabels: list of feature names, used to locate the feature's
            position in testVec.
        testVec: sequence of feature values for the sample to classify.

    Returns:
        The leaf value (class label) reached by the walk.
    """
    # BUG FIX: dict.keys() is a view in Python 3 and cannot be indexed;
    # next(iter(...)) fetches the first key portably.
    firstStr = next(iter(inputTree))
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # position of this feature in testVec
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        # Internal node: keep descending.
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        classLabel = valueOfFeat  # leaf: this is the class label
    return classLabel
def splitDataSet(dataSet, axis, value): # binary split on one feature column
    """Split rows on a threshold for the feature at column `axis`.

    Rows with row[axis] >= value go to the first returned array, the
    remaining rows to the second.

    Args:
        dataSet: 2-D numpy array of samples.
        axis: column index of the feature to threshold on.
        value: threshold value.

    Returns:
        tuple: (rows >= value, rows < value) as 2-D numpy arrays; an
        empty side comes back with shape (0, n_columns).
    """
    numCols = dataSet.shape[1]
    aboveRows = []
    belowRows = []
    for row in dataSet:
        if row[axis] >= value:
            aboveRows.append(row)
        else:
            belowRows.append(row)

    def _stack(rows):
        # Single vstack at the end instead of growing an array per row.
        return np.vstack(rows) if rows else np.empty(shape=(0, numCols))

    return _stack(aboveRows), _stack(belowRows)
def chooseBestFeatureToSplit(dataSet): # choose the best feature/threshold to split on
    """Pick the (feature, threshold) pair with the largest information gain.

    Every distinct value of each feature is tried as a binary threshold
    (>= value vs < value) and the split that reduces entropy the most
    is kept.

    Args:
        dataSet: 2-D numpy array, class label in the last column.

    Returns:
        tuple: (best feature index, best threshold value); (-1, None)
        when no split improves on the base entropy.
    """
    numFeatures = dataSet.shape[1]
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    # Continuous features use a threshold split, unlike the discrete ID3 form.
    bestInfoGain = 0
    bestFeature = -1
    bestflag = None  # BUG FIX: was referenced unbound when no gain was found
    for i in range(numFeatures - 1):  # last column is the label, skip it
        uniqueVals = set(example[i] for example in dataSet)  # candidate thresholds
        for value in uniqueVals:
            subDataSetup, subDataSetdown = splitDataSet(dataSet, i, value)
            prob = len(subDataSetup) / float(len(dataSet))
            # Weighted entropy of the two halves of this candidate split.
            newEntropy = prob * calcShannonEnt(subDataSetup)
            newEntropy += (1 - prob) * calcShannonEnt(subDataSetdown)
            # BUG FIX: the gain must be evaluated PER threshold; the original
            # computed infoGain after this loop, so only the last threshold of
            # each feature was ever considered, and bestflag ended up holding
            # the last iterated value rather than the best one.
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
                bestflag = value
        print("bestInfoGain is:{}".format(bestInfoGain))
    return bestFeature, bestflag
def majorityCnt(classList): # majority vote, e.g. 2 male / 1 female -> male
    """Return the most frequent label in classList.

    Ties are broken in favour of the label seen first.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first key reaching the maximum count, which matches
    # the stable reverse-sort tie-breaking of the original.
    return max(tally, key=tally.get)
def purity(classList):
    """Return the minimum of count/running-total over the label counts.

    NOTE(review): the denominator is a RUNNING total accumulated while
    iterating the count dict, not the grand total, so the result depends
    on label insertion order. Behavior preserved exactly as written —
    confirm the intent before relying on it (this function is not called
    anywhere in this file). Returns 1000000000 for an empty list.
    """
    counts = {}
    for label in classList:
        counts[label] = counts.get(label, 0) + 1
    smallest = 1000000000
    runningTotal = 0
    for label in counts:
        runningTotal += counts[label]
        ratio = counts[label] / runningTotal
        if ratio < smallest:
            smallest = ratio
    return smallest
def createTree(dataSet, labels):
    """Recursively build a decision tree over a continuous-feature dataset.

    Args:
        dataSet: 2-D numpy array, class label in the last column.
        labels: list of feature names; mutated (used feature removed)
            as the tree grows.

    Returns:
        A class label (leaf) or a nested dict (internal node) keyed by
        the chosen feature name and its split threshold.
    """
    classList = [example[-1] for example in dataSet]  # class labels, e.g. 'R' or 'M'
    # All rows share one class: the branch is pure, return that class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only one feature column left (plus the label): fall back to majority vote.
    if dataSet.shape[1] == 2:
        return majorityCnt(classList)
    bestFeat, bestflag = chooseBestFeatureToSplit(dataSet)
    # Robustness: no split improved the entropy — stop with a majority vote
    # instead of indexing with -1 (the original would misbehave here).
    if bestFeat == -1:
        return majorityCnt(classList)
    subDataSetup, subDataSetdown = splitDataSet(dataSet, bestFeat, bestflag)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}, bestflag: {}}  # tree stored as a nested dict
    del(labels[bestFeat])
    # BUG FIX: the original assigned the column-deleted ">=" subset to a
    # misspelled name (subdataSetup) and then recursed on the UNdeleted array,
    # so the used feature column was never removed from that branch.
    subDataSetup = np.delete(subDataSetup, bestFeat, axis=1)
    subDataSetdown = np.delete(subDataSetdown, bestFeat, axis=1)
    # BUG FIX: each recursive call gets its OWN copy of the label list; the
    # original passed the same list object to both branches, so deletions
    # made while building one branch corrupted the other's feature indexing.
    myTree[bestflag] = createTree(subDataSetup, labels[:])
    myTree[bestFeatLabel] = createTree(subDataSetdown, labels[:])
    return myTree
if __name__ == '__main__':
    # Load the example dataset and print the learned decision-tree model.
    sonarData, featureNames = createDataSet1()
    tree = createTree(sonarData, featureNames)
    print(tree)
# (page-scrape artifact, not program text): the hosting site appended a content-moderation
# notice here — "Content that may be unsuitable for display is not shown on this page. If you
# confirm the content contains nothing inappropriate or unlawful, you may submit an appeal."