User talk:Yuu B313


 * Numbered list item
 * Bulleted list item
 * Bulleted list item
 * Bulleted list item
 * Bulleted list item
 * Bulleted list item

content
# coding=utf-8
# 无需言，做自己
import operator
import pickle
from math import log

import matplotlib.pyplot as plt
 * 1) __author__=Eshter Yuu
 * 2) python3.5

计算香农熵

# Split the data set.
def splitDataSet(dataSet, axis, value):
    """Return the rows of ``dataSet`` whose column ``axis`` equals ``value``.

    The matched column is removed from each returned row. New row lists are
    built, so the rows of ``dataSet`` are left unmodified.

    Args:
        dataSet: list of rows, each row a list of values.
        axis: index of the column to test and drop.
        value: value the column must equal for a row to be kept.

    Returns:
        A new list of reduced rows (empty when nothing matches).
    """
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the column with the largest information gain.

    Every column except the last one is considered a candidate (createTree
    reads the last column as the class value). For each candidate the data
    set is partitioned by the distinct values of that column and the
    weighted sum of the partitions' entropies is subtracted from the entropy
    of the whole set.

    NOTE(review): relies on ``calShannonEntries``, which is not defined in
    the visible part of this file -- confirm it is available at run time.

    Args:
        dataSet: non-empty list of rows, each row a list of values.

    Returns:
        The index of the best column, or -1 when no column yields a gain
        strictly greater than 0.0.
    """
    numFeatures = len(dataSet[0]) - 1  # number of candidate columns
    baseEntropy = calShannonEntries(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = {example[i] for example in dataSet}
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calShannonEntries(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
 * 1) 选择最好的数据集进行划分

def loadDataSet(filename):
    """Read a tab-separated text file into a list of rows.

    Each line is stripped of surrounding whitespace and split on tab
    characters; every field is kept as a string. Lines that are empty after
    stripping are skipped so that a trailing blank line does not produce a
    bogus ``['']`` row.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one list of string fields per non-blank line.
    """
    dataMat = []
    # The context manager closes the file even if reading fails.
    with open(filename) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                continue
            dataMat.append(stripped.split('\t'))
    return dataMat
 * 1) 加载数据集

def majortyCnt(classList):
    """Return the value that occurs most often in ``classList``.

    Args:
        classList: non-empty iterable of hashable values.

    Returns:
        The value with the highest count. When several values share the
        highest count, the one that comes first in the count dict's
        iteration order is returned.

    Raises:
        IndexError: if ``classList`` is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
 * 1) 多数表决，投票

def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    The last value of every row is read as its class. The result is either
    a class value (leaf) or ``{label: {feature_value: subtree_or_class}}``.

    Unlike the original, ``labels`` is not modified: the label of the chosen
    column is left out of a new list that is passed to the recursive calls.

    Args:
        dataSet: non-empty list of rows, each row a list of values.
        labels: list of column names, indexed like the columns of a row.

    Returns:
        A nested dict describing the tree, or a single class value.
    """
    classList = [example[-1] for example in dataSet]
    # All rows share one class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majortyCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    # -1 means no column gave a positive gain; indexing labels[-1] would
    # split on the wrong column, so vote instead.
    if bestFeat < 0:
        return majortyCnt(classList)
    bestFeatLabel = labels[bestFeat]
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    uniqueVals = {example[bestFeat] for example in dataSet}
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
 * 构建树

# Text-box and arrow styles used when drawing tree nodes.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")


def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one annotated node on ``createPlot.ax1``.

    Args:
        nodeTxt: text shown inside the node's box.
        centerPt: (x, y) position of the text, in axes-fraction coordinates.
        parentPt: (x, y) point the arrow is attached to, in axes-fraction
            coordinates.
        nodeType: bbox style dict, e.g. ``decisionNode`` or ``leafNode``.
    """
    createPlot.ax1.annotate(
        nodeTxt,
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )
 * 1) 使用文本绘制树节点

def createPlot():
    """Draw a fixed two-node demo figure and show it.

    Clears figure 1, stores a frameless subplot on ``createPlot.ax1`` (the
    axes ``plotNode`` draws on), plots one decision node and one leaf node
    at hard-coded positions, then displays the figure.

    NOTE: a later definition of ``createPlot`` in this file takes a tree
    argument and rebinds the name once it is executed.
    """
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    createPlot.ax1 = plt.subplot(111, frameon=False)
    plotNode(U'决策节点', (0.5, 0.1), (0.1, 0.5), decisionNode)
    plotNode(U'叶节点', (0.8, 0.1), (0.3, 0.8), leafNode)
    plt.show()

def getNumLeafs(mytree):
    """Count the leaves of a nested-dict tree.

    Args:
        mytree: dict with a single top-level key whose value is a dict
            mapping branch values to either a sub-tree dict or a leaf value.

    Returns:
        The number of non-dict values reachable below the first key.
    """
    numLeaf = 0
    firstStr = list(mytree)[0]  # only the first key is followed
    secondDict = mytree[firstStr]
    for child in secondDict.values():
        if isinstance(child, dict):
            numLeaf += getNumLeafs(child)
        else:
            numLeaf += 1
    return numLeaf
 * 1) 构造注解树
 * 2) 获取叶子节点的数目和层数

def getTreeDepth(mytree):
    """Return the number of decision levels in a nested-dict tree.

    Args:
        mytree: dict with a single top-level key whose value is a dict
            mapping branch values to either a sub-tree dict or a leaf value.

    Returns:
        1 for a tree whose branches are all leaves, plus 1 for every extra
        level of nested dicts on the deepest path; 0 if there are no
        branches.
    """
    maxDepth = 0
    # list() is required: in Python 3 the keys of a dict are a view object
    # that behaves like a set and cannot be indexed (Python 2 returned a
    # list).
    firstStr = list(mytree)[0]
    secondDict = mytree[firstStr]
    for child in secondDict.values():
        if isinstance(child, dict):
            thisDepth = 1 + getTreeDepth(child)
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` halfway between ``cntrPt`` and ``parentPt``.

    Both points are (x, y) pairs; the text is drawn on ``createPlot.ax1``.
    """
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)


def plotTree(mytree, parentPt, nodeTxt):
    """Recursively draw ``mytree`` below ``parentPt``.

    Layout state lives on function attributes that ``createPlot(inTree)``
    initialises before the first call: ``plotTree.totalW`` and
    ``plotTree.totalD`` (leaf count and depth of the whole tree) and the
    running cursor ``plotTree.xoff`` / ``plotTree.yoff``.

    Args:
        mytree: nested-dict tree with a single top-level key.
        parentPt: (x, y) position of the parent node.
        nodeTxt: label written on the edge leading to this sub-tree.
    """
    numLeafs = getNumLeafs(mytree)
    firstStr = list(mytree)[0]
    # Centre this node horizontally above the leaves it spans.
    cntrPt = (
        plotTree.xoff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
        plotTree.yoff,
    )
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = mytree[firstStr]
    # Step one level down for the children ...
    plotTree.yoff = plotTree.yoff - 1.0 / plotTree.totalD
    for key in secondDict:
        if isinstance(secondDict[key], dict):
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # Each leaf advances the horizontal cursor by one slot.
            plotTree.xoff = plotTree.xoff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xoff, plotTree.yoff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xoff, plotTree.yoff), cntrPt, str(key))
    # ... and restore the level before returning to the caller.
    plotTree.yoff = plotTree.yoff + 1.0 / plotTree.totalD
 * 1) 在父子节点间填充文本信息
 * 1) 计算宽与高

def createPlot(inTree):
    """Draw the nested-dict tree ``inTree`` in figure 1 and show it.

    Clears the figure, stores a frameless, tick-less subplot on
    ``createPlot.ax1``, initialises the layout attributes ``plotTree`` reads
    (``totalW`` = leaf count, ``totalD`` = depth, ``xoff``, ``yoff``), draws
    the tree from the top centre (0.5, 1.0) and displays the figure.

    Args:
        inTree: nested-dict tree with a single top-level key.
    """
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf slot left of the axes so the first leaf is centred
    # in its slot once plotTree advances the cursor.
    plotTree.xoff = -0.5 / plotTree.totalW
    plotTree.yoff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()


 * 1) createPlot

def retrieveTree(i):
    """Return one of two hard-coded sample trees.

    Args:
        i: index of the sample tree (0 or 1).

    Returns:
        A freshly built nested-dict tree.

    Raises:
        IndexError: if ``i`` is outside the list of samples.
    """
    listOfTrees = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no',
                          1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}},
                                           1: 'no'}}}},
    ]
    return listOfTrees[i]

# Leaf count and depth of the first sample tree.
numLeaf = getNumLeafs(retrieveTree(0))
treeDepth = getTreeDepth(retrieveTree(0))


 * 1) print('树的深度为%d'%treeDepth)
 * 2) print('\n')
 * 3) print('树的叶节点个数为%d'%numLeaf)


 * 1) createPlot(retrieveTree(0))

def classify(inputTree, featLabels, testVec):
    """Classify ``testVec`` by walking the nested-dict decision tree.

    Args:
        inputTree: tree of the form ``{label: {value: subtree_or_class}}``.
        featLabels: list of feature names; the position of a name is the
            position of that feature's value in ``testVec``.
        testVec: sequence of feature values to classify.

    Returns:
        The class stored at the leaf that is reached, or ``None`` when the
        test value matches no branch of a node (the original raised
        UnboundLocalError in that case).

    Raises:
        ValueError: if a node's label is not in ``featLabels``.
    """
    firstStr = list(inputTree)[0]  # label of the root node
    secondDict = inputTree[firstStr]
    # Translate the label into the index of its value inside testVec.
    featIndex = featLabels.index(firstStr)
    classLabel = None
    for key in secondDict:
        if testVec[featIndex] == key:
            if isinstance(secondDict[key], dict):
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
            break
    return classLabel
 * 1) 决策树的分类函数

# Classify one sample vector with the first sample tree.
intree1 = retrieveTree(0)

labels1 = ['no surfacing', 'flippers']
classlabel = classify(intree1, labels1, [1, 0])
 * 1) print("分类结果为：",classlabel)

# Decision-tree persistence.
def storeTree(inputTree, filename):
    """Print ``inputTree`` and pickle it to ``filename``.

    The file is opened in binary write mode, so an existing file is
    replaced. (The original appended, which left older pickles in front of
    the new one, and stored ``str(inputTree)`` instead of the tree itself.)

    Args:
        inputTree: the tree object to store.
        filename: path of the file to write.
    """
    print(inputTree)
    # The context manager guarantees the file is flushed and closed.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: path of a file written by ``storeTree``.

    Returns:
        The first object unpickled from the file.
    """
    # SECURITY: pickle.load can execute arbitrary code while loading; only
    # open files that come from a trusted source.
    # Pickle data is binary, so the file must be opened with 'rb'.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
 * 1) 使用pickle模块实现树的存储


 * 1) storeTree(intree1,'a.txt')#存储的是二进制格式（头部加上了€�XA显示乱码  ）


 * 1) filename='daikuan.txt'
 * 2) dataMat=loadDataSet(filename)
 * 3) labels = ["年龄","有工作", "有自己的房子","信贷情况"]
 * 4) # print(dataMat)
 * 5) # print('\n')
 * 6) bestFeature = chooseBestFeatureToSplit(dataMat)
 * 7) print('\n')
 * 8) mytree1= createTree(dataMat,labels)
 * 9) print("mytree1=",mytree1)
 * 10) numLeaf = getNumLeafs(mytree1)
 * 11) treeDepth = getTreeDepth(mytree1)
 * 12) createPlot(mytree1)


 * 1) lenses_feature = loadDataSet('lenses.txt')
 * 2) #print(lenses_feature)
 * 3) lenses_label = ['age','prescript','astigmatic','tearRate']
 * 4) lenses= createTree(lenses_feature,lenses_label)
 * 5) print(lenses)
 * 6) print(lenses)
 * 7) createPlot(lenses)

# Build, print and plot a tree from the data in play.txt.
weather_feature1 = loadDataSet("play.txt")
weather_label1 = ['outlook', 'tempetature', 'humidity', 'windy', 'playtennis']
weather1 = createTree(weather_feature1, weather_label1)
print(weather1)
# print(weather_feature1 )

createPlot(weather1)