赞
踩
DTree.py 实现了 ID3、C4.5 两种建树算法;CART 部分只实现了基尼(Gini)指数的计算,没有实现建树——建树步骤与前两者是一样的,因为我进度拖得比较慢就没有写,基本上可以直接套用。建议你们也先手工计算一遍,我就是先手算过一遍之后再写的代码。
- # -*- coding: utf-8 -*-
- import math
# Algorithm selector flags, passed to DTree.ID3CreateTree as `ICflag`.
C45_Flag = True   # C4.5: split on information-gain ratio
ID3_Flag = False  # ID3: split on information gain
-
class DtreeStruct:
    """A node of the ID3/C4.5 decision tree.

    Attributes:
        next_nodelist: list of child nodes, or None for a leaf.
        Ai: name of the feature this node was reached through.
        Aival: the value of that feature leading to this node.
        value: the data subset that reached this node.
        ck: predicted class label (set on leaves).
    """

    def __init__(self, next_nodelist=None, Ai=None, Aivalue=None, ck=None, value=None):
        self.next_nodelist = next_nodelist
        self.Ai = Ai
        self.Aival = Aivalue
        self.value = value
        self.ck = ck

    def Print(self):
        """Recursively print the tree rooted at this node (pre-order)."""

        def fprint(node):
            # `is None` instead of `== None` (identity check for None).
            if node.next_nodelist is None:
                print("叶节点:", node.Ai, node.Aival, node.value, node.ck)
                return
            print("节点", node.next_nodelist, node.Ai, node.Aival, node.value, node.ck)
            # Renamed loop variable: the original shadowed `node` itself.
            for child in node.next_nodelist:
                fprint(child)

        print("根节点", self.next_nodelist, self.Ai, self.Aival, self.value, self.ck)
        # Fix: the original iterated None (TypeError) when the root is a leaf.
        if self.next_nodelist is not None:
            for child in self.next_nodelist:
                fprint(child)
-
class DTree:
    """ID3/C4.5 decision-tree builder plus CART Gini-index computation.

    Each row of `datasets` is a list whose last element is the class label;
    `labels` names every column, the last entry being the label column.
    """

    def __init__(self, datasets, labels):
        self.tree = None          # root DtreeStruct, filled by ID3CreateTree
        self.datasets = datasets
        self.labels = labels
        self.GetAandC()

    def GetAandC(self):
        """Collect the class-value set (self.C) and each feature's value set (self.A)."""
        self.C = {self.labels[-1]: set(row[-1] for row in self.datasets)}
        self.A = {self.labels[i]: set(row[i] for row in self.datasets)
                  for i in range(len(self.labels) - 1)}

    def ID3CreateTree(self, epsilon, ICflag):
        """Build self.tree with ID3 (ICflag falsy, i.e. ID3_Flag) or C4.5
        (ICflag truthy, i.e. C45_Flag).

        epsilon: stop splitting when the best gain (ratio) falls below it.
        """

        def emp_entropy(data, label):
            # Empirical entropy H(D) of column `label` over `data`.
            counts = {}
            col = self.labels.index(label)
            for row in data:
                counts[row[col]] = counts.get(row[col], 0) + 1
            n = len(data)
            return -sum((c / n) * math.log(c / n, 2) for c in counts.values())

        def emp_cdtl_entropy(data, Ai):
            # Empirical conditional entropy H(D|Ai).
            c_label = self.labels[-1]
            col = self.labels.index(Ai)
            subsets = {}
            for row in data:
                subsets.setdefault(row[col], []).append(row)
            n = len(data)
            return sum((len(sub) / n) * emp_entropy(sub, c_label)
                       for sub in subsets.values())

        def infgain(data, A):
            # Information gain g(D, Ai) = H(D) - H(D|Ai) for every feature in A.
            hd = emp_entropy(data, self.labels[-1])
            return [hd - emp_cdtl_entropy(data, Ai) for Ai in A]

        def infgainrate(data, A):
            # Gain ratio gR(D, Ai) = g(D, Ai) / H_Ai(D).  A single-valued
            # feature has H_Ai(D) == 0: score it 0 instead of raising
            # ZeroDivisionError (it carries no information anyway).
            hd = emp_entropy(data, self.labels[-1])
            rates = []
            for Ai in A:
                ha = emp_entropy(data, Ai)
                rates.append((hd - emp_cdtl_entropy(data, Ai)) / ha if ha else 0.0)
            return rates

        def majority_class(data):
            # Most frequent class label in `data`.
            cats = [row[-1] for row in data]
            return max(set(cats), key=cats.count)

        def best_feature(scores):
            # (max score, index of max score).
            best_i = max(range(len(scores)), key=scores.__getitem__)
            return scores[best_i], best_i

        def build(node, data, A):
            # 1. single-class subset -> leaf
            if len(set(row[-1] for row in data)) == 1:
                node.value = data
                node.ck = data[0][-1]
                return
            # 2. no features left -> majority-class leaf
            if not A:
                node.value = data
                node.ck = majority_class(data)
                return
            # 3. score every remaining feature (one unified path replaces the
            #    two duplicated ID3 / C4.5 branches of the original)
            scores = infgainrate(data, A) if ICflag else infgain(data, A)
            best, ag_i = best_feature(scores)
            # 4. below threshold -> majority-class leaf.  (The original
            #    referenced an undefined `C_key` here: NameError when reached.)
            if best < epsilon:
                node.value = data
                node.ck = majority_class(data)
                return
            # 5. split on the winning feature.  Index rows by the feature's
            #    position in self.labels: the original used its position in
            #    the shrinking feature list, which points at the wrong data
            #    column once any feature has been removed.
            ag = A[ag_i]
            col = self.labels.index(ag)
            splits = {}
            for row in data:
                splits.setdefault(row[col], []).append(row)
            # 6. recurse with a fresh A-minus-ag list per subtree.  The
            #    original popped from the shared list, so deeper recursions
            #    permanently stole features from sibling subtrees.
            remaining = [a for a in A if a != ag]
            node.next_nodelist = []
            for val, subset in splits.items():
                child = DtreeStruct(Ai=ag, Aivalue=val)
                node.next_nodelist.append(child)
                build(child, subset, remaining)

        self.tree = DtreeStruct()
        build(self.tree, self.datasets, list(self.A.keys()))

    # CART algorithm (Gini computation only; tree growing not implemented)
    def CART(self):
        """Print and return the weighted Gini index of every (feature, value)
        binary split, one list per feature in self.A's column order.

        The original printed the same list but returned None; returning it is
        backward-compatible and makes the result usable.
        """

        def gini(data):
            # Gini(D) = 1 - sum_k (|C_k| / |D|)^2 over class labels.
            counts = {}
            for row in data:
                counts[row[-1]] = counts.get(row[-1], 0) + 1
            n = len(data)
            return 1 - sum((c / n) ** 2 for c in counts.values())

        def conditional_gini(data, Ai):
            # Gini(D, Ai=a) for each value a: weighted Gini of the
            # {Ai == a} vs {Ai != a} two-way partition.
            col = self.labels.index(Ai)
            n = len(data)
            out = []
            for aival in self.A[Ai]:
                part = [row for row in data if row[col] == aival]
                rest = [row for row in data if row[col] != aival]
                out.append(len(part) / n * gini(part)
                           + len(rest) / n * gini(rest))
            return out

        result = [conditional_gini(self.datasets, Ai) for Ai in self.A]
        print(result)
        return result

main.py
- # -*- coding: utf-8 -*-
-
- import pandas as pd
- import DTree
-
def create_data():
    """Return the loan-application toy dataset.

    Returns:
        (datasets, labels): 15 rows, each ending with the class label,
        and the names of the five columns (last one is the label column).
    """
    labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']
    datasets = [
        ['青年', '否', '否', '一般', '否'],
        ['青年', '否', '否', '好', '否'],
        ['青年', '是', '否', '好', '是'],
        ['青年', '是', '是', '一般', '是'],
        ['青年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '一般', '否'],
        ['中年', '否', '否', '好', '否'],
        ['中年', '是', '是', '好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['中年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '非常好', '是'],
        ['老年', '否', '是', '好', '是'],
        ['老年', '是', '否', '好', '是'],
        ['老年', '是', '否', '非常好', '是'],
        ['老年', '否', '否', '一般', '否'],
    ]
    return datasets, labels
-
def main():
    """Build an ID3 tree (threshold 0.1) on the demo data and print it."""
    data, names = create_data()
    model = DTree.DTree(data, names)
    model.ID3CreateTree(0.1, DTree.ID3_Flag)
    model.tree.Print()
-
-
# Entry-point guard: run the demo only when this file is executed directly.
if __name__ == '__main__':
    main()

这道题我使用的是书上说的最小二乘法,直接上代码(程序有点混乱,精神不太好):
- # -*- coding: utf-8 -*-
-
class Ctree:
    """Binary regression-tree node.

    spvalue is the split value that created this node; value holds the data
    at the node; lnode/rnode are the children (None for a leaf).
    """

    def __init__(self, spvalue=None):
        self.spvalue = spvalue
        self.value = None
        self.lnode = None
        self.rnode = None
-
class CART:
    """Least-squares regression tree over a 1-D sorted sample.

    Fixes over the original:
    - split candidates enumerated data[1:-2] but reused the slice index on
      the full list, so the recorded index never matched the tested value
      (and the first/last valid cuts were never considered);
    - the squared loss was measured against the split value itself; the
      least-squares criterion uses each region's mean (c1 = avg(R1),
      c2 = avg(R2)).
    """

    def __init__(self, data):
        self.data = sorted(data)
        self.tree = None

    def SqartLost(self):
        """Build self.tree by recursive least-squares binary splitting."""

        def squared_error(vals):
            # Sum of squared deviations from the subset mean.
            if not vals:
                return 0.0
            mean = sum(vals) / len(vals)
            return sum((v - mean) ** 2 for v in vals)

        def best_split(vals):
            # Choose k minimising SE(vals[:k+1]) + SE(vals[k+1:]); every cut
            # between adjacent elements is a candidate.
            best_loss = float('inf')
            best_k = 0
            for k in range(len(vals) - 1):
                loss = squared_error(vals[:k + 1]) + squared_error(vals[k + 1:])
                if loss < best_loss:
                    best_loss = loss
                    best_k = k
            return best_k

        def grow(node, vals):
            # Leaf: a single sample.  Both halves of any split are non-empty,
            # so the recursion always terminates here.
            if len(vals) == 1:
                node.value = vals
                return
            k = best_split(vals)
            node.value = vals
            node.lnode = Ctree(vals[k])
            grow(node.lnode, vals[:k + 1])
            node.rnode = Ctree(vals[k])
            grow(node.rnode, vals[k + 1:])

        self.tree = Ctree()
        grow(self.tree, self.data)

    def Print(self):
        """Pre-order print of the tree (left subtree before right)."""

        def fprint(node):
            if node.lnode is None:
                print("叶节点:", node.value)
                return
            print("节点:", node.spvalue, node.value)
            fprint(node.lnode)
            fprint(node.rnode)

        print("根节点")
        fprint(self.tree)
-
def main():
    """Fit the least-squares regression tree to the exercise sample and print it."""
    sample = [4.50, 4.75, 4.91, 5.34, 5.80, 7.05, 7.90, 8.23, 8.70, 9.00]
    model = CART(sample)
    model.SqartLost()
    model.Print()
-
-
# Entry-point guard: run the demo only when this file is executed directly.
if __name__ == '__main__':
    main()

证明题就不证了
5.3 证明CART剪枝算法中,当 $\alpha$ 确定时,存在唯一的最小子树 $T_\alpha$ 使损失函数 $C_\alpha(T)$ 最小。
5.4 证明CART剪枝算法中求出的子树序列 $\{T_0, T_1, \dots, T_n\}$ 分别是区间 $\alpha \in [\alpha_i, \alpha_{i+1})$ 的最优子树 $T_\alpha$,这里 $i = 0, 1, \dots, n$,$0 = \alpha_0 < \alpha_1 < \cdots < \alpha_n < +\infty$。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。