
Machine Learning: Classification Decision Trees (Python)

1. Computing the Various Entropy Measures

entropy_utils.py

import numpy as np  # array / numerical computation
import math  # scalar math


class EntropyUtils:
    """
    Entropy-related computations for decision trees: information entropy,
    information gain, information gain ratio and Gini index.
    Convention: the best split maximizes the information gain, the information
    gain ratio or the Gini index gain, respectively.
    """

    @staticmethod
    def _set_sample_weight(sample_weight, n_samples):
        """
        Sample-weight handling (kept here so the class extends to ensemble learning).
        :param sample_weight: weight of each sample
        :param n_samples: number of samples
        :return:
        """
        if sample_weight is None:
            sample_weight = np.asarray([1.0] * n_samples)
        return sample_weight

    def cal_info_entropy(self, y_labels, sample_weight=None):
        """
        Information entropy of a sample set.
        :param y_labels: class labels (or feature values) of the current recursive subset
        :param sample_weight: weight of each sample
        :return:
        """
        y = np.asarray(y_labels)
        sample_weight = self._set_sample_weight(sample_weight, len(y))
        y_values = np.unique(y)  # distinct class values in the sample
        ent_y = 0.0
        for val in y_values:
            p_i = len(y[y == val]) * np.mean(sample_weight[y == val]) / len(y)
            ent_y += -p_i * math.log2(p_i)
        return ent_y

    def conditional_entropy(self, feature_x, y_labels, sample_weight=None):
        """
        Conditional entropy: entropy of the labels given one feature.
        :param feature_x: one feature column of the samples
        :param y_labels: class labels of the current recursive subset
        :param sample_weight: weight of each sample
        :return:
        """
        x, y = np.asarray(feature_x), np.asarray(y_labels)
        sample_weight = self._set_sample_weight(sample_weight, len(y))
        cond_ent = 0.0
        for x_val in np.unique(x):
            x_idx = np.where(x == x_val)  # indices of the samples taking this feature value
            sub_x, sub_y = x[x_idx], y[x_idx]
            sub_sample_weight = sample_weight[x_idx]
            p_k = len(sub_y) / len(y)
            cond_ent += p_k * self.cal_info_entropy(sub_y, sub_sample_weight)
        return cond_ent

    def info_gain(self, feature_x, y_labels, sample_weight=None):
        """
        Information gain: H(Y) - H(Y | X).
        """
        return self.cal_info_entropy(y_labels, sample_weight) - \
            self.conditional_entropy(feature_x, y_labels, sample_weight)

    def info_gain_rate(self, feature_x, y_labels, sample_weight=None):
        """
        Information gain ratio: information gain divided by H(X).
        """
        return self.info_gain(feature_x, y_labels, sample_weight) / \
            self.cal_info_entropy(feature_x, sample_weight)

    def cal_gini(self, y_label, sample_weight=None):
        """
        Gini value of the current class-label (or feature-value) set.
        :param y_label: class labels (or feature values) of the current recursive subset
        :param sample_weight: weight of each sample
        :return:
        """
        y = np.asarray(y_label)
        sample_weight = self._set_sample_weight(sample_weight, len(y))
        y_values = np.unique(y)
        gini_val = 1.0
        for val in y_values:
            p_k = len(y[y == val]) * np.mean(sample_weight[y == val]) / len(y)
            gini_val -= p_k ** 2
        return gini_val

    def conditional_gini(self, feature_x, y_labels, sample_weight=None):
        """
        Conditional Gini index of the labels given one feature.
        """
        x, y = np.asarray(feature_x), np.asarray(y_labels)
        sample_weight = self._set_sample_weight(sample_weight, len(y))
        cond_gini = 0.0
        for x_val in np.unique(x):
            x_idx = np.where(x == x_val)  # indices of the samples taking this feature value
            sub_x, sub_y = x[x_idx], y[x_idx]
            sub_sample_weight = sample_weight[x_idx]
            p_k = len(sub_y) / len(y)
            cond_gini += p_k * self.cal_gini(sub_y, sub_sample_weight)
        return cond_gini

    def gini_gain(self, feature_x, y_labels, sample_weight=None):
        """
        Gini index gain: Gini(Y) - Gini(Y | X).
        """
        return self.cal_gini(y_labels, sample_weight) - \
            self.conditional_gini(feature_x, y_labels, sample_weight)


# if __name__ == '__main__':
#     y = np.random.randint(0, 2, 50)
#     entropy = EntropyUtils()
#     ent = entropy.cal_info_entropy(y)
#     print(ent)
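
As a quick sanity check, here is a minimal usage sketch (the flat import path is an assumption; the test scripts below import from utils.entropy_utils) that evaluates the measures on a toy label set where one feature separates the classes perfectly and another carries almost no information:

import numpy as np
from entropy_utils import EntropyUtils  # adjust to your project layout

ent = EntropyUtils()
x_good = np.array([0, 0, 0, 1, 1, 1])   # feature that perfectly separates the labels
x_noisy = np.array([0, 1, 0, 1, 0, 1])  # feature that is nearly independent of the labels
y = np.array([0, 0, 0, 1, 1, 1])

print(ent.cal_info_entropy(y))    # 1.0 for a balanced binary label set
print(ent.info_gain(x_good, y))   # 1.0: the feature removes all uncertainty
print(ent.info_gain(x_noisy, y))  # ~0.08: the feature is almost uninformative
print(ent.gini_gain(x_good, y))   # 0.5: Gini(y) = 0.5 and the conditional Gini is 0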

2. Discretizing (Binning) Continuous Features

data_bin_wrapper.py

import numpy as np


class DataBinsWrapper:
    """
    Discretization (binning) of continuous features. Given max_bins, the
    quantiles of each feature are computed and used as bin edges; each
    feature value is then replaced by the index of the bin it falls into.
    1. fit(x): compute the bin edges from the samples
    2. transform(x): map the data into the max_bins bins using the fitted edges
    """

    def __init__(self, max_bins=10):
        self.max_bins = max_bins  # number of bins: edges at 10%, 20%, ..., 90%
        self.XrangeMap = None  # bin edges (interval boundaries) per feature

    def fit(self, x_samples):
        """
        Compute the bin edges from the samples.
        :param x_samples: samples (2-d array n * k) or a single feature column (n * 1)
        :return:
        """
        if x_samples.ndim == 1:  # a single feature column, promote to 2-d
            n_features = 1
            x_samples = x_samples[:, np.newaxis]  # add an axis to make it 2-d
        else:
            n_features = x_samples.shape[1]
        # build the bin edges (interval boundaries)
        self.XrangeMap = [[] for _ in range(n_features)]
        for idx in range(n_features):
            x_sorted = sorted(x_samples[:, idx])  # values of this feature, ascending
            for bin in range(1, self.max_bins):
                p = (bin / self.max_bins) * 100 // 1
                p_val = np.percentile(x_sorted, p)
                self.XrangeMap[idx].append(p_val)
            self.XrangeMap[idx] = sorted(list(set(self.XrangeMap[idx])))

    def transform(self, x_samples, XrangeMap=None):
        """
        Map the data into the max_bins bins using existing bin edges.
        :param x_samples: samples (2-d array n * k) or a single feature column (n * 1)
        :return:
        """
        if x_samples.ndim == 1:
            if XrangeMap is not None:
                return np.asarray(np.digitize(x_samples, XrangeMap[0])).reshape(-1)
            else:
                return np.asarray(np.digitize(x_samples, self.XrangeMap[0])).reshape(-1)
        else:
            return np.asarray([np.digitize(x_samples[:, i], self.XrangeMap[i])
                               for i in range(x_samples.shape[1])]).T


# if __name__ == '__main__':
#     x = np.random.randn(10, 5)
#     print(x)
#     dbw = DataBinsWrapper(max_bins=5)
#     dbw.fit(x)
#     print(dbw.XrangeMap)
#     print(dbw.transform(x))
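
A minimal usage sketch of the binning wrapper (the flat import path is an assumption). The key point is that fit learns the quantile edges on the training data and transform reuses them on new data:

import numpy as np
from data_bin_wrapper import DataBinsWrapper  # adjust to your project layout

np.random.seed(0)
x_train = np.random.randn(100, 2)  # two continuous features
x_test = np.random.randn(5, 2)

dbw = DataBinsWrapper(max_bins=5)
dbw.fit(x_train)                   # edges at the 20%, 40%, 60%, 80% percentiles of each feature
print(dbw.XrangeMap)               # one sorted list of edges per feature
print(dbw.transform(x_train)[:5])  # each value replaced by its bin index 0..4
print(dbw.transform(x_test))       # new data is digitized with the edges fitted above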

3. Visualizing the Decision Boundary

plt_decision_function.py

import matplotlib.pyplot as plt
import numpy as np


def plot_decision_function(X, y, clf, acc=None, title_info=None, is_show=True, support_vectors=None):
    """
    Visualize the decision boundary of a classifier.
    :param X, y: test samples and their labels
    :param clf: fitted classification model
    :param acc: classification accuracy of the model
    :param title_info: extra information appended to the plot title
    :param is_show: whether to show the figure here (False when the caller draws subplots)
    :param support_vectors: support-vector indices, for SVM models
    :return:
    """
    if is_show:
        plt.figure(figsize=(7, 5))
    # build a 2-d grid from the min/max of the two features, used for the contour plot
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xi, yi = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))
    y_pred = clf.predict(np.c_[xi.ravel(), yi.ravel()])  # model predictions on the grid
    y_pred = y_pred.reshape(xi.shape)
    plt.contourf(xi, yi, y_pred, alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolors="k")
    plt.xlabel("Feature 1", fontdict={"fontsize": 12})
    plt.ylabel("Feature 2", fontdict={"fontsize": 12})
    if acc:
        if title_info:
            plt.title("Model Classification Boundary %s \n(accuracy = %.5f)"
                      % (title_info, acc), fontdict={"fontsize": 14})
        else:
            plt.title("Model Classification Boundary (accuracy = %.5f)"
                      % acc, fontdict={"fontsize": 14})
    else:
        if title_info:
            plt.title("Model Classification Boundary %s"
                      % title_info, fontdict={"fontsize": 14})
        else:
            plt.title("Model Classification Boundary", fontdict={"fontsize": 14})
    if support_vectors is not None:  # highlight the support vectors, for SVM models
        plt.scatter(X[support_vectors, 0], X[support_vectors, 1],
                    s=50, c="None", alpha=0.7, edgecolors="red")
    if is_show:
        plt.show()
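
A hedged usage sketch of the plotting helper; scikit-learn's DecisionTreeClassifier is used here only as a stand-in model, and the flat import path is an assumption (any classifier with a predict method over two features works):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier  # stand-in model for this demo only
from sklearn.metrics import accuracy_score
from plt_decision_function import plot_decision_function  # adjust to your project layout

X, y = make_classification(n_samples=200, n_features=2, n_informative=2,
                           n_redundant=0, random_state=0)
clf = DecisionTreeClassifier(max_depth=3).fit(X, y)
acc = accuracy_score(y, clf.predict(X))
plot_decision_function(X, y, clf, acc=acc, title_info="(sklearn demo)")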

4. Testing the Entropy Computations

test_entropy.py

import numpy as np
import pandas as pd
from utils.entropy_utils import EntropyUtils
from utils.data_bin_wrapper import DataBinsWrapper

data = pd.read_csv("data/watermelon.csv").iloc[:, 1:]
feat_names = data.columns[:6]
y = data.iloc[:, -1]
ent_obj = EntropyUtils()

print("Information gain of each feature:")
for feat in feat_names:
    print(feat, ":", ent_obj.info_gain(data.loc[:, feat], y))
print("=" * 60)

print("Information gain ratio of each feature:")
for feat in feat_names:
    print(feat, ":", ent_obj.info_gain_rate(data.loc[:, feat], y))
print("=" * 60)

print("Gini index gain of each feature:")
for feat in feat_names:
    print(feat, ":", ent_obj.gini_gain(data.loc[:, feat], y))
print("=" * 60)

x1 = np.asarray(data.loc[:, ["密度", "含糖率"]])
print(x1)
dbw = DataBinsWrapper(max_bins=8)
dbw.fit(x1)
print(dbw.transform(x1))

5. Encapsulating the Tree Node Information

tree_node.py

class TreeNode_C:
    """
    Node of the classification decision tree: a plain entity class that holds the node information.
    """

    def __init__(self, feature_idx: int = None, feature_val=None, criterion_val: float = None,
                 n_samples: int = None, target_dist: dict = None, weight_dist: dict = None,
                 left_child_Node=None, right_child_Node=None):
        """
        Node information of the decision tree.
        :param feature_idx: feature index; if feature names are given, they can be looked up by this index
        :param feature_val: feature value used for the split
        :param criterion_val: split criterion value: information gain (ratio) or Gini index gain
        :param n_samples: number of samples in this node
        :param target_dist: class distribution of this node, e.g. {0: 0.25, 1: 0.5, 2: 0.25}
        :param weight_dist: sample-weight distribution of this node
        :param left_child_Node: left subtree
        :param right_child_Node: right subtree
        """
        self.feature_idx = feature_idx
        self.feature_val = feature_val
        self.criterion_val = criterion_val
        self.n_samples = n_samples
        self.target_dist = target_dist
        self.weight_dist = weight_dist
        self.left_child_Node = left_child_Node  # recursive structure
        self.right_child_Node = right_child_Node  # recursive structure

    def level_order(self):
        """
        Level-order (breadth-first) traversal of the tree...
        :return:
        """
        pass

    # def get_feature_idx(self):
    #     return self.feature_idx
    #
    # def set_feature_idx(self, feature_idx):
    #     self.feature_idx = feature_idx
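
level_order is left as a stub above. A minimal queue-based sketch of what the method could look like follows; the per-node tuple it collects is an assumption for illustration, not part of the original design:

from collections import deque

def level_order(self):
    """Breadth-first traversal; collects (feature_idx, feature_val, n_samples) per node."""
    result, queue = [], deque([self])
    while queue:
        node = queue.popleft()
        result.append((node.feature_idx, node.feature_val, node.n_samples))
        if node.left_child_Node is not None:
            queue.append(node.left_child_Node)
        if node.right_child_Node is not None:
            queue.append(node.right_child_Node)
    return result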

6. Implementing the Classification Decision Tree

decision_tree_C.py

import numpy as np
from utils.entropy_utils import EntropyUtils
from utils.tree_node import TreeNode_C
from utils.data_bin_wrapper import DataBinsWrapper


class DecisionTreeClassifier:
    """
    Classification decision tree. Whether ID3, C4.5 or CART is chosen, the tree is always built as a binary tree.
    1. Split criterion: information gain (ratio) or Gini index gain; the feature value with the largest gain is chosen
    2. fit(): recursive tree construction, with care for the stopping conditions
    3. predict_proba() / predict(): search the tree for each test sample
    4. Data preprocessing, in particular discretization (binning) of continuous features
    5. Post-pruning
    """

    def __init__(self, criterion="CART", is_feature_all_R=False, dbw_feature_idx=None,
                 max_depth=None, min_sample_split=2, min_sample_leaf=1,
                 min_impurity_decrease=0, max_bins=10):
        self.utils = EntropyUtils()  # split-criterion computations
        self.criterion = criterion  # node split criterion
        if criterion.lower() == "cart":
            self.criterion_func = self.utils.gini_gain  # Gini index gain
        elif criterion.lower() == "c45":
            self.criterion_func = self.utils.info_gain_rate  # information gain ratio
        elif criterion.lower() == "id3":
            self.criterion_func = self.utils.info_gain  # information gain
        else:
            raise ValueError("criterion must be one of cart, c45 or id3...")
        self.is_feature_all_R = is_feature_all_R  # whether all features are continuous
        self.dbw_feature_idx = dbw_feature_idx  # for mixed data, indices of the continuous features
        self.max_depth = max_depth  # maximum tree depth; if None, the tree grows until other conditions stop it
        self.min_sample_split = min_sample_split  # minimum number of samples required to split a node
        self.min_sample_leaf = min_sample_leaf  # minimum number of samples required in a leaf node
        self.min_impurity_decrease = min_impurity_decrease  # minimum impurity decrease required to split
        self.max_bins = max_bins  # number of bins for continuous features; larger means finer splits
        self.root_node: TreeNode_C = None  # root node of the classification decision tree
        self.dbw = DataBinsWrapper(max_bins=max_bins)  # discretization helper for continuous data
        self.dbw_XrangeMap = {}  # bin edges of the continuous features, fitted on the training samples
        self.class_values = None  # distinct class values of the samples

    def _data_bin_wrapper(self, x_samples):
        """
        Bin each continuous feature column listed in dbw_feature_idx.
        Test samples reuse the XrangeMap fitted on the training samples.
        :param x_samples: samples, either training or test
        :return:
        """
        self.dbw_feature_idx = np.asarray(self.dbw_feature_idx)
        x_samples_prop = []  # binned data
        if not self.dbw_XrangeMap:
            # empty, i.e. the binning performed before building the tree (training samples)
            for i in range(x_samples.shape[1]):
                if i in self.dbw_feature_idx:  # this feature is continuous
                    self.dbw.fit(x_samples[:, i])
                    self.dbw_XrangeMap[i] = self.dbw.XrangeMap
                    x_samples_prop.append(self.dbw.transform(x_samples[:, i]))
                else:
                    x_samples_prop.append(x_samples[:, i])
        else:  # binning of the test samples
            for i in range(x_samples.shape[1]):
                if i in self.dbw_feature_idx:  # this feature is continuous
                    x_samples_prop.append(self.dbw.transform(x_samples[:, i], self.dbw_XrangeMap[i]))
                else:
                    x_samples_prop.append(x_samples[:, i])
        return np.asarray(x_samples_prop).T

    def fit(self, x_train, y_train, sample_weight=None):
        """
        Build the decision tree: necessary preprocessing, then the recursive construction.
        :param x_train: training samples, ndarray, n * k
        :param y_train: target values, ndarray, (n, )
        :param sample_weight: weight of each sample, (n, )
        :return:
        """
        x_train, y_train = np.asarray(x_train), np.asarray(y_train)
        self.class_values = np.unique(y_train)  # distinct class values
        n_samples, n_features = x_train.shape  # numbers of training samples and features
        if sample_weight is None:
            sample_weight = np.asarray([1.0] * n_samples)
        self.root_node = TreeNode_C()  # create an empty tree
        if self.is_feature_all_R:  # all features are continuous
            self.dbw.fit(x_train)
            x_train = self.dbw.transform(x_train)
        elif self.dbw_feature_idx is not None:
            x_train = self._data_bin_wrapper(x_train)
        self._build_tree(1, self.root_node, x_train, y_train, sample_weight)

    def _build_tree(self, cur_depth, cur_node: TreeNode_C, x_train, y_train, sample_weight):
        """
        Core recursive tree-building routine (pre-order construction).
        :param cur_depth: current depth of the recursion
        :param cur_node: current subtree root node
        :param x_train: training samples of the current subset
        :param y_train: target values of the current subset
        :param sample_weight: sample weights of the current subset
        :return:
        """
        n_samples, n_features = x_train.shape  # sample and feature counts of the current subset
        target_dist, weight_dist = {}, {}  # class and weight distributions, e.g. {0: 0.3, 1: 0.7}
        class_labels = np.unique(y_train)  # distinct class values
        for label in class_labels:
            target_dist[label] = len(y_train[y_train == label]) / n_samples
            weight_dist[label] = np.mean(sample_weight[y_train == label])
        cur_node.target_dist = target_dist
        cur_node.weight_dist = weight_dist
        cur_node.n_samples = n_samples
        # stopping conditions
        if len(target_dist) <= 1:  # all samples belong to one class (exit 1);
            # if it is 0, the current subset is empty (exit 3)
            return
        if n_samples < self.min_sample_split:  # too few samples to split this node
            return
        if self.max_depth is not None and cur_depth > self.max_depth:  # maximum depth reached
            return
        # choose the best split feature and its value according to the criterion
        best_idx, best_val, best_criterion_val = None, None, 0.0
        for k in range(n_features):  # evaluate the criterion for every feature of the current subset
            for f_val in np.unique(x_train[:, k]):  # every distinct value of this feature
                feat_k_values = (x_train[:, k] == f_val).astype(int)  # 1 if the sample takes value f_val, else 0
                criterion_val = self.criterion_func(feat_k_values, y_train, sample_weight)
                if criterion_val > best_criterion_val:
                    best_criterion_val = criterion_val  # best criterion value so far
                    best_idx, best_val = k, f_val  # best feature index and value so far
        # further stopping conditions
        if best_idx is None:  # no usable feature, or all samples agree on all features
            return
        if best_criterion_val <= self.min_impurity_decrease:  # impurity decrease too small to split
            return
        cur_node.criterion_val = best_criterion_val
        cur_node.feature_idx = best_idx
        cur_node.feature_val = best_val
        # print("split feature index:", best_idx, "value:", best_val, "criterion value:", best_criterion_val)
        # print("class distribution of the current node:", target_dist)
        # build the left subtree: samples whose best feature equals the split value
        left_idx = np.where(x_train[:, best_idx] == best_val)  # sample indices of the left subtree
        if len(left_idx[0]) >= self.min_sample_leaf:  # otherwise the current node stays a leaf
            left_child_node = TreeNode_C()  # empty left child
            cur_node.left_child_Node = left_child_node
            self._build_tree(cur_depth + 1, left_child_node, x_train[left_idx],
                             y_train[left_idx], sample_weight[left_idx])
        right_idx = np.where(x_train[:, best_idx] != best_val)  # sample indices of the right subtree
        if len(right_idx[0]) >= self.min_sample_leaf:  # otherwise the current node stays a leaf
            right_child_node = TreeNode_C()  # empty right child
            cur_node.right_child_Node = right_child_node
            self._build_tree(cur_depth + 1, right_child_node, x_train[right_idx],
                             y_train[right_idx], sample_weight[right_idx])

    def _search_tree_predict(self, cur_node: TreeNode_C, x_test):
        """
        Follow the path from the root to a leaf for one test sample and return its class probabilities.
        :param x_test: a single test sample
        :return:
        """
        if cur_node.left_child_Node and x_test[cur_node.feature_idx] == cur_node.feature_val:
            return self._search_tree_predict(cur_node.left_child_Node, x_test)
        elif cur_node.right_child_Node and x_test[cur_node.feature_idx] != cur_node.feature_val:
            return self._search_tree_predict(cur_node.right_child_Node, x_test)
        else:
            # a leaf node, carrying the class distribution
            class_p = np.zeros(len(self.class_values))  # class probabilities of the test sample
            for i, c in enumerate(self.class_values):
                class_p[i] = cur_node.target_dist.get(c, 0) * cur_node.weight_dist.get(c, 1.0)
            class_p = class_p / np.sum(class_p)  # normalize
            return class_p

    def predict_proba(self, x_test):
        """
        Predict the class probabilities of the test samples.
        :param x_test: test samples, ndarray
        :return:
        """
        x_test = np.asarray(x_test)  # guard against DataFrame, list, ...
        if self.is_feature_all_R:
            if self.dbw.XrangeMap is not None:
                x_test = self.dbw.transform(x_test)
            else:
                raise ValueError("fit the decision tree first...")
        elif self.dbw_feature_idx is not None:
            x_test = self._data_bin_wrapper(x_test)
        prob_dist = []  # class probability distribution of each test sample
        for i in range(x_test.shape[0]):
            prob_dist.append(self._search_tree_predict(self.root_node, x_test[i]))
        return np.asarray(prob_dist)

    def predict(self, x_test):
        """
        Predict the class labels of the test samples.
        :param x_test: test samples
        :return:
        """
        x_test = np.asarray(x_test)  # guard against DataFrame, list, ...
        return np.argmax(self.predict_proba(x_test), axis=1)

    def _prune_node(self, cur_node: TreeNode_C, alpha):
        """
        Recursive pruning of the internal nodes, bottom-up (post-order traversal).
        :param cur_node: current internal node of the recursion
        :param alpha: pruning threshold
        :return:
        """
        # recurse into the left subtree if it exists
        if cur_node.left_child_Node:
            self._prune_node(cur_node.left_child_Node, alpha)
        # recurse into the right subtree if it exists
        if cur_node.right_child_Node:
            self._prune_node(cur_node.right_child_Node, alpha)
        # prune only internal (non-leaf) nodes
        if cur_node.left_child_Node is not None or cur_node.right_child_Node is not None:
            for child_node in [cur_node.left_child_Node, cur_node.right_child_Node]:
                if child_node is None:
                    # one child may be missing when its sample subset was smaller than min_sample_leaf
                    continue
                if child_node.left_child_Node is not None or child_node.right_child_Node is not None:
                    return  # only nodes whose children are all leaves are pruning candidates
            # cost before pruning: 2 * alpha because the node currently has two leaf children
            pre_prune_value = 2 * alpha
            for child_node in [cur_node.left_child_Node, cur_node.right_child_Node]:
                # empirical entropy of each leaf child
                if child_node is None:
                    # one child may be missing when its sample subset was smaller than min_sample_leaf
                    continue
                for key, value in child_node.target_dist.items():  # class distribution of the leaf
                    pre_prune_value += -1 * child_node.n_samples * value * np.log(value) * \
                        child_node.weight_dist.get(key, 1.0)
            # cost after pruning, when the current node itself becomes a leaf
            after_prune_value = alpha
            for key, value in cur_node.target_dist.items():  # class distribution of the node to prune
                after_prune_value += -1 * cur_node.n_samples * value * np.log(value) * \
                    cur_node.weight_dist.get(key, 1.0)
            if after_prune_value <= pre_prune_value:  # prune: collapse the node into a leaf
                cur_node.left_child_Node = None
                cur_node.right_child_Node = None
                cur_node.feature_idx, cur_node.feature_val = None, None

    def prune(self, alpha=0.01):
        """
        Post-pruning (Li Hang): minimize C(T) + alpha * |T|.
        :param alpha: pruning threshold, trading off fit to the training data against tree complexity
        :return:
        """
        self._prune_node(self.root_node, alpha)
        return self.root_node
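
For reference, the quantity that _prune_node compares on both sides of a candidate cut is Li Hang's pruning cost, written here in LaTeX for readability (the code additionally scales each class term by its mean sample weight, which is an extension of the textbook formula). With |T| leaves, N_t samples in leaf t and N_tk of them in class k:

\[
C_\alpha(T) = C(T) + \alpha\,|T|, \qquad
C(T) = \sum_{t=1}^{|T|} N_t H_t(T) = -\sum_{t=1}^{|T|} \sum_{k} N_{tk} \log \frac{N_{tk}}{N_t}
\]

pre_prune_value is this cost when the node keeps its two leaf children (|T| = 2), after_prune_value is the cost when the node itself becomes a single leaf (|T| = 1); the subtree is collapsed when the pruned cost is not larger.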

7. Testing the Classification Decision Tree

test_decision_tree_C.py

import pandas as pd
from decision_tree_C import DecisionTreeClassifier
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# data = pd.read_csv("data/watermelon.csv").iloc[:, 1:]
# X = data.iloc[:, :-1]
# y = data.iloc[:, -1]
# iris = load_iris()
# X, y = iris.data, iris.target
# bc_data = load_breast_cancer()
# X, y = bc_data.data, bc_data.target
nursery = pd.read_csv("data/nursery.csv").dropna()
X, y = np.asarray(nursery.iloc[:, :-1]), np.asarray(nursery.iloc[:, -1])
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
depth = np.linspace(2, 12, 11, dtype=np.int64)
accuracy = []
for d in depth:
    dtc = DecisionTreeClassifier(is_feature_all_R=False, max_depth=d)
    dtc.fit(X_train, y_train)
    y_pred_labels = dtc.predict(X_test)
    acc = accuracy_score(y_test, y_pred_labels)
    # print(acc)
    accuracy.append(acc)
# dtc = DecisionTreeClassifier(dbw_feature_idx=[6, 7], max_bins=8, max_depth=2)
# dtc.fit(X, y)
# y_pred_prob = dtc.predict_proba(X)
# print(y_pred_prob)
# print(classification_report(y_test, y_pred_labels))
plt.figure(figsize=(7, 5))
plt.plot(depth, accuracy, "ko-", lw=1)
plt.show()

test_decision_tree_C_2.py

import numpy as np
import matplotlib.pyplot as plt
from decision_tree_C import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, accuracy_score
from utils.plt_decision_function import plot_decision_function

# generate data
data, target = make_classification(n_samples=100, n_features=2, n_classes=2, n_informative=1, n_redundant=0,
                                   n_clusters_per_class=1, class_sep=0.8, random_state=21)
# print(data)
# print(target)
cart_tree = DecisionTreeClassifier(is_feature_all_R=True)
cart_tree.fit(data, target)
y_test_pred = cart_tree.predict(data)
print(classification_report(target, y_test_pred))
plt.figure(figsize=(14, 10))
plt.subplot(221)
acc = accuracy_score(target, y_test_pred)
plot_decision_function(data, target, cart_tree, acc=acc, is_show=False, title_info="By CART UnPrune")
# pruning with increasing alpha
alpha = [1, 3, 5]
for i in range(3):
    cart_tree.prune(alpha=alpha[i])
    y_test_pred = cart_tree.predict(data)
    acc = accuracy_score(target, y_test_pred)
    plt.subplot(222 + i)
    plot_decision_function(data, target, cart_tree, acc=acc, is_show=False,
                           title_info="By CART Prune α = %.1f" % alpha[i])
plt.tight_layout()
plt.show()

test_decision_tree_C_3.py

import copy
import numpy as np
import matplotlib.pyplot as plt
from decision_tree_C import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer, load_iris
from sklearn.metrics import classification_report, accuracy_score
from utils.plt_decision_function import plot_decision_function
from sklearn.model_selection import StratifiedKFold

bc_data = load_breast_cancer()
X, y = bc_data.data, bc_data.target
alphas = np.linspace(0, 10, 30)
accuracy_scores = []  # mean cross-validation score for each alpha threshold
cart = DecisionTreeClassifier(criterion="cart", is_feature_all_R=True, max_bins=10)
for alpha in alphas:
    scores = []
    k_fold = StratifiedKFold(n_splits=10).split(X, y)
    for train_idx, test_idx in k_fold:
        tree = copy.deepcopy(cart)
        tree.fit(X[train_idx], y[train_idx])
        tree.prune(alpha=alpha)
        y_test_pred = tree.predict(X[test_idx])
        scores.append(accuracy_score(y[test_idx], y_test_pred))
        del tree
    print(alpha, ":", np.mean(scores))
    accuracy_scores.append(np.mean(scores))
plt.figure(figsize=(7, 5))
plt.plot(alphas, accuracy_scores, "ko-", lw=1)
plt.grid(ls=":")
plt.xlabel("Alpha", fontdict={"fontsize": 12})
plt.ylabel("Accuracy Scores", fontdict={"fontsize": 12})
plt.title("Cross Validation Scores under different Prune Alpha", fontdict={"fontsize": 14})
plt.show()
