from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
 
# Load the iris dataset and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples as the test set (no seed is given, so the
# split differs from run to run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build the random forest (100 trees, depth limited to 2) and train it;
# fit() returns the estimator itself, so construction and fitting are chained
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)

# Predict class labels for the test set
y_pred = clf.predict(X_test)

# Print the mean accuracy on the test set
print("Accuracy:", clf.score(X_test, y_test))

三、极端随机树

3.1、极端随机树简介

极端随机树（Extra Trees，即 Extremely Randomized Trees）是一种基于决策树的集成学习算法，它和随机森林类似，同样采用随机特征选择的方法构建多棵决策树。不同之处主要有两点：第一，随机森林对训练集做自助采样（bootstrap），而极端随机树默认使用全部训练样本来训练每一棵树；第二，极端随机树在分裂节点时，会为候选特征随机选择划分点，而不是像决策树和随机森林那样搜索最优的划分点。这样可以进一步减少模型的方差，提高模型的鲁棒性。

3.2、具体案例实例

下面是一个使用极端随机树算法来分类鸢尾花数据集的Python代码：


from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
 
# Load the iris dataset and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples as the test set (no seed is given, so the
# split differs from run to run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build the extra-trees ensemble (100 trees, depth limited to 2) and train
# it; fit() returns the estimator itself, so the two steps are chained
clf = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)

# Predict class labels for the test set
y_pred = clf.predict(X_test)

# Print the mean accuracy on the test set
print("Accuracy:", clf.score(X_test, y_test))

四、单棵决策树分类器

4.1、单棵决策树分类器简介

单棵决策树分类器是一种基本的分类算法，它根据特征取值将数据集逐层划分为多个子集，每次划分对应一个决策节点，直到叶子节点为止。在分类时，根据测试样本的特征值，沿着决策树从根节点到叶子节点依次做出决策，最终得到分类结果。单棵决策树分类器容易过拟合，因此通常需要采用集成学习算法来提高分类性能。

4.2、具体案例实例

下面是一个使用单棵决策树分类器来分类鸢尾花数据集的Python代码：


from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
 
# Load the iris dataset and pull out the feature matrix and the class labels
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples as the test set (no seed is given, so the
# split differs from run to run)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Build a single decision tree (depth limited to 2) and train it;
# fit() returns the estimator itself, so the two steps are chained
clf = DecisionTreeClassifier(
    max_depth=2,
    random_state=0,
).fit(X_train, y_train)

# Predict class labels for the test set
y_pred = clf.predict(X_test)

# Print the mean accuracy on the test set
print("Accuracy:", clf.score(X_test, y_test))

五、综合实例

使用随机森林算法、极端随机树和单棵决策树分类器对手写数字数据进行对比分析


# Compare a random forest, extremely randomized trees and a single decision
# tree classifier on the handwritten-digits dataset
import matplotlib.pyplot as plt
# scikit-learn's built-in datasets
from sklearn.datasets import load_digits

# Select a font that contains Chinese glyphs so the Chinese plot labels are
# not rendered as garbled boxes
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus
plt.rcParams['axes.unicode_minus'] = False

# Load the handwritten-digits data
digits = load_digits()

# Show the first 100 digit images as a 10x10 grid
plt.figure(1, figsize=(2.5, 2.5), facecolor='white')
for i in range(10):
    for j in range(10):
        # Row-major position inside the grid; subplot numbers are 1-based
        ax = plt.subplot(10, 10, 10 * i + j + 1)
        ax.set_xticks([])  # hide the x-axis ticks
        ax.set_yticks([])  # hide the y-axis ticks
        # The image index must use the same row stride (10) as the subplot
        # index; a stride of 9 repeated the last image of each row at the
        # start of the next row and never reached images 91-99.
        plt.imshow(digits.images[10 * i + j], cmap=plt.cm.gray_r, interpolation="nearest")
plt.show()
# Train/test splitting helper
from sklearn.model_selection import train_test_split
# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier
# Random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Extremely randomized trees classifier
from sklearn.ensemble import ExtraTreesClassifier

# Split the digits into a training set and a test set (30% held out for testing)
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3)

# All three models use the same depth limit, split criterion and seed
# Single decision tree
dtc = DecisionTreeClassifier(max_depth=7, criterion='entropy', random_state=0)
# Random forest with 99 trees
rfc = RandomForestClassifier(max_depth=7, criterion='entropy', n_estimators=99, random_state=0)
# Extremely randomized trees (library-default number of trees)
etc = ExtraTreesClassifier(max_depth=7, criterion='entropy', random_state=0)

dtc.fit(x_train, y_train)
rfc.fit(x_train, y_train)
etc.fit(x_train, y_train)

# Report training and test accuracy of each model.
# Two labels were corrected: the random-forest training line no longer
# contains a stray "在决策树", and the extra-trees test line now reads
# "测试集" instead of the truncated "试集".
print("决策树在训练集上的准确率：%.3f" % dtc.score(x_train, y_train))
print("决策树在测试集上的准确率：%.3f" % dtc.score(x_test, y_test))
print("随机森林（T=99）在训练集上的准确率：%.3f" % rfc.score(x_train, y_train))
print("随机森林（T=99）在测试集上的准确率：%.3f" % rfc.score(x_test, y_test))
print("极端随机树在训练集上的准确率：%.3f" % etc.score(x_train, y_train))
print("极端随机树在测试集上的准确率：%.3f" % etc.score(x_test, y_test))
# Observe how the number of base (weak) classifiers affects the accuracy of
# the random forest
# Largest number of base classifiers to try
T_max = 39
rfc_train_scores = []
rfc_test_scores = []
for i in range(1, T_max + 1):
    # Use the same depth limit and split criterion as the other models in this
    # comparison; without them this curve is not comparable with the
    # extra-trees curve, which is built with max_depth=7 and entropy.
    rfc = RandomForestClassifier(max_depth=7, criterion='entropy', n_estimators=i)
    rfc.fit(x_train, y_train)
    rfc_train_scores.append(rfc.score(x_train, y_train))
    rfc_test_scores.append(rfc.score(x_test, y_test))

# Plot training and test accuracy against the number of base classifiers
plt.figure()
plt.plot(range(1, T_max + 1), rfc_train_scores, color='r', label='训练集')
plt.plot(range(1, T_max + 1), rfc_test_scores, color='g', label='测试集')
plt.title("随机森林基分类器数量对性能的影响")
plt.xlabel("基分类器数量")
plt.ylabel("准确率")
plt.xlim(1, T_max)
# Line width is passed as a number rather than the string '0.5'
plt.grid(color='gray', linewidth=0.5, linestyle='--')
plt.legend()
plt.show()
# Observe how the number of base (weak) classifiers affects the accuracy of
# the extremely randomized trees
# Largest number of base classifiers to try
T_max = 39
etc_train_scores = []
etc_test_scores = []
ensemble_sizes = range(1, T_max + 1)
for n_trees in ensemble_sizes:
    # Train an ensemble with n_trees base classifiers; fit() returns the
    # estimator itself, so construction and fitting are chained
    etc = ExtraTreesClassifier(
        max_depth=7,
        criterion='entropy',
        n_estimators=n_trees,
    ).fit(x_train, y_train)
    etc_train_scores.append(etc.score(x_train, y_train))
    etc_test_scores.append(etc.score(x_test, y_test))

# Plot training and test accuracy against the number of base classifiers
plt.figure()
for curve, colour, legend_label in (
    (etc_train_scores, 'r', '训练集'),
    (etc_test_scores, 'g', '测试集'),
):
    plt.plot(ensemble_sizes, curve, color=colour, label=legend_label)
plt.title("极端随机树基分类器数量对性能的影响")
plt.xlabel("基分类器数量")
plt.ylabel("准确率")
plt.xlim(1, T_max)
plt.grid(color='gray', linewidth='0.5', linestyle='--')
plt.legend()
plt.show()

总结

以上就是今天的内容~

最后欢迎大家点赞

声明：本文内容由网友自发贡献，不代表【wpsshop博客】立场，版权归原作者所有，本站不承担相应法律责任。如您发现有侵权的内容，请联系我们。转载请注明出处：https://www.wpsshop.cn/article/detail/52969?site