
Modeling and Visualizing Several Classification Algorithms with Python's Machine-Learning Library sklearn (Experiment)

The sklearn API reference is available at http://scikit-learn.org/stable/modules/classes.html

scikit-learn ships with several built-in datasets, the most famous being the Iris dataset.

Columns 3 and 4 of the dataset hold the petal length and width; the class-label column has already been encoded as integers: 0 = Iris-Setosa, 1 = Iris-Versicolor, 2 = Iris-Virginica.
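For a quick look at what those columns are, the loaded dataset object exposes the feature and class names directly (a minimal self-contained sketch; feature_names and target_names are standard attributes of the object returned by load_iris):

    from sklearn import datasets

    iris = datasets.load_iris()
    print(iris.feature_names)  # the four measurement columns, in order
    print(iris.target_names)   # ['setosa' 'versicolor' 'virginica']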

 

1. Import the Python libraries and the experiment dataset

    from IPython.display import Image
    %matplotlib inline

    # Version check for the scikit-learn 0.18 API changes
    from distutils.version import LooseVersion as Version
    from sklearn import __version__ as sklearn_version

    from sklearn import datasets
    import numpy as np

    iris = datasets.load_iris()
    # http://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
    X = iris.data[:, [2, 3]]  # petal length and petal width
    y = iris.target           # the species column, i.e. the class labels
    print('Class labels:', np.unique(y))
    # Output: Class labels: [0 1 2]


2. Splitting the dataset

Split the dataset into a training set and a test set: here 70% goes to training and 30% to testing.

    if Version(sklearn_version) < '0.18':
        from sklearn.cross_validation import train_test_split
    else:
        from sklearn.model_selection import train_test_split

    # train_test_split splits the dataset: 70% train, 30% test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    X_train.shape
    # Output: (105, 2)
    X_test.shape
    # Output: (45, 2)
    X.shape
    # Output: (150, 2)
    y_train.shape
    # Output: (105,)
    y_test.shape
    # Output: (45,)
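One caveat worth noting, though the original experiment does not use it: by default train_test_split does not preserve the class proportions. scikit-learn (0.17 and later) accepts a stratify argument for that; a minimal sketch with hypothetical variable names, not used in the rest of the experiment:

    # stratified variant: the 0/1/2 label ratio is the same in both splits
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y)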

 

3. Standardizing the features

Non-tree models generally require the features to be standardized, so that differences in scale between features do not dominate the fit. After standardization each feature dimension has zero mean and unit variance. This is also called z-score normalization (zero-mean normalization): subtract the mean from each feature value and divide by the standard deviation.
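A minimal sketch of the formula itself in plain NumPy, before turning to StandardScaler below:

    import numpy as np

    x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    z = (x - x.mean()) / x.std()  # z-score: subtract the mean, divide by the std
    print(z.mean(), z.std())      # approximately 0.0 and 1.0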

    # scaler = sklearn.preprocessing.StandardScaler().fit(train)
    # scaler.transform(train); scaler.transform(test)
    # fit() learns the scaling parameters, transform() applies them
    from sklearn.preprocessing import StandardScaler

    sc = StandardScaler()  # create a scaler object to transform the dataset
    sc.fit(X_train)        # fit on X_train and store the fitted parameters
    # Output: StandardScaler(copy=True, with_mean=True, with_std=True)
    # type(sc.fit(X_train))
    # Output: sklearn.preprocessing.data.StandardScaler

    sc.scale_  # sc.std_ returns the same values
    # Output: array([ 1.79595918,  0.77769705])
    sc.mean_
    # Output: array([ 3.82857143,  1.22666667])

    import numpy as np
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    # verify the standardization by inverting it: x = z * scale + mean
    at = X_train_std[:5] * sc.scale_ + sc.mean_
    a = X_train[:5]
    at == a  # exact float equality happens to hold here; np.isclose is safer in general
    # Output:
    # array([[ True,  True],
    #        [ True,  True],
    #        [ True,  True],
    #        [ True,  True],
    #        [ True,  True]], dtype=bool)
 

4. Classification with various algorithms, with visualization

For each algorithm below, the plot_decision_regions function visualizes the decision regions, so the classification result can be inspected at a glance.

    from matplotlib.colors import ListedColormap
    import matplotlib.pyplot as plt
    import warnings

    def versiontuple(v):
        # helper for the NumPy version check below
        return tuple(map(int, (v.split("."))))

    def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
        # Plot decision regions: X is the feature matrix, y the labels,
        # classifier a fitted model, test_idx the indices of the test samples.

        # set up marker generator and color map
        markers = ('s', 'x', 'o', '^', 'v')
        colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
        cmap = ListedColormap(colors[:len(np.unique(y))])

        # plot the decision surface
        x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1  # range of the first feature (x-axis)
        x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1  # range of the second feature (y-axis)
        xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                               np.arange(x2_min, x2_max, resolution))
        # resolution is the grid step size; xx1 and xx2 have the same shape
        Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
        # ravel flattens the grids; Z holds the predicted class for every grid point
        Z = Z.reshape(xx1.shape)  # reshape the predictions back to the grid shape
        plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
        # contourf(x, y, z): x and y are coordinate arrays and z the value at each
        # point; the regions between contour lines are filled with different colors
        plt.xlim(xx1.min(), xx1.max())
        plt.ylim(xx2.min(), xx2.max())

        # scatter-plot the full dataset, one color and marker per class
        for idx, cl in enumerate(np.unique(y)):
            plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                        alpha=0.8, c=cmap(idx),
                        marker=markers[idx], label=cl)

        # highlight test samples
        if test_idx:
            if not versiontuple(np.__version__) >= versiontuple('1.9.0'):
                X_test, y_test = X[list(test_idx), :], y[list(test_idx)]
                warnings.warn('Please update to NumPy 1.9.0 or newer')
            else:
                # take the two feature columns and the labels of the test samples
                X_test, y_test = X[test_idx, :], y[test_idx]
            plt.scatter(X_test[:, 0],
                        X_test[:, 1],
                        c='',  # empty color string: test points are drawn hollow, not colored by class
                        alpha=1.0,
                        linewidths=1,
                        marker='o',
                        s=55, label='test set')
 

 

Classification with scikit-learn's perceptron (three-class classification)

 

    from sklearn.linear_model import Perceptron
    # http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron
    # ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0)
    ppn = Perceptron()  # decision function: y = w.x + b
    ppn.fit(X_train_std, y_train)
    # Output: Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
    #                    n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
    #                    verbose=0, warm_start=False)

    ppn.coef_  # feature weights w of the decision function (one row per class)
    # Output: array([[-1.48746619, -1.1229737 ],
    #                [ 3.0624304 , -2.18594118],
    #                [ 2.9272062 ,  2.64027405]])
    ppn.intercept_  # bias terms b of the decision function
    # Output: array([-1.,  0., -2.])

    y_pred = ppn.predict(X_test_std)  # predict class labels for the test set
    y_pred
    # Output: array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
    #                0, 2, 0, 0, 1, 0, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0, 0])
    y_test
    # Output: array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
    #                0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0, 0])
    y_pred == y_test
    # Output: array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
    #                 True, False,  True,  True,  True,  True,  True,  True,  True,
    #                 True,  True,  True,  True,  True,  True,  True,  True,  True,
    #                 True, False,  True,  True,  True,  True,  True,  True,  True,
    #                 True, False,  True,  True,  True,  True,  True,  True,  True], dtype=bool)

    print('Misclassified samples: %d' % (y_test != y_pred).sum())
    # Output: Misclassified samples: 3
    from sklearn.metrics import accuracy_score
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    # accuracy = (len(y_test) - 3) / len(y_test) = 0.9333...
    # Output: Accuracy: 0.93
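To see which classes the perceptron confuses, a confusion matrix helps (a minimal sketch; confusion_matrix lives in the same sklearn.metrics module as accuracy_score, though the original experiment does not call it):

    from sklearn.metrics import confusion_matrix

    # rows are true classes, columns are predicted classes;
    # the off-diagonal entries account for the 3 misclassified samples
    print(confusion_matrix(y_test, y_pred))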

Plot the perceptron's decision regions on the standardized data

    %matplotlib inline
    X_combined_std = np.vstack((X_train_std, X_test_std))  # shape (150, 2)
    y_combined = np.hstack((y_train, y_test))              # shape (150,)
    plot_decision_regions(X=X_combined_std, y=y_combined,
                          classifier=ppn, test_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()  # compact layout; avoids overlapping figure elements
    # plt.savefig('./figures/iris_perceptron_scikit.png', dpi=300)
    plt.show()
 

Predicting per-class probabilities with scikit-learn's logistic regression (three-class classification)

    from sklearn.linear_model import LogisticRegression
    # http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
    lr = LogisticRegression(C=1000.0, random_state=0)
    lr.fit(X_train_std, y_train)
    plot_decision_regions(X_combined_std, y_combined, classifier=lr, test_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    # plt.savefig('./figures/logistic_regression.png', dpi=300)
    plt.show()
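The section title promises per-class probabilities, but the code above only plots decision regions. A minimal sketch of the probability prediction itself (predict_proba is a standard LogisticRegression method):

    probs = lr.predict_proba(X_test_std[:3])  # per-class probabilities for the first 3 test samples
    print(probs)                 # one row per sample, one column per class; each row sums to 1
    print(probs.argmax(axis=1))  # the predicted class is the column with the largest probability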

 

 

Overfitting and regularization

    weights, params = [], []
    for c in range(-5, 6):
        # L2 regularization by default; C is the inverse of the regularization
        # strength (smaller C -> stronger regularization -> smaller weights)
        lr = LogisticRegression(C=10**c, random_state=0)
        lr.fit(X_train_std, y_train)
        weights.append(lr.coef_[1])  # coef_[1]: weights of the class-1 (Versicolor) classifier
        params.append(10**c)
    weights = np.array(weights)
    plt.plot(params, weights[:, 0], label='petal length')
    plt.plot(params, weights[:, 1], linestyle='--', label='petal width')
    plt.ylabel('weight coefficient')
    plt.xlabel('C')
    plt.legend(loc='upper left')
    plt.xscale('log')  # logarithmic scale on the x-axis
    # plt.savefig('./figures/regression_path.png', dpi=300)
    plt.show()

 

Classification with scikit-learn's SVM (three-class classification)

    from sklearn.svm import SVC
    # http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
    svm = SVC(kernel='linear', C=1.0, random_state=0)
    svm.fit(X_train_std, y_train)
    plot_decision_regions(X_combined_std, y_combined, classifier=svm, test_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    # plt.savefig('./figures/support_vector_machine_linear.png', dpi=300)
    plt.show()

 

The plot above uses a linear SVM, and some points are still misclassified. As a next step, consider using a kernel function for nonlinear classification.

    from sklearn.svm import SVC
    # RBF (Gaussian) kernel; gamma controls the width of the kernel
    svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0)
    svm.fit(X_train_std, y_train)
    plot_decision_regions(X_combined_std, y_combined, classifier=svm, test_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    # plt.savefig('./figures/support_vector_machine_rbf_iris_1.png', dpi=300)
    plt.show()
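The gamma parameter controls how far the influence of a single training example reaches; increasing it makes the decision boundary hug the training points ever more tightly. A minimal sketch to see this (gamma=100.0 is an illustrative value, not from the original experiment):

    # a much larger gamma gives very tight, likely overfitted decision regions
    svm_tight = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0)
    svm_tight.fit(X_train_std, y_train)
    plot_decision_regions(X_combined_std, y_combined, classifier=svm_tight, test_idx=range(105, 150))
    plt.show()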
 

 

 

Classification with scikit-learn's decision tree (three-class classification)

    from sklearn.tree import DecisionTreeClassifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
    # criterion is the split-quality measure ('entropy' = information gain);
    # max_depth caps the maximum depth of the tree
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
    tree.fit(X_train, y_train)  # trees do not require standardized features
    X_combined = np.vstack((X_train, X_test))
    y_combined = np.hstack((y_train, y_test))
    plot_decision_regions(X_combined, y_combined, classifier=tree, test_idx=range(105, 150))
    plt.xlabel('petal length [cm]')
    plt.ylabel('petal width [cm]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    # plt.savefig('./figures/decision_tree_decision.png', dpi=300)
    plt.show()
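The fitted tree can also be exported for inspection (a sketch using sklearn.tree.export_graphviz, which writes a GraphViz .dot file; rendering it to an image needs the separate graphviz tool):

    from sklearn.tree import export_graphviz

    export_graphviz(tree, out_file='tree.dot',
                    feature_names=['petal length', 'petal width'])
    # then, on the command line:  dot -Tpng tree.dot -o tree.png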


Classification with scikit-learn's random forest (three-class classification)

    from sklearn.ensemble import RandomForestClassifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
    # criterion: split-quality measure; n_estimators: number of trees in the
    # forest; n_jobs: number of jobs used to fit the trees in parallel
    forest = RandomForestClassifier(criterion='entropy', n_estimators=10,
                                    random_state=1, n_jobs=2)
    forest.fit(X_train, y_train)
    plot_decision_regions(X_combined, y_combined, classifier=forest, test_idx=range(105, 150))
    plt.xlabel('petal length [cm]')
    plt.ylabel('petal width [cm]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    # plt.savefig('./figures/random_forest.png', dpi=300)
    plt.show()
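A fitted forest also reports how much each feature contributed to the splits; feature_importances_ is a standard attribute (only the two petal features are present here):

    for name, importance in zip(['petal length', 'petal width'],
                                forest.feature_importances_):
        print('%s: %.3f' % (name, importance))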

 


Classification with scikit-learn's k-nearest neighbors (three-class classification)

    from sklearn.neighbors import KNeighborsClassifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
    # metric='minkowski' with p=2 is the Euclidean (L2) distance
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    knn.fit(X_train_std, y_train)
    plot_decision_regions(X_combined_std, y_combined, classifier=knn, test_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    # plt.savefig('./figures/k_nearest_neighbors.png', dpi=300)
    plt.show()
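n_neighbors=5 is a common default; a minimal sketch of picking k by cross-validation instead (cross_val_score is standard; the import path assumes scikit-learn >= 0.18, matching the version check at the top):

    from sklearn.model_selection import cross_val_score

    for k in [1, 3, 5, 7, 9]:
        knn_k = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')
        scores = cross_val_score(knn_k, X_train_std, y_train, cv=5)
        print('k=%d  mean accuracy: %.3f' % (k, scores.mean()))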

 

 

 
