The sklearn API reference is at http://scikit-learn.org/stable/modules/classes.html
scikit-learn ships with a number of built-in datasets, the most famous being the Iris dataset.
The 3rd and 4th columns of the data are the petal length and petal width; the class label column has already been encoded as integers: 0 = Iris-Setosa, 1 = Iris-Versicolor, 2 = Iris-Virginica.
from IPython.display import Image
%matplotlib inline

# Added version check for recent scikit-learn 0.18 behavior
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

from sklearn import datasets
import numpy as np

iris = datasets.load_iris()
# http://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
X = iris.data[:, [2, 3]]  # columns 3 and 4: petal length and petal width
y = iris.target  # the species column, i.e. the class labels
print('Class labels:', np.unique(y))
# Output: Class labels: [0 1 2]
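To double-check that columns 3 and 4 really are the petal measurements, the dataset's feature names can be printed; the output below is what load_iris is expected to return (worth verifying against your scikit-learn version):

print(iris.feature_names)
# Output: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']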
Split the dataset into a training set and a test set, here 70% training and 30% test:
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

# train_test_split splits the arrays into random train and test subsets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

X_train.shape
# Output: (105, 2)
X_test.shape
# Output: (45, 2)
X.shape
# Output: (150, 2)
y_train.shape
# Output: (105,)
y_test.shape
# Output: (45,)
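Note that this split is not stratified. When the class proportions should be preserved in both subsets, train_test_split also accepts a stratify argument; a minimal sketch (assuming scikit-learn >= 0.17, where stratify was introduced):

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0,
    stratify=y)  # keep the class distribution identical in train and test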

Non-tree models generally require the features to be standardized, so that differences in feature scale do not dominate the model. After standardization each feature has zero mean and unit variance; this is also called z-score normalization (zero-mean normalization). It is computed by subtracting the feature's mean from each value and dividing by the feature's standard deviation.
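In formula form, the standardized value of a feature x for sample i is

$$x^{(i)}_{\mathrm{std}} = \frac{x^{(i)} - \mu_x}{\sigma_x},$$

where \mu_x is the sample mean of that feature and \sigma_x its standard deviation.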
# scaler = sklearn.preprocessing.StandardScaler().fit(train)
# scaler.transform(train); scaler.transform(test)
# fit() estimates the scaling parameters, transform() applies them
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()  # create a scaler object to transform the data
sc.fit(X_train)  # fit it on X_train and store the fitted parameters
# Output: StandardScaler(copy=True, with_mean=True, with_std=True)
# type(sc.fit(X_train))
# Output: sklearn.preprocessing.data.StandardScaler
sc.scale_  # per-feature standard deviation (the older sc.std_ gives the same result)
# Output: array([ 1.79595918,  0.77769705])
sc.mean_  # per-feature mean
# Output: array([ 3.82857143,  1.22666667])

import numpy as np
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# check how standardization works: undo the transform and compare with the raw data
at = X_train_std[:5] * sc.scale_ + sc.mean_
a = X_train[:5]
at == a
# Output:
# array([[ True,  True],
#        [ True,  True],
#        [ True,  True],
#        [ True,  True],
#        [ True,  True]], dtype=bool)
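Exact floating-point equality happens to hold here, but a more robust check uses np.allclose, or the scaler's built-in inverse_transform; a minimal sketch (both are standard NumPy / scikit-learn APIs):

print(np.allclose(X_train_std * sc.scale_ + sc.mean_, X_train))  # expected: True
print(np.allclose(sc.inverse_transform(X_train_std), X_train))   # expected: True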

For each of the algorithms below, the plot_decision_regions function visualizes the decision regions, which makes the classification results easy to inspect.
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings


def versiontuple(v):
    # NumPy version check helper
    return tuple(map(int, (v.split("."))))


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # Plot decision regions: X holds the features, y the labels, classifier is
    # a fitted estimator, test_idx are the indices of the test-set samples.

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1  # range of the first feature (x-axis)
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1  # range of the second feature (y-axis)
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    # resolution is the grid step size; xx1 and xx2 have the same shape
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    # ravel flattens the grids; Z holds the predicted class for every grid point
    Z = Z.reshape(xx1.shape)  # reshape Z back to the 2-D grid shape
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    # contourf fills the regions between contour levels of Z with different colors
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # scatter the full dataset, one color and marker per class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=cmap(idx),
                    marker=markers[idx], label=cl)

    # highlight test samples
    if test_idx:
        # plot all samples
        if not versiontuple(np.__version__) >= versiontuple('1.9.0'):
            X_test, y_test = X[list(test_idx), :], y[list(test_idx)]
            warnings.warn('Please update to NumPy 1.9.0 or newer')
        else:
            # the two feature columns and the labels of the test samples
            X_test, y_test = X[test_idx, :], y[test_idx]

        # c='' draws unfilled circles, so the test points are highlighted
        # without distinguishing their classes by color
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')

from sklearn.linear_model import Perceptron
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron
# ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0)
ppn = Perceptron()  # decision function: y = w.x + b
ppn.fit(X_train_std, y_train)
# Output: Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
#             n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
#             verbose=0, warm_start=False)
ppn.coef_  # the feature weights w of the decision function, one row per class
# Output: array([[-1.48746619, -1.1229737 ],
#                [ 3.0624304 , -2.18594118],
#                [ 2.9272062 ,  2.64027405]])
ppn.intercept_  # the bias terms b
# Output: array([-1.,  0., -2.])
y_pred = ppn.predict(X_test_std)  # predict the class labels of the test set
y_pred
# Output: array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
#                0, 2, 0, 0, 1, 0, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0, 0])
y_test
# Output: array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
#                0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0, 0])
y_pred == y_test
# Output: array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
#                 True, False,  True,  True,  True,  True,  True,  True,  True,
#                 True,  True,  True,  True,  True,  True,  True,  True,  True,
#                 True, False,  True,  True,  True,  True,  True,  True,  True,
#                 True, False,  True,  True,  True,  True,  True,  True,  True], dtype=bool)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
# Output: Misclassified samples: 3
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
# accuracy = (len(y_test) - 3) / len(y_test) = 0.9333...
# Output: Accuracy: 0.93
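The same number can also be read directly off the estimator: every scikit-learn classifier has a score method returning the mean accuracy, so the following should print the same value as accuracy_score above:

print('Accuracy: %.2f' % ppn.score(X_test_std, y_test))  # mean accuracy on the test set
# Output: Accuracy: 0.93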

Build a perceptron classifier on the standardized data and plot its decision regions:
%matplotlib inline
X_combined_std = np.vstack((X_train_std, X_test_std))  # shape (150, 2)
y_combined = np.hstack((y_train, y_test))  # shape (150,)

plot_decision_regions(X=X_combined_std, y=y_combined,
                      classifier=ppn, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()  # compact the layout so labels and titles do not overlap
# plt.savefig('./figures/iris_perceptron_scikit.png', dpi=300)
plt.show()
from sklearn.linear_model import LogisticRegression
# http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=lr, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()
Overfitting and regularization
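For reference, in the binary case (labels y_i in {-1, 1}) the scikit-learn documentation states the L2-regularized logistic regression objective as

$$\min_{w,\,c}\ \frac{1}{2} w^{T} w + C \sum_{i=1}^{n} \log\bigl(1 + \exp\bigl(-y_i\,(x_i^{T} w + c)\bigr)\bigr),$$

so C scales the data-fit term: the smaller C is, the more the penalty term dominates and the more the weights shrink toward zero, which is what the plot produced below shows.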
weights, params = [], []

for c in range(-5, 6):
    lr = LogisticRegression(C=10**c, random_state=0)
    # L2 regularization by default; C is the inverse of the regularization
    # strength, so smaller C means stronger regularization and smaller weights
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])  # coef_[1]: weight vector of class 1 (one-vs-rest)
    params.append(10**c)

weights = np.array(weights)
plt.plot(params, weights[:, 0], label='petal length')
plt.plot(params, weights[:, 1], linestyle='--', label='petal width')
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.xscale('log')  # logarithmic scale on the x-axis
# plt.savefig('./figures/regression_path.png', dpi=300)
plt.show()

from sklearn.svm import SVC
# http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined, classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_linear.png', dpi=300)
plt.show()
The figure above uses a linear SVM, but some points are clearly misclassified. As a next step, consider using a kernel function for nonlinear classification.
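The RBF (Gaussian) kernel used below scores the similarity of two samples as

$$K(x, x') = \exp\bigl(-\gamma\,\lVert x - x' \rVert^{2}\bigr),$$

where gamma is the corresponding parameter of SVC: larger values make the kernel more local and the decision boundary more tightly wrapped around the training points.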
from sklearn.svm import SVC
svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_1.png', dpi=300)
plt.show()
from sklearn.tree import DecisionTreeClassifier
# http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
# criterion sets the impurity measure used to select splits ('entropy' means
# splits are chosen by information gain); max_depth limits the depth of the tree
tree.fit(X_train, y_train)  # trees do not need standardized features, so the raw X_train is used

X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X_combined, y_combined, classifier=tree, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/decision_tree_decision.png', dpi=300)
plt.show()
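As a side note, the fitted tree itself can be exported for inspection with scikit-learn's export_graphviz; a minimal sketch (the output file name is arbitrary, and rendering the .dot file requires GraphViz):

from sklearn.tree import export_graphviz
export_graphviz(tree, out_file='tree.dot',
                feature_names=['petal length', 'petal width'])
# render on the command line with: dot -Tpng tree.dot -o tree.png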
from sklearn.ensemble import RandomForestClassifier
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
forest = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=1, n_jobs=2)
# criterion: impurity measure used to select splits; n_estimators: number of
# trees in the forest; n_jobs: number of jobs used to fit the trees in parallel
forest.fit(X_train, y_train)
plot_decision_regions(X_combined, y_combined, classifier=forest, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/random_forest.png', dpi=300)
plt.show()
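A fitted random forest also reports how much each feature contributed to its splits through the feature_importances_ attribute; a quick check (the printed values depend on the fitted model and are not taken from this post):

for name, importance in zip(['petal length', 'petal width'],
                            forest.feature_importances_):
    print('%s: %.3f' % (name, importance))  # importances sum to 1.0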
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
# http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
# metric='minkowski' with p=2 is the Euclidean (L2) distance
knn.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=knn, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/k_nearest_neighbors.png', dpi=300)
plt.show()
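For reference, the 'minkowski' metric with parameter p is

$$d(x, y) = \Bigl(\sum_{i} \lvert x_i - y_i \rvert^{p}\Bigr)^{1/p},$$

so p = 1 gives the Manhattan distance and p = 2, used here, the Euclidean distance.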