In practical settings we usually set aside an additional Dev (development) set alongside the test set and use it to tune the model after it has been built. The model is trained on the training set, its accuracy and other metrics are measured on the Dev data, and, based on those results, the model is tuned further if it needs to be improved.
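A minimal sketch of such a three-way split (a sketch only; later recipes in this post do the same thing with two successive train_test_split calls):

from sklearn.cross_validation import train_test_split
import numpy as np

data = np.arange(100).reshape(50, 2)                     # stand-in dataset of 50 rows
train, rest = train_test_split(data, train_size=0.7)     # 70% for training
dev, test = train_test_split(rest, train_size=0.5)       # split the remainder into Dev and Test
print(len(train), len(dev), len(test))                   # 35 7 8 -- roughly 70/15/15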
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
import numpy as np

def get_iris():
    data = load_iris()
    x = data['data']
    y = data['target']
    input_dataset = np.column_stack([x, y])
    np.random.shuffle(input_dataset)
    return input_dataset

# Split the data set 80/20
data = get_iris()
train, test = train_test_split(data, train_size=0.8)
print("train size", train.shape)
print("test size ", test.shape)
# Check whether the class labels are reasonably distributed between the training and test sets
def get_class_distribution(y):
    distribution = {}
    set_y = set(y)
    for y_label in set_y:
        no_element = len(np.where(y == y_label)[0])
        distribution[y_label] = no_element
    return distribution

def print_class_label_split(train, test):
    y_train = train[:, -1]
    train_distribution = get_class_distribution(y_train)
    print("\nTrain data set class label distribution")
    print("======================================\n")
    for k, v in train_distribution.items():
        print("class label=%d, Record count=%d" % (k, v))
    y_test = test[:, -1]
    test_distribution = get_class_distribution(y_test)
    print("\nTest data set class label distribution")
    print("======================================\n")
    for k, v in test_distribution.items():
        print("class label=%d, Record count=%d" % (k, v))

print_class_label_split(train, test)

train size (120, 5)
test size  (30, 5)

Train data set class label distribution
======================================

class label=0, Record count=36
class label=1, Record count=42
class label=2, Record count=42

Test data set class label distribution
======================================

class label=0, Record count=14
class label=1, Record count=8
class label=2, Record count=8
How to split the class labels evenly between the training and test sets
from sklearn.cross_validation import StratifiedShuffleSplit

straified_split = StratifiedShuffleSplit(data[:, -1], test_size=0.2, n_iter=1)
for train_index, test_index in straified_split:
    train = data[train_index]
    test = data[test_index]
    print_class_label_split(train, test)
Train data set class label distribution
======================================
class label=0, Record count=40
class label=1, Record count=40
class label=2, Record count=40
Test data set class label distribution
======================================
class label=0, Record count=10
class label=1, Record count=10
class label=2, Record count=10
First we need the confusion matrix, which tabulates the true class labels against the predicted labels.

| | Predicted T | Predicted F |
|---|---|---|
| Actual T | TP | FN |
| Actual F | FP | TN |

- TP: True Positive. The number of test instances whose true label is T and whose predicted label is also T.
- FN: False Negative. The number of test instances whose true label is T but whose predicted label is F.
- FP: False Positive. The number of test instances whose true label is F but whose predicted label is T.
- TN: True Negative. The number of test instances whose true label is F and whose predicted label is also F.

Accuracy is the fraction of predictions that are correct. From the confusion matrix we know that the number of correct predictions is the sum of TP and TN.
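As a quick illustration (a minimal sketch; the label lists here are made up for the example), accuracy can be read straight off the confusion matrix:

from sklearn.metrics import confusion_matrix, accuracy_score

y_true = [1, 0, 1, 1, 0, 1, 0, 0]          # hypothetical true labels
y_pred = [1, 0, 0, 1, 0, 1, 1, 0]          # hypothetical predicted labels

cm = confusion_matrix(y_true, y_pred)       # rows = actual, columns = predicted
tn, fp, fn, tp = cm.ravel()                 # unpack the four cells
print((tp + tn) / float(cm.sum()))          # accuracy = (TP + TN) / total
print(accuracy_score(y_true, y_pred))       # same value via the helper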
The K-nearest-neighbours algorithm (KNN) loads all of the training data into memory. When it has to classify a test instance, it measures the distance between that instance and every training instance and, based on those distances, selects the K nearest training instances. The predicted class of the test instance is the majority class among those K nearest neighbours.
# Prepare the data
from sklearn.datasets import make_classification
import numpy as np
import matplotlib.pyplot as plt
import itertools
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

def get_data():
    x, y = make_classification(n_features=4, n_samples=1000)
    return x, y

def plot_data(x, y):
    """
    Plot a scatter plot of all variable combinations
    """
    subplot_start = 321
    col_numbers = range(0, 4)
    col_pairs = itertools.combinations(col_numbers, 2)
    plt.figure(figsize=(16, 9))
    plt.subplots_adjust(wspace=0.36, hspace=0.2)
    for col_pair in col_pairs:
        plt.subplot(subplot_start)
        plt.scatter(x[:, col_pair[0]], x[:, col_pair[1]], c=y)
        title_string = str(col_pair[0]) + "-" + str(col_pair[1])
        plt.title(title_string)
        x_label = str(col_pair[0])
        y_label = str(col_pair[1])
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        subplot_start += 1

x, y = get_data()
plot_data(x, y)

from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

def get_train_test(x, y):
    train_size = 0.7
    test_size = 1 - train_size
    input_data = np.column_stack([x, y])
    straified_split = StratifiedShuffleSplit(input_data[:, -1], train_size=train_size, n_iter=1)
    for train_indx, test_indx in straified_split:
        train_x = input_data[train_indx, :-1]
        train_y = input_data[train_indx, -1]
        test_x = input_data[test_indx, :-1]
        test_y = input_data[test_indx, -1]
    return train_x, train_y, test_x, test_y

def build_model(x, y, k=2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x, y)
    return knn

def test_model(x, y, knn_model):
    y_predicted = knn_model.predict(x)
    print(classification_report(y, y_predicted))

x, y = get_data()
plot_data(x, y)
train_x, train_y, test_x, test_y = get_train_test(x, y)
knn_model = build_model(train_x, train_y)
test_model(test_x, test_y, knn_model)
test_model(train_x, train_y, knn_model)

             precision    recall  f1-score   support

        0.0       0.89      0.94      0.91        50
        1.0       0.94      0.88      0.91        50

avg / total       0.91      0.91      0.91       100

             precision    recall  f1-score   support

        0.0       0.90      1.00      0.95       353
        1.0       1.00      0.88      0.94       347

avg / total       0.95      0.94      0.94       700
This algorithm is driven by Bayes' rule, which for the movie-review example takes the form

    P(positive|review) = P(review|positive) * P(positive) / P(review)

To compare the two competing equations (one per class label) and decide on the final result, we can ignore the denominator, since it is only a common scaling factor. The left-hand side is called the posterior probability. In the numerator on the right-hand side, P(review|positive) * P(positive), the term P(positive) is the prior probability of a positive review: it encodes our belief, learned from the training set, about the distribution of the positive class label, and we estimate it from the training data as the fraction of training reviews labelled positive.
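A minimal sketch of that prior estimate (the label list here is hypothetical; the real labels come from get_data() below):

from collections import Counter

ylabel = ['pos', 'neg', 'pos', 'pos', 'neg']             # hypothetical training labels
counts = Counter(ylabel)
prior_positive = counts['pos'] / float(len(ylabel))      # P(positive) = #positive / #total
print(prior_positive)                                     # 0.6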
# Load the libraries and get the data
from nltk.corpus import movie_reviews, stopwords
from sklearn.cross_validation import StratifiedShuffleSplit
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

def get_data():
    dataset = []
    ylabel = []
    for cat in movie_reviews.categories():
        for field in movie_reviews.fileids(cat):
            words = list(movie_reviews.words(field))
            dataset.append((words, cat))
            ylabel.append(cat)
    return dataset, ylabel

def get_train_test(input_dataset, ylabel):
    stragiht_split = StratifiedShuffleSplit(ylabel, train_size=0.8, n_iter=1)
    for train_index, test_index in stragiht_split:
        train = [input_dataset[i] for i in train_index]
        train_y = [ylabel[i] for i in train_index]
        test = [input_dataset[i] for i in test_index]
        test_y = [ylabel[i] for i in test_index]
    return train, train_y, test, test_y

# Build the model
def build_word_features(instance):
    feature_set = {}
    words = instance[0]
    for word in words:
        feature_set[word] = 1
    return (feature_set, instance[1])

def build_negate_features(instance):
    words = instance[0]
    final_words = []
    negate = False
    negate_words = ['no', 'not']
    for word in words:
        if negate:
            word = 'not_' + word
            negate = False
        if word not in negate_words:
            final_words.append(word)
        else:
            negate = True
    feature_set = {}
    for word in final_words:
        feature_set[word] = 1
    return (feature_set, instance[1])

def remove_stop_words(in_data):
    stopwordlist = stopwords.words('english')
    negate_words = ['no', 'not']
    New_stopwords = [word for word in stopwordlist if word not in negate_words]
    label = in_data[1]
    words = [word for word in in_data[0] if word not in New_stopwords]
    return (words, label)

def build_keyphrase_features(instance):
    feature_set = {}
    instance = remove_stop_words(instance)
    words = instance[0]
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 400)
    for bigram in bigrams:
        feature_set[bigram] = 1
    return (feature_set, instance[1])

def build_model(features):
    model = nltk.NaiveBayesClassifier.train(features)
    return model

def probel_model(model, features, dataset_type='Train'):
    accuracy = nltk.classify.accuracy(model, features)
    print("\n" + dataset_type + "Accuracy=%0.2f" % (accuracy * 100) + "%")

def show_features(model, features=5):
    print("\nFeature Importance")
    print("=====================")
    print(model.show_most_informative_features(features))

# Model tuning: each cycle trains with a different feature builder
# (map() is wrapped in list() so the features can be reused for both training and evaluation in Python 3)
def build_model_cycle_1(train_data, dev_data):
    # Cycle 1: plain bag-of-words features
    train_features = list(map(build_word_features, train_data))
    dev_features = list(map(build_word_features, dev_data))
    model = build_model(train_features)
    probel_model(model, train_features)
    probel_model(model, dev_features, "Dev")
    return model

def build_model_cycle_2(train_data, dev_data):
    # Cycle 2: features with negation handling
    train_features = list(map(build_negate_features, train_data))
    dev_features = list(map(build_negate_features, dev_data))
    model = build_model(train_features)
    probel_model(model, train_features)
    probel_model(model, dev_features, "Dev")
    return model

def build_model_cycle_3(train_data, dev_data):
    # Cycle 3: key-phrase (bigram) features
    train_features = list(map(build_keyphrase_features, train_data))
    dev_features = list(map(build_keyphrase_features, dev_data))
    model = build_model(train_features)
    probel_model(model, train_features)
    probel_model(model, dev_features, "Dev")
    return model

# Main program
input_dataset, ylabels = get_data()
train_data, train_y, ALL_test_data, ALL_test_y = get_train_test(input_dataset, ylabels)
dev_data, dev_y, test_data, test_y = get_train_test(ALL_test_data, ALL_test_y)
print("\n Origin Data Size=", len(input_dataset))
print("\n Train Data Size=", len(train_data))
print("\n Dev Data Size=", len(dev_data))
print("\n Test Data Size=", len(test_data))
model_cycle_1 = build_model_cycle_1(train_data, dev_data)
show_features(model_cycle_1)
model_cycle_2 = build_model_cycle_2(train_data, dev_data)
show_features(model_cycle_2)
model_cycle_3 = build_model_cycle_3(train_data, dev_data)
show_features(model_cycle_3)
Origin Data Size= 2000
Train Data Size= 1600
Dev Data Size= 160
Test Data Size= 20

TrainAccuracy=0.00%
DevAccuracy=0.00%

Feature Importance
=====================
Most Informative Features
    sucks = 1          neg : pos  =  16.3 : 1.0
    lousy = 1          neg : pos  =  13.7 : 1.0
    seagal = 1         neg : pos  =  11.7 : 1.0
    outstanding = 1    pos : neg  =  11.4 : 1.0
    stupidity = 1      neg : pos  =  11.0 : 1.0
None

(the same accuracy and feature-importance block is printed again for cycles 2 and 3)
In theory, many different decision trees can be constructed for a given dataset, and some of them will be more accurate than others. There are efficient algorithms that can produce a reasonably accurate tree in a limited amount of time. One of them is Hunt's algorithm; ID3, C4.5 and CART are all derived from it. Its outline is as follows:
Given a dataset D with n records, where every record has the same set of attributes and each record is labelled with one of y1, y2 or y3, the algorithm proceeds like this:
- If all the records in D belong to the same class, say y1, then y1 becomes a leaf node of the tree, labelled y1.
- If the records in D belong to more than one class, a feature test condition is used to split the records into smaller subsets. Suppose that in the first pass we apply the feature test condition to every attribute and find one that splits the data into three smaller subsets. That attribute becomes the root node, and the test condition is then applied within each of the three subsets to find the nodes of the next level. The process is repeated recursively.
Decision trees have a number of advantages:
- They are easy to interpret.
- They need very little data preparation and data-to-feature conversion.
- They naturally support multi-class problems.
Decision trees also have their drawbacks:
- They overfit easily: high accuracy on the training set but poor performance on the test set.
- For a given dataset, a huge number of different trees can be produced.
- They are strongly affected by class imbalance: in a binary classification problem, class imbalance arises when the two classes have different numbers of instances, and the same applies to multi-class problems.
The feature test condition splits the input records into several subsets based on a criterion, or measure, called impurity. Impurity is computed from the class labels of the instances for each attribute, and the attribute that reduces it the most becomes the attribute on which the data is split, that is, the node at this level of the tree. Decision trees use the notion of entropy as their impurity measure. Entropy is defined as

    Entropy(D) = - sum_i p_i * log2(p_i)

where p_i is the fraction of records in D that belong to class i.
To find the variable best suited for splitting the data using entropy, the first step is to compute the entropy of the class labels themselves:

    H(Y) = - sum_y P(y) * log2 P(y)
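A minimal sketch of that calculation (the label arrays are hypothetical; scikit-learn performs the equivalent computation internally when criterion="entropy" is used below):

import numpy as np

def entropy(labels):
    # H = -sum(p_i * log2(p_i)) over the class proportions
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

print(entropy(np.array([0, 0, 1, 1])))   # 1.0 bit: perfectly mixed node
print(entropy(np.array([0, 0, 0, 0])))   # 0.0 bits: pure node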
from sklearn.datasets import load_iris
from sklearn.cross_validation import StratifiedShuffleSplit
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pprint

def get_data():
    data = load_iris()
    x = data['data']
    y = data['target']
    label_names = data['target_names']
    return x, y, label_names.tolist()

def get_feature_names():
    data = load_iris()
    return data['feature_names']

def get_train_test(x, y):
    input_data = np.column_stack([x, y])
    straight_split = StratifiedShuffleSplit(input_data[:, -1], train_size=0.7, n_iter=1, random_state=77)
    for train_index, test_index in straight_split:
        train_x = input_data[train_index, :-1]
        train_y = input_data[train_index, -1]
        test_x = input_data[test_index, :-1]
        test_y = input_data[test_index, -1]
    print(train_x.shape)
    print(test_x.shape)
    return train_x, train_y, test_x, test_y

def build_model(x, y):
    model = tree.DecisionTreeClassifier(criterion="entropy")
    model = model.fit(x, y)
    return model

def test_model(x, y, model, label_names):
    y_predicted = model.predict(x)
    print("Model Accuracy = %0.2f" % (accuracy_score(y, y_predicted) * 100), "%\n")
    print("\n Confusion Matrix")
    print('====================')
    pprint.pprint(confusion_matrix(y, y_predicted))
    print('\nClassification Report')
    print('========================')
    print(classification_report(y, y_predicted, target_names=label_names))

x, y, label_names = get_data()
train_x, train_y, test_x, test_y = get_train_test(x, y)
model = build_model(train_x, train_y)
test_model(test_x, test_y, model, label_names)
# Export the tree with feature names and class names so it can be rendered with Graphviz
tree.export_graphviz(model, out_file='tree.dot', feature_names=get_feature_names(), class_names=label_names)

(105, 4)
(15, 4)
Model Accuracy = 93.33 %

 Confusion Matrix
====================
array([[5, 0, 0],
       [0, 4, 1],
       [0, 0, 5]], dtype=int64)

Classification Report
========================
             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00         5
 versicolor       1.00      0.80      0.89         5
  virginica       0.83      1.00      0.91         5

avg / total       0.94      0.93      0.93        15
%%cmd
dot -Tpdf tree.dot -o tree.pdf
Microsoft Windows [Version 10.0.16299.371]
(c) 2017 Microsoft Corporation. All rights reserved.
E:\PycharmProjects\JupyterFiles\python>dot -Tpdf tree.dot -o tree.pdf
E:\PycharmProjects\JupyterFiles\python>
Regression can be thought of as function approximation: the task is to find a function that, given a set of random variables X as input, returns the response variable Y. The simple regression framework, powerful as it is, suffers from one drawback: because there is no control over the upper and lower bounds of the coefficient values used by linear regression, the regression tends to overfit the given data, and the resulting model may perform poorly on unseen data. Shrinkage methods, also known as regularization methods, were designed to address this problem.
A linear regression model is defined as follows, with the coefficients found from the training data:

    Y = w0 + w1*x1 + w2*x2 + ... + wm*xm

The quality of the fit is measured by the sum of squared errors between the true and predicted responses,

    sum_i (yi - y^i)^2

The smaller this quantity, the better the linear regression model, so the optimization problem is to minimize it, that is, to find the coefficients wi that make it as small as possible.
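For intuition, here is a minimal sketch of that minimization using NumPy's closed-form least-squares solver (the data is synthetic; the recipe below uses scikit-learn's LinearRegression instead):

import numpy as np

rng = np.random.RandomState(0)
x = rng.rand(100, 3)                                   # 100 samples, 3 predictors
true_w = np.array([1.5, -2.0, 0.5])
y = x.dot(true_w) + 3.0 + 0.01 * rng.randn(100)        # linear signal plus a little noise

X = np.column_stack([np.ones(len(x)), x])              # prepend a column of ones for the intercept
w, residuals, rank, sv = np.linalg.lstsq(X, y, rcond=None)   # minimizes the sum of squared errors
print(w)                                               # approximately [3.0, 1.5, -2.0, 0.5]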
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

def get_data():
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y

def build_model(x, y):
    model = LinearRegression(normalize=True, fit_intercept=True)
    model.fit(x, y)
    return model

def model_view(model):
    print("\n Model Coefficents")
    print("======================")
    for i, coef in enumerate(model.coef_):
        print("\tCoefficients %d %0.3f" % (i + 1, coef))
    print("\n\t Intercept %0.3f" % (model.intercept_))

def model_worth(true_y, predicted_y):
    print("\tMean squared error = %0.2f" % (mean_squared_error(true_y, predicted_y)))

def plot_residuals(y, predicted_y):
    diff = y - predicted_y
    plt.xlabel('Predicted_Y')
    plt.ylabel("Residuals")
    plt.plot(predicted_y, diff, 'go')

# Main program
x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
model=build_model(x_train,y_train)
predicted_y=model.predict(x_train)
plot_residuals(y_train,predicted_y)
model_view(model)
print("\nModel Performance On Train_set")
model_worth(y_train,predicted_y)
predicted_y=model.predict(x_dev)
print("\nModel Performance On Dev_set")
model_worth(y_dev,predicted_y)
 Model Coefficents
======================
	Coefficients 1 -0.109
	Coefficients 2 0.043
	Coefficients 3 0.053
	Coefficients 4 2.237
	Coefficients 5 -15.879
	Coefficients 6 3.883
	Coefficients 7 0.001
	Coefficients 8 -1.321
	Coefficients 9 0.284
	Coefficients 10 -0.012
	Coefficients 11 -0.904
	Coefficients 12 0.009
	Coefficients 13 -0.529

	 Intercept 33.288

Model Performance On Train_set
	Mean squared error = 23.18

Model Performance On Dev_set
	Mean squared error = 18.25
# Prepare some polynomial features
poly_features=PolynomialFeatures(2)
poly_features.fit(x_train)
x_train_poly=poly_features.transform(x_train)
x_dev_poly=poly_features.transform(x_dev)
# Build the model with the polynomial features
model_poly=build_model(x_train_poly,y_train)
predicted_y=model_poly.predict(x_train_poly)
print("\nModel Performance On Train_set(poly_features)")
model_worth(y_train,predicted_y)
predicted_y=model_poly.predict(x_dev_poly)
print("\nModel Performance On Dev_set(Poly_features)")
model_worth(y_dev,predicted_y)
Model Performance On Train_set(poly_features)
Mean squared error = 5.45
Model Performance On Dev_set(Poly_features)
Mean squared error = 13.23
# Apply the models to the test set
x_test_poly=poly_features.transform(x_test)
predicted_y=model_poly.predict(x_test_poly)
print("\nModel Performance On Test_set(poly_features)")
model_worth(y_test,predicted_y)
predicted_y=model.predict(x_test)
print("\nModel Performance On Test_set(Regular_features)")
model_worth(y_test,predicted_y)
Model Performance On Test_set(poly_features)
Mean squared error = 15.17
Model Performance On Test_set(Regular_features)
Mean squared error = 21.66
When a linear regression model is trained, some coefficients can take on very large values, which makes the model unstable. Regularization, or shrinkage, is a way of controlling the coefficient weights so that they do not take on excessively large values. When the dataset contains many correlated predictors, even a tiny change in the data can make the model unstable. There is also the problem of interpreting the model: suppose, for example, that two variables are negatively correlated; they should then affect the response variable in opposite directions. We could inspect the correlated variables by hand, drop one of them and rebuild the model, but it would be far more convenient if this could be done automatically.
We modify the cost function of linear regression so that the coefficients are included in it. As you know, the lower the value of the cost function, the better the model; by bringing the coefficients into the cost function we can heavily penalize coefficients whose weights become too large. These methods are called shrinkage methods because they shrink the coefficient values.
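A minimal sketch of that modified cost, assuming the ridge (L2) form of the penalty used in this recipe (the coefficient and alpha values here are made up):

import numpy as np

def ridge_cost(y_true, y_pred, weights, alpha):
    # ordinary least-squares term plus an L2 penalty on the coefficient weights
    squared_error = np.sum((y_true - y_pred) ** 2)
    penalty = alpha * np.sum(weights ** 2)
    return squared_error + penalty

w = np.array([0.5, -15.9, 3.9])                                   # hypothetical coefficients
print(ridge_cost(np.array([3.0, 1.0]), np.array([2.5, 1.5]), w, alpha=0.015))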
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

def get_data():
    data = load_boston()
    x = data['data']
    y = data['target']
    x = x - np.mean(x, axis=0)
    return x, y

def build_model(x, y):
    model = Ridge(normalize=True, alpha=0.015)
    model.fit(x, y)
    return model

def model_view(model):
    print("\n Model Coefficents")
    print("======================")
    for i, coef in enumerate(model.coef_):
        print("\tCoefficients %d %0.3f" % (i + 1, coef))
    print("\n\t Intercept %0.3f" % (model.intercept_))

def model_worth(true_y, predicted_y):
    print("\tMean squared error = %0.2f" % (mean_squared_error(true_y, predicted_y)))
    return mean_squared_error(true_y, predicted_y)
x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
# Prepare some polynomial features
poly_features=PolynomialFeatures(interaction_only=True)
poly_features.fit(x_train)
x_train_poly=poly_features.transform(x_train)
x_dev_poly=poly_features.transform(x_dev)
x_test_poly=poly_features.transform(x_test)
choosen_model=build_model(x_train_poly,y_train)
# Apply the model to the training set
predicted_y=choosen_model.predict(x_train_poly)
print("\nModel Performance in Training set(Poly_features)")
mse=model_worth(y_train,predicted_y)
model_view(choosen_model)
# Apply the model to the Dev set
predicted_y=choosen_model.predict(x_dev_poly)
print("\nModel Performance in Dev set(Poly_features)")
mse=model_worth(y_dev,predicted_y)
# Apply the model to the test set
predicted_y=choosen_model.predict(x_test_poly)
print("\nModel Performance in Test set(Poly_features)")
mse=model_worth(y_test,predicted_y)

Model Performance in Training set(Poly_features)
	Mean squared error = 6.85

 Model Coefficents
======================
	Coefficients 1 0.000
	Coefficients 2 -0.016
	Coefficients 3 -0.003
	...
	(92 coefficients in total)
	...
	Coefficients 91 0.031
	Coefficients 92 -0.001

	 Intercept 21.027

Model Performance in Dev set(Poly_features)
	Mean squared error = 11.54

Model Performance in Test set(Poly_features)
	Mean squared error = 9.46
The Least Absolute Shrinkage and Selection Operator (LASSO) is another shrinkage method commonly used in regression problems. Compared with ridge regression, it tends to produce sparse solutions: a solution is called sparse when most of its coefficients are shrunk to zero. With LASSO most coefficients become zero, and for correlated variables only one of them is kept, unlike ridge regression, which assigns similar weights to the coefficients of correlated variables. This property of LASSO can be used for variable selection.
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt

def get_data():
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y

def build_model(x, y):
    alpha_range = np.linspace(0, 0.5, 200)
    model = Lasso(normalize=True)
    coeffiecients = []
    for alpha in alpha_range:
        model.set_params(alpha=alpha)
        model.fit(x, y)
        coeffiecients.append(model.coef_)
    coeff_path(alpha_range, coeffiecients)

def model_view(model):
    print("\n Model Coefficents")
    print("======================")
    for i, coef in enumerate(model.coef_):
        print("\tCoefficients %d %0.3f" % (i + 1, coef))
    print("\n\t Intercept %0.3f" % (model.intercept_))

def model_worth(true_y, predicted_y):
    print("\tMean squared error = %0.2f" % (mean_squared_error(true_y, predicted_y)))
    return mean_squared_error(true_y, predicted_y)

def coeff_path(alpha_range, coeffiecients):
    plt.figure(figsize=(16, 9))
    plt.xlabel("alpha value")
    plt.ylabel("coeffiecient weight")
    plt.plot(alpha_range, coeffiecients)
    plt.axis('tight')

def get_coeff(x, y, alpha):
    model = Lasso(normalize=True, alpha=alpha)
    model.fit(x, y)
    coefs = model.coef_
    indices = [i for i, coef in enumerate(coefs) if abs(coef) > 0.0]
    return indices

x, y = get_data()
build_model(x, y)
print("\nPredicting using all the variables")
full_model = LinearRegression(normalize=True)
full_model.fit(x, y)
predicted_y = full_model.predict(x)
model_worth(y, predicted_y)
print("\nModels at different alpha values")
alpha_values = [0.22, 0.08, 0.01]
for alpha in alpha_values:
    indices = get_coeff(x, y, alpha)
    print("\t Attributes include", indices)
    x_new = x[:, indices]
    model = LinearRegression(normalize=True)
    model.fit(x_new, y)
    predicted_y = model.predict(x_new)
    model_worth(y, predicted_y)

UserWarning: With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator
UserWarning: Coordinate descent with no regularization may lead to unexpected results and is discouraged.
ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.

Predicting using all the variables
	Mean squared error = 21.90

Models at different alpha values
	 Attributes include [5, 12]
	Mean squared error = 30.51
	 Attributes include [5, 10, 12]
	Mean squared error = 27.13
	 Attributes include [0, 1, 3, 4, 5, 7, 10, 11, 12]
	Mean squared error = 22.89
Cross-validation is usually referred to as k-fold cross-validation: the training set is split into k folds, the model is trained on k-1 of them and tested on the remaining one, so there is no need to carve out a separate Dev set.
from sklearn.datasets import load_iris
from sklearn.cross_validation import KFold, StratifiedKFold

def get_data():
    data = load_iris()
    x = data['data']
    y = data['target']
    return x, y

def class_distribution(y):
    class_dist = {}
    total = 0
    for entry in y:
        try:
            class_dist[entry] += 1
        except KeyError:
            class_dist[entry] = 1
        total += 1
    for k, v in class_dist.items():
        print('\tclass %d percentage = %0.2f' % (k, v / (1.0 * total)))

# First way of splitting: plain k-fold
x, y = get_data()
kfolds = KFold(n=y.shape[0], n_folds=3)
fold_count = 1
for train, test in kfolds:
    print("Fold %d x train shape" % (fold_count), x[train].shape, 'x test shape', x[test].shape)
    y_train = y[train]
    y_test = y[test]
    print("Train Class Distribution")
    class_distribution(y_train)
    print("Test Class Distribution")
    class_distribution(y_test)
    fold_count += 1

# Second way of splitting: stratified k-fold
skfolds = StratifiedKFold(y, n_folds=3)
fold_count = 1
for train, test in skfolds:
    print("Fold %d x train shape" % (fold_count), x[train].shape, 'x test shape', x[test].shape)
    y_train = y[train]
    y_test = y[test]
    print("Train Class Distribution")
    class_distribution(y_train)
    print("Test Class Distribution")
    class_distribution(y_test)
    fold_count += 1

Fold 1 x train shape (100, 4) x test shape (50, 4)
Train Class Distribution
	class 1 percentage = 0.50
	class 2 percentage = 0.50
Test Class Distribution
	class 0 percentage = 1.00
Fold 2 x train shape (100, 4) x test shape (50, 4)
Train Class Distribution
	class 0 percentage = 0.50
	class 2 percentage = 0.50
Test Class Distribution
	class 1 percentage = 1.00
Fold 3 x train shape (100, 4) x test shape (50, 4)
Train Class Distribution
	class 0 percentage = 0.50
	class 1 percentage = 0.50
Test Class Distribution
	class 2 percentage = 1.00
Fold 1 x train shape (99, 4) x test shape (51, 4)
Train Class Distribution
	class 0 percentage = 0.33
	class 1 percentage = 0.33
	class 2 percentage = 0.33
Test Class Distribution
	class 0 percentage = 0.33
	class 1 percentage = 0.33
	class 2 percentage = 0.33
Fold 2 x train shape (99, 4) x test shape (51, 4)
Train Class Distribution
	class 0 percentage = 0.33
	class 1 percentage = 0.33
	class 2 percentage = 0.33
Test Class Distribution
	class 0 percentage = 0.33
	class 1 percentage = 0.33
	class 2 percentage = 0.33
Fold 3 x train shape (102, 4) x test shape (48, 4)
Train Class Distribution
	class 0 percentage = 0.33
	class 1 percentage = 0.33
	class 2 percentage = 0.33
Test Class Distribution
	class 0 percentage = 0.33
	class 1 percentage = 0.33
	class 2 percentage = 0.33
from sklearn.datasets import load_iris
from sklearn.cross_validation import KFold, train_test_split
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

def get_data():
    data = load_iris()
    x = data['data']
    y = data['target']
    return x, y

def build_model(x, y):
    kfold = KFold(y.shape[0], 5)
    model = Ridge(normalize=True)
    alpha_range = np.linspace(0.0015, 0.0017, 30)
    grid_param = {'alpha': alpha_range}
    grid = GridSearchCV(estimator=model, param_grid=grid_param, cv=kfold, scoring='mean_squared_error')
    grid.fit(x, y)
    display_param_results(grid.grid_scores_)
    print(grid.best_params_)
    return grid.best_estimator_

def view_model(model):
    print("\n Model Coefficents")
    print("======================")
    for i, coef in enumerate(model.coef_):
        print("\tCoefficients %d %0.3f" % (i + 1, coef))
    print("\n\t Intercept %0.3f" % (model.intercept_))

def model_worth(true_y, predicted_y):
    print("\tMean squared error = %0.2f" % (mean_squared_error(true_y, predicted_y)))
    return mean_squared_error(true_y, predicted_y)

def display_param_results(param_results):
    fold = 1
    for param_result in param_results:
        print("Fold %d mean squared error %0.2f" % (fold, abs(param_result[1])), param_result[0])
        fold += 1

x,y=get_data()
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=9)
# Prepare some polynomial features
poly_features=PolynomialFeatures(interaction_only=True)
poly_features.fit(x_train)
x_train_poly=poly_features.transform(x_train)
x_test_poly=poly_features.transform(x_test)
choosen_model=build_model(x_train_poly,y_train)
predicted_y=choosen_model.predict(x_train_poly)
model_worth(y_train,predicted_y)
view_model(choosen_model)
predicted_y=choosen_model.predict(x_test_poly)
model_worth(y_test,predicted_y)

Fold 1 mean squared error 0.05 {'alpha': 0.0015}
Fold 2 mean squared error 0.05 {'alpha': 0.001506896551724138}
Fold 3 mean squared error 0.05 {'alpha': 0.0015137931034482758}
...
(one line per alpha value, 30 in total, all with mean squared error 0.05)
...
Fold 30 mean squared error 0.05 {'alpha': 0.0017}
{'alpha': 0.0015}
	Mean squared error = 0.04

 Model Coefficents
======================
	Coefficients 1 0.000
	Coefficients 2 -0.452
	Coefficients 3 -0.432
	Coefficients 4 0.324
	Coefficients 5 0.547
	Coefficients 6 0.109
	Coefficients 7 0.002
	Coefficients 8 -0.021
	Coefficients 9 -0.063
	Coefficients 10 -0.148
	Coefficients 11 0.142

	 Intercept 1.663
	Mean squared error = 0.04
0.03618990322467848
GridSearchCV is a convenience function provided by sklearn that helps us train a model over a range of parameter values. Let us look at the parameters of GridSearchCV (a short usage sketch follows the list):
- estimator: the type of model to run with the given parameters and data
- param_grid: a dictionary of parameters used to evaluate the model
- cv: the type of cross-validation of interest; here we pass the k-fold iterator created earlier as the cv parameter
- scoring: the metric used to evaluate the model; in this case we use mean squared error
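A minimal sketch tying those parameters together (it mirrors the recipe above and reuses x_train_poly and y_train from that code; with the old sklearn API used throughout this post, the best score for 'mean_squared_error' is reported as a negative number):

from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import Ridge
import numpy as np

kfold = KFold(y_train.shape[0], 5)                                    # cv: the k-fold iterator
grid = GridSearchCV(estimator=Ridge(normalize=True),                  # estimator: the model to tune
                    param_grid={'alpha': np.linspace(0.001, 0.1, 10)},  # param_grid: values to try
                    cv=kfold,
                    scoring='mean_squared_error')                     # scoring: evaluation metric
grid.fit(x_train_poly, y_train)
print(grid.best_params_, grid.best_score_)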
In real life, when we face uncertainty and still have to make a difficult decision, we usually ask several friends for their opinions and then decide based on their collective wisdom. Ensemble methods in machine learning follow the same idea.
The basic idea of ensembling is to have a large number of models, each producing slightly different results on the training set; some of them will do better than others on certain parts of the data. The expectation is that the combined output of many models will be better than the output obtained from any single one.
Bootstrap aggregation, usually called bagging, is a simple and elegant technique: it builds a large number of models and combines their outputs to obtain the final prediction. Each model in a bagging ensemble uses only part of the training set; the idea is to reduce overfitting to the data. As stated earlier, the individual models should not differ too much from each other, and sampling with replacement for each model introduces exactly this kind of variation. Another option is to sample the attributes: instead of every model using all attributes, different models use different subsets of attributes. Bagging is easy to parallelize: when a parallel processing framework is available, the models can be trained on different training samples in parallel. Bagging is not effective for linear predictors such as linear regression.
Boosting is also an ensemble technique; it produces a sequence of increasingly complex models. The models are trained in order, each new one trained on the errors of the previous model, and each trained model is assigned a weight that depends on how well it performs on the given data. When the final prediction is produced, these weights determine how much influence each individual model has on the final output.
Bagging, that is, bootstrap aggregation, only works when the underlying models can produce variations; as long as we can introduce variation into the underlying data, it yields a set of models with small differences between them. We use the bootstrap to introduce that variation into the dataset: a bootstrap sample is a random sample of a certain number of instances drawn from the given dataset, with or without replacement. In bagging we use the bootstrap to generate m different datasets and build one model on each of them; for regression problems, the outputs of all the models are combined into the final prediction. A small bootstrap sketch follows the next paragraph.
Randomization is another technique for introducing variation into the modelling process. One example is to randomly choose a subset of the attributes for each model in the ensemble, so that different models use different attribute sets; this is known as the random subspaces method. Bagging shows little benefit for very stable models; it suits classifiers that are highly sensitive to even small changes in the data, such as decision trees.
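To make the bootstrap idea concrete, here is a minimal sketch of drawing bootstrap samples with NumPy (the names and sizes are made up; BaggingClassifier below does this internally):

import numpy as np

rng = np.random.RandomState(9)
n_samples = 10
data = np.arange(n_samples)                 # stand-in for a dataset of 10 instances

for m in range(3):                          # three bootstrap datasets -> three models
    idx = rng.choice(n_samples, size=n_samples, replace=True)   # sample with replacement
    print("bootstrap sample %d:" % (m + 1), data[idx])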
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

def get_data():
    x, y = make_classification(n_samples=500, n_features=30, flip_y=0.03, n_informative=18,
                               n_redundant=3, n_repeated=3, random_state=7)
    return x, y

def build_single_model(x, y):
    model = KNeighborsClassifier()
    model.fit(x, y)
    return model

def bagging_model(x, y):
    bagging = BaggingClassifier(KNeighborsClassifier(), n_estimators=100, random_state=9,
                                max_samples=1.0, max_features=0.7,
                                bootstrap=True, bootstrap_features=True)
    bagging.fit(x, y)
    return bagging

def view_model(model):
    print("\n Sampled attributes in top 10 estimators \n")
    for i, features in enumerate(model.estimators_features_[0:10]):
        print("estimator %d" % (i + 1), features)

x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
# Build a single model
model=build_single_model(x_train,y_train)
predicted_y=model.predict(x_train)
print("\n Single Model Accuracy on training data\n")
print(classification_report(y_train,predicted_y))
# Build the bagging ensemble
bagging=bagging_model(x_train,y_train)
predicted_y=bagging.predict(x_train)
print("\n Bagging Model Accuracy on training data\n")
print(classification_report(y_train,predicted_y))
view_model(bagging)
# See how they do on the Dev set
predicted_y=model.predict(x_dev)
print("\n Single Model Accuracy on Dev data\n")
print(classification_report(y_dev,predicted_y))
predicted_y=bagging.predict(x_dev)
print("\n Bagging Model Accuracy on Dev data\n")
print(classification_report(y_dev,predicted_y))

 Single Model Accuracy on training data

             precision    recall  f1-score   support

          0       0.88      0.87      0.88       181
          1       0.87      0.88      0.87       169

avg / total       0.87      0.87      0.87       350

 Bagging Model Accuracy on training data

             precision    recall  f1-score   support

          0       0.93      0.97      0.95       181
          1       0.96      0.92      0.94       169

avg / total       0.95      0.95      0.95       350

 Sampled attributes in top 10 estimators

estimator 1 [25 20 10  6 17 18 11 17  9 14  3 10 10 23 22 18 17 11 21 20  1]
estimator 2 [14  3 27 28 20 20 27 25  0 21  1 12 20 21 29  1  0 28 16  4  9]
estimator 3 [29  5 23 19  2 16 21  4 13 27  1 15 24  5 14  1  4 25 22 26 29]
estimator 4 [23 10 16  7 22 11  0 14 14 17  8 17 27 12 13 23  8  7 27  0 27]
estimator 5 [ 3  0 26 13 23  7 27 15 18 11 26 18 26  3 22  6 11 21  6 12 19]
estimator 6 [16  5 24 19 21  2  2 22 12 21 14 28  5 29  9 19 24 14 21  8 11]
estimator 7 [ 7 23  2 17 22  2 12 14 25  5  7 10 25  5 17 16  9  0  9  9 15]
estimator 8 [16 10  7  8  8 18  6  3 12 29 13 17 20  9  2 25  6 28 15  0 16]
estimator 9 [22 29  2  5  6 11 18  4 19 27 17 28 20 15 21 26 14  5 28 15 21]
estimator 10 [29 22 17 10 16 10 27  8  2 18 26  1  3  2  1 17  2 12 10 22 26]

 Single Model Accuracy on Dev data

             precision    recall  f1-score   support

          0       0.83      0.84      0.83        51
          1       0.85      0.83      0.84        54

avg / total       0.84      0.84      0.84       105

 Bagging Model Accuracy on Dev data

             precision    recall  f1-score   support

          0       0.83      0.88      0.86        51
          1       0.88      0.83      0.86        54

avg / total       0.86      0.86      0.86       105
Boosting is a powerful ensemble technique that is widely used in data science; in fact it is one of the most essential tools in the data science toolkit. Like bagging, boosting uses a group of estimators, but that is where the similarity ends. Before diving deeper, let us look at how boosting works as an effective ensemble tool.
Take our familiar binary classification problem: the input is a set of predictors (X) and the output is a response variable (y) that can only take the value 0 or 1. The input to the classifier can be written as X = {x1, x2, ..., xn}.
The difference between boosting and bagging lies in the weights α and in the sequential modelling. As mentioned before, boosting builds a sequence of weak classifiers and gives each of them a slightly adjusted copy of the dataset. Let us see how the data is adjusted; it is exactly these adjustments that give rise to the weights α.
Starting with the first classifier, m=1, each instance is given an initial weight of 1/N; that is, with 100 records each record receives a weight of 0.01. Denoting the weight by w, we now have 100 such weight values.
Suppose model 1 has an error rate of 0.3, that is, it classifies 70% of the records correctly. The weight of this model will then be roughly 0.8, which is a good weight. Based on this result we go back and reset the weights of the individual records: the weights of the misclassified records are increased, while the correctly classified records keep theirs.
As you can see, the weights of the misclassified records go up, which raises the chance that these records are picked by the next classifier. Subsequent classifiers in the sequence select the instances with larger weights and try to fit them. In this way each following classifier pays more attention to the instances that the previous classifier misclassified.
This is the power of boosting: it turns several weak classifiers into one strong ensemble.
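As a rough illustration of the numbers above (a sketch, assuming the discrete AdaBoost/SAMME weighting used by the recipe below, where the model weight is α = ln((1 - err)/err)):

import numpy as np

N = 100
w = np.full(N, 1.0 / N)               # initial weights: 0.01 each

err = 0.3                              # assumed error rate of model 1
alpha = np.log((1 - err) / err)        # about 0.85, i.e. roughly the 0.8 mentioned above
print(alpha)

misclassified = np.zeros(N, dtype=bool)
misclassified[:30] = True              # pretend the first 30 records were misclassified
w[misclassified] *= np.exp(alpha)      # raise the weights of the misclassified records
w /= w.sum()                           # renormalize so the weights sum to 1
print(w[:2], w[-2:])                   # misclassified records now carry more weight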
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, zero_one_loss
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import itertools

def get_data():
    x, y = make_classification(n_samples=500, n_features=30, flip_y=0.03, n_informative=18,
                               n_redundant=3, n_repeated=3, random_state=7)
    return x, y

def build_single_model(x, y):
    model = DecisionTreeClassifier()
    model.fit(x, y)
    return model

def build_boosting_model(x, y, n_estimators=20):
    boosting = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, min_samples_leaf=1),
                                  random_state=9, n_estimators=n_estimators, algorithm="SAMME")
    boosting.fit(x, y)
    return boosting

def view_model(model):
    print("\n Estimator Weights and Error\n")
    for i, weight in enumerate(model.estimator_weights_):
        print("estimator %d weight =%0.4f error= %0.4f" % (i + 1, weight, model.estimator_errors_[i]))
    plt.plot(model.estimator_weights_, model.estimator_errors_)

def number_estimators_vs_err_rate(x, y, x_dev, y_dev):
    no_estimators = range(20, 120, 10)
    misclassy_rate = []
    misclassy_rate_dev = []
    for no_estimator in no_estimators:
        boosting = build_boosting_model(x, y, no_estimator)
        predicted_y = boosting.predict(x)
        predicted_y_dev = boosting.predict(x_dev)
        misclassy_rate.append(zero_one_loss(y, predicted_y))
        misclassy_rate_dev.append(zero_one_loss(y_dev, predicted_y_dev))
    # no_estimators = np.asarray(no_estimators)
    # misclassy_rate = np.asarray(misclassy_rate)
    # misclassy_rate_dev = np.asarray(misclassy_rate_dev)
    # print(no_estimators, misclassy_rate)
    plt.plot(no_estimators, misclassy_rate, label='Train', color='g')
    plt.plot(no_estimators, misclassy_rate_dev, label="Dev", color='r')

x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
# Build a single model
model=build_single_model(x_train,y_train)
predicted_y=model.predict(x_train)
print("\n Single Model Accuracy on training data\n")
print(classification_report(y_train,predicted_y))
print("Fraction of misclassfication = %0.2f" %(zero_one_loss(y_train,predicted_y)*100,),'%')
# Build the boosting ensemble
boosting=build_boosting_model(x_train,y_train)
predicted_y=boosting.predict(x_train)
print("\n Boosting Model Accuracy on training data\n")
print(classification_report(y_train,predicted_y))
print("Fraction of misclassification = %0.2f" %(zero_one_loss(y_train,predicted_y)*100),"%")
view_model(boosting)
# See how they do on the Dev set
predicted_y=model.predict(x_dev)
print("\n Single Model Accuracy on Dev data\n")
print(classification_report(y_dev,predicted_y))
print("Fraction of misclassification = %0.2f" %(zero_one_loss(y_dev,predicted_y)*100),"%")
predicted_y=boosting.predict(x_dev)
print("\n Boosting Model Accuracy on Dev data\n")
print(classification_report(y_dev,predicted_y))
print("Fraction of misclassification = %0.2f" %(zero_one_loss(y_dev,predicted_y)*100),"%")
number_estimators_vs_err_rate(x_train,y_train,x_dev,y_dev)

 Single Model Accuracy on training data

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       181
          1       1.00      1.00      1.00       169

avg / total       1.00      1.00      1.00       350

Fraction of misclassfication = 0.00 %

 Boosting Model Accuracy on training data

             precision    recall  f1-score   support

          0       0.86      0.94      0.90       181
          1       0.93      0.84      0.88       169

avg / total       0.89      0.89      0.89       350

Fraction of misclassification = 10.86 %

 Estimator Weights and Error

estimator 1 weight =0.8337 error= 0.3029
estimator 2 weight =0.8921 error= 0.2907
estimator 3 weight =0.6730 error= 0.3378
estimator 4 weight =0.6067 error= 0.3528
estimator 5 weight =0.5746 error= 0.3602
estimator 6 weight =0.5537 error= 0.3650
estimator 7 weight =0.5697 error= 0.3613
estimator 8 weight =0.5538 error= 0.3650
estimator 9 weight =0.5579 error= 0.3640
estimator 10 weight =0.4530 error= 0.3886
estimator 11 weight =0.4530 error= 0.3886
estimator 12 weight =0.3564 error= 0.4118
estimator 13 weight =0.4130 error= 0.3982
estimator 14 weight =0.3679 error= 0.4091
estimator 15 weight =0.3142 error= 0.4221
estimator 16 weight =0.3888 error= 0.4040
estimator 17 weight =0.4902 error= 0.3799
estimator 18 weight =0.2798 error= 0.4305
estimator 19 weight =0.4463 error= 0.3902
estimator 20 weight =0.2645 error= 0.4343

 Single Model Accuracy on Dev data

             precision    recall  f1-score   support

          0       0.62      0.75      0.68        51
          1       0.70      0.57      0.63        54

avg / total       0.66      0.66      0.65       105

Fraction of misclassification = 34.29 %

 Boosting Model Accuracy on Dev data

             precision    recall  f1-score   support

          0       0.71      0.86      0.78        51
          1       0.84      0.67      0.74        54

avg / total       0.78      0.76      0.76       105

Fraction of misclassification = 23.81 %
Gradient boosting uses gradients, rather than weights, to identify the shortcomings of the previous models. Take a simple regression problem where we are given the predictor variables X and the response variable Y, both real-valued.
We start with a very simple model, such as the mean:

    F1(X) = mean(Y)

The next model is then trained on the residuals of the first one, that is, on the data

    (X, Y - F1(X))

Let us look at why we train on the residuals. So far we know that boosting builds its models additively. Suppose we have built two models, F1(X) and F2(X), to predict Y; following the additive principle, the two models can be combined as

    F(X) = F1(X) + F2(X)
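A minimal hand-rolled sketch of those two stages (the data is synthetic; GradientBoostingRegressor below automates and generalizes this idea):

import numpy as np
from sklearn.tree import DecisionTreeRegressor

rng = np.random.RandomState(0)
X = rng.rand(200, 1) * 10
Y = np.sin(X).ravel() + 0.1 * rng.randn(200)

F1 = Y.mean()                                # stage 1: the simplest model, the mean of Y
residuals = Y - F1                           # what the first model fails to explain
stage2 = DecisionTreeRegressor(max_depth=3).fit(X, residuals)   # stage 2: fit the residuals

prediction = F1 + stage2.predict(X)          # additive combination F(X) = F1(X) + F2(X)
print(np.mean((Y - F1) ** 2), np.mean((Y - prediction) ** 2))   # the error drops after stage 2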
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt

def get_data():
    data = load_boston()
    x = data['data']
    y = data['target']
    return x, y

def build_model(x, y, n_estimators=500):
    """
    Build a Gradient Boost regression model
    """
    model = GradientBoostingRegressor(n_estimators=n_estimators, verbose=10,
                                      subsample=0.7, learning_rate=0.15, max_depth=3, random_state=77)
    model.fit(x, y)
    return model

def view_model(model):
    print("\n Training Scores")
    print("===================")
    for i, score in enumerate(model.train_score_):
        print("\t Estimator %d score %0.3f" % (i + 1, score))
    plt.plot(range(1, model.estimators_.shape[0] + 1), model.train_score_)
    plt.xlabel("model sequence")
    plt.ylabel("Model Score")
    print("\n Feature Importance ")
    print("=======================")
    for i, score in enumerate(model.feature_importances_):
        print("\t Feature %d score %0.3f" % (i + 1, score))

def model_worth(true_y, predicted_y):
    print("\tMean squared error = %0.2f" % (mean_squared_error(true_y, predicted_y)))
    return mean_squared_error(true_y, predicted_y)
x,y=get_data()
x_train,x_test_all,y_train,y_test_all=train_test_split(x,y,test_size=0.3,random_state=9)
x_dev,x_test,y_dev,y_test=train_test_split(x_test_all,y_test_all,test_size=0.3,random_state=9)
# Prepare some polynomial features
poly_features=PolynomialFeatures(2,interaction_only=True)
poly_features.fit(x_train)
x_train_poly=poly_features.transform(x_train)
x_dev_poly=poly_features.transform(x_dev)
# Build the model with the polynomial features
model_poly=build_model(x_train_poly,y_train)
predicted_y=model_poly.predict(x_train_poly)
print("\n Model Performance in Training set(Polynomial features)\n")
model_worth(y_train,predicted_y)
# Inspect the model details
view_model(model_poly)
# Apply the model to the Dev set
predicted_y=model_poly.predict(x_dev_poly)
print("\n Model Performance in Dev set(Polynomial features)\n")
model_worth(y_dev,predicted_y)
# Apply the model to the test set
x_test_poly=poly_features.transform(x_test)
predicted_y=model_poly.predict(x_test_poly)
print("\n Model Performance in Test set(Polynomial features)\n")
model_worth(y_test,predicted_y)

      Iter       Train Loss      OOB Improve   Remaining Time
         1          58.5196          20.8748            1.50s
         2          45.2833          10.3732            1.37s
         3          40.1522           8.8467            1.33s
       ...   (verbose output continues for all 500 boosting iterations)
       500           0.0005          -0.0000            0.00s

 Model Performance in Training set(Polynomial features)

	Mean squared error = 0.00

 Training Scores
===================
	 Estimator 1 score 58.520
	 Estimator 2 score 45.283
	 Estimator 3 score 40.152
	 ...