赞
踩
这是 Kaggle 上 Getting Started 类别的 Prediction Competition,也是比较入门、简单的新人赛,我的最好成绩好像进入过 top 8%。这里重新回顾、巩固一下这个比赛,我将分成三个部分:
赛题地址:Titanic: Machine Learning from Disaster
1912年4月15日,在她的处女航中,被普遍认为“永不沉没”的RMS泰坦尼克号与冰山相撞后沉没。
不幸的是,船上没有足够的救生艇供所有人使用,导致2224名乘客和机组人员中的1502人死亡。虽然幸存有一些运气,但似乎有些人比其他人更有可能生存。
在这一挑战中,我们要求您使用乘客数据(即姓名、年龄、性别、社会经济阶层等)建立一个预测模型,来回答以下问题:“什么样的人更有可能生存?”
任务分析:这是一个分类任务,建立模型预测幸存者
# Split the combined frame back into labelled training rows and unlabelled
# test rows using the 'train' flag column.
train_data = data_all[data_all['train'] == 1]
test_x = data_all[data_all['train'] == 0]

# Labels
y = train_data['Survived'].values

# Drop the flag and label columns. A non-inplace drop returns a new frame
# instead of mutating a filtered selection of data_all in place.
train_data = train_data.drop(['train', 'Survived'], axis=1)
test_x = test_x.drop(['train', 'Survived'], axis=1)
np.save('./result/label', y)

from sklearn.model_selection import train_test_split

# The rest of the script refers to the training matrix as both `features`
# and `feature`; bind both names to the same array so neither is undefined.
features = train_data.values
feature = features
X_train, X_test, Y_train, Y_test = train_test_split(
    features, y, test_size=0.2, random_state=seed
)
def Titanicmodel(clf, features, test_data, y, model_name):
    """Fit ``clf`` with 10-fold stratified CV and save its predictions.

    For every fold the estimator is refit on the training part, the held-out
    part receives its out-of-fold prediction, and the prediction on
    ``test_data`` is accumulated as an average over the folds.

    Args:
        clf: Estimator exposing ``fit`` plus ``predict_proba`` (or
            ``decision_function`` when ``model_name == 'LinearSVC'``).
        features: Training matrix, indexed by integer row positions.
        test_data: Test matrix passed directly to the estimator.
        y: Training labels, indexed the same way as ``features``.
        model_name: Used for the output file names; the value
            ``'LinearSVC'`` selects the decision-function code path.

    Returns:
        ``(clf, score)``: the same estimator object (left fitted on the last
        fold's training part) and the accuracy of the out-of-fold
        predictions against ``y``.

    Side effects:
        Writes ``./result/<model_name>train.npy``,
        ``./result/<model_name>test.npy`` and ``<model_name>_submit.csv``,
        and prints the accuracy. Uses the module-level ``seed``.
    """
    # LinearSVC is scored through decision_function (one column of margins);
    # every other model contributes a two-column predict_proba output.
    uses_margin = model_name == 'LinearSVC'
    num_classes = 1 if uses_margin else 2
    num_fold = 10

    skf = StratifiedKFold(n_splits=num_fold, shuffle=True, random_state=seed)

    # Out-of-fold predictions for the training rows and the fold-averaged
    # predictions for the test rows.
    train_pred = np.zeros((features.shape[0], num_classes))
    test_pred = np.zeros((test_data.shape[0], num_classes))

    # Iterate over the stratified folds directly. Re-slicing a concatenated
    # index list by a fixed fold length does not line up with the folds
    # StratifiedKFold produced, and the previous last-fold boundary was read
    # from a module-level variable instead of the `features` argument.
    folds = skf.split(np.ones(features.shape[0]), y)
    for train_idx, valid_idx in tqdm(folds, total=num_fold):
        clf.fit(features[train_idx], y[train_idx])

        if uses_margin:
            valid_out = clf.decision_function(features[valid_idx])
            train_pred[valid_idx] = valid_out.reshape(len(valid_out), 1)
            test_out = clf.decision_function(test_data)
            test_pred += test_out.reshape(len(test_out), 1) / num_fold
        else:
            train_pred[valid_idx] = clf.predict_proba(features[valid_idx])
            test_pred += clf.predict_proba(test_data) / num_fold

    if uses_margin:
        # A positive margin means class 1.
        y_pred = (train_pred > 0).astype(np.int32).reshape(len(train_pred))
        pre = (test_pred > 0).astype(np.int32).reshape(len(test_pred))
    else:
        y_pred = np.argmax(train_pred, axis=1)
        pre = np.argmax(test_pred, axis=1)

    score = accuracy_score(y, y_pred)
    print('accuracy_score:', score)

    # Persist the raw (un-thresholded) predictions of this model.
    np.save('./result/{0}train'.format(model_name), train_pred)
    np.save('./result/{0}test'.format(model_name), test_pred)

    # The PassengerId range is hard-coded to 892..1309 (418 ids), so
    # `test_data` must have exactly 418 rows.
    submit = pd.DataFrame({
        'PassengerId': np.array(range(892, 1310)),
        'Survived': pre.astype(np.int32),
    })
    submit.to_csv('{0}_submit.csv'.format(model_name), index=False)
    return clf, score
# Grid-search the logistic-regression pipeline (PCA keeping 95% of the
# variance, then a liblinear LogisticRegression) over penalty and C.
pipe = Pipeline([
    ('select', PCA(n_components=0.95)),
    ('classify', LogisticRegression(random_state=seed, solver='liblinear')),
])
param = {
    'classify__penalty': ['l1', 'l2'],
    'classify__C': [0.001, 0.01, 0.1, 1, 5, 7, 8, 9, 10],
}
LR_grid = GridSearchCV(estimator=pipe, param_grid=param, scoring='roc_auc', cv=5)
# Use `feature`, the training matrix also passed to Titanicmodel below;
# `features` is not bound by the statement that builds the matrix.
LR_grid.fit(feature, y)
print(LR_grid.best_params_, LR_grid.best_score_)

# Rebuild the pipeline from the best hyper-parameters and run it through the
# 10-fold out-of-fold routine.
C = LR_grid.best_params_['classify__C']
penalty = LR_grid.best_params_['classify__penalty']
LR_classify = LogisticRegression(C=C, penalty=penalty, random_state=seed, solver='liblinear')
LR_select = PCA(n_components=0.95)
LR_pipeline = make_pipeline(LR_select, LR_classify)
lr_model, lr_score = Titanicmodel(LR_pipeline, feature, test_data, y, 'LR')
# Grid-search a SelectKBest + LinearSVC pipeline over k, penalty and C.
# dual=False is set explicitly: with the default squared-hinge loss the 'l1'
# penalty is only supported by the primal formulation, so leaving the dual
# solver on makes every 'l1' candidate fail to fit.
pipe = Pipeline([
    ('select', SelectKBest(k=20)),
    ('classify', LinearSVC(random_state=seed, dual=False)),
])
param = {
    'select__k': list(range(20, 40, 2)),
    'classify__penalty': ['l1', 'l2'],
    'classify__C': [0.001, 0.01, 0.1, 1, 5, 7, 8, 9, 10, 50, 100],
}
SVC_grid = GridSearchCV(estimator=pipe, param_grid=param, cv=5, scoring='roc_auc')
# Use `feature`, the training matrix also passed to Titanicmodel below;
# `features` is not bound by the statement that builds the matrix.
SVC_grid.fit(feature, y)
print(SVC_grid.best_params_, SVC_grid.best_score_)

C = SVC_grid.best_params_['classify__C']
k = SVC_grid.best_params_['select__k']
penalty = SVC_grid.best_params_['classify__penalty']
SVC_classify = LinearSVC(C=C, penalty=penalty, dual=False, random_state=seed)
# NOTE(review): the final pipeline uses PCA rather than the SelectKBest step
# that was tuned above, so the best `k` is not applied — confirm intended.
SVC_select = PCA(n_components=0.95)
SVC_pipeline = make_pipeline(SVC_select, SVC_classify)
SVC_model, LinearSVC_score = Titanicmodel(SVC_pipeline, feature, test_data, y, 'LinearSVC')
# Grid-search a SelectKBest + RandomForest pipeline over the number of trees
# and the maximum depth.
pipe = Pipeline([
    ('select', SelectKBest(k=34)),
    ('classify', RandomForestClassifier(
        criterion='gini',
        random_state=seed,
        min_samples_split=4,
        min_samples_leaf=5,
        max_features='sqrt',
        n_jobs=-1,
    )),
])
param = {
    'classify__n_estimators': list(range(40, 50, 2)),
    'classify__max_depth': list(range(10, 25, 2)),
}
rfc_grid = GridSearchCV(estimator=pipe, param_grid=param, scoring='roc_auc', cv=10)
# Use `feature`, the training matrix also passed to Titanicmodel below;
# `features` is not bound by the statement that builds the matrix.
rfc_grid.fit(feature, y)
print(rfc_grid.best_params_, rfc_grid.best_score_)

n_estimators = rfc_grid.best_params_['classify__n_estimators']
max_depth = rfc_grid.best_params_['classify__max_depth']
rfc_classify = RandomForestClassifier(
    criterion='gini',
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=seed,
    min_samples_split=4,
    min_samples_leaf=5,
    max_features='sqrt',
    n_jobs=-1,  # same parallelism as the searched estimator
)
# NOTE(review): the final pipeline uses PCA although the hyper-parameters
# were tuned behind SelectKBest(k=34) — confirm this is intended.
rfc_select = PCA(n_components=0.95)
rfc_pipeline = make_pipeline(rfc_select, rfc_classify)
rfc_model, rfc_score = Titanicmodel(rfc_pipeline, feature, test_data, y, 'rfc')
# Grid-search the number of selected features for a SelectKBest + LightGBM
# pipeline. The stray `classify__colsample_bytree` keyword (a pipeline-grid
# key, not an LGBMClassifier argument) has been removed from both
# constructors; `colsample_bytree=0.4` is kept.
pipe = Pipeline([
    ('select', SelectKBest(k=34)),
    ('classify', lgb.LGBMClassifier(
        random_state=seed,
        learning_rate=0.12,
        n_estimators=88,
        max_depth=16,
        min_child_samples=28,
        min_child_weight=0.0,
        colsample_bytree=0.4,
        objective='binary',
    )),
])
param = {
    'select__k': list(range(20, 40)),
}
lgb_grid = GridSearchCV(estimator=pipe, param_grid=param, scoring='roc_auc', cv=10)
# Use `feature`, the training matrix also passed to Titanicmodel below;
# `features` is not bound by the statement that builds the matrix.
lgb_grid.fit(feature, y)
print(lgb_grid.best_params_, lgb_grid.best_score_)

lgb_classify = lgb.LGBMClassifier(
    random_state=seed,
    learning_rate=0.12,
    n_estimators=88,
    max_depth=16,
    min_child_samples=28,
    min_child_weight=0.0,
    colsample_bytree=0.4,
    objective='binary',
)
# NOTE(review): the final pipeline uses PCA(0.96) and does not apply the
# `select__k` found by the grid search — confirm this is intended.
lgb_select = PCA(n_components=0.96)
lgb_pipeline = make_pipeline(lgb_select, lgb_classify)
lgb_model, lgb_score = Titanicmodel(lgb_pipeline, feature, test_data, y, 'lgb')
# Grid-search the number of selected features and the learning rate for a
# SelectKBest + XGBoost pipeline.
pipe = Pipeline([
    ('select', SelectKBest(k=34)),
    ('classify', xgb.XGBClassifier(
        random_state=seed,
        learning_rate=0.12,
        n_estimators=80,
        max_depth=8,
        min_child_weight=3,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0.2,
        reg_alpha=0.2,
        reg_lambda=0.1,
    )),
])
# The original dict literal was missing the closing bracket and the comma
# after the 'select__k' list, which is a syntax error.
param = {
    'select__k': list(range(20, 40)),
    'classify__learning_rate': [i / 100 for i in range(10, 20)],
}
xgb_grid = GridSearchCV(estimator=pipe, param_grid=param, scoring='roc_auc', cv=10)
# Use `feature`, the training matrix also passed to Titanicmodel below;
# `features` is not bound by the statement that builds the matrix.
xgb_grid.fit(feature, y)
print(xgb_grid.best_params_, xgb_grid.best_score_)

# NOTE(review): the final model uses fixed k=34 and learning_rate=0.12; the
# grid-search result is only printed, not applied.
xgb_classify = xgb.XGBClassifier(
    random_state=seed,
    learning_rate=0.12,
    n_estimators=80,
    max_depth=8,
    min_child_weight=3,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.2,
    reg_alpha=0.2,
    reg_lambda=0.1,
)
xgb_select = SelectKBest(k=34)
xgb_pipeline = make_pipeline(xgb_select, xgb_classify)
# Titanicmodel takes (clf, features, test_data, y, model_name); the original
# call passed only the pipeline and the name.
xgb_model, xgb_score = Titanicmodel(xgb_pipeline, feature, test_data, y, 'xgb')
# Stacking: reload the per-model predictions stored under ./result and use
# them as the input features of a second-level logistic regression.
LR_train, LR_test = (
    np.load('./result/LRtrain.npy'),
    np.load('./result/LRtest.npy'),
)
LinearSVC_train, LinearSVC_test = (
    np.load('./result/LinearSVCtrain.npy'),
    np.load('./result/LinearSVCtest.npy'),
)
rfc_train, rfc_test = (
    np.load('./result/rfctrain.npy'),
    np.load('./result/rfctest.npy'),
)
xgb_train, xgb_test = (
    np.load('./result/xgbtrain.npy'),
    np.load('./result/xgbtest.npy'),
)
lgb_train, lgb_test = (
    np.load('./result/lgbtrain.npy'),
    np.load('./result/lgbtest.npy'),
)
label = np.load('./result/label.npy')

# Place the model outputs side by side; the column order (LR, rfc, LinearSVC,
# xgb, lgb) is the same for the train and test matrices.
train_data = np.hstack((LR_train, rfc_train, LinearSVC_train, xgb_train, lgb_train))
test_x = np.hstack((LR_test, rfc_test, LinearSVC_test, xgb_test, lgb_test))

# Second-level model. Despite their names, the two result variables hold the
# stacked logistic regression and its out-of-fold accuracy.
model = LogisticRegression(random_state=seed)
lgbm_7leaves_model, lgbm_7leaves_score = Titanicmodel(
    model, train_data, test_x, label, 'lr_stacking'
)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。