
Kaggle Titanic Survival Prediction Challenge: Model Building, Hyperparameter Tuning, and Ensembling

Kaggle Titanic Survival Prediction Challenge

    This is one of Kaggle's Getting Started prediction competitions and a fairly easy entry-level contest; my best result, as I recall, reached roughly the top 8%. To revisit and consolidate the competition, I have split this series into three parts:

Prerequisites

  • numpy
  • pandas
  • matplotlib
  • seaborn
  • sklearn

Competition page: Titanic: Machine Learning from Disaster

The Sinking of the Titanic

    On April 15, 1912, during her maiden voyage, the RMS Titanic, widely considered "unsinkable", sank after colliding with an iceberg.

    Unfortunately, there were not enough lifeboats for everyone on board, and 1502 of the 2224 passengers and crew died. While survival involved some element of luck, some groups of people appear to have been more likely to survive than others.

    In this challenge, you are asked to build a predictive model that answers the question "what sorts of people were more likely to survive?" using passenger data (name, age, sex, socio-economic class, and so on).

    Task analysis: this is a binary classification problem; we build models that predict which passengers survived.

Model Building, Hyperparameter Tuning, and Ensembling

Split the combined data back into the training set and the test set:

import numpy as np

# data_all is the combined feature frame; the 'train' flag marks each row's origin
train_data = data_all[data_all['train'] == 1].copy()
test_x = data_all[data_all['train'] == 0].copy()
# Labels
y = train_data['Survived'].values
train_data.drop(['train', 'Survived'], axis=1, inplace=True)
test_x.drop(['train', 'Survived'], axis=1, inplace=True)
np.save('./result/label', y)   # save the labels for the stacking step later
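For reference, data_all is the combined train + test feature frame produced in the earlier feature-engineering parts of this series. A minimal sketch of how such a frame is typically assembled (file names assumed):

import pandas as pd

# Hypothetical reconstruction: concatenate the Kaggle train/test CSVs and
# mark each row's origin with a 'train' flag so they can be split apart later.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df['train'] = 1
test_df['train'] = 0
data_all = pd.concat([train_df, test_df], ignore_index=True, sort=False)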
Split the training data further into training and validation subsets, with an 80:20 ratio:

from sklearn.model_selection import train_test_split

seed = 2020  # hypothetical value: the global random seed used throughout this series
features = train_data.values

X_train, X_test, Y_train, Y_test = train_test_split(features, y, test_size=0.2, random_state=seed)

(The grid searches below and Titanicmodel itself cross-validate on the full training set, so this hold-out split mainly serves quick local checks.)

Model Building

Wrap the per-model training loop in a reusable helper. It runs stratified 10-fold cross-validation, collects out-of-fold predictions on the training set (for stacking later), averages the 10 per-fold predictions on the test set, and writes a submission file:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def Titanicmodel(clf, features, test_data, y, model_name):
    if model_name == 'LinearSVC':
        num_classes = 1   # decision_function returns a single margin column
    else:
        num_classes = 2   # predict_proba returns one column per class
    num_fold = 10                              # 10-fold CV
    fold_len = features.shape[0] // num_fold   # samples per fold
    skf_indices = []
    skf = StratifiedKFold(n_splits=num_fold, shuffle=True, random_state=seed)
    for i, (train_idx, valid_idx) in enumerate(skf.split(np.ones(features.shape[0]), y)):
        skf_indices.extend(valid_idx.tolist())

    train_pred = np.zeros((features.shape[0], num_classes))   # out-of-fold predictions, (train_samples, classes)
    test_pred = np.zeros((test_data.shape[0], num_classes))   # averaged test predictions, (test_samples, classes)

    for fold in tqdm(range(num_fold)):
        fold_start = fold * fold_len
        fold_end = (fold + 1) * fold_len
        if fold == num_fold - 1:
            fold_end = features.shape[0]   # last fold absorbs the remainder
        # indices of the 9 training folds
        train_indices = skf_indices[:fold_start] + skf_indices[fold_end:]
        # indices of the 1 held-out validation fold
        test_indices = skf_indices[fold_start:fold_end]

        # training data (9 folds)
        train_x = features[train_indices]
        train_y = y[train_indices]
        # validation data (1 fold)
        cv_test_x = features[test_indices]

        clf.fit(train_x, train_y)

        if model_name == 'LinearSVC':
            # LinearSVC has no predict_proba, so store the raw decision margins
            pred = clf.decision_function(cv_test_x)
            # fill the validation slots; after all folds this covers the whole training set
            train_pred[test_indices] = pred.reshape(len(pred), 1)
            pred = clf.decision_function(test_data)
            # average the 10 per-fold predictions on the test set
            test_pred += pred.reshape(len(pred), 1) / num_fold
        else:
            # predict probabilities on the held-out fold
            pred = clf.predict_proba(cv_test_x)
            # fill the validation slots; after all folds this covers the whole training set
            train_pred[test_indices] = pred
            pred = clf.predict_proba(test_data)
            # average the 10 per-fold predictions on the test set
            test_pred += pred / num_fold

    if model_name == 'LinearSVC':
        # a positive margin means the positive class
        y_pred = (train_pred > 0).astype(np.int32).reshape(len(train_pred))
        pre = (test_pred > 0).astype(np.int32).reshape(len(test_pred))
    else:
        # pick the class with the highest probability
        y_pred = np.argmax(train_pred, axis=1)
        pre = np.argmax(test_pred, axis=1)
    score = accuracy_score(y, y_pred)   # out-of-fold accuracy against the true labels
    print('accuracy_score:', score)
    # save this model's out-of-fold train predictions and averaged test predictions for stacking
    np.save('./result/{0}train'.format(model_name), train_pred)
    np.save('./result/{0}test'.format(model_name), test_pred)

    submit = pd.DataFrame({'PassengerId': np.array(range(892, 1310)),
                           'Survived': pre.astype(np.int32)})
    submit.to_csv('{0}_submit.csv'.format(model_name), index=False)
    return clf, score
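Note that for LinearSVC the helper stores raw decision_function margins rather than probabilities. If you want that column of the stacked features on the same (0, 1) scale as the predict_proba outputs of the other models, a minimal sigmoid squashing would do; this is a sketch, not something Titanicmodel applies by default:

def margin_to_proba(margins):
    # Squash raw SVM margins into pseudo-probabilities in (0, 1)
    return 1.0 / (1.0 + np.exp(-margins))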

Logistic Regression (LR)

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

pipe = Pipeline([('select', PCA(n_components=0.95)),   # keep 95% of the variance
                 ('classify', LogisticRegression(random_state=seed, solver='liblinear'))])
param = {
    'classify__penalty': ['l1', 'l2'],
    'classify__C': [0.001, 0.01, 0.1, 1, 5, 7, 8, 9, 10]}
LR_grid = GridSearchCV(estimator=pipe, param_grid=param, scoring='roc_auc', cv=5)
LR_grid.fit(features, y)
print(LR_grid.best_params_, LR_grid.best_score_)
C = LR_grid.best_params_['classify__C']
penalty = LR_grid.best_params_['classify__penalty']
LR_classify = LogisticRegression(C=C, penalty=penalty, random_state=seed, solver='liblinear')
LR_select = PCA(n_components=0.95)
LR_pipeline = make_pipeline(LR_select, LR_classify)
lr_model, lr_score = Titanicmodel(LR_pipeline, features, test_x.values, y, 'LR')
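GridSearchCV keeps the full cross-validation table in cv_results_, which is handy for seeing how sensitive the AUC is to C and the penalty:

cv_results = pd.DataFrame(LR_grid.cv_results_)
print(cv_results[['param_classify__penalty', 'param_classify__C',
                  'mean_test_score', 'std_test_score']]
      .sort_values('mean_test_score', ascending=False).head())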

Support Vector Machine (SVM)

from sklearn.feature_selection import SelectKBest
from sklearn.svm import LinearSVC

pipe = Pipeline([('select', SelectKBest(k=20)),
                 ('classify', LinearSVC(random_state=seed, dual=False))])   # dual=False is required for the l1 penalty
param = {
    'select__k': list(range(20, 40, 2)),
    'classify__penalty': ['l1', 'l2'],
    'classify__C': [0.001, 0.01, 0.1, 1, 5, 7, 8, 9, 10, 50, 100]}
SVC_grid = GridSearchCV(estimator=pipe, param_grid=param, cv=5, scoring='roc_auc')
SVC_grid.fit(features, y)
print(SVC_grid.best_params_, SVC_grid.best_score_)
C = SVC_grid.best_params_['classify__C']
k = SVC_grid.best_params_['select__k']
penalty = SVC_grid.best_params_['classify__penalty']
SVC_classify = LinearSVC(C=C, penalty=penalty, dual=False, random_state=seed)
SVC_select = SelectKBest(k=k)   # reuse the k found by the grid search
SVC_pipeline = make_pipeline(SVC_select, SVC_classify)
SVC_model, LinearSVC_score = Titanicmodel(SVC_pipeline, features, test_x.values, y, 'LinearSVC')
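LinearSVC has no predict_proba, which is why Titanicmodel special-cases it through decision_function. An alternative (not used here) is to wrap it in probability calibration so it could go through the generic predict_proba branch; a sketch:

from sklearn.calibration import CalibratedClassifierCV

# Calibration fits a sigmoid on held-out folds to turn margins into probabilities
calibrated_svc = CalibratedClassifierCV(LinearSVC(C=C, penalty=penalty,
                                                  dual=False, random_state=seed),
                                        cv=5)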

RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([('select', SelectKBest(k=34)),
                 ('classify', RandomForestClassifier(criterion='gini',
                                                     random_state=seed,
                                                     min_samples_split=4,
                                                     min_samples_leaf=5,
                                                     max_features='sqrt',
                                                     n_jobs=-1))])
param = {
    'classify__n_estimators': list(range(40, 50, 2)),
    'classify__max_depth': list(range(10, 25, 2))}
rfc_grid = GridSearchCV(estimator=pipe, param_grid=param, scoring='roc_auc', cv=10)
rfc_grid.fit(features, y)
print(rfc_grid.best_params_, rfc_grid.best_score_)
n_estimators = rfc_grid.best_params_['classify__n_estimators']
max_depth = rfc_grid.best_params_['classify__max_depth']
rfc_classify = RandomForestClassifier(criterion='gini',
                                      n_estimators=n_estimators,
                                      max_depth=max_depth,
                                      random_state=seed,
                                      min_samples_split=4,
                                      min_samples_leaf=5,
                                      max_features='sqrt')
rfc_select = SelectKBest(k=34)   # match the selector used during the grid search
rfc_pipeline = make_pipeline(rfc_select, rfc_classify)
rfc_model, rfc_score = Titanicmodel(rfc_pipeline, features, test_x.values, y, 'rfc')
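Because the returned pipeline was built with make_pipeline, its steps are named after the lowercased class names, so the fitted forest and the selected feature mask can be inspected directly. A quick sketch, assuming train_data still holds the feature DataFrame at this point:

forest = rfc_model.named_steps['randomforestclassifier']
mask = rfc_model.named_steps['selectkbest'].get_support()   # which columns SelectKBest kept
print(sorted(zip(forest.feature_importances_, train_data.columns[mask]), reverse=True)[:10])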

LightGBM

import lightgbm as lgb

pipe = Pipeline([('select', SelectKBest(k=34)),
                 ('classify', lgb.LGBMClassifier(random_state=seed,
                                                 learning_rate=0.12,
                                                 n_estimators=88,
                                                 max_depth=16,
                                                 min_child_samples=28,
                                                 min_child_weight=0.0,
                                                 colsample_bytree=0.4,
                                                 objective='binary'))])
param = {'select__k': list(range(20, 40))
         # a second pass could also search 'classify__learning_rate'
         }
lgb_grid = GridSearchCV(estimator=pipe, param_grid=param, scoring='roc_auc', cv=10)
lgb_grid.fit(features, y)
print(lgb_grid.best_params_, lgb_grid.best_score_)
k = lgb_grid.best_params_['select__k']
lgb_classify = lgb.LGBMClassifier(random_state=seed,
                                  learning_rate=0.12,
                                  n_estimators=88,
                                  max_depth=16,
                                  min_child_samples=28,
                                  min_child_weight=0.0,
                                  colsample_bytree=0.4,
                                  objective='binary')
lgb_select = SelectKBest(k=k)   # reuse the k found by the grid search
lgb_pipeline = make_pipeline(lgb_select, lgb_classify)
lgb_model, lgb_score = Titanicmodel(lgb_pipeline, features, test_x.values, y, 'lgb')
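The commented-out learning-rate grid can be run as a second pass once k is fixed; a sketch, starting the range at 0.05 so a zero learning rate is never tried:

pipe2 = Pipeline([('select', SelectKBest(k=k)),
                  ('classify', lgb.LGBMClassifier(random_state=seed,
                                                  n_estimators=88,
                                                  objective='binary'))])
param2 = {'classify__learning_rate': [i / 100 for i in range(5, 20)]}
lgb_grid2 = GridSearchCV(estimator=pipe2, param_grid=param2, scoring='roc_auc', cv=10)
lgb_grid2.fit(features, y)
print(lgb_grid2.best_params_, lgb_grid2.best_score_)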

XGBoost

import xgboost as xgb

pipe = Pipeline([('select', SelectKBest(k=34)),
                 ('classify', xgb.XGBClassifier(random_state=seed,
                                                learning_rate=0.12,
                                                n_estimators=80,
                                                max_depth=8,
                                                min_child_weight=3,
                                                subsample=0.8,
                                                colsample_bytree=0.8,
                                                gamma=0.2,
                                                reg_alpha=0.2,
                                                reg_lambda=0.1))])
param = {'select__k': list(range(20, 40)),
         'classify__learning_rate': [i / 100 for i in range(10, 20)]}
xgb_grid = GridSearchCV(estimator=pipe, param_grid=param, scoring='roc_auc', cv=10)
xgb_grid.fit(features, y)
print(xgb_grid.best_params_, xgb_grid.best_score_)
k = xgb_grid.best_params_['select__k']
learning_rate = xgb_grid.best_params_['classify__learning_rate']
xgb_classify = xgb.XGBClassifier(random_state=seed,
                                 learning_rate=learning_rate,
                                 n_estimators=80,
                                 max_depth=8,
                                 min_child_weight=3,
                                 subsample=0.8,
                                 colsample_bytree=0.8,
                                 gamma=0.2,
                                 reg_alpha=0.2,
                                 reg_lambda=0.1)
xgb_select = SelectKBest(k=k)   # reuse the k found by the grid search
xgb_pipeline = make_pipeline(xgb_select, xgb_classify)
xgb_model, xgb_score = Titanicmodel(xgb_pipeline, features, test_x.values, y, 'xgb')
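The joint grid above has 20 x 10 = 200 candidates, i.e. 2000 fits at cv=10. If that is too slow, RandomizedSearchCV samples a fixed budget of combinations from the same grid; a sketch:

from sklearn.model_selection import RandomizedSearchCV

xgb_rand = RandomizedSearchCV(estimator=pipe, param_distributions=param,
                              n_iter=30, scoring='roc_auc', cv=10,
                              random_state=seed)
xgb_rand.fit(features, y)
print(xgb_rand.best_params_, xgb_rand.best_score_)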

Model Ensembling (Stacking)

# Load each base model's out-of-fold train predictions and averaged test predictions
LR_train = np.load('./result/LRtrain.npy')
LR_test = np.load('./result/LRtest.npy')
LinearSVC_train = np.load('./result/LinearSVCtrain.npy')
LinearSVC_test = np.load('./result/LinearSVCtest.npy')
rfc_train = np.load('./result/rfctrain.npy')
rfc_test = np.load('./result/rfctest.npy')
xgb_train = np.load('./result/xgbtrain.npy')
xgb_test = np.load('./result/xgbtest.npy')
lgb_train = np.load('./result/lgbtrain.npy')
lgb_test = np.load('./result/lgbtest.npy')
label = np.load('./result/label.npy')
# Stack the base-model predictions side by side as meta-features
train_data = (LR_train, rfc_train, LinearSVC_train, xgb_train, lgb_train)
test_x = (LR_test, rfc_test, LinearSVC_test, xgb_test, lgb_test)
train_data = np.hstack(train_data)
test_x = np.hstack(test_x)
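Each probability model contributes 2 columns and LinearSVC contributes 1 margin column, so the meta-feature matrices should come out as (891, 9) for train and (418, 9) for test; a quick check:

print(train_data.shape, test_x.shape)   # expected: (891, 9) (418, 9)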
# Second-level model: logistic regression over the stacked meta-features
model = LogisticRegression(random_state=seed)
stack_model, stack_score = Titanicmodel(model, features=train_data, test_data=test_x, y=label, model_name='lr_stacking')
