(1) Training set (train.csv)
(2) Test set (test.csv)
(3) Sample submission file (gender_submission.csv)
For the training set, the outcome is provided for each passenger. A model is built from "features" such as the passenger's sex and class; feature engineering can also be used to create new features. The task is then, for each passenger in the test set, to use the trained model to predict whether he or she survived the sinking of the Titanic.
Attribute | Description |
---|---|
PassengerId | Passenger ID |
Survived | Survived or not: 1 = yes, 0 = no |
Pclass | Ticket class: 1 = 1st (highest), 2 = 2nd, 3 = 3rd |
Name | Passenger name |
Sex | Sex |
Age | Age |
SibSp | Number of siblings/spouses aboard |
Parch | Number of parents/children aboard |
Ticket | Ticket number |
Fare | Fare |
Cabin | Cabin number |
Embarked | Port of embarkation |
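As a quick check of these fields against the actual file, a minimal sketch (assuming train.csv sits in the working directory):
import pandas as pd
train = pd.read_csv('train.csv')
print(train.columns.tolist())  # the 12 attributes listed above
print(train.dtypes)            # Name, Sex, Ticket, Cabin, Embarked load as object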
1. The sign-up link on the site is "register". During registration you need a "special tool" (proxy) to connect, otherwise the human-verification step will not load. A workaround is described at: https://www.cnblogs.com/liuxiaomin/p/11785645.html [solved]
2. Data download and result submission
1. Import third-party libraries and the data files
# Import third-party libraries
import pandas as pd
import numpy as np
# Read the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submit = pd.read_csv('gender_submission.csv')
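A quick check that the three files loaded as expected:
# train: 891 rows, 12 columns; test: 418 rows, no Survived column
print(train.shape, test.shape, submit.shape)
print(submit.head())  # expected submission format: PassengerId, Survived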
2. Check for missing values
Code:
train.info()
Result: (output of train.info(), not shown)
3. Inspect the variables
Code:
print(train.describe())
Result: (output of train.describe(), not shown)
4. Summary
The checks above show that "Age" and "Embarked" have missing values in the training set: of the 891 rows, the count for "Age" is only 714 and the count for "Embarked" is only 889. The missing "Age" values therefore need to be handled.
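The per-column missing counts make this explicit:
# Count missing values in each column of the training set
print(train.isnull().sum())
# Expect Age: 177, Cabin: 687, Embarked: 2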
1. Handling missing values
Code:
# Fill missing Age values with the median age
train['Age'] = train['Age'].fillna(train['Age'].median())
# Re-check missing values
train.info()
Result: (output of train.info(), not shown; Age is now 891 non-null)
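"Embarked" still has two missing values; since it is categorical, the mode (most frequent port) is the natural filler. A minimal sketch:
# Fill the two missing Embarked values with the most frequent port
train['Embarked'] = train['Embarked'].fillna(train['Embarked'].mode()[0])
print(train['Embarked'].isnull().sum())  # should now print 0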
2. Data conversion
Convert the columns whose dtype is "object" into numeric values.
Code:
print(train['Sex'].unique())
# train.loc[0] selects the row with label 0
# train.loc[0, 'PassengerId'] selects the value at row 0, column 'PassengerId'
train.loc[train['Sex'] == 'male', 'Sex'] = 0
train.loc[train['Sex'] == 'female', 'Sex'] = 1
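For reference, the same conversion can be written more compactly with map(); note it must run on the raw string column, not after the .loc assignments above have already replaced the strings:
# Equivalent conversion via a dictionary mapping (on the raw string columns)
train['Sex'] = train['Sex'].map({'male': 0, 'female': 1})
train['Embarked'] = train['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})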
The complete code:
"""导入库""" # 数据分析与整理 import pandas as pd import numpy as np import random as rnd # 可视化 import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline # 机器学习 from sklearn.linear_model import LogisticRegression from sklearn.svm import SVC, LinearSVC from sklearn.ensemble import RandomForestClassifier from sklearn.neighbors import KNeighborsClassifier from sklearn.naive_bayes import GaussianNB from sklearn.linear_model import Perceptron from sklearn.linear_model import SGDClassifier from sklearn.tree import DecisionTreeClassifier """获取数据""" train_df = pd.read_csv('train.csv') test_df = pd.read_csv('test.csv') combine = [train_df, test_df] train_df = train_df.drop(['Ticket', 'Cabin'], axis=1) test_df = test_df.drop(['Ticket', 'Cabin'], axis=1) combine = [train_df, test_df] for dataset in combine: dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)# expand=False表示返回DataFrame # 用一个更常见的名字替换许多标题,分类稀有标题 for dataset in combine: dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\ 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare') dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss') dataset['Title'] = dataset['Title'].replace('Ms', 'Miss') dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs') train_df = train_df.drop(['Name', 'PassengerId'], axis=1) test_df = test_df.drop(['Name'], axis=1) combine = [train_df, test_df] for dataset in combine: dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int) all_data = pd.concat([train_df, test_df], ignore_index = True) #用随机森林对Age缺失值进行填充 from sklearn import model_selection from sklearn.ensemble import RandomForestRegressor train_df = all_data[all_data['Survived'].notnull()] test_df = all_data[all_data['Survived'].isnull()] # 分割数据,按照 训练数据:cv数据 = 1:1的比例 train_split_1, train_split_2 = model_selection.train_test_split(train_df, test_size=0.5, random_state=0) def predict_age_use_cross_validationg(df1,df2,dfTest): age_df1 = df1[['Age', 'Pclass','Sex','Title']] age_df1 = pd.get_dummies(age_df1) age_df2 = df2[['Age', 'Pclass','Sex','Title']] age_df2 = pd.get_dummies(age_df2) known_age = age_df1[age_df1.Age.notnull()].iloc[:,:].values unknow_age_df1 = age_df1[age_df1.Age.isnull()].iloc[:,:].values unknown_age = age_df2[age_df2.Age.isnull()].iloc[:,:].values print (unknown_age.shape) y = known_age[:, 0] X = known_age[:, 1:] rfr = RandomForestRegressor(random_state=0, n_estimators=100, n_jobs=-1) rfr.fit(X, y) predictedAges = rfr.predict(unknown_age[:, 1::]) df2.loc[ (df2.Age.isnull()), 'Age' ] = predictedAges predictedAges = rfr.predict(unknow_age_df1[:,1::]) df1.loc[(df1.Age.isnull()),'Age'] = predictedAges age_Test = dfTest[['Age', 'Pclass','Sex','Title']] age_Test = pd.get_dummies(age_Test) age_Tmp = df2[['Age', 'Pclass','Sex','Title']] age_Tmp = pd.get_dummies(age_Tmp) age_Tmp = pd.concat([age_Test[age_Test.Age.notnull()],age_Tmp]) known_age1 = age_Tmp.iloc[:,:].values unknown_age1 = age_Test[age_Test.Age.isnull()].iloc[:,:].values y = known_age1[:,0] x = known_age1[:,1:] rfr.fit(x, y) predictedAges = rfr.predict(unknown_age1[:, 1:]) dfTest.loc[ (dfTest.Age.isnull()), 'Age' ] = predictedAges return dfTest t1 = train_split_1.copy() t2 = train_split_2.copy() tmp1 = test_df.copy() t5 = predict_age_use_cross_validationg(t1,t2,tmp1) t1 = pd.concat([t1,t2]) t3 = train_split_1.copy() t4 = train_split_2.copy() tmp2 = test_df.copy() t6 = predict_age_use_cross_validationg(t4,t3,tmp2) t3 = pd.concat([t3,t4]) train_df['Age'] = (t1['Age'] + t3['Age'])/2 test_df['Age'] = 
(t5['Age'] + t6['Age']) / 2 all_data = pd.concat([train_df,test_df]) print (train_df.describe()) print (test_df.describe()) guess_ages = np.zeros((2,3)) # 迭代sex(0或1)和pclass(1,2,3)来计算六个组合的年龄估计值。 for dataset in combine: for i in range(0, 2): for j in range(0, 3): guess_df = dataset[(dataset['Sex'] == i) & \ (dataset['Pclass'] == j+1)]['Age'].dropna() age_guess = guess_df.median() guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5 for i in range(0, 2): for j in range(0, 3): dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\ 'Age'] = guess_ages[i,j] dataset['Age'] = dataset['Age'].astype(int) train_df['AgeBand'] = pd.cut(train_df['Age'], 5) train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True) for dataset in combine: dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0 dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1 dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2 dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3 dataset.loc[ dataset['Age'] > 64, 'Age'] train_df = train_df.drop(['AgeBand'], axis=1) combine = [train_df, test_df] for dataset in combine: dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1 for dataset in combine: dataset['IsAlone'] = 0 dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1 train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1) test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1) combine = [train_df, test_df] for dataset in combine: dataset['Age*Class'] = dataset.Age * dataset.Pclass freq_port = train_df.Embarked.dropna().mode()[0]# 众数 for dataset in combine: dataset['Embarked'] = dataset['Embarked'].fillna(freq_port) for dataset in combine: dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True) train_df['FareBand'] = pd.qcut(train_df['Fare'], 4) train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True) for dataset in combine: dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0 dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1 dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2 dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3 dataset['Fare'] = dataset['Fare'].astype(int) train_df = train_df.drop(['FareBand'], axis=1) combine = [train_df, test_df] #将Title进行转化清洗 train_df.loc[train_df['Title'] == 'Mr', 'Title'] = 0 train_df.loc[train_df['Title'] == 'Miss', 'Title'] = 1 train_df.loc[train_df['Title'] == 'Mrs', 'Title'] = 2 train_df.loc[train_df['Title'] == 'Master', 'Title'] = 3 train_df.loc[train_df['Title'] == 'Dr', 'Title'] = 4 train_df.loc[train_df['Title'] == 'Rev', 'Title'] = 5 train_df.loc[train_df['Title'] == 'Major', 'Title'] = 6 train_df.loc[train_df['Title'] == 'Col', 'Title'] = 7 train_df.loc[train_df['Title'] == 'Mlle', 'Title'] = 8 train_df.loc[train_df['Title'] == 'Mme', 'Title'] = 9 train_df.loc[train_df['Title'] == 'Don', 'Title'] = 10 train_df.loc[train_df['Title'] == 'Lady', 'Title'] = 11 train_df.loc[train_df['Title'] == 'Countess', 'Title'] = 12 train_df.loc[train_df['Title'] == 'Jonkheer', 'Title'] = 13 train_df.loc[train_df['Title'] == 'Sir', 'Title'] = 14 train_df.loc[train_df['Title'] == 'Capt', 'Title'] = 15 train_df.loc[train_df['Title'] == 'Ms', 'Title'] = 16 train_df.loc[train_df['Title'] == 'Rare', 'Title'] = 17 #将测试集Title进行转化清洗 
test_df.loc[test_df['Title'] == 'Mr', 'Title'] = 0 test_df.loc[test_df['Title'] == 'Miss', 'Title'] = 1 test_df.loc[test_df['Title'] == 'Mrs', 'Title'] = 2 test_df.loc[test_df['Title'] == 'Master', 'Title'] = 3 test_df.loc[test_df['Title'] == 'Dr', 'Title'] = 4 test_df.loc[test_df['Title'] == 'Rev', 'Title'] = 5 test_df.loc[test_df['Title'] == 'Major', 'Title'] = 6 test_df.loc[test_df['Title'] == 'Col', 'Title'] = 7 test_df.loc[test_df['Title'] == 'Mlle', 'Title'] = 8 test_df.loc[test_df['Title'] == 'Mme', 'Title'] = 9 test_df.loc[test_df['Title'] == 'Don', 'Title'] = 10 test_df.loc[test_df['Title'] == 'Lady', 'Title'] = 11 test_df.loc[test_df['Title'] == 'Countess', 'Title'] = 12 test_df.loc[test_df['Title'] == 'Jonkheer', 'Title'] = 13 test_df.loc[test_df['Title'] == 'Sir', 'Title'] = 14 test_df.loc[test_df['Title'] == 'Capt', 'Title'] = 15 test_df.loc[test_df['Title'] == 'Ms', 'Title'] = 16 test_df.loc[test_df['Title'] == 'Rare', 'Title'] = 17 #test_df = test_df.drop(['Survived'], axis=1) train_df = train_df.drop(['PassengerId'], axis=1) X_train = train_df.drop("Survived", axis=1) Y_train = train_df["Survived"] X_test = test_df.drop("PassengerId", axis=1).copy()
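Before moving on to the models, a quick sanity check that the training and test feature matrices line up:
# X_train and X_test must have the same feature columns, in the same order
print(X_train.shape, Y_train.shape, X_test.shape)
assert list(X_train.columns) == list(X_test.columns)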
1. Decision tree
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree
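Note that these scores are computed on the same data the model was fitted on, so they overstate real performance; a decision tree in particular can fit the training set almost perfectly. A quick sketch of a fairer estimate with cross-validation:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy; usually noticeably lower than the training score
cv_scores = cross_val_score(DecisionTreeClassifier(), X_train, Y_train, cv=5)
print(round(cv_scores.mean() * 100, 2))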
2. SVC
# Support Vector Machines
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc
3. Logistic regression
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log
4. KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn
5. Gaussian naive Bayes
# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian
6. Perceptron
# Perceptron
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron
7. Stochastic gradient descent
# Stochastic Gradient Descent
sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd
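To compare the seven models at a glance, the accuracies above can be collected into one table; a minimal sketch using the variables already defined:
# Rank the models by training-set accuracy
models = pd.DataFrame({
    'Model': ['Decision Tree', 'SVC', 'Logistic Regression', 'KNN',
              'Naive Bayes', 'Perceptron', 'SGD'],
    'Score': [acc_decision_tree, acc_svc, acc_log, acc_knn,
              acc_gaussian, acc_perceptron, acc_sgd]})
print(models.sort_values(by='Score', ascending=False))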
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
pipe = Pipeline([('select', SelectKBest(k='all')),
                 ('classify', RandomForestClassifier(random_state=10, max_features='sqrt'))])
param_test = {'classify__n_estimators': list(range(20, 50, 2)),
              'classify__max_depth': list(range(3, 60, 3))}
# 10-fold grid search over tree count and depth, scored by ROC AUC
gsearch = GridSearchCV(estimator=pipe, param_grid=param_test, scoring='roc_auc', cv=10)
gsearch.fit(X_train, Y_train)
print(gsearch.best_params_, gsearch.best_score_)
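After the search, GridSearchCV refits the best pipeline on the full training set, so it can predict directly. A minimal sketch that writes a submission file in the gender_submission.csv format (PassengerId, Survived); the output name submission.csv is arbitrary:
# Predict with the refitted best pipeline and build the submission file
Y_pred = gsearch.predict(X_test).astype(int)
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'].astype(int),
                           'Survived': Y_pred})
submission.to_csv('submission.csv', index=False)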
This write-up turned out pretty rough; I'm going back to relearn the fundamentals QAQ