Titanic Dataset

The challenge here is to complete the analysis of what sorts of people were likely to survive one of the most infamous shipwrecks in history. In particular, the goal is to apply the tools of machine learning to predict whether a given passenger survived the sinking of the Titanic.

https://www.kaggle.com/c/titanic

In [1]:
import numpy as np
import os
import pandas as pd

np.random.seed(42)

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

PROJECT_ROOT_DIR = "."
CHAPTER_ID = "titanic_classification"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    os.makedirs(os.path.dirname(path), exist_ok=True)  # make sure the output directory exists
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
In [2]:
TITANIC_PATH = "datasets"

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)
In [3]:
titanic_train = load_titanic_data("titanic_train.csv")
titanic_test = load_titanic_data("titanic_test.csv")
titanic_train.head()
Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [4]:
titanic_train.hist(bins=50, figsize=(20, 15))
save_fig("attribute_histogram_plots")
plt.show()
Saving figure attribute_histogram_plots
In [5]:
train_set = titanic_train
test_set = titanic_test
In [6]:
train_set.head(20)
Out[6]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S
11 12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S
12 13 0 3 Saundercock, Mr. William Henry male 20.0 0 0 A/5. 2151 8.0500 NaN S
13 14 0 3 Andersson, Mr. Anders Johan male 39.0 1 5 347082 31.2750 NaN S
14 15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 0 350406 7.8542 NaN S
15 16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 248706 16.0000 NaN S
16 17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 NaN Q
17 18 1 2 Williams, Mr. Charles Eugene male NaN 0 0 244373 13.0000 NaN S
18 19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1 0 345763 18.0000 NaN S
19 20 1 3 Masselmani, Mrs. Fatima female NaN 0 0 2649 7.2250 NaN C
In [7]:
train_set.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
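The info() output shows that Age (714 of 891), Cabin (204 of 891) and Embarked (889 of 891) have missing values, which motivates the imputation steps built next: Age gets a median imputer and Embarked a most-frequent imputer, while Cabin, Name, Ticket and PassengerId are simply left out of the feature set.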
In [8]:
from sklearn.base import BaseEstimator, TransformerMixin

# A class to select numerical or categorical columns from a DataFrame
# (older Scikit-Learn versions have no built-in way to do this; >= 0.20 offers ColumnTransformer)
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]
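As an optional sanity check (not part of the original run), the selector can be tried on its own with the numerical column list used later:

DataFrameSelector(["Age", "SibSp", "Parch", "Fare"]).fit_transform(train_set).head()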
In [9]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)
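Similarly, fitting this imputer on the three categorical columns used below exposes the fill values it learns through its most_frequent_ attribute (optional check):

MostFrequentImputer().fit(train_set[["Pclass", "Sex", "Embarked"]]).most_frequent_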
In [10]:
# future_encoders.py is a helper module shipped with the Hands-On ML notebooks;
# on Scikit-Learn >= 0.20 the same OneHotEncoder lives in sklearn.preprocessing.
from future_encoders import OneHotEncoder
In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer  # replaced by sklearn.impute.SimpleImputer in Scikit-Learn >= 0.20

num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
        ("imputer", Imputer(strategy="median")),
    ])
In [12]:
cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("one_hot_cat_encoder", OneHotEncoder(sparse=False)),
    ])
In [13]:
num_pipeline.fit_transform(train_set)
Out[13]:
array([[22.    ,  1.    ,  0.    ,  7.25  ],
       [38.    ,  1.    ,  0.    , 71.2833],
       [26.    ,  0.    ,  0.    ,  7.925 ],
       ...,
       [28.    ,  1.    ,  2.    , 23.45  ],
       [26.    ,  0.    ,  0.    , 30.    ],
       [32.    ,  0.    ,  0.    ,  7.75  ]])
In [14]:
cat_pipeline.fit_transform(train_set)
Out[14]:
array([[0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 1., 0.]])
In [15]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
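The union concatenates the four numerical columns with the eight one-hot columns (three for Pclass, two for Sex, three for Embarked), giving the twelve features seen below. On Scikit-Learn >= 0.20 the same preprocessing can be written more directly with ColumnTransformer, which selects DataFrame columns itself. A minimal sketch, assuming sklearn.impute.SimpleImputer as the Imputer replacement:

from sklearn.compose import ColumnTransformer  # Scikit-Learn >= 0.20
from sklearn.impute import SimpleImputer

preprocess_alt = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), ["Age", "SibSp", "Parch", "Fare"]),
        ("cat", Pipeline([
            ("imputer", MostFrequentImputer()),
            ("encoder", OneHotEncoder(sparse=False)),  # sparse_output=False on sklearn >= 1.2
        ]), ["Pclass", "Sex", "Embarked"]),
    ])
# preprocess_alt.fit_transform(train_set) should match X_train column for column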
In [16]:
X_train = preprocess_pipeline.fit_transform(train_set)
X_train[:5]
Out[16]:
array([[22.    ,  1.    ,  0.    ,  7.25  ,  0.    ,  0.    ,  1.    ,
         0.    ,  1.    ,  0.    ,  0.    ,  1.    ],
       [38.    ,  1.    ,  0.    , 71.2833,  1.    ,  0.    ,  0.    ,
         1.    ,  0.    ,  1.    ,  0.    ,  0.    ],
       [26.    ,  0.    ,  0.    ,  7.925 ,  0.    ,  0.    ,  1.    ,
         1.    ,  0.    ,  0.    ,  0.    ,  1.    ],
       [35.    ,  1.    ,  0.    , 53.1   ,  1.    ,  0.    ,  0.    ,
         1.    ,  0.    ,  0.    ,  0.    ,  1.    ],
       [35.    ,  0.    ,  0.    ,  8.05  ,  0.    ,  0.    ,  1.    ,
         0.    ,  1.    ,  0.    ,  0.    ,  1.    ]])
In [17]:
y_train = train_set["Survived"]
In [18]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

svm_clf = SVC()
# cross_val_score clones the estimator internally, so no explicit fit is needed here
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()
Out[18]:
0.7365250822835092
In [19]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()
Out[19]:
0.8115690614005221
In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
knn_scores = cross_val_score(knn_clf, X_train, y_train, cv=10)
knn_scores.mean()
Out[20]:
0.7184720803541028
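To compare the three models beyond their mean scores, the per-fold results can be plotted side by side (an optional sketch reusing the score arrays above):

plt.figure(figsize=(8, 4))
plt.boxplot([svm_scores, forest_scores, knn_scores], labels=("SVM", "Random Forest", "KNN"))
plt.ylabel("10-fold CV accuracy")
plt.show()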
In [21]:
X_test = preprocess_pipeline.transform(test_set)  # transform only: the pipeline statistics were fitted on the training set
In [22]:
y_pred = forest_clf.predict(X_test)
In [23]:
test_set[["PassengerId","Age"]][:10]
Out[23]:
PassengerId Age
0 892 34.5
1 893 47.0
2 894 62.0
3 895 27.0
4 896 22.0
5 897 14.0
6 898 30.0
7 899 26.0
8 900 18.0
9 901 21.0
In [24]:
X_test[:10]
Out[24]:
array([[34.5   ,  0.    ,  0.    ,  7.8292,  0.    ,  0.    ,  1.    ,
         0.    ,  1.    ,  0.    ,  1.    ,  0.    ],
       [47.    ,  1.    ,  0.    ,  7.    ,  0.    ,  0.    ,  1.    ,
         1.    ,  0.    ,  0.    ,  0.    ,  1.    ],
       [62.    ,  0.    ,  0.    ,  9.6875,  0.    ,  1.    ,  0.    ,
         0.    ,  1.    ,  0.    ,  1.    ,  0.    ],
       [27.    ,  0.    ,  0.    ,  8.6625,  0.    ,  0.    ,  1.    ,
         0.    ,  1.    ,  0.    ,  0.    ,  1.    ],
       [22.    ,  1.    ,  1.    , 12.2875,  0.    ,  0.    ,  1.    ,
         1.    ,  0.    ,  0.    ,  0.    ,  1.    ],
       [14.    ,  0.    ,  0.    ,  9.225 ,  0.    ,  0.    ,  1.    ,
         0.    ,  1.    ,  0.    ,  0.    ,  1.    ],
       [30.    ,  0.    ,  0.    ,  7.6292,  0.    ,  0.    ,  1.    ,
         1.    ,  0.    ,  0.    ,  1.    ,  0.    ],
       [26.    ,  1.    ,  1.    , 29.    ,  0.    ,  1.    ,  0.    ,
         0.    ,  1.    ,  0.    ,  0.    ,  1.    ],
       [18.    ,  0.    ,  0.    ,  7.2292,  0.    ,  0.    ,  1.    ,
         1.    ,  0.    ,  1.    ,  0.    ,  0.    ],
       [21.    ,  2.    ,  0.    , 24.15  ,  0.    ,  0.    ,  1.    ,
         0.    ,  1.    ,  0.    ,  0.    ,  1.    ]])
In [25]:
from sklearn.model_selection import GridSearchCV

Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = [{'C': Cs, 'gamma': gammas}]
grid_search = GridSearchCV(svm_clf, param_grid, cv=10)
grid_search.fit(X_train, y_train)
Out[25]:
GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10], 'gamma': [0.001, 0.01, 0.1, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
In [26]:
grid_search.best_params_
Out[26]:
{'C': 10, 'gamma': 0.001}
In [27]:
grid_search.best_score_
Out[27]:
0.7845117845117845
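Note that the best C found (10) sits on the edge of the grid, so widening the C range might score higher still. The full search results can be inspected through cv_results_ (optional):

pd.DataFrame(grid_search.cv_results_).sort_values("rank_test_score")[
    ["param_C", "param_gamma", "mean_test_score"]].head()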
In [28]:
y_pred_grid = grid_search.predict(X_test)  # GridSearchCV refits the best model on the full training set
In [29]:
predictions = {'PassengerId': test_set["PassengerId"],
               'Survived': y_pred_grid}
In [30]:
final_df = pd.DataFrame(predictions, columns=['PassengerId', 'Survived'])
In [31]:
final_df.head(10)
Out[31]:
PassengerId Survived
0 892 0
1 893 1
2 894 0
3 895 0
4 896 1
5 897 0
6 898 1
7 899 0
8 900 1
9 901 0
In [32]:
final_df.to_csv('titanic_submission_svm.csv', index=False)