import numpy as np
import os
import pandas as pd
np.random.seed(42)
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "titanic_classification"
def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    os.makedirs(os.path.dirname(path), exist_ok=True)  # create the output folder if it does not exist yet
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
TITANIC_PATH = "datasets"
def load_titanic_data(file, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, file)
    return pd.read_csv(csv_path)
titanic_train = load_titanic_data("titanic_train.csv")
titanic_test = load_titanic_data('titanic_test.csv')
titanic_train.head()
titanic_train.hist(bins=50, figsize=(20, 15))
save_fig("attribute_histogram_plots")
plt.show()
train_set = titanic_train
test_set = titanic_test
train_set.head(20)
train_set.info()
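# Optional sanity check (not in the original notebook): count missing values per
# column to see which attributes the imputation steps below will have to fill in.
train_set.isnull().sum().sort_values(ascending=False)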
from sklearn.base import BaseEstimator, TransformerMixin
# A transformer that selects numerical or categorical columns from a DataFrame.
# Newer Scikit-Learn versions offer ColumnTransformer for this, but the manual
# selector keeps the pipelines below self-contained.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]
# Impute missing categorical values with each column's most frequent value
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)
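# Quick illustration (an added sketch, not part of the original notebook): running
# MostFrequentImputer on a tiny hypothetical DataFrame shows it filling NaNs with
# each column's most frequent value.
demo_df = pd.DataFrame({"Embarked": ["S", "C", None, "S"]})
MostFrequentImputer().fit_transform(demo_df)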
# future_encoders was a stopgap module from the book's repository; recent
# scikit-learn versions ship OneHotEncoder directly, and SimpleImputer replaces
# the removed Imputer class.
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
    ("select_numeric", DataFrameSelector(["Age", "SibSp", "Parch", "Fare"])),
    ("imputer", SimpleImputer(strategy="median")),
])
cat_pipeline = Pipeline([
    ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
    ("imputer", MostFrequentImputer()),
    ("one_hot_cat_encoder", OneHotEncoder(sparse_output=False)),  # use sparse=False on scikit-learn < 1.2
])
num_pipeline.fit_transform(train_set)
cat_pipeline.fit_transform(train_set)
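# Optional check (assumes the scikit-learn OneHotEncoder imported above): list the
# categories the encoder learned, which also gives the column order of the one-hot
# features produced by the categorical pipeline.
cat_pipeline.named_steps["one_hot_cat_encoder"].categories_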
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
X_train = preprocess_pipeline.fit_transform(train_set)
X_train[:5]
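# Sanity check (added): one row per passenger, 4 numeric columns plus the
# one-hot encoded Pclass/Sex/Embarked columns.
X_train.shape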
y_train = train_set["Survived"]
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
svm_clf = SVC()
svm_clf.fit(X_train, y_train)
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_scores = cross_val_score(knn_clf, X_train, y_train, cv=10)
knn_scores.mean()
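# Side-by-side comparison (an added sketch, not one of the original cells): plot the
# 10-fold cross-validation scores of the three models to see their spread, not just
# their means.
plt.figure(figsize=(8, 4))
plt.plot([1] * 10, svm_scores, ".")
plt.plot([2] * 10, forest_scores, ".")
plt.plot([3] * 10, knn_scores, ".")
plt.boxplot([svm_scores, forest_scores, knn_scores], labels=("SVM", "Random Forest", "KNN"))
plt.ylabel("Accuracy")
plt.show()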
X_test = preprocess_pipeline.transform(test_set)  # transform only: the pipeline must not be refit on the test set
y_pred = forest_clf.predict(X_test)
test_set[["PassengerId","Age"]][:10]
X_test[:10]
from sklearn.model_selection import GridSearchCV
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = [{'C': Cs, 'gamma' : gammas}]
grid_search = GridSearchCV(svm_clf, param_grid, cv=10)
grid_search.fit(X_train, y_train)
grid_search.best_params_
grid_search.best_score_
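# Optional inspection (added): the grid search's cv_results_ holds the mean CV score
# for every (C, gamma) combination tried; sort them to see the runners-up.
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res[["param_C", "param_gamma", "mean_test_score"]].sort_values(
    "mean_test_score", ascending=False).head()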
y_pred_grid = grid_search.predict(X_test)
predictions = {'PassengerId': test_set["PassengerId"],
               'Survived': y_pred_grid}
final_df = pd.DataFrame(predictions, columns=['PassengerId', 'Survived'])
final_df.head(10)
final_df.to_csv('titanic_submission_svm.csv',index=False)
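# Quick verification (added): reload the submission file to confirm it has exactly
# the two expected columns and one row per test passenger.
pd.read_csv('titanic_submission_svm.csv').info()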