%load_ext sql
%sql postgresql://dsi_student:gastudents@dsi.c20gkj5cvu3l.us-east-1.rds.amazonaws.com/titanic
u'Connected: dsi_student@titanic'
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
%%sql

select * from train limit 5;
5 rows affected.
index PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.25 None S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.925 None S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.05 None S
titanic_train = %%sql select * from train;
titanic_df = titanic_train.DataFrame()
891 rows affected.
titanic_df.head()
index PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 None S
1 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 None S
3 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 None S
# titanic_df[0:50]
titanic_df[titanic_df['Cabin'] == 'G6']
index PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
10 10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S
205 205 206 0 3 Strom, Miss. Telma Matilda female 2.0 0 1 347054 10.4625 G6 S
251 251 252 0 3 Strom, Mrs. Wilhelm (Elna Matilda Persson) female 29.0 1 1 347054 10.4625 G6 S
394 394 395 1 3 Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt... female 24.0 0 2 PP 9549 16.7000 G6 S
titanic_df[titanic_df['Ticket'] == 'PP 9549']
index PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
10 10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7 G6 S
394 394 395 1 3 Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt... female 24.0 0 2 PP 9549 16.7 G6 S
#titanic_df[titanic_df['Cabin'] == re.compile]
titanic_df[titanic_df['Name'].str.contains("Sandstrom")]
index PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
10 10 11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7 G6 S
394 394 395 1 3 Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengt... female 24.0 0 2 PP 9549 16.7 G6 S
titanic_df['Name'][394]
u'Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)'
test = pd.read_csv('test.csv')
test.shape
(418, 11)
test[test['Name'].str.contains("Sandstrom")]
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
117 1009 3 Sandstrom, Miss. Beatrice Irene female 1.0 1 1 PP 9549 16.7 G6 S
dropped = ['index', 'PassengerId']
df = titanic_df.drop(dropped, 1)
df.head()
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 None S
1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 None S
3 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 None S

Check for null values

df.Age.isnull().value_counts()
False    714
True     177
Name: Age, dtype: int64
# will need to find a way to impute age, else will use median
df.Embarked.isnull().value_counts()
False    889
True       2
Name: Embarked, dtype: int64
# since only two embarked values are missing will fill with majority class
df.Embarked.value_counts()
S    644
C    168
Q     77
Name: Embarked, dtype: int64
# Check two missing embarked indexes
df[df.Embarked.isnull() == True]
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
61 1 1 Icard, Miss. Amelie female 38.0 0 0 113572 80.0 B28 None
829 1 1 Stone, Mrs. George Nelson (Martha Evelyn) female 62.0 0 0 113572 80.0 B28 None
# Let's see if the ticket numbers can give us a clue, 
# Search for similar ticket numbers with fuzzy wuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
query = '113572'
choices = df.Ticket

process.extract(query, choices)
[(u'113572', 100),
 (u'113572', 100),
 (u'113792', 83),
 (u'5727', 77),
 (u'11752', 73)]
df[df['Ticket'] == '113792']
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
467 0 1 Smart, Mr. John Montgomery male 56.0 0 0 113792 26.55 None S
# df[df['Ticket'].str.contains('113')]
df['Embarked'] = df['Embarked'].fillna('S')
df['Embarked'].isnull().value_counts()
False    891
Name: Embarked, dtype: int64

# Check for missing values in Cabin
df.Cabin.isnull().value_counts()
True     687
False    204
Name: Cabin, dtype: int64
# We will not really be able to impute missing cabins, and some passengers were never assigned a cabin,
# so we will replace nulls with a placeholder value for 'missing'

Feature engineering

# Create a family size feature
df['Fsize'] = df['SibSp'] + df['Parch'] + 1
# Pull Titles from Name
import re
def titles(string):
    """Extract the honorific title (e.g. 'Mr', 'Mrs', 'Dr') from a passenger name.

    Names follow the pattern 'Surname, Title. Given names', so the title is
    the first run of letters immediately followed by a period. Returns an
    empty string when no title is found.
    """
    # Raw string avoids invalid-escape-sequence warnings for '\.' on
    # modern Python; also don't shadow the function name with a local.
    match = re.search(r' ([A-Za-z]+)\.', string)
    if match:
        return match.group(1)
    return ""
# Create a new feature Title, containing the titles of passenger names
df['title'] = df['Name'].apply(titles)
df.title.value_counts()
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Countess      1
Ms            1
Lady          1
Jonkheer      1
Don           1
Mme           1
Capt          1
Sir           1
Name: title, dtype: int64
df['title'][df['title'] == 'Mlle'] = 'Miss'
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
df['title'][df['title'] == 'Countess'] = 'Lady'
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
df['title'][df['title'] == 'Mme'] = 'Ms'
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
male_hon = ['Major', 'Col','Jonkheer','Don','Capt']
for i in male_hon:
    df['title'][df['title'] == i] = 'MHon'
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
df[df['title'] == 'Dr']
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Fsize title
245 0 1 Minahan, Dr. William Edward male 44.0 2 0 19928 90.0000 C78 Q 3 Dr
317 0 2 Moraweck, Dr. Ernest male 54.0 0 0 29011 14.0000 None S 1 Dr
398 0 2 Pain, Dr. Alfred male 23.0 0 0 244278 10.5000 None S 1 Dr
632 1 1 Stahelin-Maeglin, Dr. Max male 32.0 0 0 13214 30.5000 B50 C 1 Dr
660 1 1 Frauenthal, Dr. Henry William male 50.0 2 0 PC 17611 133.6500 None S 3 Dr
766 0 1 Brewe, Dr. Arthur Jackson male NaN 0 0 112379 39.6000 None C 1 Dr
796 1 1 Leader, Dr. Alice (Farnham) female 49.0 0 0 17465 25.9292 D17 S 1 Dr
df.title.value_counts()
Mr        517
Miss      184
Mrs       125
Master     40
MHon        7
Dr          7
Rev         6
Ms          2
Lady        2
Sir         1
Name: title, dtype: int64
# Fill missing cabin values with 'X'
df['Cabin'] = df['Cabin'].fillna('X')
# Create a feature for Deck
# Pull deck from Cabin
def deck(string):
    """Return the deck letter from a cabin string.

    Cabin values look like 'C85' or 'B51 B53 B55'; the first ASCII letter
    is the deck. Returns an empty string when no letter is present.
    """
    found = re.search('[A-Za-z]', string)
    return found.group(0) if found else ""
# Create a new feature deck
df['deck'] = df['Cabin'].apply(deck)
df.head()
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Fsize title deck
0 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 X S 2 Mr X
1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 Mrs C
2 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 X S 1 Miss X
3 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 2 Mrs C
4 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 X S 1 Mr X
df['deck'][df['Pclass'] == 2].value_counts()
X    168
F      8
D      4
E      4
Name: deck, dtype: int64
# Surname feature, will try using this feature in decision tree models
names = df['Name'].str.split(',')
surnames = []
for i in names:
    surnames.append(i[0])
df['surname'] = surnames
df.head()
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Fsize title deck surname
0 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 X S 2 Mr X Braund
1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 Mrs C Cumings
2 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 X S 1 Miss X Heikkinen
3 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 2 Mrs C Futrelle
4 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 X S 1 Mr X Allen
df[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean()
Sex Survived
0 female 0.742038
1 male 0.188908
df[["Fsize", "Survived"]].groupby(['Fsize'], as_index=False).mean()
Fsize Survived
0 1 0.303538
1 2 0.552795
2 3 0.578431
3 4 0.724138
4 5 0.200000
5 6 0.136364
6 7 0.333333
7 8 0.000000
8 11 0.000000
df[df['Fsize'] == 11]
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Fsize title deck surname
159 0 3 Sage, Master. Thomas Henry male NaN 8 2 CA. 2343 69.55 X S 11 Master X Sage
180 0 3 Sage, Miss. Constance Gladys female NaN 8 2 CA. 2343 69.55 X S 11 Miss X Sage
201 0 3 Sage, Mr. Frederick male NaN 8 2 CA. 2343 69.55 X S 11 Mr X Sage
324 0 3 Sage, Mr. George John Jr male NaN 8 2 CA. 2343 69.55 X S 11 Mr X Sage
792 0 3 Sage, Miss. Stella Anna female NaN 8 2 CA. 2343 69.55 X S 11 Miss X Sage
846 0 3 Sage, Mr. Douglas Bullen male NaN 8 2 CA. 2343 69.55 X S 11 Mr X Sage
863 0 3 Sage, Miss. Dorothy Edith "Dolly" female NaN 8 2 CA. 2343 69.55 X S 11 Miss X Sage
test[test['Name'].str.contains('Sage')]
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
188 1080 3 Sage, Miss. Ada female NaN 8 2 CA. 2343 69.55 NaN S
342 1234 3 Sage, Mr. John George male NaN 1 9 CA. 2343 69.55 NaN S
360 1252 3 Sage, Master. William Henry male 14.5 8 2 CA. 2343 69.55 NaN S
365 1257 3 Sage, Mrs. John (Annie Bullen) female NaN 1 9 CA. 2343 69.55 NaN S
# It appears the fare column may be fare per ticket and not per passenger
# Will try and address that later if time permits
df[["Parch", "Survived"]].groupby(['Parch'], as_index=False).mean()
Parch Survived
0 0 0.343658
1 1 0.550847
2 2 0.500000
3 3 0.600000
4 4 0.000000
5 5 0.200000
6 6 0.000000
df[["Pclass", "Survived"]].groupby(['Pclass'], as_index=False).mean()
Pclass Survived
0 1 0.629630
1 2 0.472826
2 3 0.242363
df[["Fsize", "Survived"]].groupby(['Fsize'], as_index=False).mean()
Fsize Survived
0 1 0.303538
1 2 0.552795
2 3 0.578431
3 4 0.724138
4 5 0.200000
5 6 0.136364
6 7 0.333333
7 8 0.000000
8 11 0.000000
df[df['Ticket'] == 'F.C.C. 13529']
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Fsize title deck surname
314 0 2 Hart, Mr. Benjamin male 43.0 1 1 F.C.C. 13529 26.25 X S 3 Mr X Hart
440 1 2 Hart, Mrs. Benjamin (Esther Ada Bloomfield) female 45.0 1 1 F.C.C. 13529 26.25 X S 3 Mrs X Hart
535 1 2 Hart, Miss. Eva Miriam female 7.0 0 2 F.C.C. 13529 26.25 X S 3 Miss X Hart
# Impute missing ages
pd.pivot_table(df[df['Age'].isnull() == True], index=['Pclass', 'title'], values=['Age'])
Age
Pclass title
1 Dr NaN
Miss NaN
Mr NaN
Mrs NaN
2 Miss NaN
Mr NaN
3 Master NaN
Miss NaN
Mr NaN
Mrs NaN
# Fill missing ages with the median age for each (title, Pclass) group
df['Age'].fillna(df.groupby(["title", "Pclass"])["Age"].transform("median"), inplace=True)
df['Age'].isnull().value_counts()
False    891
Name: Age, dtype: int64
df['Age'][df['title'] == 'Dr'][df['Pclass'] == 1]
245    44.0
632    32.0
660    50.0
766    46.5
796    49.0
Name: Age, dtype: float64
df['Age'][df['title'] == 'Dr'][df['Pclass'] == 1].median()
46.5

Let's plot some of our features and determine how to treat them

import seaborn as sns
# Create a list of quantitative data columns
quant = [f for f in df.columns if df.dtypes[f] != 'object']
quant
[u'Survived', u'Pclass', u'Age', u'SibSp', u'Parch', u'Fare', 'Fsize']
hists = ['Age', 'Fare']
# Plot Histograms of numerical data columns
sns.set(rc={"figure.figsize": (8, 4)})
for i in hists:
    sns.distplot(df[i])
    plt.xlabel(i)
    plt.ylabel('Count')
    plt.show()

png

png

df[df['Fare'] > 500]
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Fsize title deck surname
258 1 1 Ward, Miss. Anna female 35.0 0 0 PC 17755 512.3292 X C 1 Miss X Ward
679 1 1 Cardeza, Mr. Thomas Drake Martinez male 36.0 0 1 PC 17755 512.3292 B51 B53 B55 C 2 Mr B Cardeza
737 1 1 Lesurer, Mr. Gustave J male 35.0 0 0 PC 17755 512.3292 B101 C 1 Mr B Lesurer
plt.figure(figsize=(8,8))
plt.title('Correlation of Features', y=1.05, size=15)
sns.heatmap(df.corr(),linewidths=0.1,vmax=1.0, square=True, linecolor='white', annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x1130a55d0>

png

g = sns.pairplot(df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp','Parch', 'Fare', 'Embarked', 'Fsize', 'title']], \
                 hue='Survived',size=1,diag_kind = 'kde',diag_kws=dict(shade=True),plot_kws=dict(s=10))
g.set(xticklabels=[])
<seaborn.axisgrid.PairGrid at 0x10f838490>

png

# Standardize Age
from sklearn.preprocessing import StandardScaler

std_scale = StandardScaler()
age_df_std = std_scale.fit_transform(df[['Age']])
fig, ax = plt.subplots(1,2, figsize=(15,6))
sns.distplot(df['Age'], ax=ax[0], kde=False, color="steelblue", bins=30)
sns.distplot(age_df_std, ax=ax[1], kde=False, color="seagreen", bins=30)
ax[1].set_xlabel('Sklearn');

png

# Standardize Fare
std_scale = StandardScaler()
fare_df_std = std_scale.fit_transform(df[['Fare']])
fig, ax = plt.subplots(1,2, figsize=(15,6))
sns.distplot(df['Fare'], ax=ax[0], kde=False, color="steelblue", bins=30)
sns.distplot(fare_df_std, ax=ax[1], kde=False, color="seagreen", bins=30)
ax[1].set_xlabel('Sklearn');

png

df_lr = df.copy()
df.head()
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Fsize title deck surname
0 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 X S 2 Mr X Braund
1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 Mrs C Cumings
2 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 X S 1 Miss X Heikkinen
3 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 2 Mrs C Futrelle
4 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 X S 1 Mr X Allen
lr_drop = ['Name', 'Ticket', 'Cabin', 'surname']
df_lr.drop(lr_drop, axis=1, inplace=True)
df_lr.head()
Survived Pclass Sex Age SibSp Parch Fare Embarked Fsize title deck
0 0 3 male 22.0 1 0 7.2500 S 2 Mr X
1 1 1 female 38.0 1 0 71.2833 C 2 Mrs C
2 1 3 female 26.0 0 0 7.9250 S 1 Miss X
3 1 1 female 35.0 1 0 53.1000 S 2 Mrs C
4 0 3 male 35.0 0 0 8.0500 S 1 Mr X
df_lr['Sex'] = df_lr['Sex'].apply(lambda x: 1 if x == 'male' else 0)
df_lr['Survived'][df_lr['Survived'] == 0].count()*1.0/(df_lr['Survived'][df_lr['Survived'] == 0].count()+\
                                                   df_lr['Survived'][df_lr['Survived'] == 1].count())*1.0
0.61616161616161613
df_lr = pd.get_dummies(df_lr)
df_lr.columns
Index([    u'Survived',       u'Pclass',          u'Sex',          u'Age',
              u'SibSp',        u'Parch',         u'Fare',        u'Fsize',
         u'Embarked_C',   u'Embarked_Q',   u'Embarked_S',     u'title_Dr',
         u'title_Lady',   u'title_MHon', u'title_Master',   u'title_Miss',
           u'title_Mr',    u'title_Mrs',     u'title_Ms',    u'title_Rev',
          u'title_Sir',       u'deck_A',       u'deck_B',       u'deck_C',
             u'deck_D',       u'deck_E',       u'deck_F',       u'deck_G',
             u'deck_T',       u'deck_X'],
      dtype='object')
df_lr.drop(['Embarked_C','title_Mr','deck_X'], axis=1, inplace=True)
df_lr.head()
Survived Pclass Sex Age SibSp Parch Fare Fsize Embarked_Q Embarked_S ... title_Rev title_Sir deck_A deck_B deck_C deck_D deck_E deck_F deck_G deck_T
0 0 3 1 22.0 1 0 7.2500 2 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 1 1 0 38.0 1 0 71.2833 2 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
2 1 3 0 26.0 0 0 7.9250 1 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 1 1 0 35.0 1 0 53.1000 2 0.0 1.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
4 0 3 1 35.0 0 0 8.0500 1 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 27 columns

df_lr = pd.concat([df_lr.drop('Pclass',axis=1),pd.get_dummies(df_lr['Pclass'], prefix='Class',drop_first=True)], axis = 1)
df_lr = pd.concat([df_lr.drop('SibSp',axis=1),pd.get_dummies(df_lr['SibSp'], prefix='SibSp',drop_first=True)], axis = 1)
df_lr = pd.concat([df_lr.drop('Parch',axis=1),pd.get_dummies(df_lr['Parch'], prefix='Parch',drop_first=True)], axis = 1)
df_lr = pd.concat([df_lr.drop('Fsize',axis=1),pd.get_dummies(df_lr['Fsize'], prefix='Fsize',drop_first=True)], axis = 1)
df_lr.shape
(891, 45)
df_lr['age_std'] = age_df_std
df_lr['fare_std'] = fare_df_std
df_lr.columns
Index([    u'Survived',          u'Sex',          u'Age',         u'Fare',
         u'Embarked_Q',   u'Embarked_S',     u'title_Dr',   u'title_Lady',
         u'title_MHon', u'title_Master',   u'title_Miss',    u'title_Mrs',
           u'title_Ms',    u'title_Rev',    u'title_Sir',       u'deck_A',
             u'deck_B',       u'deck_C',       u'deck_D',       u'deck_E',
             u'deck_F',       u'deck_G',       u'deck_T',      u'Class_2',
            u'Class_3',      u'SibSp_1',      u'SibSp_2',      u'SibSp_3',
            u'SibSp_4',      u'SibSp_5',      u'SibSp_8',      u'Parch_1',
            u'Parch_2',      u'Parch_3',      u'Parch_4',      u'Parch_5',
            u'Parch_6',      u'Fsize_2',      u'Fsize_3',      u'Fsize_4',
            u'Fsize_5',      u'Fsize_6',      u'Fsize_7',      u'Fsize_8',
           u'Fsize_11',      u'age_std',     u'fare_std'],
      dtype='object')
df_lr.drop(['Age','Fare'], axis=1, inplace=True)
corr = df_lr.corr(method='pearson', min_periods=1).iloc[:,0]
corr = corr[np.argsort(corr, axis=0)[::-1]]
corr = pd.DataFrame(corr)
plt.figure(figsize=(6, 0.25*len(corr)))
sns.barplot(data=corr, y=corr.index, x=corr['Survived'], orient='h')
<matplotlib.axes._subplots.AxesSubplot at 0x11a133690>

png

X_lr = df_lr.iloc[:,1:]
y_lr = df_lr['Survived']
X_lr.head()
Sex Embarked_Q Embarked_S title_Dr title_Lady title_MHon title_Master title_Miss title_Mrs title_Ms ... Fsize_2 Fsize_3 Fsize_4 Fsize_5 Fsize_6 Fsize_7 Fsize_8 Fsize_11 age_std fare_std
0 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.529702 -0.502445
1 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.656200 0.786845
2 0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.233226 -0.488854
3 0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.433843 0.420730
4 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.433843 -0.486337

5 rows × 44 columns


# Split training and test
from sklearn.model_selection import train_test_split, cross_val_score

Xlr_train, Xlr_test, ylr_train, ylr_test = train_test_split(X_lr, y_lr, test_size=.30, random_state=42)
from sklearn.linear_model import LogisticRegression
# fit model
lr = LogisticRegression()
lr_model = lr.fit(Xlr_train, ylr_train)
# predictions
ylr_pred = lr_model.predict(Xlr_test)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

acc = accuracy_score(ylr_test, ylr_pred)
acc
0.82089552238805974
# conf matrix
lr_cm = confusion_matrix(ylr_test, ylr_pred)
lr_cm = pd.DataFrame(lr_cm, columns=["Survived-N", "Survived-Y"], index=["Survived-N", "Survived-Y"])
lr_cm
Survived-N Survived-Y
Survived-N 138 19
Survived-Y 29 82
from sklearn.metrics import classification_report

print(classification_report(ylr_test, ylr_pred))
             precision    recall  f1-score   support

          0       0.83      0.88      0.85       157
          1       0.81      0.74      0.77       111

avg / total       0.82      0.82      0.82       268

Our recall on the positive class is only 74%. Let’s tune to see if we can improve and then think about adjusting the threshold so that we increase the recall on the positive class.

cv_lr = cross_val_score(lr, X_lr, y_lr, cv=3)
cv_lr
array([ 0.80808081,  0.82491582,  0.82154882])
cv_lr.mean()
0.81818181818181823
lr_model.get_params()
{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}
from sklearn.model_selection import GridSearchCV
lrcv = LogisticRegression(verbose=False)
Cs = [0.0001, 0.001, 0.01, 0.1, .15, .25, .275, .33, 0.5, .66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

gs = GridSearchCV(lrcv, {'penalty': penalties, 'C': Cs},verbose=False, cv=15)
gs.fit(Xlr_train, ylr_train)
GridSearchCV(cv=15, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=False, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 0.15, 0.25, 0.275, 0.33, 0.5, 0.66, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=False)
# Report the best hyperparameters and CV score found by the grid search.
# Parenthesized print works under both Python 2 and Python 3.
print(gs.best_params_)
print(gs.best_score_)
{'penalty': 'l2', 'C': 1.0}
0.841091492777
lr2 = LogisticRegression(penalty='l2', C=1.0)
lr2_model = lr2.fit(Xlr_train, ylr_train)
# predictions
ylr_pred = lr2_model.predict(Xlr_test)
acc = accuracy_score(ylr_test, ylr_pred)
acc
0.82089552238805974
# conf matrix
lr_cm = confusion_matrix(ylr_test, ylr_pred)
lr_cm = pd.DataFrame(lr_cm, columns=["Survived_N", "Survived_Y"], index=["Survived_N", "Survived_Y"])
lr_cm
Survived_N Survived_Y
Survived_N 138 19
Survived_Y 29 82
# lrcv_model.predict_proba(Xlr_test)
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')
%matplotlib inline

Y_score = lr2_model.decision_function(Xlr_test)

FPR = dict()
TPR = dict()
ROC_AUC = dict()

# For class 1, find the area under the curve
FPR[1], TPR[1], _ = roc_curve(ylr_test, Y_score)
ROC_AUC[1] = auc(FPR[1], TPR[1])

# Plot of a ROC curve for class 1 (has_cancer)
plt.figure(figsize=[6,5])
plt.plot(FPR[1], TPR[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1], linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('ROC for predicting Survival', fontsize=18)
plt.legend(loc="lower right")
plt.show()

png

# Lets try to improve our recall on the positive class using the above roc to aim for about 86% recall.
lrcv3 = LogisticRegression(penalty='l2', C=1.0, class_weight={1: 0.95, 0:.30})
lrcv3_model = lrcv3.fit(Xlr_train, ylr_train)
# predictions
ylr_pred3 = lrcv3_model.predict(Xlr_test)
# conf matrix
lr_cm3 = confusion_matrix(ylr_test, ylr_pred3)
lr_cm3 = pd.DataFrame(lr_cm3, columns=["Survived_N", "Survived_Y"], index=["Survived_N", "Survived_Y"])
lr_cm3
Survived_N Survived_Y
Survived_N 120 37
Survived_Y 16 95
print(classification_report(ylr_test, ylr_pred3))
             precision    recall  f1-score   support

          0       0.88      0.76      0.82       157
          1       0.72      0.86      0.78       111

avg / total       0.81      0.80      0.80       268

Let's look at KNN

# We will have to rescale our age feature; we are going to leave out fare
df_knn = df_lr.copy()
# df_knn.head()
# I wonder if it matters if we min max scale from standardized data.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# fit_transform expects a 2-D array: select with a column *list* so we pass
# an (n, 1) DataFrame rather than a 1-D Series (the Series form is
# deprecated and raises in sklearn >= 0.19).
age_scaled = scaler.fit_transform(df_knn[['age_std']])
df_knn['age_scaled'] = age_scaled
//anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:321: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
//anaconda/lib/python2.7/site-packages/sklearn/preprocessing/data.py:356: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
df_knn.drop(['age_std', 'fare_std'], axis=1, inplace=True)
df_knn.drop(['Embarked_Q', 'Embarked_S'], axis=1, inplace=True)
df_knn.columns
Index([    u'Survived',          u'Sex',     u'title_Dr',   u'title_Lady',
         u'title_MHon', u'title_Master',   u'title_Miss',    u'title_Mrs',
           u'title_Ms',    u'title_Rev',    u'title_Sir',       u'deck_A',
             u'deck_B',       u'deck_C',       u'deck_D',       u'deck_E',
             u'deck_F',       u'deck_G',       u'deck_T',      u'Class_2',
            u'Class_3',      u'SibSp_1',      u'SibSp_2',      u'SibSp_3',
            u'SibSp_4',      u'SibSp_5',      u'SibSp_8',      u'Parch_1',
            u'Parch_2',      u'Parch_3',      u'Parch_4',      u'Parch_5',
            u'Parch_6',      u'Fsize_2',      u'Fsize_3',      u'Fsize_4',
            u'Fsize_5',      u'Fsize_6',      u'Fsize_7',      u'Fsize_8',
           u'Fsize_11',   u'age_scaled'],
      dtype='object')
df_knn.drop(['deck_A','deck_B','deck_C','deck_D','deck_E','deck_F','deck_G','deck_T'], axis=1, inplace=True)
df_knn.drop(['SibSp_1','SibSp_2','SibSp_3','SibSp_4','SibSp_5','SibSp_8'], axis=1, inplace=True)
df_knn.drop(['Parch_1','Parch_2','Parch_3','Parch_4','Parch_5','Parch_6'], axis=1, inplace=True)
df_knn.drop(['title_Dr','title_Lady','title_MHon', 'title_Master','title_Miss',\
             'title_Mrs','title_Ms','title_Rev','title_Sir'], axis=1, inplace=True)
X_knn = df_knn.iloc[:,1:]
y_knn = df_knn['Survived']
X_knn.head()
Sex Class_2 Class_3 Fsize_2 Fsize_3 Fsize_4 Fsize_5 Fsize_6 Fsize_7 Fsize_8 Fsize_11 age_scaled
0 1 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.271174
1 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.472229
2 0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.321438
3 0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.434531
4 1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.434531
Xknn_train, Xknn_test, yknn_train, yknn_test = train_test_split(X_knn, y_knn, test_size=.30, random_state=78)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3, weights='uniform')

knn.fit(Xknn_train,yknn_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')
# Check accuracy
knn_pred = knn.predict(Xknn_test)
accuracy_score(yknn_test, knn_pred)
0.84701492537313428

Reducing our features to four categories brought our accuracy from the low 70s up to about 84%.

# Let's gridsearch some parameters for knn
K = range(1,11)
wghts = ['uniform','distance']

knn = KNeighborsClassifier()

gs = GridSearchCV(knn, {'n_neighbors': K, 'weights':wghts}, cv=3)
gs.fit(X_knn, y_knn)
GridSearchCV(cv=3, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
# Report the best hyperparameters and CV score found by the grid search.
# Parenthesized print works under both Python 2 and Python 3.
print(gs.best_params_)
print(gs.best_score_)
{'n_neighbors': 4, 'weights': 'uniform'}
0.814814814815

knn = KNeighborsClassifier(n_neighbors=4, weights='uniform')

knn.fit(Xknn_train,yknn_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=4, p=2,
           weights='uniform')
# Check accuracy
knn_pred = knn.predict(Xknn_test)
accuracy_score(yknn_test, knn_pred)
0.81343283582089554

After tuning our model with grid search, we get a slightly lower accuracy score, possibly due to the fold size and the split of the data.

# conf matrix
knn_cm = confusion_matrix(yknn_test, knn_pred)
knn_cm = pd.DataFrame(knn_cm, columns=["Survived_N", "Survived_Y"], index=["Survived_N", "Survived_Y"])
knn_cm
Survived_N Survived_Y
Survived_N 153 13
Survived_Y 37 65
print(classification_report(yknn_test, knn_pred))
             precision    recall  f1-score   support

          0       0.81      0.92      0.86       166
          1       0.83      0.64      0.72       102

avg / total       0.82      0.81      0.81       268
Y_score = knn.predict_proba(Xknn_test)[:,1]


FPR = dict()
TPR = dict()
ROC_AUC = dict()

# For class 1, find the area under the curve
FPR[1], TPR[1], _ = roc_curve(yknn_test, Y_score)
ROC_AUC[1] = auc(FPR[1], TPR[1])

# Plot of a ROC curve for class 1 (has_cancer)
plt.figure(figsize=[6,5])
plt.plot(FPR[1], TPR[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1], linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Knn - ROC for predicting Survival', fontsize=18)
plt.legend(loc="lower right")
plt.show()

png

# KNN does better at capturing non-survivors with a 92% recall, but is worse on survivors with a recall of 64%,
# while maintaining a similar accuracy to logistic regression
# Lets try a decision tree
from sklearn.tree import DecisionTreeClassifier
df_tree = df.copy()
to_drop = ['Name', 'Ticket', 'Cabin', 'surname']
df_tree.drop(to_drop, axis=1, inplace=True)
df_tree.head()
Survived Pclass Sex Age SibSp Parch Fare Embarked Fsize title deck
0 0 3 male 22.0 1 0 7.2500 S 2 Mr X
1 1 1 female 38.0 1 0 71.2833 C 2 Mrs C
2 1 3 female 26.0 0 0 7.9250 S 1 Miss X
3 1 1 female 35.0 1 0 53.1000 S 2 Mrs C
4 0 3 male 35.0 0 0 8.0500 S 1 Mr X
# Binarize Sex and one-hot encode Pclass / Fsize.
# NOTE: the original code sourced the dummy columns from `df` rather than
# `df_tree`; the values match only because df_tree is a copy of df.
# Reference df_tree itself so this cell no longer silently depends on the
# two frames staying in sync.
df_tree['Sex'] = df_tree['Sex'].apply(lambda x: 1 if x == 'male' else 0)
df_tree = pd.concat([df_tree.drop('Pclass', axis=1),
                     pd.get_dummies(df_tree['Pclass'], prefix='Class')], axis=1)
df_tree = pd.concat([df_tree.drop('Fsize', axis=1),
                     pd.get_dummies(df_tree['Fsize'], prefix='Fsize')], axis=1)

# Target vector, and dummy-encode the remaining object columns
# (Embarked, title, deck) in the feature matrix
ytr = df_tree['Survived']
Xtr = pd.get_dummies(df_tree.drop('Survived', axis=1))
Xtr.head()
Sex Age SibSp Parch Fare Class_1 Class_2 Class_3 Fsize_1 Fsize_2 ... title_Sir deck_A deck_B deck_C deck_D deck_E deck_F deck_G deck_T deck_X
0 1 22.0 1 0 7.2500 0.0 0.0 1.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
1 0 38.0 1 0 71.2833 1.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0 26.0 0 0 7.9250 0.0 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
3 0 35.0 1 0 53.1000 1.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
4 1 35.0 0 0 8.0500 0.0 0.0 1.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0

5 rows × 39 columns

# 70/30 train/test split for the tree models (fixed seed for repeatability)
Xtr_train, Xtr_test, ytr_train, ytr_test = train_test_split(
    Xtr, ytr, test_size=.30, random_state=42)

# Start with a deliberately deep tree to see how it does before tuning
dt = DecisionTreeClassifier(max_depth=20)
dt.fit(Xtr_train, ytr_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
# Rank features by the impurity reduction they contribute to the tree
feature_importances = pd.DataFrame({'importance': dt.feature_importances_},
                                   index=Xtr_train.columns)
feature_importances.sort_values('importance', ascending=False)
importance
title_Mr 0.301381
Fare 0.265834
Age 0.140920
Class_3 0.093943
title_Rev 0.024041
Parch 0.019571
deck_E 0.017983
Embarked_S 0.015582
deck_B 0.015014
title_Dr 0.013680
Embarked_C 0.012709
Fsize_4 0.011801
Fsize_5 0.008725
deck_X 0.008162
deck_C 0.007443
Embarked_Q 0.006782
Fsize_2 0.006149
Class_2 0.005454
SibSp 0.004794
deck_D 0.004494
Fsize_7 0.003923
deck_T 0.003596
Fsize_1 0.003596
Class_1 0.002865
Fsize_3 0.001557
title_Sir 0.000000
deck_G 0.000000
deck_F 0.000000
deck_A 0.000000
title_Mrs 0.000000
title_Ms 0.000000
Fsize_8 0.000000
title_Miss 0.000000
title_Master 0.000000
title_MHon 0.000000
title_Lady 0.000000
Fsize_6 0.000000
Fsize_11 0.000000
Sex 0.000000
# Accuracy of the (deep, untuned) decision tree on the held-out split
dt_pred = dt.predict(Xtr_test)
accuracy_score(y_true=ytr_test, y_pred=dt_pred)
0.78358208955223885
# Grid-search tree depth (5..30) and class weighting with 15-fold CV
depth_grid = range(5, 31)
weight_grid = ['balanced', None]

dt = DecisionTreeClassifier()

gs = GridSearchCV(dt,
                  {'max_depth': depth_grid, 'class_weight': weight_grid},
                  cv=15)
gs.fit(Xtr, ytr)
GridSearchCV(cv=15, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], 'class_weight': ['balanced', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)
# Winning depth / class weight and the matching CV accuracy
print(gs.best_params_)
print(gs.best_score_)
{'max_depth': 8, 'class_weight': None}
0.81593714927
# Refit on the training split using the grid-search-selected depth of 8
# (fit returns the estimator itself, so dt is the fitted model)
dt = DecisionTreeClassifier(max_depth=8).fit(Xtr_train, ytr_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
# Accuracy of the depth-8 tree on the held-out split
dt_pred = dt.predict(Xtr_test)
accuracy_score(y_true=ytr_test, y_pred=dt_pred)
0.81343283582089554
# Slightly better accuracy than the untuned (depth-20) tree.
# Confusion matrix for the depth-8 tree, labelled for readability.
labels = ["Survived_N", "Survived_Y"]
dt_cm = pd.DataFrame(confusion_matrix(ytr_test, dt_pred),
                     index=labels, columns=labels)
dt_cm
Survived_N Survived_Y
Survived_N 136 21
Survived_Y 29 82
# Looks pretty much the same as the first logistic regression:
# predicted 82 survivors correctly, but missed 29 survivors and also
# classified 21 non-survivors as survivors (see dt_cm above).
print(classification_report(ytr_test, dt_pred))
             precision    recall  f1-score   support

          0       0.82      0.87      0.84       157
          1       0.80      0.74      0.77       111

avg / total       0.81      0.81      0.81       268
# Precision is fairly good, but recall on class 1 (survival) is lower —
# about 26% of survivors (29 of 111) are missed.
Y_score = dt.predict_proba(Xtr_test)[:, 1]

FPR = dict()
TPR = dict()
ROC_AUC = dict()

# ROC curve and area under it for the positive (survived) class
FPR[1], TPR[1], _ = roc_curve(ytr_test, Y_score)
ROC_AUC[1] = auc(FPR[1], TPR[1])

# Plot the ROC curve for the depth-8 decision tree
plt.figure(figsize=[6, 5])
plt.plot(FPR[1], TPR[1], linewidth=4,
         label='ROC curve (area = %0.2f)' % ROC_AUC[1])
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Decision Tree - ROC for predicting Survival', fontsize=18)
plt.legend(loc="lower right")
plt.show()

png

# Re-weight the classes (0.9 on non-survivors vs 0.1 on survivors).
# NOTE: despite the original wording, this adjusts the tree's class
# weighting, not a literal decision threshold.
dt2 = DecisionTreeClassifier(max_depth = 8, class_weight={1: 0.1, 0:.9}) 

dt2.fit(Xtr_train, ytr_train)
DecisionTreeClassifier(class_weight={0: 0.9, 1: 0.1}, criterion='gini',
            max_depth=8, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
# Accuracy of the re-weighted tree on the held-out split
dt2_pred = dt2.predict(Xtr_test)
accuracy_score(y_true=ytr_test, y_pred=dt2_pred)
0.78358208955223885
# Confusion matrix for the re-weighted tree (P_ = predicted class)
dt2_cm = pd.DataFrame(confusion_matrix(ytr_test, dt2_pred),
                      columns=["P_Survived_N", "P_Survived_Y"],
                      index=["Survived_N", "Survived_Y"])
dt2_cm
P_Survived_N P_Survived_Y
Survived_N 147 10
Survived_Y 48 63
dt_cm
Survived_N Survived_Y
Survived_N 136 21
Survived_Y 29 82

We increased the weight of the non-survivor class to be able to identify more people
who would likely be at risk in a disaster. The recall on our 0 class is bumped up without too much sacrifice of accuracy.

print(classification_report(ytr_test, dt2_pred))
             precision    recall  f1-score   support

          0       0.75      0.94      0.84       157
          1       0.86      0.57      0.68       111

avg / total       0.80      0.78      0.77       268
# Probability of class 1 (survived) from the re-weighted tree
Y_score = dt2.predict_proba(Xtr_test)[:, 1]

FPR = dict()
TPR = dict()
ROC_AUC = dict()

# ROC curve and area under it for the positive (survived) class
FPR[1], TPR[1], _ = roc_curve(ytr_test, Y_score)
ROC_AUC[1] = auc(FPR[1], TPR[1])

# Plot the ROC curve for the class-weighted decision tree
plt.figure(figsize=[6, 5])
plt.plot(FPR[1], TPR[1], linewidth=4,
         label='ROC curve (area = %0.2f)' % ROC_AUC[1])
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Decision Tree (weighted) - ROC', fontsize=18)
plt.legend(loc="lower right")
plt.show()

png

# Overlay ROC curves for all three tuned models on a single set of axes
plt.subplots(figsize=(6, 6));

# Class-weighted decision tree (CART)
Y_score = dt2.predict_proba(Xtr_test)[:, 1]
fpr_dt, tpr_dt, _ = roc_curve(ytr_test, Y_score)
plt.plot(fpr_dt, tpr_dt, linewidth=4,
         label='CART ROC curve (area = %0.2f)' % auc(fpr_dt, tpr_dt))
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.title('Predicting Survival - ROC', fontsize=18)
plt.legend(loc="lower right")

# Tuned KNN
Y_score = knn.predict_proba(Xknn_test)[:, 1]
fpr_knn, tpr_knn, _ = roc_curve(yknn_test, Y_score)
plt.plot(fpr_knn, tpr_knn, linewidth=4,
         label='Knn ROC curve (area = %0.2f)' % auc(fpr_knn, tpr_knn))
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.legend(loc="lower right")

# Logistic regression (scores come from the decision function, not proba)
Y_score = lr2_model.decision_function(Xlr_test)
fpr_lr, tpr_lr, _ = roc_curve(ylr_test, Y_score)
plt.plot(fpr_lr, tpr_lr, linewidth=4,
         label='LR ROC curve (area = %0.2f)' % auc(fpr_lr, tpr_lr))
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=18)
plt.ylabel('True Positive Rate', fontsize=18)
plt.legend(loc="lower right")
plt.show()

png

Comparing our ROC curves it looks like we can get a good tradeoff with Knn at a true positive rate of about .85 and a false positive rate of about .15

# Lets try bagging

from sklearn.ensemble import BaggingClassifier
bagger = BaggingClassifier(dt2, max_samples=1.0)

print "DT Score:\t", cross_val_score(dt2, Xtr, ytr, cv=10, n_jobs=1).mean()
print "Bagging Score:\t", cross_val_score(bagger, Xtr, ytr, cv=10, n_jobs=1).mean()
DT Score:	0.794676256952
Bagging Score:	0.811594313926
baggerk = BaggingClassifier(knn)

print "Knn Score:\t", cross_val_score(knn, X_knn, y_knn, cv=10, n_jobs=1).mean()
print "Bagging Score:\t", cross_val_score(baggerk, X_knn, y_knn, cv=10, n_jobs=1).mean()
Knn Score:	0.814951481103
Bagging Score:	0.810470434684
bagger_lr = BaggingClassifier(lr)

print "LR Score:\t", cross_val_score(lr, X_lr, y_lr, cv=10, n_jobs=1).mean()
print "Bagging Score:\t", cross_val_score(bagger_lr, X_lr, y_lr, cv=10, n_jobs=1).mean()
LR Score:	0.832780331404
Bagging Score:	0.832780331404
# Our decision tree is the only model that benefits from bagging here and only slightly. 
# It perhaps was mildly overfit, while the other two were tuned fairly well already

Report

There was not much difference between each of the classifiers in this case, with just over 80% accuracy for each tuned model. Our KNN model did show a difference in the confusion matrix, having a higher recall for the non-surviving class and a lower recall on the surviving class. KNN predictions captured almost all of the non-survivors but did not capture a high proportion of the surviving class.

By adjusting the class weight for a couple of models we were able to slightly increase recall on both the positive and negative class without a big decrease in accuracy. Depending on the implementation and timing one class can be weighted higher than the other to more fully identify either those at risk or those likely to survive.

Bagging only improved our decision tree model and then only slightly.