import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

# scikit-learn (remember: install with "pip install scikit-learn", NOT "pip install sklearn"):
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
# load the data
url = 'titanic-train.csv'
df = pd.read_csv(url)
#df.columns = ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
# inspect the data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
df.describe()
       PassengerId    Survived      Pclass         Age       SibSp       Parch        Fare
count   891.000000  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean    446.000000    0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std     257.353842    0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min       1.000000    0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%     223.500000    0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%     446.000000    0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%     668.500000    1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max     891.000000    1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Check the class variable, the variable we want to predict from the features
df['Survived'].value_counts().plot(kind='bar')
<Axes: xlabel='Survived'>
## Check missing values
# We can easily check for missing values with pandas' isnull method
df.isnull().sum()
# the set contains missing values in Age and Cabin (and Embarked)
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
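The same check can also be expressed as a percentage per column, which makes it easier to judge how serious the gaps are (a small sketch, not part of the original notebook; it reuses the df loaded above):

# share of missing values per column, as a percentage
missing_pct = (df.isnull().mean() * 100).round(1)
print(missing_pct.sort_values(ascending=False))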
# This can also be done graphically
sns.set_style("whitegrid")

sns.heatmap(df.isnull(), yticklabels=False, cmap="viridis")
<Axes: >
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# plot the data
df.Age.hist(bins=20, label='age')
plt.legend()
<matplotlib.legend.Legend at 0x1d938c62db0>
# the ages are widely spread, so how should we fill in the missing values?
# Is there a relationship with passenger class?
sns.boxplot(x='Pclass', y='Age', data=df, palette='hls', hue='Pclass', legend=False)
<Axes: xlabel='Pclass', ylabel='Age'>

Roughly speaking, the younger a passenger is, the more likely they are to be in 3rd class, and the older a passenger is, the more likely they are to be in 1st class. So there is a loose relationship between these variables. Let's therefore write a function that approximates a passenger's age based on their class. The boxplot shows that the average age of 1st-class passengers is about 37, of 2nd-class passengers about 29, and of 3rd-class passengers about 24.
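These rounded values can also be checked directly from the data before imputing anything (a quick sketch, not part of the original notebook; the mean skips the missing ages automatically):

# mean and median age per passenger class, using only the non-missing ages
print(df.groupby('Pclass')['Age'].agg(['mean', 'median']).round(1))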

So let's write a function that finds every null value in the Age variable, checks the value of Pclass for each null, and assigns an age based on the average age of passengers in that class.

def age_approx(cols):
    # cols contains [Age, Pclass]; impute a missing Age with the class average read from the boxplot
    Age = cols.iloc[0]
    Pclass = cols.iloc[1]
    
    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
            return 29
        else:
            return 24
    else:
        return Age
df['Age'] = df[['Age', 'Pclass']].apply(age_approx, axis=1)
df.isnull().sum()
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
df.head(10)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male 24.0 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
# Name, Ticket and Cabin probably don't matter, so we can drop them
df = df.drop(['Name','Ticket','Cabin'], axis=1)
df.isnull().sum()
PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       2
dtype: int64
# dropping the two rows with a missing value in Embarked won't make much difference
df.dropna(inplace=True)
df.isnull().sum()
PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Sex          889 non-null    object 
 4   Age          889 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Fare         889 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 69.5+ KB
df.head(3)
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked
0 1 0 3 male 22.0 1 0 7.2500 S
1 2 1 1 female 38.0 1 0 71.2833 C
2 3 1 3 female 26.0 0 0 7.9250 S
# Sex and Embarked need to be converted to numeric values
df['Sex'] = df['Sex'].map({'female': '1', 'male': '0'})

df.head()
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked
0 1 0 3 0 22.0 1 0 7.2500 S
1 2 1 1 1 38.0 1 0 71.2833 C
2 3 1 3 1 26.0 0 0 7.9250 S
3 4 1 1 1 35.0 1 0 53.1000 S
4 5 0 3 0 35.0 0 0 8.0500 S
df.Embarked.value_counts().plot(kind='barh')
<Axes: ylabel='Embarked'>
df['Embarked'] = df['Embarked'].map({'C': '1', 'S': '2', 'Q': '3'})
df.head()
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked
0 1 0 3 0 22.0 1 0 7.2500 2
1 2 1 1 1 38.0 1 0 71.2833 1
2 3 1 3 1 26.0 0 0 7.9250 2
3 4 1 1 1 35.0 1 0 53.1000 2
4 5 0 3 0 35.0 0 0 8.0500 2
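Note that mapping Embarked to 1/2/3 treats the ports as if they were ordered, which they are not. A commonly used alternative is one-hot encoding (a sketch only; the notebook itself sticks with the ordinal mapping above, and this variant would replace the map() call):

# hypothetical alternative: one-hot encode the port of embarkation instead of assigning arbitrary numbers
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df_onehot = pd.concat([df.drop(columns='Embarked'), embarked_dummies], axis=1)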
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Sex          889 non-null    object 
 4   Age          889 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Fare         889 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 69.5+ KB
df['Sex'] = pd.to_numeric(df['Sex']) # alt.: df['Sex'] = df['Sex'].astype('int')
df['Embarked'] = pd.to_numeric(df['Embarked'])
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Sex          889 non-null    int64  
 4   Age          889 non-null    float64
 5   SibSp        889 non-null    int64  
 6   Parch        889 non-null    int64  
 7   Fare         889 non-null    float64
 8   Embarked     889 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 69.5 KB
df.head()
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked
0 1 0 3 0 22.0 1 0 7.2500 2
1 2 1 1 1 38.0 1 0 71.2833 1
2 3 1 3 1 26.0 0 0 7.9250 2
3 4 1 1 1 35.0 1 0 53.1000 2
4 5 0 3 0 35.0 0 0 8.0500 2
# get X (features) and y (the class/category to predict)
y = np.array(df['Survived'])

X = np.array(df.iloc[:,2:9]) # slice is 'up to but not including'
X.shape
(889, 7)
# standardize (subtract the mean and divide by the standard deviation, so every feature has mean 0 and standard deviation 1)
def normalize(X):
    scaler = StandardScaler()
    scaler = scaler.fit(X)
    X = scaler.transform(X)
    return X

X = normalize(X)
X
array([[ 0.82520863, -0.73534203, -0.53167023, ..., -0.47432585,
        -0.50023975,  0.19880372],
       [-1.57221121,  1.35991138,  0.68023223, ..., -0.47432585,
         0.78894661, -1.74335569],
       [ 0.82520863,  1.35991138, -0.22869462, ..., -0.47432585,
        -0.48664993,  0.19880372],
       ...,
       [ 0.82520863,  1.35991138, -0.38018243, ...,  2.00611934,
        -0.17408416,  0.19880372],
       [-1.57221121, -0.73534203, -0.22869462, ..., -0.47432585,
        -0.0422126 , -1.74335569],
       [ 0.82520863, -0.73534203,  0.22576881, ..., -0.47432585,
        -0.49017322,  2.14096313]])
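One thing to be aware of: the scaler is fitted on the full dataset here, before the train/test split below. A common refinement is to fit it on the training portion only and then apply it to the test portion, so no information from the test set leaks into the preprocessing (a sketch of that variant, not what this notebook does):

# variant: split first, then fit the scaler on the training data only
X_raw = np.array(df.iloc[:, 2:9])
X_tr, X_te, y_tr, y_te = train_test_split(X_raw, y, test_size=0.3, random_state=42)
scaler = StandardScaler().fit(X_tr)   # statistics come from the training split only
X_tr, X_te = scaler.transform(X_tr), scaler.transform(X_te)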

After all these exploration and preparation steps, we can finally build, train and evaluate the model!#

# split: 70% training data and 30% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state=42)

# train model
logReg = LogisticRegression()
logReg.fit(X_train, y_train)
y_pred = logReg.predict(X_test)
# evaluate
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)  # avoid shadowing the imported confusion_matrix function
print(cm)
[[138  29]
 [ 26  74]]
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       167
           1       0.72      0.74      0.73       100

    accuracy                           0.79       267
   macro avg       0.78      0.78      0.78       267
weighted avg       0.80      0.79      0.79       267
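The accuracy in the report can be reproduced directly from the confusion matrix: 138 + 74 correct predictions out of 267 test passengers (a quick check, using the cm variable from above):

# accuracy = correctly classified / total
print((cm[0, 0] + cm[1, 1]) / cm.sum())   # (138 + 74) / 267 ≈ 0.794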

Extra#

Correlations are a good way to check which features are relevant for Survived and which are not.

corr_matrix = df.corr()
corr_matrix
             PassengerId  Survived    Pclass       Sex       Age     SibSp     Parch      Fare  Embarked
PassengerId     1.000000 -0.005028 -0.035330 -0.043136  0.033008 -0.057686 -0.001657  0.012703 -0.017487
Survived       -0.005028  1.000000 -0.335549  0.541585 -0.052051 -0.034040  0.083151  0.255290 -0.126753
Pclass         -0.035330 -0.335549  1.000000 -0.127741 -0.405549  0.081656  0.016824 -0.548193  0.307324
Sex            -0.043136  0.541585 -0.127741  1.000000 -0.083730  0.116348  0.247508  0.179958 -0.023175
Age             0.033008 -0.052051 -0.405549 -0.083730  1.000000 -0.242807 -0.170089  0.120938 -0.076559
SibSp          -0.057686 -0.034040  0.081656  0.116348 -0.242807  1.000000  0.414542  0.160887  0.031095
Parch          -0.001657  0.083151  0.016824  0.247508 -0.170089  0.414542  1.000000  0.217532 -0.035756
Fare            0.012703  0.255290 -0.548193  0.179958  0.120938  0.160887  0.217532  1.000000 -0.269588
Embarked       -0.017487 -0.126753  0.307324 -0.023175 -0.076559  0.031095 -0.035756 -0.269588  1.000000
corr_matrix['Survived']
PassengerId   -0.005028
Survived       1.000000
Pclass        -0.335549
Sex            0.541585
Age           -0.052051
SibSp         -0.034040
Parch          0.083151
Fare           0.255290
Embarked      -0.126753
Name: Survived, dtype: float64

Graphically#

sns.heatmap(corr_matrix, annot=True, vmax=1, vmin=-1, center=0, cmap='vlag')
<Axes: >

Does this match the weights found by the LogReg model?#

weights = logReg.coef_
weights
array([[-0.92207254,  1.31801161, -0.65228026, -0.48223878, -0.05739479,
         0.12816244, -0.16910035]])
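These coefficients are the w in the logistic model P(Survived=1) = sigmoid(w·x + b); the model's own probabilities can be reconstructed from them (a sketch to illustrate what the weights mean, reusing X_test and logReg from above):

# rebuild the predicted probabilities from the weights and the intercept
z = X_test @ logReg.coef_.T + logReg.intercept_
proba_manual = 1 / (1 + np.exp(-z))
print(np.allclose(proba_manual.ravel(), logReg.predict_proba(X_test)[:, 1]))  # should print True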
weights = weights[0].tolist()
weights
[-0.922072543559232,
 1.318011610293939,
 -0.6522802624930832,
 -0.482238782010131,
 -0.057394788324977346,
 0.1281624355258713,
 -0.16910034556667025]
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
fw = zip(weights, features)
tuple(fw)
((-0.922072543559232, 'Pclass'),
 (1.318011610293939, 'Sex'),
 (-0.6522802624930832, 'Age'),
 (-0.482238782010131, 'SibSp'),
 (-0.057394788324977346, 'Parch'),
 (0.1281624355258713, 'Fare'),
 (-0.16910034556667025, 'Embarked'))
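To compare these weights with the correlations from the previous section more easily, they can be put side by side (a small sketch, not part of the original notebook; note that the weights apply to the standardized features, while the correlations are computed on the raw columns):

# weights of the standardized features next to each feature's correlation with Survived
comparison = pd.DataFrame(
    {'weight': weights, 'corr_with_Survived': corr_matrix['Survived']},
    index=features,
)
print(comparison.sort_values('weight', key=abs, ascending=False))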

The weights for Pclass and Sex are large, as expected. The weight for Age is larger than expected, and the weight for Fare is smaller.