import pandas as pd
import plotly.express as px
from plotly.offline import plot
from IPython.display import HTML

# Load the Titanic passenger data; the relative path means the CSV is looked up
# in the current working directory.
df = pd.read_csv('Titanic-Dataset.csv')
# Preview the first rows (rendered only when this is the last expression of a notebook cell).
df.head()

# Report the dataset dimensions as (rows, columns).
print(df.shape)

(891, 12)

# Number of missing entries in every column.
missing_values = df.isna().sum()
# Name the column with the most gaps and report how many it has.
print(
    f"Column '{missing_values.idxmax()}' had '{missing_values.max()}' "
    "missing values which is highest than other."
)

Column 'Cabin' had '687' missing values which is highest than other.

# Display the per-column missing-value counts computed above.
missing_values

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Clean-up in a single pass:
#   * 'Cabin' is dropped entirely,
#   * missing 'Age' values take the column median,
#   * missing 'Embarked' values take the most frequent port.
# The fill values are computed from the frame before any change is applied.
df = df.drop(columns='Cabin').fillna(
    {
        'Age': df['Age'].median(),
        'Embarked': df['Embarked'].mode().iloc[0],
    }
)

# Confirm that no column has nulls left and check the dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 76.7+ KB

def changeToStr(x):
    """Translate a numeric passenger-class code into its display label.

    Codes 1, 2 and 3 become 'Upper Class', 'Middle Class' and 'Third Class'.
    Any other value yields None.
    """
    labels = ((1, 'Upper Class'), (2, 'Middle Class'), (3, 'Third Class'))
    # Match with == rather than a dict lookup so that every input is compared
    # exactly the way an if-chain would compare it.
    return next((label for code, label in labels if x == code), None)

# Replace the numeric class codes with readable labels.
# NOTE: this overwrites the column in place; running the line a second time
# would turn every label into None, because changeToStr only recognises 1/2/3.
df['Pclass'] = df['Pclass'].apply(changeToStr)

# Passengers per class as a two-column frame: 'Pclass' and 'count'.
# The axis and the count column are named explicitly so the frame has the
# columns the chart expects regardless of how the installed pandas version
# names the result of value_counts().
pclass_count = (
    df['Pclass']
    .value_counts()
    .rename_axis('Pclass')
    .reset_index(name='count')
)
pclass_count
fig = px.bar(
    pclass_count,
    x='Pclass',
    y='count',
    color='Pclass',
    text_auto=True)

fig.update_layout(
    title='Passenger Class Counts',
    xaxis_title='Pclass',
    yaxis_title='Counts',
)
# Render the figure as an HTML <div>, loading plotly.js from the CDN.
HTML(plot(fig, include_plotlyjs='cdn', output_type='div'))

# Survival rate (%) per age bin.
# The ages themselves are binned on the x-axis, and histfunc='avg' averages a
# 0/100 survival indicator inside each bin. Passing a Series of per-age rates
# in wide form would instead bin the rate values, leaving an x-axis labelled
# 'Ages' that actually shows percentages.
age_survival = df[['Age', 'Survived']].copy()
age_survival['Survival Rate %'] = age_survival['Survived'] * 100

fig = px.histogram(
    age_survival,
    x='Age',
    y='Survival Rate %',
    histfunc='avg',
    nbins=20,
    text_auto='.4s',
    color_discrete_sequence=['indianred'],
)

fig.update_layout(
    title='Histogram of Survival Age',
    xaxis_title='Ages',
    yaxis_title='Survival Rate in %',
    template='plotly_white'
)

# Render the figure as an HTML <div>, loading plotly.js from the CDN.
HTML(plot(fig, include_plotlyjs='cdn', output_type='div'))

# Mean of the 0/1 'Survived' flag for every (class, sex) pair, kept as
# ordinary columns, plus the same figure expressed as a percentage.
pclass_sex = df.groupby(['Pclass', 'Sex'], as_index=False)['Survived'].mean()
pclass_sex['Survival Rate %'] = pclass_sex['Survived'].mul(100)

# One group of bars per class, one bar per sex.
fig = px.bar(
    pclass_sex,
    x='Pclass',
    y='Survival Rate %',
    color='Sex',
    barmode='group',
    title='Survival Rate by Pclass and Gender',
    text_auto='.4s',
)

fig.update_layout(height=500)

# Render the figure as an HTML <div>, loading plotly.js from the CDN.
HTML(plot(fig, include_plotlyjs='cdn', output_type='div'))

# Survival rate (%) per sex; the rate stays in the 'Survived' column.
# The mean of the 0/1 flag equals survivors divided by group size.
sex_survival = df.groupby('Sex')['Survived'].mean().mul(100).reset_index()
sex_survival
fig = px.bar(
    sex_survival,
    x='Sex',
    y='Survived',
    color='Sex',
    text_auto='.4s',
)

fig.update_layout(
    title='Survival Rate of Female vs Male',
    xaxis_title='Sex',
    yaxis_title='Survival Rate in %',
    height=600,
)

# Render the figure as an HTML <div>, loading plotly.js from the CDN.
HTML(plot(fig, include_plotlyjs='cdn', output_type='div'))

def convertToLong(x):
    """Expand a single-letter embarkation code into the full port name.

    'C', 'Q' and 'S' become 'Cherbourg', 'Queenstown' and 'Southampton'.
    Any other value yields None.
    """
    ports = (('C', 'Cherbourg'), ('Q', 'Queenstown'), ('S', 'Southampton'))
    # Match with == rather than a dict lookup so that every input is compared
    # exactly the way an if-chain would compare it.
    return next((name for code, name in ports if x == code), None)

# Replace the single-letter port codes with the full port names.
# NOTE: this overwrites the column in place; running the line a second time
# would turn every name into None, because convertToLong only recognises C/Q/S.
df['Embarked'] = df['Embarked'].apply(convertToLong)

# Survival rate (%) per embarkation port, as a frame with the columns
# 'Embarked' and 'Survived'.
# Only the numeric 'Survived' column is aggregated. Applying a sum()/len()
# lambda to the whole sub-frame would also hit the string 'Embarked' column,
# which pandas tolerates only through fallback behaviour in apply().
embark_survival = (
    df.groupby('Embarked')['Survived']
    .mean()
    .mul(100)
    .reset_index()
)

fig = px.bar(
    embark_survival,
    x='Embarked',
    y='Survived',
    color='Embarked',
    text_auto='.4s'
)

fig.update_layout(
    title='Survival Rate Based on Embarkation Point',
    xaxis_title='Embarked Point',
    yaxis_title='Survival Rate in %',
)
# Render the figure as an HTML <div>, loading plotly.js from the CDN.
HTML(plot(fig, include_plotlyjs='cdn', output_type='div'))

# Family size = siblings/spouses + parents/children + the passenger.
df['Family_Size'] = df['SibSp'] + df['Parch'] + 1

# Survival rate (%) per family size, indexed by 'Family_Size' in ascending
# order (groupby sorts its keys by default).
familysize_survival = df.groupby('Family_Size')['Survived'].mean().mul(100)

# The last two rows, i.e. the two largest family sizes, are left out of the
# chart. NOTE(review): the reason is not stated in the notebook; presumably
# those groups are too small to be informative. Confirm.
fig = px.bar(familysize_survival.iloc[:-2], text_auto='.4s')

fig.update_layout(
    title='Survival Rate Based on Family Size',
    xaxis_title='Family Size',
    yaxis_title='Survival Rate in %',
)

# Render the figure as an HTML <div>, loading plotly.js from the CDN.
HTML(plot(fig, include_plotlyjs='cdn', output_type='div'))

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

Titanic Survival Analysis¶

Problem Statement¶

Understanding the Data:¶

Number of Rows and Columns in the dataset¶

Missing values¶

Which column had the highest number of missing values?¶

Handling Missing Values¶

Replace NaN with median of 'Age' column¶

Drop the 'Cabin' column¶

Replace NaN with mode of 'Embarked' column¶

Data Visualization¶

Change 'Pclass' variable from numerical datatype to categorical¶

How many passengers were in each class¶

Age Distribution and Survival¶

Survival Rate Based on Passenger Class (Pclass) and Sex¶

Survival Rate - Men vs Women¶

Embarkation Point and Survival¶

Family Size and Survival¶

Data Analysis Findings¶