Credit Risk Analysis
Business Understanding
The aim of this analysis is to study customer data using Exploratory Data Analysis to uncover patterns, trends, and key variables that strongly influence loan default behavior. By identifying these factors, the company can make informed lending decisions, reduce the risk of default, and improve the overall effectiveness of credit risk management.
Step 1: Import Libraries
Description - In this step we import all the libraries required for the project.
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical computation
import matplotlib.pyplot as plt # For plotting graphs and charts
import seaborn as sns # For plotting graphs and charts in more advanced way
import math # For mathematical operations
import warnings
warnings.filterwarnings('ignore') # Filter out warnings to ignore them
Step 2: Read Dataset
# Load CSV file 'application_data.csv' into a DataFrame
application_data = pd.read_csv("C:/Users/ASUS/OneDrive/Desktop/Datasets/Credit Risk Analysis Dataset/application_data.csv")
# Clients with payment difficulties
application_data[application_data['TARGET'] == 1]
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 26 | 100031 | 1 | Cash loans | F | N | Y | 0 | 112500.0 | 979992.0 | 27076.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2.0 |
| 40 | 100047 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 1193580.0 | 35028.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 4.0 |
| 42 | 100049 | 1 | Cash loans | F | N | N | 0 | 135000.0 | 288873.0 | 16258.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 |
| 81 | 100096 | 1 | Cash loans | F | N | Y | 0 | 81000.0 | 252000.0 | 14593.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 307448 | 456186 | 1 | Cash loans | M | N | N | 1 | 207000.0 | 450000.0 | 32746.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 |
| 307475 | 456215 | 1 | Cash loans | F | N | N | 1 | 144000.0 | 1303200.0 | 46809.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 6.0 |
| 307481 | 456225 | 1 | Cash loans | M | N | Y | 0 | 225000.0 | 297000.0 | 19975.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 3.0 |
| 307489 | 456233 | 1 | Cash loans | F | N | Y | 0 | 225000.0 | 521280.0 | 23089.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2.0 |
| 307509 | 456254 | 1 | Cash loans | F | N | Y | 0 | 171000.0 | 370107.0 | 20205.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
24825 rows × 122 columns
# Client with payment difficulties/other cases
application_data[application_data['TARGET'] == 0]
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 5 | 100008 | 0 | Cash loans | M | N | Y | 0 | 99000.0 | 490495.5 | 27517.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 307505 | 456249 | 0 | Cash loans | F | N | Y | 0 | 112500.0 | 225000.0 | 22050.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 |
| 307506 | 456251 | 0 | Cash loans | M | N | N | 0 | 157500.0 | 254700.0 | 27558.0 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307507 | 456252 | 0 | Cash loans | F | N | Y | 0 | 72000.0 | 269550.0 | 12001.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307508 | 456253 | 0 | Cash loans | F | N | Y | 0 | 153000.0 | 677664.0 | 29979.0 | ... | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 307510 | 456255 | 0 | Cash loans | F | N | N | 0 | 157500.0 | 675000.0 | 49117.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 |
282686 rows × 122 columns
Identify Target Variable
Description - The target variable in the dataset is 'TARGET'. It is a binary variable that indicates whether a client had difficulty repaying a loan:
- 1 - The client had late payments (i.e., payment difficulties) on at least one of the first few installments
- 0 - All other cases (the client had no such payment issues)
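Since TARGET is a 0/1 indicator, its mean equals the share of clients with payment difficulties. A minimal sketch on a toy Series (hypothetical values, not the actual dataset):

```python
import pandas as pd

# Toy TARGET column (hypothetical): 1 = payment difficulty, 0 = other cases
target = pd.Series([0, 0, 0, 1, 0, 0, 0, 0, 1, 0])

# For a binary 0/1 variable, the mean is exactly the default rate
default_rate = target.mean()
print(f"Default rate: {default_rate:.1%}")
```

The same one-liner applied to the real column, `application_data['TARGET'].mean()`, gives the overall default rate directly.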
Check Distribution of the Target Variable
Description - We check the distribution of the target variable using a bar plot and assess whether it is imbalanced.
# Count and percentage of target variable
target_counts = application_data['TARGET'].value_counts()
target_percent = application_data['TARGET'].value_counts(normalize=True)*100
# Create DataFrame for plotting
target_df = pd.DataFrame({
'Label': ['Other Cases (0)', 'Payment Difficulty (1)'],
'Count': target_counts.values,
'Percent': target_percent.values
})
# Plotting bar charts
plt.figure(figsize=(8, 5)) # (x-axis=8, y-axis=5)
barplot = sns.barplot(data=target_df, x='Label', y='Count', hue='Label')
# For adding percentage labels on top
for index, row in target_df.iterrows():
    barplot.text(index, row['Count'] + 1000, f"{row['Percent']:.2f}%", color='black', ha='center', fontweight='bold')
plt.title("Distribution of TARGET Variable", fontsize=14, fontweight='bold') # For graph title
plt.ylabel("Number of Clients") # For y axis labeling
plt.xlabel("Loan Repayment Status") # For x axis labeling
plt.grid(axis='y', linestyle='--', alpha=0.5)
sns.despine() # For removing boxline from graph
plt.tight_layout()
plt.show()
- A large portion of clients belong to Other Cases (0).
- A smaller portion of clients belong to Payment Difficulty (1).
This shows that the dataset is imbalanced, with far fewer defaulting clients (i.e., Payment Difficulty = 1).
What is data imbalance?
Data imbalance means that the classes in a variable (such as the target column) are not evenly distributed. One class (like non-defaulters) significantly outnumbers the other (like defaulters), which can lead to biased analysis or misleading results.
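One simple way to quantify this is the majority-to-minority ratio, together with the accuracy a naive majority-class rule would score. A sketch on toy data (90 non-defaulters vs 10 defaulters, not the real dataset):

```python
import pandas as pd

# Toy target: 90 non-defaulters (0) and 10 defaulters (1)
target = pd.Series([0] * 90 + [1] * 10)

counts = target.value_counts()
imbalance_ratio = counts.loc[0] / counts.loc[1]  # majority-to-minority ratio
print(f"Imbalance ratio (0 : 1) = {imbalance_ratio:.1f} : 1")

# Always predicting the majority class already scores 90% accuracy,
# which is why accuracy alone is misleading on imbalanced data
naive_accuracy = (target == 0).mean()
print(f"Naive majority-class accuracy: {naive_accuracy:.0%}")
```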
dataset_description = pd.read_csv("C:/Users/ASUS/OneDrive/Desktop/Datasets/Credit Risk Analysis Dataset/columns_description.csv")
dataset_description
| Unnamed: 0 | Table | Row | Description | Special | |
|---|---|---|---|---|---|
| 0 | 1 | application_data | SK_ID_CURR | ID of loan in our sample | NaN |
| 1 | 2 | application_data | TARGET | Target variable (1 - client with payment diffi... | NaN |
| 2 | 5 | application_data | NAME_CONTRACT_TYPE | Identification if loan is cash or revolving | NaN |
| 3 | 6 | application_data | CODE_GENDER | Gender of the client | NaN |
| 4 | 7 | application_data | FLAG_OWN_CAR | Flag if the client owns a car | NaN |
| ... | ... | ... | ... | ... | ... |
| 155 | 209 | previous_application.csv | DAYS_FIRST_DUE | Relative to application date of current applic... | time only relative to the application |
| 156 | 210 | previous_application.csv | DAYS_LAST_DUE_1ST_VERSION | Relative to application date of current applic... | time only relative to the application |
| 157 | 211 | previous_application.csv | DAYS_LAST_DUE | Relative to application date of current applic... | time only relative to the application |
| 158 | 212 | previous_application.csv | DAYS_TERMINATION | Relative to application date of current applic... | time only relative to the application |
| 159 | 213 | previous_application.csv | NFLAG_INSURED_ON_APPROVAL | Did the client requested insurance during the ... | NaN |
160 rows × 5 columns
# Displaying the summary of dataframe such as non-null count, data types and memory usage
application_data.iloc[:,1:50].info() # .iloc selects columns by position; here we fetch info for columns 1 through 49
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 49 columns):
 #   Column                       Non-Null Count   Dtype
---  ------                       --------------   -----
 0   TARGET                       307511 non-null  int64
 1   NAME_CONTRACT_TYPE           307511 non-null  object
 2   CODE_GENDER                  307511 non-null  object
 3   FLAG_OWN_CAR                 307511 non-null  object
 4   FLAG_OWN_REALTY              307511 non-null  object
 5   CNT_CHILDREN                 307511 non-null  int64
 6   AMT_INCOME_TOTAL             307511 non-null  float64
 7   AMT_CREDIT                   307511 non-null  float64
 8   AMT_ANNUITY                  307499 non-null  float64
 9   AMT_GOODS_PRICE              307233 non-null  float64
 10  NAME_TYPE_SUITE              306219 non-null  object
 11  NAME_INCOME_TYPE             307511 non-null  object
 12  NAME_EDUCATION_TYPE          307511 non-null  object
 13  NAME_FAMILY_STATUS           307511 non-null  object
 14  NAME_HOUSING_TYPE            307511 non-null  object
 15  REGION_POPULATION_RELATIVE   307511 non-null  float64
 16  DAYS_BIRTH                   307511 non-null  int64
 17  DAYS_EMPLOYED                307511 non-null  int64
 18  DAYS_REGISTRATION            307511 non-null  float64
 19  DAYS_ID_PUBLISH              307511 non-null  int64
 20  OWN_CAR_AGE                  104582 non-null  float64
 21  FLAG_MOBIL                   307511 non-null  int64
 22  FLAG_EMP_PHONE               307511 non-null  int64
 23  FLAG_WORK_PHONE              307511 non-null  int64
 24  FLAG_CONT_MOBILE             307511 non-null  int64
 25  FLAG_PHONE                   307511 non-null  int64
 26  FLAG_EMAIL                   307511 non-null  int64
 27  OCCUPATION_TYPE              211120 non-null  object
 28  CNT_FAM_MEMBERS              307509 non-null  float64
 29  REGION_RATING_CLIENT         307511 non-null  int64
 30  REGION_RATING_CLIENT_W_CITY  307511 non-null  int64
 31  WEEKDAY_APPR_PROCESS_START   307511 non-null  object
 32  HOUR_APPR_PROCESS_START      307511 non-null  int64
 33  REG_REGION_NOT_LIVE_REGION   307511 non-null  int64
 34  REG_REGION_NOT_WORK_REGION   307511 non-null  int64
 35  LIVE_REGION_NOT_WORK_REGION  307511 non-null  int64
 36  REG_CITY_NOT_LIVE_CITY       307511 non-null  int64
 37  REG_CITY_NOT_WORK_CITY       307511 non-null  int64
 38  LIVE_CITY_NOT_WORK_CITY      307511 non-null  int64
 39  ORGANIZATION_TYPE            307511 non-null  object
 40  EXT_SOURCE_1                 134133 non-null  float64
 41  EXT_SOURCE_2                 306851 non-null  float64
 42  EXT_SOURCE_3                 246546 non-null  float64
 43  APARTMENTS_AVG               151450 non-null  float64
 44  BASEMENTAREA_AVG             127568 non-null  float64
 45  YEARS_BEGINEXPLUATATION_AVG  157504 non-null  float64
 46  YEARS_BUILD_AVG              103023 non-null  float64
 47  COMMONAREA_AVG               92646 non-null   float64
 48  ELEVATORS_AVG                143620 non-null  float64
dtypes: float64(17), int64(20), object(12)
memory usage: 115.0+ MB
# Displaying total number of rows and columns in dataset (rows,columns)
application_data.shape
(307511, 122)
# Displaying the column names
application_data.columns
Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
'AMT_CREDIT', 'AMT_ANNUITY',
...
'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
'AMT_REQ_CREDIT_BUREAU_YEAR'],
dtype='object', length=122)
# Displaying the first five rows of the dataset to view its structure and content
application_data.head()
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 122 columns
numeric = application_data.select_dtypes('number').columns
object_categorical = application_data.select_dtypes(['object', 'category']).columns
print(f'Total number of numeric columns: {len(numeric)}')
print(f'Total number of categorical columns: {len(object_categorical)}')
Total number of numeric columns: 106
Total number of categorical columns: 16
Inferences:
- Number of entries: The dataset consists of 307511 entries.
- Columns: There are 122 columns.
- Data types: There are 106 numeric columns and 16 object columns in total.
Step 3.2: Summary Statistics for Numerical Variables
# Get the summary statistics for numerical variables
application_data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | 307511.0 | 278180.518577 | 102790.175348 | 100002.0 | 189145.5 | 278202.0 | 367142.5 | 456255.0 |
| TARGET | 307511.0 | 0.080729 | 0.272419 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| CNT_CHILDREN | 307511.0 | 0.417052 | 0.722121 | 0.0 | 0.0 | 0.0 | 1.0 | 19.0 |
| AMT_INCOME_TOTAL | 307511.0 | 168797.919297 | 237123.146279 | 25650.0 | 112500.0 | 147150.0 | 202500.0 | 117000000.0 |
| AMT_CREDIT | 307511.0 | 599025.999706 | 402490.776996 | 45000.0 | 270000.0 | 513531.0 | 808650.0 | 4050000.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| AMT_REQ_CREDIT_BUREAU_DAY | 265992.0 | 0.007000 | 0.110757 | 0.0 | 0.0 | 0.0 | 0.0 | 9.0 |
| AMT_REQ_CREDIT_BUREAU_WEEK | 265992.0 | 0.034362 | 0.204685 | 0.0 | 0.0 | 0.0 | 0.0 | 8.0 |
| AMT_REQ_CREDIT_BUREAU_MON | 265992.0 | 0.267395 | 0.916002 | 0.0 | 0.0 | 0.0 | 0.0 | 27.0 |
| AMT_REQ_CREDIT_BUREAU_QRT | 265992.0 | 0.265474 | 0.794056 | 0.0 | 0.0 | 0.0 | 0.0 | 261.0 |
| AMT_REQ_CREDIT_BUREAU_YEAR | 265992.0 | 1.899974 | 1.869295 | 0.0 | 0.0 | 1.0 | 3.0 | 25.0 |
106 rows × 8 columns
Step 3.3: Summary Statistics for Categorical Variables
application_data.describe(include=['object']).T
| count | unique | top | freq | |
|---|---|---|---|---|
| NAME_CONTRACT_TYPE | 307511 | 2 | Cash loans | 278232 |
| CODE_GENDER | 307511 | 3 | F | 202448 |
| FLAG_OWN_CAR | 307511 | 2 | N | 202924 |
| FLAG_OWN_REALTY | 307511 | 2 | Y | 213312 |
| NAME_TYPE_SUITE | 306219 | 7 | Unaccompanied | 248526 |
| NAME_INCOME_TYPE | 307511 | 8 | Working | 158774 |
| NAME_EDUCATION_TYPE | 307511 | 5 | Secondary / secondary special | 218391 |
| NAME_FAMILY_STATUS | 307511 | 6 | Married | 196432 |
| NAME_HOUSING_TYPE | 307511 | 6 | House / apartment | 272868 |
| OCCUPATION_TYPE | 211120 | 18 | Laborers | 55186 |
| WEEKDAY_APPR_PROCESS_START | 307511 | 7 | TUESDAY | 53901 |
| ORGANIZATION_TYPE | 307511 | 58 | Business Entity Type 3 | 67992 |
| FONDKAPREMONT_MODE | 97216 | 4 | reg oper account | 73830 |
| HOUSETYPE_MODE | 153214 | 3 | block of flats | 150503 |
| WALLSMATERIAL_MODE | 151170 | 7 | Panel | 66040 |
| EMERGENCYSTATE_MODE | 161756 | 2 | No | 159428 |
# Calculate percentage of missing values
missing_values = application_data.isnull().sum() / len(application_data) *100
# Sort the missing-value percentages in descending order
missing_values = missing_values.sort_values(ascending=False)
missing_values.head(20)
COMMONAREA_AVG              69.872297
COMMONAREA_MODE             69.872297
COMMONAREA_MEDI             69.872297
NONLIVINGAPARTMENTS_MEDI    69.432963
NONLIVINGAPARTMENTS_MODE    69.432963
NONLIVINGAPARTMENTS_AVG     69.432963
FONDKAPREMONT_MODE          68.386172
LIVINGAPARTMENTS_AVG        68.354953
LIVINGAPARTMENTS_MEDI       68.354953
LIVINGAPARTMENTS_MODE       68.354953
FLOORSMIN_MODE              67.848630
FLOORSMIN_AVG               67.848630
FLOORSMIN_MEDI              67.848630
YEARS_BUILD_AVG             66.497784
YEARS_BUILD_MODE            66.497784
YEARS_BUILD_MEDI            66.497784
OWN_CAR_AGE                 65.990810
LANDAREA_MEDI               59.376738
LANDAREA_AVG                59.376738
LANDAREA_MODE               59.376738
dtype: float64
As we can see, many columns have more than 40% missing values, while the rest have comparatively few. Imputing columns that are mostly missing would distort the analysis, so the better way to handle them is to drop them. We will consider the remaining columns for analysis.
# Select columns with more than 40% missing values
remove_missing_column = missing_values.loc[missing_values > 40].index
# Drop the columns that has more than 40% missing values
application_data_update = application_data.drop(columns=remove_missing_column)
# Columns that have less than 40% missing values
missing_values_gt_0 = application_data_update.isnull().sum() / len(application_data) * 100
missing_values_gt_0 = missing_values_gt_0[missing_values_gt_0 > 0].sort_values(ascending=False)
missing_values_gt_0
OCCUPATION_TYPE               31.345545
EXT_SOURCE_3                  19.825307
AMT_REQ_CREDIT_BUREAU_WEEK    13.501631
AMT_REQ_CREDIT_BUREAU_MON     13.501631
AMT_REQ_CREDIT_BUREAU_YEAR    13.501631
AMT_REQ_CREDIT_BUREAU_QRT     13.501631
AMT_REQ_CREDIT_BUREAU_HOUR    13.501631
AMT_REQ_CREDIT_BUREAU_DAY     13.501631
NAME_TYPE_SUITE                0.420148
OBS_30_CNT_SOCIAL_CIRCLE       0.332021
DEF_60_CNT_SOCIAL_CIRCLE       0.332021
DEF_30_CNT_SOCIAL_CIRCLE       0.332021
OBS_60_CNT_SOCIAL_CIRCLE       0.332021
EXT_SOURCE_2                   0.214626
AMT_GOODS_PRICE                0.090403
AMT_ANNUITY                    0.003902
CNT_FAM_MEMBERS                0.000650
DAYS_LAST_PHONE_CHANGE         0.000325
dtype: float64
Step 4.2: Missing Values Treatment
Column 'OCCUPATION_TYPE'
# Plotting bar charts for visualization of data in easy manner
occupation_counts = application_data_update['OCCUPATION_TYPE'].value_counts().reset_index()
plt.figure(figsize=(10, 5))
sns.barplot(data=occupation_counts, x='OCCUPATION_TYPE', y='count', hue='OCCUPATION_TYPE')
plt.xticks(rotation=90)
plt.title("Distribution of OCCUPATION_TYPE")
plt.xlabel("Occupation Type")
plt.ylabel("Number of Clients")
sns.despine() # Remove the top and right spines from the graph
plt.tight_layout()
plt.show()
This column has ~31% missing values, so substituting a value such as the mode is not suitable here. For this reason we retain the column as it is.
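A common alternative (not applied here) is to make the missingness explicit by filling with an 'Unknown' label, which keeps every row usable in group-by analyses without guessing an occupation. A minimal sketch on toy values:

```python
import pandas as pd

# Toy occupation column with missing entries (hypothetical values)
occupation = pd.Series(['Laborers', None, 'Core staff', None, 'Drivers'])

# Turn missingness into its own category instead of dropping or imputing a guess
occupation_filled = occupation.fillna('Unknown')
print(occupation_filled.value_counts())
```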
Column 'EXT_SOURCE_3'
# print top 10 values
application_data_update['EXT_SOURCE_3'].head(10)
0    0.139376
1         NaN
2    0.729567
3         NaN
4         NaN
5    0.621226
6    0.492060
7    0.540654
8    0.751724
9         NaN
Name: EXT_SOURCE_3, dtype: float64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8,5))
plt.title("Distribution of EXT_SOURCE_3")
sns.histplot(data=application_data_update, x='EXT_SOURCE_3', bins=10, color='Salmon')
plt.tight_layout()
plt.show()
print(f"Mean: {application_data_update['EXT_SOURCE_3'].mean()}")
print(f"Median: {application_data_update['EXT_SOURCE_3'].median()}")
print(f"Mode: {application_data_update['EXT_SOURCE_3'].mode()[0]}")
Mean: 0.5108529061799657
Median: 0.5352762504724826
Mode: 0.746300213050371
The EXT_SOURCE_3 feature is a normalized external risk score ranging between 0 and 1; higher values indicate lower credit risk. It has ~19.8% missing data, so we retain the column and impute the missing values with the median. It plays a critical role in identifying risky applicants.
# Fill null values with the column median
application_data_update['EXT_SOURCE_3'] = application_data_update['EXT_SOURCE_3'].fillna(application_data_update['EXT_SOURCE_3'].median())
Column 'AMT_REQ_CREDIT_BUREAU_WEEK' Description - This feature tells us how many times a bank or other lenders checked the applicant’s credit in the week before they applied for the loan.
application_data_update['AMT_REQ_CREDIT_BUREAU_WEEK'].value_counts()
AMT_REQ_CREDIT_BUREAU_WEEK
0.0    257456
1.0      8208
2.0       199
3.0        58
4.0        34
6.0        20
5.0        10
8.0         5
7.0         2
Name: count, dtype: int64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8,5))
plt.title("Distribution of AMT_REQ_CREDIT_BUREAU_WEEK")
sns.histplot(data=application_data_update, x='AMT_REQ_CREDIT_BUREAU_WEEK', color='Salmon')
plt.show()
This shows that most clients had 0 inquiries in the last week. Because the values are nearly constant, analysing this column would yield little insight, so we ignore it.
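The same "mostly one value" check can be scripted instead of inspecting each inquiry column by hand. A sketch with a hypothetical 95% dominance threshold:

```python
import pandas as pd

def near_constant_columns(df, threshold=0.95):
    """Return columns where a single value accounts for more than
    `threshold` of the non-null observations."""
    flagged = []
    for col in df.columns:
        top_share = df[col].value_counts(normalize=True).iloc[0]
        if top_share > threshold:
            flagged.append(col)
    return flagged

# Toy frame: 'week' is dominated by zeros, 'year' is spread out
toy = pd.DataFrame({
    'week': [0.0] * 98 + [1.0, 2.0],
    'year': list(range(100)),
})
print(near_constant_columns(toy))
```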
Column 'AMT_REQ_CREDIT_BUREAU_MON' Description - This feature tells us how many times a bank or other lenders checked the applicant’s credit in the last month before they applied for the loan.
application_data_update['AMT_REQ_CREDIT_BUREAU_MON'].value_counts().head(10)
AMT_REQ_CREDIT_BUREAU_MON
0.0    222233
1.0     33147
2.0      5386
3.0      1991
4.0      1076
5.0       602
6.0       343
7.0       298
9.0       206
8.0       185
Name: count, dtype: int64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8,5))
plt.title("Distribution of AMT_REQ_CREDIT_BUREAU_MON")
sns.histplot(data=application_data_update, x='AMT_REQ_CREDIT_BUREAU_MON', color='Salmon')
plt.show()
As before, most of the values are constant, so we ignore this column as well.
Column 'AMT_REQ_CREDIT_BUREAU_YEAR' Description - This feature tells us how many times a bank or other lender checked the applicant’s credit in the year before they applied for the loan.
application_data_update['AMT_REQ_CREDIT_BUREAU_YEAR'].value_counts().head(10)
AMT_REQ_CREDIT_BUREAU_YEAR
0.0    71801
1.0    63405
2.0    50192
3.0    33628
4.0    20714
5.0    12052
6.0     6967
7.0     3869
8.0     2127
9.0     1096
Name: count, dtype: int64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8,5))
plt.title("Distribution of AMT_REQ_CREDIT_BUREAU_YEAR")
sns.histplot(data=application_data_update, x='AMT_REQ_CREDIT_BUREAU_YEAR', color='Salmon', bins=25)
plt.show()
# Calculate average (mean) of the column
print(f"Mean: {application_data_update['AMT_REQ_CREDIT_BUREAU_YEAR'].mean()}")
# Calculate center value (median) of the column
print(f"Median: {application_data_update['AMT_REQ_CREDIT_BUREAU_YEAR'].median()}")
# Calculate most frequent value (mode) of the column
print(f"Mode: {application_data_update['AMT_REQ_CREDIT_BUREAU_YEAR'].mode()[0]}")
Mean: 1.899974435321363
Median: 1.0
Mode: 0.0
As we can see from the histogram, most of the data lies between 0.0 and 3.0. The column had ~13.5% missing values. Since the most common value is 0.0, indicating no inquiries in the past year, we imputed the missing values with the mode. This approach is justified as it reflects the behavior of the majority and avoids introducing bias.
# Fill the missing values with the mode, 0.0
application_data_update['AMT_REQ_CREDIT_BUREAU_YEAR'] = application_data_update['AMT_REQ_CREDIT_BUREAU_YEAR'].fillna(0.0)
Column 'AMT_REQ_CREDIT_BUREAU_QRT' Description - This feature tells us how many times a bank or other lender checked the applicant’s credit in the three months before they applied for the loan.
application_data_update['AMT_REQ_CREDIT_BUREAU_QRT'].head(10)
0    0.0
1    0.0
2    0.0
3    NaN
4    0.0
5    1.0
6    1.0
7    0.0
8    0.0
9    NaN
Name: AMT_REQ_CREDIT_BUREAU_QRT, dtype: float64
application_data_update['AMT_REQ_CREDIT_BUREAU_QRT'].value_counts()
AMT_REQ_CREDIT_BUREAU_QRT
0.0      215417
1.0       33862
2.0       14412
3.0        1717
4.0         476
5.0          64
6.0          28
8.0           7
7.0           7
261.0         1
19.0          1
Name: count, dtype: int64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8,5))
plt.title("Distribution of AMT_REQ_CREDIT_BUREAU_QRT")
sns.histplot(data=application_data_update, x='AMT_REQ_CREDIT_BUREAU_QRT', color='Salmon')
plt.show()
As most of the values are constant and fall into one category, we ignore this column.
Column 'AMT_REQ_CREDIT_BUREAU_HOUR' Description - This feature tells us how many times a bank or other lenders checked the applicant’s credit one hour before they applied for the loan.
application_data_update['AMT_REQ_CREDIT_BUREAU_HOUR'].head()
0    0.0
1    0.0
2    0.0
3    NaN
4    0.0
Name: AMT_REQ_CREDIT_BUREAU_HOUR, dtype: float64
application_data_update['AMT_REQ_CREDIT_BUREAU_HOUR'].value_counts()
AMT_REQ_CREDIT_BUREAU_HOUR
0.0    264366
1.0      1560
2.0        56
3.0         9
4.0         1
Name: count, dtype: int64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8,5))
plt.title("Distribution of AMT_REQ_CREDIT_BUREAU_HOUR")
sns.histplot(data=application_data_update, x='AMT_REQ_CREDIT_BUREAU_HOUR', color='Salmon')
plt.show()
Again, most of the values are constant and fall into one category, so we ignore this column.
Column 'AMT_REQ_CREDIT_BUREAU_DAY' Description - This feature tells us how many times a bank or other lenders checked the applicant’s credit one day before they applied for the loan.
application_data_update['AMT_REQ_CREDIT_BUREAU_DAY'].head()
0    0.0
1    0.0
2    0.0
3    NaN
4    0.0
Name: AMT_REQ_CREDIT_BUREAU_DAY, dtype: float64
application_data_update['AMT_REQ_CREDIT_BUREAU_DAY'].value_counts()
AMT_REQ_CREDIT_BUREAU_DAY
0.0    264503
1.0      1292
2.0       106
3.0        45
4.0        26
5.0         9
6.0         8
9.0         2
8.0         1
Name: count, dtype: int64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8,5))
plt.title("Distribution of AMT_REQ_CREDIT_BUREAU_DAY")
sns.histplot(data=application_data_update, x='AMT_REQ_CREDIT_BUREAU_DAY', color='Salmon')
plt.show()
As most of the values are 0, we ignore this column as well.
Column 'NAME_TYPE_SUITE' Description - This column describes who accompanied the client during the loan application.
application_data_update['NAME_TYPE_SUITE'].head()
0    Unaccompanied
1           Family
2    Unaccompanied
3    Unaccompanied
4    Unaccompanied
Name: NAME_TYPE_SUITE, dtype: object
application_data_update['NAME_TYPE_SUITE'].value_counts(normalize=True)*100
NAME_TYPE_SUITE
Unaccompanied      81.159562
Family             13.111205
Spouse, partner     3.713029
Children            1.066884
Other_B             0.578018
Other_A             0.282804
Group of people     0.088499
Name: proportion, dtype: float64
# Here we use a countplot to see the count of values in each category
plt.figure(figsize=(8,5))
plt.title("Client Accompaniment at Loan Application")
sns.countplot(data=application_data_update, x='NAME_TYPE_SUITE', hue='NAME_TYPE_SUITE', legend=False)
plt.xlabel("Accompaniment Type")
plt.ylabel("Number of Clients")
plt.xticks(rotation=90)
plt.show()
Most clients (~81%) came unaccompanied, with smaller portions in the other categories. Missing values were filled with 'Unaccompanied', the mode (most frequent category).
application_data_update['NAME_TYPE_SUITE'].mode()
0    Unaccompanied
Name: NAME_TYPE_SUITE, dtype: object
# Fill null values with the mode value
application_data_update['NAME_TYPE_SUITE'] = application_data_update['NAME_TYPE_SUITE'].fillna('Unaccompanied')
application_data_update['OBS_30_CNT_SOCIAL_CIRCLE'].head()
0    2.0
1    1.0
2    0.0
3    2.0
4    0.0
Name: OBS_30_CNT_SOCIAL_CIRCLE, dtype: float64
application_data_update['OBS_30_CNT_SOCIAL_CIRCLE'].value_counts().head(10)
OBS_30_CNT_SOCIAL_CIRCLE
0.0    163910
1.0     48783
2.0     29808
3.0     20322
4.0     14143
5.0      9553
6.0      6453
7.0      4390
8.0      2967
9.0      2003
Name: count, dtype: int64
Columns 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE'
# We plot these four columns together because their values are closely related
columns = ['OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE', 'OBS_60_CNT_SOCIAL_CIRCLE']
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()
for idx, column in enumerate(columns):
    sns.countplot(data=application_data_update, x=column, hue=column, palette='Paired', legend=False, ax=axes[idx])
    axes[idx].set_title(f'Countplot for {column}')
    axes[idx].tick_params(axis='x', rotation=90)
plt.tight_layout()
plt.show()
# Compute the correlation of the columns below with the TARGET variable
result = application_data_update[['OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE',
'OBS_60_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'TARGET']].corr()
# Plot correlation with heatmap
plt.figure(figsize=(8,5))
plt.title('Correlation with TARGET')
sns.heatmap(result, annot=True, cmap='Reds')
plt.tight_layout()
plt.show()
As the heatmap shows, these four columns have no strong correlation with the TARGET variable, so to avoid unnecessary complexity we do not analyse them further.
Column 'EXT_SOURCE_2'
application_data_update['EXT_SOURCE_2'].head()
0    0.262949
1    0.622246
2    0.555912
3    0.650442
4    0.322738
Name: EXT_SOURCE_2, dtype: float64
application_data_update['EXT_SOURCE_2'].value_counts()
EXT_SOURCE_2
0.285898 721
0.262258 417
0.265256 343
0.159679 322
0.265312 306
...
0.353855 1
0.200589 1
0.146779 1
0.288490 1
0.269471 1
Name: count, Length: 119831, dtype: int64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8,5))
plt.title("Distribution of EXT_SOURCE_2")
sns.histplot(application_data_update['EXT_SOURCE_2'], bins=10, kde=True, color='Salmon')
plt.tight_layout()
plt.show()
print(f"Mean: {application_data_update['EXT_SOURCE_2'].mean()}")
print(f"Median: {application_data_update['EXT_SOURCE_2'].median()}")
print(f"Mode: {application_data_update['EXT_SOURCE_2'].mode()[0]}")
Mean: 0.5143926741308464
Median: 0.5659614260608526
Mode: 0.2858978721410488
The EXT_SOURCE_2 feature is a normalized external risk score ranging between 0 and 1; higher values indicate lower credit risk. It has ~0.21% missing data, so we retain the column and impute the missing values with the median.
# Fill null values with the median of 'EXT_SOURCE_2'
application_data_update['EXT_SOURCE_2'] = application_data_update['EXT_SOURCE_2'].fillna(application_data_update['EXT_SOURCE_2'].median())
Column 'AMT_GOODS_PRICE' Description - The column represents the price of goods for which the loan was issued.
application_data_update['AMT_GOODS_PRICE'].head()
0     351000.0
1    1129500.0
2     135000.0
3     297000.0
4     513000.0
Name: AMT_GOODS_PRICE, dtype: float64
application_data_update['AMT_GOODS_PRICE'].value_counts()
AMT_GOODS_PRICE
450000.0 26022
225000.0 25282
675000.0 24962
900000.0 15416
270000.0 11428
...
1305706.5 1
2155500.0 1
113724.0 1
171468.0 1
559836.0 1
Name: count, Length: 1002, dtype: int64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8, 5))
plt.title('Distribution of AMT_GOODS_PRICE')
sns.histplot(data=application_data_update, x='AMT_GOODS_PRICE', bins=35, kde=True, color='Salmon')
plt.xlabel('Goods Price')
plt.ylabel('Number of Clients')
plt.tight_layout()
plt.show()
Most values are concentrated in a few specific tiers, likely reflecting standard loan packages, so we impute the missing values with the median. This variable may provide insights when analyzed in relation to income and loan amount.
# Fill null values with the median
application_data_update['AMT_GOODS_PRICE'] = application_data_update['AMT_GOODS_PRICE'].fillna(application_data_update['AMT_GOODS_PRICE'].median())
Column 'AMT_ANNUITY' Description - The annuity (installment) amount required to repay the loan
application_data_update['AMT_ANNUITY'].value_counts().head(5)
AMT_ANNUITY
9000.0     6385
13500.0    5514
6750.0     2279
10125.0    2035
37800.0    1602
Name: count, dtype: int64
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8, 5))
plt.title('Distribution of AMT_ANNUITY')
sns.histplot(data=application_data_update, x='AMT_ANNUITY', bins=30, kde=True, color='Salmon')
plt.xlabel('Annuity Amount')
plt.tight_layout()
plt.show()
As the above histogram shows, the values cluster around a few common amounts, so we impute the missing values with the median.
# Fill null values with median of 'AMT_ANNUITY'
application_data_update['AMT_ANNUITY'].fillna(application_data_update['AMT_ANNUITY'].median(), inplace=True)
Column 'CNT_FAM_MEMBERS' Description - Number of family members the client has
application_data_update['CNT_FAM_MEMBERS'].value_counts().head(5)
CNT_FAM_MEMBERS 2.0 158357 1.0 67847 3.0 52601 4.0 24697 5.0 3478 Name: count, dtype: int64
# Use a countplot to see the count of each family-size category
plt.figure(figsize=(8,5))
plt.title('Distribution of CNT_FAM_MEMBERS')
sns.countplot(data=application_data_update, x='CNT_FAM_MEMBERS', palette='Paired')
plt.xlabel('Count of Family Members')
plt.ylabel('Number of Applicants')
plt.xticks(rotation=90)
plt.show()
Most applicants have 2.0 family members, and the column's missing-value percentage is only 0.000650, which is very low. So we impute the missing values with the median.
# Fill null values with median
application_data_update['CNT_FAM_MEMBERS'].fillna(application_data_update['CNT_FAM_MEMBERS'].median(), inplace=True)
Column 'DAYS_LAST_PHONE_CHANGE' Description - How many days before the application the client changed their phone number.
application_data_update['DAYS_LAST_PHONE_CHANGE'].value_counts().head(10)
DAYS_LAST_PHONE_CHANGE 0.0 37672 -1.0 2812 -2.0 2318 -3.0 1763 -4.0 1285 -5.0 824 -6.0 537 -7.0 442 -8.0 278 -476.0 222 Name: count, dtype: int64
# Convert the values from days to years
application_data_update['DAYS_LAST_PHONE_CHANGE(In Year)'] = -round(application_data_update['DAYS_LAST_PHONE_CHANGE'] / 365, 0)
# Replace -0.0 to 0.0
application_data_update['DAYS_LAST_PHONE_CHANGE(In Year)'].replace(-0.0, 0.0, inplace=True)
application_data_update['DAYS_LAST_PHONE_CHANGE(In Year)'].value_counts()
DAYS_LAST_PHONE_CHANGE(In Year) 0.0 64176 1.0 58998 2.0 48141 4.0 34677 3.0 32321 5.0 31080 6.0 16620 7.0 12631 8.0 5300 9.0 2986 10.0 509 11.0 70 12.0 1 Name: count, dtype: int64
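The conversion logic above can be sanity-checked on a few toy day counts (invented values, not from the dataset): the DAYS_* columns are zero or negative because they count days *before* the application date, so negating the rounded quotient yields a positive year count, and the replace step normalizes the `-0.0` artifact that negation produces.

```python
import pandas as pd

# Toy sketch of the days-to-years conversion used above
days = pd.Series([0.0, -476.0, -1.0, -730.0])
years = -round(days / 365, 0)          # negate to get positive year counts
years = years.replace(-0.0, 0.0)       # 0 days negates to -0.0; normalize it

print(years.tolist())  # [0.0, 1.0, 0.0, 2.0]
```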
# Here we use histogram to see distribution of data and frequency of data points.
plt.figure(figsize=(8,5))
plt.title('Distribution of DAYS_LAST_PHONE_CHANGE(In Year)')
sns.histplot(data=application_data_update, x='DAYS_LAST_PHONE_CHANGE(In Year)', bins=13, legend=False, color='Salmon')
plt.xlabel('Years')
plt.ylabel('Number of Applicants')
plt.tight_layout()
plt.show()
print(f"Median: {application_data_update['DAYS_LAST_PHONE_CHANGE(In Year)'].median()}")
print(f"Mode: {application_data_update['DAYS_LAST_PHONE_CHANGE(In Year)'].mode()[0]}")
Median: 2.0 Mode: 0.0
As the histogram shows, most of the data lies in the first three values (0.0, 1.0, 2.0). The percentage of missing values is 0.000325, which is negligible and will not noticeably affect our analysis, but as good practice we drop the rows with missing values.
application_data_update = application_data_update.dropna(subset=['DAYS_LAST_PHONE_CHANGE(In Year)'])
# check missing values percentage of dataset
(application_data_update.isnull().sum() / len(application_data_update) * 100).sort_values(ascending=False).head(20)
OCCUPATION_TYPE 31.345322 AMT_REQ_CREDIT_BUREAU_WEEK 13.501350 AMT_REQ_CREDIT_BUREAU_MON 13.501350 AMT_REQ_CREDIT_BUREAU_DAY 13.501350 AMT_REQ_CREDIT_BUREAU_QRT 13.501350 AMT_REQ_CREDIT_BUREAU_HOUR 13.501350 OBS_60_CNT_SOCIAL_CIRCLE 0.332022 OBS_30_CNT_SOCIAL_CIRCLE 0.332022 DEF_60_CNT_SOCIAL_CIRCLE 0.332022 DEF_30_CNT_SOCIAL_CIRCLE 0.332022 AMT_ANNUITY 0.000000 AMT_CREDIT 0.000000 AMT_INCOME_TOTAL 0.000000 CNT_CHILDREN 0.000000 FLAG_OWN_REALTY 0.000000 FLAG_OWN_CAR 0.000000 CODE_GENDER 0.000000 NAME_CONTRACT_TYPE 0.000000 TARGET 0.000000 SK_ID_CURR 0.000000 dtype: float64
# Drop the unnecessary columns that we will not analyse
application_data_update = application_data_update.drop(columns=[
'AMT_REQ_CREDIT_BUREAU_QRT',
'AMT_REQ_CREDIT_BUREAU_HOUR',
'AMT_REQ_CREDIT_BUREAU_DAY',
'AMT_REQ_CREDIT_BUREAU_WEEK',
'AMT_REQ_CREDIT_BUREAU_MON',
'OBS_30_CNT_SOCIAL_CIRCLE',
'DEF_60_CNT_SOCIAL_CIRCLE',
'DEF_30_CNT_SOCIAL_CIRCLE',
'OBS_60_CNT_SOCIAL_CIRCLE'
])
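As an alternative to the hand-written column list above, the same missing-percentage metric can drive an automated drop. This is a sketch on a hypothetical mini-frame (column names and the 50% threshold are invented for illustration):

```python
import numpy as np
import pandas as pd

# Toy frame: drop any column whose missing share exceeds a threshold
df = pd.DataFrame({
    'keep_full': [1, 2, 3, 4],
    'keep_sparse': [1, np.nan, 3, 4],            # 25% missing -> keep
    'drop_sparse': [np.nan, np.nan, np.nan, 4],  # 75% missing -> drop
})
missing_pct = df.isnull().mean() * 100           # same metric as the check above
df = df.drop(columns=missing_pct[missing_pct > 50].index)

print(list(df.columns))  # ['keep_full', 'keep_sparse']
```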
Step 5 : Value Standardization¶
Description - In this step we transformed columns from days to years for better readability and interpretation. We also renamed the corresponding column headers to reflect the updated units.
columns_to_change = ['DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH']
for column in columns_to_change:
application_data_update[column+'(In Year)'] = -round(application_data_update[column] / 365, 0) # convert days to years by dividing by 365 and store the result in a new column
# Drop the original day-based columns that are no longer relevant to our analysis
application_data_update = application_data_update.drop(columns=[
'DAYS_BIRTH',
'DAYS_EMPLOYED',
'DAYS_REGISTRATION',
'DAYS_ID_PUBLISH',
'DAYS_LAST_PHONE_CHANGE'
])
Step 6 : Univariate Analysis¶
We undertake univariate analysis on the dataset's features, based on their datatype:
For numerical/continuous data: We employ histograms to gain insight into the distribution of each feature. This allows us to understand the central tendency, spread, and shape of the dataset's distribution.
For categorical data: Bar plots are utilized to visualize the frequency of each category. This provides a clear representation of the prominence of each category within the respective feature.
By employing these visualization techniques, we're better positioned to understand the individual characteristics of each feature in the dataset.
Step 6.1 : Univariate Analysis of Numerical Variables¶
numeric_columns = application_data_update.select_dtypes(include=['number'])
columns = numeric_columns.columns
# Plot histograms in subplots
cols = 3
rows = int(np.ceil(len(columns) / cols))
fig, axes = plt.subplots(rows, cols, figsize=(16, rows * 3))
axes = axes.flatten()
for i, col in enumerate(columns[2:]): # skip SK_ID_CURR and TARGET
sns.histplot(numeric_columns[col], kde=True, bins=30, color='skyblue', ax=axes[i])
axes[i].set_title(f'Distribution of {col}')
axes[i].set_xlabel('')
axes[i].set_ylabel('Frequency')
# Remove unused subplots
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.suptitle("Histograms of Numeric Columns", fontsize=18, y=1.02)
plt.show()
Inferences:¶
CNT_CHILDREN: Most clients have no children, and the count drops sharply with 1 or more children. A few records with very high numbers such as 19 are likely outliers.
AMT_INCOME_TOTAL: Highly right-skewed distribution. Most applicants earn below 500,000, but extreme values reach 117 million, indicating possible outliers.
AMT_CREDIT: Also right-skewed. Common loan amounts appear around 270,000 to 600,000, but it extends to over 4 million.
AMT_ANNUITY: Most annuity amounts are in the range of 10,000 to 40,000, with a sharp peak. A long right tail hints at some high annuity obligations.
EXT_SOURCE_2: The distribution is fairly normal-shaped, indicating this normalized score is well-behaved.
EXT_SOURCE_3: Similar to EXT_SOURCE_2, this score shows a near-normal shape, though slightly more skewed.
DAYS_BIRTH (In Year): The majority of applicants are between 30 and 60 years old, with a slightly left-skewed distribution. This can be a strong feature: younger applicants may be riskier.
DAYS_EMPLOYED (In Year): Most valid employment durations are under 50 years, with the bulk between 0 and 10 years.
REGION_POPULATION_RELATIVE: Heavily right-skewed. Most applicants are from less populated regions. A few regions have much higher population densities.
CNT_FAM_MEMBERS: Most applicants belong to 2-3 member families. Very large family sizes (10+) are rare and may influence credit behavior.
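The "right-skewed" readings above can be backed with a number: pandas' `.skew()` returns a positive value for a long right tail. A quick sketch on an invented income-like series (not the project data):

```python
import pandas as pd

# Toy series mimicking a skewed income distribution: many moderate
# values plus a couple of extreme earners pulling the tail right
incomes = pd.Series([100_000] * 50 + [150_000] * 30 + [2_000_000] * 2)

print(round(incomes.skew(), 2))  # positive => right-skewed
```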
Step 6.2 : Univariate Analysis of Categorical Variables¶
categorical_columns = application_data_update.select_dtypes(include=['object'])
cat_columns = categorical_columns.columns
# With the help of countplot we analyse the categorical variables
# Number of rows and columns for subplots
cols = 2
rows = math.ceil(len(cat_columns) / cols)
# Plot setup with subplots
fig, axes = plt.subplots(rows, cols, figsize=(20, rows * 7))
axes = axes.flatten()
# Plot countplot
for i, col in enumerate(cat_columns):
ax = axes[i]
total = categorical_columns[col].notna().sum()
sns.countplot(data=categorical_columns, x=col, ax=ax, hue=col, legend=False)
ax.set_title(col, fontsize=10)
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('')
ax.set_ylabel('Count')
# Add % labels on top of bar
for p in ax.patches:
count = int(p.get_height())
percentage = 100 * count / total
ax.text(p.get_x() + p.get_width()/2., p.get_height() + 500,
f'{percentage:.1f}%', ha='center', fontsize=8)
# Remove unused axes
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.suptitle('Countplots of Categorical Variable', fontsize=18, y=1.02)
plt.show()
Inferences:¶
NAME_CONTRACT_TYPE: 90.5% of applicants have Cash loans and only 9.5% are Revolving loans. Cash loans dominate the dataset and may represent larger or one-time financing.
CODE_GENDER: 65.8% of applicants are female, and 34.2% are male. A rare category XNA (0.01%) is likely a placeholder or anomaly.
FLAG_OWN_REALTY: 69.4% own real estate, while 30.6% do not. This may suggest better creditworthiness or asset security among a majority
FLAG_OWN_CAR: 66% do not own a car, while 34% do. Car ownership might relate to financial stability
NAME_TYPE_SUITE: 81.2% of clients were unaccompanied, followed by 13.1% with family and others.
NAME_INCOME_TYPE: The majority are Working (51.6%), followed by Commercial associates (23.3%) and Pensioners (18%). Very few are unemployed or students, possibly filtered out due to low credit approval chances.
NAME_EDUCATION_TYPE: 71% have secondary education, followed by 24.3% with higher education. Few have lower secondary education, and fewer still hold academic degrees; education may influence income and, indirectly, default risk.
NAME_FAMILY_STATUS: Most applicants are married (~64%), followed by single (14.8%). Other statuses like widowed or civil marriage are rare. Marital status might correlate with loan behavior or stability.
NAME_HOUSING_TYPE: Dominated by those living in House / Apartment (88.7%). A few use With parents, Municipal apartment. It might indicate applicant stability or urban/rural status.
OCCUPATION_TYPE: Laborers (26.1%), Sales staff (15.2%), and Core staff (13.1%) are the most common, whereas IT staff, HR staff, Secretaries, and Realty agents are the least frequent.
Step 7 : Correlation¶
numeric_columns = application_data_update.select_dtypes(include=['number'])
columns = numeric_columns.columns
# Compute correlation with 'TARGET'
target_corr = numeric_columns.corr()['TARGET'].drop('TARGET') # Drop TARGET correlation with self
# Convert into DataFrame for heatmap
target_corr_df = pd.DataFrame(target_corr).sort_values(by='TARGET', ascending=False)
plt.figure(figsize=(6, len(target_corr_df) // 2))
sns.heatmap(target_corr_df, annot=True, cmap='coolwarm', linewidths=0.5, fmt=".2f")
plt.title("Correlation of Features with TARGET")
plt.tight_layout()
plt.show()
As the correlation heatmap above shows, columns such as FLAG_DOCUMENT_2 through FLAG_DOCUMENT_21 contribute little to our analysis and have near-zero correlation with the TARGET variable, so to simplify the dataset we drop these columns.
# Drop all FLAG_DOCUMENT_* columns to reduce complexity in the dataset
document_columns = [col for col in application_data_update.columns if col.startswith('FLAG_DOCUMENT')]
application_data_update.drop(columns=document_columns, inplace=True)
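The same drop could be automated by correlation strength instead of a name prefix. A sketch on synthetic data (the 0.1 cutoff and both column names are illustrative, not from the notebook): discard numeric columns whose absolute correlation with TARGET falls below the cutoff.

```python
import numpy as np
import pandas as pd

# Synthetic frame: one genuinely correlated column, one noise column
rng = np.random.default_rng(0)
n = 5000
df = pd.DataFrame({'TARGET': rng.integers(0, 2, n)})
df['signal'] = df['TARGET'] + rng.normal(0, 0.5, n)  # correlated with TARGET
df['noise'] = rng.normal(0, 1, n)                    # ~zero correlation

corr = df.corr()['TARGET'].drop('TARGET').abs()
weak_cols = corr[corr < 0.1].index.tolist()          # candidates to drop

print(weak_cols)  # only the noise column falls below the cutoff
```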
Step 8 : Outliers Treatment¶
numeric_columns = application_data_update.select_dtypes(include=['number'])
columns = numeric_columns.columns
desc = numeric_columns.describe().T
desc.head()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| SK_ID_CURR | 307510.0 | 278181.038399 | 102789.938288 | 100002.0 | 189146.25 | 278202.5 | 367142.75 | 456255.0 |
| TARGET | 307510.0 | 0.080729 | 0.272419 | 0.0 | 0.00 | 0.0 | 0.00 | 1.0 |
| CNT_CHILDREN | 307510.0 | 0.417053 | 0.722122 | 0.0 | 0.00 | 0.0 | 1.00 | 19.0 |
| AMT_INCOME_TOTAL | 307510.0 | 168798.058473 | 237123.519274 | 25650.0 | 112500.00 | 147150.0 | 202500.00 | 117000000.0 |
| AMT_CREDIT | 307510.0 | 599027.041665 | 402491.016691 | 45000.0 | 270000.00 | 513531.0 | 808650.00 | 4050000.0 |
# Set the number of columns per row in the plot grid
cols_per_row = 3
total_plots = len(columns)
rows = total_plots // cols_per_row + int(total_plots % cols_per_row != 0)
# Create the subplots
plt.figure(figsize=(18, rows * 4))
for i, col in enumerate(numeric_columns):
plt.subplot(rows, cols_per_row, i + 1)
sns.boxplot(x=application_data_update[col], color='teal')
plt.xlabel('')
plt.title(col)
plt.tight_layout()
plt.suptitle("Boxplots for Numerical Columns", fontsize=18, y=1.02)
plt.show()
Inference¶
- CNT_CHILDREN
- Maximum count of children is 19, but 75% of applicants have at most 1 child
- Capping the outliers
- AMT_INCOME_TOTAL
- Max = 117,000,000, far above the median of 147,150
- AMT_ANNUITY
- Max = 258,025, much larger than the 75th percentile of 34,596
- FLAG_MOBIL
- Min, Max, and Mean are all 1, which means the column is constant.
# Handle outliers of CNT_CHILDREN column
application_data_update['CNT_CHILDREN'] = application_data_update['CNT_CHILDREN'].clip(upper=5) # It will replace values above 5 with 5
# Handle outliers of AMT_INCOME_TOTAL
cap = application_data_update['AMT_INCOME_TOTAL'].quantile(0.99) # cap outliers at the 99th percentile
application_data_update['AMT_INCOME_TOTAL'] = application_data_update['AMT_INCOME_TOTAL'].clip(upper=cap)
application_data_update['AMT_INCOME_TOTAL'].value_counts().sort_index(ascending=False)
AMT_INCOME_TOTAL
472500.0 3094
469800.0 1
468823.5 1
468000.0 10
466956.0 1
...
27000.0 66
26550.0 2
26460.0 1
26100.0 3
25650.0 2
Name: count, Length: 2370, dtype: int64
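The two capping styles used here, a fixed domain cap for counts and a percentile cap for skewed amounts, can be sanity-checked on toy numbers (invented, not from the dataset):

```python
import pandas as pd

# Fixed cap: values above 5 are replaced with 5
children = pd.Series([0, 1, 2, 7, 19])
children = children.clip(upper=5)

# Percentile cap: one extreme outlier is pulled down to the 99th percentile
income = pd.Series(list(range(1, 100)) + [10_000])
cap = income.quantile(0.99)
income = income.clip(upper=cap)

print(children.max(), income.max())  # capped maxima
```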
# Handle outliers of 'AMT_ANNUITY'
application_data_update['AMT_ANNUITY'] = np.log1p(application_data_update['AMT_ANNUITY']) # log1p compresses the long right tail, reducing the impact of extreme annuity values
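Two properties make log1p suitable here, shown on toy annuity values (invented): it is defined at 0 (unlike plain log), it is exactly invertible via `np.expm1`, and it sharply shrinks the spread between small and large values.

```python
import numpy as np

# Toy annuity amounts spanning a wide range, including zero
annuity = np.array([0.0, 9_000.0, 37_800.0, 258_025.0])
logged = np.log1p(annuity)

print(np.allclose(np.expm1(logged), annuity))            # round-trips exactly
print(logged[-1] / logged[1] < annuity[-1] / annuity[1]) # tail compressed
```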
# Drop the 'FLAG_MOBIL' column: it is constant, so it provides no insight for analysis
application_data_update.drop(columns='FLAG_MOBIL', inplace=True)
Step 9 : Merging Dataset¶
previous_application = pd.read_csv("C:/Users/ASUS/OneDrive/Desktop/Datasets/Credit Risk Analysis Dataset/previous_application.csv")
previous_train = application_data_update.merge(previous_application, on='SK_ID_CURR', how='inner')
previous_train.shape
(1413701, 79)
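The row count grows from ~307k applications to ~1.4M rows because SK_ID_CURR is unique in the application table but repeats in previous_application, so the inner merge emits one row per previous loan. A minimal sketch with toy frames (invented IDs and amounts); `validate='one_to_many'` makes the expected relationship explicit:

```python
import pandas as pd

# Toy one-to-many merge: client 1 has three previous loans, client 2 has one
apps = pd.DataFrame({'SK_ID_CURR': [1, 2], 'TARGET': [0, 1]})
prev = pd.DataFrame({'SK_ID_CURR': [1, 1, 1, 2],
                     'AMT_APPLICATION': [10, 20, 30, 40]})
merged = apps.merge(prev, on='SK_ID_CURR', how='inner', validate='one_to_many')

print(merged.shape)  # (4, 3): one row per previous loan
```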
previous_train.head()
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE_x | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT_x | AMT_ANNUITY_x | ... | NAME_SELLER_INDUSTRY | CNT_PAYMENT | NAME_YIELD_GROUP | PRODUCT_COMBINATION | DAYS_FIRST_DRAWING | DAYS_FIRST_DUE | DAYS_LAST_DUE_1ST_VERSION | DAYS_LAST_DUE | DAYS_TERMINATION | NFLAG_INSURED_ON_APPROVAL | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 10.114619 | ... | Auto technology | 24.0 | low_normal | POS other with interest | 365243.0 | -565.0 | 125.0 | -25.0 | -17.0 | 0.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 10.482892 | ... | XNA | 12.0 | low_normal | Cash X-Sell: low | 365243.0 | -716.0 | -386.0 | -536.0 | -527.0 | 1.0 |
| 2 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 10.482892 | ... | Furniture | 6.0 | middle | POS industry with interest | 365243.0 | -797.0 | -647.0 | -647.0 | -639.0 | 0.0 |
| 3 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 10.482892 | ... | Consumer electronics | 12.0 | middle | POS household with interest | 365243.0 | -2310.0 | -1980.0 | -1980.0 | -1976.0 | 1.0 |
| 4 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 8.817446 | ... | Connectivity | 4.0 | middle | POS mobile without interest | 365243.0 | -784.0 | -694.0 | -724.0 | -714.0 | 0.0 |
5 rows × 79 columns
# see the list of columns in 'previous_train'
previous_train.columns
Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE_x', 'CODE_GENDER',
'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
'AMT_CREDIT_x', 'AMT_ANNUITY_x', 'AMT_GOODS_PRICE_x',
'NAME_TYPE_SUITE_x', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'REGION_POPULATION_RELATIVE',
'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE',
'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS',
'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
'WEEKDAY_APPR_PROCESS_START_x', 'HOUR_APPR_PROCESS_START_x',
'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
'ORGANIZATION_TYPE', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
'AMT_REQ_CREDIT_BUREAU_YEAR', 'DAYS_LAST_PHONE_CHANGE(In Year)',
'DAYS_BIRTH(In Year)', 'DAYS_EMPLOYED(In Year)',
'DAYS_REGISTRATION(In Year)', 'DAYS_ID_PUBLISH(In Year)', 'SK_ID_PREV',
'NAME_CONTRACT_TYPE_y', 'AMT_ANNUITY_y', 'AMT_APPLICATION',
'AMT_CREDIT_y', 'AMT_DOWN_PAYMENT', 'AMT_GOODS_PRICE_y',
'WEEKDAY_APPR_PROCESS_START_y', 'HOUR_APPR_PROCESS_START_y',
'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY',
'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY',
'RATE_INTEREST_PRIVILEGED', 'NAME_CASH_LOAN_PURPOSE',
'NAME_CONTRACT_STATUS', 'DAYS_DECISION', 'NAME_PAYMENT_TYPE',
'CODE_REJECT_REASON', 'NAME_TYPE_SUITE_y', 'NAME_CLIENT_TYPE',
'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE',
'CHANNEL_TYPE', 'SELLERPLACE_AREA', 'NAME_SELLER_INDUSTRY',
'CNT_PAYMENT', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
'DAYS_LAST_DUE', 'DAYS_TERMINATION', 'NFLAG_INSURED_ON_APPROVAL'],
dtype='object')
Step 10 : Bivariate Analysis¶
Step 10.1 : Categorical columns vs TARGET¶
# Categorical columns for analysis
categorical_cols = ['CODE_GENDER', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
'NAME_HOUSING_TYPE', 'NAME_CONTRACT_STATUS',
'NAME_FAMILY_STATUS', 'OCCUPATION_TYPE']
# Grid setup
cols = 2
rows = math.ceil(len(categorical_cols) / cols)
fig, axes = plt.subplots(rows, cols, figsize=(14, rows * 5))
axes = axes.flatten()
# Plotting each subplot
for i, col in enumerate(categorical_cols):
sns.countplot(data=previous_train, x=col, hue='TARGET', ax=axes[i])
axes[i].set_title(f'{col} vs Loan Default')
axes[i].set_xlabel('')
axes[i].set_ylabel('Count')
axes[i].tick_params(axis='x', rotation=90)
axes[i].legend(title='TARGET')
# Hide extra axes if any
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.suptitle('Bivariate Analysis: Categorical Columns vs Loan Default', fontsize=16, y=1.02)
plt.show()
Inference:¶
CODE_GENDER: Female applicants make up the majority and also contribute more defaults in absolute terms; this is likely due to higher application volume rather than higher risk.
NAME_INCOME_TYPE: Applicants who are 'Working' contribute to the highest number of defaults, followed by 'Commercial associate' and 'Pensioner'. Risk assessment should consider not just income, but income stability. Commercial associates may include self-employed or irregular earners, which could explain higher risk. Pensioners, despite being retired, may represent a lower risk due to predictable income streams.
NAME_EDUCATION_TYPE: Most applicants have 'Secondary education', which also corresponds to most defaults. Higher education correlates with lower defaults, suggesting a link between education level and financial responsibility. We can use education as a soft risk indicator and possibly offer better terms to highly educated clients.
NAME_HOUSING_TYPE: 'House/apartment' is the dominant housing type among applicants. Housing type alone may not be a strong risk indicator, but those renting may indicate less asset security, which could slightly increase risk.
NAME_CONTRACT_STATUS: Most defaults occur under the 'Approved' status, as expected, since only granted loans can default. 'Refused' or 'Canceled' applications do not contribute to default risk.
NAME_FAMILY_STATUS: Married applicants are the largest group and also account for most defaults.
OCCUPATION_TYPE: Laborers, sales staff, and drivers show higher counts of defaults. These occupations may correlate with lower or unstable income, higher physical job risks, or lower financial literacy.
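Count plots show volume, not risk: a group can top the default *count* simply because it applies more often. Normalizing within each category gives the default *rate*, which is the better risk signal. A sketch on invented toy data (occupations and numbers are illustrative only):

```python
import pandas as pd

# Toy frame: Laborers dominate by volume, but their default RATE is lower
df = pd.DataFrame({
    'OCCUPATION_TYPE': ['Laborers'] * 80 + ['IT staff'] * 20,
    'TARGET': [1] * 8 + [0] * 72 + [1] * 4 + [0] * 16,
})
rates = df.groupby('OCCUPATION_TYPE')['TARGET'].mean().sort_values(ascending=False)

print(rates)  # IT staff: 0.20, Laborers: 0.10 despite fewer total defaults
```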
# Create function that create 4 graphs for bivariate analysis
def plotting(column, hue, data):
"""
Plots 4 visualizations for the relationship between two categorical columns:
- Pie chart of `column`
- Bar chart: % of hue categories within `column`
- Countplot of hue split by column
- Countplot of column split by hue
"""
col = column
# Create figure
fig = plt.figure(figsize=(14, 14))
# Subplot 1: Pie Chart of column
ax1 = plt.subplot(221)
data[col].value_counts().plot.pie(autopct="%1.0f%%", startangle=90, pctdistance=0.85, textprops={'fontsize': 8}, ax=ax1)
plt.title(f'Distribution of values for: {column}')
ax1.set_ylabel('') # Remove ylabel
# Subplot 2: % Distribution of hue within each column category
ax2 = plt.subplot(222)
dist = pd.crosstab(data[col], data[hue], normalize='index') * 100
dist.plot(kind='bar', stacked=True, ax=ax2)
plt.title(f'Percentage of {hue} within each {column}')
plt.xticks(rotation=90)
plt.xlabel('')
# Subplot 3: Countplot of hue per column category
ax3 = plt.subplot(223)
sns.countplot(data=data, x=col, hue=hue, ax=ax3)
plt.xticks(rotation=90)
plt.xlabel('')
plt.title(f'{hue} count per {col}')
# Subplot 4: Countplot of column per hue category
ax4 = plt.subplot(224)
sns.countplot(data=data, x=hue, hue=col, ax=ax4)
plt.xticks(rotation=90)
plt.xlabel('')
plt.title(f'{col} count per {hue}')
plt.tight_layout()
plt.show()
# Plot graph for 'CODE_GENDER' and 'NAME_CONTRACT_STATUS'
plotting('CODE_GENDER', 'NAME_CONTRACT_STATUS', data=previous_train)
Observation:
Female applicants account for the vast majority of loan applications, making up 68% of the total, compared to 32% for males. While females secure the highest number of approved loans in absolute terms, this is primarily due to their higher application volume. Interestingly, male applicants exhibit a slightly higher approval rate than females, who, in turn, experience a marginally higher rate of loan cancellations and refusals. The "Gender 'XNA'" category is negligible.
# Plot graph for 'NAME_INCOME_TYPE' and 'NAME_CONTRACT_STATUS'
plotting('NAME_INCOME_TYPE', 'NAME_CONTRACT_STATUS', data=previous_train)
Observation:
The majority of loan applicants (over 51%) are working professionals, followed by commercial associates (23%) and pensioners (19%). While working professionals account for the highest volume of approved loans, state servants demonstrate the most favorable approval rate relative to their application volume. Conversely, unemployed individuals and those on maternity leave face significantly higher rates of loan cancellation and rejection, despite being smaller applicant groups. Commercial associates and pensioners show a more balanced distribution of approvals and rejections, indicating varied risk profiles within these segments.
# Plot graph for 'NAME_EDUCATION_TYPE' and 'NAME_CONTRACT_STATUS'
plotting('NAME_EDUCATION_TYPE', 'NAME_CONTRACT_STATUS', data=previous_train)
Observation:
The majority of loan applicants (around 73%) have Secondary / secondary special education, making this group central to loan activity. While this segment accounts for most approved, canceled, and refused loans due to their sheer volume, applicants with Academic and Higher education demonstrate the highest approval rates, suggesting they are lower-risk. Conversely, individuals with Lower or Incomplete education experience more cancellations or rejections.
# Plot graph for 'NAME_HOUSING_TYPE' and 'NAME_CONTRACT_STATUS'
plotting('NAME_HOUSING_TYPE', 'NAME_CONTRACT_STATUS', data=previous_train)
Observation:
The overwhelming majority of loan applicants (nearly 89%) reside in a "House/apartment," indicating a strong prevalence of stable housing situations within the applicant pool. The next largest segments are individuals living with parents (4%) and renters (4%). Interestingly, loan approval rates remain relatively consistent across all housing types, suggesting that housing status alone does not significantly influence lending decisions. Due to their sheer volume, house/apartment residents naturally account for the highest absolute numbers of both loan approvals and refusals.
# Plot graph for 'NAME_FAMILY_STATUS' and 'NAME_CONTRACT_STATUS'
plotting('NAME_FAMILY_STATUS', 'NAME_CONTRACT_STATUS', data=previous_train)
Observation:
Married applicants constitute the largest demographic in loan applications, making up 64% of the dataset. Following them are single individuals (13%) and those in a civil marriage (10%). Loan approval rates are notably consistent across all family statuses, hovering around 60%. Due to their significant presence, married individuals account for the highest volume of both approved and refused loans.
# Plot graph for 'OCCUPATION_TYPE' and 'NAME_CONTRACT_STATUS'
plotting('OCCUPATION_TYPE', 'NAME_CONTRACT_STATUS', data=previous_train)
Observation:
Laborers form the largest occupational group among loan applicants at 26%, followed by sales staff (16%) and core staff (12%). Core staff, sales staff, and high-skill tech staff consistently exhibit the highest loan approval rates, all exceeding 60%. Conversely, laborers and drivers face slightly higher refusal rates compared to applicants in technical or managerial positions. Occupations with minimal representation, such as HR staff and secretaries, have a negligible impact on overall loan volume and approval trends.
Step 10.2 : Numerical columns vs TARGET¶
numeric_columns_prev = previous_train.select_dtypes(include=['number'])
num_columns = numeric_columns_prev.columns
num_columns
Index(['SK_ID_CURR', 'TARGET', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
'AMT_CREDIT_x', 'AMT_ANNUITY_x', 'AMT_GOODS_PRICE_x',
'REGION_POPULATION_RELATIVE', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS',
'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY',
'HOUR_APPR_PROCESS_START_x', 'REG_REGION_NOT_LIVE_REGION',
'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION',
'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY',
'LIVE_CITY_NOT_WORK_CITY', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
'AMT_REQ_CREDIT_BUREAU_YEAR', 'DAYS_LAST_PHONE_CHANGE(In Year)',
'DAYS_BIRTH(In Year)', 'DAYS_EMPLOYED(In Year)',
'DAYS_REGISTRATION(In Year)', 'DAYS_ID_PUBLISH(In Year)', 'SK_ID_PREV',
'AMT_ANNUITY_y', 'AMT_APPLICATION', 'AMT_CREDIT_y', 'AMT_DOWN_PAYMENT',
'AMT_GOODS_PRICE_y', 'HOUR_APPR_PROCESS_START_y',
'NFLAG_LAST_APPL_IN_DAY', 'RATE_DOWN_PAYMENT', 'RATE_INTEREST_PRIMARY',
'RATE_INTEREST_PRIVILEGED', 'DAYS_DECISION', 'SELLERPLACE_AREA',
'CNT_PAYMENT', 'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE', 'DAYS_TERMINATION',
'NFLAG_INSURED_ON_APPROVAL'],
dtype='object')
# Define numerical columns and target column
numerical_cols = ['CNT_CHILDREN','AMT_INCOME_TOTAL','CNT_FAM_MEMBERS',
'AMT_CREDIT_x', 'DAYS_BIRTH(In Year)', 'EXT_SOURCE_3']
target_col = 'TARGET'
# Set up subplot dimensions
cols = 2
rows = -(-len(numerical_cols) // cols) # Ceiling division
fig, axes = plt.subplots(rows, cols, figsize=(14, rows * 4))
axes = axes.flatten()
# Plot boxplots
for i, col in enumerate(numerical_cols):
ax = axes[i]
sns.boxplot(data=previous_train, x=target_col, y=col, ax=ax)
ax.set_title(f'{col} vs {target_col}')
# ax.set_xlabel('Loan Default (TARGET)')
ax.set_xlabel('')
ax.set_ylabel('')
# Remove any unused subplot spaces
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.suptitle('Bivariate Analysis: Numerical Variables vs TARGET', fontsize=16, y=1.02)
plt.show()
Observation:
CNT_CHILDREN vs TARGET : Applicants with 0 children dominate both groups, but defaulters show slightly more variance. There is no strong relationship between number of children and loan default.
AMT_INCOME_TOTAL vs TARGET : Both defaulters and non-defaulters show similar income ranges, with high outliers in both. This indicates that income alone may not strongly differentiate loan default behavior.
CNT_FAM_MEMBERS vs TARGET : The median and IQR of family members are similar across both groups. This suggests that family size has little direct influence on default risk.
AMT_CREDIT vs TARGET : Defaulters have slightly lower median credit amounts than non-defaulters. Higher credit values are common in both groups, making this a weak discriminator.
DAYS_BIRTH (In Years) vs TARGET : Defaulters tend to be slightly younger on average than non-defaulters. This suggests that younger applicants may carry a higher risk of default.
EXT_SOURCE_3 vs TARGET : Non-defaulters have noticeably higher EXT_SOURCE_3 scores compared to defaulters. This makes EXT_SOURCE_3 a strong indicator of creditworthiness and default prediction.
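The boxplot reading for EXT_SOURCE_3 can be backed numerically: a feature that separates the classes shows clearly different per-TARGET medians. A sketch with invented scores mimicking that behavior (not the project data):

```python
import pandas as pd

# Toy scores: non-defaulters (TARGET=0) cluster high, defaulters low
df = pd.DataFrame({
    'TARGET': [0, 0, 0, 0, 1, 1, 1, 1],
    'EXT_SOURCE_3': [0.7, 0.65, 0.8, 0.75, 0.3, 0.4, 0.35, 0.25],
})
medians = df.groupby('TARGET')['EXT_SOURCE_3'].median()

print(medians.loc[0] > medians.loc[1])  # non-defaulters score higher
```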
Business Oriented Conclusions¶
After performing an in-depth exploratory data analysis of the loan applicants' profiles and repayment behavior, the following conclusions are drawn:
Age Is a Strong Predictor of Default
Younger applicants (ages 20–30) are significantly more likely to default compared to older applicants.
Possible Action: Implement stricter eligibility criteria (e.g., lower loan amounts or higher interest rates) for younger applicants to mitigate risk.
External Risk Source Scores Matter
EXT_SOURCE_3, a risk assessment feature, shows a clear inverse relationship with default. Lower values are highly indicative of higher risk.
Possible Action: Use this score as a primary factor in automated credit scoring models. Set threshold values below which applications are flagged for manual review.
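A hedged sketch of the proposed rule: flag applications whose external score falls below a review threshold. The 0.3 cutoff and the scores below are illustrative placeholders, not values derived from this analysis.

```python
import pandas as pd

# Toy EXT_SOURCE_3 scores; the threshold is a hypothetical business choice
scores = pd.Series([0.05, 0.28, 0.45, 0.81], name='EXT_SOURCE_3')
REVIEW_THRESHOLD = 0.3

flagged = scores < REVIEW_THRESHOLD  # low score => manual review
print(flagged.sum(), 'of', len(scores), 'applications flagged')  # 2 of 4
```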
Number of Children Correlates with Higher Default
Applicants with three or more children show increased default rates, potentially due to higher household financial burdens.
Possible Action: Adjust risk thresholds based on family size or combine this with income metrics to assess repayment capacity better.
Education Level Reflects Financial Discipline
Applicants with lower education levels (Secondary and below) are more prone to default.
Possible Action: Use education level as an auxiliary feature in scoring models. Applicants with higher education levels may be offered better loan terms.
Employment Type Affects Default Risk
Unemployed applicants, students, and working pensioners show a higher tendency to default.
Possible Action: Introduce tiered risk scoring or mandatory collateral/guarantor for these applicant categories.
Family Status Has an Impact
Single and separated applicants tend to have slightly higher default rates.
Possible Action: Include marital status in behavioral segmentation for personalized loan products and monitoring.
Income Type Should Be Evaluated Carefully
Applicants with irregular income types (e.g., maternity leave, commercial associates) show elevated risk.
Possible Action: Evaluate income stability alongside declared income. Consider requesting income proof for non-standard employment types.