This file contains analysis done on credit history of applicants. Click here to go to previous part

In [ ]:
#importing required Libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sidetable
from tabulate import tabulate

import warnings
warnings.filterwarnings('ignore')

The following code is carried over from Analysis of Application Data for the purpose of Merging it with Previous Application Data

Skip to Merged Data Analysis

In [60]:
#setting max number of columns to be displayed to 100, to get a better view.
pd.set_option('display.max_columns',100)
pd.set_option('display.max_rows',100)
In [61]:
application_data=pd.read_csv('application_data.csv')
In [62]:
#Finding the columns that have more than or equal to 50% null values and storing it to columns_to_drop.
columns_to_drop=application_data.columns[100*application_data.isnull().sum()/len(application_data)>=50]
In [63]:
#dropping the columns where the null values are >= 50%.
application_data.drop(labels=columns_to_drop,axis=1,inplace=True)
In [64]:
# Checking for Disguised Missing Values
def cat_value_counts(column_name) : 
    print(tabulate(pd.DataFrame(application_data.stb.freq([column_name])), headers='keys', tablefmt='psql'))
    print(pd.DataFrame(application_data[column_name]).stb.missing(),'\n\n\n')
In [65]:
#Replacing XAN with np.nan in Gender column : 
application_data['CODE_GENDER'] = application_data['CODE_GENDER'].replace('XNA',np.nan)
In [66]:
columns_to_convert = ['DAYS_BIRTH','DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH']
application_data[columns_to_convert] = application_data[columns_to_convert].abs()
In [67]:
# Adding a new column "AGE_YEARS" using 'DAYS_BIRTH' with age in years
def days_to_years(x) : 
    if x < 0 : 
        x = -1*x 
    return x//365
application_data['AGE_YEARS'] = application_data['DAYS_BIRTH'].apply(days_to_years)
In [68]:
# AMT_INCOME_TOTAL - binning continuous variables
min_income = int(application_data['AMT_INCOME_TOTAL'].min())
max_income = int(application_data['AMT_INCOME_TOTAL'].max())


bins = [0,25000,50000,75000,100000,125000,150000,175000,200000,225000,250000,275000,300000,325000,350000,375000,400000,425000,450000,475000,500000,10000000000]
intervals = ['0-25000', '25000-50000','50000-75000','75000-100000','100000-125000', '125000-150000', '150000-175000','175000-200000',
       '200000-225000','225000-250000','250000-275000','275000-300000','300000-325000','325000-350000','350000-375000',
       '375000-400000','400000-425000','425000-450000','450000-475000','475000-500000','500000 and above']

application_data['AMT_INCOME_CAT']=pd.cut(application_data['AMT_INCOME_TOTAL'],bins,labels=intervals)
In [69]:
#AMT_CREDIT
bins = [0,150000,200000,250000,300000,350000,400000,450000,500000,550000,600000,650000,700000,750000,800000,850000,900000,1000000000]
intervals = ['0-150000', '150000-200000','200000-250000', '250000-300000', '300000-350000', '350000-400000','400000-450000',
        '450000-500000','500000-550000','550000-600000','600000-650000','650000-700000','700000-750000','750000-800000',
        '800000-850000','850000-900000','900000 and above']

application_data['AMT_CREDIT_RANGE']=pd.cut(application_data['AMT_CREDIT'],bins=bins,labels=intervals)
application_data['AMT_CREDIT_RANGE'] = application_data['AMT_CREDIT_RANGE'].astype('category')
In [70]:
columnsForAnalysis = ['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
       'NAME_HOUSING_TYPE',
       'DAYS_EMPLOYED','FLAG_MOBIL', 'FLAG_CONT_MOBILE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS',
       'REGION_RATING_CLIENT_W_CITY',
                      'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY',
       'ORGANIZATIOdN_TYPE', 'EXT_SOURCE_2', 'EXT_SOURCE_3',
        'DAYS_LAST_PHONE_CHANGE' ,'AGE_YEARS', 'AMT_INCOME_CAT',
       'AMT_CREDIT_RANGE']

Analysis

Loading Previous Application Data

In [71]:
prev_data = pd.read_csv('./previous_application.csv')

Merging with Previous Application Data

In [72]:
# selecting columns from application Data : 
application_data_columns = '''AMT_ANNUITY
AMT_INCOME_TOTAL
AMT_CREDIT
AMT_GOODS_PRICE
NAME_CONTRACT_TYPE
CODE_GENDER
NAME_INCOME_TYPE
DAYS_EMPLOYED
NAME_EDUCATION_TYPE
SK_ID_CURR
AGE_YEARS
AMT_INCOME_CAT
AMT_CREDIT_RANGE
'''
application_data_columns = application_data_columns.splitlines()
selected_application_data = application_data[application_data_columns]
In [73]:
# selecting columns from Previous application Data : 
prev_application_data_columns = '''AMT_ANNUITY
AMT_APPLICATION
AMT_CREDIT
AMT_GOODS_PRICE
CHANNEL_TYPE
CODE_REJECT_REASON
DAYS_DECISION
NAME_CASH_LOAN_PURPOSE
NAME_CLIENT_TYPE
NAME_CONTRACT_STATUS
NAME_CONTRACT_TYPE
NAME_GOODS_CATEGORY
NAME_PORTFOLIO
NAME_PRODUCT_TYPE
NAME_YIELD_GROUP
PRODUCT_COMBINATION
SK_ID_CURR
SK_ID_PREV
'''
prev_application_data_columns = prev_application_data_columns.splitlines()
selected_prev_data = prev_data[prev_application_data_columns]
In [74]:
# merging data , adding suffix _prev and _curr
merged_data = pd.merge(left=selected_application_data,right=selected_prev_data,on='SK_ID_CURR',how='inner',suffixes=('_curr','_prev'))
In [75]:
merged_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1413701 entries, 0 to 1413700
Data columns (total 30 columns):
 #   Column                   Non-Null Count    Dtype   
---  ------                   --------------    -----   
 0   AMT_ANNUITY_curr         1413608 non-null  float64 
 1   AMT_INCOME_TOTAL         1413701 non-null  float64 
 2   AMT_CREDIT_curr          1413701 non-null  float64 
 3   AMT_GOODS_PRICE_curr     1412493 non-null  float64 
 4   NAME_CONTRACT_TYPE_curr  1413701 non-null  object  
 5   CODE_GENDER              1413646 non-null  object  
 6   NAME_INCOME_TYPE         1413701 non-null  object  
 7   DAYS_EMPLOYED            1413701 non-null  float64 
 8   NAME_EDUCATION_TYPE      1413701 non-null  object  
 9   SK_ID_CURR               1413701 non-null  int64   
 10  AGE_YEARS                1413701 non-null  float64 
 11  AMT_INCOME_CAT           1413701 non-null  category
 12  AMT_CREDIT_RANGE         1413701 non-null  category
 13  AMT_ANNUITY_prev         1106483 non-null  float64 
 14  AMT_APPLICATION          1413701 non-null  float64 
 15  AMT_CREDIT_prev          1413700 non-null  float64 
 16  AMT_GOODS_PRICE_prev     1094176 non-null  float64 
 17  CHANNEL_TYPE             1413701 non-null  object  
 18  CODE_REJECT_REASON       1413701 non-null  object  
 19  DAYS_DECISION            1413701 non-null  int64   
 20  NAME_CASH_LOAN_PURPOSE   1413701 non-null  object  
 21  NAME_CLIENT_TYPE         1413701 non-null  object  
 22  NAME_CONTRACT_STATUS     1413701 non-null  object  
 23  NAME_CONTRACT_TYPE_prev  1413701 non-null  object  
 24  NAME_GOODS_CATEGORY      1413701 non-null  object  
 25  NAME_PORTFOLIO           1413701 non-null  object  
 26  NAME_PRODUCT_TYPE        1413701 non-null  object  
 27  NAME_YIELD_GROUP         1413701 non-null  object  
 28  PRODUCT_COMBINATION      1413388 non-null  object  
 29  SK_ID_PREV               1413701 non-null  int64   
dtypes: category(2), float64(10), int64(3), object(15)
memory usage: 315.5+ MB

The Target Variable for analysis is NAME_CONTRACT_STATUS

In [76]:
merged_data.dtypes
Out[76]:
AMT_ANNUITY_curr            float64
AMT_INCOME_TOTAL            float64
AMT_CREDIT_curr             float64
AMT_GOODS_PRICE_curr        float64
NAME_CONTRACT_TYPE_curr      object
CODE_GENDER                  object
NAME_INCOME_TYPE             object
DAYS_EMPLOYED               float64
NAME_EDUCATION_TYPE          object
SK_ID_CURR                    int64
AGE_YEARS                   float64
AMT_INCOME_CAT             category
AMT_CREDIT_RANGE           category
AMT_ANNUITY_prev            float64
AMT_APPLICATION             float64
AMT_CREDIT_prev             float64
AMT_GOODS_PRICE_prev        float64
CHANNEL_TYPE                 object
CODE_REJECT_REASON           object
DAYS_DECISION                 int64
NAME_CASH_LOAN_PURPOSE       object
NAME_CLIENT_TYPE             object
NAME_CONTRACT_STATUS         object
NAME_CONTRACT_TYPE_prev      object
NAME_GOODS_CATEGORY          object
NAME_PORTFOLIO               object
NAME_PRODUCT_TYPE            object
NAME_YIELD_GROUP             object
PRODUCT_COMBINATION          object
SK_ID_PREV                    int64
dtype: object

Merged Data - Data Quality Checks

In [77]:
# replacing disguised missing values 'XAP', 'XNA'
for column in merged_data.columns : 
    if merged_data[column].dtype == 'object' or merged_data[column].dtype.name == 'category' : 
        merged_data[column].replace({'XAP' : np.nan, 'XNA' : np.nan}, inplace=True)
In [78]:
#Finding Percentage of Missing values in each and every column
def color_red(value):
    '''
      Colors elements in a dateframe
      green if nulls <50% and red if
      >=50%.
    '''
    if value >=50:
        color = 'red'
    elif value <50:
        color = 'green'

    return 'color: %s' % color
pd.set_option('precision', 4)
missing_data = np.round(100*merged_data.isnull().sum()/len(merged_data),2)
missing_data = pd.DataFrame(data={'Column Name' :missing_data.index, 'Null Percentage' : missing_data.values})
missing_data.style.applymap(color_red,subset=['Null Percentage'])
Out[78]:
Column Name Null Percentage
0 AMT_ANNUITY_curr 0.0100
1 AMT_INCOME_TOTAL 0.0000
2 AMT_CREDIT_curr 0.0000
3 AMT_GOODS_PRICE_curr 0.0900
4 NAME_CONTRACT_TYPE_curr 0.0000
5 CODE_GENDER 0.0000
6 NAME_INCOME_TYPE 0.0000
7 DAYS_EMPLOYED 0.0000
8 NAME_EDUCATION_TYPE 0.0000
9 SK_ID_CURR 0.0000
10 AGE_YEARS 0.0000
11 AMT_INCOME_CAT 0.0000
12 AMT_CREDIT_RANGE 0.0000
13 AMT_ANNUITY_prev 21.7300
14 AMT_APPLICATION 0.0000
15 AMT_CREDIT_prev 0.0000
16 AMT_GOODS_PRICE_prev 22.6000
17 CHANNEL_TYPE 0.0000
18 CODE_REJECT_REASON 81.3400
19 DAYS_DECISION 0.0000
20 NAME_CASH_LOAN_PURPOSE 95.8000
21 NAME_CLIENT_TYPE 0.1100
22 NAME_CONTRACT_STATUS 0.0000
23 NAME_CONTRACT_TYPE_prev 0.0200
24 NAME_GOODS_CATEGORY 56.3900
25 NAME_PORTFOLIO 21.7300
26 NAME_PRODUCT_TYPE 63.6700
27 NAME_YIELD_GROUP 30.3600
28 PRODUCT_COMBINATION 0.0200
29 SK_ID_PREV 0.0000

Columns 'CODE_REJECT_REASON', 'NAME_CASH_LOAN_PURPOSE', 'NAME_GOODS_CATEGORY','NAME_PRODUCT_TYPE' have null values >=50%

In [79]:
# dropping columns with high null values 
null_info = pd.DataFrame(100*merged_data.isnull().sum()/len(merged_data))
null_info.columns = ['Null Percentage']
high_nulls = null_info[null_info['Null Percentage'] >= 50].index
print(high_nulls)
merged_data = merged_data.drop(columns=high_nulls)
Index(['CODE_REJECT_REASON', 'NAME_CASH_LOAN_PURPOSE', 'NAME_GOODS_CATEGORY',
       'NAME_PRODUCT_TYPE'],
      dtype='object')
In [80]:
# remaining columns 
merged_data.columns
Out[80]:
Index(['AMT_ANNUITY_curr', 'AMT_INCOME_TOTAL', 'AMT_CREDIT_curr',
       'AMT_GOODS_PRICE_curr', 'NAME_CONTRACT_TYPE_curr', 'CODE_GENDER',
       'NAME_INCOME_TYPE', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE',
       'SK_ID_CURR', 'AGE_YEARS', 'AMT_INCOME_CAT', 'AMT_CREDIT_RANGE',
       'AMT_ANNUITY_prev', 'AMT_APPLICATION', 'AMT_CREDIT_prev',
       'AMT_GOODS_PRICE_prev', 'CHANNEL_TYPE', 'DAYS_DECISION',
       'NAME_CLIENT_TYPE', 'NAME_CONTRACT_STATUS', 'NAME_CONTRACT_TYPE_prev',
       'NAME_PORTFOLIO', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
       'SK_ID_PREV'],
      dtype='object')
In [81]:
# taking absolute values of days of processsing 
merged_data['DAYS_DECISION'] = merged_data['DAYS_DECISION'].abs()

Merged Data - Imputation Consideration

In [82]:
#Finding the best values to impute below columns that have <=13% of null values

#Finding columns with <= 13% missing columns

null_info = pd.DataFrame(100*merged_data.isnull().sum()/len(merged_data))
null_info.columns = ['Null Percentage']
null_info[(null_info['Null Percentage'] > 0) & (null_info['Null Percentage'] <=0.13)]
Out[82]:
Null Percentage
AMT_ANNUITY_curr 0.01
AMT_GOODS_PRICE_curr 0.09
CODE_GENDER 0.00
AMT_CREDIT_prev 0.00
NAME_CLIENT_TYPE 0.11
NAME_CONTRACT_TYPE_prev 0.02
PRODUCT_COMBINATION 0.02

Merged Data - Columns To Impute

We have chosen the following columns to consider the best value to impute the missing values with

  • AMT_CREDIT_prev
  • PRODUCT_COMBINATION
  • NAME_CLIENT_TYPE
  • NAME_CONTRACT_TYPE_prev

Note that other columns were already examined in analysis of application_data.csv

AMT_CREDIT_prev Imputation

In [83]:
# Boxplot to check for outliers
merged_data['AMT_CREDIT_prev'].plot.box()
plt.title('\n Box Plot of AMT_CREDIT_prev')

# Calculating Quantiles
print('Quantile\tAMT_CREDIT_prev')
merged_data['AMT_CREDIT_prev'].quantile([0.5,0.8,0.85,0.90,0.95,1])
Quantile	AMT_CREDIT_prev
Out[83]:
0.50      80,595.00
0.80     270,000.00
0.85     382,500.00
0.90     533,160.00
0.95     893,398.50
1.00   4,509,688.50
Name: AMT_CREDIT_prev, dtype: float64

From the above box plot of AMT_CREDIT_prev , there are a lot of outliers.
Calculating Quantiles confirms the same. There is a huge jump from 95 percetile to max value.
Hence, Median:80595.00 is the best value to impute the missing values.

PRODUCT_COMBINATION Imputation

In [84]:
print('Data type of PRODUCT_COMBINATION : ',merged_data['PRODUCT_COMBINATION'].dtype,'\n\n')
print('Category\tNormalized Count\n\n',merged_data['PRODUCT_COMBINATION'].value_counts(normalize=True))
data = merged_data['PRODUCT_COMBINATION'].value_counts(normalize=True)
plt.bar(data.index,data.values)
# data.hist()
plt.xticks(rotation=90)
plt.ylabel('Normalized Value Counts')
plt.title('\nPRODUCT_COMBINATION');
Data type of PRODUCT_COMBINATION :  object 


Category	Normalized Count

 Cash                             0.17
POS household with interest      0.16
POS mobile with interest         0.14
Cash X-Sell: middle              0.08
Cash X-Sell: low                 0.08
Card Street                      0.07
POS industry with interest       0.06
POS household without interest   0.05
Card X-Sell                      0.05
Cash Street: high                0.04
Cash X-Sell: high                0.04
Cash Street: middle              0.02
Cash Street: low                 0.02
POS other with interest          0.01
POS mobile without interest      0.01
POS industry without interest    0.01
POS others without interest      0.00
Name: PRODUCT_COMBINATION, dtype: float64

PRODUCT_COMBINATION is a categorical variable.
The best metric to impute missing values is Mode of the data.
From the above plot, 'Cash' is the Mode

Hence, Cash is the best value to impute the missing values.

NAME_CLIENT_TYPE Imputation

In [85]:
#NAME_CLIENT_TYPE
print('Data type of NAME_CLIENT_TYPE : ',merged_data['NAME_CLIENT_TYPE'].dtype,'\n\n')
print('Category\tNormalized Count\n\n',merged_data['NAME_CLIENT_TYPE'].value_counts(normalize=True))
data = merged_data['NAME_CLIENT_TYPE'].value_counts(normalize=True)
plt.bar(data.index,data.values)
# data.hist()
plt.xticks(rotation=90)
plt.ylabel('Normalized Value Counts')
plt.title('\nNAME_CLIENT_TYPE');
Data type of NAME_CLIENT_TYPE :  object 


Category	Normalized Count

 Repeater    0.73
New         0.18
Refreshed   0.08
Name: NAME_CLIENT_TYPE, dtype: float64

NAME_CLIENT_TYPE is a categorical variable.
The best metric to impute missing values is Mode of the data.
From the above plot, 'Repeater' is the Mode

Hence, 'Repeater' is the best value to impute the missing values.

NAME_CONTRACT_TYPE_prev Imputation

In [86]:
print('Data type of NAME_CONTRACT_TYPE_prev : ',merged_data['NAME_CONTRACT_TYPE_prev'].dtype,'\n\n')
print('Category\tNormalized Count\n\n',merged_data['NAME_CONTRACT_TYPE_prev'].value_counts(normalize=True))
data = merged_data['NAME_CONTRACT_TYPE_prev'].value_counts(normalize=True)
plt.bar(data.index,data.values)
# data.hist()
plt.xticks(rotation=90)
plt.ylabel('Normalized Value Counts')
plt.title('\nNAME_CONTRACT_TYPE_prev');
Data type of NAME_CONTRACT_TYPE_prev :  object 


Category	Normalized Count

 Cash loans        0.44
Consumer loans    0.44
Revolving loans   0.11
Name: NAME_CONTRACT_TYPE_prev, dtype: float64

NAME_CONTRACT_TYPE_prev is a categorical variable.
The best metric to impute missing values is Mode of the data.
From the above plot, this is a bimodal distribution.It has two modes : 'Cash Loans' and 'Consumer Loans'
The best value for imputation needs domain knowledge

Merged Data - Splitting Datasets wrt NAME_CONTRACT_STATUS

In [87]:
# unique categories 
merged_data['NAME_CONTRACT_STATUS'].unique()
Out[87]:
array(['Approved', 'Canceled', 'Refused', 'Unused offer'], dtype=object)
In [88]:
# splitting into four data frames wrt to NAME_CONTRACT_STATUS

merged_a = merged_data[merged_data['NAME_CONTRACT_STATUS'] == 'Approved']
merged_c = merged_data[merged_data['NAME_CONTRACT_STATUS'] == 'Canceled']
merged_r = merged_data[merged_data['NAME_CONTRACT_STATUS'] == 'Refused']
merged_u = merged_data[merged_data['NAME_CONTRACT_STATUS'] == 'Unused offer']

Merged Data - Data Imbalance

In [89]:
column_name = 'NAME_CONTRACT_STATUS'
print(tabulate(pd.DataFrame(merged_data.stb.freq([column_name])), headers='keys', tablefmt='psql'))
print(pd.DataFrame(merged_data[column_name]).stb.missing(),'\n')
+----+------------------------+---------+-----------+--------------------+----------------------+
|    | NAME_CONTRACT_STATUS   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+------------------------+---------+-----------+--------------------+----------------------|
|  0 | Approved               |  886099 | 0.626794  |             886099 |             0.626794 |
|  1 | Canceled               |  259441 | 0.183519  |            1145540 |             0.810313 |
|  2 | Refused                |  245390 | 0.17358   |            1390930 |             0.983893 |
|  3 | Unused offer           |   22771 | 0.0161074 |            1413701 |             1        |
+----+------------------------+---------+-----------+--------------------+----------------------+
                      Missing    Total  Percent
NAME_CONTRACT_STATUS        0  1413701     0.00 

The above percentages show that not all cases are equally represented and that there is definite Data Imbalance

Merged Data - Columns Selection for Analysis

In [90]:
merged_col_for_analysis = '''
AMT_ANNUITY_prev
AMT_APPLICATION
AMT_CREDIT_prev
AMT_GOODS_PRICE_prev
CHANNEL_TYPE
CODE_REJECT_REASON
DAYS_DECISION
NAME_CASH_LOAN_PURPOSE
NAME_CLIENT_TYPE
NAME_CONTRACT_TYPE_prev
NAME_GOODS_CATEGORY
NAME_PORTFOLIO
NAME_PRODUCT_TYPE
NAME_YIELD_GROUP
PRODUCT_COMBINATION'''
merged_col_for_analysis = merged_col_for_analysis.splitlines()

Merged Data - Checking for Outliers

Checking for outliers in the following numerical columns

  • AMT_ANNUITY_prev
  • AMT_APPLICATION
  • AMT_CREDIT_prev
  • AMT_GOODS_PRICE_prev
  • DAYS_DECISION
In [91]:
# taking absolute values of days decision
merged_data['DAYS_DECISION'] = merged_data['DAYS_DECISION'].abs()
merged_data['DAYS_DECISION'].describe()
Out[91]:
count   1,413,701.00
mean          880.37
std           783.54
min             1.00
25%           271.00
50%           582.00
75%         1,313.00
max         2,922.00
Name: DAYS_DECISION, dtype: float64
In [92]:
# Box plots of the above numerical variables 
merged_outlier_check_col = [
    'AMT_ANNUITY_prev',
'AMT_APPLICATION',
'AMT_CREDIT_prev',
'AMT_GOODS_PRICE_prev',
'DAYS_DECISION'
]

fig,ax = plt.subplots(3,2)
fig.set_figheight(15)
fig.set_figwidth(15)
ax[0,0].set_yscale('log')
ax[0,1].set_yscale('log')
ax[1,0].set_yscale('log')
ax[1,1].set_yscale('log')

ax[0,0].set(ylabel ='Annuity in Log Scale')
ax[0,1].set(ylabel ='Application Amount in Log Scale')
ax[1,0].set(ylabel ='Credit Amount in Log Scale')
ax[1,1].set(ylabel ='Goods Price in Log Scale')
ax[2,0].set(ylabel ='Processing Days')

merged_data[merged_outlier_check_col[0]].plot.box(ax=ax[0,0],);
merged_data[merged_outlier_check_col[1]].plot.box(ax=ax[0,1]);
merged_data[merged_outlier_check_col[2]].plot.box(ax=ax[1,0]);
merged_data[merged_outlier_check_col[3]].plot.box(ax=ax[1,1]);

merged_data[merged_outlier_check_col[4]].plot.box(ax=ax[2,0]); 
ax[2,1].axis('off')
print('Box Plots of' + ' '.join(merged_outlier_check_col) +'\n')
Box Plots ofAMT_ANNUITY_prev AMT_APPLICATION AMT_CREDIT_prev AMT_GOODS_PRICE_prev DAYS_DECISION

In [93]:
# quantiles for outlier checks
pd.options.display.float_format = '{:,.2f}'.format
for col in merged_outlier_check_col : 
    print(col,'\n',merged_data[col].quantile([0.5,0.8,0.85,0.90,0.95,1]),'\n\n')
AMT_ANNUITY_prev 
 0.50    11,223.76
0.80    23,390.64
0.85    27,683.37
0.90    34,423.11
0.95    45,205.61
1.00   418,058.15
Name: AMT_ANNUITY_prev, dtype: float64 


AMT_APPLICATION 
 0.50      70,870.50
0.80     225,000.00
0.85     318,978.00
0.90     450,000.00
0.95     805,500.00
1.00   5,850,000.00
Name: AMT_APPLICATION, dtype: float64 


AMT_CREDIT_prev 
 0.50      80,595.00
0.80     270,000.00
0.85     382,500.00
0.90     533,160.00
0.95     893,398.50
1.00   4,509,688.50
Name: AMT_CREDIT_prev, dtype: float64 


AMT_GOODS_PRICE_prev 
 0.50     110,245.50
0.80     315,000.00
0.85     450,000.00
0.90     585,000.00
0.95     900,000.00
1.00   5,850,000.00
Name: AMT_GOODS_PRICE_prev, dtype: float64 


DAYS_DECISION 
 0.50     582.00
0.80   1,566.00
0.85   1,896.00
0.90   2,257.00
0.95   2,555.00
1.00   2,922.00
Name: DAYS_DECISION, dtype: float64 


Outliers in Numerical Columns

From the above box plots and quantile calculations, we see that
AMT_ANNUITY_prev , AMT_APPLICATION, AMT_CREDIT_prev, AMT_GOODS_PRICE_prev,DAYS_DECISION have many outliers. These outliers could be capped by the corresponding 95th percentile values.

Merged Data - Binning Continuous Variables

In [94]:
#AMT_CREDIT_prev
min_credit = int(merged_data['AMT_CREDIT_prev'].min())
max_credit = int(merged_data['AMT_CREDIT_prev'].max())

bins = [0,25000,50000,75000,100000,125000,150000,175000,200000,225000,250000,275000,300000,325000,350000,375000,400000,425000,450000,475000,500000,10000000000]
intervals = ['0-25000', '25000-50000','50000-75000','75000-100000','100000-125000', '125000-150000', '150000-175000','175000-200000',
       '200000-225000','225000-250000','250000-275000','275000-300000','300000-325000','325000-350000','350000-375000',
       '375000-400000','400000-425000','425000-450000','450000-475000','475000-500000','500000 and above']

merged_data['AMT_CREDIT_prev_cat']=pd.cut(merged_data['AMT_CREDIT_prev'],bins,labels=intervals)
print('Credit Range [Prev]\t Count')
print(merged_data['AMT_CREDIT_prev_cat'].value_counts())

credit_cat = merged_data['AMT_CREDIT_prev_cat'].value_counts()
plt.hist(credit_cat)
# (merged_data['AMT_CREDIT_prev_cat'].dropna()).plot.hist()

plt.title('\n Previous Credit Amount vs No of Applications')
plt.xticks(rotation=90);
Credit Range [Prev]	 Count
25000-50000         198605
500000 and above    153535
50000-75000         131378
75000-100000        108912
100000-125000        82194
0-25000              77801
125000-150000        77666
150000-175000        48738
175000-200000        42564
200000-225000        42167
250000-275000        30652
425000-450000        24617
225000-250000        24428
275000-300000        18649
300000-325000        13501
325000-350000        13185
350000-375000        11851
475000-500000        10725
375000-400000         9594
400000-425000         8654
450000-475000         7707
Name: AMT_CREDIT_prev_cat, dtype: int64
In [95]:
#AMT_APPLICATION
min_app_amt = int(merged_data['AMT_APPLICATION'].min())
max_app_amt = int(merged_data['AMT_APPLICATION'].max())


bins = [0,25000,50000,75000,100000,125000,150000,175000,200000,225000,250000,275000,300000,325000,350000,375000,400000,425000,450000,475000,500000,10000000000]
intervals = ['0-25000', '25000-50000','50000-75000','75000-100000','100000-125000', '125000-150000', '150000-175000','175000-200000',
       '200000-225000','225000-250000','250000-275000','275000-300000','300000-325000','325000-350000','350000-375000',
       '375000-400000','400000-425000','425000-450000','450000-475000','475000-500000','500000 and above']

merged_data['AMT_APPLICATION_cat']=pd.cut(merged_data['AMT_APPLICATION'],bins,labels=intervals)
print('AMT_APPLICATION [Prev]\t Count')
print(merged_data['AMT_APPLICATION_cat'].value_counts())

credit_cat = merged_data['AMT_APPLICATION_cat'].value_counts()
plt.hist(merged_data['AMT_APPLICATION_cat'].dropna())

plt.title('\n Application Amount vs No of Applications')
plt.xticks(rotation=90);
AMT_APPLICATION [Prev]	 Count
25000-50000         202958
50000-75000         130287
500000 and above    123697
75000-100000        113475
125000-150000        83960
100000-125000        78973
0-25000              65680
200000-225000        54749
175000-200000        40177
150000-175000        36621
425000-450000        36186
225000-250000        24346
250000-275000        23775
300000-325000        14191
325000-350000        12260
350000-375000        11913
450000-475000        10196
275000-300000         8354
400000-425000         7223
475000-500000         4865
375000-400000         4513
Name: AMT_APPLICATION_cat, dtype: int64

Merged Data : Analysis

Merged Data : Univariate Analysis

In [96]:
# function for categorical variable univariate analysis

def merged_cat_univariate_analysis(column_name,figsize=(10,5)) : 
    # print unique values
    print('Approved\n', merged_a[column_name].unique(),'\n')
    print('Canceled\n',merged_c[column_name].unique(),'\n')
    print('Refused\n',merged_r[column_name].unique(),'\n')
    print('Unused offer\n',merged_u[column_name].unique(),'\n')
    
    # column vs target count plot
    plt.figure(figsize=figsize)
    ax = sns.countplot(x=column_name,hue='NAME_CONTRACT_STATUS',data=merged_data)
    title = column_name + ' vs Number of Applications'
    ax.set(title= title)
    for p in ax.patches:
        height = p.get_height()
        ax.text(p.get_x()+p.get_width()/2,
                height + 10,
                format(height),
                ha="center")
    # Percentages 
    print('Approved\n', merged_a[column_name].unique(),'\n')
    print(tabulate(pd.DataFrame(merged_a.stb.freq([column_name])), headers='keys', tablefmt='psql'),'\n')
    print('Canceled\n',merged_c[column_name].unique(),'\n')
    print(tabulate(pd.DataFrame(merged_c.stb.freq([column_name])), headers='keys', tablefmt='psql'),'\n')
    print('Refused\n',merged_r[column_name].unique(),'\n')
    print(tabulate(pd.DataFrame(merged_r.stb.freq([column_name])), headers='keys', tablefmt='psql'),'\n')
    print('Unused offer\n',merged_u[column_name].unique(),'\n')
    print(tabulate(pd.DataFrame(merged_u.stb.freq([column_name])), headers='keys', tablefmt='psql'),'\n')
In [97]:
# function for numerical variable univariate analysis

def merged_num_univariate_analysis(column_name,scale='linear') : 
    # boxplot for column vs target
    plt.figure(figsize=(8,6))
    ax = sns.boxplot(x='NAME_CONTRACT_STATUS', y = column_name, data = merged_data)
    title = column_name+' vs NAME_CONTRACT_STATUS'
    ax.set(title=title)
    if scale == 'log' :
        plt.yscale('log')
        ax.set(ylabel=column_name + '(Log Scale)')
    # summary statistic
    print('Approved\n', merged_a[column_name].describe(),'\n')
    print('Canceled\n',merged_c[column_name].describe(),'\n')
    print('Refused\n',merged_r[column_name].describe(),'\n')
    print('Unused offer\n',merged_u[column_name].describe(),'\n')
In [98]:
# function to calculate the proportion of applications in a category compared to total applications
def merged_cat_proportions(column_name) : 
    values = merged_data[column_name].unique()
    values=values.dropna()
    values = values.to_numpy()
    values.tolist()
    data_a = merged_a[column_name].value_counts().to_dict()
    data_c = merged_c[column_name].value_counts().to_dict()
    data_r = merged_r[column_name].value_counts().to_dict()
    data_u = merged_u[column_name].value_counts().to_dict()
    data = merged_data[column_name].value_counts().to_dict()

    for i in values : 
        if data_a[i] != np.nan and data_c[i] != np.nan and data_r[i] != np.nan and data_u[i] != np.nan and data[i] != np.nan:
            print('Proportion of '+ str(i) + ' Approved : ', round(data_a[i]*100/data[i],2),'\n')
            print('Proportion of '+ str(i) + ' Cancelled : ', round(data_c[i]*100/data[i],2),'\n')
            print('Proportion of '+ str(i) + ' Refused : ', round(data_r[i]*100/data[i],2),'\n')
            print('Proportion of '+ str(i) + ' Unused Offer : ', round(data_u[i]*100/data[i],2),'\n')
In [99]:
#AMT_ANNUITY_prev
merged_num_univariate_analysis('AMT_ANNUITY_prev',scale='log')
Approved
 count   886,092.00
mean     14,612.96
std      13,746.23
min           0.00
25%       5,874.24
50%      10,286.39
75%      18,144.04
max     393,868.66
Name: AMT_ANNUITY_prev, dtype: float64 

Canceled
 count     8,760.00
mean     30,189.94
std      18,436.44
min       1,324.31
25%      16,966.99
50%      26,424.09
75%      39,481.53
max     218,713.55
Name: AMT_ANNUITY_prev, dtype: float64 

Refused
 count   210,848.00
mean     20,406.65
std      17,137.06
min           0.00
25%       8,416.93
50%      15,461.06
75%      27,716.05
max     418,058.15
Name: AMT_ANNUITY_prev, dtype: float64 

Unused offer
 count      783.00
mean    10,192.88
std      4,728.62
min        758.92
25%      6,331.95
50%     10,021.32
75%     14,538.53
max     35,258.94
Name: AMT_ANNUITY_prev, dtype: float64 

Amount Annuity Previous

  • Since all the above categories in the plot are having outliers, we will look at the median to make the inferences.
  • Median of "Approved" category is 10,286.39. Approved and Unused offer are having similar patterns.
In [100]:
#AMT_APPLICATION
merged_num_univariate_analysis('AMT_APPLICATION',scale='log')
Approved
 count     886,099.00
mean      179,499.20
std       254,071.95
min             0.00
25%        45,000.00
50%        90,000.00
75%       191,306.25
max     5,850,000.00
Name: AMT_APPLICATION, dtype: float64 

Canceled
 count     259,441.00
mean       23,267.79
std       155,690.92
min             0.00
25%             0.00
50%             0.00
75%             0.00
max     3,600,000.00
Name: AMT_APPLICATION, dtype: float64 

Refused
 count     245,390.00
mean      330,345.15
std       433,570.20
min             0.00
25%        45,000.00
50%       157,500.00
75%       450,000.00
max     4,455,000.00
Name: AMT_APPLICATION, dtype: float64 

Unused offer
 count      22,771.00
mean       69,736.51
std        65,649.21
min             0.00
25%        34,197.75
50%        57,780.00
75%        89,955.00
max     3,511,305.00
Name: AMT_APPLICATION, dtype: float64 

Amount In Previous Application

  • Since all the above categories in plot have outliers, we will infer using median.
  • The median of "Approved" category is 90,000. Anything above that has high risk of refusal.
  • Looking at "Canceled" category, all ranges of amounts faced calcellation.
  • Most of the "Unused offer" category is below the median value of "Approved category".
In [101]:
# AMT_CREDIT_prev
merged_num_univariate_analysis('AMT_CREDIT_prev',scale='log')
Approved
 count     886,098.00
mean      201,625.61
std       275,852.27
min             0.00
25%        47,479.50
50%       101,153.25
75%       225,000.00
max     4,509,688.50
Name: AMT_CREDIT_prev, dtype: float64 

Canceled
 count     259,441.00
mean       24,784.71
std       165,068.20
min             0.00
25%             0.00
50%             0.00
75%             0.00
max     3,847,104.00
Name: AMT_CREDIT_prev, dtype: float64 

Refused
 count     245,390.00
mean      370,460.07
std       470,121.52
min             0.00
25%        59,175.00
50%       180,000.00
75%       493,150.50
max     4,104,351.00
Name: AMT_CREDIT_prev, dtype: float64 

Unused offer
 count      22,771.00
mean       69,753.58
std        65,663.89
min             0.00
25%        34,219.80
50%        57,780.00
75%        89,955.00
max     3,511,305.00
Name: AMT_CREDIT_prev, dtype: float64 

Final Credit Amount

  • Since all the above categories in plot have outliers, we will infer using median.
  • The median of "Approved" category is 101,153.25. Anything above that has high risk of refusal.
  • Amount below 101,153.25 are more likely to get approved.
  • Looking at "Canceled" category, all ranges of amounts faced calcellation.
  • Most of the "Unused offer" category is below the median value of "Approved category".
In [102]:
#AMT_GOODS_PRICE_prev
merged_num_univariate_analysis('AMT_GOODS_PRICE_prev',scale='log')
Approved
 count     848,577.00
mean      187,436.21
std       256,747.40
min             0.00
25%        46,170.00
50%        95,208.48
75%       202,500.00
max     5,850,000.00
Name: AMT_GOODS_PRICE_prev, dtype: float64 

Canceled
 count       8,964.00
mean      673,429.16
std       513,562.76
min             0.00
25%       270,000.00
50%       454,500.00
75%       900,000.00
max     3,600,000.00
Name: AMT_GOODS_PRICE_prev, dtype: float64 

Refused
 count     213,868.00
mean      379,200.31
std       444,129.93
min             0.00
25%        90,000.00
50%       225,000.00
75%       454,500.00
max     4,455,000.00
Name: AMT_GOODS_PRICE_prev, dtype: float64 

Unused offer
 count      22,767.00
mean       69,748.76
std        65,648.46
min             0.00
25%        34,231.39
50%        57,780.00
75%        89,955.00
max     3,511,305.00
Name: AMT_GOODS_PRICE_prev, dtype: float64 

Goods Price

  • since all the categories have outliers, we will consider median to infer.
  • Median of "Approved" Category is 95,208.48. Any amount above this have high risk of getting refused or being calcelled.
In [103]:
#CHANNEL_TYPE
merged_cat_univariate_analysis('CHANNEL_TYPE',figsize=(20,6))
plt.xticks(rotation=90)
Approved
 ['Stone' 'Credit and cash offices' 'Country-wide' 'Regional / Local'
 'AP+ (Cash loan)' 'Contact center' 'Channel of corporate sales'
 'Car dealer'] 

Canceled
 ['Credit and cash offices' 'AP+ (Cash loan)' 'Contact center'
 'Country-wide' 'Channel of corporate sales' 'Regional / Local' 'Stone'
 'Car dealer'] 

Refused
 ['Credit and cash offices' 'AP+ (Cash loan)' 'Stone' 'Country-wide'
 'Contact center' 'Channel of corporate sales' 'Regional / Local'
 'Car dealer'] 

Unused offer
 ['Stone' 'Country-wide' 'Regional / Local' 'Credit and cash offices'
 'AP+ (Cash loan)'] 

Approved
 ['Stone' 'Credit and cash offices' 'Country-wide' 'Regional / Local'
 'AP+ (Cash loan)' 'Contact center' 'Channel of corporate sales'
 'Car dealer'] 

+----+----------------------------+---------+-------------+--------------------+----------------------+
|    | CHANNEL_TYPE               |   Count |     Percent |   Cumulative Count |   Cumulative Percent |
|----+----------------------------+---------+-------------+--------------------+----------------------|
|  0 | Country-wide               |  344456 | 0.388733    |             344456 |             0.388733 |
|  1 | Credit and cash offices    |  244475 | 0.2759      |             588931 |             0.664633 |
|  2 | Stone                      |  163654 | 0.18469     |             752585 |             0.849324 |
|  3 | Regional / Local           |   82265 | 0.0928395   |             834850 |             0.942163 |
|  4 | AP+ (Cash loan)            |   27062 | 0.0305406   |             861912 |             0.972704 |
|  5 | Contact center             |   21604 | 0.024381    |             883516 |             0.997085 |
|  6 | Channel of corporate sales |    2319 | 0.00261709  |             885835 |             0.999702 |
|  7 | Car dealer                 |     264 | 0.000297935 |             886099 |             1        |
+----+----------------------------+---------+-------------+--------------------+----------------------+ 

Canceled
 ['Credit and cash offices' 'AP+ (Cash loan)' 'Contact center'
 'Country-wide' 'Channel of corporate sales' 'Regional / Local' 'Stone'
 'Car dealer'] 

+----+----------------------------+---------+-------------+--------------------+----------------------+
|    | CHANNEL_TYPE               |   Count |     Percent |   Cumulative Count |   Cumulative Percent |
|----+----------------------------+---------+-------------+--------------------+----------------------|
|  0 | Credit and cash offices    |  229905 | 0.886155    |             229905 |             0.886155 |
|  1 | Contact center             |   24459 | 0.0942758   |             254364 |             0.980431 |
|  2 | AP+ (Cash loan)            |    3227 | 0.0124383   |             257591 |             0.992869 |
|  3 | Country-wide               |    1555 | 0.00599366  |             259146 |             0.998863 |
|  4 | Channel of corporate sales |     119 | 0.000458678 |             259265 |             0.999322 |
|  5 | Regional / Local           |      85 | 0.000327627 |             259350 |             0.999649 |
|  6 | Stone                      |      58 | 0.000223558 |             259408 |             0.999873 |
|  7 | Car dealer                 |      33 | 0.000127197 |             259441 |             1        |
+----+----------------------------+---------+-------------+--------------------+----------------------+ 

Refused
 ['Credit and cash offices' 'AP+ (Cash loan)' 'Stone' 'Country-wide'
 'Contact center' 'Channel of corporate sales' 'Regional / Local'
 'Car dealer'] 

+----+----------------------------+---------+-------------+--------------------+----------------------+
|    | CHANNEL_TYPE               |   Count |     Percent |   Cumulative Count |   Cumulative Percent |
|----+----------------------------+---------+-------------+--------------------+----------------------|
|  0 | Credit and cash offices    |  125535 | 0.511573    |             125535 |             0.511573 |
|  1 | Country-wide               |   56101 | 0.22862     |             181636 |             0.740193 |
|  2 | AP+ (Cash loan)            |   19330 | 0.0787726   |             200966 |             0.818966 |
|  3 | Stone                      |   18996 | 0.0774115   |             219962 |             0.896377 |
|  4 | Contact center             |   12828 | 0.052276    |             232790 |             0.948653 |
|  5 | Regional / Local           |    9533 | 0.0388484   |             242323 |             0.987502 |
|  6 | Channel of corporate sales |    2958 | 0.0120543   |             245281 |             0.999556 |
|  7 | Car dealer                 |     109 | 0.000444191 |             245390 |             1        |
+----+----------------------------+---------+-------------+--------------------+----------------------+ 

Unused offer
 ['Stone' 'Country-wide' 'Regional / Local' 'Credit and cash offices'
 'AP+ (Cash loan)'] 

+----+-------------------------+---------+-------------+--------------------+----------------------+
|    | CHANNEL_TYPE            |   Count |     Percent |   Cumulative Count |   Cumulative Percent |
|----+-------------------------+---------+-------------+--------------------+----------------------|
|  0 | Country-wide            |   20950 | 0.92003     |              20950 |             0.92003  |
|  1 | Stone                   |     800 | 0.0351324   |              21750 |             0.955162 |
|  2 | Regional / Local        |     589 | 0.0258662   |              22339 |             0.981029 |
|  3 | Credit and cash offices |     427 | 0.0187519   |              22766 |             0.99978  |
|  4 | AP+ (Cash loan)         |       5 | 0.000219578 |              22771 |             1        |
+----+-------------------------+---------+-------------+--------------------+----------------------+ 

Out[103]:
(array([0, 1, 2, 3, 4, 5, 6, 7]), <a list of 8 Text xticklabel objects>)
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values
posx and posy should be finite values

Channel Type

  • Credit and cash offices are playing a vital role in pipe-lining the loan applications.
  • Country_wide has highest rate of Loan Approvals.
  • 75% of Loans are getting approved in (Country_wide + Credit and cash offices + Stone).
In [104]:
#DAYS_DECISION
merged_num_univariate_analysis('DAYS_DECISION')
Approved
 count   886,099.00
mean      1,098.43
std         793.39
min           1.00
25%         454.00
50%         856.00
75%       1,643.00
max       2,922.00
Name: DAYS_DECISION, dtype: float64 

Canceled
 count   259,441.00
mean        221.51
std         135.74
min           2.00
25%         128.00
50%         217.00
75%         305.00
max       2,842.00
Name: DAYS_DECISION, dtype: float64 

Refused
 count   245,390.00
mean        789.08
std         755.78
min           2.00
25%         248.00
50%         462.00
75%       1,102.00
max       2,922.00
Name: DAYS_DECISION, dtype: float64 

Unused offer
 count   22,771.00
mean       885.12
std        607.17
min          4.00
25%        362.00
50%        769.00
75%      1,348.00
max      2,516.00
Name: DAYS_DECISION, dtype: float64 

Days From Previous Application Decision Relative To Current Application

  • Since "Approved" category doesnt have outliers, we will use mean to infer.
  • Mean of "Approved" category is 1,098.43. Any applicant with number of days below 1,098.43 is less likely to get Approved.
In [105]:
#NAME_CLIENT_TYPE
merged_cat_univariate_analysis('NAME_CLIENT_TYPE')
Approved
 ['New' 'Repeater' 'Refreshed' nan] 

Canceled
 ['Repeater' nan 'Refreshed' 'New'] 

Refused
 ['Repeater' 'Refreshed' nan 'New'] 

Unused offer
 ['Repeater' 'Refreshed' 'New' nan] 

Approved
 ['New' 'Repeater' 'Refreshed' nan] 

+----+--------------------+---------+-----------+--------------------+----------------------+
|    | NAME_CLIENT_TYPE   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+--------------------+---------+-----------+--------------------+----------------------|
|  0 | Repeater           |  560378 | 0.632721  |             560378 |             0.632721 |
|  1 | New                |  242395 | 0.273687  |             802773 |             0.906408 |
|  2 | Refreshed          |   82891 | 0.0935919 |             885664 |             1        |
+----+--------------------+---------+-----------+--------------------+----------------------+ 

Canceled
 ['Repeater' nan 'Refreshed' 'New'] 

+----+--------------------+---------+-----------+--------------------+----------------------+
|    | NAME_CLIENT_TYPE   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+--------------------+---------+-----------+--------------------+----------------------|
|  0 | Repeater           |  239618 | 0.926447  |             239618 |             0.926447 |
|  1 | Refreshed          |   16104 | 0.0622637 |             255722 |             0.98871  |
|  2 | New                |    2920 | 0.0112897 |             258642 |             1        |
+----+--------------------+---------+-----------+--------------------+----------------------+ 

Refused
 ['Repeater' 'Refreshed' nan 'New'] 

+----+--------------------+---------+-----------+--------------------+----------------------+
|    | NAME_CLIENT_TYPE   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+--------------------+---------+-----------+--------------------+----------------------|
|  0 | Repeater           |  220215 | 0.898433  |             220215 |             0.898433 |
|  1 | Refreshed          |   12537 | 0.0511485 |             232752 |             0.949582 |
|  2 | New                |   12358 | 0.0504182 |             245110 |             1        |
+----+--------------------+---------+-----------+--------------------+----------------------+ 

Unused offer
 ['Repeater' 'Refreshed' 'New' nan] 

+----+--------------------+---------+-----------+--------------------+----------------------+
|    | NAME_CLIENT_TYPE   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+--------------------+---------+-----------+--------------------+----------------------|
|  0 | Repeater           |   17471 | 0.768226  |              17471 |             0.768226 |
|  1 | Refreshed          |    3404 | 0.149679  |              20875 |             0.917905 |
|  2 | New                |    1867 | 0.0820948 |              22742 |             1        |
+----+--------------------+---------+-----------+--------------------+----------------------+ 

Client Type (New/Refreshed/Repeater)

  • From the above plot, most of the applications belong to "Repeater" Category.
  • 54% of Applications from "Repeater" category are approved.
  • 93% of Applications from "New" category are approved.
  • 72% of Applications from "Refreshed" category are approved.

Overall, applications from "New" category are more likely to get approved.

In [106]:
#NAME_CONTRACT_TYPE_prev
merged_cat_univariate_analysis('NAME_CONTRACT_TYPE_prev')
Approved
 ['Consumer loans' 'Cash loans' 'Revolving loans'] 

Canceled
 ['Revolving loans' 'Cash loans' 'Consumer loans' nan] 

Refused
 ['Cash loans' 'Consumer loans' 'Revolving loans' nan] 

Unused offer
 ['Consumer loans' 'Cash loans' 'Revolving loans'] 

Approved
 ['Consumer loans' 'Cash loans' 'Revolving loans'] 

+----+---------------------------+---------+-----------+--------------------+----------------------+
|    | NAME_CONTRACT_TYPE_prev   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+---------------------------+---------+-----------+--------------------+----------------------|
|  0 | Consumer loans            |  537310 | 0.606377  |             537310 |             0.606377 |
|  1 | Cash loans                |  266381 | 0.300622  |             803691 |             0.906999 |
|  2 | Revolving loans           |   82408 | 0.0930009 |             886099 |             1        |
+----+---------------------------+---------+-----------+--------------------+----------------------+ 

Canceled
 ['Revolving loans' 'Cash loans' 'Consumer loans' nan] 

+----+---------------------------+---------+------------+--------------------+----------------------+
|    | NAME_CONTRACT_TYPE_prev   |   Count |    Percent |   Cumulative Count |   Cumulative Percent |
|----+---------------------------+---------+------------+--------------------+----------------------|
|  0 | Cash loans                |  220383 | 0.850384   |             220383 |             0.850384 |
|  1 | Revolving loans           |   37445 | 0.144488   |             257828 |             0.994872 |
|  2 | Consumer loans            |    1329 | 0.00512817 |             259157 |             1        |
+----+---------------------------+---------+------------+--------------------+----------------------+ 

Refused
 ['Cash loans' 'Consumer loans' 'Revolving loans' nan] 

+----+---------------------------+---------+-----------+--------------------+----------------------+
|    | NAME_CONTRACT_TYPE_prev   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+---------------------------+---------+-----------+--------------------+----------------------|
|  0 | Cash loans                |  139568 |  0.568827 |             139568 |             0.568827 |
|  1 | Consumer loans            |   64282 |  0.261989 |             203850 |             0.830817 |
|  2 | Revolving loans           |   41511 |  0.169183 |             245361 |             1        |
+----+---------------------------+---------+-----------+--------------------+----------------------+ 

Unused offer
 ['Consumer loans' 'Cash loans' 'Revolving loans'] 

+----+---------------------------+---------+-------------+--------------------+----------------------+
|    | NAME_CONTRACT_TYPE_prev   |   Count |     Percent |   Cumulative Count |   Cumulative Percent |
|----+---------------------------+---------+-------------+--------------------+----------------------|
|  0 | Consumer loans            |   22335 | 0.980853    |              22335 |             0.980853 |
|  1 | Cash loans                |     432 | 0.0189715   |              22767 |             0.999824 |
|  2 | Revolving loans           |       4 | 0.000175662 |              22771 |             1        |
+----+---------------------------+---------+-------------+--------------------+----------------------+ 

Previous Contract Type

  • Majority of Contract Type if "Cash loans".
  • 86% of Consumer loans are more likely to get Approved.
  • 42% of Cash loans are more likely to get Approved.
  • 51% of Revolving loans are more likely to get Approved.

Approval rate is very high in Consumer loans

In [107]:
#NAME_PORTFOLIO
merged_cat_univariate_analysis('NAME_PORTFOLIO')
Approved
 ['POS' 'Cash' 'Cards' 'Cars' nan] 

Canceled
 [nan 'Cash' 'Cards' 'POS' 'Cars'] 

Refused
 ['Cash' nan 'POS' 'Cards' 'Cars'] 

Unused offer
 [nan 'POS' 'Cards'] 

posx and posy should be finite values
posx and posy should be finite values
Approved
 ['POS' 'Cash' 'Cards' 'Cars' nan] 

+----+------------------+---------+------------+--------------------+----------------------+
|    | NAME_PORTFOLIO   |   Count |    Percent |   Cumulative Count |   Cumulative Percent |
|----+------------------+---------+------------+--------------------+----------------------|
|  0 | POS              |  537069 | 0.606107   |             537069 |             0.606107 |
|  1 | Cash             |  266378 | 0.30062    |             803447 |             0.906727 |
|  2 | Cards            |   82408 | 0.0930012  |             885855 |             0.999728 |
|  3 | Cars             |     241 | 0.00027198 |             886096 |             1        |
+----+------------------+---------+------------+--------------------+----------------------+ 

Canceled
 [nan 'Cash' 'Cards' 'POS' 'Cars'] 

+----+------------------+---------+------------+--------------------+----------------------+
|    | NAME_PORTFOLIO   |   Count |    Percent |   Cumulative Count |   Cumulative Percent |
|----+------------------+---------+------------+--------------------+----------------------|
|  0 | Cash             |    8183 | 0.934132   |               8183 |             0.934132 |
|  1 | Cards            |     417 | 0.0476027  |               8600 |             0.981735 |
|  2 | POS              |     127 | 0.0144977  |               8727 |             0.996233 |
|  3 | Cars             |      33 | 0.00376712 |               8760 |             1        |
+----+------------------+---------+------------+--------------------+----------------------+ 

Refused
 ['Cash' nan 'POS' 'Cards' 'Cars'] 

+----+------------------+---------+-------------+--------------------+----------------------+
|    | NAME_PORTFOLIO   |   Count |     Percent |   Cumulative Count |   Cumulative Percent |
|----+------------------+---------+-------------+--------------------+----------------------|
|  0 | Cash             |  117102 | 0.555383    |             117102 |             0.555383 |
|  1 | POS              |   54482 | 0.258393    |             171584 |             0.813777 |
|  2 | Cards            |   39159 | 0.185721    |             210743 |             0.999497 |
|  3 | Cars             |     106 | 0.000502729 |             210849 |             1        |
+----+------------------+---------+-------------+--------------------+----------------------+ 

Unused offer
 [nan 'POS' 'Cards'] 

+----+------------------+---------+------------+--------------------+----------------------+
|    | NAME_PORTFOLIO   |   Count |    Percent |   Cumulative Count |   Cumulative Percent |
|----+------------------+---------+------------+--------------------+----------------------|
|  0 | POS              |     782 | 0.998723   |                782 |             0.998723 |
|  1 | Cards            |       1 | 0.00127714 |                783 |             1        |
+----+------------------+---------+------------+--------------------+----------------------+ 

posx and posy should be finite values
posx and posy should be finite values

Name Portfolio

  • Majority of the loans come under "POS" category.
  • Percentage of loans approved in each category POS>CASH>CARDS>CARS -- 91%>68%>67%>63%.
In [108]:
#NAME_YIELD_GROUP
merged_cat_univariate_analysis('NAME_YIELD_GROUP')
Approved
 ['low_normal' 'middle' 'high' nan 'low_action'] 

Canceled
 [nan 'low_normal' 'middle' 'high' 'low_action'] 

Refused
 ['low_normal' nan 'high' 'low_action' 'middle'] 

Unused offer
 [nan 'low_normal' 'middle' 'high'] 

posx and posy should be finite values
Approved
 ['low_normal' 'middle' 'high' nan 'low_action'] 

+----+--------------------+---------+-----------+--------------------+----------------------+
|    | NAME_YIELD_GROUP   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+--------------------+---------+-----------+--------------------+----------------------|
|  0 | middle             |  273331 | 0.340096  |             273331 |             0.340096 |
|  1 | high               |  258996 | 0.322259  |             532327 |             0.662355 |
|  2 | low_normal         |  210731 | 0.262205  |             743058 |             0.92456  |
|  3 | low_action         |   60630 | 0.0754397 |             803688 |             1        |
+----+--------------------+---------+-----------+--------------------+----------------------+ 

Canceled
 [nan 'low_normal' 'middle' 'high' 'low_action'] 

+----+--------------------+---------+-----------+--------------------+----------------------+
|    | NAME_YIELD_GROUP   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+--------------------+---------+-----------+--------------------+----------------------|
|  0 | low_normal         |    4087 | 0.489872  |               4087 |             0.489872 |
|  1 | middle             |    2629 | 0.315114  |               6716 |             0.804986 |
|  2 | low_action         |     821 | 0.0984058 |               7537 |             0.903392 |
|  3 | high               |     806 | 0.0966079 |               8343 |             1        |
+----+--------------------+---------+-----------+--------------------+----------------------+ 

Refused
 ['low_normal' nan 'high' 'low_action' 'middle'] 

+----+--------------------+---------+-----------+--------------------+----------------------+
|    | NAME_YIELD_GROUP   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+--------------------+---------+-----------+--------------------+----------------------|
|  0 | low_normal         |   59510 | 0.346613  |              59510 |             0.346613 |
|  1 | middle             |   48847 | 0.284507  |             108357 |             0.63112  |
|  2 | high               |   46197 | 0.269072  |             154554 |             0.900192 |
|  3 | low_action         |   17136 | 0.0998078 |             171690 |             1        |
+----+--------------------+---------+-----------+--------------------+----------------------+ 

Unused offer
 [nan 'low_normal' 'middle' 'high'] 

+----+--------------------+---------+-----------+--------------------+----------------------+
|    | NAME_YIELD_GROUP   |   Count |   Percent |   Cumulative Count |   Cumulative Percent |
|----+--------------------+---------+-----------+--------------------+----------------------|
|  0 | low_normal         |     556 | 0.710997  |                556 |             0.710997 |
|  1 | middle             |     164 | 0.209719  |                720 |             0.920716 |
|  2 | high               |      62 | 0.0792839 |                782 |             1        |
+----+--------------------+---------+-----------+--------------------+----------------------+ 

posx and posy should be finite values

Grouped Interest Rate

  • Major applicants come under "middle" Catrgory.
  • Approval percentage of "low_normal" and "low_action" is similar (77%).
  • Loarge number of loans are approved in "middle" category (85%).

Bivariate Analysis

In [109]:
merged_data.columns
Out[109]:
Index(['AMT_ANNUITY_curr', 'AMT_INCOME_TOTAL', 'AMT_CREDIT_curr',
       'AMT_GOODS_PRICE_curr', 'NAME_CONTRACT_TYPE_curr', 'CODE_GENDER',
       'NAME_INCOME_TYPE', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE',
       'SK_ID_CURR', 'AGE_YEARS', 'AMT_INCOME_CAT', 'AMT_CREDIT_RANGE',
       'AMT_ANNUITY_prev', 'AMT_APPLICATION', 'AMT_CREDIT_prev',
       'AMT_GOODS_PRICE_prev', 'CHANNEL_TYPE', 'DAYS_DECISION',
       'NAME_CLIENT_TYPE', 'NAME_CONTRACT_STATUS', 'NAME_CONTRACT_TYPE_prev',
       'NAME_PORTFOLIO', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
       'SK_ID_PREV', 'AMT_CREDIT_prev_cat', 'AMT_APPLICATION_cat'],
      dtype='object')
In [110]:
# AMT_ANNUITY_prev vs AMT_INCOME_CAT vs NAME_CONTRACT_STATUS
plt.figure(figsize=[20,12])
plt.title('Annuity Amount vs Income Category vs Loan Application Results')
sns.barplot(x='AMT_ANNUITY_prev', y = 'AMT_INCOME_CAT', hue='NAME_CONTRACT_STATUS', data=merged_data)
Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f96d6b8bb50>

Amount of Annuity & Income Category

  • Loans with higher annuity are approved for clients with higher income.
  • Loan refusals follow the opposite trend
  • 450000 - 475000 has higher rate of cancellations compared to the adjacent income ranges.

Further analysis of income range 450000 - 475000 might bring the bank high income low risk loans (with possible high annuities)

In [111]:
# AMT_CREDIT_prev - AMT_APPLICATION vs NAME_YIELD_GROUP vs NAME_CONTRACT_STATUS

merged_data['AMT_DIFF'] = merged_data['AMT_CREDIT_prev'] - merged_data['AMT_APPLICATION']
plt.figure(figsize=[8,8])
plt.title('Difference between Approved Loan and Applied Loan vs Interest Rate Category vs Loan Application Results')
sns.barplot(y='NAME_YIELD_GROUP', x = 'AMT_DIFF', hue='NAME_CONTRACT_STATUS', data=merged_data)
Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f961e049810>

Difference between Approved Loan and Applied Loan & Interest Rate Category

  • It can be seen that a huge proportion of low interest category loans are being refused by the bank
  • And only high interest rate loans are unused by the clients.
  • Among all interest categories, only 'low_normal' has higher refusals compared to client cancellations.
In [112]:
# DAYS_DECISION vs NAME_CONTRACT_TYPE vs NAME_CONTRACT_STATUS
plt.figure(figsize=[8,8])
plt.title('Processing Time vs Client Type vs Loan Application Results')
sns.barplot(y='NAME_CONTRACT_TYPE_prev', x = 'DAYS_DECISION', hue='NAME_CONTRACT_STATUS', data=merged_data)
Out[112]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f95f29d0a50>

Processing Time & Loan Type

  • Consumer Loans have the longest decision period followed by cash loans followed by Revolving Loans , which makes business sense.
  • A huge proportion of consumer loans are being cancelled by the client because of relatively high processing time.
  • Cash loans that take longer to process are more unused than other loans.

The bank should expedite the process of approving cash loans and Consumer Loans

In [113]:
# NAME_PORTFOLIO vs NAME_YIELD_GROUP vs NAME_CONTRACT_STATUS
#merged_data.groupby(['NAME_PORTFOLIO','NAME_YIELD_GROUP'])['NAME_CONTRACT_STATUS'].value_counts(normalize=True).plot.bar()
merged_data.groupby(['NAME_PORTFOLIO','NAME_YIELD_GROUP'])['NAME_CONTRACT_STATUS'].value_counts(normalize=True)\
.unstack()\
   .plot( 
    layout=(2,2),
    figsize=(8,6), kind='barh', stacked=True);

Portfolio & Interest Rate

  • From the above plot, we can see that POS loans are approved irrespective of interest rate. That is POS loans are popular among clients irrespective of interest rate.
  • Car loans at medium interest rate are mostly refused by the bank
  • Highest cancellation rates are found in car loans offered at low interest rates.

This could be further correlated with the difference in approved amount and applied amount to gather useful insights

In [114]:
#AMT_CREDIT_prev & DAYS_EMPLOYED vs NAME_CONTRACT_STATUS

plt.figure(figsize=[10,8])
plt.xticks(rotation=45)
sns.barplot(y='DAYS_EMPLOYED', x = 'NAME_EDUCATION_TYPE', hue='NAME_CONTRACT_STATUS', data=merged_data)

plt.yscale('log')

Education & Days Employed

  • There are very low unused offers among clients with an Academic degree and recent employment
In [115]:
merged_data.columns
Out[115]:
Index(['AMT_ANNUITY_curr', 'AMT_INCOME_TOTAL', 'AMT_CREDIT_curr',
       'AMT_GOODS_PRICE_curr', 'NAME_CONTRACT_TYPE_curr', 'CODE_GENDER',
       'NAME_INCOME_TYPE', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE',
       'SK_ID_CURR', 'AGE_YEARS', 'AMT_INCOME_CAT', 'AMT_CREDIT_RANGE',
       'AMT_ANNUITY_prev', 'AMT_APPLICATION', 'AMT_CREDIT_prev',
       'AMT_GOODS_PRICE_prev', 'CHANNEL_TYPE', 'DAYS_DECISION',
       'NAME_CLIENT_TYPE', 'NAME_CONTRACT_STATUS', 'NAME_CONTRACT_TYPE_prev',
       'NAME_PORTFOLIO', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION',
       'SK_ID_PREV', 'AMT_CREDIT_prev_cat', 'AMT_APPLICATION_cat', 'AMT_DIFF'],
      dtype='object')
In [116]:
#### NAME_EDUCATION_TYPE vs NAME_CLIENT_TYPE vs NAME_CONTRACT_STATUS
merged_data.groupby(['NAME_EDUCATION_TYPE','NAME_CLIENT_TYPE'])['NAME_CONTRACT_STATUS'].value_counts(normalize=True)\
.unstack()\
   .plot( 
    layout=(2,2),
    figsize=(8,6), kind='barh', stacked=True);

Education & Client Type

  • New customers with Lower Secondary education are the most approved borrowers and they almost always take the loan offer.
  • New customers with Academic degree come a close second in approval rate.
  • Among repeat customers, Academic Degree holding customers have the highest approval rate.
  • 'Refreshed' customers have a uniform approval rate across all education types.
  • Refereshed customers with Incomplete higher education have the maximum unused offers
  • Repeat customers have a uniform cancellation rate that's higher than New and Refreshed customers

Bank should look into the reasons why Repeat customers who are cancelling their offers. It could also focus on loans catered to academic degree holders who are first time customers

In [117]:
#CODE_GENDER vs CHANNEL_TYPE vs NAME_CONTRACT_STATUS

merged_data.groupby(['NAME_INCOME_TYPE','CODE_GENDER'])['NAME_CONTRACT_STATUS'].value_counts(normalize=True)\
.unstack()\
   .plot( 
    layout=(2,2),
    figsize=(8,6), kind='barh', stacked=True);

Gender & Channel Type

  • There is a mark difference in loan approval rates of Male and Female students.
  • Also, unemployed Females have a much lower rejection rate than Unemployed males.

Correlation Analysis

In [118]:
def correlation(dataframe) : 
    cor0=dataframe.corr()
    type(cor0)
    cor0.where(np.triu(np.ones(cor0.shape),k=1).astype(np.bool))
    cor0=cor0.unstack().reset_index()
    cor0.columns=['VAR1','VAR2','CORR']
    cor0.dropna(subset=['CORR'], inplace=True)
    cor0.CORR=round(cor0['CORR'],2)
    cor0.CORR=cor0.CORR.abs()
    cor0.sort_values(by=['CORR'],ascending=False)
    cor0=cor0[~(cor0['VAR1']==cor0['VAR2'])]
    return pd.DataFrame(cor0.sort_values(by=['CORR'],ascending=False))

Correlation for 'Approved' Loans

In [119]:
# Correlation for Approved
# Absolute values are reported 
pd.set_option('precision', 2)
cor_0 = correlation(merged_a)
cor_0.style.background_gradient(cmap='GnBu').hide_index()
Out[119]:
VAR1 VAR2 CORR
AMT_APPLICATION AMT_GOODS_PRICE_prev 1.00
AMT_GOODS_PRICE_prev AMT_APPLICATION 1.00
AMT_CREDIT_curr AMT_GOODS_PRICE_curr 0.99
AMT_CREDIT_prev AMT_GOODS_PRICE_prev 0.99
AMT_GOODS_PRICE_curr AMT_CREDIT_curr 0.99
AMT_GOODS_PRICE_prev AMT_CREDIT_prev 0.99
AMT_APPLICATION AMT_CREDIT_prev 0.96
AMT_CREDIT_prev AMT_APPLICATION 0.96
AMT_ANNUITY_prev AMT_GOODS_PRICE_prev 0.83
AMT_ANNUITY_prev AMT_CREDIT_prev 0.83
AMT_GOODS_PRICE_prev AMT_ANNUITY_prev 0.83
AMT_CREDIT_prev AMT_ANNUITY_prev 0.83
AMT_APPLICATION AMT_ANNUITY_prev 0.81
AMT_ANNUITY_prev AMT_APPLICATION 0.81
AMT_GOODS_PRICE_curr AMT_ANNUITY_curr 0.77
AMT_ANNUITY_curr AMT_GOODS_PRICE_curr 0.77
AMT_CREDIT_curr AMT_ANNUITY_curr 0.76
AMT_ANNUITY_curr AMT_CREDIT_curr 0.76
DAYS_EMPLOYED AGE_YEARS 0.64
AGE_YEARS DAYS_EMPLOYED 0.64
DAYS_DECISION AMT_CREDIT_prev 0.27
AMT_CREDIT_prev DAYS_DECISION 0.27
AMT_APPLICATION DAYS_DECISION 0.27
DAYS_DECISION AMT_APPLICATION 0.27
AMT_ANNUITY_prev DAYS_DECISION 0.26
DAYS_DECISION AMT_ANNUITY_prev 0.26
AMT_GOODS_PRICE_prev DAYS_DECISION 0.26
DAYS_DECISION AMT_GOODS_PRICE_prev 0.26
AMT_ANNUITY_curr AMT_INCOME_TOTAL 0.20
AMT_ANNUITY_prev AMT_ANNUITY_curr 0.20
AMT_ANNUITY_curr AMT_ANNUITY_prev 0.20
AMT_INCOME_TOTAL AMT_ANNUITY_curr 0.20
AMT_GOODS_PRICE_curr AMT_INCOME_TOTAL 0.16
AMT_INCOME_TOTAL AMT_CREDIT_curr 0.16
AMT_INCOME_TOTAL AMT_GOODS_PRICE_curr 0.16
AMT_CREDIT_curr AMT_INCOME_TOTAL 0.16
AMT_ANNUITY_prev AMT_CREDIT_curr 0.15
AMT_CREDIT_curr AMT_ANNUITY_prev 0.15
AMT_GOODS_PRICE_curr AMT_ANNUITY_prev 0.15
AMT_ANNUITY_prev AMT_GOODS_PRICE_curr 0.15
AMT_APPLICATION AMT_ANNUITY_curr 0.13
AMT_GOODS_PRICE_curr AMT_GOODS_PRICE_prev 0.13
AMT_APPLICATION AMT_GOODS_PRICE_curr 0.13
AMT_ANNUITY_curr AMT_APPLICATION 0.13
AMT_GOODS_PRICE_curr AMT_APPLICATION 0.13
AMT_GOODS_PRICE_prev AMT_GOODS_PRICE_curr 0.13
AMT_ANNUITY_curr AMT_GOODS_PRICE_prev 0.13
AMT_GOODS_PRICE_prev AMT_CREDIT_curr 0.13
AMT_GOODS_PRICE_prev AMT_ANNUITY_curr 0.13
AMT_CREDIT_curr AMT_GOODS_PRICE_prev 0.13
AMT_GOODS_PRICE_curr AMT_CREDIT_prev 0.12
AMT_CREDIT_prev AMT_ANNUITY_curr 0.12
AGE_YEARS AMT_APPLICATION 0.12
AGE_YEARS AMT_CREDIT_prev 0.12
AGE_YEARS AMT_GOODS_PRICE_prev 0.12
AMT_GOODS_PRICE_prev AGE_YEARS 0.12
AMT_CREDIT_curr AMT_APPLICATION 0.12
AMT_CREDIT_curr AMT_CREDIT_prev 0.12
AMT_APPLICATION AMT_CREDIT_curr 0.12
AMT_CREDIT_prev AGE_YEARS 0.12
AMT_APPLICATION AGE_YEARS 0.12
AMT_CREDIT_prev AMT_GOODS_PRICE_curr 0.12
AMT_CREDIT_prev AMT_CREDIT_curr 0.12
AMT_ANNUITY_curr AMT_CREDIT_prev 0.12
AMT_ANNUITY_curr DAYS_EMPLOYED 0.11
AMT_ANNUITY_prev AMT_INCOME_TOTAL 0.11
DAYS_EMPLOYED AMT_ANNUITY_curr 0.11
AMT_INCOME_TOTAL AMT_ANNUITY_prev 0.11
AMT_ANNUITY_prev AGE_YEARS 0.09
AGE_YEARS AMT_ANNUITY_prev 0.09
AMT_INCOME_TOTAL AMT_APPLICATION 0.08
AMT_APPLICATION AMT_INCOME_TOTAL 0.08
AMT_INCOME_TOTAL AMT_GOODS_PRICE_prev 0.08
AMT_GOODS_PRICE_prev AMT_INCOME_TOTAL 0.08
AMT_CREDIT_prev AMT_INCOME_TOTAL 0.08
AMT_INCOME_TOTAL AMT_CREDIT_prev 0.08
AMT_CREDIT_curr DAYS_EMPLOYED 0.07
AMT_INCOME_TOTAL DAYS_EMPLOYED 0.07
AMT_GOODS_PRICE_curr DAYS_EMPLOYED 0.07
DAYS_EMPLOYED AMT_INCOME_TOTAL 0.07
DAYS_EMPLOYED AMT_CREDIT_curr 0.07
DAYS_EMPLOYED AMT_GOODS_PRICE_curr 0.07
AMT_CREDIT_curr DAYS_DECISION 0.06
DAYS_DECISION AMT_CREDIT_curr 0.06
AMT_GOODS_PRICE_curr DAYS_DECISION 0.06
DAYS_DECISION AMT_GOODS_PRICE_curr 0.06
DAYS_DECISION AGE_YEARS 0.06
AGE_YEARS DAYS_DECISION 0.06
DAYS_EMPLOYED AMT_APPLICATION 0.04
AMT_GOODS_PRICE_prev DAYS_EMPLOYED 0.04
AMT_ANNUITY_curr AGE_YEARS 0.04
AMT_ANNUITY_curr DAYS_DECISION 0.04
AGE_YEARS AMT_ANNUITY_curr 0.04
DAYS_EMPLOYED AMT_GOODS_PRICE_prev 0.04
AMT_APPLICATION DAYS_EMPLOYED 0.04
DAYS_DECISION AMT_ANNUITY_curr 0.04
AMT_CREDIT_prev DAYS_EMPLOYED 0.03
AGE_YEARS AMT_GOODS_PRICE_curr 0.03
AGE_YEARS AMT_CREDIT_curr 0.03
AGE_YEARS AMT_INCOME_TOTAL 0.03
AMT_INCOME_TOTAL AGE_YEARS 0.03
AMT_GOODS_PRICE_curr AGE_YEARS 0.03
DAYS_EMPLOYED AMT_CREDIT_prev 0.03
AMT_CREDIT_curr AGE_YEARS 0.03
SK_ID_PREV AMT_ANNUITY_prev 0.01
SK_ID_PREV AMT_GOODS_PRICE_prev 0.01
SK_ID_PREV AMT_CREDIT_prev 0.01
AMT_CREDIT_prev SK_ID_PREV 0.01
AMT_ANNUITY_prev SK_ID_PREV 0.01
AMT_GOODS_PRICE_prev SK_ID_PREV 0.01
SK_ID_PREV AMT_ANNUITY_curr 0.00
AMT_INCOME_TOTAL SK_ID_CURR 0.00
SK_ID_PREV AGE_YEARS 0.00
DAYS_DECISION AMT_INCOME_TOTAL 0.00
SK_ID_PREV AMT_APPLICATION 0.00
DAYS_DECISION SK_ID_PREV 0.00
SK_ID_PREV SK_ID_CURR 0.00
SK_ID_PREV DAYS_EMPLOYED 0.00
SK_ID_PREV AMT_GOODS_PRICE_curr 0.00
DAYS_DECISION DAYS_EMPLOYED 0.00
DAYS_DECISION SK_ID_CURR 0.00
SK_ID_PREV AMT_CREDIT_curr 0.00
SK_ID_PREV AMT_INCOME_TOTAL 0.00
AMT_ANNUITY_curr SK_ID_PREV 0.00
AMT_ANNUITY_curr SK_ID_CURR 0.00
SK_ID_CURR DAYS_DECISION 0.00
AMT_CREDIT_prev SK_ID_CURR 0.00
AMT_GOODS_PRICE_prev SK_ID_CURR 0.00
AMT_ANNUITY_prev SK_ID_CURR 0.00
SK_ID_CURR AMT_GOODS_PRICE_prev 0.00
AGE_YEARS SK_ID_CURR 0.00
SK_ID_CURR AMT_CREDIT_prev 0.00
SK_ID_CURR AMT_APPLICATION 0.00
SK_ID_CURR AMT_ANNUITY_prev 0.00
SK_ID_CURR AGE_YEARS 0.00
AGE_YEARS SK_ID_PREV 0.00
SK_ID_CURR DAYS_EMPLOYED 0.00
SK_ID_CURR AMT_GOODS_PRICE_curr 0.00
SK_ID_CURR AMT_CREDIT_curr 0.00
SK_ID_CURR AMT_INCOME_TOTAL 0.00
AMT_ANNUITY_prev DAYS_EMPLOYED 0.00
SK_ID_CURR AMT_ANNUITY_curr 0.00
AMT_INCOME_TOTAL DAYS_DECISION 0.00
DAYS_EMPLOYED SK_ID_PREV 0.00
DAYS_EMPLOYED DAYS_DECISION 0.00
DAYS_EMPLOYED AMT_ANNUITY_prev 0.00
DAYS_EMPLOYED SK_ID_CURR 0.00
AMT_APPLICATION SK_ID_CURR 0.00
AMT_GOODS_PRICE_curr SK_ID_PREV 0.00
AMT_APPLICATION SK_ID_PREV 0.00
AMT_GOODS_PRICE_curr SK_ID_CURR 0.00
SK_ID_CURR SK_ID_PREV 0.00
AMT_CREDIT_curr SK_ID_PREV 0.00
AMT_CREDIT_curr SK_ID_CURR 0.00
AMT_INCOME_TOTAL SK_ID_PREV 0.00
SK_ID_PREV DAYS_DECISION 0.00

Correlation for 'Cancelled' Loans

In [120]:
# Correlation for Cancelled
# Absolute values are reported 
pd.set_option('precision', 2)
cor_0 = correlation(merged_c)
cor_0.style.background_gradient(cmap='GnBu').hide_index()
Out[120]:
VAR1 VAR2 CORR
AMT_APPLICATION AMT_GOODS_PRICE_prev 1.00
AMT_APPLICATION AMT_CREDIT_prev 1.00
AMT_CREDIT_prev AMT_APPLICATION 1.00
AMT_GOODS_PRICE_prev AMT_APPLICATION 1.00
AMT_GOODS_PRICE_curr AMT_CREDIT_curr 0.99
AMT_CREDIT_prev AMT_GOODS_PRICE_prev 0.99
AMT_CREDIT_curr AMT_GOODS_PRICE_curr 0.99
AMT_GOODS_PRICE_prev AMT_CREDIT_prev 0.99
AMT_ANNUITY_prev AMT_CREDIT_prev 0.79
AMT_ANNUITY_prev AMT_GOODS_PRICE_prev 0.79
AMT_APPLICATION AMT_ANNUITY_prev 0.79
AMT_GOODS_PRICE_prev AMT_ANNUITY_prev 0.79
AMT_CREDIT_prev AMT_ANNUITY_prev 0.79
AMT_ANNUITY_prev AMT_APPLICATION 0.79
AMT_GOODS_PRICE_curr AMT_ANNUITY_curr 0.76
AMT_ANNUITY_curr AMT_GOODS_PRICE_curr 0.76
AMT_ANNUITY_curr AMT_CREDIT_curr 0.75
AMT_CREDIT_curr AMT_ANNUITY_curr 0.75
AGE_YEARS DAYS_EMPLOYED 0.64
DAYS_EMPLOYED AGE_YEARS 0.64
AMT_ANNUITY_curr AMT_INCOME_TOTAL 0.40
AMT_INCOME_TOTAL AMT_ANNUITY_curr 0.40
AMT_ANNUITY_curr AMT_ANNUITY_prev 0.39
AMT_ANNUITY_prev AMT_ANNUITY_curr 0.39
AMT_INCOME_TOTAL AMT_ANNUITY_prev 0.37
AMT_ANNUITY_prev AMT_INCOME_TOTAL 0.37
AMT_GOODS_PRICE_curr AMT_GOODS_PRICE_prev 0.36
AMT_GOODS_PRICE_prev AMT_GOODS_PRICE_curr 0.36
AMT_GOODS_PRICE_prev AMT_CREDIT_curr 0.35
AMT_CREDIT_curr AMT_GOODS_PRICE_prev 0.35
AMT_GOODS_PRICE_curr AMT_ANNUITY_prev 0.33
AMT_ANNUITY_prev AMT_GOODS_PRICE_curr 0.33
AMT_ANNUITY_prev AMT_CREDIT_curr 0.33
AMT_CREDIT_curr AMT_ANNUITY_prev 0.33
AMT_INCOME_TOTAL AMT_CREDIT_curr 0.32
AMT_INCOME_TOTAL AMT_GOODS_PRICE_curr 0.32
AMT_CREDIT_curr AMT_INCOME_TOTAL 0.32
AMT_GOODS_PRICE_curr AMT_INCOME_TOTAL 0.32
AMT_ANNUITY_curr AMT_GOODS_PRICE_prev 0.29
AMT_GOODS_PRICE_prev AMT_ANNUITY_curr 0.29
AMT_GOODS_PRICE_prev AMT_INCOME_TOTAL 0.28
AMT_INCOME_TOTAL AMT_GOODS_PRICE_prev 0.28
AMT_INCOME_TOTAL DAYS_EMPLOYED 0.13
DAYS_EMPLOYED AMT_INCOME_TOTAL 0.13
AMT_ANNUITY_prev DAYS_EMPLOYED 0.10
AMT_ANNUITY_curr DAYS_EMPLOYED 0.10
DAYS_EMPLOYED AMT_ANNUITY_curr 0.10
DAYS_EMPLOYED AMT_ANNUITY_prev 0.10
AMT_APPLICATION DAYS_DECISION 0.08
DAYS_DECISION AMT_APPLICATION 0.08
AMT_CREDIT_curr AMT_APPLICATION 0.07
DAYS_DECISION AMT_CREDIT_prev 0.07
AMT_APPLICATION AMT_CREDIT_curr 0.07
AMT_APPLICATION AMT_GOODS_PRICE_curr 0.07
AMT_GOODS_PRICE_prev AGE_YEARS 0.07
AGE_YEARS AMT_GOODS_PRICE_prev 0.07
AMT_CREDIT_prev AMT_CREDIT_curr 0.07
AMT_CREDIT_prev AMT_GOODS_PRICE_curr 0.07
AMT_CREDIT_prev DAYS_DECISION 0.07
AMT_GOODS_PRICE_curr AMT_CREDIT_prev 0.07
AMT_GOODS_PRICE_curr AMT_APPLICATION 0.07
AMT_CREDIT_curr AMT_CREDIT_prev 0.07
AGE_YEARS AMT_GOODS_PRICE_curr 0.05
AMT_ANNUITY_curr DAYS_DECISION 0.05
AMT_CREDIT_prev AMT_ANNUITY_curr 0.05
AMT_GOODS_PRICE_prev DAYS_EMPLOYED 0.05
AMT_APPLICATION AMT_ANNUITY_curr 0.05
AMT_ANNUITY_curr AMT_CREDIT_prev 0.05
AMT_ANNUITY_curr AMT_APPLICATION 0.05
AMT_CREDIT_curr DAYS_EMPLOYED 0.05
DAYS_DECISION AMT_ANNUITY_curr 0.05
AMT_GOODS_PRICE_curr AGE_YEARS 0.05
DAYS_EMPLOYED AMT_CREDIT_curr 0.05
AMT_CREDIT_curr DAYS_DECISION 0.05
DAYS_DECISION AMT_CREDIT_curr 0.05
DAYS_EMPLOYED AMT_GOODS_PRICE_prev 0.05
AMT_CREDIT_curr AGE_YEARS 0.04
AMT_INCOME_TOTAL AGE_YEARS 0.04
DAYS_EMPLOYED AMT_GOODS_PRICE_curr 0.04
DAYS_DECISION AMT_GOODS_PRICE_prev 0.04
AGE_YEARS AMT_INCOME_TOTAL 0.04
AMT_GOODS_PRICE_prev DAYS_DECISION 0.04
AMT_GOODS_PRICE_curr DAYS_DECISION 0.04
AMT_GOODS_PRICE_curr DAYS_EMPLOYED 0.04
DAYS_DECISION AMT_GOODS_PRICE_curr 0.04
AGE_YEARS AMT_CREDIT_curr 0.04
AMT_ANNUITY_curr AGE_YEARS 0.03
AMT_INCOME_TOTAL DAYS_DECISION 0.03
DAYS_DECISION AMT_INCOME_TOTAL 0.03
AGE_YEARS AMT_ANNUITY_curr 0.03
AMT_CREDIT_prev DAYS_EMPLOYED 0.02
AMT_CREDIT_prev AMT_INCOME_TOTAL 0.02
DAYS_EMPLOYED AMT_APPLICATION 0.02
DAYS_EMPLOYED AMT_CREDIT_prev 0.02
AMT_APPLICATION DAYS_EMPLOYED 0.02
AMT_APPLICATION AMT_INCOME_TOTAL 0.02
AMT_INCOME_TOTAL AMT_APPLICATION 0.02
AMT_INCOME_TOTAL AMT_CREDIT_prev 0.02
AMT_ANNUITY_prev SK_ID_PREV 0.01
AMT_APPLICATION SK_ID_PREV 0.01
DAYS_DECISION DAYS_EMPLOYED 0.01
AMT_CREDIT_prev SK_ID_PREV 0.01
AMT_GOODS_PRICE_prev SK_ID_PREV 0.01
SK_ID_PREV AMT_GOODS_PRICE_prev 0.01
AGE_YEARS SK_ID_PREV 0.01
SK_ID_PREV AGE_YEARS 0.01
AMT_ANNUITY_prev DAYS_DECISION 0.01
SK_ID_PREV AMT_ANNUITY_prev 0.01
DAYS_DECISION AMT_ANNUITY_prev 0.01
SK_ID_PREV AMT_CREDIT_prev 0.01
DAYS_EMPLOYED DAYS_DECISION 0.01
SK_ID_PREV AMT_APPLICATION 0.01
DAYS_DECISION AGE_YEARS 0.00
SK_ID_PREV SK_ID_CURR 0.00
SK_ID_PREV AMT_CREDIT_curr 0.00
DAYS_DECISION SK_ID_CURR 0.00
SK_ID_PREV AMT_GOODS_PRICE_curr 0.00
SK_ID_PREV DAYS_EMPLOYED 0.00
AMT_GOODS_PRICE_prev SK_ID_CURR 0.00
SK_ID_PREV AMT_INCOME_TOTAL 0.00
SK_ID_PREV AMT_ANNUITY_curr 0.00
DAYS_DECISION SK_ID_PREV 0.00
AGE_YEARS AMT_ANNUITY_prev 0.00
AMT_CREDIT_prev AGE_YEARS 0.00
SK_ID_CURR DAYS_EMPLOYED 0.00
AMT_ANNUITY_curr SK_ID_CURR 0.00
AMT_ANNUITY_curr SK_ID_PREV 0.00
AMT_INCOME_TOTAL SK_ID_CURR 0.00
AMT_INCOME_TOTAL SK_ID_PREV 0.00
AMT_CREDIT_curr SK_ID_CURR 0.00
AMT_CREDIT_curr SK_ID_PREV 0.00
AMT_GOODS_PRICE_curr SK_ID_CURR 0.00
AMT_GOODS_PRICE_curr SK_ID_PREV 0.00
DAYS_EMPLOYED SK_ID_CURR 0.00
DAYS_EMPLOYED SK_ID_PREV 0.00
SK_ID_CURR AMT_ANNUITY_curr 0.00
SK_ID_CURR AMT_INCOME_TOTAL 0.00
SK_ID_CURR AMT_CREDIT_curr 0.00
SK_ID_CURR AMT_GOODS_PRICE_curr 0.00
SK_ID_CURR AGE_YEARS 0.00
AMT_CREDIT_prev SK_ID_CURR 0.00
SK_ID_CURR AMT_ANNUITY_prev 0.00
SK_ID_CURR AMT_APPLICATION 0.00
SK_ID_CURR AMT_CREDIT_prev 0.00
SK_ID_CURR AMT_GOODS_PRICE_prev 0.00
SK_ID_CURR DAYS_DECISION 0.00
SK_ID_CURR SK_ID_PREV 0.00
AGE_YEARS SK_ID_CURR 0.00
AGE_YEARS AMT_APPLICATION 0.00
AGE_YEARS AMT_CREDIT_prev 0.00
AGE_YEARS DAYS_DECISION 0.00
AMT_ANNUITY_prev SK_ID_CURR 0.00
AMT_ANNUITY_prev AGE_YEARS 0.00
AMT_APPLICATION SK_ID_CURR 0.00
AMT_APPLICATION AGE_YEARS 0.00
SK_ID_PREV DAYS_DECISION 0.00

Correlation for 'Refused' Loans

In [121]:
# Correlation for Refused
# Absolute values are reported 
pd.set_option('precision', 2)
cor_0 = correlation(merged_r)
cor_0.style.background_gradient(cmap='GnBu').hide_index()
Out[121]:
VAR1 VAR2 CORR
AMT_GOODS_PRICE_prev AMT_APPLICATION 1.00
AMT_APPLICATION AMT_GOODS_PRICE_prev 1.00
AMT_GOODS_PRICE_prev AMT_CREDIT_prev 0.99
AMT_CREDIT_prev AMT_GOODS_PRICE_prev 0.99
AMT_GOODS_PRICE_curr AMT_CREDIT_curr 0.99
AMT_CREDIT_curr AMT_GOODS_PRICE_curr 0.99
AMT_APPLICATION AMT_CREDIT_prev 0.98
AMT_CREDIT_prev AMT_APPLICATION 0.98
AMT_ANNUITY_prev AMT_GOODS_PRICE_prev 0.83
AMT_GOODS_PRICE_prev AMT_ANNUITY_prev 0.83
AMT_ANNUITY_prev AMT_CREDIT_prev 0.82
AMT_CREDIT_prev AMT_ANNUITY_prev 0.82
AMT_ANNUITY_prev AMT_APPLICATION 0.82
AMT_APPLICATION AMT_ANNUITY_prev 0.82
AMT_ANNUITY_curr AMT_GOODS_PRICE_curr 0.75
AMT_GOODS_PRICE_curr AMT_ANNUITY_curr 0.75
AMT_ANNUITY_curr AMT_CREDIT_curr 0.74
AMT_CREDIT_curr AMT_ANNUITY_curr 0.74
AGE_YEARS DAYS_EMPLOYED 0.61
DAYS_EMPLOYED AGE_YEARS 0.61
AMT_ANNUITY_curr AMT_INCOME_TOTAL 0.42
AMT_INCOME_TOTAL AMT_ANNUITY_curr 0.42
AMT_GOODS_PRICE_curr AMT_INCOME_TOTAL 0.35
AMT_INCOME_TOTAL AMT_GOODS_PRICE_curr 0.35
DAYS_DECISION AMT_GOODS_PRICE_prev 0.34
AMT_INCOME_TOTAL AMT_CREDIT_curr 0.34
AMT_CREDIT_curr AMT_INCOME_TOTAL 0.34
AMT_GOODS_PRICE_prev DAYS_DECISION 0.34
DAYS_DECISION AMT_ANNUITY_prev 0.29
AMT_ANNUITY_prev DAYS_DECISION 0.29
DAYS_DECISION AMT_APPLICATION 0.27
AMT_CREDIT_prev DAYS_DECISION 0.27
DAYS_DECISION AMT_CREDIT_prev 0.27
AMT_APPLICATION DAYS_DECISION 0.27
AMT_INCOME_TOTAL AMT_ANNUITY_prev 0.26
AMT_ANNUITY_prev AMT_INCOME_TOTAL 0.26
AMT_ANNUITY_curr AMT_ANNUITY_prev 0.22
AMT_INCOME_TOTAL AMT_GOODS_PRICE_prev 0.22
AMT_GOODS_PRICE_prev AMT_INCOME_TOTAL 0.22
AMT_ANNUITY_prev AMT_ANNUITY_curr 0.22
AMT_GOODS_PRICE_prev AMT_GOODS_PRICE_curr 0.21
AMT_GOODS_PRICE_curr AMT_GOODS_PRICE_prev 0.21
AMT_APPLICATION AMT_GOODS_PRICE_curr 0.20
AMT_CREDIT_prev AMT_INCOME_TOTAL 0.20
AMT_CREDIT_curr AMT_GOODS_PRICE_prev 0.20
AMT_APPLICATION AMT_INCOME_TOTAL 0.20
AMT_GOODS_PRICE_curr AMT_APPLICATION 0.20
AMT_GOODS_PRICE_curr AMT_ANNUITY_prev 0.20
AMT_GOODS_PRICE_prev AMT_CREDIT_curr 0.20
AMT_INCOME_TOTAL AMT_APPLICATION 0.20
AMT_INCOME_TOTAL AMT_CREDIT_prev 0.20
AMT_ANNUITY_prev AMT_CREDIT_curr 0.20
AMT_CREDIT_curr AMT_ANNUITY_prev 0.20
AMT_ANNUITY_prev AMT_GOODS_PRICE_curr 0.20
AMT_APPLICATION AMT_CREDIT_curr 0.19
AMT_GOODS_PRICE_curr AMT_CREDIT_prev 0.19
AMT_CREDIT_prev AMT_GOODS_PRICE_curr 0.19
AMT_CREDIT_curr AMT_APPLICATION 0.19
AMT_CREDIT_curr AMT_CREDIT_prev 0.19
AMT_CREDIT_prev AMT_CREDIT_curr 0.19
AMT_ANNUITY_curr AMT_GOODS_PRICE_prev 0.17
AMT_GOODS_PRICE_prev AMT_ANNUITY_curr 0.17
AMT_CREDIT_prev AMT_ANNUITY_curr 0.16
AMT_APPLICATION AMT_ANNUITY_curr 0.16
AMT_ANNUITY_curr AMT_CREDIT_prev 0.16
AMT_ANNUITY_curr AMT_APPLICATION 0.16
AGE_YEARS AMT_GOODS_PRICE_prev 0.12
DAYS_EMPLOYED AMT_INCOME_TOTAL 0.12
AMT_GOODS_PRICE_prev AGE_YEARS 0.12
AMT_INCOME_TOTAL DAYS_EMPLOYED 0.12
AMT_APPLICATION AGE_YEARS 0.10
AMT_CREDIT_prev AGE_YEARS 0.10
AMT_GOODS_PRICE_curr DAYS_DECISION 0.10
AMT_CREDIT_curr AGE_YEARS 0.10
AGE_YEARS AMT_CREDIT_curr 0.10
AGE_YEARS AMT_GOODS_PRICE_curr 0.10
DAYS_DECISION AMT_GOODS_PRICE_curr 0.10
AGE_YEARS AMT_APPLICATION 0.10
AGE_YEARS AMT_CREDIT_prev 0.10
AMT_GOODS_PRICE_curr AGE_YEARS 0.10
AMT_ANNUITY_prev AGE_YEARS 0.09
DAYS_DECISION AMT_CREDIT_curr 0.09
AGE_YEARS AMT_ANNUITY_prev 0.09
DAYS_EMPLOYED AMT_ANNUITY_curr 0.09
AMT_CREDIT_curr DAYS_DECISION 0.09
AMT_ANNUITY_curr DAYS_EMPLOYED 0.09
DAYS_DECISION AMT_ANNUITY_curr 0.06
AMT_ANNUITY_curr DAYS_DECISION 0.06
DAYS_EMPLOYED DAYS_DECISION 0.03
DAYS_DECISION DAYS_EMPLOYED 0.03
DAYS_EMPLOYED AMT_ANNUITY_prev 0.02
AGE_YEARS AMT_INCOME_TOTAL 0.02
DAYS_EMPLOYED AMT_GOODS_PRICE_curr 0.02
DAYS_EMPLOYED AMT_CREDIT_curr 0.02
AMT_ANNUITY_prev DAYS_EMPLOYED 0.02
AMT_INCOME_TOTAL AGE_YEARS 0.02
AMT_GOODS_PRICE_curr DAYS_EMPLOYED 0.02
AMT_CREDIT_curr DAYS_EMPLOYED 0.02
AMT_INCOME_TOTAL DAYS_DECISION 0.01
SK_ID_PREV AMT_GOODS_PRICE_prev 0.01
AMT_GOODS_PRICE_prev SK_ID_PREV 0.01
AGE_YEARS SK_ID_CURR 0.01
DAYS_DECISION AMT_INCOME_TOTAL 0.01
SK_ID_CURR AGE_YEARS 0.01
AMT_INCOME_TOTAL SK_ID_PREV 0.00
AMT_INCOME_TOTAL SK_ID_CURR 0.00
AMT_ANNUITY_curr SK_ID_PREV 0.00
AGE_YEARS SK_ID_PREV 0.00
DAYS_DECISION SK_ID_CURR 0.00
SK_ID_PREV AMT_GOODS_PRICE_curr 0.00
SK_ID_PREV AMT_CREDIT_prev 0.00
SK_ID_PREV AMT_APPLICATION 0.00
SK_ID_PREV AMT_ANNUITY_prev 0.00
SK_ID_PREV AGE_YEARS 0.00
SK_ID_PREV SK_ID_CURR 0.00
SK_ID_PREV DAYS_EMPLOYED 0.00
SK_ID_PREV AMT_CREDIT_curr 0.00
DAYS_DECISION AGE_YEARS 0.00
SK_ID_PREV AMT_INCOME_TOTAL 0.00
SK_ID_PREV AMT_ANNUITY_curr 0.00
DAYS_DECISION SK_ID_PREV 0.00
AMT_ANNUITY_curr SK_ID_CURR 0.00
AMT_ANNUITY_curr AGE_YEARS 0.00
AMT_GOODS_PRICE_prev SK_ID_CURR 0.00
AMT_CREDIT_curr SK_ID_CURR 0.00
AMT_CREDIT_prev SK_ID_PREV 0.00
AMT_GOODS_PRICE_prev DAYS_EMPLOYED 0.00
AMT_CREDIT_curr SK_ID_PREV 0.00
AGE_YEARS AMT_ANNUITY_curr 0.00
AMT_ANNUITY_prev SK_ID_CURR 0.00
SK_ID_CURR SK_ID_PREV 0.00
SK_ID_CURR DAYS_DECISION 0.00
SK_ID_CURR AMT_GOODS_PRICE_prev 0.00
SK_ID_CURR AMT_CREDIT_prev 0.00
SK_ID_CURR AMT_APPLICATION 0.00
AMT_ANNUITY_prev SK_ID_PREV 0.00
SK_ID_CURR AMT_ANNUITY_prev 0.00
SK_ID_CURR DAYS_EMPLOYED 0.00
SK_ID_CURR AMT_GOODS_PRICE_curr 0.00
AMT_APPLICATION DAYS_EMPLOYED 0.00
AMT_APPLICATION SK_ID_CURR 0.00
SK_ID_CURR AMT_CREDIT_curr 0.00
SK_ID_CURR AMT_INCOME_TOTAL 0.00
SK_ID_CURR AMT_ANNUITY_curr 0.00
DAYS_EMPLOYED SK_ID_PREV 0.00
AMT_APPLICATION SK_ID_PREV 0.00
DAYS_EMPLOYED AMT_GOODS_PRICE_prev 0.00
DAYS_EMPLOYED AMT_CREDIT_prev 0.00
DAYS_EMPLOYED AMT_APPLICATION 0.00
AMT_CREDIT_prev DAYS_EMPLOYED 0.00
AMT_CREDIT_prev SK_ID_CURR 0.00
DAYS_EMPLOYED SK_ID_CURR 0.00
AMT_GOODS_PRICE_curr SK_ID_PREV 0.00
AGE_YEARS DAYS_DECISION 0.00
AMT_GOODS_PRICE_curr SK_ID_CURR 0.00
SK_ID_PREV DAYS_DECISION 0.00

Correlation for 'Unused' Loans

In [122]:
# Correlation for Unused Loans
# Absolute values are reported 
pd.set_option('precision', 2)
cor_0 = correlation(merged_u)
cor_0.style.background_gradient(cmap='GnBu').hide_index()
Out[122]:
VAR1 VAR2 CORR
AMT_GOODS_PRICE_prev AMT_CREDIT_prev 1.00
AMT_GOODS_PRICE_prev AMT_APPLICATION 1.00
AMT_CREDIT_prev AMT_APPLICATION 1.00
AMT_CREDIT_prev AMT_GOODS_PRICE_prev 1.00
AMT_APPLICATION AMT_CREDIT_prev 1.00
AMT_APPLICATION AMT_GOODS_PRICE_prev 1.00
AMT_CREDIT_curr AMT_GOODS_PRICE_curr 0.99
AMT_GOODS_PRICE_curr AMT_CREDIT_curr 0.99
AMT_ANNUITY_prev AMT_CREDIT_prev 0.94
AMT_CREDIT_prev AMT_ANNUITY_prev 0.94
AMT_ANNUITY_prev AMT_APPLICATION 0.94
AMT_GOODS_PRICE_prev AMT_ANNUITY_prev 0.94
AMT_APPLICATION AMT_ANNUITY_prev 0.94
AMT_ANNUITY_prev AMT_GOODS_PRICE_prev 0.94
AMT_GOODS_PRICE_curr AMT_ANNUITY_curr 0.76
AMT_ANNUITY_curr AMT_GOODS_PRICE_curr 0.76
AMT_ANNUITY_curr AMT_CREDIT_curr 0.76
AMT_CREDIT_curr AMT_ANNUITY_curr 0.76
DAYS_EMPLOYED AGE_YEARS 0.55
AGE_YEARS DAYS_EMPLOYED 0.55
AMT_ANNUITY_prev AMT_INCOME_TOTAL 0.23
AMT_INCOME_TOTAL AMT_ANNUITY_prev 0.23
AMT_ANNUITY_prev AMT_ANNUITY_curr 0.17
AMT_ANNUITY_curr AMT_ANNUITY_prev 0.17
AGE_YEARS DAYS_DECISION 0.15
DAYS_DECISION AGE_YEARS 0.15
AMT_ANNUITY_prev DAYS_DECISION 0.11
AMT_CREDIT_prev AMT_ANNUITY_curr 0.11
AMT_APPLICATION AMT_ANNUITY_curr 0.11
AGE_YEARS AMT_CREDIT_curr 0.11
AMT_GOODS_PRICE_prev AMT_ANNUITY_curr 0.11
AMT_CREDIT_curr AMT_ANNUITY_prev 0.11
AMT_CREDIT_curr AGE_YEARS 0.11
AMT_ANNUITY_prev AMT_CREDIT_curr 0.11
DAYS_DECISION AMT_ANNUITY_prev 0.11
AMT_ANNUITY_curr AMT_GOODS_PRICE_prev 0.11
AMT_ANNUITY_curr AMT_CREDIT_prev 0.11
AMT_ANNUITY_curr AMT_APPLICATION 0.11
AMT_GOODS_PRICE_curr AMT_ANNUITY_prev 0.10
AMT_ANNUITY_prev AGE_YEARS 0.10
AMT_ANNUITY_prev AMT_GOODS_PRICE_curr 0.10
AGE_YEARS AMT_ANNUITY_prev 0.10
AGE_YEARS AMT_GOODS_PRICE_curr 0.10
AMT_GOODS_PRICE_curr AGE_YEARS 0.10
DAYS_DECISION DAYS_EMPLOYED 0.09
DAYS_EMPLOYED DAYS_DECISION 0.09
AMT_ANNUITY_curr DAYS_EMPLOYED 0.08
DAYS_EMPLOYED AMT_ANNUITY_curr 0.08
AMT_GOODS_PRICE_curr AMT_APPLICATION 0.07
AMT_CREDIT_curr AMT_APPLICATION 0.07
AMT_GOODS_PRICE_prev AMT_GOODS_PRICE_curr 0.07
AMT_GOODS_PRICE_prev AMT_CREDIT_curr 0.07
AMT_GOODS_PRICE_curr AMT_CREDIT_prev 0.07
AMT_CREDIT_prev AMT_GOODS_PRICE_curr 0.07
AMT_CREDIT_curr AMT_CREDIT_prev 0.07
AMT_CREDIT_prev AMT_CREDIT_curr 0.07
AMT_APPLICATION AMT_CREDIT_curr 0.07
AMT_APPLICATION AMT_GOODS_PRICE_curr 0.07
AMT_GOODS_PRICE_curr AMT_GOODS_PRICE_prev 0.07
AMT_CREDIT_curr AMT_GOODS_PRICE_prev 0.07
AMT_CREDIT_prev DAYS_DECISION 0.05
AMT_APPLICATION DAYS_DECISION 0.05
AMT_GOODS_PRICE_prev DAYS_DECISION 0.05
DAYS_DECISION AMT_APPLICATION 0.05
DAYS_DECISION AMT_CREDIT_prev 0.05
DAYS_DECISION AMT_GOODS_PRICE_prev 0.05
AMT_ANNUITY_curr AMT_INCOME_TOTAL 0.05
AMT_INCOME_TOTAL AMT_ANNUITY_curr 0.05
AMT_CREDIT_curr DAYS_EMPLOYED 0.04
AMT_GOODS_PRICE_curr DAYS_DECISION 0.04
AMT_CREDIT_curr AMT_INCOME_TOTAL 0.04
DAYS_DECISION AMT_ANNUITY_curr 0.04
DAYS_EMPLOYED AMT_GOODS_PRICE_curr 0.04
DAYS_DECISION AMT_GOODS_PRICE_curr 0.04
AMT_INCOME_TOTAL AMT_CREDIT_curr 0.04
AMT_INCOME_TOTAL AMT_GOODS_PRICE_curr 0.04
AMT_ANNUITY_curr DAYS_DECISION 0.04
DAYS_EMPLOYED AMT_CREDIT_curr 0.04
AMT_GOODS_PRICE_curr DAYS_EMPLOYED 0.04
AMT_GOODS_PRICE_curr AMT_INCOME_TOTAL 0.04
SK_ID_CURR AMT_ANNUITY_prev 0.03
DAYS_DECISION AMT_CREDIT_curr 0.03
AMT_ANNUITY_prev DAYS_EMPLOYED 0.03
AMT_ANNUITY_prev SK_ID_CURR 0.03
AMT_CREDIT_curr DAYS_DECISION 0.03
DAYS_EMPLOYED AMT_ANNUITY_prev 0.03
AMT_INCOME_TOTAL AMT_APPLICATION 0.02
AMT_APPLICATION AMT_INCOME_TOTAL 0.02
DAYS_EMPLOYED SK_ID_CURR 0.02
AMT_INCOME_TOTAL AMT_CREDIT_prev 0.02
AMT_GOODS_PRICE_prev AMT_INCOME_TOTAL 0.02
AMT_CREDIT_prev AMT_INCOME_TOTAL 0.02
AMT_INCOME_TOTAL AMT_GOODS_PRICE_prev 0.02
SK_ID_PREV DAYS_DECISION 0.02
SK_ID_CURR DAYS_EMPLOYED 0.02
AMT_ANNUITY_curr AGE_YEARS 0.02
DAYS_DECISION SK_ID_PREV 0.02
AGE_YEARS AMT_ANNUITY_curr 0.02
SK_ID_PREV AGE_YEARS 0.01
AMT_INCOME_TOTAL SK_ID_PREV 0.01
AMT_INCOME_TOTAL DAYS_DECISION 0.01
AMT_CREDIT_prev SK_ID_PREV 0.01
SK_ID_PREV AMT_GOODS_PRICE_prev 0.01
SK_ID_PREV AMT_CREDIT_prev 0.01
SK_ID_PREV AMT_APPLICATION 0.01
SK_ID_PREV AMT_ANNUITY_prev 0.01
DAYS_DECISION AMT_INCOME_TOTAL 0.01
DAYS_DECISION SK_ID_CURR 0.01
SK_ID_PREV DAYS_EMPLOYED 0.01
SK_ID_PREV AMT_INCOME_TOTAL 0.01
AMT_GOODS_PRICE_prev SK_ID_CURR 0.01
AMT_GOODS_PRICE_prev AGE_YEARS 0.01
AMT_INCOME_TOTAL AGE_YEARS 0.01
AMT_INCOME_TOTAL SK_ID_CURR 0.01
AMT_INCOME_TOTAL DAYS_EMPLOYED 0.01
AMT_GOODS_PRICE_prev SK_ID_PREV 0.01
SK_ID_CURR AMT_APPLICATION 0.01
AMT_CREDIT_prev AGE_YEARS 0.01
AMT_ANNUITY_prev SK_ID_PREV 0.01
SK_ID_CURR AMT_GOODS_PRICE_prev 0.01
SK_ID_CURR DAYS_DECISION 0.01
AGE_YEARS AMT_INCOME_TOTAL 0.01
AGE_YEARS AMT_APPLICATION 0.01
AGE_YEARS AMT_CREDIT_prev 0.01
AGE_YEARS AMT_GOODS_PRICE_prev 0.01
AGE_YEARS SK_ID_PREV 0.01
AMT_CREDIT_prev SK_ID_CURR 0.01
DAYS_EMPLOYED SK_ID_PREV 0.01
DAYS_EMPLOYED AMT_CREDIT_prev 0.01
SK_ID_CURR AMT_INCOME_TOTAL 0.01
DAYS_EMPLOYED AMT_INCOME_TOTAL 0.01
AMT_APPLICATION SK_ID_CURR 0.01
AMT_CREDIT_prev DAYS_EMPLOYED 0.01
SK_ID_CURR AMT_CREDIT_prev 0.01
AMT_APPLICATION AGE_YEARS 0.01
AMT_APPLICATION SK_ID_PREV 0.01
SK_ID_CURR AMT_ANNUITY_curr 0.00
AMT_APPLICATION DAYS_EMPLOYED 0.00
AMT_GOODS_PRICE_curr SK_ID_PREV 0.00
DAYS_EMPLOYED AMT_GOODS_PRICE_prev 0.00
SK_ID_CURR SK_ID_PREV 0.00
AMT_CREDIT_curr SK_ID_PREV 0.00
SK_ID_CURR AGE_YEARS 0.00
SK_ID_PREV SK_ID_CURR 0.00
AGE_YEARS SK_ID_CURR 0.00
SK_ID_PREV AMT_GOODS_PRICE_curr 0.00
SK_ID_PREV AMT_CREDIT_curr 0.00
SK_ID_CURR AMT_GOODS_PRICE_curr 0.00
SK_ID_PREV AMT_ANNUITY_curr 0.00
DAYS_EMPLOYED AMT_APPLICATION 0.00
AMT_ANNUITY_curr SK_ID_CURR 0.00
AMT_GOODS_PRICE_curr SK_ID_CURR 0.00
AMT_GOODS_PRICE_prev DAYS_EMPLOYED 0.00
AMT_ANNUITY_curr SK_ID_PREV 0.00
SK_ID_CURR AMT_CREDIT_curr 0.00
AMT_CREDIT_curr SK_ID_CURR 0.00
In [ ]: