In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Read the Excel workbook (path is relative to the notebook's working
# directory) into the DataFrame that every later cell works from.
df = pd.read_excel(
    io='default_of_credit_card_clients__courseware_version_1_21_19.xls',
)
In [3]:
# Column-name groups used by the cells below: the six BILL_AMT* columns and
# the six PAY_AMT* columns, numbered 1 through 6.
bill_features = [f'BILL_AMT{idx}' for idx in range(1, 7)]

pay_features = [f'PAY_AMT{idx}' for idx in range(1, 7)]
In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# BILL_AMT* columns; left as the last expression so the table is displayed.
df.loc[:, bill_features].describe()
Out[4]:
BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6
count 30000.000000 30000.000000 3.000000e+04 30000.000000 30000.000000 30000.000000
mean 50646.744233 48624.349167 4.649736e+04 42791.362167 39884.398167 38480.350933
std 73376.695080 70893.963498 6.910251e+04 64090.316188 60606.644833 59406.836932
min -165580.000000 -69777.000000 -1.572640e+05 -170000.000000 -81334.000000 -339603.000000
25% 3234.000000 2682.000000 2.403000e+03 2034.000000 1534.000000 1080.000000
50% 21644.500000 20597.000000 1.975250e+04 18759.500000 17835.500000 16643.000000
75% 66148.500000 62999.750000 5.952675e+04 53572.250000 49804.000000 48863.500000
max 964511.000000 983931.000000 1.664089e+06 891586.000000 927171.000000 961664.000000
In [5]:
# One histogram per BILL_AMT* column, 20 bins each, drawn in a single
# 10x6-inch figure.
df.hist(column=bill_features, bins=20, figsize=(10, 6))
plt.show()
No description has been provided for this image
In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# PAY_AMT* columns; left as the last expression so the table is displayed.
df.loc[:, pay_features].describe()
Out[6]:
PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
count 30000.000000 3.000000e+04 30000.000000 30000.000000 30000.000000 30000.000000
mean 5613.321500 5.855410e+03 5174.387967 4776.089733 4754.749200 5164.223267
std 16539.094312 2.299256e+04 17565.538305 15532.893047 15239.070708 17712.664703
min 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000
25% 836.000000 7.217500e+02 371.000000 223.000000 170.750000 9.000000
50% 2084.500000 2.000000e+03 1776.000000 1500.000000 1500.000000 1500.000000
75% 5000.000000 5.000000e+03 4500.000000 4000.000000 4000.000000 4000.000000
max 873552.000000 1.684259e+06 896040.000000 621000.000000 426529.000000 528666.000000
In [7]:
# One histogram per PAY_AMT* column, 20 bins each, in a single 10x6-inch
# figure; x tick labels are rotated 45 degrees.
df.hist(column=pay_features, bins=20, figsize=(10, 6), xrot=45)
plt.show()
No description has been provided for this image
In [8]:
# Boolean frame marking every PAY_AMT* entry that is exactly zero.
mask_zero = df.loc[:, pay_features].eq(0)

# Summing booleans column-wise counts the zero entries in each column.
mask_zero.sum()
Out[8]:
PAY_AMT1    5504
PAY_AMT2    5663
PAY_AMT3    6223
PAY_AMT4    6660
PAY_AMT5    6955
PAY_AMT6    7416
dtype: int64
In [9]:
# Zero payments have no finite base-10 logarithm, so blank them out first:
# .where keeps entries that satisfy the condition and puts NaN elsewhere.
# Passing the condition as a callable selects the PAY_AMT* columns once,
# instead of selecting them twice and chaining a boolean index.
df_non_zero = df[pay_features].where(lambda amounts: amounts != 0)

# np.log10 is a NumPy ufunc and operates element-wise on the whole frame,
# returning a DataFrame with the same labels -- no per-column .apply needed.
# NaN entries stay NaN.
df_log = np.log10(df_non_zero)

# Histograms of log10(payment amount) for the non-zero payments, one panel
# per PAY_AMT* column.
df_log.hist(bins=20, figsize=(10, 6))
plt.show()
No description has been provided for this image