In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Read the courseware spreadsheet into the working DataFrame used by every cell below.
df = pd.read_excel(
    io='default_of_credit_card_clients__courseware_version_1_21_19.xls'
)
In [3]:
# Column-name lists for the two groups of amount columns; both use suffixes 1 through 6.
bill_features = [f'BILL_AMT{month}' for month in range(1, 7)]
pay_features = [f'PAY_AMT{month}' for month in range(1, 7)]
In [4]:
# Summary statistics for the columns listed in bill_features.
df.loc[:, bill_features].describe()
Out[4]:
| | BILL_AMT1 | BILL_AMT2 | BILL_AMT3 | BILL_AMT4 | BILL_AMT5 | BILL_AMT6 |
|---|---|---|---|---|---|---|
| count | 30000.000000 | 30000.000000 | 3.000000e+04 | 30000.000000 | 30000.000000 | 30000.000000 |
| mean | 50646.744233 | 48624.349167 | 4.649736e+04 | 42791.362167 | 39884.398167 | 38480.350933 |
| std | 73376.695080 | 70893.963498 | 6.910251e+04 | 64090.316188 | 60606.644833 | 59406.836932 |
| min | -165580.000000 | -69777.000000 | -1.572640e+05 | -170000.000000 | -81334.000000 | -339603.000000 |
| 25% | 3234.000000 | 2682.000000 | 2.403000e+03 | 2034.000000 | 1534.000000 | 1080.000000 |
| 50% | 21644.500000 | 20597.000000 | 1.975250e+04 | 18759.500000 | 17835.500000 | 16643.000000 |
| 75% | 66148.500000 | 62999.750000 | 5.952675e+04 | 53572.250000 | 49804.000000 | 48863.500000 |
| max | 964511.000000 | 983931.000000 | 1.664089e+06 | 891586.000000 | 927171.000000 | 961664.000000 |
In [5]:
# One histogram panel per bill_features column, 20 bins each, on a 10x6 figure.
df.hist(column=bill_features, bins=20, figsize=(10, 6))
plt.show()
In [6]:
# Summary statistics for the columns listed in pay_features.
df.loc[:, pay_features].describe()
Out[6]:
| | PAY_AMT1 | PAY_AMT2 | PAY_AMT3 | PAY_AMT4 | PAY_AMT5 | PAY_AMT6 |
|---|---|---|---|---|---|---|
| count | 30000.000000 | 3.000000e+04 | 30000.000000 | 30000.000000 | 30000.000000 | 30000.000000 |
| mean | 5613.321500 | 5.855410e+03 | 5174.387967 | 4776.089733 | 4754.749200 | 5164.223267 |
| std | 16539.094312 | 2.299256e+04 | 17565.538305 | 15532.893047 | 15239.070708 | 17712.664703 |
| min | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 836.000000 | 7.217500e+02 | 371.000000 | 223.000000 | 170.750000 | 9.000000 |
| 50% | 2084.500000 | 2.000000e+03 | 1776.000000 | 1500.000000 | 1500.000000 | 1500.000000 |
| 75% | 5000.000000 | 5.000000e+03 | 4500.000000 | 4000.000000 | 4000.000000 | 4000.000000 |
| max | 873552.000000 | 1.684259e+06 | 896040.000000 | 621000.000000 | 426529.000000 | 528666.000000 |
In [7]:
# One histogram panel per pay_features column; x tick labels are rotated 45 degrees.
df.hist(column=pay_features, bins=20, figsize=(10, 6), xrot=45)
plt.show()
In [8]:
# Boolean frame marking entries equal to 0 in the pay_features columns;
# summing it counts the zero entries in each column.
mask_zero = df[pay_features].eq(0)
mask_zero.sum()
Out[8]:
PAY_AMT1 5504 PAY_AMT2 5663 PAY_AMT3 6223 PAY_AMT4 6660 PAY_AMT5 6955 PAY_AMT6 7416 dtype: int64
In [9]:
# Keep only non-zero entries of the pay_features columns (zeros become NaN),
# then take log10 column by column and plot a histogram per column.
df_non_zero = df[pay_features].where(df[pay_features].ne(0))
df_log = df_non_zero.apply(np.log10)
df_log.hist(bins=20, figsize=(10, 6))
plt.show()