Importing Packages

In [1]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style='darkgrid')
# import warnings; warnings.filterwarnings('ignore')

# Auto reload any script without the need to restart jupyter notebook server
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Change the format of precisions of decimals
pd.set_option('display.float_format', lambda x: '{:f}'.format(x))

SEED = 42
PATH = '../data/data-science-for-good-kiva-crowdfunding/'
In [2]:
color = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71", "#feb308", "e78ac3"]

%matplotlib inline
%config InlineBackend.figure_format='retina'
# Change defualt size of matplotlibs plots
plt.rcParams['figure.figsize'] = (8.0, 5.0)

Functions

In [3]:
## Data Description
def describe(df):
    print('======================================')
    print('No. of Rows.:{0}\nNo. of Columns:{1}\n'.format(df.shape[0], df.shape[1]))
    print('======================================')
    data_type = DataFrame(df.dtypes, columns=['Data Type'])
    null_count =  DataFrame(df.isnull().sum(), columns=['Null Count'])
    not_null_count = DataFrame(df.notnull().sum(), columns=['Not Null Count'])
    unique_count = DataFrame(df.nunique(), columns=['Unique Count'])
    categorical = loans.describe(include='O').T
    count = loans.isnull().sum().sort_values(ascending=False)
    percent = loans.isnull().sum().sort_values(ascending=False)/loans.shape[0]*100
    missing = pd.DataFrame({'Missing Count':count , 'Missing count Percent':percent})#, columns=['Missing Count', 'Percent count'])
    joined = pd.concat([data_type, null_count, not_null_count, unique_count, categorical, missing], axis=1)
    display(joined)
    display(df.describe().T)
    return None

## Adding more time columns
def add_datepart(df, date_column):
    date_series = df[date_column]
    df[date_column] = pd.to_datetime(date_series, infer_datetime_format=True)
    for n in ('Year', 'Month', 'Week', 'Day', 'Weekday_Name', 'Dayofweek', 'Dayofyear'):
        df['Date'+'_'+n] = getattr(date_series.dt, n.lower())
        
## Proportion Plot
def proportion_plot(column, title='', figsize=(15, 5), top=20):
    value_count = (100*column.value_counts()/column.shape[0])[:top]
    plt.subplots(figsize=figsize)
    sns.set_style("whitegrid")
    g = sns.barplot(x=value_count.index, y=value_count.values)
    plt.title(title, size=20)
    plt.xlabel(column.name, size=15); plt.ylabel('percentage', size=15)
    plt.xticks(size=12, rotation=90); plt.yticks(size=15)
In [4]:
loans = pd.read_csv(f'{PATH}kiva_loans.csv')
In [5]:
loans.sample(n=5).T
Out[5]:
194707 488204 333959 565110 359067
id 842569 1148467 980969 1226306 1005941
funded_amount 950.000000 750.000000 500.000000 1525.000000 325.000000
loan_amount 950.000000 750.000000 750.000000 1525.000000 325.000000
activity Personal Housing Expenses Agriculture Personal Medical Expenses Cattle Food Production/Sales
sector Housing Agriculture Health Agriculture Food
use To renovate her house to purchase seeds, manure, and more. to pay for dental treatment to buy a cow and other items. to purchase ingredients for snack vending like…
country_code TJ PE MX PE PH
country Tajikistan Peru Mexico Peru Philippines
region Panjakent Paucartambo - Cusco NaN anta - tambo real Plaridel-Looc Proper, Plaridel, Misamis Occide…
currency TJS PEN MXN PEN PHP
partner_id 100.000000 119.000000 357.000000 119.000000 126.000000
posted_time 2015-02-18 04:51:49+00:00 2016-09-09 17:11:05+00:00 2015-11-18 18:32:07+00:00 2017-01-24 16:01:50+00:00 2016-01-08 23:24:52+00:00
disbursed_time 2015-02-10 08:00:00+00:00 2016-08-12 07:00:00+00:00 2015-10-19 07:00:00+00:00 2017-01-10 08:00:00+00:00 2015-12-18 08:00:00+00:00
funded_time 2015-03-23 13:57:39+00:00 2016-09-11 02:54:25+00:00 NaN 2017-02-18 19:52:15+00:00 2016-01-09 21:53:07+00:00
term_in_months 14.000000 6.000000 20.000000 8.000000 7.000000
lender_count 36 9 17 35 1
tags #Parent, #Woman Owned Biz, user_favorite, user… NaN #Health and Sanitation #Woman Owned Biz, #Animals, user_favorite NaN
borrower_genders female female male female female
repayment_interval monthly irregular monthly monthly monthly
date 2015-02-18 2016-09-09 2015-11-18 2017-01-24 2016-01-08
In [6]:
df_describe = describe(loans)
======================================
No. of Rows.:671205
No. of Columns:20

======================================
Data Type Null Count Not Null Count Unique Count count unique top freq Missing Count Missing count Percent
activity object 0 671205 163 671205 163 Farming 72955 0 0.000000
borrower_genders object 4221 666984 11298 666984 11298 female 426502 4221 0.628869
country object 0 671205 87 671205 87 Philippines 160441 0 0.000000
country_code object 8 671197 86 671197 86 PH 160441 8 0.001192
currency object 0 671205 67 671205 67 PHP 160440 0 0.000000
date object 0 671205 1298 671205 1298 2017-03-20 1308 0 0.000000
disbursed_time object 2396 668809 5719 668809 5719 2017-02-01 08:00:00+00:00 2800 2396 0.356970
funded_amount float64 0 671205 610 NaN NaN NaN NaN 0 0.000000
funded_time object 48331 622874 498007 622874 498007 2016-09-21 13:03:24+00:00 33 48331 7.200632
id int64 0 671205 671205 NaN NaN NaN NaN 0 0.000000
lender_count int64 0 671205 503 NaN NaN NaN NaN 0 0.000000
loan_amount float64 0 671205 479 NaN NaN NaN NaN 0 0.000000
partner_id float64 13507 657698 366 NaN NaN NaN NaN 13507 2.012351
posted_time object 0 671205 667399 671205 667399 2017-05-15 00:00:00+00:00 25 0 0.000000
region object 56800 614405 12695 614405 12695 Kaduna 10000 56800 8.462392
repayment_interval object 0 671205 4 671205 4 monthly 342717 0 0.000000
sector object 0 671205 15 671205 15 Agriculture 180302 0 0.000000
tags object 171416 499789 86719 499789 86719 user_favorite 27088 171416 25.538546
term_in_months float64 0 671205 148 NaN NaN NaN NaN 0 0.000000
use object 4232 666973 424912 666973 424912 to buy a water filter to provide safe drinking… 5217 4232 0.630508
count mean std min 25% 50% 75% max
id 671205.000000 993248.593734 196611.275423 653047.000000 823072.000000 992780.000000 1163653.000000 1340339.000000
funded_amount 671205.000000 785.995061 1130.398941 0.000000 250.000000 450.000000 900.000000 100000.000000
loan_amount 671205.000000 842.397107 1198.660073 25.000000 275.000000 500.000000 1000.000000 100000.000000
partner_id 657698.000000 178.199616 94.247581 9.000000 126.000000 145.000000 204.000000 536.000000
term_in_months 671205.000000 13.739022 8.598919 1.000000 8.000000 13.000000 14.000000 158.000000
lender_count 671205.000000 20.590922 28.459551 0.000000 7.000000 13.000000 24.000000 2986.000000

a) Loan Amount Exploration

Loan Amount Distribution

Higly skewed because of few outliers, therefore filtering loan_amount below $5000

In [7]:
sns.set_style("darkgrid")
sns.kdeplot(loans.loan_amount[loans.loan_amount], color='r', shade=True);

Funded Amount Distribution

Highly skewed because of few outliers, therefore filtering funded_amount below $5000

In [8]:
sns.kdeplot(loans.funded_amount[loans.funded_amount], color='b', shade=True);

Instance when funded_amount is equal to loan_amount

In [9]:
pd.Series()
Out[9]:
Series([], dtype: float64)
In [10]:
appr_equal_to_apply = 100*( (loans.funded_amount == loans.loan_amount).sum())/loans.shape[0]
appr_less_than_apply = 100*(loans.funded_amount < loans.loan_amount).sum()/loans.shape[0]
appr_more_than_apply = 100*(loans.funded_amount > loans.loan_amount).sum()/loans.shape[0]

temp = pd.Series([appr_equal_to_apply, appr_less_than_apply, appr_more_than_apply], 
                 index=['appr_equal_to_apply', 'appr_less_than_apply', 'appr_more_than_apply'])

sns.barplot(x=temp.values, 
            y=temp.index)

plt.title('', size=20)
plt.ylabel('percentage', size=15)
plt.xticks(size=15, rotation=-15)
plt.yticks(size=15);
for index, value in enumerate(temp.values):
    plt.text(0, index, np.round(value, 4), fontdict={'size':18})

There are 2 instances when Funded Amount is greater than Loan Amount

In [11]:
loans[(loans.loan_amount - loans.funded_amount) < 0]
Out[11]:
id funded_amount loan_amount activity sector use country_code country region currency partner_id posted_time disbursed_time funded_time term_in_months lender_count tags borrower_genders repayment_interval date
277188 924149 425.000000 400.000000 General Store Retail to buy beverages, rice, laundry detergent, sug… MZ Mozambique Boane, Maputo MZN 23.000000 2015-07-28 07:14:04+00:00 2015-07-22 07:00:00+00:00 NaN 17.000000 11 #Parent, user_favorite male monthly 2015-07-28
338159 985199 3400.000000 3000.000000 Farm Supplies Agriculture to pay for wires for the grape orchard, cover … AM Armenia Hoktember village, Armavir region USD 169.000000 2015-11-25 10:53:00+00:00 2015-10-29 07:00:00+00:00 NaN 38.000000 84 #Elderly, #Job Creator, #Trees, #Vegan, user_f… male monthly 2015-11-25

Insights

  • Philippines has highest number of loan application proportion and approved loans proportion
  • US even though has not so many loan application, but has high number of applications which didn’t get APPROVED
  • Kenya has high number of loans application and high number of loan rejection count

Let’s explore US and Kenya to see if there is any amount which gets rejected

United States - Not approved Loans

In [12]:
sns.set_style("darkgrid")

country = 'United States'

plt.figure(figsize=(16, 5))
plt.subplot(1,2, 1)
loans[(loans.funded_amount != 0) & (loans.country == country)]['loan_amount'].plot.hist(bins=20, 
                                                                                        title='USA\nApproved', 
                                                                                        color=color[5])
plt.subplot(1,2,2)
loans[(loans.funded_amount == 0) & (loans.country == country)]['loan_amount'].plot.hist(bins=20, 
                                                                                        title='USA\nNot approved', 
                                                                                        color=color[3]);

Interestingly lot of loan application of amount between USD 500 - 1000 gets rejected.

From around 300 applications 180 got rejected.

Kenya Not Approved loans

In [13]:
country = 'Kenya'

plt.figure(figsize=(16, 5))
plt.subplot(1,2, 1)
loans[(loans.funded_amount != 0) & (loans.country == country)]['loan_amount'].plot.hist(bins=20, 
                                                                                        title='Kenya\nApproved', 
                                                                                        color=color[5])
plt.subplot(1,2,2)
loans[(loans.funded_amount == 0) & (loans.country == country)]['loan_amount'].plot.hist(bins=20, 
                                                                                        title='Kenya\nNot approved', 
                                                                                        color=color[3]);

b) Loan Application Exploration

In [14]:
proportion_plot(loans.country, 
                title='Proportion of Applications by countries', 
                figsize=(15, 5), top=30)


proportion_plot(loans[loans.funded_amount != 0].country, 
                title='Proportion of loans APPROVED, by countries', 
                figsize=(15, 5), top=30)

proportion_plot(loans[loans.funded_amount == 0].country, 
                title='Proportion of loan NOT APPROVED, by countries', 
                figsize=(15, 5), top=30)

c) Lenders Exploration

In [15]:
sns.set_style("darkgrid")
plt.figure(figsize=(18, 5))
lender_count =loans[loans.lender_count <= 50].lender_count.value_counts()

sns.barplot(x=lender_count.index, y=lender_count.values)
plt.xlabel('Lenders Count');

Country-wise mean lenders count

In [16]:
loans.groupby(['country']).agg({'lender_count':'mean'}).sort_values(by='lender_count',ascending=False).plot.bar(figsize=(16, 5))
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x28939708c50>

Terms in Month

In [17]:
proportion_plot(loans.term_in_months, top=40)

25% loans have 14 months of repayment term and 21% have 8 months of repayment term

Funded Amount Vs Terms in Month

In [18]:
plt.figure(figsize=(16, 8))
sns.regplot(x="term_in_months", y="funded_amount", data=loans.sample(frac=.1, random_state=SEED), 
            color=color[0],scatter_kws={'alpha':0.3});
In [19]:
import plotly.plotly as py #For World Map
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
In [20]:
temp=loans['country'].value_counts().reset_index()
In [21]:
temp.head(2)
Out[21]:
index country
0 Philippines 160441
1 Kenya 75825
In [22]:
temp.columns = ['country', 'value']
In [23]:
data = dict(type = 'choropleth', 
           locations = temp['country'],
           locationmode = 'country names',
           z = temp['value'], 
           text = temp['country'],
           colorbar = {'title':'Happiness'})
layout = dict(title = 'Loan Application Count', 
             geo = dict(showframe = False, 
                       projection = {'type': 'Mercator'}))
choromap3 = go.Figure(data = [data], layout=layout)
iplot(choromap3)

Word Cloud of Tags

In [24]:
# from wordcloud import WordCloud, STOPWORDS
# stopwords = set(STOPWORDS)

# wordcloud = WordCloud(
#                           background_color='white',
#                           stopwords=stopwords,
#                           max_words=200,
#                           max_font_size=40, 
#                           random_state=42
#                          ).generate(str(loans['tags']))
In [26]:
proportion_plot(loans.currency, title='Currency Proportion', top=40)
In [27]:
proportion_plot(loans.activity, title='Proportion of loan count by activity', figsize=(15, 5), top=40)
In [28]:
proportion_plot(loans.sector, figsize=(15, 5), title='Proportion of loan count by sector')
In [29]:
# sector_repayment = ['sector', 'repayment_interval']
# cm = sns.light_palette("red", as_cmap=True)
# pd.crosstab(loans[sector_repayment[0]], loans[sector_repayment[1]]).style.background_gradient(cmap = cm)
In [30]:
repayment = loans.repayment_interval.value_counts()
sns.barplot(x = repayment.values, y = repayment.index);
for i, v in enumerate(repayment.values):
    plt.text(0.8, i, v, color='k',fontsize=19)
plt.xlabel('Repayment Interval')
plt.ylabel('Frequency')
plt.title('Repayment Interval Proportion');

Gender Exploration

In [31]:
countries_funded_amount = loans.groupby('country').mean()['funded_amount'].sort_values(ascending = False)
print("Top Countries with funded_amount(Dollar value of loan funded on Kiva.org)(Mean values)\n",countries_funded_amount.head(10))
Top Countries with funded_amount(Dollar value of loan funded on Kiva.org)(Mean values)
 country
Cote D'Ivoire        50000.000000
Mauritania           15000.000000
Bhutan                7812.500000
Chile                 7625.000000
Afghanistan           7000.000000
Congo                 6142.578125
Puerto Rico           4409.191176
Dominican Republic    4200.604839
Namibia               4046.875000
United States         3800.843591
Name: funded_amount, dtype: float64
In [32]:
# a = loans.borrower_genders[15213]
loans['female_borrower_count']= loans.borrower_genders.apply(lambda x: str(x).split(', ').count('female'))
loans['male_borrower_count']= loans.borrower_genders.apply(lambda x: str(x).split(', ').count('male'))
In [33]:
borrowers_percent = dict()
borrowers_percent['female'] = 100*loans['female_borrower_count'].sum()/(loans['female_borrower_count'].sum() + loans['male_borrower_count'].sum())
borrowers_percent['male'] = 100*loans['male_borrower_count'].sum()/(loans['female_borrower_count'].sum() + loans['male_borrower_count'].sum())
print('Female borrowers percentage: {:.2f} %'.format(borrowers_percent['female']))
print('Male borrowers percentage: {:.2f} %'.format(borrowers_percent['male']))
Female borrowers percentage: 79.58 %
Male borrowers percentage: 20.42 %
In [34]:
gender_prop = pd.Series(borrowers_percent)
for i, j in enumerate(gender_prop.values):
    print(i, j)
0 79.5794421681
1 20.4205578319
In [35]:
gender_prop = pd.Series(borrowers_percent)
g = sns.barplot(y=gender_prop.index, x=gender_prop.values)
plt.title('Proportion of female and male borrowers', size=20)
plt.ylabel('percentage', size=15)
plt.xticks(size=15)
plt.yticks(size=15);
for gender, value in enumerate(gender_prop.values):
    plt.text(x=0, y=gender, s=(str(np.round(value, 2))+ "%"),fontdict={'size':20})

To Be Continue…