Importing Packages¶
In [1]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
from pysqldf import SQLDF; sqldf = SQLDF(globals()); q = getattr(sqldf, 'execute')
import warnings; warnings.filterwarnings('ignore')
from fast import *
%reload_ext autoreload
%autoreload 2
%matplotlib inline
sns.set(style='darkgrid')
pd.set_option('display.float_format', lambda x: '{:f}'.format(x))
PATH = '../data'
Reading data¶
In [3]:
# Show the first rows of `df`.
# NOTE(review): `df` is not assigned in any cell visible here (the In [2] cell is
# absent from this view) — confirm where it is loaded so Restart & Run All works.
display(df.head())
Data Summary¶
In [4]:
# Pass `df` to the `describe` helper and show whatever it returns.
# NOTE(review): `describe` has no visible definition; presumably supplied by
# `from fast import *` — confirm.
describe(df)
Data Exploration¶
Distribution of investments count across different cities¶
In [5]:
# One bar per CityLocation showing how many rows it has; bars follow the
# order returned by value_counts() (most frequent first).
city_order = df['CityLocation'].value_counts().index
fig_city, ax_city = plt.subplots(figsize=(15, 5))
p1 = sns.countplot(data=df, x='CityLocation', order=city_order, ax=ax_city)
# City names are long, so turn the tick labels vertical.
plt.xticks(rotation=90);
Total amount invested per month¶
In [6]:
# Sum AmountInUSD within each Month_Year, then draw one bar per Month_Year.
monthly_totals = df.groupby('Month_Year')['AmountInUSD'].sum()
df_temp = monthly_totals.reset_index()
fig_month_amt, ax_month_amt = plt.subplots(figsize=(8, 5))
sns.barplot(data=df_temp, x='Month_Year', y='AmountInUSD', ax=ax_month_amt)
plt.xticks(rotation=90);
Number of fundings in each month¶
In [7]:
# Count how many rows of `df` carry each Month_Year value.
fig_month_cnt, ax_month_cnt = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Month_Year', ax=ax_month_cnt)
plt.xticks(rotation=90);
Weekday on which funding was announced¶
In [8]:
# Count the non-null Date entries for each Date_Weekday_Name value and draw
# them as a bar chart.
# NOTE(review): bars follow groupby's sorted key order (alphabetical for string
# labels), not calendar order — consider passing an explicit `order=`.
temp = df.groupby(['Date_Weekday_Name']).agg({'Date': 'count'}).reset_index()
# The trailing semicolon suppresses the Axes repr that would otherwise be
# echoed as cell output, matching the other plotting cells in this notebook.
sns.barplot(x=temp.Date_Weekday_Name, y=temp.Date, color='violet');
Out[8]:
Distribution of funding amounts below $1M¶
In [9]:
# Histogram of AmountInUSD drawn by the `histogram` helper, with an arrow
# labelled '~ $500K' pointing at x = 500000.
# NOTE(review): `upper=10**6` presumably caps the plotted values at $1M —
# confirm against the helper's definition, which is not visible here.
ax = histogram(df.AmountInUSD, upper=10 ** 6, bins=30, color='skyblue')
ax.annotate(
    '~ $500K',
    xy=(500000, 4.7e-6),        # arrow tip
    xytext=(700000, 4e-6),      # label position
    arrowprops={'facecolor': 'black', 'shrink': 0.05},
    fontsize=15,
);
plt.title('Histogram');
When funding was announced for big investments¶
In [10]:
# Keep only rows whose `rank` equals 1, order them by Date, and count how many
# fall on each weekday name.
# NOTE(review): how `rank` is computed is not visible in this view.
is_top_rank = df['rank'] == 1
temp = df.loc[is_top_rank].sort_values(by='Date')
sns.countplot(x=temp['Date_Weekday_Name']);
Investment Type Distribution¶
In [11]:
# Share of total AmountInUSD (in percent) contributed by each InvestmentType.
temp = df.groupby('InvestmentType')['AmountInUSD'].sum().reset_index()
temp = temp.assign(
    AmountInUSD=100 * temp['AmountInUSD'] / np.sum(temp['AmountInUSD'])
)
g = sns.barplot(x=temp['InvestmentType'], y=temp['AmountInUSD'])
# Write each percentage (rounded to 2 decimals) at the top of its bar; after
# reset_index the row position is also the bar's x coordinate.
for bar_pos, pct in enumerate(temp['AmountInUSD']):
    g.text(bar_pos, pct, round(pct, 2), color='black', ha="center")
plt.xticks(rotation=45);
Industry Vertical Distribution¶
In [12]:
# Total AmountInUSD per IndustryVertical, largest first, limited to the first
# 40 rows so the x axis stays readable.
industry_totals = df.groupby('IndustryVertical')['AmountInUSD'].sum().reset_index()
temp = (
    industry_totals
    .sort_values(by='AmountInUSD', ascending=False)
    .reset_index(drop=True)
    .head(40)
)
fig_industry, ax_industry = plt.subplots(figsize=(20, 8))
sns.barplot(x=temp['IndustryVertical'], y=temp['AmountInUSD'], ax=ax_industry)
plt.xticks(rotation=90);