Importing packages

pandas and numpy

In [1]:
import pandas as pd
import numpy as np

Reading a CSV file

In [3]:
df = pd.read_csv('data/file.csv')

df
Out[3]:
Id Name Amount
0 101 Jack 100.0
1 102 Emma 200.0
2 103 Scott 150.0
3 104 Evan 300.0
4 105 Neil 500.0
In [4]:
print df.dtypes
Id          int64
Name       object
Amount    float64
dtype: object

Reading a CSV file with a Date column

In [6]:
raw_df = pd.read_csv('data/file.csv')

raw_df
Out[6]:
Id Date Name Amount
0 101 2018-01-10 Jack 100.0
1 102 2018-01-11 Emma 200.0
2 103 2018-01-12 Scott 150.0
3 104 2018-01-13 Evan 300.0
4 105 2018-01-14 Neil 500.0
In [7]:
print raw_df.dtypes
Id          int64
Date       object
Name       object
Amount    float64
dtype: object
In [8]:
date_parser = lambda dates: [pd.datetime.strptime(d, '%Y-%m-%d') for d in dates]

df = pd.read_csv('data/file.csv',
                 dtype={'Date': np.float64},
                 parse_dates=['Date'], 
                 date_parser=date_parser)

print df.dtypes
df
Id                 int64
Date      datetime64[ns]
Name              object
Amount           float64
dtype: object
Out[8]:
Id Date Name Amount
0 101 2018-01-10 Jack 100.0
1 102 2018-01-11 Emma 200.0
2 103 2018-01-12 Scott 150.0
3 104 2018-01-13 Evan 300.0
4 105 2018-01-14 Neil 500.0

Reading a CSV file with Timestamp Column

In [10]:
raw_df = pd.read_csv('data/file.csv')
print raw_df.dtypes
raw_df
Id          int64
Date        int64
Name       object
Amount    float64
dtype: object
Out[10]:
Id Date Name Amount
0 101 1502841600 Jack 100.0
1 102 1502841600 Emma 200.0
2 103 1502841600 Scott 150.0
3 104 1502841600 Evan 300.0
4 105 1502841600 Neil 500.0
In [11]:
from datetime import datetime

date_parser = lambda timestamp: datetime.fromtimestamp(timestamp)

df = pd.read_csv('data/file.csv',
                 dtype={'Date': np.float64},
                 parse_dates=['Date'], 
                 date_parser=date_parser)

print df.dtypes
df
Id                 int64
Date      datetime64[ns]
Name              object
Amount           float64
dtype: object
Out[11]:
Id Date Name Amount
0 101 2017-08-15 19:00:00 Jack 100.0
1 102 2017-08-15 19:00:00 Emma 200.0
2 103 2017-08-15 19:00:00 Scott 150.0
3 104 2017-08-15 19:00:00 Evan 300.0
4 105 2017-08-15 19:00:00 Neil 500.0

Change data type of columns while reading CSV file

In [13]:
raw_df = pd.read_csv('data/file.csv')
print raw_df.dtypes
raw_df
Id         int64
Name      object
Amount     int64
Bool        bool
dtype: object
Out[13]:
Id Name Amount Bool
0 101 Jack 100 True
1 102 Emma 200 True
2 103 Scott 150 False
3 104 Evan 300 True
4 105 Neil 500 False
In [14]:
df = pd.read_csv('data/file.csv', dtype={'Amount': np.float64, 'Bool': np.int32})
print df.dtypes
df
Id          int64
Name       object
Amount    float64
Bool        int32
dtype: object
Out[14]:
Id Name Amount Bool
0 101 Jack 100.0 1
1 102 Emma 200.0 1
2 103 Scott 150.0 0
3 104 Evan 300.0 1
4 105 Neil 500.0 0

Reading top ‘n’ rows from a CSV file

In [16]:
df = pd.read_csv('data/file.csv',
                 nrows=3)

df
Out[16]:
Id Name Amount
0 101 Jack 100.0
1 102 Emma 200.0
2 103 Scott 150.0

Reading a random sample from a large CSV file

In [18]:
import random 

total_rows = sum(1 for line in open('data/file.csv')) - 1 # if known you can hard code the value

sample_size = 3

# Seed to get same sample every time we run this code
random.seed(42) 

skip = sorted(random.sample(range(1, total_rows), total_rows - sample_size))
df = pd.read_csv('data/file.csv', header=0, skiprows=skip)

df
Out[18]:
Id Name Amount
0 102 Emma 200.0
1 104 Evan 300.0
2 105 Neil 500.0

Handling NA values while reading a CSV file

In [20]:
raw_df = pd.read_csv('data/file.csv')
print raw_df.dtypes
raw_df
Id         int64
Name      object
Amount    object
dtype: object
Out[20]:
Id Name Amount
0 101 Jack 100.0
1 102 Null 200.0
2 103 Scott 150
3 104 Evan N.A.
4 105 Neil 500.0
In [21]:
df= pd.read_csv('data/file.csv', 
                dtype={'Amount': np.float64}, 
                na_values={'Name': 'Null', 'Amount':'N.A.',})

print df.dtypes
df
Id          int64
Name       object
Amount    float64
dtype: object
Out[21]:
Id Name Amount
0 101 Jack 100.0
1 102 NaN 200.0
2 103 Scott 150.0
3 104 Evan NaN
4 105 Neil 500.0

Reading CSV file having comma for thousands separator

In [23]:
raw_df = pd.read_csv('data/file.csv')
print raw_df.dtypes
raw_df
Id         int64
Name      object
Amount    object
dtype: object
Out[23]:
Id Name Amount
0 101 Jack 1,100,000.25
1 102 Emma 250,000.0
2 103 Scott 15,000.10
3 104 Evan N.A.
4 105 Neil 5,000.50
In [24]:
df= pd.read_csv('data/file.csv', 
                dtype={'Amount': np.float64}, 
                na_values='N.A.', 
                thousands=',')
In [25]:
print df.dtypes
df
Id          int64
Name       object
Amount    float64
dtype: object
Out[25]:
Id Name Amount
0 101 Jack 1100000.25
1 102 Emma 250000.00
2 103 Scott 15000.10
3 104 Evan NaN
4 105 Neil 5000.50