Importing packages¶
pandas
and numpy
In [1]:
from datetime import datetime

import numpy as np
import pandas as pd
Reading a CSV file¶
In [3]:
# Load the CSV with pandas' default settings, then leave the frame as the
# cell's last expression so the notebook renders it as a table.
df = pd.read_csv(
    'data/file.csv',
)
df
Out[3]:
In [4]:
# Show the dtype pandas assigned to each column. The call form of print runs
# on both Python 2 and Python 3; the bare print statement is Python 2 only.
print(df.dtypes)
Reading a CSV file with a Date column¶
In [6]:
# Baseline read with no date handling, kept under its own name so it can be
# compared with the date-parsed frame built further down.
raw_df = pd.read_csv(
    'data/file.csv',
)
raw_df
Out[6]:
In [7]:
# Column dtypes of the baseline read. print() as a call works on Python 2
# and Python 3 alike, whereas the statement form fails to parse on Python 3.
print(raw_df.dtypes)
In [8]:
def date_parser(dates):
    """Convert an iterable of 'YYYY-MM-DD' strings into ``datetime`` objects.

    Parameters
    ----------
    dates : iterable of str
        Raw values of the date column, one string per row.

    Returns
    -------
    list of datetime.datetime
        One parsed value per input string, in the same order.

    Raises
    ------
    ValueError
        If a string does not match the '%Y-%m-%d' format.
    """
    # `pd.datetime` was only an alias of the standard-library class and was
    # removed in pandas 2.0, so call `datetime.strptime` directly.
    return [datetime.strptime(d, '%Y-%m-%d') for d in dates]


# No `dtype` override for 'Date': the parser above needs the raw strings, so
# asking for float64 on the same column contradicted `parse_dates`.
# NOTE(review): `date_parser=` is deprecated in pandas >= 2.0 in favour of
# `date_format='%Y-%m-%d'`; switch once older pandas need not be supported.
df = pd.read_csv('data/file.csv',
                 parse_dates=['Date'],
                 date_parser=date_parser)
# print() as a call is valid on both Python 2 and Python 3.
print(df.dtypes)
df
Out[8]:
Reading a CSV file with a Timestamp column¶
In [10]:
# Baseline read without any date handling, for comparison with the parsed
# version in the next cell.
raw_df = pd.read_csv('data/file.csv')
# print() as a call runs on Python 2 and 3; the statement form is Python 2 only.
print(raw_df.dtypes)
raw_df
Out[10]:
In [11]:
from datetime import datetime


def date_parser(timestamp):
    """Convert one Unix timestamp (seconds) into a ``datetime``.

    ``datetime.fromtimestamp`` is called without a ``tz`` argument, so the
    result is a naive datetime expressed in the machine's local time zone.
    """
    return datetime.fromtimestamp(timestamp)


# 'Date' is requested as float64, presumably so that the parser receives
# numeric epoch values rather than strings (confirm against the data file).
df = pd.read_csv('data/file.csv',
                 dtype={'Date': np.float64},
                 parse_dates=['Date'],
                 date_parser=date_parser)
# print() as a call is valid on both Python 2 and Python 3.
print(df.dtypes)
df
Out[11]:
Change data type of columns while reading CSV file¶
In [13]:
# Read with inferred dtypes first, so the effect of the explicit `dtype`
# mapping in the next cell can be seen.
raw_df = pd.read_csv('data/file.csv')
# The call form of print works on Python 2 and 3; the statement form does not
# parse on Python 3.
print(raw_df.dtypes)
raw_df
Out[13]:
In [14]:
# Override the inferred dtypes while reading: 'Amount' becomes float64 and
# 'Bool' becomes int32.
df = pd.read_csv('data/file.csv', dtype={'Amount': np.float64, 'Bool': np.int32})
# print() as a call is valid on both Python 2 and Python 3.
print(df.dtypes)
df
Out[14]:
Reading top ‘n’ rows from a CSV file¶
In [16]:
# `nrows=3` stops reading after the first three data rows (the header line
# is not counted), so only the top of the file is loaded.
df = pd.read_csv('data/file.csv', nrows=3)
df
Out[16]:
Reading a random sample from a large CSV file¶
In [18]:
import random

# Count the data rows: every line of the file except the header.
# The `with` block closes the file handle once counting is done.
with open('data/file.csv') as csv_file:
    total_rows = sum(1 for _ in csv_file) - 1  # if known you can hard code the value
sample_size = 3
# Seed to get same sample every time we run this code
random.seed(42)
# Data rows occupy file lines 1..total_rows (line 0 is the header), so the
# candidate range must include `total_rows` itself; otherwise the last row can
# never be skipped and always ends up in the sample.
# `max(..., 0)` keeps the whole file when it has fewer rows than `sample_size`
# instead of asking `random.sample` for a negative count.
skip = sorted(random.sample(range(1, total_rows + 1),
                            max(total_rows - sample_size, 0)))
df = pd.read_csv('data/file.csv', header=0, skiprows=skip)
df
Out[18]:
Handling NA values while reading a CSV file¶
In [20]:
# Read without custom NA markers first, to compare with the `na_values`
# version in the next cell.
raw_df = pd.read_csv('data/file.csv')
# print() as a call runs on Python 2 and 3; the statement form is Python 2 only.
print(raw_df.dtypes)
raw_df
Out[20]:
In [21]:
# Per-column NA markers: 'Null' in the Name column and 'N.A.' in the Amount
# column are read as missing values, and 'Amount' is requested as float64.
df = pd.read_csv('data/file.csv',
                 dtype={'Amount': np.float64},
                 na_values={'Name': 'Null', 'Amount': 'N.A.'})
# print() as a call is valid on both Python 2 and Python 3.
print(df.dtypes)
df
Out[21]:
Reading a CSV file that uses a comma as the thousands separator¶
In [23]:
# Read without the `thousands` option first, to compare with the next cell.
raw_df = pd.read_csv('data/file.csv')
# The call form of print works on Python 2 and 3; the statement form does not
# parse on Python 3.
print(raw_df.dtypes)
raw_df
Out[23]:
In [24]:
# `thousands=','` tells the parser that commas inside numbers are digit
# separators; 'N.A.' is read as missing and 'Amount' is requested as float64.
df = pd.read_csv(
    'data/file.csv',
    dtype={'Amount': np.float64},
    na_values='N.A.',
    thousands=',',
)
In [25]:
# Show the resulting column dtypes, then render the frame itself.
# print() as a call runs on Python 2 and 3; the statement form is Python 2 only.
print(df.dtypes)
df
Out[25]: