Some frequently needed utilities for Python data scripts, good to have at hand when puzzling over a dataset.
ETL
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
Data Loading
import pandas as pd
# From CSV
df = pd.read_csv("path")
# From Excel
df = pd.read_excel('/path')
# From database (sqlite)
import sqlite3
conn = sqlite3.connect("foo.db")
cur = conn.cursor()
# List the tables in the database
cur.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
# Read a SQL query result into a pandas DataFrame
df = pd.read_sql_query("select * from bar limit 5;", conn)
Indexing
# Set index
df = df.set_index('colName')
# loc works on labels in the index
df.loc[:'rowLabel']
# iloc works on positions in the index (so it only takes integers)
df.iloc[:3]
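A toy Series (hypothetical values and index) makes the label-vs-position distinction concrete:
s = pd.Series(['a', 'b', 'c'], index=[10, 20, 30])
s.loc[10:20]   # by label, endpoints included -> 'a', 'b'
s.iloc[0:2]    # by position, end excluded    -> 'a', 'b'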
Sorting
# First c1, then c2
df = df.sort_values(['c1','c2'], ascending=[False, True])
Dropping
# Drop columns (axis=0 drops rows; axis=1 drops columns)
df = df.drop(['Cabin','Ticket'], axis=1)
# Drop rows
df = df.drop(['label string'])    # by index label
df = df.drop(df.index[[1,3]])     # by row number (0-based)
Slicing
# Filter the dataset by a certain condition
df = df[df.name != 'Tina']
Dealing with missing values
# Drop them
df = df.dropna()
# Fill them
df.fillna(0) # fill by a number
df.fillna(method='ffill') # propagates last valid observation forward to next valid
df.fillna(method='bfill')
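Before dropping or filling, it helps to see how much is actually missing; a quick check on the same df:
df.isnull().sum()    # missing count per column
df.isnull().mean().sort_values(ascending=False)    # missing ratio, largest first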
Sampling
# Sample a fixed number of rows (n=...) or a fraction of rows (frac=...); set one of the two
df_GM_sample = df_GM.sample(frac=0.1)
Apply function
# axis=0: apply func to each column; axis=1: apply func to each row
df.apply(np.sum, axis=0)    # e.g. column sums
# apply to every element
df.applymap(lambda x: len(str(x)))    # e.g. length of each element's string form
Dealing with datetime
# string to datetime
df['dt'] = pd.to_datetime(df['dt'], format='%Y%m%d')
# get datetime indexes
t = pd.DatetimeIndex(df.dt)
hr = t.hour
df['HourOfDay'] = hr
month = t.month
df['Month'] = month
year = t.year
df['Year'] = year
# resample time series
df = df.set_index('datetime')
weekly_summary = pd.DataFrame()
weekly_summary['speed'] = df.speed.resample('W').mean()
weekly_summary['distance'] = df.distance.resample('W').sum()
weekly_summary['cumulative_distance'] = df.cumulative_distance.resample('W').last()
# generate given format string from datetime
df['DOB1'] = df['DOB'].dt.strftime('%m/%d/%Y')
Categorical to dummy
dummiesT = pd.get_dummies(test['Embarked'],prefix = 'Embarked')
test = pd.concat([test,dummiesT],axis = 1)
test = test.drop('Embarked',axis =1)
concat & join
# concat along rows
df_new = pd.concat([df_a, df_b])
# join
df = df1.join(df2, how='left', lsuffix='', rsuffix='', sort=False)
Groupby
df.groupby(by = 'Sex').mean()
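For several statistics per group at once, agg takes a column-to-function mapping; a sketch on the same Titanic-style columns:
df.groupby('Sex').agg({'Age': 'mean', 'Fare': ['median', 'max'], 'Survived': 'sum'})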
Differencing & Cumulation
# Differencing
data['instantaneous'] = data.volume_out.diff()
# Cumulation
consum.loc[:,"group"] = consum["is_start_point"].cumsum()
Sliding Window Apply
df["is_lucky_than_previous"] =\
pd.rolling_apply(df.Survived, 2, lambda x: x[1] - x[0] == 1).fillna(1)
Regular Expression
import re

def volCalc(row):
    name = row['tbordername']
    try:
        vol = 0
        p = re.compile(r'(\d+)ml')
        sizes = p.findall(name)
        for size in sizes:
            p1 = re.compile(size + r'ml\D+(\d)\D+')
            amount = p1.findall(name)
            if amount:
                vol += int(size) * int(amount[0])
            else:
                vol += int(size) * 1
        return vol
    except Exception:
        return 'N/A'
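To use it, apply row-wise to a DataFrame that has the tbordername column; the result column name below is just an example:
df['total_volume'] = df.apply(volCalc, axis=1)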
Descriptive Stats
Numerical stats
df.describe()
Correlation
corr = df.corr()
plt.matshow(df.corr())
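An annotated heatmap is often easier to read than matshow; one option with the seaborn imported above:
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation matrix')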
Basic Charts
# line chart
fig = plt.figure(figsize=(12,6))
plt.plot(data.dateTime,data.volume_out)
plt.title('title')
# hist: numerical feature distribution
df.Age.hist()
# categorical feature distribution
df.Survived.value_counts().plot(kind = 'bar')
# Basic box plot
sns.boxplot(consum.instantaneous,orient='v')
plt.title('instantaneous consumption value distribution')
# Box plot with hue
sns.boxplot(x="Sex", y="Age",hue = 'Survived', data=df, palette="Set3")
# Scatter
plt.scatter(df.Fare,df.Survived)
plt.xlabel('Fare')
plt.ylabel('Survived?')
# Regression chart
sns.jointplot(x="duration", y="usage", kind = 'reg', data=filtered)
plt.title('title')
Feature Engineering
Rescaling
# (0,1) scaling
# (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
cols_to_norm = ['PassengerId','SibSp']
df[cols_to_norm] = scaler.fit_transform(df[cols_to_norm])
# Standardization: Zero mean and unit variance
from sklearn.preprocessing import scale
cols_to_norm = ['Age','SibSp']
df[cols_to_norm] = scale(df[cols_to_norm])
# Normalization: scale each individual observation (row) to have unit norm.
# Useful when a quadratic form such as the dot product, or another kernel, quantifies the similarity between pairs of samples (e.g. KNN).
from sklearn.preprocessing import normalize
num = df._get_numeric_data()
df_normalized = pd.DataFrame(normalize(num, norm='l2'), columns=num.columns, index=num.index)
df_normalized.apply(lambda x: np.sqrt(x.dot(x)), axis=1) # check results
Feature Binarization
# thresholding numerical features to get boolean values
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=30)
df['Age'] = binarizer.fit_transform(df[['Age']])[:, 0]
Generating Polynomial Features
# get features’ high-order and interaction terms
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2)
# (X_1, X_2) to (1, X_1, X_2, X_1^2, X_1X_2, X_2^2)
X_poly = pd.DataFrame(poly.fit_transform(X))
Feature Selection
Filter methods
# Variance Threshold: drop features whose variance falls below a cutoff
from sklearn.feature_selection import VarianceThreshold
X_reduced = VarianceThreshold(threshold=0.1).fit_transform(X)
# Univariate feature selection
from sklearn.feature_selection import SelectKBest, chi2
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
Embedded Methods
# LASSO: keep features whose fitted coefficients are non-zero
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.1).fit(X, y)
lasso.coef_
# Tree-based: rank features by importance
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100).fit(X, y)
rf.feature_importances_
Algorithms
Frequently Used Pieces
Linear Regression
from sklearn import linear_model
# Create linear regression object
regr = linear_model.LinearRegression(fit_intercept=True)
# Train the model using the training sets
regr.fit(df, y)
# The coefficients
print('Coefficients:', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% np.mean((regr.predict(df) - y) ** 2))
# R^2 score: 1 is perfect prediction
print("R squared score:", regr.score(df, y))
# Coef_ check
plt.figure(figsize=(12,8))
plt.barh(range(len(regr.coef_)),regr.coef_,height=0.2,tick_label = df.columns)
plt.title('Regression Coefficients')
# Residuals Check
res = regr.predict(df) - y
plt.axhline(0)
plt.scatter(range(len(res)),res.values,color = 'r')
plt.title('Residual Plot')
Kmeans
from sklearn.cluster import KMeans
estimator = KMeans(n_clusters=3)
estimator.fit(filtered_scaled)
labels = estimator.labels_
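A rough way to choose n_clusters is to compare inertia_ across a few values of k (the elbow heuristic); a sketch on the same scaled data:
inertias = [KMeans(n_clusters=k).fit(filtered_scaled).inertia_ for k in range(1, 10)]
plt.plot(range(1, 10), inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')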
Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
rf = RandomForestClassifier(n_estimators=8000,n_jobs=-1,oob_score=True)
rf.fit(train,res)
rf.oob_score_
# feature importance
feature_importances = pd.Series(rf.feature_importances_,index = train.columns)
feature_importances = feature_importances.sort_values()
feature_importances.plot(kind = 'barh')
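roc_auc_score is imported above but never called; one way to use it, assuming a held-out X_val / y_val split exists:
val_probs = rf.predict_proba(X_val)[:, 1]    # probability of the positive class
print("Validation AUC:", roc_auc_score(y_val, val_probs))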
Item-based CF
import graphlab
train_data = graphlab.SFrame(df_CF)
item_sim_model = graphlab.item_similarity_recommender.create(train_data, user_id='Customer_id', item_id='item')
# Make Recommendations
item_sim_recomm = item_sim_model.recommend(users=['5208494361'],k=10)
item_sim_recomm.print_rows()
Time Series
# Dickey-Fuller (DF) test for stationarity
from statsmodels.tsa.stattools import adfuller
def test_stationarity(timeseries):
    # Determine rolling statistics
    rolmean = timeseries.rolling(window=10).mean()
    rolstd = timeseries.rolling(window=10).std()
    # Plot rolling statistics
    plt.figure(figsize=(14,8))
    orig = plt.plot(timeseries, color='blue', label='Original')
    mean = plt.plot(rolmean, color='red', label='Rolling Mean')
    std = plt.plot(rolstd, color='black', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)
    # Perform Dickey-Fuller test
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries.SALES_IN_ML, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
# ARIMA
# Ordering: ACF and PACF plots
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_acf(TS_log, lags=30)
plot_pacf(TS_log, lags=30)
# Ordering: AIC
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import arma_order_select_ic
# Smaller is better
arma_order_select_ic(TS_log, max_ar=4, max_ma=0, ic='aic')
# Modeling
model = ARIMA(TS_log, order=(1, 0, 0))
results_AR = model.fit(disp= 1)
plt.plot(TS_log)
plt.plot(results_AR.fittedvalues, color='red')
TS_fitted = pd.DataFrame(results_AR.fittedvalues,columns=['SALES_IN_ML'])
# Residual Series Check
residuals = TS_log - TS_fitted
test_stationarity(residuals)
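Once the residuals look stationary, the fitted results object can project forward; with the older statsmodels ARIMA API used above, forecast returns point forecasts, standard errors, and confidence intervals (steps=12 is an arbitrary horizon):
forecast, stderr, conf_int = results_AR.forecast(steps=12)
print(forecast)    # point forecasts, on the same (log) scale as TS_log
print(conf_int)    # 95% confidence intervals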
Tuning & Validation
Training/Test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.4, random_state=0)
Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn import svm
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, X.values, y[0].values, cv=5)
# 95% confidence interval of the score
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Exhaustive Grid Search
from sklearn.model_selection import GridSearchCV
svr = svm.SVC()
parameters = {'kernel':('linear','rbf'), 'C':[1, 10]}
clf = GridSearchCV(svr, parameters,n_jobs = -1,cv = 5)
clf.fit(X, y[0])
clf.cv_results_
clf.best_estimator_
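After fitting, the search object exposes the winning configuration and, with the default refit=True, a ready-to-use estimator:
print(clf.best_params_, clf.best_score_)
predictions = clf.best_estimator_.predict(X)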