Cheatsheet
Some helpful functions for Data Analysis and ML with Python
Python Data Science Cheatsheet
General information from these sources:
- https://towardsdatascience.com/feature-selection-techniques-in-machine-learning-with-python-f24e7da3f36e
- https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60
Univariate Selection
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
DATA_FILE = 'sample-data/mobile-price-classification/train.csv'
uv_data = pd.read_csv(DATA_FILE)
uv_data.head()
uv_x = uv_data.iloc[:, 0:20]  # Feature columns (the first 20)
uv_y = uv_data.iloc[:, -1]  # Target column (the last)
feature_count = 10  # Number of top features to show
# Score all features with the chi-squared test (chi2 requires non-negative features)
scores = SelectKBest(score_func=chi2, k='all').fit(uv_x, uv_y).scores_
df_fit = pd.DataFrame(scores) # Scores as DF
df_cols = pd.DataFrame(uv_x.columns) # Column names as DF
df_scores = pd.concat([df_cols, df_fit], axis=1)
df_scores.columns = ['Feature', 'Score']
df_scores.nlargest(feature_count, 'Score')
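Once the scores look plausible, the same selector can also reduce the feature matrix directly. A minimal sketch, reusing uv_x, uv_y and feature_count from above:
selector = SelectKBest(score_func=chi2, k=feature_count)
uv_x_selected = selector.fit_transform(uv_x, uv_y)  # Keeps only the top-k features
print(uv_x.columns[selector.get_support()])  # Names of the selected features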
Feature Selection
import pandas as pd
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
fs_data = pd.read_csv(DATA_FILE)
fs_x = fs_data.iloc[:,0:20]
fs_y = fs_data.iloc[:,-1]
classifier = ExtraTreesClassifier() # Create classifier instance
classifier.fit(fs_x, fs_y) # Train the Classifier
fs_importance = classifier.feature_importances_
print(fs_importance)
df_importance = pd.Series(fs_importance, index=fs_x.columns)
df_importance.nlargest(10).plot(kind='barh')  # Plot the ten most important features
plt.show()
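The importances can also drive the selection step automatically. A minimal sketch using scikit-learn's SelectFromModel with its default threshold (the mean importance); variable names are from above:
from sklearn.feature_selection import SelectFromModel
sfm = SelectFromModel(classifier, prefit=True)  # Default threshold: mean importance
fs_x_selected = sfm.transform(fs_x)  # Drop low-importance feature columns
print(fs_x.columns[sfm.get_support()])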
Normal Correlation Heatmap
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
cm_data = pd.read_csv(DATA_FILE)
cm_data.head()
cm_x = cm_data.iloc[:,0:20] # Extract feature columns
cm_y = cm_data.iloc[:,-1] # Extract target column
correlation_matrix = cm_data.corr()
correlation_matrix
top_correlation_features = correlation_matrix.index  # Here: all numeric columns (no threshold applied)
plt.figure(figsize=(20,20))
_ = sns.heatmap(cm_data[top_correlation_features].corr(), annot=True, cmap="RdYlGn")
Or as the following function:
def plot_df_correlation(df):
    plt.figure(figsize=(20, 20))
    return sns.heatmap(df[df.corr().index].corr(), annot=True, cmap="coolwarm")
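Usage, e.g. with the mobile-price data loaded above:
_ = plot_df_correlation(cm_data)
plt.show()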
Simplified One-Hot Encoding
Only encode values that occur more often than a given threshold; rarer values are collapsed into a single 'other' category.
domain_counts = df_domain_invoice['Email Domain'].value_counts()
replace_domains = domain_counts[domain_counts < 100].index
# One-hot encoding of domains; rare domains are collapsed into 'other_'
# (get_dummies ignores the `columns` argument for a Series input, so it is omitted here)
df_domain_invoice = pd.get_dummies(
    df_domain_invoice['Email Domain'].replace(replace_domains, 'other_'),
    drop_first=False)
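The same idea as a reusable helper; a sketch, where the function name and the default threshold are assumptions:
def one_hot_encode_rare(series, threshold=100, other_label='other_'):
    # Collapse values rarer than `threshold` into one label, then one-hot encode
    counts = series.value_counts()
    rare = counts[counts < threshold].index
    return pd.get_dummies(series.replace(list(rare), other_label))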
Heatmapping Categorical Correlation
From the following sources:
- https://stackoverflow.com/questions/20892799/using-pandas-calculate-cram%C3%A9rs-coefficient-matrix
- https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792
- https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
- https://www.kaggle.com/mlwhiz/seaborn-visualizations-using-football-data
import math
from collections import Counter

import pandas as pd
import numpy as np
import scipy.stats as ss
import seaborn as sns
from matplotlib import pyplot as plt
def convert(data, to):
    """Convert `data` to an array, list, or DataFrame."""
    converted = None
    if to == 'array':
        if isinstance(data, np.ndarray):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values
        elif isinstance(data, list):
            converted = np.array(data)
        elif isinstance(data, pd.DataFrame):
            converted = data.to_numpy()  # .as_matrix() was removed in pandas 1.0
    elif to == 'list':
        if isinstance(data, list):
            converted = data
        elif isinstance(data, pd.Series):
            converted = data.values.tolist()
        elif isinstance(data, np.ndarray):
            converted = data.tolist()
    elif to == 'dataframe':
        if isinstance(data, pd.DataFrame):
            converted = data
        elif isinstance(data, np.ndarray):
            converted = pd.DataFrame(data)
    else:
        raise ValueError("Unknown data conversion: {}".format(to))
    if converted is None:
        raise TypeError('cannot handle data conversion of type: {} to {}'.format(type(data), to))
    return converted
def conditional_entropy(x, y):
    """
    Calculates the conditional entropy of x given y: S(x|y)
    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy
    :param x: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :param y: list / NumPy ndarray / Pandas Series
        A sequence of measurements
    :return: float
    """
    # Entropy of x given y
    y_counter = Counter(y)
    xy_counter = Counter(list(zip(x, y)))
    total_occurrences = sum(y_counter.values())
    entropy = 0.0
    for xy in xy_counter.keys():
        p_xy = xy_counter[xy] / total_occurrences
        p_y = y_counter[xy[1]] / total_occurrences
        entropy += p_xy * math.log(p_y / p_xy)  # equals -p_xy * log(p_xy / p_y)
    return entropy
def cramers_v(x, y):
    """Bias-corrected Cramér's V between two categorical series (symmetric, 0..1)."""
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # Bias correction (Bergsma & Wicher, 2013)
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
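A quick usage check on toy data (values made up for illustration):
a = pd.Series(['x', 'x', 'y', 'y', 'y', 'z'])
b = pd.Series(['u', 'u', 'v', 'v', 'u', 'v'])
print(cramers_v(a, b))  # Symmetric: cramers_v(b, a) gives the same value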
def theils_u(x, y):
    """Theil's U (uncertainty coefficient) U(x|y): asymmetric, 0..1."""
    s_xy = conditional_entropy(x, y)
    x_counter = Counter(x)
    total_occurrences = sum(x_counter.values())
    p_x = list(map(lambda n: n / total_occurrences, x_counter.values()))
    s_x = ss.entropy(p_x)
    if s_x == 0:
        return 1
    return (s_x - s_xy) / s_x
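Unlike Cramér's V, Theil's U is asymmetric, which a quick check makes visible (toy data, made up for illustration):
a = pd.Series(['x', 'x', 'y', 'y', 'z', 'z'])
b = pd.Series(['u', 'u', 'u', 'v', 'v', 'v'])
print(theils_u(a, b), theils_u(b, a))  # Generally two different values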
def correlation_ratio(categories, measurements):
    """Correlation ratio (eta) between a categorical and a continuous series: 0..1."""
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat) + 1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(0, cat_num):
        cat_measures = measurements[np.argwhere(fcat == i).flatten()]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array)) / np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    if numerator == 0:
        eta = 0.0
    else:
        eta = np.sqrt(numerator / denominator)  # eta itself, not eta squared
    return eta
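Usage sketch with a categorical and a continuous series (values made up):
cats = pd.Series(['a', 'a', 'b', 'b', 'c', 'c'])
vals = np.array([10.0, 11.0, 20.0, 21.0, 30.0, 31.0])
print(correlation_ratio(cats, vals))  # Close to 1: the category almost determines the value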
def associations(dataset, nominal_columns=None, mark_columns=False, theil_u=False, plot=True,
                 return_results=False, **kwargs):
    """
    Calculate the correlation/strength-of-association of features in data-set with both categorical (nominal) and
    continuous features using:
     - Pearson's R for continuous-continuous cases
     - Correlation Ratio for categorical-continuous cases
     - Cramer's V or Theil's U for categorical-categorical cases
    :param dataset: NumPy ndarray / Pandas DataFrame
        The data-set for which the features' correlation is computed
    :param nominal_columns: string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
        columns are categorical, or None (default) to state none are categorical
    :param mark_columns: Boolean (default: False)
        If True, output columns' names will have a suffix of '(nom)' or '(con)' based on their type (nominal or
        continuous), as provided by nominal_columns
    :param theil_u: Boolean (default: False)
        In the case of categorical-categorical features, use Theil's U instead of Cramer's V
    :param plot: Boolean (default: True)
        If True, plot a heat-map of the correlation matrix
    :param return_results: Boolean (default: False)
        If True, the function will return a Pandas DataFrame of the computed associations
    :param kwargs:
        Arguments to be passed to the used functions and methods
    :return: Pandas DataFrame
        A DataFrame of the correlation/strength-of-association between all features
    """
    dataset = convert(dataset, 'dataframe')
    columns = dataset.columns
    if nominal_columns is None:
        nominal_columns = list()
    elif nominal_columns == 'all':
        nominal_columns = columns
    corr = pd.DataFrame(index=columns, columns=columns)
    for i in range(0, len(columns)):
        for j in range(i, len(columns)):
            if i == j:
                corr.loc[columns[i], columns[j]] = 1.0
            elif columns[i] in nominal_columns:
                if columns[j] in nominal_columns:
                    if theil_u:
                        # Theil's U is asymmetric, so both directions are computed
                        corr.loc[columns[i], columns[j]] = theils_u(dataset[columns[i]], dataset[columns[j]])
                        corr.loc[columns[j], columns[i]] = theils_u(dataset[columns[j]], dataset[columns[i]])
                    else:
                        cell = cramers_v(dataset[columns[i]], dataset[columns[j]])
                        corr.loc[columns[i], columns[j]] = cell
                        corr.loc[columns[j], columns[i]] = cell
                else:
                    cell = correlation_ratio(dataset[columns[i]], dataset[columns[j]])
                    corr.loc[columns[i], columns[j]] = cell
                    corr.loc[columns[j], columns[i]] = cell
            else:
                if columns[j] in nominal_columns:
                    cell = correlation_ratio(dataset[columns[j]], dataset[columns[i]])
                    corr.loc[columns[i], columns[j]] = cell
                    corr.loc[columns[j], columns[i]] = cell
                else:
                    cell, _ = ss.pearsonr(dataset[columns[i]], dataset[columns[j]])
                    corr.loc[columns[i], columns[j]] = cell
                    corr.loc[columns[j], columns[i]] = cell
    corr = corr.astype(float)  # Cells were assigned one by one, so the dtype is object
    if mark_columns:
        marked_columns = ['{} (nom)'.format(col) if col in nominal_columns else '{} (con)'.format(col)
                          for col in columns]
        corr.columns = marked_columns
        corr.index = marked_columns
    if plot:
        plt.figure(figsize=kwargs.get('figsize', (20, 20)))
        sns.heatmap(corr, annot=kwargs.get('annot', True), fmt=kwargs.get('fmt', '.2f'), cmap='coolwarm')
        plt.show()
    if return_results:
        return corr
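Usage, e.g. on the mobile-price data loaded above (the choice of nominal columns here is an assumption about that dataset's binary columns):
df_assoc = pd.read_csv(DATA_FILE)
associations(df_assoc, nominal_columns=['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi'],
             theil_u=True, plot=True)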