Mercurial Hosting > traffic-intelligence
changeset 1156:f7fbe624fff7
added helper functions for categorical variables
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Fri, 25 Sep 2020 11:56:59 -0400 |
parents | fd729e8f073c |
children | 173b7926734e |
files | trafficintelligence/utils.py |
diffstat | 1 files changed, 29 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
# Reconstructed from the unified diff of trafficintelligence/utils.py
#   --- a/trafficintelligence/utils.py  Wed Sep 09 23:47:57 2020 -0400
#   +++ b/trafficintelligence/utils.py  Fri Sep 25 11:56:59 2020 -0400
#
# Hunk @@ -8,12 +8,12 @@: `chi2_contingency` is appended to the scipy.stats
# import and `crosstab` to the pandas import; the neighbouring import lines
# are unchanged context and are not repeated here.
from scipy.stats import rv_continuous, kruskal, shapiro, lognorm, norm, t, chi2_contingency
from pandas import DataFrame, concat, crosstab

# Hunk @@ -654,7 +654,33 @@: the two helpers below are inserted between the
# function ending in `return sample` and the regression-analysis banner.
#
# `ones` is provided by the module's existing `from numpy import ...` line
# (unchanged context in the diff). `sqrt` is used unqualified exactly as in
# the changeset -- presumably imported earlier in utils.py; TODO confirm.


def cramers_v(x, y):
    """Return the bias-corrected Cramer's V for categorical-categorical association.

    Uses the correction from Bergsma, Wicher,
    Journal of the Korean Statistical Society 42 (2013): 323-328
    https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9
    https://stackoverflow.com/questions/46498455/categorical-features-correlation/46498792#46498792

    x, y: two equally long sequences of category labels; they are passed
          straight to pandas.crosstab to build the contingency table.

    Returns a float: 0 when the corrected association vanishes, about 1 for
    a perfect association. Returns NaN when the statistic is undefined:
    fewer than two observations, a variable with a single category, or as
    many categories as observations (the corrected denominator is then 0).
    """
    confusionMatrix = crosstab(x, y)
    n = confusionMatrix.sum().sum()
    if n < 2:
        # covers empty input too, on which chi2_contingency would raise
        return float('nan')
    r, k = confusionMatrix.shape
    # bias-corrected numbers of rows and columns
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    denominator = min((kcorr-1), (rcorr-1))
    if denominator <= 0:
        # explicit NaN instead of a 0/0 division
        return float('nan')
    # NOTE: chi2_contingency is called with its defaults, as in the original
    # changeset (scipy documents a continuity correction for 1 degree of freedom)
    chi2 = chi2_contingency(confusionMatrix)[0]
    phi2 = chi2/n
    # the corrected phi^2 is clipped at 0 so the square root stays real
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    return sqrt(phi2corr/denominator)


def categoricalCorrelationMatrix(data, categoricalVariables):
    """Return the symmetric matrix of Cramer's V between categorical variables.

    data: container indexed by variable name (e.g. a pandas DataFrame);
          data[name] must yield the sequence of labels of that variable
    categoricalVariables: list of the variable names to compare

    Returns a len(categoricalVariables) x len(categoricalVariables) numpy
    array whose diagonal is 1 and whose element (i, j) is
    cramers_v(data[categoricalVariables[i]], data[categoricalVariables[j]]).
    """
    nVariables = len(categoricalVariables)
    corr = ones((nVariables, nVariables))
    for i in range(nVariables):
        # only the lower triangle is computed, then mirrored
        for j in range(i):
            corr[i,j] = cramers_v(data[categoricalVariables[i]], data[categoricalVariables[j]])
            corr[j,i] = corr[i,j]
    return corr

#########################
# regression analysis using statsmodels (and pandas)
#########################