Mercurial Hosting > traffic-intelligence
changeset 668:f8dcf483b296
code to prepare regression variables (remove correlated variables) and record dataset size in experiments
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Tue, 26 May 2015 11:19:03 +0200 |
parents | 179b81faa1f8 |
children | df6be882f325 |
files | python/utils.py |
diffstat | 1 files changed, 104 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
    '''Computes (and optionally displays) the correlation matrix for a pandas DataFrame

    data: pandas DataFrame
    correlationMethod: method passed to DataFrame.corr ('pearson', 'kendall' or 'spearman')
    plotFigure: if True, draws the matrix as a heat map (requires matplotlib, imported as plt at module level)
    displayNames: optional dict mapping a stripped column name to the label to display
    figureFilename: if not None, the figure is saved to this file

    Returns the correlation matrix (pandas DataFrame)'''
    c = data.corr(correlationMethod)
    if plotFigure:
        fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0]))
        fig.add_subplot(1,1,1)
        plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
        # fixed: displayNames defaults to None, so displayNames.get(...) crashed
        # whenever plotFigure was True without a mapping; fall back to raw names
        # NOTE(review): assumes column labels are strings (s.strip()) — confirm with callers
        if displayNames is not None:
            colnames = [displayNames.get(s.strip(), s.strip()) for s in c.columns.tolist()]
        else:
            colnames = [s.strip() for s in c.columns.tolist()]
        plt.xticks(range(len(colnames)), colnames, rotation=90)
        plt.yticks(range(len(colnames)), colnames)
        plt.tick_params('both', length=0)
        plt.subplots_adjust(bottom = 0.29)
        plt.colorbar()
        plt.title('Correlation ({})'.format(correlationMethod))
        plt.tight_layout()
        if figureFilename is not None:
            plt.savefig(figureFilename, dpi = 150, transparent = True)
    return c

def addDummies(data, variables, allVariables = True):
    '''Adds binary dummy variables for each value of a nominal variable
    in a pandas DataFrame

    Dummies are only created for columns of object dtype with more than
    two distinct values; if allVariables is False, the last value is
    skipped (one reference category, as for regression dummy coding).

    Modifies data in place (adds boolean columns named var_value, with
    '.', ' ' and '-' stripped from the value) and returns the list of
    the new column names'''
    from numpy import dtype
    newVariables = []
    for var in variables:
        if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
            values = data[var].unique()
            if not allVariables:
                values = values[:-1]
            for val in values:
                # fixed: the original tested 'val is not NaN' (object identity with
                # numpy's NaN singleton), which let pandas-produced NaN floats through
                # and created a dummy column for missing values; val == val is False
                # only for NaN, and None is excluded explicitly
                if val is not None and val == val:
                    newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
                    data[newVariable] = (data[var] == val)
                    newVariables.append(newVariable)
    return newVariables

def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'):
    '''Studies the influence of the (nominal) independent variable over the dependent variable

    Tests whether the conditional distributions are normal
    using the Shapiro-Wilk test (in which case ANOVA could be used)
    and applies the non-parametric Kruskal-Wallis test

    Rows with a null independent variable are ignored.
    Returns the scipy kruskal result (statistic, p-value), or None if the
    independent variable takes fewer than two distinct values'''
    tmp = data[data[independentVariable].notnull()]
    independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
    if len(independentVariableValues) >= 2:
        for x in independentVariableValues:
            # fixed: converted Python 2 print statements to print() calls
            print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable, independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable])))
            if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: # Shapiro-Wilk needs at least 3 observations
                print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]))
        if plotFigure:
            plt.figure()
            plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
            plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues)
            plt.title('{} vs {}'.format(dependentVariable, independentVariable))
            if figureFilenamePrefix is not None:
                plt.savefig(figureFilenamePrefix+'{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
        # fixed: DataFrame.sort was removed from pandas (use sort_values); describe()
        # on a grouped series now returns a DataFrame directly, so no unstack() needed
        # TODO format the table (html?)
        print(tmp.groupby([independentVariable])[dependentVariable].describe().sort_values(by='50%', ascending = False))
        return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
    else:
        return None

def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
    '''Removes variables from the candidate independent variables:
    - if two independent variables are correlated (> maxCorrelationThreshold), one is removed
    - if a (numeric) independent variable is not correlated with the dependent variable (p > maxCorrelationP)
    Returns the remaining non-correlated variables, correlated with the dependent variable

    correlations is the correlation matrix of the candidate variables (e.g. from correlation(data))
    correlationFunc is spearmanr or pearsonr from scipy.stats
    the p-value screening only applies to non object-dtype variables

    TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependent variable'''
    from numpy import dtype
    from copy import copy
    result = copy(independentVariables)
    # first pass: for each strongly cross-correlated pair still in result, drop the second one
    for v1 in independentVariables:
        if v1 in correlations.index:
            for v2 in independentVariables:
                if v2 != v1 and v2 in correlations.index:
                    if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold:
                        if v1 in result and v2 in result:
                            print('Removing {} (correlation {} with {})'.format(v2, correlations.loc[v1, v2], v1))
                            result.remove(v2)
    # second pass: drop numeric variables with no significant correlation with the dependent variable
    for var in copy(result): # iterate over a copy since result is modified in the loop
        if data.dtypes[var] != dtype('O'):
            cor, p = correlationFunc(data[dependentVariable], data[var])
            if p > maxCorrelationP:
                print('Removing {} (no correlation p={})'.format(var, p))
                result.remove(var)
    return result
+# TODO add tests with public dataset downloaded from Internet (IRIS et al) def modelString(experiment, dependentVariable, independentVariables): return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]]) @@ -321,6 +418,7 @@ experiments.loc[i,'r2adj'] = results.rsquared_adj experiments.loc[i,'condNum'] = results.condition_number experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1] + experiments.loc[i,'nobs'] = int(results.nobs) return experiments def generateExperiments(independentVariables): @@ -339,6 +437,7 @@ experiments['r2adj'] = 0. experiments['condNum'] = np.nan experiments['shapiroP'] = -1 + experiments['nobs'] = -1 return experiments def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): @@ -385,6 +484,7 @@ experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj experiments.loc[rowIdx, 'condNum'] = results.condition_number experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1] + experiments.loc[rowIdx, 'nobs'] = int(results.nobs) if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: currentR2Adj = experiments.loc[rowIdx, 'r2adj'] bestModel[currentVarNum] = True