changeset 668:f8dcf483b296

code to prepare regression variables (remove correlated variables) and record dataset size in experiments
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Tue, 26 May 2015 11:19:03 +0200
parents 179b81faa1f8
children df6be882f325
files python/utils.py
diffstat 1 files changed, 104 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/python/utils.py	Mon May 25 13:54:29 2015 +0200
+++ b/python/utils.py	Tue May 26 11:19:03 2015 +0200
@@ -1,9 +1,10 @@
 #! /usr/bin/env python
 ''' Generic utilities.'''
 
+import matplotlib.pyplot as plt
 from datetime import time, datetime
 from math import sqrt
-
+from scipy.stats import kruskal, shapiro
 
 datetimeFormat = "%Y-%m-%d %H:%M:%S"
 
@@ -273,7 +274,6 @@
     '''returns the least squares estimation of the linear regression of y = ax+b
     as well as the plot'''
     from numpy.lib.polynomial import polyfit
-    from matplotlib.pyplot import plot
     from numpy.core.multiarray import arange
     coef = polyfit(x, y, deg)
     if plotData:
@@ -282,17 +282,114 @@
             for i in range(len(coef)):
                 result += coef[i]*x**(len(coef)-i-1)
             return result
-        plot(x, y, 'x')
+        plt.plot(x, y, 'x')
         xx = arange(min(x), max(x),(max(x)-min(x))/1000)
-        plot(xx, [poly(z) for z in xx])
+        plt.plot(xx, [poly(z) for z in xx])
     return coef
 
+def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
+    '''Computes (and displays) the correlation matrix for a pandas DataFrame'''
+    c=data.corr(correlationMethod)
+    if plotFigure:
+        fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0]))
+        fig.add_subplot(1,1,1)
+        #plt.imshow(np.fabs(c), interpolation='none')
+        plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
+        colnames = [displayNames.get(s.strip(), s.strip()) if displayNames is not None else s.strip() for s in c.columns.tolist()]
+        #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
+        plt.xticks(range(len(colnames)), colnames, rotation=90)
+        plt.yticks(range(len(colnames)), colnames)
+        plt.tick_params('both', length=0)
+        plt.subplots_adjust(bottom = 0.29)
+        plt.colorbar()
+        plt.title('Correlation ({})'.format(correlationMethod))
+        plt.tight_layout()
+        if figureFilename is not None:
+            plt.savefig(figureFilename, dpi = 150, transparent = True)
+    return c
+
+def addDummies(data, variables, allVariables = True):
+    '''Adds binary dummy variables for each value of a nominal variable
+    in a pandas DataFrame'''
+    from numpy import dtype
+    newVariables = []
+    for var in variables:
+        if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
+            values = data[var].unique()
+            if not allVariables:
+                values = values[:-1]
+            for val in values:
+                if val == val: # NaN != NaN: skip missing values
+                    newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
+                    data[newVariable] = (data[var] == val)
+                    newVariables.append(newVariable)
+    return newVariables
+
+def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'):
+    '''Studies the influence of a (nominal) independent variable on the dependent variable
+
+    Tests whether the conditional distributions are normal
+    using the Shapiro-Wilk test (in which case ANOVA could be used)
+    and runs the non-parametric Kruskal-Wallis test'''
+    tmp = data[data[independentVariable].notnull()]
+    independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
+    if len(independentVariableValues) >= 2:
+        for x in independentVariableValues:
+            print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable])))
+            if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3:
+                print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]))
+        if plotFigure:
+            plt.figure()
+            plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
+            #q25, q75 = tmp[dependentVariable].quantile([.25, .75])
+            #plt.ylim(ymax = q75+1.5*(q75-q25))
+            plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues)
+            plt.title('{} vs {}'.format(dependentVariable, independentVariable))
+            if figureFilenamePrefix is not None:
+                plt.savefig(figureFilenamePrefix+'{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
+        # TODO format the table (html?)
+        print(tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False))
+        return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
+    else:
+        return None
+
+def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
+    '''Removes variables from the candidate independent variables:
+    - if two independent variables are correlated (> maxCorrelationThreshold), one is removed
+    - if an independent variable is not correlated with the dependent variable (p > maxCorrelationP), it is removed
+    Returns the remaining variables, uncorrelated with each other and correlated with the dependent variable
+
+    correlationFunc is spearmanr or pearsonr from scipy.stats
+
+    TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependent variable'''
+    from numpy import dtype
+    from copy import copy
+    result = copy(independentVariables)
+    for v1 in independentVariables:
+        if v1 in correlations.index:
+            for v2 in independentVariables:
+                if v2 != v1 and v2 in correlations.index:
+                    if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold:
+                        if v1 in result and v2 in result:
+                            print('Removing {} (correlation {} with {})'.format(v2, correlations.loc[v1, v2], v1))
+                            result.remove(v2)
+    #regressionIndependentVariables = result
+    for var in copy(result):
+        if data.dtypes[var] != dtype('O'):
+            cor, p = correlationFunc(data[dependentVariable], data[var])
+            if p > maxCorrelationP:
+                print('Removing {} (no correlation p={})'.format(var, p))
+                result.remove(var)
+    return result
+
 
 #########################
 # regression analysis using statsmodels (and pandas)
 #########################
 
 # TODO make class for experiments?
+# TODO add tests with a public dataset downloaded from the Internet (Iris, etc.)
 def modelString(experiment, dependentVariable, independentVariables):
     return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]])
 
@@ -321,6 +418,7 @@
             experiments.loc[i,'r2adj'] = results.rsquared_adj
             experiments.loc[i,'condNum'] = results.condition_number
             experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1]
+            experiments.loc[i,'nobs'] = int(results.nobs)
     return experiments
 
 def generateExperiments(independentVariables):
@@ -339,6 +437,7 @@
     experiments['r2adj'] = 0.
     experiments['condNum'] = np.nan
     experiments['shapiroP'] = -1
+    experiments['nobs'] = -1
     return experiments
 
 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
@@ -385,6 +484,7 @@
             experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj
             experiments.loc[rowIdx, 'condNum'] = results.condition_number
             experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1]
+            experiments.loc[rowIdx, 'nobs'] = int(results.nobs)
         if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
             currentR2Adj = experiments.loc[rowIdx, 'r2adj']
             bestModel[currentVarNum] = True