diff python/utils.py @ 677:ae07c7b4cf87

update to utils for pavement results
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Sat, 30 May 2015 23:30:21 +0200
parents 58b9ac2f262f
children da1352b89d02
line wrap: on
line diff
--- a/python/utils.py	Wed May 27 04:08:19 2015 +0200
+++ b/python/utils.py	Sat May 30 23:30:21 2015 +0200
@@ -1,11 +1,13 @@
 #! /usr/bin/env python
+# -*- coding: utf-8 -*-
 ''' Generic utilities.'''
 
 import matplotlib.pyplot as plt
 from datetime import time, datetime
 from math import sqrt, ceil, floor
 from scipy.stats import kruskal, shapiro
-from numpy import zeros, array, exp, sum as npsum, arange, cumsum
+from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, median, isnan, ones, convolve,  dtype, isnan, NaN, mean, ma
+
 
 datetimeFormat = "%Y-%m-%d %H:%M:%S"
 
@@ -197,7 +199,6 @@
 
 def medianSmoothing(x, X, Y, halfwidth):
     '''Returns the media of Y's corresponding to X's in the interval [x-halfwidth, x+halfwidth]'''
-    from numpy import median
     return median([y for observedx, y in zip(X,Y) if abs(x-observedx)<halfwidth])
 
 def argmaxDict(d):
@@ -255,7 +256,6 @@
 def filterMovingWindow(inputSignal, halfWidth):
     '''Returns an array obtained after the smoothing of the input by a moving average
     The first and last points are copied from the original.'''
-    from numpy import ones,convolve,array
     width = float(halfWidth*2+1)
     win = ones(width,'d')
     result = convolve(win/width,array(inputSignal),'same')
@@ -282,13 +282,18 @@
 
 def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
     '''Computes (and displays) the correlation matrix for a pandas DataFrame'''
-    c=data.corr(correlationMethod)
+    columns = data.columns.tolist()
+    for var in data.columns:
+        uniqueValues = data[var].unique()
+        if len(uniqueValues) == 1 or data.dtypes[var] == dtype('O') or (len(uniqueValues) == 2 and len(data.loc[~isnan(data[var]), var].unique()) == 1): # last condition: only one other value than nan
+            columns.remove(var)
+    c=data[columns].corr(correlationMethod)
     if plotFigure:
-        fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0]))
+        fig = plt.figure(figsize=(4+0.4*c.shape[0], 0.4*c.shape[0]))
         fig.add_subplot(1,1,1)
         #plt.imshow(np.fabs(c), interpolation='none')
         plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
-        colnames = [displayNames.get(s.strip(), s.strip()) for s in c.columns.tolist()]
+        colnames = [displayNames.get(s.strip(), s.strip()) for s in columns]
         #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
         plt.xticks(range(len(colnames)), colnames, rotation=90)
         plt.yticks(range(len(colnames)), colnames)
@@ -297,6 +302,8 @@
         plt.colorbar()
         plt.title('Correlation ({})'.format(correlationMethod))
         plt.tight_layout()
+        if len(colnames) > 50:
+            plt.subplots_adjust(left=.06)
         if figureFilename is not None:
             plt.savefig(figureFilename, dpi = 150, transparent = True)
     return c
@@ -304,7 +311,6 @@
 def addDummies(data, variables, allVariables = True):
     '''Add binary dummy variables for each value of a nominal variable 
     in a pandas DataFrame'''
-    from numpy import NaN, dtype
     newVariables = []
     for var in variables:
         if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
@@ -318,7 +324,7 @@
                     newVariables.append(newVariable)
     return newVariables
 
-def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, translate = lambda s: s, kwCaption = u''):
+def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = u''):
     '''Studies the influence of (nominal) independent variable over the dependent variable
 
     Makes tests if the conditional distributions are normal
@@ -345,48 +351,80 @@
                 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
         table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False)
         table['count'] = table['count'].astype(int)
-        #table.index.rename(translate(table.index.name), inplace = True)
         testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
         if saveLatex:
-            out.write(translate('\\begin{minipage}{\\linewidth}\n'
-                                +'\\centering\n'
-                                +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n'
-                                +table.to_latex(float_format = lambda x: '{:.2f}'.format(x)).encode('ascii')+'\n'
-                                +'\\end{minipage}\n'
-                                +'\\vspace{0.5cm}\n'))
+            out.write('\\begin{minipage}{\\linewidth}\n'
+                      +'\\centering\n'
+                      +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n'
+                      +table.to_latex(float_format = lambda x: '{:.3f}'.format(x)).encode('ascii')+'\n'
+                      +'\\end{minipage}\n'
+                      +'\\ \\vspace{0.5cm}\n')
         else:
             print table
         return testResult
     else:
         return None
 
-def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
+def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc, stdoutText = ['Removing {} (constant: {})', 'Removing {} (correlation {} with {})', 'Removing {} (no correlation: {}, p={})'], saveFiles = False, filenamePrefix = None, latexHeader = '', latexTable = None, latexFooter=''):
     '''Removes variables from candidate independent variables if
     - if two independent variables are correlated (> maxCorrelationThreshold), one is removed
     - if an independent variable is not correlated with the dependent variable (p>maxCorrelationP)
     Returns the remaining non-correlated variables, correlated with the dependent variable
 
     correlationFunc is spearmanr or pearsonr from scipy.stats
+    text is the template to display for the two types of printout (see default): 3 elements if no saving to latex file, 8 otherwise
 
-    TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''
-    from numpy import dtype
+    TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''    
     from copy import copy
+    from pandas import DataFrame
     result = copy(independentVariables)
-    for v1 in independentVariables:
+    table1 = ''
+    table2 = {}
+    # constant variables
+    for var in independentVariables:
+        uniqueValues = data[var].unique()
+        if (len(uniqueValues) == 1) or (len(uniqueValues) == 2 and uniqueValues.dtype != dtype('O') and len(data.loc[~isnan(data[var]), var].unique()) == 1):
+            print(stdoutText[0].format(var, uniqueValues))
+            if saveFiles:
+                table1 += latexTable[0].format(var, *uniqueValues)
+            result.remove(var)
+    # correlated variables
+    for v1 in copy(result):
         if v1 in correlations.index:
-            for v2 in independentVariables:
+            for v2 in copy(result):
                 if v2 != v1 and v2 in correlations.index:
                     if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold:
                         if v1 in result and v2 in result:
-                            print('Removing {} (correlation {} with {})'.format(v2, correlations.loc[v1, v2], v1))
+                            if saveFiles:
+                                table1 += latexTable[1].format(v2, v1, correlations.loc[v1, v2])
+                            print(stdoutText[1].format(v2, v1, correlations.loc[v1, v2]))
                             result.remove(v2)
-    #regressionIndependentVariables = result
+    # not correlated with dependent variable
+    table2['Correlations'] = []
+    table2['Valeurs p'] = []
     for var in copy(result):
         if data.dtypes[var] != dtype('O'):
             cor, p = correlationFunc(data[dependentVariable], data[var])
             if p > maxCorrelationP:
-                print('Removing {} (no correlation p={})'.format(var, p))
+                if saveFiles:
+                    table1 += latexTable[2].format(var, cor, p)
+                print(stdoutText[2].format(var, cor, p))
                 result.remove(var)
+            else:
+                table2['Correlations'].append(cor)
+                table2['Valeurs p'].append(p)
+
+    if saveFiles:
+        from storage import openCheck
+        out = openCheck(filenamePrefix+'-removed-variables.tex', 'w')
+        out.write(latexHeader)
+        out.write(table1)
+        out.write(latexFooter)
+        out.close()
+        out = openCheck(filenamePrefix+'-correlations.html', 'w')
+        table2['Variables'] = [var for var in result if data.dtypes[var] != dtype('O')]
+        out.write(DataFrame(table2)[['Variables', 'Correlations', 'Valeurs p']].to_html(formatters = {'Correlations': lambda x: '{:.2f}'.format(x), 'Valeurs p': lambda x: '{:.3f}'.format(x)}, index = False))
+        out.close()
     return result
 
 
@@ -429,7 +467,6 @@
 
 def generateExperiments(independentVariables):
     '''Generates all possible models for including or not each independent variable'''
-    from numpy import nan
     from pandas import DataFrame
     experiments = {}
     nIndependentVariables = len(independentVariables)
@@ -443,7 +480,7 @@
         experiments[var] = pattern*(2**(nIndependentVariables-i-1))
     experiments = DataFrame(experiments)
     experiments['r2adj'] = 0.
-    experiments['condNum'] = nan
+    experiments['condNum'] = NaN
     experiments['shapiroP'] = -1
     experiments['nobs'] = -1
     return experiments
@@ -502,30 +539,52 @@
             bestModel[currentVarNum] = True
     return experiments
 
-def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf'):
+def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf', text = {'title-shapiro': 'Shapiro-Wilk normality test for residuals: {:.2f} (p={:.3f})', 'true-predicted.xlabel': 'Predicted values', 'true-predicted.ylabel': 'True values', 'residuals-predicted.xlabel': 'Predicted values', 'residuals-predicted.ylabel': 'Residuals'}):
     import statsmodels.api as sm
-    '''Displays some model results'''
+    '''Displays some model results
+
+    3 graphics, true-predicted, residuals-predicted, '''
     print(results.summary())
-    print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid)))
+    shapiroResult = shapiro(results.resid)
+    print(shapiroResult)
     if plotFigures:
+        fig = plt.figure(figsize=(7,6.3*(2+int(model is not None))))
         if model is not None:
-            plt.figure()
+            ax = fig.add_subplot(3,1,1)
             plt.plot(results.predict(), model.endog, 'x')
             x=plt.xlim()
             y=plt.ylim()
             plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r')
-            plt.title('true vs predicted')
-            if filenamePrefix is not None:
-                plt.savefig(filenamePrefix+'-true-predicted.'+figureFileType)
-        plt.figure()
-        plt.plot(results.predict(), results.resid, 'x')
+            #plt.axis('equal')
+            if text is not None:
+                plt.title(text['title-shapiro'].format(*shapiroResult))
+                #plt.title(text['true-predicted.title'])
+                plt.xlabel(text['true-predicted.xlabel'])
+                plt.ylabel(text['true-predicted.ylabel'])
+            fig.add_subplot(3,1,2, sharex = ax)
+            plt.plot(results.predict(), results.resid, 'x')
+            nextSubplotNum = 3
+        else:
+            fig.add_subplot(2,1,1)
+            plt.plot(results.predict(), results.resid, 'x')
+            nextSubplotNum = 2
+        if text is not None:
+            if model is None:
+                plt.title(text['title-shapiro'].format(*shapiroResult))
+            plt.xlabel(text['residuals-predicted.xlabel'])
+            plt.ylabel(text['residuals-predicted.ylabel'])
+        qqAx = fig.add_subplot(nextSubplotNum,1,nextSubplotNum)
+        sm.qqplot(results.resid, fit = True, line = '45', ax = qqAx)
+        plt.axis('equal')
+        if text is not None and 'qqplot.xlabel' in text:
+            plt.xlabel(text['qqplot.xlabel'])
+            plt.ylabel(text['qqplot.ylabel'])
+        plt.tight_layout()
         if filenamePrefix is not None:
-            plt.savefig(filenamePrefix+'-residuals.'+figureFileType)
-        plt.title('residuals vs predicted')
-        sm.qqplot(results.resid, fit = True, line = '45')
-        if filenamePrefix is not None:
-            plt.savefig(filenamePrefix+'-qq.'+figureFileType)
-
+            from storage import openCheck
+            out = openCheck(filenamePrefix+'-coefficients.html', 'w')
+            out.write(results.summary().as_html())
+            plt.savefig(filenamePrefix+'-model-results.'+figureFileType)
 
 #########################
 # iterable section
@@ -569,7 +628,6 @@
         self.subSequenceIndices = [(0,0)]
 
     def similarities(self, l1, l2, jshift=0):
-        from numpy import zeros, int as npint
         n1 = len(l1)
         n2 = len(l2)
         self.similarityTable = zeros((n1+1,n2+1), dtype = npint)
@@ -640,7 +698,6 @@
         return self._compute(l1, l2, computeSubSequence)
 
     def computeAlignment(self):
-        from numpy import mean
         return mean([j-i for i,j in self.subSequenceIndices])
 
     def _computeNormalized(self, l1, l2, computeSubSequence = False):
@@ -702,7 +759,6 @@
 colors = PlottingPropertyValues('brgmyck') # 'w'
 
 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1):
-    from numpy import array, arange, ones, ma
     from matplotlib.pyplot import pcolor
     coords = array(indicatorMap.keys())
     minX = min(coords[:,0])