repo/traffic-intelligence: python/utils.py comparison

comparison python/utils.py @ 677:ae07c7b4cf87

update to utils for pavement results

author	Nicolas Saunier <nicolas.saunier@polymtl.ca>
date	Sat, 30 May 2015 23:30:21 +0200
parents	58b9ac2f262f
children	da1352b89d02

comparison

equal deleted inserted replaced

-:58b9ac2f262f
+:ae07c7b4cf87
 #! /usr/bin/env python
+# -*- coding: utf-8 -*-
 ''' Generic utilities.'''
 import matplotlib.pyplot as plt
 from datetime import time, datetime
 from math import sqrt, ceil, floor
 from scipy.stats import kruskal, shapiro
-from numpy import zeros, array, exp, sum as npsum, arange, cumsum
+from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, median, isnan, ones, convolve,  dtype, isnan, NaN, mean, ma
 datetimeFormat = "%Y-%m-%d %H:%M:%S"
 #########################
 # Enumerations
 else:
 return 0.
 def medianSmoothing(x, X, Y, halfwidth):
 '''Returns the media of Y's corresponding to X's in the interval [x-halfwidth, x+halfwidth]'''
-from numpy import median
 return median([y for observedx, y in zip(X,Y) if abs(x-observedx)<halfwidth])
 def argmaxDict(d):
 return max(d, key=d.get)
 return smoothed
 def filterMovingWindow(inputSignal, halfWidth):
 '''Returns an array obtained after the smoothing of the input by a moving average
 The first and last points are copied from the original.'''
-from numpy import ones,convolve,array
 width = float(halfWidth*2+1)
 win = ones(width,'d')
 result = convolve(win/width,array(inputSignal),'same')
 result[:halfWidth] = inputSignal[:halfWidth]
 result[-halfWidth:] = inputSignal[-halfWidth:]
 plt.plot(xx, [poly(z) for z in xx])
 return coef
 def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
 '''Computes (and displays) the correlation matrix for a pandas DataFrame'''
-c=data.corr(correlationMethod)
+columns = data.columns.tolist()
+for var in data.columns:
+uniqueValues = data[var].unique()
+if len(uniqueValues) == 1 or data.dtypes[var] == dtype('O') or (len(uniqueValues) == 2 and len(data.loc[~isnan(data[var]), var].unique()) == 1): # last condition: only one other value than nan
+columns.remove(var)
+c=data[columns].corr(correlationMethod)
 if plotFigure:
-fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0]))
+fig = plt.figure(figsize=(4+0.4*c.shape[0], 0.4*c.shape[0]))
 fig.add_subplot(1,1,1)
 #plt.imshow(np.fabs(c), interpolation='none')
 plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
-colnames = [displayNames.get(s.strip(), s.strip()) for s in c.columns.tolist()]
+colnames = [displayNames.get(s.strip(), s.strip()) for s in columns]
 #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
 plt.xticks(range(len(colnames)), colnames, rotation=90)
 plt.yticks(range(len(colnames)), colnames)
 plt.tick_params('both', length=0)
 plt.subplots_adjust(bottom = 0.29)
 plt.colorbar()
 plt.title('Correlation ({})'.format(correlationMethod))
 plt.tight_layout()
+if len(colnames) > 50:
+plt.subplots_adjust(left=.06)
 if figureFilename is not None:
 plt.savefig(figureFilename, dpi = 150, transparent = True)
 return c
 def addDummies(data, variables, allVariables = True):
 '''Add binary dummy variables for each value of a nominal variable
 in a pandas DataFrame'''
-from numpy import NaN, dtype
 newVariables = []
 for var in variables:
 if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
 values = data[var].unique()
 if not allVariables:
 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
 data[newVariable] = (data[var] == val)
 newVariables.append(newVariable)
 return newVariables
-def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, translate = lambda s: s, kwCaption = u''):
+def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = u''):
 '''Studies the influence of (nominal) independent variable over the dependent variable
 Makes tests if the conditional distributions are normal
 using the Shapiro-Wilk test (in which case ANOVA could be used)
 Implements uses the non-parametric Kruskal Wallis test'''
 plt.title('{} vs {}'.format(dependentVariable, independentVariable))
 if filenamePrefix is not None:
 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False)
 table['count'] = table['count'].astype(int)
-#table.index.rename(translate(table.index.name), inplace = True)
 testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
 if saveLatex:
-out.write(translate('\\begin{minipage}{\\linewidth}\n'
+out.write('\\begin{minipage}{\\linewidth}\n'
 +'\\centering\n'
 +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n'
-+table.to_latex(float_format = lambda x: '{:.2f}'.format(x)).encode('ascii')+'\n'
++table.to_latex(float_format = lambda x: '{:.3f}'.format(x)).encode('ascii')+'\n'
 +'\\end{minipage}\n'
-+'\\vspace{0.5cm}\n'))
++'\\ \\vspace{0.5cm}\n')
 else:
 print table
 return testResult
 else:
 return None
-def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
+def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc, stdoutText = ['Removing {} (constant: {})', 'Removing {} (correlation {} with {})', 'Removing {} (no correlation: {}, p={})'], saveFiles = False, filenamePrefix = None, latexHeader = '', latexTable = None, latexFooter=''):
 '''Removes variables from candidate independent variables if
 - if two independent variables are correlated (> maxCorrelationThreshold), one is removed
 - if an independent variable is not correlated with the dependent variable (p>maxCorrelationP)
 Returns the remaining non-correlated variables, correlated with the dependent variable
 correlationFunc is spearmanr or pearsonr from scipy.stats
+text is the template to display for the two types of printout (see default): 3 elements if no saving to latex file, 8 otherwise
-TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''
-from numpy import dtype
+TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''
 from copy import copy
+from pandas import DataFrame
 result = copy(independentVariables)
-for v1 in independentVariables:
+table1 = ''
+table2 = {}
+# constant variables
+for var in independentVariables:
+uniqueValues = data[var].unique()
+if (len(uniqueValues) == 1) or (len(uniqueValues) == 2 and uniqueValues.dtype != dtype('O') and len(data.loc[~isnan(data[var]), var].unique()) == 1):
+print(stdoutText[0].format(var, uniqueValues))
+if saveFiles:
+table1 += latexTable[0].format(var, *uniqueValues)
+result.remove(var)
+# correlated variables
+for v1 in copy(result):
 if v1 in correlations.index:
-for v2 in independentVariables:
+for v2 in copy(result):
 if v2 != v1 and v2 in correlations.index:
 if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold:
 if v1 in result and v2 in result:
-print('Removing {} (correlation {} with {})'.format(v2, correlations.loc[v1, v2], v1))
+if saveFiles:
+table1 += latexTable[1].format(v2, v1, correlations.loc[v1, v2])
+print(stdoutText[1].format(v2, v1, correlations.loc[v1, v2]))
 result.remove(v2)
-#regressionIndependentVariables = result
+# not correlated with dependent variable
+table2['Correlations'] = []
+table2['Valeurs p'] = []
 for var in copy(result):
 if data.dtypes[var] != dtype('O'):
 cor, p = correlationFunc(data[dependentVariable], data[var])
 if p > maxCorrelationP:
-print('Removing {} (no correlation p={})'.format(var, p))
+if saveFiles:
+table1 += latexTable[2].format(var, cor, p)
+print(stdoutText[2].format(var, cor, p))
 result.remove(var)
+else:
+table2['Correlations'].append(cor)
+table2['Valeurs p'].append(p)
+if saveFiles:
+from storage import openCheck
+out = openCheck(filenamePrefix+'-removed-variables.tex', 'w')
+out.write(latexHeader)
+out.write(table1)
+out.write(latexFooter)
+out.close()
+out = openCheck(filenamePrefix+'-correlations.html', 'w')
+table2['Variables'] = [var for var in result if data.dtypes[var] != dtype('O')]
+out.write(DataFrame(table2)[['Variables', 'Correlations', 'Valeurs p']].to_html(formatters = {'Correlations': lambda x: '{:.2f}'.format(x), 'Valeurs p': lambda x: '{:.3f}'.format(x)}, index = False))
+out.close()
 return result
 #########################
 # regression analysis using statsmodels (and pandas)
 experiments.loc[i,'nobs'] = int(results.nobs)
 return experiments
 def generateExperiments(independentVariables):
 '''Generates all possible models for including or not each independent variable'''
-from numpy import nan
 from pandas import DataFrame
 experiments = {}
 nIndependentVariables = len(independentVariables)
 if nIndependentVariables != len(set(independentVariables)):
 print("Duplicate variables. Exiting")
 for i,var in enumerate(independentVariables):
 pattern = [False]*(2**i)+[True]*(2**i)
 experiments[var] = pattern*(2**(nIndependentVariables-i-1))
 experiments = DataFrame(experiments)
 experiments['r2adj'] = 0.
-experiments['condNum'] = nan
+experiments['condNum'] = NaN
 experiments['shapiroP'] = -1
 experiments['nobs'] = -1
 return experiments
 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
 currentR2Adj = experiments.loc[rowIdx, 'r2adj']
 bestModel[currentVarNum] = True
 return experiments
-def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf'):
+def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf', text = {'title-shapiro': 'Shapiro-Wilk normality test for residuals: {:.2f} (p={:.3f})', 'true-predicted.xlabel': 'Predicted values', 'true-predicted.ylabel': 'True values', 'residuals-predicted.xlabel': 'Predicted values', 'residuals-predicted.ylabel': 'Residuals'}):
 import statsmodels.api as sm
-'''Displays some model results'''
+'''Displays some model results
+3 graphics, true-predicted, residuals-predicted, '''
 print(results.summary())
-print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid)))
+shapiroResult = shapiro(results.resid)
+print(shapiroResult)
 if plotFigures:
+fig = plt.figure(figsize=(7,6.3*(2+int(model is not None))))
 if model is not None:
-plt.figure()
+ax = fig.add_subplot(3,1,1)
 plt.plot(results.predict(), model.endog, 'x')
 x=plt.xlim()
 y=plt.ylim()
 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r')
-plt.title('true vs predicted')
+#plt.axis('equal')
-if filenamePrefix is not None:
+if text is not None:
-plt.savefig(filenamePrefix+'-true-predicted.'+figureFileType)
+plt.title(text['title-shapiro'].format(*shapiroResult))
-plt.figure()
+#plt.title(text['true-predicted.title'])
-plt.plot(results.predict(), results.resid, 'x')
+plt.xlabel(text['true-predicted.xlabel'])
+plt.ylabel(text['true-predicted.ylabel'])
+fig.add_subplot(3,1,2, sharex = ax)
+plt.plot(results.predict(), results.resid, 'x')
+nextSubplotNum = 3
+else:
+fig.add_subplot(2,1,1)
+plt.plot(results.predict(), results.resid, 'x')
+nextSubplotNum = 2
+if text is not None:
+if model is None:
+plt.title(text['title-shapiro'].format(*shapiroResult))
+plt.xlabel(text['residuals-predicted.xlabel'])
+plt.ylabel(text['residuals-predicted.ylabel'])
+qqAx = fig.add_subplot(nextSubplotNum,1,nextSubplotNum)
+sm.qqplot(results.resid, fit = True, line = '45', ax = qqAx)
+plt.axis('equal')
+if text is not None and 'qqplot.xlabel' in text:
+plt.xlabel(text['qqplot.xlabel'])
+plt.ylabel(text['qqplot.ylabel'])
+plt.tight_layout()
 if filenamePrefix is not None:
-plt.savefig(filenamePrefix+'-residuals.'+figureFileType)
+from storage import openCheck
-plt.title('residuals vs predicted')
+out = openCheck(filenamePrefix+'-coefficients.html', 'w')
-sm.qqplot(results.resid, fit = True, line = '45')
+out.write(results.summary().as_html())
-if filenamePrefix is not None:
+plt.savefig(filenamePrefix+'-model-results.'+figureFileType)
-plt.savefig(filenamePrefix+'-qq.'+figureFileType)
 #########################
 # iterable section
 #########################
 self.delta = delta
 self.lengthFunc = lengthFunc
 self.subSequenceIndices = [(0,0)]
 def similarities(self, l1, l2, jshift=0):
-from numpy import zeros, int as npint
 n1 = len(l1)
 n2 = len(l2)
 self.similarityTable = zeros((n1+1,n2+1), dtype = npint)
 for i in xrange(1,n1+1):
 for j in xrange(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1):
 def compute(self, l1, l2, computeSubSequence = False):
 '''get methods are to be shadowed in child classes '''
 return self._compute(l1, l2, computeSubSequence)
 def computeAlignment(self):
-from numpy import mean
 return mean([j-i for i,j in self.subSequenceIndices])
 def _computeNormalized(self, l1, l2, computeSubSequence = False):
 ''' compute the normalized LCSS
 ie, the LCSS divided by the min or mean of the indicator lengths (using lengthFunc)
 linestyles = PlottingPropertyValues(['-', '--', '-.', ':'])
 colors = PlottingPropertyValues('brgmyck') # 'w'
 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1):
-from numpy import array, arange, ones, ma
 from matplotlib.pyplot import pcolor
 coords = array(indicatorMap.keys())
 minX = min(coords[:,0])
 minY = min(coords[:,1])
 X = arange(minX, max(coords[:,0])+1.1)*squareSize

Mercurial Hosting > traffic-intelligence

comparison python/utils.py @ 677:ae07c7b4cf87