Mercurial Hosting > traffic-intelligence
changeset 677:ae07c7b4cf87
update to utils for pavement results
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Sat, 30 May 2015 23:30:21 +0200 |
parents | 58b9ac2f262f |
children | 97c305108460 576d9ea4b41a |
files | python/utils.py |
diffstat | 1 files changed, 99 insertions(+), 43 deletions(-) [+] |
line wrap: on
line diff
--- a/python/utils.py Wed May 27 04:08:19 2015 +0200 +++ b/python/utils.py Sat May 30 23:30:21 2015 +0200 @@ -1,11 +1,13 @@ #! /usr/bin/env python +# -*- coding: utf-8 -*- ''' Generic utilities.''' import matplotlib.pyplot as plt from datetime import time, datetime from math import sqrt, ceil, floor from scipy.stats import kruskal, shapiro -from numpy import zeros, array, exp, sum as npsum, arange, cumsum +from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, median, isnan, ones, convolve, dtype, isnan, NaN, mean, ma + datetimeFormat = "%Y-%m-%d %H:%M:%S" @@ -197,7 +199,6 @@ def medianSmoothing(x, X, Y, halfwidth): '''Returns the media of Y's corresponding to X's in the interval [x-halfwidth, x+halfwidth]''' - from numpy import median return median([y for observedx, y in zip(X,Y) if abs(x-observedx)<halfwidth]) def argmaxDict(d): @@ -255,7 +256,6 @@ def filterMovingWindow(inputSignal, halfWidth): '''Returns an array obtained after the smoothing of the input by a moving average The first and last points are copied from the original.''' - from numpy import ones,convolve,array width = float(halfWidth*2+1) win = ones(width,'d') result = convolve(win/width,array(inputSignal),'same') @@ -282,13 +282,18 @@ def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None): '''Computes (and displays) the correlation matrix for a pandas DataFrame''' - c=data.corr(correlationMethod) + columns = data.columns.tolist() + for var in data.columns: + uniqueValues = data[var].unique() + if len(uniqueValues) == 1 or data.dtypes[var] == dtype('O') or (len(uniqueValues) == 2 and len(data.loc[~isnan(data[var]), var].unique()) == 1): # last condition: only one other value than nan + columns.remove(var) + c=data[columns].corr(correlationMethod) if plotFigure: - fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0])) + fig = plt.figure(figsize=(4+0.4*c.shape[0], 0.4*c.shape[0])) fig.add_subplot(1,1,1) #plt.imshow(np.fabs(c), interpolation='none') plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm - colnames = [displayNames.get(s.strip(), s.strip()) for s in c.columns.tolist()] + colnames = [displayNames.get(s.strip(), s.strip()) for s in columns] #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename) plt.xticks(range(len(colnames)), colnames, rotation=90) plt.yticks(range(len(colnames)), colnames) @@ -297,6 +302,8 @@ plt.colorbar() plt.title('Correlation ({})'.format(correlationMethod)) plt.tight_layout() + if len(colnames) > 50: + plt.subplots_adjust(left=.06) if figureFilename is not None: plt.savefig(figureFilename, dpi = 150, transparent = True) return c @@ -304,7 +311,6 @@ def addDummies(data, variables, allVariables = True): '''Add binary dummy variables for each value of a nominal variable in a pandas DataFrame''' - from numpy import NaN, dtype newVariables = [] for var in variables: if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2: @@ -318,7 +324,7 @@ newVariables.append(newVariable) return newVariables -def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, translate = lambda s: s, kwCaption = u''): +def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = u''): '''Studies the influence of (nominal) independent variable over the dependent variable Makes tests if the conditional distributions are normal @@ -345,48 +351,80 @@ plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) table['count'] = table['count'].astype(int) - #table.index.rename(translate(table.index.name), inplace = True) testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) if saveLatex: - out.write(translate('\\begin{minipage}{\\linewidth}\n' - +'\\centering\n' - +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n' - +table.to_latex(float_format = lambda x: '{:.2f}'.format(x)).encode('ascii')+'\n' - +'\\end{minipage}\n' - +'\\vspace{0.5cm}\n')) + out.write('\\begin{minipage}{\\linewidth}\n' + +'\\centering\n' + +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n' + +table.to_latex(float_format = lambda x: '{:.3f}'.format(x)).encode('ascii')+'\n' + +'\\end{minipage}\n' + +'\\ \\vspace{0.5cm}\n') else: print table return testResult else: return None -def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): +def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc, stdoutText = ['Removing {} (constant: {})', 'Removing {} (correlation {} with {})', 'Removing {} (no correlation: {}, p={})'], saveFiles = False, filenamePrefix = None, latexHeader = '', latexTable = None, latexFooter=''): '''Removes variables from candidate independent variables if - if two independent variables are correlated (> maxCorrelationThreshold), one is removed - if an independent variable is not correlated with the dependent variable (p>maxCorrelationP) Returns the remaining non-correlated variables, correlated with the dependent variable correlationFunc is spearmanr or pearsonr from scipy.stats + text is the template to display for the two types of printout (see default): 3 elements if no saving to latex file, 8 otherwise - TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable''' - from numpy import dtype + TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable''' from copy import copy + from pandas import DataFrame result = copy(independentVariables) - for v1 in independentVariables: + table1 = '' + table2 = {} + # constant variables + for var in independentVariables: + uniqueValues = data[var].unique() + if (len(uniqueValues) == 1) or (len(uniqueValues) == 2 and uniqueValues.dtype != dtype('O') and len(data.loc[~isnan(data[var]), var].unique()) == 1): + print(stdoutText[0].format(var, uniqueValues)) + if saveFiles: + table1 += latexTable[0].format(var, *uniqueValues) + result.remove(var) + # correlated variables + for v1 in copy(result): if v1 in correlations.index: - for v2 in independentVariables: + for v2 in copy(result): if v2 != v1 and v2 in correlations.index: if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold: if v1 in result and v2 in result: - print('Removing {} (correlation {} with {})'.format(v2, correlations.loc[v1, v2], v1)) + if saveFiles: + table1 += latexTable[1].format(v2, v1, correlations.loc[v1, v2]) + print(stdoutText[1].format(v2, v1, correlations.loc[v1, v2])) result.remove(v2) - #regressionIndependentVariables = result + # not correlated with dependent variable + table2['Correlations'] = [] + table2['Valeurs p'] = [] for var in copy(result): if data.dtypes[var] != dtype('O'): cor, p = correlationFunc(data[dependentVariable], data[var]) if p > maxCorrelationP: - print('Removing {} (no correlation p={})'.format(var, p)) + if saveFiles: + table1 += latexTable[2].format(var, cor, p) + print(stdoutText[2].format(var, cor, p)) result.remove(var) + else: + table2['Correlations'].append(cor) + table2['Valeurs p'].append(p) + + if saveFiles: + from storage import openCheck + out = openCheck(filenamePrefix+'-removed-variables.tex', 'w') + out.write(latexHeader) + out.write(table1) + out.write(latexFooter) + out.close() + out = openCheck(filenamePrefix+'-correlations.html', 'w') + table2['Variables'] = [var for var in result if data.dtypes[var] != dtype('O')] + out.write(DataFrame(table2)[['Variables', 'Correlations', 'Valeurs p']].to_html(formatters = {'Correlations': lambda x: '{:.2f}'.format(x), 'Valeurs p': lambda x: '{:.3f}'.format(x)}, index = False)) + out.close() return result @@ -429,7 +467,6 @@ def generateExperiments(independentVariables): '''Generates all possible models for including or not each independent variable''' - from numpy import nan from pandas import DataFrame experiments = {} nIndependentVariables = len(independentVariables) @@ -443,7 +480,7 @@ experiments[var] = pattern*(2**(nIndependentVariables-i-1)) experiments = DataFrame(experiments) experiments['r2adj'] = 0. - experiments['condNum'] = nan + experiments['condNum'] = NaN experiments['shapiroP'] = -1 experiments['nobs'] = -1 return experiments @@ -502,30 +539,52 @@ bestModel[currentVarNum] = True return experiments -def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf'): +def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf', text = {'title-shapiro': 'Shapiro-Wilk normality test for residuals: {:.2f} (p={:.3f})', 'true-predicted.xlabel': 'Predicted values', 'true-predicted.ylabel': 'True values', 'residuals-predicted.xlabel': 'Predicted values', 'residuals-predicted.ylabel': 'Residuals'}): import statsmodels.api as sm - '''Displays some model results''' + '''Displays some model results + + 3 graphics, true-predicted, residuals-predicted, ''' print(results.summary()) - print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid))) + shapiroResult = shapiro(results.resid) + print(shapiroResult) if plotFigures: + fig = plt.figure(figsize=(7,6.3*(2+int(model is not None)))) if model is not None: - plt.figure() + ax = fig.add_subplot(3,1,1) plt.plot(results.predict(), model.endog, 'x') x=plt.xlim() y=plt.ylim() plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r') - plt.title('true vs predicted') - if filenamePrefix is not None: - plt.savefig(filenamePrefix+'-true-predicted.'+figureFileType) - plt.figure() - plt.plot(results.predict(), results.resid, 'x') + #plt.axis('equal') + if text is not None: + plt.title(text['title-shapiro'].format(*shapiroResult)) + #plt.title(text['true-predicted.title']) + plt.xlabel(text['true-predicted.xlabel']) + plt.ylabel(text['true-predicted.ylabel']) + fig.add_subplot(3,1,2, sharex = ax) + plt.plot(results.predict(), results.resid, 'x') + nextSubplotNum = 3 + else: + fig.add_subplot(2,1,1) + plt.plot(results.predict(), results.resid, 'x') + nextSubplotNum = 2 + if text is not None: + if model is None: + plt.title(text['title-shapiro'].format(*shapiroResult)) + plt.xlabel(text['residuals-predicted.xlabel']) + plt.ylabel(text['residuals-predicted.ylabel']) + qqAx = fig.add_subplot(nextSubplotNum,1,nextSubplotNum) + sm.qqplot(results.resid, fit = True, line = '45', ax = qqAx) + plt.axis('equal') + if text is not None and 'qqplot.xlabel' in text: + plt.xlabel(text['qqplot.xlabel']) + plt.ylabel(text['qqplot.ylabel']) + plt.tight_layout() if filenamePrefix is not None: - plt.savefig(filenamePrefix+'-residuals.'+figureFileType) - plt.title('residuals vs predicted') - sm.qqplot(results.resid, fit = True, line = '45') - if filenamePrefix is not None: - plt.savefig(filenamePrefix+'-qq.'+figureFileType) - + from storage import openCheck + out = openCheck(filenamePrefix+'-coefficients.html', 'w') + out.write(results.summary().as_html()) + plt.savefig(filenamePrefix+'-model-results.'+figureFileType) ######################### # iterable section @@ -569,7 +628,6 @@ self.subSequenceIndices = [(0,0)] def similarities(self, l1, l2, jshift=0): - from numpy import zeros, int as npint n1 = len(l1) n2 = len(l2) self.similarityTable = zeros((n1+1,n2+1), dtype = npint) @@ -640,7 +698,6 @@ return self._compute(l1, l2, computeSubSequence) def computeAlignment(self): - from numpy import mean return mean([j-i for i,j in self.subSequenceIndices]) def _computeNormalized(self, l1, l2, computeSubSequence = False): @@ -702,7 +759,6 @@ colors = PlottingPropertyValues('brgmyck') # 'w' def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1): - from numpy import array, arange, ones, ma from matplotlib.pyplot import pcolor coords = array(indicatorMap.keys()) minX = min(coords[:,0])