comparison python/utils.py @ 677:ae07c7b4cf87

update to utils for pavement results
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Sat, 30 May 2015 23:30:21 +0200
parents 58b9ac2f262f
children da1352b89d02
comparison
equal deleted inserted replaced
676:58b9ac2f262f 677:ae07c7b4cf87
1 #! /usr/bin/env python 1 #! /usr/bin/env python
2 # -*- coding: utf-8 -*-
2 ''' Generic utilities.''' 3 ''' Generic utilities.'''
3 4
4 import matplotlib.pyplot as plt 5 import matplotlib.pyplot as plt
5 from datetime import time, datetime 6 from datetime import time, datetime
6 from math import sqrt, ceil, floor 7 from math import sqrt, ceil, floor
7 from scipy.stats import kruskal, shapiro 8 from scipy.stats import kruskal, shapiro
8 from numpy import zeros, array, exp, sum as npsum, arange, cumsum 9 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, median, isnan, ones, convolve, dtype, isnan, NaN, mean, ma
10
9 11
10 datetimeFormat = "%Y-%m-%d %H:%M:%S" 12 datetimeFormat = "%Y-%m-%d %H:%M:%S"
11 13
12 ######################### 14 #########################
13 # Enumerations 15 # Enumerations
195 else: 197 else:
196 return 0. 198 return 0.
197 199
198 def medianSmoothing(x, X, Y, halfwidth): 200 def medianSmoothing(x, X, Y, halfwidth):
199 '''Returns the media of Y's corresponding to X's in the interval [x-halfwidth, x+halfwidth]''' 201 '''Returns the media of Y's corresponding to X's in the interval [x-halfwidth, x+halfwidth]'''
200 from numpy import median
201 return median([y for observedx, y in zip(X,Y) if abs(x-observedx)<halfwidth]) 202 return median([y for observedx, y in zip(X,Y) if abs(x-observedx)<halfwidth])
202 203
203 def argmaxDict(d): 204 def argmaxDict(d):
204 return max(d, key=d.get) 205 return max(d, key=d.get)
205 206
253 return smoothed 254 return smoothed
254 255
255 def filterMovingWindow(inputSignal, halfWidth): 256 def filterMovingWindow(inputSignal, halfWidth):
256 '''Returns an array obtained after the smoothing of the input by a moving average 257 '''Returns an array obtained after the smoothing of the input by a moving average
257 The first and last points are copied from the original.''' 258 The first and last points are copied from the original.'''
258 from numpy import ones,convolve,array
259 width = float(halfWidth*2+1) 259 width = float(halfWidth*2+1)
260 win = ones(width,'d') 260 win = ones(width,'d')
261 result = convolve(win/width,array(inputSignal),'same') 261 result = convolve(win/width,array(inputSignal),'same')
262 result[:halfWidth] = inputSignal[:halfWidth] 262 result[:halfWidth] = inputSignal[:halfWidth]
263 result[-halfWidth:] = inputSignal[-halfWidth:] 263 result[-halfWidth:] = inputSignal[-halfWidth:]
280 plt.plot(xx, [poly(z) for z in xx]) 280 plt.plot(xx, [poly(z) for z in xx])
281 return coef 281 return coef
282 282
283 def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None): 283 def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
284 '''Computes (and displays) the correlation matrix for a pandas DataFrame''' 284 '''Computes (and displays) the correlation matrix for a pandas DataFrame'''
285 c=data.corr(correlationMethod) 285 columns = data.columns.tolist()
286 for var in data.columns:
287 uniqueValues = data[var].unique()
288 if len(uniqueValues) == 1 or data.dtypes[var] == dtype('O') or (len(uniqueValues) == 2 and len(data.loc[~isnan(data[var]), var].unique()) == 1): # last condition: only one other value than nan
289 columns.remove(var)
290 c=data[columns].corr(correlationMethod)
286 if plotFigure: 291 if plotFigure:
287 fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0])) 292 fig = plt.figure(figsize=(4+0.4*c.shape[0], 0.4*c.shape[0]))
288 fig.add_subplot(1,1,1) 293 fig.add_subplot(1,1,1)
289 #plt.imshow(np.fabs(c), interpolation='none') 294 #plt.imshow(np.fabs(c), interpolation='none')
290 plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm 295 plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
291 colnames = [displayNames.get(s.strip(), s.strip()) for s in c.columns.tolist()] 296 colnames = [displayNames.get(s.strip(), s.strip()) for s in columns]
292 #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename) 297 #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
293 plt.xticks(range(len(colnames)), colnames, rotation=90) 298 plt.xticks(range(len(colnames)), colnames, rotation=90)
294 plt.yticks(range(len(colnames)), colnames) 299 plt.yticks(range(len(colnames)), colnames)
295 plt.tick_params('both', length=0) 300 plt.tick_params('both', length=0)
296 plt.subplots_adjust(bottom = 0.29) 301 plt.subplots_adjust(bottom = 0.29)
297 plt.colorbar() 302 plt.colorbar()
298 plt.title('Correlation ({})'.format(correlationMethod)) 303 plt.title('Correlation ({})'.format(correlationMethod))
299 plt.tight_layout() 304 plt.tight_layout()
305 if len(colnames) > 50:
306 plt.subplots_adjust(left=.06)
300 if figureFilename is not None: 307 if figureFilename is not None:
301 plt.savefig(figureFilename, dpi = 150, transparent = True) 308 plt.savefig(figureFilename, dpi = 150, transparent = True)
302 return c 309 return c
303 310
304 def addDummies(data, variables, allVariables = True): 311 def addDummies(data, variables, allVariables = True):
305 '''Add binary dummy variables for each value of a nominal variable 312 '''Add binary dummy variables for each value of a nominal variable
306 in a pandas DataFrame''' 313 in a pandas DataFrame'''
307 from numpy import NaN, dtype
308 newVariables = [] 314 newVariables = []
309 for var in variables: 315 for var in variables:
310 if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2: 316 if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
311 values = data[var].unique() 317 values = data[var].unique()
312 if not allVariables: 318 if not allVariables:
316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') 322 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
317 data[newVariable] = (data[var] == val) 323 data[newVariable] = (data[var] == val)
318 newVariables.append(newVariable) 324 newVariables.append(newVariable)
319 return newVariables 325 return newVariables
320 326
321 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, translate = lambda s: s, kwCaption = u''): 327 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = u''):
322 '''Studies the influence of (nominal) independent variable over the dependent variable 328 '''Studies the influence of (nominal) independent variable over the dependent variable
323 329
324 Makes tests if the conditional distributions are normal 330 Makes tests if the conditional distributions are normal
325 using the Shapiro-Wilk test (in which case ANOVA could be used) 331 using the Shapiro-Wilk test (in which case ANOVA could be used)
326 Implements uses the non-parametric Kruskal Wallis test''' 332 Implements uses the non-parametric Kruskal Wallis test'''
343 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) 349 plt.title('{} vs {}'.format(dependentVariable, independentVariable))
344 if filenamePrefix is not None: 350 if filenamePrefix is not None:
345 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) 351 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
346 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) 352 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False)
347 table['count'] = table['count'].astype(int) 353 table['count'] = table['count'].astype(int)
348 #table.index.rename(translate(table.index.name), inplace = True)
349 testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) 354 testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
350 if saveLatex: 355 if saveLatex:
351 out.write(translate('\\begin{minipage}{\\linewidth}\n' 356 out.write('\\begin{minipage}{\\linewidth}\n'
352 +'\\centering\n' 357 +'\\centering\n'
353 +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n' 358 +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n'
354 +table.to_latex(float_format = lambda x: '{:.2f}'.format(x)).encode('ascii')+'\n' 359 +table.to_latex(float_format = lambda x: '{:.3f}'.format(x)).encode('ascii')+'\n'
355 +'\\end{minipage}\n' 360 +'\\end{minipage}\n'
356 +'\\vspace{0.5cm}\n')) 361 +'\\ \\vspace{0.5cm}\n')
357 else: 362 else:
358 print table 363 print table
359 return testResult 364 return testResult
360 else: 365 else:
361 return None 366 return None
362 367
363 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): 368 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc, stdoutText = ['Removing {} (constant: {})', 'Removing {} (correlation {} with {})', 'Removing {} (no correlation: {}, p={})'], saveFiles = False, filenamePrefix = None, latexHeader = '', latexTable = None, latexFooter=''):
364 '''Removes variables from candidate independent variables if 369 '''Removes variables from candidate independent variables if
365 - if two independent variables are correlated (> maxCorrelationThreshold), one is removed 370 - if two independent variables are correlated (> maxCorrelationThreshold), one is removed
366 - if an independent variable is not correlated with the dependent variable (p>maxCorrelationP) 371 - if an independent variable is not correlated with the dependent variable (p>maxCorrelationP)
367 Returns the remaining non-correlated variables, correlated with the dependent variable 372 Returns the remaining non-correlated variables, correlated with the dependent variable
368 373
369 correlationFunc is spearmanr or pearsonr from scipy.stats 374 correlationFunc is spearmanr or pearsonr from scipy.stats
370 375 text is the template to display for the two types of printout (see default): 3 elements if no saving to latex file, 8 otherwise
371 TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable''' 376
372 from numpy import dtype 377 TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''
373 from copy import copy 378 from copy import copy
379 from pandas import DataFrame
374 result = copy(independentVariables) 380 result = copy(independentVariables)
375 for v1 in independentVariables: 381 table1 = ''
382 table2 = {}
383 # constant variables
384 for var in independentVariables:
385 uniqueValues = data[var].unique()
386 if (len(uniqueValues) == 1) or (len(uniqueValues) == 2 and uniqueValues.dtype != dtype('O') and len(data.loc[~isnan(data[var]), var].unique()) == 1):
387 print(stdoutText[0].format(var, uniqueValues))
388 if saveFiles:
389 table1 += latexTable[0].format(var, *uniqueValues)
390 result.remove(var)
391 # correlated variables
392 for v1 in copy(result):
376 if v1 in correlations.index: 393 if v1 in correlations.index:
377 for v2 in independentVariables: 394 for v2 in copy(result):
378 if v2 != v1 and v2 in correlations.index: 395 if v2 != v1 and v2 in correlations.index:
379 if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold: 396 if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold:
380 if v1 in result and v2 in result: 397 if v1 in result and v2 in result:
381 print('Removing {} (correlation {} with {})'.format(v2, correlations.loc[v1, v2], v1)) 398 if saveFiles:
399 table1 += latexTable[1].format(v2, v1, correlations.loc[v1, v2])
400 print(stdoutText[1].format(v2, v1, correlations.loc[v1, v2]))
382 result.remove(v2) 401 result.remove(v2)
383 #regressionIndependentVariables = result 402 # not correlated with dependent variable
403 table2['Correlations'] = []
404 table2['Valeurs p'] = []
384 for var in copy(result): 405 for var in copy(result):
385 if data.dtypes[var] != dtype('O'): 406 if data.dtypes[var] != dtype('O'):
386 cor, p = correlationFunc(data[dependentVariable], data[var]) 407 cor, p = correlationFunc(data[dependentVariable], data[var])
387 if p > maxCorrelationP: 408 if p > maxCorrelationP:
388 print('Removing {} (no correlation p={})'.format(var, p)) 409 if saveFiles:
410 table1 += latexTable[2].format(var, cor, p)
411 print(stdoutText[2].format(var, cor, p))
389 result.remove(var) 412 result.remove(var)
413 else:
414 table2['Correlations'].append(cor)
415 table2['Valeurs p'].append(p)
416
417 if saveFiles:
418 from storage import openCheck
419 out = openCheck(filenamePrefix+'-removed-variables.tex', 'w')
420 out.write(latexHeader)
421 out.write(table1)
422 out.write(latexFooter)
423 out.close()
424 out = openCheck(filenamePrefix+'-correlations.html', 'w')
425 table2['Variables'] = [var for var in result if data.dtypes[var] != dtype('O')]
426 out.write(DataFrame(table2)[['Variables', 'Correlations', 'Valeurs p']].to_html(formatters = {'Correlations': lambda x: '{:.2f}'.format(x), 'Valeurs p': lambda x: '{:.3f}'.format(x)}, index = False))
427 out.close()
390 return result 428 return result
391 429
392 430
393 ######################### 431 #########################
394 # regression analysis using statsmodels (and pandas) 432 # regression analysis using statsmodels (and pandas)
427 experiments.loc[i,'nobs'] = int(results.nobs) 465 experiments.loc[i,'nobs'] = int(results.nobs)
428 return experiments 466 return experiments
429 467
430 def generateExperiments(independentVariables): 468 def generateExperiments(independentVariables):
431 '''Generates all possible models for including or not each independent variable''' 469 '''Generates all possible models for including or not each independent variable'''
432 from numpy import nan
433 from pandas import DataFrame 470 from pandas import DataFrame
434 experiments = {} 471 experiments = {}
435 nIndependentVariables = len(independentVariables) 472 nIndependentVariables = len(independentVariables)
436 if nIndependentVariables != len(set(independentVariables)): 473 if nIndependentVariables != len(set(independentVariables)):
437 print("Duplicate variables. Exiting") 474 print("Duplicate variables. Exiting")
441 for i,var in enumerate(independentVariables): 478 for i,var in enumerate(independentVariables):
442 pattern = [False]*(2**i)+[True]*(2**i) 479 pattern = [False]*(2**i)+[True]*(2**i)
443 experiments[var] = pattern*(2**(nIndependentVariables-i-1)) 480 experiments[var] = pattern*(2**(nIndependentVariables-i-1))
444 experiments = DataFrame(experiments) 481 experiments = DataFrame(experiments)
445 experiments['r2adj'] = 0. 482 experiments['r2adj'] = 0.
446 experiments['condNum'] = nan 483 experiments['condNum'] = NaN
447 experiments['shapiroP'] = -1 484 experiments['shapiroP'] = -1
448 experiments['nobs'] = -1 485 experiments['nobs'] = -1
449 return experiments 486 return experiments
450 487
451 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): 488 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
500 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: 537 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
501 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] 538 currentR2Adj = experiments.loc[rowIdx, 'r2adj']
502 bestModel[currentVarNum] = True 539 bestModel[currentVarNum] = True
503 return experiments 540 return experiments
504 541
505 def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf'): 542 def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf', text = {'title-shapiro': 'Shapiro-Wilk normality test for residuals: {:.2f} (p={:.3f})', 'true-predicted.xlabel': 'Predicted values', 'true-predicted.ylabel': 'True values', 'residuals-predicted.xlabel': 'Predicted values', 'residuals-predicted.ylabel': 'Residuals'}):
506 import statsmodels.api as sm 543 import statsmodels.api as sm
507 '''Displays some model results''' 544 '''Displays some model results
545
546 3 graphics, true-predicted, residuals-predicted, '''
508 print(results.summary()) 547 print(results.summary())
509 print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid))) 548 shapiroResult = shapiro(results.resid)
549 print(shapiroResult)
510 if plotFigures: 550 if plotFigures:
551 fig = plt.figure(figsize=(7,6.3*(2+int(model is not None))))
511 if model is not None: 552 if model is not None:
512 plt.figure() 553 ax = fig.add_subplot(3,1,1)
513 plt.plot(results.predict(), model.endog, 'x') 554 plt.plot(results.predict(), model.endog, 'x')
514 x=plt.xlim() 555 x=plt.xlim()
515 y=plt.ylim() 556 y=plt.ylim()
516 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r') 557 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r')
517 plt.title('true vs predicted') 558 #plt.axis('equal')
518 if filenamePrefix is not None: 559 if text is not None:
519 plt.savefig(filenamePrefix+'-true-predicted.'+figureFileType) 560 plt.title(text['title-shapiro'].format(*shapiroResult))
520 plt.figure() 561 #plt.title(text['true-predicted.title'])
521 plt.plot(results.predict(), results.resid, 'x') 562 plt.xlabel(text['true-predicted.xlabel'])
563 plt.ylabel(text['true-predicted.ylabel'])
564 fig.add_subplot(3,1,2, sharex = ax)
565 plt.plot(results.predict(), results.resid, 'x')
566 nextSubplotNum = 3
567 else:
568 fig.add_subplot(2,1,1)
569 plt.plot(results.predict(), results.resid, 'x')
570 nextSubplotNum = 2
571 if text is not None:
572 if model is None:
573 plt.title(text['title-shapiro'].format(*shapiroResult))
574 plt.xlabel(text['residuals-predicted.xlabel'])
575 plt.ylabel(text['residuals-predicted.ylabel'])
576 qqAx = fig.add_subplot(nextSubplotNum,1,nextSubplotNum)
577 sm.qqplot(results.resid, fit = True, line = '45', ax = qqAx)
578 plt.axis('equal')
579 if text is not None and 'qqplot.xlabel' in text:
580 plt.xlabel(text['qqplot.xlabel'])
581 plt.ylabel(text['qqplot.ylabel'])
582 plt.tight_layout()
522 if filenamePrefix is not None: 583 if filenamePrefix is not None:
523 plt.savefig(filenamePrefix+'-residuals.'+figureFileType) 584 from storage import openCheck
524 plt.title('residuals vs predicted') 585 out = openCheck(filenamePrefix+'-coefficients.html', 'w')
525 sm.qqplot(results.resid, fit = True, line = '45') 586 out.write(results.summary().as_html())
526 if filenamePrefix is not None: 587 plt.savefig(filenamePrefix+'-model-results.'+figureFileType)
527 plt.savefig(filenamePrefix+'-qq.'+figureFileType)
528
529 588
530 ######################### 589 #########################
531 # iterable section 590 # iterable section
532 ######################### 591 #########################
533 592
567 self.delta = delta 626 self.delta = delta
568 self.lengthFunc = lengthFunc 627 self.lengthFunc = lengthFunc
569 self.subSequenceIndices = [(0,0)] 628 self.subSequenceIndices = [(0,0)]
570 629
571 def similarities(self, l1, l2, jshift=0): 630 def similarities(self, l1, l2, jshift=0):
572 from numpy import zeros, int as npint
573 n1 = len(l1) 631 n1 = len(l1)
574 n2 = len(l2) 632 n2 = len(l2)
575 self.similarityTable = zeros((n1+1,n2+1), dtype = npint) 633 self.similarityTable = zeros((n1+1,n2+1), dtype = npint)
576 for i in xrange(1,n1+1): 634 for i in xrange(1,n1+1):
577 for j in xrange(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1): 635 for j in xrange(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1):
638 def compute(self, l1, l2, computeSubSequence = False): 696 def compute(self, l1, l2, computeSubSequence = False):
639 '''get methods are to be shadowed in child classes ''' 697 '''get methods are to be shadowed in child classes '''
640 return self._compute(l1, l2, computeSubSequence) 698 return self._compute(l1, l2, computeSubSequence)
641 699
642 def computeAlignment(self): 700 def computeAlignment(self):
643 from numpy import mean
644 return mean([j-i for i,j in self.subSequenceIndices]) 701 return mean([j-i for i,j in self.subSequenceIndices])
645 702
646 def _computeNormalized(self, l1, l2, computeSubSequence = False): 703 def _computeNormalized(self, l1, l2, computeSubSequence = False):
647 ''' compute the normalized LCSS 704 ''' compute the normalized LCSS
648 ie, the LCSS divided by the min or mean of the indicator lengths (using lengthFunc) 705 ie, the LCSS divided by the min or mean of the indicator lengths (using lengthFunc)
700 linestyles = PlottingPropertyValues(['-', '--', '-.', ':']) 757 linestyles = PlottingPropertyValues(['-', '--', '-.', ':'])
701 758
702 colors = PlottingPropertyValues('brgmyck') # 'w' 759 colors = PlottingPropertyValues('brgmyck') # 'w'
703 760
704 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1): 761 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1):
705 from numpy import array, arange, ones, ma
706 from matplotlib.pyplot import pcolor 762 from matplotlib.pyplot import pcolor
707 coords = array(indicatorMap.keys()) 763 coords = array(indicatorMap.keys())
708 minX = min(coords[:,0]) 764 minX = min(coords[:,0])
709 minY = min(coords[:,1]) 765 minY = min(coords[:,1])
710 X = arange(minX, max(coords[:,0])+1.1)*squareSize 766 X = arange(minX, max(coords[:,0])+1.1)*squareSize