comparison trafficintelligence/utils.py @ 1029:c6cf75a2ed08

reorganization of imports
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Mon, 18 Jun 2018 22:50:59 -0400
parents cc5cb04b04b0
children aafbc0bab925
comparison
equal deleted inserted replaced
1028:cc5cb04b04b0 1029:c6cf75a2ed08
1 #! /usr/bin/env python 1 #! /usr/bin/env python
2 # -*- coding: utf-8 -*-
3 ''' Generic utilities.''' 2 ''' Generic utilities.'''
4 3
5 import matplotlib.pyplot as plt
6 from datetime import time, datetime 4 from datetime import time, datetime
7 from argparse import ArgumentTypeError 5 from argparse import ArgumentTypeError
8 from pathlib import Path 6 from pathlib import Path
9 from math import sqrt, ceil, floor 7 from math import sqrt, ceil, floor
10 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm 8 from copy import deepcopy, copy
9
10 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm, norm, t
11 from scipy.spatial import distance 11 from scipy.spatial import distance
12 from scipy.sparse import dok_matrix 12 from scipy.sparse import dok_matrix
13 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, isnan, NaN, ma, isinf, savez, load as npload, log 13 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, isnan, NaN, ma, isinf, savez, load as npload, log, polyfit
14 14 from numpy.random import permutation as nppermutation
15 from pandas import DataFrame, concat
16 import matplotlib.pyplot as plt
17
18 from trafficintelligence.storage import openCheck
15 19
16 datetimeFormat = "%Y-%m-%d %H:%M:%S" 20 datetimeFormat = "%Y-%m-%d %H:%M:%S"
17 21
18 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S"#2017_0626_143720 22 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S"#2017_0626_143720
19 23
65 'returns the fitted location and scale of the lognormal (general definition)' 69 'returns the fitted location and scale of the lognormal (general definition)'
66 shape, loc, scale = lognorm.fit(x, floc=0.) 70 shape, loc, scale = lognorm.fit(x, floc=0.)
67 return log(scale), shape 71 return log(scale), shape
68 72
69 def sampleSize(stdev, tolerance, percentConfidence, nRoundingDigits = None, printLatex = False): 73 def sampleSize(stdev, tolerance, percentConfidence, nRoundingDigits = None, printLatex = False):
70 from scipy.stats.distributions import norm
71 if nRoundingDigits is None: 74 if nRoundingDigits is None:
72 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) # 1.-(100-percentConfidence)/200. 75 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) # 1.-(100-percentConfidence)/200.
73 else: 76 else:
74 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), nRoundingDigits) 77 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), nRoundingDigits)
75 stdev = round(stdev, nRoundingDigits) 78 stdev = round(stdev, nRoundingDigits)
82 '''if trueStd, use normal distribution, otherwise, Student 85 '''if trueStd, use normal distribution, otherwise, Student
83 86
84 Use otherwise t.interval or norm.interval for the boundaries 87 Use otherwise t.interval or norm.interval for the boundaries
85 ex: norm.interval(0.95) 88 ex: norm.interval(0.95)
86 t.interval(0.95, nSamples-1)''' 89 t.interval(0.95, nSamples-1)'''
87 from scipy.stats.distributions import norm, t
88 if trueStd: 90 if trueStd:
89 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) 91 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2)
90 else: # use Student 92 else: # use Student
91 k = round(t.ppf(0.5+percentConfidence/200., nSamples-1), 2) 93 k = round(t.ppf(0.5+percentConfidence/200., nSamples-1), 2)
92 e = k*stdev/sqrt(nSamples) 94 e = k*stdev/sqrt(nSamples)
342 return l1[0]*l2[1]-l1[1]*l2[0] 344 return l1[0]*l2[1]-l1[1]*l2[0]
343 345
344 def cat_mvgavg(cat_list, halfWidth): 346 def cat_mvgavg(cat_list, halfWidth):
345 ''' Return a list of categories/values smoothed according to a window. 347 ''' Return a list of categories/values smoothed according to a window.
346 halfWidth is the search radius on either side''' 348 halfWidth is the search radius on either side'''
347 from copy import deepcopy
348 smoothed = deepcopy(cat_list) 349 smoothed = deepcopy(cat_list)
349 for point in range(len(cat_list)): 350 for point in range(len(cat_list)):
350 lower_bound_check = max(0,point-halfWidth) 351 lower_bound_check = max(0,point-halfWidth)
351 upper_bound_check = min(len(cat_list)-1,point+halfWidth+1) 352 upper_bound_check = min(len(cat_list)-1,point+halfWidth+1)
352 window_values = cat_list[lower_bound_check:upper_bound_check] 353 window_values = cat_list[lower_bound_check:upper_bound_check]
364 return result 365 return result
365 366
366 def linearRegression(x, y, deg = 1, plotData = False): 367 def linearRegression(x, y, deg = 1, plotData = False):
367 '''returns the least square estimation of the linear regression of y = ax+b 368 '''returns the least square estimation of the linear regression of y = ax+b
368 as well as the plot''' 369 as well as the plot'''
369 from numpy.lib.polynomial import polyfit
370 from numpy.core.multiarray import arange
371 coef = polyfit(x, y, deg) 370 coef = polyfit(x, y, deg)
372 if plotData: 371 if plotData:
373 def poly(x): 372 def poly(x):
374 result = 0 373 result = 0
375 for i in range(len(coef)): 374 for i in range(len(coef)):
435 Implements uses the non-parametric Kruskal Wallis test''' 434 Implements uses the non-parametric Kruskal Wallis test'''
436 tmp = data[data[independentVariable].notnull()] 435 tmp = data[data[independentVariable].notnull()]
437 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) 436 independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
438 if len(independentVariableValues) >= 2: 437 if len(independentVariableValues) >= 2:
439 if saveLatex: 438 if saveLatex:
440 from storage import openCheck
441 out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w') 439 out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w')
442 for x in independentVariableValues: 440 for x in independentVariableValues:
443 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) 441 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable])))
444 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: 442 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3:
445 print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable])) 443 print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]))
474 472
475 correlationFunc is spearmanr or pearsonr from scipy.stats 473 correlationFunc is spearmanr or pearsonr from scipy.stats
476 text is the template to display for the two types of printout (see default): 3 elements if no saving to latex file, 8 otherwise 474 text is the template to display for the two types of printout (see default): 3 elements if no saving to latex file, 8 otherwise
477 475
478 TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable''' 476 TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''
479 from copy import copy
480 from pandas import DataFrame
481 result = copy(independentVariables) 477 result = copy(independentVariables)
482 table1 = '' 478 table1 = ''
483 table2 = {} 479 table2 = {}
484 # constant variables 480 # constant variables
485 for var in independentVariables: 481 for var in independentVariables:
514 else: 510 else:
515 table2['Correlations'].append(cor) 511 table2['Correlations'].append(cor)
516 table2['Valeurs p'].append(p) 512 table2['Valeurs p'].append(p)
517 513
518 if saveFiles: 514 if saveFiles:
519 from storage import openCheck
520 out = openCheck(filenamePrefix+'-removed-variables.tex', 'w') 515 out = openCheck(filenamePrefix+'-removed-variables.tex', 'w')
521 out.write(latexHeader) 516 out.write(latexHeader)
522 out.write(table1) 517 out.write(table1)
523 out.write(latexFooter) 518 out.write(latexFooter)
524 out.close() 519 out.close()
596 experiments.loc[i,'nobs'] = int(results.nobs) 591 experiments.loc[i,'nobs'] = int(results.nobs)
597 return experiments 592 return experiments
598 593
599 def generateExperiments(independentVariables): 594 def generateExperiments(independentVariables):
600 '''Generates all possible models for including or not each independent variable''' 595 '''Generates all possible models for including or not each independent variable'''
601 from pandas import DataFrame
602 experiments = {} 596 experiments = {}
603 nIndependentVariables = len(independentVariables) 597 nIndependentVariables = len(independentVariables)
604 if nIndependentVariables != len(set(independentVariables)): 598 if nIndependentVariables != len(set(independentVariables)):
605 print("Duplicate variables. Exiting") 599 print("Duplicate variables. Exiting")
606 import sys 600 import sys
618 612
619 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): 613 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
620 '''Generates all possible model with the independentVariables 614 '''Generates all possible model with the independentVariables
621 and runs them, saving the results in experiments 615 and runs them, saving the results in experiments
622 with multiprocess option''' 616 with multiprocess option'''
623 from pandas import concat
624 from multiprocessing import Pool
625 experiments = generateExperiments(independentVariables) 617 experiments = generateExperiments(independentVariables)
626 nModels = len(experiments) 618 nModels = len(experiments)
627 print("Running {} models with {} processes".format(nModels, nProcesses)) 619 print("Running {} models with {} processes".format(nModels, nProcesses))
628 print("IndependentVariables: {}".format(independentVariables)) 620 print("IndependentVariables: {}".format(independentVariables))
629 if nProcesses == 1: 621 if nProcesses == 1:
640 if they improve the model 632 if they improve the model
641 633
642 The results are added to experiments if provided as argument 634 The results are added to experiments if provided as argument
643 Storing in experiment relies on the index being the number equal 635 Storing in experiment relies on the index being the number equal
644 to the binary code derived from the independent variables''' 636 to the binary code derived from the independent variables'''
645 from numpy.random import permutation as nppermutation
646 if experiments is None: 637 if experiments is None:
647 experiments = generateExperiments(independentVariables) 638 experiments = generateExperiments(independentVariables)
648 nIndependentVariables = len(independentVariables) 639 nIndependentVariables = len(independentVariables)
649 permutation = nppermutation(list(range(nIndependentVariables))) 640 permutation = nppermutation(list(range(nIndependentVariables)))
650 variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)} 641 variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)}
710 if text is not None and 'qqplot.xlabel' in text: 701 if text is not None and 'qqplot.xlabel' in text:
711 plt.xlabel(text['qqplot.xlabel']) 702 plt.xlabel(text['qqplot.xlabel'])
712 plt.ylabel(text['qqplot.ylabel']) 703 plt.ylabel(text['qqplot.ylabel'])
713 plt.tight_layout() 704 plt.tight_layout()
714 if filenamePrefix is not None: 705 if filenamePrefix is not None:
715 from storage import openCheck
716 out = openCheck(filenamePrefix+'-coefficients.html', 'w') 706 out = openCheck(filenamePrefix+'-coefficients.html', 'w')
717 out.write(results.summary().as_html()) 707 out.write(results.summary().as_html())
718 plt.savefig(filenamePrefix+'-model-results.'+figureFileType) 708 plt.savefig(filenamePrefix+'-model-results.'+figureFileType)
719 709
720 ######################### 710 #########################
883 # plotting section 873 # plotting section
884 ######################### 874 #########################
885 875
886 def plotPolygon(poly, options = '', **kwargs): 876 def plotPolygon(poly, options = '', **kwargs):
887 'Plots shapely polygon poly' 877 'Plots shapely polygon poly'
888 from matplotlib.pyplot import plot
889 x,y = poly.exterior.xy 878 x,y = poly.exterior.xy
890 plot(x, y, options, **kwargs) 879 plt.plot(x, y, options, **kwargs)
891 880
892 def stepPlot(X, firstX, lastX, initialCount = 0, increment = 1): 881 def stepPlot(X, firstX, lastX, initialCount = 0, increment = 1):
893 '''for each value in X, increment by increment the initial count 882 '''for each value in X, increment by increment the initial count
894 returns the lists that can be plotted 883 returns the lists that can be plotted
895 to obtain a step plot increasing by one for each value in x, from first to last value 884 to obtain a step plot increasing by one for each value in x, from first to last value
925 else: 914 else:
926 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.'])) 915 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.']))
927 plt.rc('axes', prop_cycle=monochrome) 916 plt.rc('axes', prop_cycle=monochrome)
928 917
929 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1): 918 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1):
930 from matplotlib.pyplot import pcolor
931 coords = array(list(indicatorMap.keys())) 919 coords = array(list(indicatorMap.keys()))
932 minX = min(coords[:,0]) 920 minX = min(coords[:,0])
933 minY = min(coords[:,1]) 921 minY = min(coords[:,1])
934 X = arange(minX, max(coords[:,0])+1.1)*squareSize 922 X = arange(minX, max(coords[:,0])+1.1)*squareSize
935 Y = arange(minY, max(coords[:,1])+1.1)*squareSize 923 Y = arange(minY, max(coords[:,1])+1.1)*squareSize
936 C = defaultValue*ones((len(Y), len(X))) 924 C = defaultValue*ones((len(Y), len(X)))
937 for k,v in indicatorMap.items(): 925 for k,v in indicatorMap.items():
938 C[k[1]-minY,k[0]-minX] = v 926 C[k[1]-minY,k[0]-minX] = v
939 if masked: 927 if masked:
940 pcolor(X, Y, ma.masked_where(C==defaultValue,C)) 928 plt.pcolor(X, Y, ma.masked_where(C==defaultValue,C))
941 else: 929 else:
942 pcolor(X, Y, C) 930 plt.pcolor(X, Y, C)
943 931
944 ######################### 932 #########################
945 # Data download 933 # Data download
946 ######################### 934 #########################
947 935