Mercurial Hosting > traffic-intelligence
comparison trafficintelligence/utils.py @ 1029:c6cf75a2ed08
reorganization of imports
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Mon, 18 Jun 2018 22:50:59 -0400 |
parents | cc5cb04b04b0 |
children | aafbc0bab925 |
comparison
equal
deleted
inserted
replaced
1028:cc5cb04b04b0 | 1029:c6cf75a2ed08 |
---|---|
1 #! /usr/bin/env python | 1 #! /usr/bin/env python |
2 # -*- coding: utf-8 -*- | |
3 ''' Generic utilities.''' | 2 ''' Generic utilities.''' |
4 | 3 |
5 import matplotlib.pyplot as plt | |
6 from datetime import time, datetime | 4 from datetime import time, datetime |
7 from argparse import ArgumentTypeError | 5 from argparse import ArgumentTypeError |
8 from pathlib import Path | 6 from pathlib import Path |
9 from math import sqrt, ceil, floor | 7 from math import sqrt, ceil, floor |
10 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm | 8 from copy import deepcopy, copy |
9 | |
10 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm, norm, t | |
11 from scipy.spatial import distance | 11 from scipy.spatial import distance |
12 from scipy.sparse import dok_matrix | 12 from scipy.sparse import dok_matrix |
13 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, isnan, NaN, ma, isinf, savez, load as npload, log | 13 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, isnan, NaN, ma, isinf, savez, load as npload, log, polyfit |
14 | 14 from numpy.random import permutation as nppermutation |
15 from pandas import DataFrame, concat | |
16 import matplotlib.pyplot as plt | |
17 | |
18 from trafficintelligence.storage import openCheck | |
15 | 19 |
16 datetimeFormat = "%Y-%m-%d %H:%M:%S" | 20 datetimeFormat = "%Y-%m-%d %H:%M:%S" |
17 | 21 |
18 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S"#2017_0626_143720 | 22 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S"#2017_0626_143720 |
19 | 23 |
65 'returns the fitted location and scale of the lognormal (general definition)' | 69 'returns the fitted location and scale of the lognormal (general definition)' |
66 shape, loc, scale = lognorm.fit(x, floc=0.) | 70 shape, loc, scale = lognorm.fit(x, floc=0.) |
67 return log(scale), shape | 71 return log(scale), shape |
68 | 72 |
69 def sampleSize(stdev, tolerance, percentConfidence, nRoundingDigits = None, printLatex = False): | 73 def sampleSize(stdev, tolerance, percentConfidence, nRoundingDigits = None, printLatex = False): |
70 from scipy.stats.distributions import norm | |
71 if nRoundingDigits is None: | 74 if nRoundingDigits is None: |
72 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) # 1.-(100-percentConfidence)/200. | 75 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) # 1.-(100-percentConfidence)/200. |
73 else: | 76 else: |
74 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), nRoundingDigits) | 77 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), nRoundingDigits) |
75 stdev = round(stdev, nRoundingDigits) | 78 stdev = round(stdev, nRoundingDigits) |
82 '''if trueStd, use normal distribution, otherwise, Student | 85 '''if trueStd, use normal distribution, otherwise, Student |
83 | 86 |
84 Use otherwise t.interval or norm.interval for the boundaries | 87 Use otherwise t.interval or norm.interval for the boundaries |
85 ex: norm.interval(0.95) | 88 ex: norm.interval(0.95) |
86 t.interval(0.95, nSamples-1)''' | 89 t.interval(0.95, nSamples-1)''' |
87 from scipy.stats.distributions import norm, t | |
88 if trueStd: | 90 if trueStd: |
89 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) | 91 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) |
90 else: # use Student | 92 else: # use Student |
91 k = round(t.ppf(0.5+percentConfidence/200., nSamples-1), 2) | 93 k = round(t.ppf(0.5+percentConfidence/200., nSamples-1), 2) |
92 e = k*stdev/sqrt(nSamples) | 94 e = k*stdev/sqrt(nSamples) |
342 return l1[0]*l2[1]-l1[1]*l2[0] | 344 return l1[0]*l2[1]-l1[1]*l2[0] |
343 | 345 |
344 def cat_mvgavg(cat_list, halfWidth): | 346 def cat_mvgavg(cat_list, halfWidth): |
345 ''' Return a list of categories/values smoothed according to a window. | 347 ''' Return a list of categories/values smoothed according to a window. |
346 halfWidth is the search radius on either side''' | 348 halfWidth is the search radius on either side''' |
347 from copy import deepcopy | |
348 smoothed = deepcopy(cat_list) | 349 smoothed = deepcopy(cat_list) |
349 for point in range(len(cat_list)): | 350 for point in range(len(cat_list)): |
350 lower_bound_check = max(0,point-halfWidth) | 351 lower_bound_check = max(0,point-halfWidth) |
351 upper_bound_check = min(len(cat_list)-1,point+halfWidth+1) | 352 upper_bound_check = min(len(cat_list)-1,point+halfWidth+1) |
352 window_values = cat_list[lower_bound_check:upper_bound_check] | 353 window_values = cat_list[lower_bound_check:upper_bound_check] |
364 return result | 365 return result |
365 | 366 |
366 def linearRegression(x, y, deg = 1, plotData = False): | 367 def linearRegression(x, y, deg = 1, plotData = False): |
367 '''returns the least square estimation of the linear regression of y = ax+b | 368 '''returns the least square estimation of the linear regression of y = ax+b |
368 as well as the plot''' | 369 as well as the plot''' |
369 from numpy.lib.polynomial import polyfit | |
370 from numpy.core.multiarray import arange | |
371 coef = polyfit(x, y, deg) | 370 coef = polyfit(x, y, deg) |
372 if plotData: | 371 if plotData: |
373 def poly(x): | 372 def poly(x): |
374 result = 0 | 373 result = 0 |
375 for i in range(len(coef)): | 374 for i in range(len(coef)): |
435 Implements uses the non-parametric Kruskal Wallis test''' | 434 Implements uses the non-parametric Kruskal Wallis test''' |
436 tmp = data[data[independentVariable].notnull()] | 435 tmp = data[data[independentVariable].notnull()] |
437 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) | 436 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) |
438 if len(independentVariableValues) >= 2: | 437 if len(independentVariableValues) >= 2: |
439 if saveLatex: | 438 if saveLatex: |
440 from storage import openCheck | |
441 out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w') | 439 out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w') |
442 for x in independentVariableValues: | 440 for x in independentVariableValues: |
443 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) | 441 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) |
444 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: | 442 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: |
445 print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable])) | 443 print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable])) |
474 | 472 |
475 correlationFunc is spearmanr or pearsonr from scipy.stats | 473 correlationFunc is spearmanr or pearsonr from scipy.stats |
476 text is the template to display for the two types of printout (see default): 3 elements if no saving to latex file, 8 otherwise | 474 text is the template to display for the two types of printout (see default): 3 elements if no saving to latex file, 8 otherwise |
477 | 475 |
478 TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable''' | 476 TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable''' |
479 from copy import copy | |
480 from pandas import DataFrame | |
481 result = copy(independentVariables) | 477 result = copy(independentVariables) |
482 table1 = '' | 478 table1 = '' |
483 table2 = {} | 479 table2 = {} |
484 # constant variables | 480 # constant variables |
485 for var in independentVariables: | 481 for var in independentVariables: |
514 else: | 510 else: |
515 table2['Correlations'].append(cor) | 511 table2['Correlations'].append(cor) |
516 table2['Valeurs p'].append(p) | 512 table2['Valeurs p'].append(p) |
517 | 513 |
518 if saveFiles: | 514 if saveFiles: |
519 from storage import openCheck | |
520 out = openCheck(filenamePrefix+'-removed-variables.tex', 'w') | 515 out = openCheck(filenamePrefix+'-removed-variables.tex', 'w') |
521 out.write(latexHeader) | 516 out.write(latexHeader) |
522 out.write(table1) | 517 out.write(table1) |
523 out.write(latexFooter) | 518 out.write(latexFooter) |
524 out.close() | 519 out.close() |
596 experiments.loc[i,'nobs'] = int(results.nobs) | 591 experiments.loc[i,'nobs'] = int(results.nobs) |
597 return experiments | 592 return experiments |
598 | 593 |
599 def generateExperiments(independentVariables): | 594 def generateExperiments(independentVariables): |
600 '''Generates all possible models for including or not each independent variable''' | 595 '''Generates all possible models for including or not each independent variable''' |
601 from pandas import DataFrame | |
602 experiments = {} | 596 experiments = {} |
603 nIndependentVariables = len(independentVariables) | 597 nIndependentVariables = len(independentVariables) |
604 if nIndependentVariables != len(set(independentVariables)): | 598 if nIndependentVariables != len(set(independentVariables)): |
605 print("Duplicate variables. Exiting") | 599 print("Duplicate variables. Exiting") |
606 import sys | 600 import sys |
618 | 612 |
619 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): | 613 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): |
620 '''Generates all possible model with the independentVariables | 614 '''Generates all possible model with the independentVariables |
621 and runs them, saving the results in experiments | 615 and runs them, saving the results in experiments |
622 with multiprocess option''' | 616 with multiprocess option''' |
623 from pandas import concat | |
624 from multiprocessing import Pool | |
625 experiments = generateExperiments(independentVariables) | 617 experiments = generateExperiments(independentVariables) |
626 nModels = len(experiments) | 618 nModels = len(experiments) |
627 print("Running {} models with {} processes".format(nModels, nProcesses)) | 619 print("Running {} models with {} processes".format(nModels, nProcesses)) |
628 print("IndependentVariables: {}".format(independentVariables)) | 620 print("IndependentVariables: {}".format(independentVariables)) |
629 if nProcesses == 1: | 621 if nProcesses == 1: |
640 if they improve the model | 632 if they improve the model |
641 | 633 |
642 The results are added to experiments if provided as argument | 634 The results are added to experiments if provided as argument |
643 Storing in experiment relies on the index being the number equal | 635 Storing in experiment relies on the index being the number equal |
644 to the binary code derived from the independent variables''' | 636 to the binary code derived from the independent variables''' |
645 from numpy.random import permutation as nppermutation | |
646 if experiments is None: | 637 if experiments is None: |
647 experiments = generateExperiments(independentVariables) | 638 experiments = generateExperiments(independentVariables) |
648 nIndependentVariables = len(independentVariables) | 639 nIndependentVariables = len(independentVariables) |
649 permutation = nppermutation(list(range(nIndependentVariables))) | 640 permutation = nppermutation(list(range(nIndependentVariables))) |
650 variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)} | 641 variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)} |
710 if text is not None and 'qqplot.xlabel' in text: | 701 if text is not None and 'qqplot.xlabel' in text: |
711 plt.xlabel(text['qqplot.xlabel']) | 702 plt.xlabel(text['qqplot.xlabel']) |
712 plt.ylabel(text['qqplot.ylabel']) | 703 plt.ylabel(text['qqplot.ylabel']) |
713 plt.tight_layout() | 704 plt.tight_layout() |
714 if filenamePrefix is not None: | 705 if filenamePrefix is not None: |
715 from storage import openCheck | |
716 out = openCheck(filenamePrefix+'-coefficients.html', 'w') | 706 out = openCheck(filenamePrefix+'-coefficients.html', 'w') |
717 out.write(results.summary().as_html()) | 707 out.write(results.summary().as_html()) |
718 plt.savefig(filenamePrefix+'-model-results.'+figureFileType) | 708 plt.savefig(filenamePrefix+'-model-results.'+figureFileType) |
719 | 709 |
720 ######################### | 710 ######################### |
883 # plotting section | 873 # plotting section |
884 ######################### | 874 ######################### |
885 | 875 |
886 def plotPolygon(poly, options = '', **kwargs): | 876 def plotPolygon(poly, options = '', **kwargs): |
887 'Plots shapely polygon poly' | 877 'Plots shapely polygon poly' |
888 from matplotlib.pyplot import plot | |
889 x,y = poly.exterior.xy | 878 x,y = poly.exterior.xy |
890 plot(x, y, options, **kwargs) | 879 plt.plot(x, y, options, **kwargs) |
891 | 880 |
892 def stepPlot(X, firstX, lastX, initialCount = 0, increment = 1): | 881 def stepPlot(X, firstX, lastX, initialCount = 0, increment = 1): |
893 '''for each value in X, increment by increment the initial count | 882 '''for each value in X, increment by increment the initial count |
894 returns the lists that can be plotted | 883 returns the lists that can be plotted |
895 to obtain a step plot increasing by one for each value in x, from first to last value | 884 to obtain a step plot increasing by one for each value in x, from first to last value |
925 else: | 914 else: |
926 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.'])) | 915 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.'])) |
927 plt.rc('axes', prop_cycle=monochrome) | 916 plt.rc('axes', prop_cycle=monochrome) |
928 | 917 |
929 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1): | 918 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1): |
930 from matplotlib.pyplot import pcolor | |
931 coords = array(list(indicatorMap.keys())) | 919 coords = array(list(indicatorMap.keys())) |
932 minX = min(coords[:,0]) | 920 minX = min(coords[:,0]) |
933 minY = min(coords[:,1]) | 921 minY = min(coords[:,1]) |
934 X = arange(minX, max(coords[:,0])+1.1)*squareSize | 922 X = arange(minX, max(coords[:,0])+1.1)*squareSize |
935 Y = arange(minY, max(coords[:,1])+1.1)*squareSize | 923 Y = arange(minY, max(coords[:,1])+1.1)*squareSize |
936 C = defaultValue*ones((len(Y), len(X))) | 924 C = defaultValue*ones((len(Y), len(X))) |
937 for k,v in indicatorMap.items(): | 925 for k,v in indicatorMap.items(): |
938 C[k[1]-minY,k[0]-minX] = v | 926 C[k[1]-minY,k[0]-minX] = v |
939 if masked: | 927 if masked: |
940 pcolor(X, Y, ma.masked_where(C==defaultValue,C)) | 928 plt.pcolor(X, Y, ma.masked_where(C==defaultValue,C)) |
941 else: | 929 else: |
942 pcolor(X, Y, C) | 930 plt.pcolor(X, Y, C) |
943 | 931 |
944 ######################### | 932 ######################### |
945 # Data download | 933 # Data download |
946 ######################### | 934 ######################### |
947 | 935 |