Mercurial Hosting > traffic-intelligence
comparison python/utils.py @ 997:4f3387a242a1
updated utils to python 3
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Fri, 25 May 2018 18:15:18 -0400 |
parents | 94bee7b604eb |
children | 16932cefabc1 |
comparison
equal
deleted
inserted
replaced
996:add667153087 | 997:4f3387a242a1 |
---|---|
299 return 1 | 299 return 1 |
300 | 300 |
301 def sortByLength(instances, reverse = False): | 301 def sortByLength(instances, reverse = False): |
302 '''Returns a new list with the instances sorted by length (method __len__) | 302 '''Returns a new list with the instances sorted by length (method __len__) |
303 reverse is passed to sorted''' | 303 reverse is passed to sorted''' |
304 return sorted(instances, cmp = compareLengthForSort, reverse = reverse) | 304 return sorted(instances, key = len, reverse = reverse) |
305 | 305 |
306 def ceilDecimals(v, nDecimals): | 306 def ceilDecimals(v, nDecimals): |
 307 '''Rounds the number up (ceiling) at the nth decimal | 307 '''Rounds the number up (ceiling) at the nth decimal |
 308 e.g. 1.23 at 0 decimals is 2, at 1 decimal is 1.3''' | 308 e.g. 1.23 at 0 decimals is 2, at 1 decimal is 1.3''' |
309 tens = 10**nDecimals | 309 tens = 10**nDecimals |
404 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') | 404 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') |
405 data[newVariable] = (data[var] == val) | 405 data[newVariable] = (data[var] == val) |
406 newVariables.append(newVariable) | 406 newVariables.append(newVariable) |
407 return newVariables | 407 return newVariables |
408 | 408 |
409 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = u''): | 409 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = ''): |
410 '''Studies the influence of (nominal) independent variable over the dependent variable | 410 '''Studies the influence of (nominal) independent variable over the dependent variable |
411 | 411 |
 412 Tests whether the conditional distributions are normal | 412 Tests whether the conditional distributions are normal |
 413 using the Shapiro-Wilk test (in which case ANOVA could be used) | 413 using the Shapiro-Wilk test (in which case ANOVA could be used) |
 414 Uses the non-parametric Kruskal-Wallis test''' | 414 Uses the non-parametric Kruskal-Wallis test''' |
510 return result | 510 return result |
511 | 511 |
512 def saveDokMatrix(filename, m, lowerTriangle = False): | 512 def saveDokMatrix(filename, m, lowerTriangle = False): |
513 'Saves a dok_matrix using savez' | 513 'Saves a dok_matrix using savez' |
514 if lowerTriangle: | 514 if lowerTriangle: |
515 keys = [k for k in m.keys() if k[0] > k[1]] | 515 keys = [k for k in m if k[0] > k[1]] |
516 savez(filename, shape = m.shape, keys = keys, values = [m[k[0],k[1]] for k in keys]) | 516 savez(filename, shape = m.shape, keys = keys, values = [m[k[0],k[1]] for k in keys]) |
517 else: | 517 else: |
518 savez(filename, shape = m.shape, keys = m.keys(), values = m.values()) | 518 savez(filename, shape = m.shape, keys = list(m.keys()), values = list(m.values())) |
519 | 519 |
520 def loadDokMatrix(filename): | 520 def loadDokMatrix(filename): |
521 'Loads a dok_matrix saved using the above saveDokMatrix' | 521 'Loads a dok_matrix saved using the above saveDokMatrix' |
522 data = npload(filename) | 522 data = npload(filename) |
523 m = dok_matrix(tuple(data['shape'])) | 523 m = dok_matrix(tuple(data['shape'])) |
610 to the binary code derived from the independent variables''' | 610 to the binary code derived from the independent variables''' |
611 from numpy.random import permutation as nppermutation | 611 from numpy.random import permutation as nppermutation |
612 if experiments is None: | 612 if experiments is None: |
613 experiments = generateExperiments(independentVariables) | 613 experiments = generateExperiments(independentVariables) |
614 nIndependentVariables = len(independentVariables) | 614 nIndependentVariables = len(independentVariables) |
615 permutation = nppermutation(range(nIndependentVariables)).tolist() | 615 permutation = nppermutation(list(range(nIndependentVariables))) |
616 variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)} | 616 variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)} |
617 print('Tested variables '+', '.join([variableMapping[i] for i in xrange(nIndependentVariables)])) | 617 print('Tested variables '+', '.join([variableMapping[i] for i in range(nIndependentVariables)])) |
618 bestModel = [False]*nIndependentVariables | 618 bestModel = [False]*nIndependentVariables |
619 currentVarNum = 0 | 619 currentVarNum = 0 |
620 currentR2Adj = 0. | 620 currentR2Adj = 0. |
621 for currentVarNum in xrange(nIndependentVariables): | 621 for currentVarNum in range(nIndependentVariables): |
622 currentModel = [i for i in bestModel] | 622 currentModel = [i for i in bestModel] |
623 currentModel[currentVarNum] = True | 623 currentModel[currentVarNum] = True |
624 rowIdx = sum([0]+[2**i for i in xrange(nIndependentVariables) if currentModel[permutation[i]]]) | 624 rowIdx = sum([0]+[2**i for i in range(nIndependentVariables) if currentModel[permutation[i]]]) |
625 #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in xrange(nIndependentVariables) if currentModel[permutation[i]]]) | 625 #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in range(nIndependentVariables) if currentModel[permutation[i]]]) |
626 if experiments.loc[rowIdx, 'shapiroP'] < 0: | 626 if experiments.loc[rowIdx, 'shapiroP'] < 0: |
627 modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables) | 627 modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables) |
628 model = modelFunc(modelStr, data = data) | 628 model = modelFunc(modelStr, data = data) |
629 results = model.fit() | 629 results = model.fit() |
630 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj | 630 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj |
750 def similarities(self, l1, l2, jshift=0): | 750 def similarities(self, l1, l2, jshift=0): |
751 n1 = len(l1) | 751 n1 = len(l1) |
752 n2 = len(l2) | 752 n2 = len(l2) |
753 self.similarityTable = zeros((n1+1,n2+1), dtype = npint) | 753 self.similarityTable = zeros((n1+1,n2+1), dtype = npint) |
754 if self.similarityFunc is not None: | 754 if self.similarityFunc is not None: |
755 for i in xrange(1,n1+1): | 755 for i in range(1,n1+1): |
756 for j in xrange(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1): | 756 for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1): |
757 if self.similarityFunc(l1[i-1], l2[j-1]): | 757 if self.similarityFunc(l1[i-1], l2[j-1]): |
758 self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1 | 758 self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1 |
759 else: | 759 else: |
760 self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1]) | 760 self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1]) |
761 elif self.metric is not None: | 761 elif self.metric is not None: |
762 similarElements = distance.cdist(l1, l2, self.metric) <= self.epsilon | 762 similarElements = distance.cdist(l1, l2, self.metric) <= self.epsilon |
763 for i in xrange(1,n1+1): | 763 for i in range(1,n1+1): |
764 for j in xrange(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1): | 764 for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1): |
765 if similarElements[i-1, j-1]: | 765 if similarElements[i-1, j-1]: |
766 self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1 | 766 self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1 |
767 else: | 767 else: |
768 self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1]) | 768 self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1]) |
769 | 769 |
800 n2 = len(l2) | 800 n2 = len(l2) |
801 | 801 |
802 if self.aligned: | 802 if self.aligned: |
803 lcssValues = {} | 803 lcssValues = {} |
804 similarityTables = {} | 804 similarityTables = {} |
805 for i in xrange(-n2-self.delta+1, n1+self.delta): # interval such that [i-shift-delta, i-shift+delta] is never empty, which happens when i-shift+delta < 1 or when i-shift-delta > n2 | 805 for i in range(-n2-self.delta+1, n1+self.delta): # interval such that [i-shift-delta, i-shift+delta] is never empty, which happens when i-shift+delta < 1 or when i-shift-delta > n2 |
806 self.similarities(l1, l2, i) | 806 self.similarities(l1, l2, i) |
807 lcssValues[i] = self.similarityTable.max() | 807 lcssValues[i] = self.similarityTable.max() |
808 similarityTables[i] = self.similarityTable | 808 similarityTables[i] = self.similarityTable |
809 #print self.similarityTable | 809 #print self.similarityTable |
810 alignmentShift = argmaxDict(lcssValues) # ideally get the medium alignment shift, the one that minimizes distance | 810 alignmentShift = argmaxDict(lcssValues) # ideally get the medium alignment shift, the one that minimizes distance |
892 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.'])) | 892 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.'])) |
893 plt.rc('axes', prop_cycle=monochrome) | 893 plt.rc('axes', prop_cycle=monochrome) |
894 | 894 |
895 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1): | 895 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1): |
896 from matplotlib.pyplot import pcolor | 896 from matplotlib.pyplot import pcolor |
897 coords = array(indicatorMap.keys()) | 897 coords = array(list(indicatorMap.keys())) |
898 minX = min(coords[:,0]) | 898 minX = min(coords[:,0]) |
899 minY = min(coords[:,1]) | 899 minY = min(coords[:,1]) |
900 X = arange(minX, max(coords[:,0])+1.1)*squareSize | 900 X = arange(minX, max(coords[:,0])+1.1)*squareSize |
901 Y = arange(minY, max(coords[:,1])+1.1)*squareSize | 901 Y = arange(minY, max(coords[:,1])+1.1)*squareSize |
902 C = defaultValue*ones((len(Y), len(X))) | 902 C = defaultValue*ones((len(Y), len(X))) |
903 for k,v in indicatorMap.iteritems(): | 903 for k,v in indicatorMap.items(): |
904 C[k[1]-minY,k[0]-minX] = v | 904 C[k[1]-minY,k[0]-minX] = v |
905 if masked: | 905 if masked: |
906 pcolor(X, Y, ma.masked_where(C==defaultValue,C)) | 906 pcolor(X, Y, ma.masked_where(C==defaultValue,C)) |
907 else: | 907 else: |
908 pcolor(X, Y, C) | 908 pcolor(X, Y, C) |
924 To get hourly data for 2009 and 2012, January, March and October, downloadECWeather(10761, [2009,2012], [1,3,10], '/tmp') | 924 To get hourly data for 2009 and 2012, January, March and October, downloadECWeather(10761, [2009,2012], [1,3,10], '/tmp') |
925 | 925 |
926 for annee in `seq 2016 2017`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&timeframe=2&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done | 926 for annee in `seq 2016 2017`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&timeframe=2&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done |
927 for annee in `seq 2016 2017`;do for mois in `seq 1 12`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&Month=${mois}&timeframe=1&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done;done | 927 for annee in `seq 2016 2017`;do for mois in `seq 1 12`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&Month=${mois}&timeframe=1&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done;done |
928 ''' | 928 ''' |
929 import urllib2 | 929 import urllib.request |
930 if english: | 930 if english: |
931 language = 'e' | 931 language = 'e' |
932 else: | 932 else: |
933 language = 'f' | 933 language = 'f' |
934 if len(months) == 0: | 934 if len(months) == 0: |
937 else: | 937 else: |
938 timeFrame = 1 | 938 timeFrame = 1 |
939 | 939 |
940 for year in years: | 940 for year in years: |
941 for month in months: | 941 for month in months: |
942 url = urllib2.urlopen('http://climate.weather.gc.ca/climate_data/bulk_data_{}.html?format=csv&stationID={}&Year={}&Month={}&Day=1&timeframe={}&submit= Download+Data'.format(language, stationID, year, month, timeFrame)) | |
943 #http://climat.meteo.gc.ca/climateData/bulkdata_{}.html?format=csv&stationID={}&Year={}&Month={}&Day=1&timeframe={}&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es | |
944 data = url.read() | |
945 outFilename = '{}/{}-{}'.format(outputDirectoryname, stationID, year) | 942 outFilename = '{}/{}-{}'.format(outputDirectoryname, stationID, year) |
946 if timeFrame == 1: | 943 if timeFrame == 1: |
947 outFilename += '-{}-hourly'.format(month) | 944 outFilename += '-{}-hourly'.format(month) |
948 else: | 945 else: |
949 outFilename += '-daily' | 946 outFilename += '-daily' |
950 outFilename += '.csv' | 947 outFilename += '.csv' |
951 out = open(outFilename, 'w') | 948 url = urllib.request.urlretrieve('http://climate.weather.gc.ca/climate_data/bulk_data_{}.html?format=csv&stationID={}&Year={}&Month={}&Day=1&timeframe={}&submit=Download+Data'.format(language, stationID, year, month, timeFrame), outFilename) |
952 out.write(data) | |
953 out.close() | |
954 | 949 |
955 ######################### | 950 ######################### |
956 # File I/O | 951 # File I/O |
957 ######################### | 952 ######################### |
958 | 953 |
1009 return [float(x) for x in l.split(separator)] | 1004 return [float(x) for x in l.split(separator)] |
1010 | 1005 |
1011 def line2Ints(l, separator=' '): | 1006 def line2Ints(l, separator=' '): |
1012 '''Returns the list of ints corresponding to the string''' | 1007 '''Returns the list of ints corresponding to the string''' |
1013 return [int(x) for x in l.split(separator)] | 1008 return [int(x) for x in l.split(separator)] |
1014 | |
1015 ######################### | |
1016 # CLI utils | |
1017 ######################### | |
1018 | |
1019 def parseCLIOptions(helpMessage, options, cliArgs, optionalOptions=[]): | |
1020 ''' Simple function to handle similar argument parsing | |
1021 Returns the dictionary of options and their values | |
1022 | |
1023 * cliArgs are most likely directly sys.argv | |
1024 (only the elements after the first one are considered) | |
1025 | |
1026 * options should be a list of strings for getopt options, | |
1027 eg ['frame=','correspondences=','video='] | |
1028 A value must be provided for each option, or the program quits''' | |
1029 import sys, getopt | |
1030 from numpy.core.fromnumeric import all | |
1031 optionValues, args = getopt.getopt(cliArgs[1:], 'h', ['help']+options+optionalOptions) | |
1032 optionValues = dict(optionValues) | |
1033 | |
1034 if '--help' in optionValues.keys() or '-h' in optionValues.keys(): | |
1035 print(helpMessage+ | |
1036 '\n - Compulsory options: '+' '.join([opt.replace('=','') for opt in options])+ | |
1037 '\n - Non-compulsory options: '+' '.join([opt.replace('=','') for opt in optionalOptions])) | |
1038 sys.exit() | |
1039 | |
1040 missingArgument = [('--'+opt.replace('=','') in optionValues.keys()) for opt in options] | |
1041 if not all(missingArgument): | |
1042 print('Missing argument') | |
1043 print(optionValues) | |
1044 sys.exit() | |
1045 | |
1046 return optionValues | |
1047 | |
1048 | 1009 |
1049 ######################### | 1010 ######################### |
1050 # Profiling | 1011 # Profiling |
1051 ######################### | 1012 ######################### |
1052 | 1013 |