comparison trafficintelligence/utils.py @ 1028:cc5cb04b04b0

major update using the trafficintelligence package name and install through pip
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Fri, 15 Jun 2018 11:19:10 -0400
parents python/utils.py@a13f47c8931d
children c6cf75a2ed08
1 #! /usr/bin/env python
2 # -*- coding: utf-8 -*-
3 ''' Generic utilities.'''
4
5 import matplotlib.pyplot as plt
6 from datetime import time, datetime
7 from argparse import ArgumentTypeError
8 from pathlib import Path
9 from math import sqrt, ceil, floor
10 from scipy.stats import rv_continuous, kruskal, shapiro, lognorm
11 from scipy.spatial import distance
12 from scipy.sparse import dok_matrix
13 from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, NaN, ma, isinf, savez, load as npload, log
14
15
16 datetimeFormat = "%Y-%m-%d %H:%M:%S"
17
18 sjcamDatetimeFormat = "%Y_%m%d_%H%M%S" # e.g. 2017_0626_143720
19
20 #########################
21 # Strings
22 #########################
23
24 def upperCaseFirstLetter(s):
25 words = s.split(' ')
26 capitalizedWords = [w[0].upper()+w[1:].lower() for w in words]
27 return ' '.join(capitalizedWords)
28
29 class TimeConverter:
30 def __init__(self, datetimeFormat = datetimeFormat):
31 self.datetimeFormat = datetimeFormat
32
33 def convert(self, s):
34 try:
35 return datetime.strptime(s, self.datetimeFormat)
36 except ValueError:
37 msg = "Not a valid date: '{0}'.".format(s)
38 raise ArgumentTypeError(msg)
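# Illustrative usage (example values, not from the original file):
# >>> TimeConverter().convert('2018-06-15 11:19:10')
# datetime.datetime(2018, 6, 15, 11, 19, 10)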
39
40 #########################
41 # Enumerations
42 #########################
43
44 def inverseEnumeration(l):
45 'Returns a dictionary mapping each element of the input list to its index'
46 result = {}
47 for i,x in enumerate(l):
48 result[x] = i
49 return result
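# Example (illustrative): the inverse of an enumeration maps elements back to their indices.
# >>> inverseEnumeration(['a', 'b', 'c'])
# {'a': 0, 'b': 1, 'c': 2}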
50
51 #########################
52 # Simple statistics
53 #########################
54
55 def logNormalMeanVar(loc, scale):
56 '''location and scale are respectively the mean and standard deviation of the normal in the log-normal distribution
57 https://en.wikipedia.org/wiki/Log-normal_distribution
58
59 same as lognorm.stats(scale, 0, exp(loc))'''
60 mean = exp(loc+(scale**2)/2)
61 var = (exp(scale**2)-1)*exp(2*loc+scale**2)
62 return mean, var
63
64 def fitLogNormal(x):
65 'returns the fitted location and scale of the lognormal (general definition)'
66 shape, loc, scale = lognorm.fit(x, floc=0.)
67 return log(scale), shape
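# Worked example (illustrative): for loc=0 and scale=1 the underlying normal is standard,
# so mean = exp(0 + 1/2) ~ 1.649 and var = (e - 1) * e ~ 4.671.
# >>> m, v = logNormalMeanVar(0., 1.)
# >>> round(m, 3), round(v, 3)
# (1.649, 4.671)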
68
69 def sampleSize(stdev, tolerance, percentConfidence, nRoundingDigits = None, printLatex = False):
70 from scipy.stats.distributions import norm
71 if nRoundingDigits is None:
72 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) # 1.-(100-percentConfidence)/200.
73 else:
74 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), nRoundingDigits)
75 stdev = round(stdev, nRoundingDigits)
76 tolerance = round(tolerance, nRoundingDigits)
77 if printLatex:
78 print('$z_{{{}}}^2\\frac{{s^2}}{{e^2}}={}^2\\frac{{{}^2}}{{{}^2}}$'.format(0.5+percentConfidence/200.,k, stdev, tolerance))
79 return (k*stdev/tolerance)**2
80
81 def confidenceInterval(mean, stdev, nSamples, percentConfidence, trueStd = True, printLatex = False):
82 '''if trueStd, use normal distribution, otherwise, Student
83
84 Otherwise, use t.interval or norm.interval for the boundaries
85 ex: norm.interval(0.95)
86 t.interval(0.95, nSamples-1)'''
87 from scipy.stats.distributions import norm, t
88 if trueStd:
89 k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2)
90 else: # use Student
91 k = round(t.ppf(0.5+percentConfidence/200., nSamples-1), 2)
92 e = k*stdev/sqrt(nSamples)
93 if printLatex:
94 print('${0} \\pm {1}\\frac{{{2}}}{{\\sqrt{{{3}}}}}$'.format(mean, k, stdev, nSamples))
95 return mean-e, mean+e
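# Worked example (illustrative values): for a 95% confidence level, k = norm.ppf(0.975) ~ 1.96,
# so with mean 10, stdev 2 and 100 samples, e = 1.96*2/sqrt(100) = 0.392.
# >>> [round(b, 3) for b in confidenceInterval(10., 2., 100, 95)]
# [9.608, 10.392]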
96
97 def computeChi2(expected, observed):
98 '''Returns the Chi2 statistic'''
99 return sum([((e-o)*(e-o))/float(e) for e, o in zip(expected, observed)])
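# Worked example (illustrative): (10-12)^2/10 + (20-18)^2/20 + (30-30)^2/30 = 0.4 + 0.2 + 0 = 0.6
# >>> round(computeChi2([10, 20, 30], [12, 18, 30]), 6)
# 0.6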
100
101 class generateDistribution(rv_continuous):
102 def __init__(self, values, probabilities):
103 '''The values (and corresponding probabilities) are supposed to be sorted by value
104 for v, p in zip(values, probabilities): P(X<=v)=p'''
105 assert probabilities[0]==0
106 self.values = values
107 self.probabilities = probabilities
108
109 def _cdf(self, x):
110 if x < self.values[0]:
111 return self.probabilities[0]
112 else:
113 i=0
114 while i+1<len(self.values) and self.values[i+1] < x:
115 i += 1
116 if i == len(self.values)-1:
117 return self.probabilities[-1]
118 else:
119 return self.probabilities[i]+(x-self.values[i])*(self.probabilities[i+1]-self.probabilities[i])/(self.values[i+1]-self.values[i])
120
121 class DistributionSample(object):
122 def nSamples(self):
123 return sum(self.counts)
124
125 def cumulativeDensityFunction(sample, normalized = False):
126 '''Returns the empirical cumulative distribution function (CDF) of the sample'''
127 xaxis = sorted(sample)
128 counts = arange(1, len(sample)+1, dtype = float)
129 if normalized:
130 counts /= float(len(sample))
131 return xaxis, counts
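# Example (illustrative): the empirical CDF of [4, 1, 3] counts 1, 2, 3 observations
# at the sorted values 1, 3, 4.
# >>> x, c = cumulativeDensityFunction([4, 1, 3])
# >>> x, list(c)
# ([1, 3, 4], [1.0, 2.0, 3.0])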
132
133 class DiscreteDistributionSample(DistributionSample):
134 '''Class to represent a sample of a distribution for a discrete random variable'''
135 def __init__(self, categories, counts):
136 self.categories = categories
137 self.counts = counts
138
139 def mean(self):
140 result = [float(x*y) for x,y in zip(self.categories, self.counts)]
141 return npsum(result)/self.nSamples()
142
143 def var(self, mean = None):
144 if mean is None:
145 m = self.mean()
146 else:
147 m = mean
148 result = 0.
149 squares = [float((x-m)*(x-m)*y) for x,y in zip(self.categories, self.counts)]
150 return npsum(squares)/(self.nSamples()-1)
151
152 def referenceCounts(self, probability):
153 '''probability is a function that returns the probability of the random variable for the category values'''
154 refProba = [probability(c) for c in self.categories]
155 refProba[-1] = 1-npsum(refProba[:-1])
156 refCounts = [r*self.nSamples() for r in refProba]
157 return refCounts, refProba
158
159 class ContinuousDistributionSample(DistributionSample):
160 '''Class to represent a sample of a distribution for a continuous random variable
161 with the number of observations for each interval
162 intervals (categories variable) are defined by their left limits, the last one being the right limit
163 categories therefore contains one more element than counts'''
164 def __init__(self, categories, counts):
165 # todo add samples for initialization and everything to None? (or setSamples?)
166 self.categories = categories
167 self.counts = counts
168
169 @staticmethod
170 def generate(sample, categories):
171 if min(sample) < min(categories):
172 print('Sample has lower min than proposed categories ({}, {})'.format(min(sample), min(categories)))
173 if max(sample) > max(categories):
174 print('Sample has higher max than proposed categories ({}, {})'.format(max(sample), max(categories)))
175 dist = ContinuousDistributionSample(sorted(categories), [0]*(len(categories)-1))
176 for s in sample:
177 i = 0
178 while i<len(dist.categories) and dist.categories[i] <= s:
179 i += 1
180 if 0 < i <= len(dist.counts):
181 dist.counts[i-1] += 1
182 #print('{} in {} {}'.format(s, dist.categories[i-1], dist.categories[i]))
183 else:
184 print('Element {} is not in the categories'.format(s))
185 return dist
186
187 def mean(self):
188 result = 0.
189 for i in range(len(self.counts)):
190 result += self.counts[i]*(self.categories[i]+self.categories[i+1])/2
191 return result/self.nSamples()
192
193 def var(self, mean = None):
194 if mean is None:
195 m = self.mean()
196 else:
197 m = mean
198 result = 0.
199 for i in range(len(self.counts)):
200 mid = (self.categories[i]+self.categories[i+1])/2
201 result += self.counts[i]*(mid - m)*(mid - m)
202 return result/(self.nSamples()-1)
203
204 def referenceCounts(self, cdf):
205 '''cdf is a cumulative distribution function
206 returning the probability of the variable being less than x'''
207 # refCumulativeCounts = [0]#[cdf(self.categories[0][0])]
208 # for inter in self.categories:
209 # refCumulativeCounts.append(cdf(inter[1]))
210 refCumulativeCounts = [cdf(x) for x in self.categories[1:-1]]
211
212 refProba = [refCumulativeCounts[0]]
213 for i in range(1,len(refCumulativeCounts)):
214 refProba.append(refCumulativeCounts[i]-refCumulativeCounts[i-1])
215 refProba.append(1-refCumulativeCounts[-1])
216 refCounts = [p*self.nSamples() for p in refProba]
217
218 return refCounts, refProba
219
220 def printReferenceCounts(self, refCounts=None):
221 if refCounts:
222 ref = refCounts
223 else:
224 ref = self.referenceCounts
225 for i in range(len(ref[0])):
226 print('{0}-{1} & {2:0.3} & {3:0.3} \\\\'.format(self.categories[i],self.categories[i+1],ref[1][i], ref[0][i]))
227
228
229 #########################
230 # maths section
231 #########################
232
246 def kernelSmoothing(x, X, Y, weightFunc, halfwidth):
247 '''Returns the smoothed estimate of (X,Y) at x
248 ie sum_i weight(x, X_i) * Y_i / sum_i weight(x, X_i)'''
249 weights = array([weightFunc(x,observedx, halfwidth) for observedx in X])
250 if sum(weights)>0:
251 return sum(weights*Y)/sum(weights)
252 else:
253 return 0
254
255 def uniform(center, x, halfwidth):
256 if abs(center-x)<halfwidth:
257 return 1.
258 else:
259 return 0.
260
261 def gaussian(center, x, halfwidth):
262 return exp(-((center-x)/halfwidth)**2/2)
263
264 def epanechnikov(center, x, halfwidth):
265 diff = abs(center-x)
266 if diff<halfwidth:
267 return 1.-(diff/halfwidth)**2
268 else:
269 return 0.
270
271 def triangular(center, x, halfwidth):
272 diff = abs(center-x)
273 if diff<halfwidth:
274 return 1.-abs(diff/halfwidth)
275 else:
276 return 0.
277
278 def medianSmoothing(x, X, Y, halfwidth):
279 '''Returns the median of the Y's corresponding to the X's in the interval [x-halfwidth, x+halfwidth]'''
280 return median([y for observedx, y in zip(X,Y) if abs(x-observedx)<halfwidth])
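# Example (illustrative): smoothing Y = X^2 at x = 1.5 with a uniform kernel of halfwidth 1
# averages the Y values at X = 1 and X = 2; medianSmoothing uses their median instead.
# >>> X = [0., 1., 2., 3.]; Y = [0., 1., 4., 9.]
# >>> kernelSmoothing(1.5, X, Y, uniform, 1.)
# 2.5
# >>> medianSmoothing(1.5, X, Y, 1.)
# 2.5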
281
282 def argmaxDict(d):
283 return max(d, key=d.get)
284
285 def deltaFrames(t1, t2, frameRate):
286 '''Returns the number of frames between t1 and t2
287 positive if t1<=t2, negative otherwise'''
288 if t1 > t2:
289 return -(t1-t2).total_seconds()*frameRate
290 else:
291 return (t2-t1).total_seconds()*frameRate
292
293 def framesToTime(nFrames, frameRate, initialTime = time()):
294 '''returns a datetime.time for the time in hours, minutes and seconds
295 initialTime is a datetime.time'''
296 seconds = int(floor(float(nFrames)/float(frameRate))+initialTime.hour*3600+initialTime.minute*60+initialTime.second)
297 h = int(floor(seconds/3600.))
298 seconds = seconds - h*3600
299 m = int(floor(seconds/60))
300 seconds = seconds - m*60
301 return time(h, m, seconds)
302
303 def timeToFrames(t, frameRate):
304 return frameRate*(t.hour*3600+t.minute*60+t.second)
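# Example (illustrative): at 30 fps, 10 s corresponds to 300 frames, and back.
# >>> timeToFrames(time(0, 0, 10), 30)
# 300
# >>> framesToTime(300, 30)
# datetime.time(0, 0, 10)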
305
306 def sortXY(X,Y):
307 'returns the lists X and Y(X), sorted on X (for duplicate x values, the last y is kept)'
308 D = {}
309 for x, y in zip(X,Y):
310 D[x]=y
311 xsorted = sorted(D.keys())
312 return xsorted, [D[x] for x in xsorted]
313
314 def compareLengthForSort(i, j):
315 if len(i) < len(j):
316 return -1
317 elif len(i) == len(j):
318 return 0
319 else:
320 return 1
321
322 def sortByLength(instances, reverse = False):
323 '''Returns a new list with the instances sorted by length (method __len__)
324 reverse is passed to sorted'''
325 return sorted(instances, key = len, reverse = reverse)
326
327 def ceilDecimals(v, nDecimals):
328 '''Rounds the number up at the nth decimal
329 eg 1.23 at 0 decimal is 2, at 1 decimal is 1.3'''
330 tens = 10**nDecimals
331 return ceil(v*tens)/tens
332
333 def inBetween(bound1, bound2, x):
334 'useful if one does not know the order of bound1/bound2'
335 return bound1 <= x <= bound2 or bound2 <= x <= bound1
336
337 def pointDistanceL2(x1,y1,x2,y2):
338 ''' Compute point-to-point distance (L2 norm, ie Euclidean distance)'''
339 return sqrt((x2-x1)**2+(y2-y1)**2)
340
341 def crossProduct(l1, l2):
342 return l1[0]*l2[1]-l1[1]*l2[0]
343
344 def cat_mvgavg(cat_list, halfWidth):
345 ''' Return a list of categories/values smoothed according to a window.
346 halfWidth is the search radius on either side'''
347 from copy import deepcopy
348 smoothed = deepcopy(cat_list)
349 for point in range(len(cat_list)):
350 lower_bound_check = max(0,point-halfWidth)
351 upper_bound_check = min(len(cat_list),point+halfWidth+1)
352 window_values = cat_list[lower_bound_check:upper_bound_check]
353 smoothed[point] = max(set(window_values), key=window_values.count)
354 return smoothed
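# Example (illustrative): an isolated 'truck' among 'car' observations is smoothed out,
# since 'car' is the most frequent category in every window of halfWidth 1.
# >>> cat_mvgavg(['car', 'car', 'truck', 'car', 'car'], 1)
# ['car', 'car', 'car', 'car', 'car']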
355
356 def filterMovingWindow(inputSignal, halfWidth):
357 '''Returns an array obtained after the smoothing of the input by a moving average
358 The first and last points are copied from the original.'''
359 width = halfWidth*2+1
360 win = ones(width,'d')
361 result = convolve(win/width,array(inputSignal),'same')
362 result[:halfWidth] = inputSignal[:halfWidth]
363 result[-halfWidth:] = inputSignal[-halfWidth:]
364 return result
365
366 def linearRegression(x, y, deg = 1, plotData = False):
367 '''returns the least-squares estimate of the regression of y on x (polynomial of degree deg, ie y = ax+b for deg = 1)
368 optionally plotting the data and the fitted curve'''
369 from numpy.lib.polynomial import polyfit
370 from numpy.core.multiarray import arange
371 coef = polyfit(x, y, deg)
372 if plotData:
373 def poly(x):
374 result = 0
375 for i in range(len(coef)):
376 result += coef[i]*x**(len(coef)-i-1)
377 return result
378 plt.plot(x, y, 'x')
379 xx = arange(min(x), max(x),(max(x)-min(x))/1000)
380 plt.plot(xx, [poly(z) for z in xx])
381 return coef
382
383 def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
384 '''Computes (and displays) the correlation matrix for a pandas DataFrame'''
385 columns = data.columns.tolist()
386 for var in data.columns:
387 uniqueValues = data[var].unique()
388 if len(uniqueValues) == 1 or data.dtypes[var] == dtype('O') or (len(uniqueValues) == 2 and len(data.loc[~isnan(data[var]), var].unique()) == 1): # last condition: only one other value than nan
389 columns.remove(var)
390 c=data[columns].corr(correlationMethod)
391 if plotFigure:
392 fig = plt.figure(figsize=(4+0.4*c.shape[0], 0.4*c.shape[0]))
393 fig.add_subplot(1,1,1)
394 #plt.imshow(np.fabs(c), interpolation='none')
395 plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
396 if displayNames is not None:
397 colnames = [displayNames.get(s.strip(), s.strip()) for s in columns]
398 else:
399 colnames = columns
400 #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
401 plt.xticks(range(len(colnames)), colnames, rotation=90)
402 plt.yticks(range(len(colnames)), colnames)
403 plt.tick_params('both', length=0)
404 plt.subplots_adjust(bottom = 0.29)
405 plt.colorbar()
406 plt.title('Correlation ({})'.format(correlationMethod))
407 plt.tight_layout()
408 if len(colnames) > 50:
409 plt.subplots_adjust(left=.06)
410 if figureFilename is not None:
411 plt.savefig(figureFilename, dpi = 150, transparent = True)
412 return c
413
414 def addDummies(data, variables, allVariables = True):
415 '''Add binary dummy variables for each value of a nominal variable
416 in a pandas DataFrame'''
417 newVariables = []
418 for var in variables:
419 if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
420 values = data[var].unique()
421 if not allVariables:
422 values = values[:-1]
423 for val in values:
424 if val is not NaN:
425 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
426 data[newVariable] = (data[var] == val)
427 newVariables.append(newVariable)
428 return newVariables
429
430 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = ''):
431 '''Studies the influence of (nominal) independent variable over the dependent variable
432
433 Tests whether the conditional distributions are normal
434 using the Shapiro-Wilk test (in which case ANOVA could be used)
435 then applies the non-parametric Kruskal-Wallis test'''
436 tmp = data[data[independentVariable].notnull()]
437 independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
438 if len(independentVariableValues) >= 2:
439 if saveLatex:
440 from trafficintelligence.storage import openCheck
441 out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w')
442 for x in independentVariableValues:
443 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable])))
444 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3:
445 print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]))
446 if plotFigure:
447 plt.figure()
448 plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
449 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues)
450 plt.title('{} vs {}'.format(dependentVariable, independentVariable))
451 if filenamePrefix is not None:
452 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
453 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort_values(['50%'], ascending = False)
454 table['count'] = table['count'].astype(int)
455 testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
456 if saveLatex:
457 out.write('\\begin{minipage}{\\linewidth}\n'
458 +'\\centering\n'
459 +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n'
460 +table.to_latex(float_format = lambda x: '{:.3f}'.format(x))+'\n'
461 +'\\end{minipage}\n'
462 +'\\ \\vspace{0.5cm}\n')
463 else:
464 print(table)
465 return testResult
466 else:
467 return None
468
469 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc, stdoutText = ['Removing {} (constant: {})', 'Removing {} (correlation {} with {})', 'Removing {} (no correlation: {}, p={})'], saveFiles = False, filenamePrefix = None, latexHeader = '', latexTable = None, latexFooter=''):
470 '''Removes variables from candidate independent variables if
471 - if two independent variables are correlated (> maxCorrelationThreshold), one is removed
472 - if an independent variable is not correlated with the dependent variable (p>maxCorrelationP)
473 Returns the remaining non-correlated variables, correlated with the dependent variable
474
475 correlationFunc is spearmanr or pearsonr from scipy.stats
476 stdoutText (and latexTable) hold the message templates for the two types of printout (see defaults): 3 elements if not saving to a latex file, 8 otherwise
477
478 TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''
479 from copy import copy
480 from pandas import DataFrame
481 result = copy(independentVariables)
482 table1 = ''
483 table2 = {}
484 # constant variables
485 for var in independentVariables:
486 uniqueValues = data[var].unique()
487 if (len(uniqueValues) == 1) or (len(uniqueValues) == 2 and uniqueValues.dtype != dtype('O') and len(data.loc[~isnan(data[var]), var].unique()) == 1):
488 print(stdoutText[0].format(var, uniqueValues))
489 if saveFiles:
490 table1 += latexTable[0].format(var, *uniqueValues)
491 result.remove(var)
492 # correlated variables
493 for v1 in copy(result):
494 if v1 in correlations.index:
495 for v2 in copy(result):
496 if v2 != v1 and v2 in correlations.index:
497 if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold:
498 if v1 in result and v2 in result:
499 if saveFiles:
500 table1 += latexTable[1].format(v2, v1, correlations.loc[v1, v2])
501 print(stdoutText[1].format(v2, v1, correlations.loc[v1, v2]))
502 result.remove(v2)
503 # not correlated with dependent variable
504 table2['Correlations'] = []
505 table2['Valeurs p'] = []
506 for var in copy(result):
507 if data.dtypes[var] != dtype('O'):
508 cor, p = correlationFunc(data[dependentVariable], data[var])
509 if p > maxCorrelationP:
510 if saveFiles:
511 table1 += latexTable[2].format(var, cor, p)
512 print(stdoutText[2].format(var, cor, p))
513 result.remove(var)
514 else:
515 table2['Correlations'].append(cor)
516 table2['Valeurs p'].append(p)
517
518 if saveFiles:
519 from trafficintelligence.storage import openCheck
520 out = openCheck(filenamePrefix+'-removed-variables.tex', 'w')
521 out.write(latexHeader)
522 out.write(table1)
523 out.write(latexFooter)
524 out.close()
525 out = openCheck(filenamePrefix+'-correlations.html', 'w')
526 table2['Variables'] = [var for var in result if data.dtypes[var] != dtype('O')]
527 out.write(DataFrame(table2)[['Variables', 'Correlations', 'Valeurs p']].to_html(formatters = {'Correlations': lambda x: '{:.2f}'.format(x), 'Valeurs p': lambda x: '{:.3f}'.format(x)}, index = False))
528 out.close()
529 return result
530
531 def saveDokMatrix(filename, m, lowerTriangle = False):
532 'Saves a dok_matrix using savez'
533 if lowerTriangle:
534 keys = [k for k in m if k[0] > k[1]]
535 savez(filename, shape = m.shape, keys = keys, values = [m[k[0],k[1]] for k in keys])
536 else:
537 savez(filename, shape = m.shape, keys = list(m.keys()), values = list(m.values()))
538
539 def loadDokMatrix(filename):
540 'Loads a dok_matrix saved using the above saveDokMatrix'
541 data = npload(filename)
542 m = dok_matrix(tuple(data['shape']))
543 for k, v in zip(data['keys'], data['values']):
544 m[tuple(k)] = v
545 return m
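# Round-trip sketch (hypothetical filename, assuming write access to /tmp):
# >>> m = dok_matrix((3, 3))
# >>> m[0, 1] = 2.; m[2, 0] = 5.
# >>> saveDokMatrix('/tmp/m.npz', m)
# >>> m2 = loadDokMatrix('/tmp/m.npz')
# >>> m2[0, 1], m2[2, 0]
# (2.0, 5.0)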
546
547 def aggregationFunction(funcStr, centile = 50):
548 '''return the numpy function corresponding to funcStr
549 centile can be a list of centiles to compute at once, eg [25, 50, 75] for the 3 quartiles'''
550 if funcStr == 'median':
551 return median
552 elif funcStr == 'mean':
553 return mean
554 elif funcStr == 'centile':
555 return lambda x: percentile(x, centile)
556 elif funcStr == '85centile':
557 return lambda x: percentile(x, 85)
558 else:
559 print('Unknown aggregation method: {}'.format(funcStr))
560 return None
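# Example (illustrative): requesting several centiles at once returns them together.
# >>> f = aggregationFunction('centile', [25, 50, 75])
# >>> list(f([1., 2., 3., 4.]))
# [1.75, 2.5, 3.25]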
561
562 #########################
563 # regression analysis using statsmodels (and pandas)
564 #########################
565
566 # TODO make class for experiments?
567 # TODO add tests with public dataset downloaded from Internet (IRIS et al)
568 def modelString(experiment, dependentVariable, independentVariables):
569 return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]])
570
571 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'):
572 import statsmodels.formula.api as smf
573 modelStr = modelString(experiment, dependentVariable, independentVariables)
574 if regressionType == 'ols':
575 model = smf.ols(modelStr, data = data)
576 elif regressionType == 'gls':
577 model = smf.gls(modelStr, data = data)
578 elif regressionType == 'rlm':
579 model = smf.rlm(modelStr, data = data)
580 else:
581 print('Unknown regression type {}. Exiting'.format(regressionType))
582 import sys
583 sys.exit()
584 return model.fit()
585
586 def runModels(experiments, data, dependentVariable, independentVariables, regressionType = 'ols'):
587 '''Runs several models and stores 3 statistics
588 adjusted R2, condition number (should be small, eg < 1000)
589 and p-value for Shapiro-Wilk test of residual normality'''
590 for i,experiment in experiments.iterrows():
591 if experiment[independentVariables].any():
592 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = regressionType)
593 experiments.loc[i,'r2adj'] = results.rsquared_adj
594 experiments.loc[i,'condNum'] = results.condition_number
595 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1]
596 experiments.loc[i,'nobs'] = int(results.nobs)
597 return experiments
598
599 def generateExperiments(independentVariables):
600 '''Generates all possible models for including or not each independent variable'''
601 from pandas import DataFrame
602 experiments = {}
603 nIndependentVariables = len(independentVariables)
604 if nIndependentVariables != len(set(independentVariables)):
605 print("Duplicate variables. Exiting")
606 import sys
607 sys.exit()
608 nModels = 2**nIndependentVariables
609 for i,var in enumerate(independentVariables):
610 pattern = [False]*(2**i)+[True]*(2**i)
611 experiments[var] = pattern*(2**(nIndependentVariables-i-1))
612 experiments = DataFrame(experiments)
613 experiments['r2adj'] = 0.
614 experiments['condNum'] = NaN
615 experiments['shapiroP'] = -1
616 experiments['nobs'] = -1
617 return experiments
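# Example (illustrative): with two variables, the 2**2 = 4 inclusion patterns are enumerated;
# the row index is the binary code of the included variables (first variable = least significant bit).
# >>> generateExperiments(['a', 'b'])[['a', 'b']].values.tolist()
# [[False, False], [True, False], [False, True], [True, True]]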
618
619 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
620 '''Generates all possible model with the independentVariables
621 and runs them, saving the results in experiments
622 with multiprocess option'''
623 from pandas import concat
624 from multiprocessing import Pool
625 experiments = generateExperiments(independentVariables)
626 nModels = len(experiments)
627 print("Running {} models with {} processes".format(nModels, nProcesses))
628 print("IndependentVariables: {}".format(independentVariables))
629 if nProcesses == 1:
630 return runModels(experiments, data, dependentVariable, independentVariables, regressionType)
631 else:
632 pool = Pool(processes = nProcesses)
633 chunkSize = int(ceil(nModels/nProcesses))
634 jobs = [pool.apply_async(runModels, args = (experiments[i*chunkSize:(i+1)*chunkSize], data, dependentVariable, independentVariables, regressionType)) for i in range(nProcesses)]
635 return concat([job.get() for job in jobs])
636
637 def findBestModelFwd(data, dependentVariable, independentVariables, modelFunc, experiments = None):
638 '''Forward search for best model (based on adjusted R2)
639 Randomly starting with one variable and adding randomly variables
640 if they improve the model
641
642 The results are added to experiments if provided as argument
643 Storing in experiment relies on the index being the number equal
644 to the binary code derived from the independent variables'''
645 from numpy.random import permutation as nppermutation
646 if experiments is None:
647 experiments = generateExperiments(independentVariables)
648 nIndependentVariables = len(independentVariables)
649 permutation = nppermutation(list(range(nIndependentVariables)))
650 variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)}
651 print('Tested variables '+', '.join([variableMapping[i] for i in range(nIndependentVariables)]))
652 bestModel = [False]*nIndependentVariables
653 currentVarNum = 0
654 currentR2Adj = 0.
655 for currentVarNum in range(nIndependentVariables):
656 currentModel = [i for i in bestModel]
657 currentModel[currentVarNum] = True
658 rowIdx = sum([0]+[2**i for i in range(nIndependentVariables) if currentModel[permutation[i]]])
659 #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in range(nIndependentVariables) if currentModel[permutation[i]]])
660 if experiments.loc[rowIdx, 'shapiroP'] < 0:
661 modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables)
662 model = modelFunc(modelStr, data = data)
663 results = model.fit()
664 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj
665 experiments.loc[rowIdx, 'condNum'] = results.condition_number
666 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1]
667 experiments.loc[rowIdx, 'nobs'] = int(results.nobs)
668 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
669 currentR2Adj = experiments.loc[rowIdx, 'r2adj']
670 bestModel[currentVarNum] = True
671 return experiments
672
673 def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf', text = {'title-shapiro': 'Shapiro-Wilk normality test for residuals: {:.2f} (p={:.3f})', 'true-predicted.xlabel': 'Predicted values', 'true-predicted.ylabel': 'True values', 'residuals-predicted.xlabel': 'Predicted values', 'residuals-predicted.ylabel': 'Residuals'}):
674 '''Displays some model results
675
676 3 graphics: true vs predicted, residuals vs predicted, Q-Q plot of the residuals'''
677 import statsmodels.api as sm
678 print(results.summary())
679 shapiroResult = shapiro(results.resid)
680 print(shapiroResult)
681 if plotFigures:
682 fig = plt.figure(figsize=(7,6.3*(2+int(model is not None))))
683 if model is not None:
684 ax = fig.add_subplot(3,1,1)
685 plt.plot(results.predict(), model.endog, 'x')
686 x=plt.xlim()
687 y=plt.ylim()
688 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r')
689 #plt.axis('equal')
690 if text is not None:
691 plt.title(text['title-shapiro'].format(*shapiroResult))
692 #plt.title(text['true-predicted.title'])
693 plt.xlabel(text['true-predicted.xlabel'])
694 plt.ylabel(text['true-predicted.ylabel'])
695 fig.add_subplot(3,1,2, sharex = ax)
696 plt.plot(results.predict(), results.resid, 'x')
697 nextSubplotNum = 3
698 else:
699 fig.add_subplot(2,1,1)
700 plt.plot(results.predict(), results.resid, 'x')
701 nextSubplotNum = 2
702 if text is not None:
703 if model is None:
704 plt.title(text['title-shapiro'].format(*shapiroResult))
705 plt.xlabel(text['residuals-predicted.xlabel'])
706 plt.ylabel(text['residuals-predicted.ylabel'])
707 qqAx = fig.add_subplot(nextSubplotNum,1,nextSubplotNum)
708 sm.qqplot(results.resid, fit = True, line = '45', ax = qqAx)
709 plt.axis('equal')
710 if text is not None and 'qqplot.xlabel' in text:
711 plt.xlabel(text['qqplot.xlabel'])
712 plt.ylabel(text['qqplot.ylabel'])
713 plt.tight_layout()
714 if filenamePrefix is not None:
715 from trafficintelligence.storage import openCheck
716 out = openCheck(filenamePrefix+'-coefficients.html', 'w')
717 out.write(results.summary().as_html())
718 plt.savefig(filenamePrefix+'-model-results.'+figureFileType)
719
720 #########################
721 # iterable section
722 #########################
723
724 def mostCommon(L):
725 '''Returns the most frequent element in an iterable (ties broken by earliest occurrence)
726
727 taken from http://stackoverflow.com/questions/1518522/python-most-common-element-in-a-list'''
728 from itertools import groupby
729 from operator import itemgetter
730 # get an iterable of (item, iterable) pairs
731 SL = sorted((x, i) for i, x in enumerate(L))
732 # print 'SL:', SL
733 groups = groupby(SL, key=itemgetter(0))
734 # auxiliary function to get "quality" for an item
735 def _auxfun(g):
736 item, iterable = g
737 count = 0
738 min_index = len(L)
739 for _, where in iterable:
740 count += 1
741 min_index = min(min_index, where)
742 # print 'item %r, count %r, minind %r' % (item, count, min_index)
743 return count, -min_index
744 # pick the highest-count/earliest item
745 return max(groups, key=_auxfun)[0]
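# Examples (illustrative):
# >>> mostCommon(['b', 'a', 'b', 'c'])
# 'b'
# >>> mostCommon([1, 2, 2, 1]) # tie: the element appearing earliest wins
# 1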
746
747 #########################
748 # sequence section
749 #########################
750
751 class LCSS(object):
752 '''Class that keeps the LCSS parameters
753 and puts together the various computations
754
755 the methods with names starting with _ are not to be shadowed
756 in child classes, who will shadow the other methods,
757 ie compute and computeXX methods'''
758 def __init__(self, similarityFunc = None, metric = None, epsilon = None, delta = float('inf'), aligned = False, lengthFunc = min):
759 '''One should provide either a similarity function
760 that indicates (returns a bool) whether elements in the compared lists are similar
761
762 eg distance(p1, p2) < epsilon
763
764 or a type of metric usable in scipy.spatial.distance.cdist with an epsilon'''
765 if similarityFunc is None and metric is None:
766 print("No way to compute LCSS, similarityFunc and metric are None. Exiting")
767 import sys
768 sys.exit()
769 elif metric is not None and epsilon is None:
770 print("Please provide a value for epsilon if using a cdist metric. Exiting")
771 import sys
772 sys.exit()
773 else:
774 if similarityFunc is None and metric is not None and not isinf(delta):
775 print('Warning: you are using a cdist metric and a finite delta, which will probably make computation slower than using the equivalent similarityFunc (since all pairwise distances will be computed by cdist).')
776 self.similarityFunc = similarityFunc
777 self.metric = metric
778 self.epsilon = epsilon
779 self.aligned = aligned
780 self.delta = delta
781 self.lengthFunc = lengthFunc
782 self.subSequenceIndices = [(0,0)]
783
784 def similarities(self, l1, l2, jshift=0):
785 n1 = len(l1)
786 n2 = len(l2)
787 self.similarityTable = zeros((n1+1,n2+1), dtype = npint)
788 if self.similarityFunc is not None:
789 for i in range(1,n1+1):
790 for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1):
791 if self.similarityFunc(l1[i-1], l2[j-1]):
792 self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1
793 else:
794 self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1])
795 elif self.metric is not None:
796 similarElements = distance.cdist(l1, l2, self.metric) <= self.epsilon
797 for i in range(1,n1+1):
798 for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1):
799 if similarElements[i-1, j-1]:
800 self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1
801 else:
802 self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1])
803
804
805 def subSequence(self, i, j):
806 '''Returns the subsequence of two sequences
807 http://en.wikipedia.org/wiki/Longest_common_subsequence_problem'''
808 if i == 0 or j == 0:
809 return []
810 elif self.similarityTable[i][j] == self.similarityTable[i][j-1]:
811 return self.subSequence(i, j-1)
812 elif self.similarityTable[i][j] == self.similarityTable[i-1][j]:
813 return self.subSequence(i-1, j)
814 else:
815 return self.subSequence(i-1, j-1) + [(i-1,j-1)]
816
817 def _compute(self, _l1, _l2, computeSubSequence = False):
818 '''returns the longest common subsequence similarity
819 l1 and l2 should be in the right format
820 eg lists of tuple points for cdist
821 or elements that can be compared using similarityFunc
822
823 if aligned, returns the best matching if using a finite delta by shifting the series alignments
824 '''
825 if len(_l2) < len(_l1): # l1 is the shortest
826 l1 = _l2
827 l2 = _l1
828 revertIndices = True
829 else:
830 l1 = _l1
831 l2 = _l2
832 revertIndices = False
833 n1 = len(l1)
834 n2 = len(l2)
835
836 if self.aligned:
837 lcssValues = {}
838 similarityTables = {}
839 for i in range(-n2-self.delta+1, n1+self.delta): # interval such that [i-shift-delta, i-shift+delta] is never empty, which happens when i-shift+delta < 1 or when i-shift-delta > n2
840 self.similarities(l1, l2, i)
841 lcssValues[i] = self.similarityTable.max()
842 similarityTables[i] = self.similarityTable
843 #print self.similarityTable
844 alignmentShift = argmaxDict(lcssValues) # ideally get the medium alignment shift, the one that minimizes distance
845 self.similarityTable = similarityTables[alignmentShift]
846 else:
847 alignmentShift = 0
848 self.similarities(l1, l2)
849
850 # threshold values for the useful part of the similarity table are n2-n1-delta and n1-n2-delta
851 self.similarityTable = self.similarityTable[:min(n1, n2+alignmentShift+self.delta)+1, :min(n2, n1-alignmentShift+self.delta)+1]
852
853 if computeSubSequence:
854 self.subSequenceIndices = self.subSequence(self.similarityTable.shape[0]-1, self.similarityTable.shape[1]-1)
855 if revertIndices:
856 self.subSequenceIndices = [(j,i) for i,j in self.subSequenceIndices]
857 return self.similarityTable[-1,-1]
858
859 def compute(self, l1, l2, computeSubSequence = False):
860 '''compute methods are to be shadowed in child classes'''
861 return self._compute(l1, l2, computeSubSequence)
862
863 def computeAlignment(self):
864 return mean([j-i for i,j in self.subSequenceIndices])
865
866 def _computeNormalized(self, l1, l2, computeSubSequence = False):
867 ''' compute the normalized LCSS
868 ie, the LCSS divided by the min or mean of the indicator lengths (using lengthFunc)
869 eg lengthFunc = lambda x,y: float(x+y)/2 for the mean'''
870 return float(self._compute(l1, l2, computeSubSequence))/self.lengthFunc(len(l1), len(l2))
871
872 def computeNormalized(self, l1, l2, computeSubSequence = False):
873 return self._computeNormalized(l1, l2, computeSubSequence)
874
875 def _computeDistance(self, l1, l2, computeSubSequence = False):
876 ''' compute the LCSS distance'''
877 return 1-self._computeNormalized(l1, l2, computeSubSequence)
878
879 def computeDistance(self, l1, l2, computeSubSequence = False):
880 return self._computeDistance(l1, l2, computeSubSequence)
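# Minimal usage sketch (illustrative similarity threshold): two elements are deemed similar
# if they differ by at most 1, so [1, 2, 4] matches [1, 2, 3, 4] on 3 elements.
# >>> lcss = LCSS(similarityFunc = lambda x, y: abs(x-y) <= 1)
# >>> lcss.compute([1, 2, 3, 4], [1, 2, 4])
# 3
# >>> lcss.computeNormalized([1, 2, 3, 4], [1, 2, 4])
# 1.0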
881
882 #########################
883 # plotting section
884 #########################
885
886 def plotPolygon(poly, options = '', **kwargs):
887 'Plots shapely polygon poly'
888 from matplotlib.pyplot import plot
889 x,y = poly.exterior.xy
890 plot(x, y, options, **kwargs)
891
892 def stepPlot(X, firstX, lastX, initialCount = 0, increment = 1):
893 '''for each value in X, increment by increment the initial count
894 returns the lists that can be plotted
895 to obtain a step plot increasing by one for each value in x, from first to last value
896 firstX and lastX should be respectively smaller and larger than all elements in X'''
897
898 sortedX = []
899 counts = [initialCount]
900 for x in sorted(X):
901     sortedX += [x,x]
902     counts.append(counts[-1])
903     counts.append(counts[-1]+increment)
904 counts.append(counts[-1])
905 return [firstX]+sortedX+[lastX], counts
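# Example (illustrative): two events at x = 1 and x = 3, plotted from 0 to 5,
# yield a step function increasing by 1 at each event.
# >>> stepPlot([1, 3], 0, 5)
# ([0, 1, 1, 3, 3, 5], [0, 0, 1, 1, 2, 2])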
906
907 class PlottingPropertyValues(object):
908 def __init__(self, values):
909 self.values = values
910
911 def __getitem__(self, i):
912 return self.values[i%len(self.values)]
913
914 markers = PlottingPropertyValues(['+', '*', ',', '.', 'x', 'D', 's', 'o'])
915 scatterMarkers = PlottingPropertyValues(['s','o','^','>','v','<','d','p','h','8','+','x'])
916
917 linestyles = PlottingPropertyValues(['-', '--', '-.', ':'])
918
919 colors = PlottingPropertyValues('brgmyck') # 'w'
920
921 def monochromeCycler(withMarker = False):
922 from cycler import cycler
923 if withMarker:
924 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.']) * cycler('marker', ['^',',', '.']))
925 else:
926 monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.']))
927 plt.rc('axes', prop_cycle=monochrome)
928
929 def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1):
930 from matplotlib.pyplot import pcolor
931 coords = array(list(indicatorMap.keys()))
932 minX = min(coords[:,0])
933 minY = min(coords[:,1])
934 X = arange(minX, max(coords[:,0])+1.1)*squareSize
935 Y = arange(minY, max(coords[:,1])+1.1)*squareSize
936 C = defaultValue*ones((len(Y), len(X)))
937 for k,v in indicatorMap.items():
938 C[k[1]-minY,k[0]-minX] = v
939 if masked:
940 pcolor(X, Y, ma.masked_where(C==defaultValue,C))
941 else:
942 pcolor(X, Y, C)
943
944 #########################
945 # Data download
946 #########################
947
948 def downloadECWeather(stationID, years, months = [], outputDirectoryname = '.', english = True):
949 '''Downloads monthly weather data from Environment Canada
950 If months are provided (numbers 1 to 12), hourly data is downloaded for each whole month
951 Otherwise, daily data is downloaded for the whole year
952
953 Example: MONTREAL MCTAVISH 10761
954 MONTREAL PIERRE ELLIOTT TRUDEAU INTL A 5415
955 see ftp://client_climate@ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv
956
957 To get daily data for 2010 and 2011, downloadECWeather(10761, [2010,2011], [], '/tmp')
958 To get hourly data for 2009 and 2012, January, March and October, downloadECWeather(10761, [2009,2012], [1,3,10], '/tmp')
959
960 for annee in `seq 2016 2017`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&timeframe=2&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done
961 for annee in `seq 2016 2017`;do for mois in `seq 1 12`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&Month=${mois}&timeframe=1&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done;done
962 '''
963 import urllib.request
964 if english:
965 language = 'e'
966 else:
967 language = 'f'
968 if len(months) == 0:
969 timeFrame = 2
970 months = [1]
971 else:
972 timeFrame = 1
973
974 for year in years:
975 for month in months:
976 outFilename = '{}/{}-{}'.format(outputDirectoryname, stationID, year)
977 if timeFrame == 1:
978 outFilename += '-{}-hourly'.format(month)
979 else:
980 outFilename += '-daily'
981 outFilename += '.csv'
982 url = urllib.request.urlretrieve('http://climate.weather.gc.ca/climate_data/bulk_data_{}.html?format=csv&stationID={}&Year={}&Month={}&Day=1&timeframe={}&submit=Download+Data'.format(language, stationID, year, month, timeFrame), outFilename)
983
984 #########################
985 # File I/O
986 #########################
987
988 def removeExtension(filename, delimiter = '.'):
989 '''Returns the filename minus the extension (all characters after last .)'''
990 i = filename.rfind(delimiter)
991 if i>0:
992 return filename[:i]
993 else:
994 return filename
995
996 def getExtension(filename, delimiter = '.'):
997 '''Returns the extension of the filename (all characters after last .)'''
998 i = filename.rfind(delimiter)
999 if i>0:
1000 return filename[i+1:]
1001 else:
1002 return ''
1003
1004 def cleanFilename(s):
1005 'cleans filenames obtained when concatenating figure characteristics'
1006 return s.replace(' ','-').replace('.','').replace('/','-').replace(',','')
1007
1008 def getRelativeFilename(parentPath, filename):
1009 'Returns filename if absolute, otherwise parentPath/filename as string'
1010 filePath = Path(filename)
1011 if filePath.is_absolute():
1012 return filename
1013 else:
1014 return str(parentPath/filePath)
1015
1016 def listfiles(dirname, extension, remove = False):
1017 '''Returns the list of files with the extension in the directory dirname
1018 If remove is True, the filenames are stripped of the extension'''
1019 d = Path(dirname)
1020 if d.is_dir():
1021 tmp = [str(f) for f in d.glob('*.'+extension)]
1022 if remove:
1023 return [removeExtension(f) for f in tmp]
1024 else:
1025 return tmp
1026 else:
1027 print(dirname+' is not a directory')
1028 return []
1029
1030 def mkdir(dirname):
1031 'Creates a directory if it does not exist'
1032 p = Path(dirname)
1033 if not p.exists():
1034 p.mkdir()
1035 else:
1036 print(dirname+' already exists')
1037
1038 def removeFile(filename):
1039 '''Deletes the file while avoiding raising an error
1040 if the file does not exist'''
1041 f = Path(filename)
1042 if (f.exists()):
1043 f.unlink()
1044 else:
1045 print(filename+' does not exist')
1046
1047 def line2Floats(l, separator=' '):
1048 '''Returns the list of floats corresponding to the string'''
1049 return [float(x) for x in l.split(separator)]
1050
1051 def line2Ints(l, separator=' '):
1052 '''Returns the list of ints corresponding to the string'''
1053 return [int(x) for x in l.split(separator)]
1054
1055 #########################
1056 # Profiling
1057 #########################
1058
1059 def analyzeProfile(profileFilename, stripDirs = True):
1060 '''Analyze the file produced by cProfile
1061
1062 obtained by for example:
1063 - call in script (for main() function in script)
1064 import cProfile, os
1065 cProfile.run('main()', os.path.join(os.getcwd(),'main.profile'))
1066
1067 - or on the command line:
1068 python -m cProfile [-o profile.bin] [-s sort] scriptfile [arg]'''
1069 import pstats, os
1070 p = pstats.Stats(os.path.join(os.pardir, profileFilename))
1071 if stripDirs:
1072 p.strip_dirs()
1073 p.sort_stats('time')
1074 p.print_stats(.2)
1075 #p.sort_stats('time')
1076 # p.print_callees(.1, 'int_prediction.py:')
1077 return p
1078
1079 #########################
1080 # running tests
1081 #########################
1082
1083 if __name__ == "__main__":
1084 import doctest
1085 import unittest
1086 suite = doctest.DocFileSuite('tests/utils.txt')
1087 #suite = doctest.DocTestSuite()
1088 unittest.TextTestRunner().run(suite)
1089 #doctest.testmod()
1090 #doctest.testfile("example.txt")