diff trafficintelligence/utils.py @ 1028:cc5cb04b04b0
major update using the trafficintelligence package name and install through pip
author: Nicolas Saunier <nicolas.saunier@polymtl.ca>
date: Fri, 15 Jun 2018 11:19:10 -0400
parents: python/utils.py@a13f47c8931d
children: c6cf75a2ed08
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trafficintelligence/utils.py	Fri Jun 15 11:19:10 2018 -0400
@@ -0,0 +1,1090 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+'''Generic utilities.'''
+
+import matplotlib.pyplot as plt
+from datetime import time, datetime
+from argparse import ArgumentTypeError
+from pathlib import Path
+from math import sqrt, ceil, floor
+from scipy.stats import rv_continuous, kruskal, shapiro, lognorm
+from scipy.spatial import distance
+from scipy.sparse import dok_matrix
+from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, NaN, ma, isinf, savez, load as npload, log
+
+
+datetimeFormat = "%Y-%m-%d %H:%M:%S"
+
+sjcamDatetimeFormat = "%Y_%m%d_%H%M%S" # 2017_0626_143720
+
+#########################
+# Strings
+#########################
+
+def upperCaseFirstLetter(s):
+    words = s.split(' ')
+    lowerWords = [w[0].upper()+w[1:].lower() for w in words]
+    return ' '.join(lowerWords)
+
+class TimeConverter:
+    def __init__(self, datetimeFormat = datetimeFormat):
+        self.datetimeFormat = datetimeFormat
+
+    def convert(self, s):
+        try:
+            return datetime.strptime(s, self.datetimeFormat)
+        except ValueError:
+            msg = "Not a valid date: '{0}'.".format(s)
+            raise ArgumentTypeError(msg)
+
+#########################
+# Enumerations
+#########################
+
+def inverseEnumeration(l):
+    'Returns the dictionary that provides for each element in the input list its index in the input list'
+    result = {}
+    for i,x in enumerate(l):
+        result[x] = i
+    return result
+
+#########################
+# Simple statistics
+#########################
+
+def logNormalMeanVar(loc, scale):
+    '''location and scale are respectively the mean and standard deviation of the normal in the log-normal distribution
+    https://en.wikipedia.org/wiki/Log-normal_distribution
+
+    same as lognorm.stats(scale, 0, exp(loc))'''
+    mean = exp(loc+(scale**2)/2)
+    var = (exp(scale**2)-1)*exp(2*loc+scale**2)
+    return mean, var
+
+def fitLogNormal(x):
+    'returns the fitted location and scale of the lognormal (general definition)'
+    shape, loc, scale = lognorm.fit(x, floc=0.)
+    return log(scale), shape
+
+def sampleSize(stdev, tolerance, percentConfidence, nRoundingDigits = None, printLatex = False):
+    from scipy.stats.distributions import norm
+    if nRoundingDigits is None:
+        k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) # 1.-(100-percentConfidence)/200.
+    else:
+        k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), nRoundingDigits)
+        stdev = round(stdev, nRoundingDigits)
+        tolerance = round(tolerance, nRoundingDigits)
+    if printLatex:
+        print('$z_{{{}}}^2\\frac{{s^2}}{{e^2}}={}^2\\frac{{{}^2}}{{{}^2}}$'.format(0.5+percentConfidence/200., k, stdev, tolerance))
+    return (k*stdev/tolerance)**2
+
+def confidenceInterval(mean, stdev, nSamples, percentConfidence, trueStd = True, printLatex = False):
+    '''if trueStd, use normal distribution, otherwise, Student
+
+    Use otherwise t.interval or norm.interval for the boundaries
+    ex: norm.interval(0.95)
+    t.interval(0.95, nSamples-1)'''
+    from scipy.stats.distributions import norm, t
+    if trueStd:
+        k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2)
+    else: # use Student
+        k = round(t.ppf(0.5+percentConfidence/200., nSamples-1), 2)
+    e = k*stdev/sqrt(nSamples)
+    if printLatex:
+        print('${0} \\pm {1}\\frac{{{2}}}{{\\sqrt{{{3}}}}}$'.format(mean, k, stdev, nSamples))
+    return mean-e, mean+e
+
+def computeChi2(expected, observed):
+    '''Returns the Chi2 statistics'''
+    return sum([((e-o)*(e-o))/float(e) for e, o in zip(expected, observed)])
+
+class generateDistribution(rv_continuous):
+    def __init__(self, values, probabilities):
+        '''The values (and corresponding probabilities) are supposed to be sorted by value
+        for v, p in zip(values, probabilities): P(X<=v)=p'''
+        assert probabilities[0]==0
+        self.values = values
+        self.probabilities = probabilities
+
+    def _cdf(self, x):
+        if x < self.values[0]:
+            return self.probabilities[0]
+        else:
+            i=0
+            while i+1<len(self.values) and self.values[i+1] < x:
+                i += 1
+            if i == len(self.values)-1:
+                return self.probabilities[-1]
+            else: # linear interpolation of the cdf between the two bracketing values
+                return self.probabilities[i]+(self.probabilities[i+1]-self.probabilities[i])*(x-self.values[i])/(self.values[i+1]-self.values[i])
+
+class DistributionSample(object):
+    def nSamples(self):
+        return sum(self.counts)
+
+def cumulativeDensityFunction(sample, normalized = False):
+    '''Returns the cumulative density function of the sample of a random variable'''
+    xaxis = sorted(sample)
+    counts = arange(1, len(sample)+1, dtype = float) # float dtype so that normalization can divide in place
+    if normalized:
+        counts /= float(len(sample))
+    return xaxis, counts
+
+class DiscreteDistributionSample(DistributionSample):
+    '''Class to represent a sample of a distribution for a discrete random variable'''
+    def __init__(self, categories, counts):
+        self.categories = categories
+        self.counts = counts
+
+    def mean(self):
+        result = [float(x*y) for x,y in zip(self.categories, self.counts)]
+        return npsum(result)/self.nSamples()
+
+    def var(self, mean = None):
+        if mean is None:
+            m = self.mean()
+        else:
+            m = mean
+        squares = [float((x-m)*(x-m)*y) for x,y in zip(self.categories, self.counts)]
+        return npsum(squares)/(self.nSamples()-1)
+
+    def referenceCounts(self, probability):
+        '''probability is a function that returns the probability of the random variable for the category values'''
+        refProba = [probability(c) for c in self.categories]
+        refProba[-1] = 1-npsum(refProba[:-1])
+        refCounts = [r*self.nSamples() for r in refProba]
+        return refCounts, refProba
+
+class ContinuousDistributionSample(DistributionSample):
+    '''Class to represent a sample of a distribution for a continuous random variable
+    with the number of observations for each interval
+    intervals (categories variable) are defined by their left limits, the last one being the right limit
+    categories therefore contain one more element than the counts'''
+    def __init__(self, categories, counts):
+        # todo add samples for initialization and everything to None? (or setSamples?)
+        self.categories = categories
+        self.counts = counts
+
+    @staticmethod
+    def generate(sample, categories):
+        if min(sample) < min(categories):
+            print('Sample has lower min than proposed categories ({}, {})'.format(min(sample), min(categories)))
+        if max(sample) > max(categories):
+            print('Sample has higher max than proposed categories ({}, {})'.format(max(sample), max(categories)))
+        dist = ContinuousDistributionSample(sorted(categories), [0]*(len(categories)-1))
+        for s in sample:
+            i = 0
+            while i<len(dist.categories) and dist.categories[i] <= s:
+                i += 1
+            if i <= len(dist.counts):
+                dist.counts[i-1] += 1
+                #print('{} in {} {}'.format(s, dist.categories[i-1], dist.categories[i]))
+            else:
+                print('Element {} is not in the categories'.format(s))
+        return dist
+
+    def mean(self):
+        result = 0.
+        for i in range(len(self.counts)): # use the interval midpoints, weighted by the counts
+            result += self.counts[i]*(self.categories[i]+self.categories[i+1])/2
+        return result/self.nSamples()
+
+    def var(self, mean = None):
+        if mean is None:
+            m = self.mean()
+        else:
+            m = mean
+        result = 0.
+        for i in range(len(self.counts)):
+            mid = (self.categories[i]+self.categories[i+1])/2
+            result += self.counts[i]*(mid - m)*(mid - m)
+        return result/(self.nSamples()-1)
+
+    def referenceCounts(self, cdf):
+        '''cdf is a cumulative distribution function
+        returning the probability of the variable being less than x'''
+        # refCumulativeCounts = [0]#[cdf(self.categories[0][0])]
+#        for inter in self.categories:
+#            refCumulativeCounts.append(cdf(inter[1]))
+        refCumulativeCounts = [cdf(x) for x in self.categories[1:-1]]
+
+        refProba = [refCumulativeCounts[0]]
+        for i in range(1, len(refCumulativeCounts)):
+            refProba.append(refCumulativeCounts[i]-refCumulativeCounts[i-1])
+        refProba.append(1-refCumulativeCounts[-1])
+        refCounts = [p*self.nSamples() for p in refProba]
+
+        return refCounts, refProba
+
+    def printReferenceCounts(self, refCounts=None):
+        if refCounts is not None:
+            ref = refCounts
+        else:
+            ref = self.referenceCounts
+        for i in range(len(ref[0])):
+            print('{0}-{1} & {2:0.3} & {3:0.3} \\\\'.format(self.categories[i],self.categories[i+1],ref[1][i], ref[0][i]))
+
+
+#########################
+# maths section
+#########################
+
+# def kernelSmoothing(sampleX, X, Y, weightFunc, halfwidth):
+#     '''Returns a smoothed weighted version of Y at the predefined values of sampleX
+#     Sum_x weight(sample_x,x) * y(x)'''
+#     from numpy import zeros, array
+#     smoothed = zeros(len(sampleX))
+#     for i,x in enumerate(sampleX):
+#         weights = array([weightFunc(x,xx, halfwidth) for xx in X])
+#         if sum(weights)>0:
+#             smoothed[i] = sum(weights*Y)/sum(weights)
+#         else:
+#             smoothed[i] = 0
+#     return smoothed
+
+def kernelSmoothing(x, X, Y, weightFunc, halfwidth):
+    '''Returns the smoothed estimate of (X,Y) at x
+    Sum_x weight(sample_x,x) * y(x)'''
+    weights = array([weightFunc(x, observedx, halfwidth) for observedx in X])
+    if sum(weights)>0:
+        return sum(weights*Y)/sum(weights)
+    else:
+        return 0
+
+def uniform(center, x, halfwidth):
+    if abs(center-x)<halfwidth:
+        return 1.
+    else:
+        return 0.
+
+def gaussian(center, x, halfwidth):
+    return exp(-((center-x)/halfwidth)**2/2)
+
+def epanechnikov(center, x, halfwidth):
+    diff = abs(center-x)
+    if diff<halfwidth:
+        return 1.-(diff/halfwidth)**2
+    else:
+        return 0.
+
+def triangular(center, x, halfwidth):
+    diff = abs(center-x)
+    if diff<halfwidth:
+        return 1.-abs(diff/halfwidth)
+    else:
+        return 0.
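A minimal usage sketch of the kernel smoothing helpers above (the data values are made up for illustration):

    from trafficintelligence import utils
    X = [0., 1., 2., 3., 4., 5.]
    Y = [0.2, 0.9, 2.1, 2.9, 4.2, 5.1]
    # smoothed estimate at x=2.5, neighbours weighted by a Gaussian kernel
    print(utils.kernelSmoothing(2.5, X, Y, utils.gaussian, 1.))
    # the same with the Epanechnikov kernel (zero weight beyond the halfwidth)
    print(utils.kernelSmoothing(2.5, X, Y, utils.epanechnikov, 1.))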
+
+def medianSmoothing(x, X, Y, halfwidth):
+    '''Returns the median of the Y's corresponding to the X's in the interval [x-halfwidth, x+halfwidth]'''
+    return median([y for observedx, y in zip(X,Y) if abs(x-observedx)<halfwidth])
+
+def argmaxDict(d):
+    return max(d, key=d.get)
+
+def deltaFrames(t1, t2, frameRate):
+    '''Returns the number of frames between t1 and t2
+    positive if t1<=t2, negative otherwise'''
+    return int((t2-t1).total_seconds()*frameRate) # total_seconds is negative if t1 > t2 and takes days into account
+
+def framesToTime(nFrames, frameRate, initialTime = time()):
+    '''returns a datetime.time for the time in hour, minutes and seconds
+    initialTime is a datetime.time'''
+    seconds = int(floor(float(nFrames)/float(frameRate))+initialTime.hour*3600+initialTime.minute*60+initialTime.second)
+    h = int(floor(seconds/3600.))
+    seconds = seconds - h*3600
+    m = int(floor(seconds/60))
+    seconds = seconds - m*60
+    return time(h, m, seconds)
+
+def timeToFrames(t, frameRate):
+    return frameRate*(t.hour*3600+t.minute*60+t.second)
+
+def sortXY(X, Y):
+    'returns the pairs (x, Y(x)) sorted on X'
+    D = {}
+    for x, y in zip(X,Y):
+        D[x]=y
+    xsorted = sorted(D.keys())
+    return xsorted, [D[x] for x in xsorted]
+
+def compareLengthForSort(i, j):
+    # Python 2 cmp-style comparator, kept for backward compatibility
+    if len(i) < len(j):
+        return -1
+    elif len(i) == len(j):
+        return 0
+    else:
+        return 1
+
+def sortByLength(instances, reverse = False):
+    '''Returns a new list with the instances sorted by length (method __len__)
+    reverse is passed to sorted'''
+    return sorted(instances, key = len, reverse = reverse)
+
+def ceilDecimals(v, nDecimals):
+    '''Rounds the number up at the nth decimal
+    eg 1.23 at 0 decimal is 2, at 1 decimal is 1.3'''
+    tens = 10**nDecimals
+    return ceil(v*tens)/tens
+
+def inBetween(bound1, bound2, x):
+    'useful if one does not know the order of bound1/bound2'
+    return bound1 <= x <= bound2 or bound2 <= x <= bound1
+
+def pointDistanceL2(x1, y1, x2, y2):
+    '''Compute point-to-point distance (L2 norm, ie Euclidean distance)'''
+    return sqrt((x2-x1)**2+(y2-y1)**2)
+
+def crossProduct(l1, l2):
+    return l1[0]*l2[1]-l1[1]*l2[0]
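A minimal sketch of the frame/time conversion helpers, assuming a 30 fps video (the times are made up):

    from datetime import datetime, time
    from trafficintelligence import utils
    f = utils.timeToFrames(time(14, 30), 30)  # frame number of 14:30:00 counted from midnight
    print(f)                                  # 1566000
    print(utils.framesToTime(f, 30))          # 14:30:00
    # number of frames between two timestamps one minute apart
    print(utils.deltaFrames(datetime(2018, 6, 15, 14, 30), datetime(2018, 6, 15, 14, 31), 30))  # 1800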
+
+def cat_mvgavg(cat_list, halfWidth):
+    '''Returns a list of categories/values smoothed according to a window,
+    replacing each value by the most frequent value in its window
+    halfWidth is the search radius on either side'''
+    from copy import deepcopy
+    smoothed = deepcopy(cat_list)
+    for point in range(len(cat_list)):
+        lower_bound_check = max(0, point-halfWidth)
+        upper_bound_check = min(len(cat_list), point+halfWidth+1)
+        window_values = cat_list[lower_bound_check:upper_bound_check]
+        smoothed[point] = max(set(window_values), key=window_values.count)
+    return smoothed
+
+def filterMovingWindow(inputSignal, halfWidth):
+    '''Returns an array obtained after the smoothing of the input by a moving average
+    The first and last points are copied from the original.'''
+    width = halfWidth*2+1
+    win = ones(width, 'd')
+    result = convolve(win/width, array(inputSignal), 'same')
+    result[:halfWidth] = inputSignal[:halfWidth]
+    result[-halfWidth:] = inputSignal[-halfWidth:]
+    return result
+
+def linearRegression(x, y, deg = 1, plotData = False):
+    '''returns the least square estimation of the linear regression of y = ax+b
+    as well as the plot'''
+    from numpy.lib.polynomial import polyfit
+    from numpy.core.multiarray import arange
+    coef = polyfit(x, y, deg)
+    if plotData:
+        def poly(x):
+            result = 0
+            for i in range(len(coef)):
+                result += coef[i]*x**(len(coef)-i-1)
+            return result
+        plt.plot(x, y, 'x')
+        xx = arange(min(x), max(x), (max(x)-min(x))/1000)
+        plt.plot(xx, [poly(z) for z in xx])
+    return coef
+
+def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
+    '''Computes (and displays) the correlation matrix for a pandas DataFrame'''
+    columns = data.columns.tolist()
+    for var in data.columns:
+        uniqueValues = data[var].unique()
+        if len(uniqueValues) == 1 or data.dtypes[var] == dtype('O') or (len(uniqueValues) == 2 and len(data.loc[~isnan(data[var]), var].unique()) == 1): # last condition: only one other value than nan
+            columns.remove(var)
+    c = data[columns].corr(correlationMethod)
+    if plotFigure:
+        fig = plt.figure(figsize=(4+0.4*c.shape[0], 0.4*c.shape[0]))
+        fig.add_subplot(1,1,1)
+        #plt.imshow(np.fabs(c), interpolation='none')
+        plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
+        if displayNames is not None:
+            colnames = [displayNames.get(s.strip(), s.strip()) for s in columns]
+        else:
+            colnames = columns
+        #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
+        plt.xticks(range(len(colnames)), colnames, rotation=90)
+        plt.yticks(range(len(colnames)), colnames)
+        plt.tick_params('both', length=0)
+        plt.subplots_adjust(bottom = 0.29)
+        plt.colorbar()
+        plt.title('Correlation ({})'.format(correlationMethod))
+        plt.tight_layout()
+        if len(colnames) > 50:
+            plt.subplots_adjust(left=.06)
+        if figureFilename is not None:
+            plt.savefig(figureFilename, dpi = 150, transparent = True)
+    return c
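A minimal sketch of correlation() on a small pandas DataFrame (column names and values are made up):

    from pandas import DataFrame
    from trafficintelligence import utils
    data = DataFrame({'speed': [30.1, 28.4, 35.2, 31.0, 29.5],
                      'count': [12, 15, 9, 11, 14]})
    print(utils.correlation(data, 'pearson'))  # 2x2 correlation matrix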
+
+def addDummies(data, variables, allVariables = True):
+    '''Add binary dummy variables for each value of a nominal variable
+    in a pandas DataFrame'''
+    newVariables = []
+    for var in variables:
+        if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
+            values = data[var].unique()
+            if not allVariables:
+                values = values[:-1]
+            for val in values:
+                if val is not NaN:
+                    newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
+                    data[newVariable] = (data[var] == val)
+                    newVariables.append(newVariable)
+    return newVariables
+
+def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = ''):
+    '''Studies the influence of the (nominal) independent variable over the dependent variable
+
+    Tests whether the conditional distributions are normal
+    using the Shapiro-Wilk test (in which case ANOVA could be used)
+    and applies the non-parametric Kruskal-Wallis test'''
+    tmp = data[data[independentVariable].notnull()]
+    independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
+    if len(independentVariableValues) >= 2:
+        if saveLatex:
+            from trafficintelligence.storage import openCheck
+            out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w')
+        for x in independentVariableValues:
+            print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable, independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable])))
+            if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3:
+                print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]))
+        if plotFigure:
+            plt.figure()
+            plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
+            plt.xticks(range(1, len(independentVariableValues)+1), independentVariableValues)
+            plt.title('{} vs {}'.format(dependentVariable, independentVariable))
+            if filenamePrefix is not None:
+                plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
+        table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort_values(['50%'], ascending = False)
+        table['count'] = table['count'].astype(int)
+        testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
+        if saveLatex:
+            out.write('\\begin{minipage}{\\linewidth}\n'
+                      +'\\centering\n'
+                      +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n'
+                      +table.to_latex(float_format = lambda x: '{:.3f}'.format(x))+'\n'
+                      +'\\end{minipage}\n'
+                      +'\\ \\vspace{0.5cm}\n')
+        else:
+            print(table)
+        return testResult
+    else:
+        return None
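A minimal sketch of kruskalWallis() on made-up data, asking whether the nominal variable 'site' influences the continuous variable 'speed' (the describe/unstack idiom targets pandas versions contemporary with this code, around 0.23):

    from pandas import DataFrame
    from trafficintelligence import utils
    data = DataFrame({'site': ['a']*5+['b']*5,
                      'speed': [30., 31., 29., 32., 30., 40., 42., 39., 41., 40.]})
    print(utils.kruskalWallis(data, 'speed', 'site'))  # Kruskal-Wallis H statistic and p-value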
+
+def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc, stdoutText = ['Removing {} (constant: {})', 'Removing {} (correlation {} with {})', 'Removing {} (no correlation: {}, p={})'], saveFiles = False, filenamePrefix = None, latexHeader = '', latexTable = None, latexFooter=''):
+    '''Removes candidate independent variables:
+    - if two independent variables are correlated (> maxCorrelationThreshold), one is removed
+    - if an independent variable is not correlated with the dependent variable (p > maxCorrelationP), it is removed
+    Returns the remaining non-correlated variables, correlated with the dependent variable
+
+    correlationFunc is spearmanr or pearsonr from scipy.stats
+    stdoutText is the list of templates to display for the three types of removal (see default); latexTable provides the corresponding templates when saving to a latex file
+
+    TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependent variable'''
+    from copy import copy
+    from pandas import DataFrame
+    result = copy(independentVariables)
+    table1 = ''
+    table2 = {}
+    # constant variables
+    for var in independentVariables:
+        uniqueValues = data[var].unique()
+        if (len(uniqueValues) == 1) or (len(uniqueValues) == 2 and uniqueValues.dtype != dtype('O') and len(data.loc[~isnan(data[var]), var].unique()) == 1):
+            print(stdoutText[0].format(var, uniqueValues))
+            if saveFiles:
+                table1 += latexTable[0].format(var, *uniqueValues)
+            result.remove(var)
+    # correlated variables
+    for v1 in copy(result):
+        if v1 in correlations.index:
+            for v2 in copy(result):
+                if v2 != v1 and v2 in correlations.index:
+                    if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold:
+                        if v1 in result and v2 in result:
+                            if saveFiles:
+                                table1 += latexTable[1].format(v2, v1, correlations.loc[v1, v2])
+                            print(stdoutText[1].format(v2, v1, correlations.loc[v1, v2]))
+                            result.remove(v2)
+    # not correlated with dependent variable
+    table2['Correlations'] = []
+    table2['Valeurs p'] = []
+    for var in copy(result):
+        if data.dtypes[var] != dtype('O'):
+            cor, p = correlationFunc(data[dependentVariable], data[var])
+            if p > maxCorrelationP:
+                if saveFiles:
+                    table1 += latexTable[2].format(var, cor, p)
+                print(stdoutText[2].format(var, cor, p))
+                result.remove(var)
+            else:
+                table2['Correlations'].append(cor)
+                table2['Valeurs p'].append(p)
+
+    if saveFiles:
+        from trafficintelligence.storage import openCheck
+        out = openCheck(filenamePrefix+'-removed-variables.tex', 'w')
+        out.write(latexHeader)
+        out.write(table1)
+        out.write(latexFooter)
+        out.close()
+        out = openCheck(filenamePrefix+'-correlations.html', 'w')
+        table2['Variables'] = [var for var in result if data.dtypes[var] != dtype('O')]
+        out.write(DataFrame(table2)[['Variables', 'Correlations', 'Valeurs p']].to_html(formatters = {'Correlations': lambda x: '{:.2f}'.format(x), 'Valeurs p': lambda x: '{:.3f}'.format(x)}, index = False))
+        out.close()
+    return result
+
+def saveDokMatrix(filename, m, lowerTriangle = False):
+    'Saves a dok_matrix using savez'
+    if lowerTriangle:
+        keys = [k for k in m if k[0] > k[1]] # strictly lower triangle, excluding the diagonal
+        savez(filename, shape = m.shape, keys = keys, values = [m[k[0],k[1]] for k in keys])
+    else:
+        savez(filename, shape = m.shape, keys = list(m.keys()), values = list(m.values()))
+
+def loadDokMatrix(filename):
+    'Loads a dok_matrix saved using the above saveDokMatrix'
+    data = npload(filename)
+    m = dok_matrix(tuple(data['shape']))
+    for k, v in zip(data['keys'], data['values']):
+        m[tuple(k)] = v
+    return m
+
+def aggregationFunction(funcStr, centile = 50):
+    '''returns the numpy function corresponding to funcStr
+    centile can be a list of centiles to compute at once, eg [25, 50, 75] for the 3 quartiles'''
+    if funcStr == 'median':
+        return median
+    elif funcStr == 'mean':
+        return mean
+    elif funcStr == 'centile':
+        return lambda x: percentile(x, centile)
+    elif funcStr == '85centile':
+        return lambda x: percentile(x, 85)
+    else:
+        print('Unknown aggregation method: {}'.format(funcStr))
+        return None
+
+#########################
+# regression analysis using statsmodels (and pandas)
+#########################
+
+# TODO make class for experiments?
+# TODO add tests with public dataset downloaded from Internet (IRIS et al)
+def modelString(experiment, dependentVariable, independentVariables):
+    return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]])
+
+def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'):
+    import statsmodels.formula.api as smf
+    modelStr = modelString(experiment, dependentVariable, independentVariables)
+    if regressionType == 'ols':
+        model = smf.ols(modelStr, data = data)
+    elif regressionType == 'gls':
+        model = smf.gls(modelStr, data = data)
+    elif regressionType == 'rlm':
+        model = smf.rlm(modelStr, data = data)
+    else:
+        print('Unknown regression type {}. Exiting'.format(regressionType))
+        import sys
+        sys.exit()
+    return model.fit()
+
+def runModels(experiments, data, dependentVariable, independentVariables, regressionType = 'ols'):
+    '''Runs several models and stores 3 statistics
+    adjusted R2, condition number (should be small, eg < 1000)
+    and p-value for Shapiro-Wilk test of residual normality'''
+    for i,experiment in experiments.iterrows():
+        if experiment[independentVariables].any():
+            results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = regressionType)
+            experiments.loc[i,'r2adj'] = results.rsquared_adj
+            experiments.loc[i,'condNum'] = results.condition_number
+            experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1]
+            experiments.loc[i,'nobs'] = int(results.nobs)
+    return experiments
+
+def generateExperiments(independentVariables):
+    '''Generates all possible models for including or not each independent variable'''
+    from pandas import DataFrame
+    experiments = {}
+    nIndependentVariables = len(independentVariables)
+    if nIndependentVariables != len(set(independentVariables)):
+        print("Duplicate variables. Exiting")
+        import sys
+        sys.exit()
+    nModels = 2**nIndependentVariables
+    for i,var in enumerate(independentVariables):
+        pattern = [False]*(2**i)+[True]*(2**i)
+        experiments[var] = pattern*(2**(nIndependentVariables-i-1))
+    experiments = DataFrame(experiments)
+    experiments['r2adj'] = 0.
+    experiments['condNum'] = NaN
+    experiments['shapiroP'] = -1
+    experiments['nobs'] = -1
+    return experiments
+
+def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
+    '''Generates all possible models with the independentVariables
+    and runs them, saving the results in experiments,
+    with a multiprocessing option'''
+    from pandas import concat
+    from multiprocessing import Pool
+    experiments = generateExperiments(independentVariables)
+    nModels = len(experiments)
+    print("Running {} models with {} processes".format(nModels, nProcesses))
+    print("IndependentVariables: {}".format(independentVariables))
+    if nProcesses == 1:
+        return runModels(experiments, data, dependentVariable, independentVariables, regressionType)
+    else:
+        pool = Pool(processes = nProcesses)
+        chunkSize = int(ceil(nModels/nProcesses))
+        jobs = [pool.apply_async(runModels, args = (experiments[i*chunkSize:(i+1)*chunkSize], data, dependentVariable, independentVariables, regressionType)) for i in range(nProcesses)]
+        return concat([job.get() for job in jobs])
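A minimal sketch of an exhaustive model search with findBestModel, on made-up data (requires statsmodels):

    from numpy.random import rand
    from pandas import DataFrame
    from trafficintelligence import utils
    data = DataFrame({'y': rand(30), 'x1': rand(30), 'x2': rand(30)})
    experiments = utils.findBestModel(data, 'y', ['x1', 'x2'])
    print(experiments.sort_values('r2adj', ascending = False).head())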
+
+def findBestModelFwd(data, dependentVariable, independentVariables, modelFunc, experiments = None):
+    '''Forward search for the best model (based on adjusted R2)
+    Starts with one randomly chosen variable and adds variables in random order
+    if they improve the model
+
+    The results are added to experiments if provided as argument
+    Storing in experiments relies on the index being the integer
+    obtained from the binary code derived from the independent variables'''
+    from numpy.random import permutation as nppermutation
+    if experiments is None:
+        experiments = generateExperiments(independentVariables)
+    nIndependentVariables = len(independentVariables)
+    permutation = nppermutation(list(range(nIndependentVariables)))
+    variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)}
+    print('Tested variables '+', '.join([variableMapping[i] for i in range(nIndependentVariables)]))
+    bestModel = [False]*nIndependentVariables
+    currentVarNum = 0
+    currentR2Adj = 0.
+    for currentVarNum in range(nIndependentVariables):
+        currentModel = [i for i in bestModel]
+        currentModel[currentVarNum] = True
+        rowIdx = sum([0]+[2**i for i in range(nIndependentVariables) if currentModel[permutation[i]]])
+        #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in range(nIndependentVariables) if currentModel[permutation[i]]])
+        if experiments.loc[rowIdx, 'shapiroP'] < 0:
+            modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables)
+            model = modelFunc(modelStr, data = data)
+            results = model.fit()
+            experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj
+            experiments.loc[rowIdx, 'condNum'] = results.condition_number
+            experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1]
+            experiments.loc[rowIdx, 'nobs'] = int(results.nobs)
+        if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
+            currentR2Adj = experiments.loc[rowIdx, 'r2adj']
+            bestModel[currentVarNum] = True
+    return experiments
+
+def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf', text = {'title-shapiro': 'Shapiro-Wilk normality test for residuals: {:.2f} (p={:.3f})', 'true-predicted.xlabel': 'Predicted values', 'true-predicted.ylabel': 'True values', 'residuals-predicted.xlabel': 'Predicted values', 'residuals-predicted.ylabel': 'Residuals'}):
+    '''Displays some model results
+
+    3 graphics: true vs predicted, residuals vs predicted, and a QQ-plot of the residuals'''
+    import statsmodels.api as sm
+    print(results.summary())
+    shapiroResult = shapiro(results.resid)
+    print(shapiroResult)
+    if plotFigures:
+        fig = plt.figure(figsize=(7, 6.3*(2+int(model is not None))))
+        if model is not None:
+            ax = fig.add_subplot(3,1,1)
+            plt.plot(results.predict(), model.endog, 'x')
+            x=plt.xlim()
+            y=plt.ylim()
+            plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r')
+            #plt.axis('equal')
+            if text is not None:
+                plt.title(text['title-shapiro'].format(*shapiroResult))
+                #plt.title(text['true-predicted.title'])
+                plt.xlabel(text['true-predicted.xlabel'])
+                plt.ylabel(text['true-predicted.ylabel'])
+            fig.add_subplot(3,1,2, sharex = ax)
+            plt.plot(results.predict(), results.resid, 'x')
+            nextSubplotNum = 3
+        else:
+            fig.add_subplot(2,1,1)
+            plt.plot(results.predict(), results.resid, 'x')
+            nextSubplotNum = 2
+        if text is not None:
+            if model is None:
+                plt.title(text['title-shapiro'].format(*shapiroResult))
+            plt.xlabel(text['residuals-predicted.xlabel'])
+            plt.ylabel(text['residuals-predicted.ylabel'])
+        qqAx = fig.add_subplot(nextSubplotNum,1,nextSubplotNum)
+        sm.qqplot(results.resid, fit = True, line = '45', ax = qqAx)
+        plt.axis('equal')
+        if text is not None and 'qqplot.xlabel' in text:
+            plt.xlabel(text['qqplot.xlabel'])
+            plt.ylabel(text['qqplot.ylabel'])
+        plt.tight_layout()
+    if filenamePrefix is not None:
+        from trafficintelligence.storage import openCheck
+        out = openCheck(filenamePrefix+'-coefficients.html', 'w')
+        out.write(results.summary().as_html())
+        plt.savefig(filenamePrefix+'-model-results.'+figureFileType)
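A minimal sketch of displayModelResults for a single fitted model, on made-up data (requires statsmodels; the experiment argument of runModel can be any mapping from variable name to inclusion):

    from numpy.random import rand
    from pandas import DataFrame
    from trafficintelligence import utils
    data = DataFrame({'y': rand(30), 'x1': rand(30), 'x2': rand(30)})
    results = utils.runModel({'x1': True, 'x2': True}, data, 'y', ['x1', 'x2'])
    utils.displayModelResults(results, plotFigures = False)  # prints the summary and the Shapiro-Wilk test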
+
+#########################
+# iterable section
+#########################
+
+def mostCommon(L):
+    '''Returns the most frequent element in an iterable
+
+    taken from http://stackoverflow.com/questions/1518522/python-most-common-element-in-a-list'''
+    from itertools import groupby
+    from operator import itemgetter
+    # get an iterable of (item, iterable) pairs
+    SL = sorted((x, i) for i, x in enumerate(L))
+    # print 'SL:', SL
+    groups = groupby(SL, key=itemgetter(0))
+    # auxiliary function to get "quality" for an item
+    def _auxfun(g):
+        item, iterable = g
+        count = 0
+        min_index = len(L)
+        for _, where in iterable:
+            count += 1
+            min_index = min(min_index, where)
+        # print 'item %r, count %r, minind %r' % (item, count, min_index)
+        return count, -min_index
+    # pick the highest-count/earliest item
+    return max(groups, key=_auxfun)[0]
+
+#########################
+# sequence section
+#########################
+
+class LCSS(object):
+    '''Class that keeps the LCSS parameters
+    and puts together the various computations
+
+    the methods with names starting with _ are not to be shadowed
+    in child classes, which will shadow the other methods,
+    ie the compute and computeXX methods'''
+    def __init__(self, similarityFunc = None, metric = None, epsilon = None, delta = float('inf'), aligned = False, lengthFunc = min):
+        '''One should provide either a similarity function
+        that indicates (returns a bool) whether elements in the compared lists are similar
+
+        eg distance(p1, p2) < epsilon
+
+        or a type of metric usable in scipy.spatial.distance.cdist with an epsilon'''
+        if similarityFunc is None and metric is None:
+            print("No way to compute LCSS, similarityFunc and metric are None. Exiting")
+            import sys
+            sys.exit()
+        elif metric is not None and epsilon is None:
+            print("Please provide a value for epsilon if using a cdist metric. Exiting")
+            import sys
+            sys.exit()
+        else:
+            if similarityFunc is None and metric is not None and not isinf(delta):
+                print('Warning: you are using a cdist metric and a finite delta, which will probably make computation slower than using the equivalent similarityFunc (since all pairwise distances will be computed by cdist).')
+            self.similarityFunc = similarityFunc
+            self.metric = metric
+            self.epsilon = epsilon
+            self.aligned = aligned
+            self.delta = delta
+            self.lengthFunc = lengthFunc
+            self.subSequenceIndices = [(0,0)]
+
+    def similarities(self, l1, l2, jshift=0):
+        n1 = len(l1)
+        n2 = len(l2)
+        self.similarityTable = zeros((n1+1,n2+1), dtype = npint)
+        if self.similarityFunc is not None:
+            for i in range(1,n1+1):
+                for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1):
+                    if self.similarityFunc(l1[i-1], l2[j-1]):
+                        self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1
+                    else:
+                        self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1])
+        elif self.metric is not None:
+            similarElements = distance.cdist(l1, l2, self.metric) <= self.epsilon
+            for i in range(1,n1+1):
+                for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1):
+                    if similarElements[i-1, j-1]:
+                        self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1
+                    else:
+                        self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1])
+
+    def subSequence(self, i, j):
+        '''Returns the subsequence of two sequences
+        http://en.wikipedia.org/wiki/Longest_common_subsequence_problem'''
+        if i == 0 or j == 0:
+            return []
+        elif self.similarityTable[i][j] == self.similarityTable[i][j-1]:
+            return self.subSequence(i, j-1)
+        elif self.similarityTable[i][j] == self.similarityTable[i-1][j]:
+            return self.subSequence(i-1, j)
+        else:
+            return self.subSequence(i-1, j-1) + [(i-1,j-1)]
+
+    def _compute(self, _l1, _l2, computeSubSequence = False):
+        '''returns the longest common subsequence similarity
+        l1 and l2 should be in the right format
+        eg lists of tuple points for cdist
+        or of elements that can be compared using similarityFunc
+
+        if aligned, returns the best matching (if using a finite delta) by shifting the series alignments
+        '''
+        if len(_l2) < len(_l1): # l1 is the shortest
+            l1 = _l2
+            l2 = _l1
+            revertIndices = True
+        else:
+            l1 = _l1
+            l2 = _l2
+            revertIndices = False
+        n1 = len(l1)
+        n2 = len(l2)
+
+        if self.aligned:
+            lcssValues = {}
+            similarityTables = {}
+            for i in range(-n2-self.delta+1, n1+self.delta): # interval such that [i-shift-delta, i-shift+delta] is never empty, which happens when i-shift+delta < 1 or when i-shift-delta > n2
+                self.similarities(l1, l2, i)
+                lcssValues[i] = self.similarityTable.max()
+                similarityTables[i] = self.similarityTable
+                #print self.similarityTable
+            alignmentShift = argmaxDict(lcssValues) # ideally get the medium alignment shift, the one that minimizes distance
+            self.similarityTable = similarityTables[alignmentShift]
+        else:
+            alignmentShift = 0
+            self.similarities(l1, l2)
+
+        # threshold values for the useful part of the similarity table are n2-n1-delta and n1-n2-delta
+        self.similarityTable = self.similarityTable[:min(n1, n2+alignmentShift+self.delta)+1, :min(n2, n1-alignmentShift+self.delta)+1]
+
+        if computeSubSequence:
+            self.subSequenceIndices = self.subSequence(self.similarityTable.shape[0]-1, self.similarityTable.shape[1]-1)
+            if revertIndices:
+                self.subSequenceIndices = [(j,i) for i,j in self.subSequenceIndices]
+        return self.similarityTable[-1,-1]
+
+    def compute(self, l1, l2, computeSubSequence = False):
+        '''compute methods are to be shadowed in child classes'''
+        return self._compute(l1, l2, computeSubSequence)
+
+    def computeAlignment(self):
+        return mean([j-i for i,j in self.subSequenceIndices])
+
+    def _computeNormalized(self, l1, l2, computeSubSequence = False):
+        '''computes the normalized LCSS
+        ie the LCSS divided by the min or mean of the sequence lengths (lengthFunc)
+        eg lengthFunc = lambda x,y: float(x+y)/2 for the mean'''
+        return float(self._compute(l1, l2, computeSubSequence))/self.lengthFunc(len(l1), len(l2))
+
+    def computeNormalized(self, l1, l2, computeSubSequence = False):
+        return self._computeNormalized(l1, l2, computeSubSequence)
+
+    def _computeDistance(self, l1, l2, computeSubSequence = False):
+        '''computes the LCSS distance'''
+        return 1-self._computeNormalized(l1, l2, computeSubSequence)
+
+    def computeDistance(self, l1, l2, computeSubSequence = False):
+        return self._computeDistance(l1, l2, computeSubSequence)
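A minimal sketch of LCSS similarity between two 2D trajectories, using the cdist 'euclidean' metric with a made-up epsilon:

    from trafficintelligence.utils import LCSS
    t1 = [(0., 0.), (1., 0.1), (2., 0.2), (3., 0.1)]
    t2 = [(0., 0.), (1.1, 0.), (2.1, 0.3)]
    lcss = LCSS(metric = 'euclidean', epsilon = 0.5)
    print(lcss.compute(t1, t2))            # length of the longest common subsequence
    print(lcss.computeNormalized(t1, t2))  # normalized by the shorter length (lengthFunc = min)
    print(lcss.computeDistance(t1, t2))    # 1 - normalized similarity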
+
+#########################
+# plotting section
+#########################
+
+def plotPolygon(poly, options = '', **kwargs):
+    'Plots shapely polygon poly'
+    from matplotlib.pyplot import plot
+    x,y = poly.exterior.xy
+    plot(x, y, options, **kwargs)
+
+def stepPlot(X, firstX, lastX, initialCount = 0, increment = 1):
+    '''for each value in X, increment by increment the initial count
+    returns the lists that can be plotted
+    to obtain a step plot increasing by one for each value in x, from first to last value
+    firstX and lastX should be respectively smaller and larger than all elements in X'''
+
+    sortedX = []
+    counts = [initialCount]
+    for x in sorted(X):
+        sortedX += [x,x]
+        counts.append(counts[-1])
+        counts.append(counts[-1]+increment)
+    counts.append(counts[-1])
+    return [firstX]+sortedX+[lastX], counts
+
+class PlottingPropertyValues(object):
+    def __init__(self, values):
+        self.values = values
+
+    def __getitem__(self, i):
+        return self.values[i%len(self.values)]
+
+markers = PlottingPropertyValues(['+', '*', ',', '.', 'x', 'D', 's', 'o'])
+scatterMarkers = PlottingPropertyValues(['s','o','^','>','v','<','d','p','h','8','+','x'])
+
+linestyles = PlottingPropertyValues(['-', '--', '-.', ':'])
+
+colors = PlottingPropertyValues('brgmyck') # 'w'
+
+def monochromeCycler(withMarker = False):
+    from cycler import cycler
+    if withMarker:
+        monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.']) * cycler('marker', ['^',',', '.']))
+    else:
+        monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.']))
+    plt.rc('axes', prop_cycle=monochrome)
+
+def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1):
+    from matplotlib.pyplot import pcolor
+    coords = array(list(indicatorMap.keys()))
+    minX = min(coords[:,0])
+    minY = min(coords[:,1])
+    X = arange(minX, max(coords[:,0])+1.1)*squareSize
+    Y = arange(minY, max(coords[:,1])+1.1)*squareSize
+    C = defaultValue*ones((len(Y), len(X)))
+    for k,v in indicatorMap.items():
+        C[k[1]-minY,k[0]-minX] = v
+    if masked:
+        pcolor(X, Y, ma.masked_where(C==defaultValue,C))
+    else:
+        pcolor(X, Y, C)
+
+#########################
+# Data download
+#########################
+
+def downloadECWeather(stationID, years, months = [], outputDirectoryname = '.', english = True):
+    '''Downloads monthly weather data from Environment Canada
+    If months is provided (numbers 1 to 12), hourly data are downloaded for each whole month
+    Otherwise, daily data are downloaded for each whole year
+
+    Example: MONTREAL MCTAVISH 10761
+    MONTREAL PIERRE ELLIOTT TRUDEAU INTL A 5415
+    see ftp://client_climate@ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv
+
+    To get daily data for 2010 and 2011, downloadECWeather(10761, [2010,2011], [], '/tmp')
+    To get hourly data for 2009 and 2012, January, March and October, downloadECWeather(10761, [2009,2012], [1,3,10], '/tmp')
+
+    for annee in `seq 2016 2017`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&timeframe=2&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done
+    for annee in `seq 2016 2017`;do for mois in `seq 1 12`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&Month=${mois}&timeframe=1&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done;done
+    '''
+    import urllib.request
+    if english:
+        language = 'e'
+    else:
+        language = 'f'
+    if len(months) == 0:
+        timeFrame = 2
+        months = [1]
+    else:
+        timeFrame = 1
+
+    for year in years:
+        for month in months:
+            outFilename = '{}/{}-{}'.format(outputDirectoryname, stationID, year)
+            if timeFrame == 1:
+                outFilename += '-{}-hourly'.format(month)
+            else:
+                outFilename += '-daily'
+            outFilename += '.csv'
+            url = urllib.request.urlretrieve('http://climate.weather.gc.ca/climate_data/bulk_data_{}.html?format=csv&stationID={}&Year={}&Month={}&Day=1&timeframe={}&submit=Download+Data'.format(language, stationID, year, month, timeFrame), outFilename)
+
+#########################
+# File I/O
+#########################
+
+def removeExtension(filename, delimiter = '.'):
+    '''Returns the filename minus the extension (all characters after last .)'''
+    i = filename.rfind(delimiter)
+    if i>0:
+        return filename[:i]
+    else:
+        return filename
+
+def getExtension(filename, delimiter = '.'):
+    '''Returns the extension of the filename (all characters after last .)'''
+    i = filename.rfind(delimiter)
+    if i>0:
+        return filename[i+1:]
+    else:
+        return ''
+
+def cleanFilename(s):
+    'cleans filenames obtained when concatenating figure characteristics'
+    return s.replace(' ','-').replace('.','').replace('/','-').replace(',','')
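A minimal sketch of the filename helpers (the names are made up):

    from trafficintelligence import utils
    print(utils.removeExtension('video.mp4'))             # 'video'
    print(utils.getExtension('video.mp4'))                # 'mp4'
    print(utils.cleanFilename('speed vs count, site 1'))  # 'speed-vs-count-site-1'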
+
+def getRelativeFilename(parentPath, filename):
+    'Returns filename if absolute, otherwise parentPath/filename as string'
+    filePath = Path(filename)
+    if filePath.is_absolute():
+        return filename
+    else:
+        return str(parentPath/filePath)
+
+def listfiles(dirname, extension, remove = False):
+    '''Returns the list of files with the extension in the directory dirname
+    If remove is True, the filenames are stripped from the extension'''
+    d = Path(dirname)
+    if d.is_dir():
+        tmp = [str(f) for f in d.glob('*.'+extension)]
+        if remove:
+            return [removeExtension(f) for f in tmp]
+        else:
+            return tmp
+    else:
+        print(dirname+' is not a directory')
+        return []
+
+def mkdir(dirname):
+    'Creates a directory if it does not exist'
+    p = Path(dirname)
+    if not p.exists():
+        p.mkdir()
+    else:
+        print(dirname+' already exists')
+
+def removeFile(filename):
+    '''Deletes the file while avoiding raising an error
+    if the file does not exist'''
+    f = Path(filename)
+    if f.exists():
+        f.unlink()
+    else:
+        print(filename+' does not exist')
+
+def line2Floats(l, separator=' '):
+    '''Returns the list of floats corresponding to the string'''
+    return [float(x) for x in l.split(separator)]
+
+def line2Ints(l, separator=' '):
+    '''Returns the list of ints corresponding to the string'''
+    return [int(x) for x in l.split(separator)]
+
+#########################
+# Profiling
+#########################
+
+def analyzeProfile(profileFilename, stripDirs = True):
+    '''Analyzes the file produced by cProfile
+
+    obtained by for example:
+    - call in script (for main() function in script)
+    import cProfile, os
+    cProfile.run('main()', os.path.join(os.getcwd(),'main.profile'))
+
+    - or on the command line:
+    python -m cProfile [-o profile.bin] [-s sort] scriptfile [arg]'''
+    import pstats, os
+    p = pstats.Stats(os.path.join(os.pardir, profileFilename))
+    if stripDirs:
+        p.strip_dirs()
+    p.sort_stats('time')
+    p.print_stats(.2)
+    #p.sort_stats('time')
+    # p.print_callees(.1, 'int_prediction.py:')
+    return p
+
+#########################
+# running tests
+#########################
+
+if __name__ == "__main__":
+    import doctest
+    import unittest
+    suite = doctest.DocFileSuite('tests/utils.txt')
+    #suite = doctest.DocTestSuite()
+    unittest.TextTestRunner().run(suite)
+    #doctest.testmod()
+    #doctest.testfile("example.txt")
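A minimal sketch of profiling an expression and analyzing the result with analyzeProfile; note that analyzeProfile prepends os.pardir, so the profile file is looked up one directory above the current one:

    import cProfile, os
    from trafficintelligence import utils
    cProfile.run('sorted(range(10**6), key = str)', os.path.join(os.pardir, 'test.profile'))
    p = utils.analyzeProfile('test.profile')  # prints the 20% most time-consuming calls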