diff trafficintelligence/utils.py @ 1028:cc5cb04b04b0
major update using the trafficintelligence package name and install through pip
author: Nicolas Saunier <nicolas.saunier@polymtl.ca>
date: Fri, 15 Jun 2018 11:19:10 -0400
parents: python/utils.py@a13f47c8931d
children: c6cf75a2ed08
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trafficintelligence/utils.py	Fri Jun 15 11:19:10 2018 -0400
@@ -0,0 +1,1090 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+'''Generic utilities.'''
+
+import matplotlib.pyplot as plt
+from datetime import time, datetime
+from argparse import ArgumentTypeError
+from pathlib import Path
+from math import sqrt, ceil, floor
+from scipy.stats import rv_continuous, kruskal, shapiro, lognorm
+from scipy.spatial import distance
+from scipy.sparse import dok_matrix
+from numpy import zeros, array, exp, sum as npsum, int as npint, arange, cumsum, mean, median, percentile, isnan, ones, convolve, dtype, NaN, ma, isinf, savez, load as npload, log
+
+
+datetimeFormat = "%Y-%m-%d %H:%M:%S"
+
+sjcamDatetimeFormat = "%Y_%m%d_%H%M%S" # 2017_0626_143720
+
+#########################
+# Strings
+#########################
+
+def upperCaseFirstLetter(s):
+    words = s.split(' ')
+    lowerWords = [w[0].upper()+w[1:].lower() for w in words]
+    return ' '.join(lowerWords)
+
+class TimeConverter:
+    def __init__(self, datetimeFormat = datetimeFormat):
+        self.datetimeFormat = datetimeFormat
+
+    def convert(self, s):
+        try:
+            return datetime.strptime(s, self.datetimeFormat)
+        except ValueError:
+            msg = "Not a valid date: '{0}'.".format(s)
+            raise ArgumentTypeError(msg)
+
+#########################
+# Enumerations
+#########################
+
+def inverseEnumeration(l):
+    'Returns the dictionary that provides for each element in the input list its index in the input list'
+    result = {}
+    for i,x in enumerate(l):
+        result[x] = i
+    return result
+
+#########################
+# Simple statistics
+#########################
+
+def logNormalMeanVar(loc, scale):
+    '''location and scale are respectively the mean and standard deviation of the normal in the log-normal distribution
+    https://en.wikipedia.org/wiki/Log-normal_distribution
+
+    same as lognorm.stats(scale, 0, exp(loc))'''
+    mean = exp(loc+(scale**2)/2)
+    var = (exp(scale**2)-1)*exp(2*loc+scale**2)
+    return mean, var
+
+def fitLogNormal(x):
+    'returns the fitted location and scale of the lognormal (general definition)'
+    shape, loc, scale = lognorm.fit(x, floc=0.)
+    return log(scale), shape
+
+def sampleSize(stdev, tolerance, percentConfidence, nRoundingDigits = None, printLatex = False):
+    from scipy.stats.distributions import norm
+    if nRoundingDigits is None:
+        k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2) # 1.-(100-percentConfidence)/200.
+    else:
+        k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), nRoundingDigits)
+        stdev = round(stdev, nRoundingDigits)
+        tolerance = round(tolerance, nRoundingDigits)
+    if printLatex:
+        print('$z_{{{}}}^2\\frac{{s^2}}{{e^2}}={}^2\\frac{{{}^2}}{{{}^2}}$'.format(0.5+percentConfidence/200., k, stdev, tolerance))
+    return (k*stdev/tolerance)**2
+
+def confidenceInterval(mean, stdev, nSamples, percentConfidence, trueStd = True, printLatex = False):
+    '''if trueStd, use normal distribution, otherwise, Student
+
+    Use otherwise t.interval or norm.interval for the boundaries
+    ex: norm.interval(0.95)
+    t.interval(0.95, nSamples-1)'''
+    from scipy.stats.distributions import norm, t
+    if trueStd:
+        k = round(norm.ppf(0.5+percentConfidence/200., 0, 1), 2)
+    else: # use Student
+        k = round(t.ppf(0.5+percentConfidence/200., nSamples-1), 2)
+    e = k*stdev/sqrt(nSamples)
+    if printLatex:
+        print('${0} \\pm {1}\\frac{{{2}}}{{\\sqrt{{{3}}}}}$'.format(mean, k, stdev, nSamples))
+    return mean-e, mean+e
+
+def computeChi2(expected, observed):
+    '''Returns the Chi2 statistics'''
+    return sum([((e-o)*(e-o))/float(e) for e, o in zip(expected, observed)])
+
+class generateDistribution(rv_continuous):
+    def __init__(self, values, probabilities):
+        '''The values (and corresponding probabilities) are supposed to be sorted by value
+        for v, p in zip(values, probabilities): P(X<=v)=p'''
+        assert probabilities[0]==0
+        self.values = values
+        self.probabilities = probabilities
+
+    def _cdf(self, x):
+        if x < self.values[0]:
+            return self.probabilities[0]
+        else:
+            i=0
+            while i+1<len(self.values) and self.values[i+1] < x:
+                i += 1
+            if i == len(self.values)-1:
+                return self.probabilities[-1]
+            else: # linear interpolation of the cdf between the two bracketing values
+                return self.probabilities[i]+(self.probabilities[i+1]-self.probabilities[i])*(x-self.values[i])/(self.values[i+1]-self.values[i])
+
+class DistributionSample(object):
+    def nSamples(self):
+        return sum(self.counts)
+
+def cumulativeDensityFunction(sample, normalized = False):
+    '''Returns the cumulative density function of the sample of a random variable'''
+    xaxis = sorted(sample)
+    counts = arange(1, len(sample)+1, dtype = float) # float dtype so that normalization can divide in place
+    if normalized:
+        counts /= float(len(sample))
+    return xaxis, counts
+
+class DiscreteDistributionSample(DistributionSample):
+    '''Class to represent a sample of a distribution for a discrete random variable'''
+    def __init__(self, categories, counts):
+        self.categories = categories
+        self.counts = counts
+
+    def mean(self):
+        result = [float(x*y) for x,y in zip(self.categories, self.counts)]
+        return npsum(result)/self.nSamples()
+
+    def var(self, mean = None):
+        if mean is None:
+            m = self.mean()
+        else:
+            m = mean
+        squares = [float((x-m)*(x-m)*y) for x,y in zip(self.categories, self.counts)]
+        return npsum(squares)/(self.nSamples()-1)
+
+    def referenceCounts(self, probability):
+        '''probability is a function that returns the probability of the random variable for the category values'''
+        refProba = [probability(c) for c in self.categories]
+        refProba[-1] = 1-npsum(refProba[:-1])
+        refCounts = [r*self.nSamples() for r in refProba]
+        return refCounts, refProba
+
+class ContinuousDistributionSample(DistributionSample):
+    '''Class to represent a sample of a distribution for a continuous random variable
+    with the number of observations for each interval
+    intervals (categories variable) are defined by their left limits, the last one being the right limit
+    categories therefore contain one more element than the counts'''
+    def __init__(self, categories, counts):
+        # todo add samples for initialization and everything to None? (or setSamples?)
+        self.categories = categories
+        self.counts = counts
+
+    @staticmethod
+    def generate(sample, categories):
+        if min(sample) < min(categories):
+            print('Sample has lower min than proposed categories ({}, {})'.format(min(sample), min(categories)))
+        if max(sample) > max(categories):
+            print('Sample has higher max than proposed categories ({}, {})'.format(max(sample), max(categories)))
+        dist = ContinuousDistributionSample(sorted(categories), [0]*(len(categories)-1))
+        for s in sample:
+            i = 0
+            while i<len(dist.categories) and dist.categories[i] <= s:
+                i += 1
+            if i <= len(dist.counts):
+                dist.counts[i-1] += 1
+                #print('{} in {} {}'.format(s, dist.categories[i-1], dist.categories[i]))
+            else:
+                print('Element {} is not in the categories'.format(s))
+        return dist
+
+    def mean(self):
+        result = 0.
+        for i in range(len(self.counts)): # use the interval midpoints, weighted by the counts
+            result += self.counts[i]*(self.categories[i]+self.categories[i+1])/2
+        return result/self.nSamples()
+
+    def var(self, mean = None):
+        if mean is None:
+            m = self.mean()
+        else:
+            m = mean
+        result = 0.
+        for i in range(len(self.counts)):
+            mid = (self.categories[i]+self.categories[i+1])/2
+            result += self.counts[i]*(mid - m)*(mid - m)
+        return result/(self.nSamples()-1)
+
+    def referenceCounts(self, cdf):
+        '''cdf is a cumulative distribution function
+        returning the probability of the variable being less than x'''
+        # refCumulativeCounts = [0]#[cdf(self.categories[0][0])]
+#        for inter in self.categories:
+#            refCumulativeCounts.append(cdf(inter[1]))
+        refCumulativeCounts = [cdf(x) for x in self.categories[1:-1]]
+
+        refProba = [refCumulativeCounts[0]]
+        for i in range(1, len(refCumulativeCounts)):
+            refProba.append(refCumulativeCounts[i]-refCumulativeCounts[i-1])
+        refProba.append(1-refCumulativeCounts[-1])
+        refCounts = [p*self.nSamples() for p in refProba]
+
+        return refCounts, refProba
+
+    def printReferenceCounts(self, refCounts=None):
+        if refCounts is not None:
+            ref = refCounts
+        else:
+            ref = self.referenceCounts
+        for i in range(len(ref[0])):
+            print('{0}-{1} & {2:0.3} & {3:0.3} \\\\'.format(self.categories[i],self.categories[i+1],ref[1][i], ref[0][i]))
+
+
+#########################
+# maths section
+#########################
+
+# def kernelSmoothing(sampleX, X, Y, weightFunc, halfwidth):
+#     '''Returns a smoothed weighted version of Y at the predefined values of sampleX
+#     Sum_x weight(sample_x,x) * y(x)'''
+#     from numpy import zeros, array
+#     smoothed = zeros(len(sampleX))
+#     for i,x in enumerate(sampleX):
+#         weights = array([weightFunc(x,xx, halfwidth) for xx in X])
+#         if sum(weights)>0:
+#             smoothed[i] = sum(weights*Y)/sum(weights)
+#         else:
+#             smoothed[i] = 0
+#     return smoothed
+
+def kernelSmoothing(x, X, Y, weightFunc, halfwidth):
+    '''Returns the smoothed estimate of (X,Y) at x
+    Sum_x weight(sample_x,x) * y(x)'''
+    weights = array([weightFunc(x, observedx, halfwidth) for observedx in X])
+    if sum(weights)>0:
+        return sum(weights*Y)/sum(weights)
+    else:
+        return 0
+
+def uniform(center, x, halfwidth):
+    if abs(center-x)<halfwidth:
+        return 1.
+    else:
+        return 0.
+
+def gaussian(center, x, halfwidth):
+    return exp(-((center-x)/halfwidth)**2/2)
+
+def epanechnikov(center, x, halfwidth):
+    diff = abs(center-x)
+    if diff<halfwidth:
+        return 1.-(diff/halfwidth)**2
+    else:
+        return 0.
+
+def triangular(center, x, halfwidth):
+    diff = abs(center-x)
+    if diff<halfwidth:
+        return 1.-abs(diff/halfwidth)
+    else:
+        return 0.
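A minimal usage sketch of the kernel smoothing helpers above (the data values are made up for illustration):

    from trafficintelligence import utils
    X = [0., 1., 2., 3., 4., 5.]
    Y = [0.2, 0.9, 2.1, 2.9, 4.2, 5.1]
    # smoothed estimate at x=2.5, neighbours weighted by a Gaussian kernel
    print(utils.kernelSmoothing(2.5, X, Y, utils.gaussian, 1.))
    # the same with the Epanechnikov kernel (zero weight beyond the halfwidth)
    print(utils.kernelSmoothing(2.5, X, Y, utils.epanechnikov, 1.))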
+
+def medianSmoothing(x, X, Y, halfwidth):
+    '''Returns the median of the Y's corresponding to the X's in the interval [x-halfwidth, x+halfwidth]'''
+    return median([y for observedx, y in zip(X,Y) if abs(x-observedx)<halfwidth])
+
+def argmaxDict(d):
+    return max(d, key=d.get)
+
+def deltaFrames(t1, t2, frameRate):
+    '''Returns the number of frames between t1 and t2
+    positive if t1<=t2, negative otherwise'''
+    return int((t2-t1).total_seconds()*frameRate) # total_seconds is negative if t1 > t2 and takes days into account
+
+def framesToTime(nFrames, frameRate, initialTime = time()):
+    '''returns a datetime.time for the time in hour, minutes and seconds
+    initialTime is a datetime.time'''
+    seconds = int(floor(float(nFrames)/float(frameRate))+initialTime.hour*3600+initialTime.minute*60+initialTime.second)
+    h = int(floor(seconds/3600.))
+    seconds = seconds - h*3600
+    m = int(floor(seconds/60))
+    seconds = seconds - m*60
+    return time(h, m, seconds)
+
+def timeToFrames(t, frameRate):
+    return frameRate*(t.hour*3600+t.minute*60+t.second)
+
+def sortXY(X, Y):
+    'returns the pairs (x, Y(x)) sorted on X'
+    D = {}
+    for x, y in zip(X,Y):
+        D[x]=y
+    xsorted = sorted(D.keys())
+    return xsorted, [D[x] for x in xsorted]
+
+def compareLengthForSort(i, j):
+    # Python 2 cmp-style comparator, kept for backward compatibility
+    if len(i) < len(j):
+        return -1
+    elif len(i) == len(j):
+        return 0
+    else:
+        return 1
+
+def sortByLength(instances, reverse = False):
+    '''Returns a new list with the instances sorted by length (method __len__)
+    reverse is passed to sorted'''
+    return sorted(instances, key = len, reverse = reverse)
+
+def ceilDecimals(v, nDecimals):
+    '''Rounds the number up at the nth decimal
+    eg 1.23 at 0 decimal is 2, at 1 decimal is 1.3'''
+    tens = 10**nDecimals
+    return ceil(v*tens)/tens
+
+def inBetween(bound1, bound2, x):
+    'useful if one does not know the order of bound1/bound2'
+    return bound1 <= x <= bound2 or bound2 <= x <= bound1
+
+def pointDistanceL2(x1, y1, x2, y2):
+    '''Compute point-to-point distance (L2 norm, ie Euclidean distance)'''
+    return sqrt((x2-x1)**2+(y2-y1)**2)
+
+def crossProduct(l1, l2):
+    return l1[0]*l2[1]-l1[1]*l2[0]
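A minimal sketch of the frame/time conversion helpers, assuming a 30 fps video (the times are made up):

    from datetime import datetime, time
    from trafficintelligence import utils
    f = utils.timeToFrames(time(14, 30), 30)  # frame number of 14:30:00 counted from midnight
    print(f)                                  # 1566000
    print(utils.framesToTime(f, 30))          # 14:30:00
    # number of frames between two timestamps one minute apart
    print(utils.deltaFrames(datetime(2018, 6, 15, 14, 30), datetime(2018, 6, 15, 14, 31), 30))  # 1800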
+
+def cat_mvgavg(cat_list, halfWidth):
+    '''Returns a list of categories/values smoothed according to a window,
+    replacing each value by the most frequent value in its window
+    halfWidth is the search radius on either side'''
+    from copy import deepcopy
+    smoothed = deepcopy(cat_list)
+    for point in range(len(cat_list)):
+        lower_bound_check = max(0, point-halfWidth)
+        upper_bound_check = min(len(cat_list), point+halfWidth+1)
+        window_values = cat_list[lower_bound_check:upper_bound_check]
+        smoothed[point] = max(set(window_values), key=window_values.count)
+    return smoothed
+
+def filterMovingWindow(inputSignal, halfWidth):
+    '''Returns an array obtained after the smoothing of the input by a moving average
+    The first and last points are copied from the original.'''
+    width = halfWidth*2+1
+    win = ones(width, 'd')
+    result = convolve(win/width, array(inputSignal), 'same')
+    result[:halfWidth] = inputSignal[:halfWidth]
+    result[-halfWidth:] = inputSignal[-halfWidth:]
+    return result
+
+def linearRegression(x, y, deg = 1, plotData = False):
+    '''returns the least square estimation of the linear regression of y = ax+b
+    as well as the plot'''
+    from numpy.lib.polynomial import polyfit
+    from numpy.core.multiarray import arange
+    coef = polyfit(x, y, deg)
+    if plotData:
+        def poly(x):
+            result = 0
+            for i in range(len(coef)):
+                result += coef[i]*x**(len(coef)-i-1)
+            return result
+        plt.plot(x, y, 'x')
+        xx = arange(min(x), max(x), (max(x)-min(x))/1000)
+        plt.plot(xx, [poly(z) for z in xx])
+    return coef
+
+def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
+    '''Computes (and displays) the correlation matrix for a pandas DataFrame'''
+    columns = data.columns.tolist()
+    for var in data.columns:
+        uniqueValues = data[var].unique()
+        if len(uniqueValues) == 1 or data.dtypes[var] == dtype('O') or (len(uniqueValues) == 2 and len(data.loc[~isnan(data[var]), var].unique()) == 1): # last condition: only one other value than nan
+            columns.remove(var)
+    c = data[columns].corr(correlationMethod)
+    if plotFigure:
+        fig = plt.figure(figsize=(4+0.4*c.shape[0], 0.4*c.shape[0]))
+        fig.add_subplot(1,1,1)
+        #plt.imshow(np.fabs(c), interpolation='none')
+        plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
+        if displayNames is not None:
+            colnames = [displayNames.get(s.strip(), s.strip()) for s in columns]
+        else:
+            colnames = columns
+        #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
+        plt.xticks(range(len(colnames)), colnames, rotation=90)
+        plt.yticks(range(len(colnames)), colnames)
+        plt.tick_params('both', length=0)
+        plt.subplots_adjust(bottom = 0.29)
+        plt.colorbar()
+        plt.title('Correlation ({})'.format(correlationMethod))
+        plt.tight_layout()
+        if len(colnames) > 50:
+            plt.subplots_adjust(left=.06)
+        if figureFilename is not None:
+            plt.savefig(figureFilename, dpi = 150, transparent = True)
+    return c
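A minimal sketch of correlation() on a small pandas DataFrame (column names and values are made up):

    from pandas import DataFrame
    from trafficintelligence import utils
    data = DataFrame({'speed': [30.1, 28.4, 35.2, 31.0, 29.5],
                      'count': [12, 15, 9, 11, 14]})
    print(utils.correlation(data, 'pearson'))  # 2x2 correlation matrix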
+
+def addDummies(data, variables, allVariables = True):
+    '''Add binary dummy variables for each value of a nominal variable
+    in a pandas DataFrame'''
+    newVariables = []
+    for var in variables:
+        if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
+            values = data[var].unique()
+            if not allVariables:
+                values = values[:-1]
+            for val in values:
+                if val is not NaN:
+                    newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
+                    data[newVariable] = (data[var] == val)
+                    newVariables.append(newVariable)
+    return newVariables
+
+def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, renameVariables = lambda s: s, kwCaption = ''):
+    '''Studies the influence of the (nominal) independent variable over the dependent variable
+
+    Tests whether the conditional distributions are normal
+    using the Shapiro-Wilk test (in which case ANOVA could be used)
+    and applies the non-parametric Kruskal-Wallis test'''
+    tmp = data[data[independentVariable].notnull()]
+    independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
+    if len(independentVariableValues) >= 2:
+        if saveLatex:
+            from trafficintelligence.storage import openCheck
+            out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w')
+        for x in independentVariableValues:
+            print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable, independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable])))
+            if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3:
+                print(shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]))
+        if plotFigure:
+            plt.figure()
+            plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
+            plt.xticks(range(1, len(independentVariableValues)+1), independentVariableValues)
+            plt.title('{} vs {}'.format(dependentVariable, independentVariable))
+            if filenamePrefix is not None:
+                plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
+        table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort_values(['50%'], ascending = False)
+        table['count'] = table['count'].astype(int)
+        testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
+        if saveLatex:
+            out.write('\\begin{minipage}{\\linewidth}\n'
+                      +'\\centering\n'
+                      +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n'
+                      +table.to_latex(float_format = lambda x: '{:.3f}'.format(x))+'\n'
+                      +'\\end{minipage}\n'
+                      +'\\ \\vspace{0.5cm}\n')
+        else:
+            print(table)
+        return testResult
+    else:
+        return None
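A minimal sketch of kruskalWallis() on made-up data, asking whether the nominal variable 'site' influences the continuous variable 'speed' (the describe/unstack idiom targets pandas versions contemporary with this code, around 0.23):

    from pandas import DataFrame
    from trafficintelligence import utils
    data = DataFrame({'site': ['a']*5+['b']*5,
                      'speed': [30., 31., 29., 32., 30., 40., 42., 39., 41., 40.]})
    print(utils.kruskalWallis(data, 'speed', 'site'))  # Kruskal-Wallis H statistic and p-value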
+
+def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc, stdoutText = ['Removing {} (constant: {})', 'Removing {} (correlation {} with {})', 'Removing {} (no correlation: {}, p={})'], saveFiles = False, filenamePrefix = None, latexHeader = '', latexTable = None, latexFooter=''):
+    '''Removes candidate independent variables:
+    - if two independent variables are correlated (> maxCorrelationThreshold), one is removed
+    - if an independent variable is not correlated with the dependent variable (p > maxCorrelationP), it is removed
+    Returns the remaining non-correlated variables, correlated with the dependent variable
+
+    correlationFunc is spearmanr or pearsonr from scipy.stats
+    stdoutText is the list of templates to display for the three types of removal (see default); latexTable provides the corresponding templates when saving to a latex file
+
+    TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependent variable'''
+    from copy import copy
+    from pandas import DataFrame
+    result = copy(independentVariables)
+    table1 = ''
+    table2 = {}
+    # constant variables
+    for var in independentVariables:
+        uniqueValues = data[var].unique()
+        if (len(uniqueValues) == 1) or (len(uniqueValues) == 2 and uniqueValues.dtype != dtype('O') and len(data.loc[~isnan(data[var]), var].unique()) == 1):
+            print(stdoutText[0].format(var, uniqueValues))
+            if saveFiles:
+                table1 += latexTable[0].format(var, *uniqueValues)
+            result.remove(var)
+    # correlated variables
+    for v1 in copy(result):
+        if v1 in correlations.index:
+            for v2 in copy(result):
+                if v2 != v1 and v2 in correlations.index:
+                    if abs(correlations.loc[v1, v2]) > maxCorrelationThreshold:
+                        if v1 in result and v2 in result:
+                            if saveFiles:
+                                table1 += latexTable[1].format(v2, v1, correlations.loc[v1, v2])
+                            print(stdoutText[1].format(v2, v1, correlations.loc[v1, v2]))
+                            result.remove(v2)
+    # not correlated with dependent variable
+    table2['Correlations'] = []
+    table2['Valeurs p'] = []
+    for var in copy(result):
+        if data.dtypes[var] != dtype('O'):
+            cor, p = correlationFunc(data[dependentVariable], data[var])
+            if p > maxCorrelationP:
+                if saveFiles:
+                    table1 += latexTable[2].format(var, cor, p)
+                print(stdoutText[2].format(var, cor, p))
+                result.remove(var)
+            else:
+                table2['Correlations'].append(cor)
+                table2['Valeurs p'].append(p)
+
+    if saveFiles:
+        from trafficintelligence.storage import openCheck
+        out = openCheck(filenamePrefix+'-removed-variables.tex', 'w')
+        out.write(latexHeader)
+        out.write(table1)
+        out.write(latexFooter)
+        out.close()
+        out = openCheck(filenamePrefix+'-correlations.html', 'w')
+        table2['Variables'] = [var for var in result if data.dtypes[var] != dtype('O')]
+        out.write(DataFrame(table2)[['Variables', 'Correlations', 'Valeurs p']].to_html(formatters = {'Correlations': lambda x: '{:.2f}'.format(x), 'Valeurs p': lambda x: '{:.3f}'.format(x)}, index = False))
+        out.close()
+    return result
+
+def saveDokMatrix(filename, m, lowerTriangle = False):
+    'Saves a dok_matrix using savez'
+    if lowerTriangle:
+        keys = [k for k in m if k[0] > k[1]] # strictly lower triangle, excluding the diagonal
+        savez(filename, shape = m.shape, keys = keys, values = [m[k[0],k[1]] for k in keys])
+    else:
+        savez(filename, shape = m.shape, keys = list(m.keys()), values = list(m.values()))
+
+def loadDokMatrix(filename):
+    'Loads a dok_matrix saved using the above saveDokMatrix'
+    data = npload(filename)
+    m = dok_matrix(tuple(data['shape']))
+    for k, v in zip(data['keys'], data['values']):
+        m[tuple(k)] = v
+    return m
+
+def aggregationFunction(funcStr, centile = 50):
+    '''returns the numpy function corresponding to funcStr
+    centile can be a list of centiles to compute at once, eg [25, 50, 75] for the 3 quartiles'''
+    if funcStr == 'median':
+        return median
+    elif funcStr == 'mean':
+        return mean
+    elif funcStr == 'centile':
+        return lambda x: percentile(x, centile)
+    elif funcStr == '85centile':
+        return lambda x: percentile(x, 85)
+    else:
+        print('Unknown aggregation method: {}'.format(funcStr))
+        return None
+
+#########################
+# regression analysis using statsmodels (and pandas)
+#########################
+
+# TODO make class for experiments?
+# TODO add tests with public dataset downloaded from Internet (IRIS et al)
+def modelString(experiment, dependentVariable, independentVariables):
+    return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]])
+
+def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'):
+    import statsmodels.formula.api as smf
+    modelStr = modelString(experiment, dependentVariable, independentVariables)
+    if regressionType == 'ols':
+        model = smf.ols(modelStr, data = data)
+    elif regressionType == 'gls':
+        model = smf.gls(modelStr, data = data)
+    elif regressionType == 'rlm':
+        model = smf.rlm(modelStr, data = data)
+    else:
+        print('Unknown regression type {}. Exiting'.format(regressionType))
+        import sys
+        sys.exit()
+    return model.fit()
+
+def runModels(experiments, data, dependentVariable, independentVariables, regressionType = 'ols'):
+    '''Runs several models and stores 3 statistics
+    adjusted R2, condition number (should be small, eg < 1000)
+    and p-value for Shapiro-Wilk test of residual normality'''
+    for i,experiment in experiments.iterrows():
+        if experiment[independentVariables].any():
+            results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = regressionType)
+            experiments.loc[i,'r2adj'] = results.rsquared_adj
+            experiments.loc[i,'condNum'] = results.condition_number
+            experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1]
+            experiments.loc[i,'nobs'] = int(results.nobs)
+    return experiments
+
+def generateExperiments(independentVariables):
+    '''Generates all possible models for including or not each independent variable'''
+    from pandas import DataFrame
+    experiments = {}
+    nIndependentVariables = len(independentVariables)
+    if nIndependentVariables != len(set(independentVariables)):
+        print("Duplicate variables. Exiting")
+        import sys
+        sys.exit()
+    nModels = 2**nIndependentVariables
+    for i,var in enumerate(independentVariables):
+        pattern = [False]*(2**i)+[True]*(2**i)
+        experiments[var] = pattern*(2**(nIndependentVariables-i-1))
+    experiments = DataFrame(experiments)
+    experiments['r2adj'] = 0.
+    experiments['condNum'] = NaN
+    experiments['shapiroP'] = -1
+    experiments['nobs'] = -1
+    return experiments
+
+def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
+    '''Generates all possible models with the independentVariables
+    and runs them, saving the results in experiments,
+    with a multiprocessing option'''
+    from pandas import concat
+    from multiprocessing import Pool
+    experiments = generateExperiments(independentVariables)
+    nModels = len(experiments)
+    print("Running {} models with {} processes".format(nModels, nProcesses))
+    print("IndependentVariables: {}".format(independentVariables))
+    if nProcesses == 1:
+        return runModels(experiments, data, dependentVariable, independentVariables, regressionType)
+    else:
+        pool = Pool(processes = nProcesses)
+        chunkSize = int(ceil(nModels/nProcesses))
+        jobs = [pool.apply_async(runModels, args = (experiments[i*chunkSize:(i+1)*chunkSize], data, dependentVariable, independentVariables, regressionType)) for i in range(nProcesses)]
+        return concat([job.get() for job in jobs])
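A minimal sketch of an exhaustive model search with findBestModel, on made-up data (requires statsmodels):

    from numpy.random import rand
    from pandas import DataFrame
    from trafficintelligence import utils
    data = DataFrame({'y': rand(30), 'x1': rand(30), 'x2': rand(30)})
    experiments = utils.findBestModel(data, 'y', ['x1', 'x2'])
    print(experiments.sort_values('r2adj', ascending = False).head())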
+
+def findBestModelFwd(data, dependentVariable, independentVariables, modelFunc, experiments = None):
+    '''Forward search for the best model (based on adjusted R2)
+    Starts with one randomly chosen variable and adds variables in random order
+    if they improve the model
+
+    The results are added to experiments if provided as argument
+    Storing in experiments relies on the index being the integer
+    obtained from the binary code derived from the independent variables'''
+    from numpy.random import permutation as nppermutation
+    if experiments is None:
+        experiments = generateExperiments(independentVariables)
+    nIndependentVariables = len(independentVariables)
+    permutation = nppermutation(list(range(nIndependentVariables)))
+    variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)}
+    print('Tested variables '+', '.join([variableMapping[i] for i in range(nIndependentVariables)]))
+    bestModel = [False]*nIndependentVariables
+    currentVarNum = 0
+    currentR2Adj = 0.
+    for currentVarNum in range(nIndependentVariables):
+        currentModel = [i for i in bestModel]
+        currentModel[currentVarNum] = True
+        rowIdx = sum([0]+[2**i for i in range(nIndependentVariables) if currentModel[permutation[i]]])
+        #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in range(nIndependentVariables) if currentModel[permutation[i]]])
+        if experiments.loc[rowIdx, 'shapiroP'] < 0:
+            modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables)
+            model = modelFunc(modelStr, data = data)
+            results = model.fit()
+            experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj
+            experiments.loc[rowIdx, 'condNum'] = results.condition_number
+            experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1]
+            experiments.loc[rowIdx, 'nobs'] = int(results.nobs)
+        if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
+            currentR2Adj = experiments.loc[rowIdx, 'r2adj']
+            bestModel[currentVarNum] = True
+    return experiments
+
+def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf', text = {'title-shapiro': 'Shapiro-Wilk normality test for residuals: {:.2f} (p={:.3f})', 'true-predicted.xlabel': 'Predicted values', 'true-predicted.ylabel': 'True values', 'residuals-predicted.xlabel': 'Predicted values', 'residuals-predicted.ylabel': 'Residuals'}):
+    '''Displays some model results
+
+    3 graphics: true vs predicted, residuals vs predicted, and a QQ-plot of the residuals'''
+    import statsmodels.api as sm
+    print(results.summary())
+    shapiroResult = shapiro(results.resid)
+    print(shapiroResult)
+    if plotFigures:
+        fig = plt.figure(figsize=(7, 6.3*(2+int(model is not None))))
+        if model is not None:
+            ax = fig.add_subplot(3,1,1)
+            plt.plot(results.predict(), model.endog, 'x')
+            x=plt.xlim()
+            y=plt.ylim()
+            plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r')
+            #plt.axis('equal')
+            if text is not None:
+                plt.title(text['title-shapiro'].format(*shapiroResult))
+                #plt.title(text['true-predicted.title'])
+                plt.xlabel(text['true-predicted.xlabel'])
+                plt.ylabel(text['true-predicted.ylabel'])
+            fig.add_subplot(3,1,2, sharex = ax)
+            plt.plot(results.predict(), results.resid, 'x')
+            nextSubplotNum = 3
+        else:
+            fig.add_subplot(2,1,1)
+            plt.plot(results.predict(), results.resid, 'x')
+            nextSubplotNum = 2
+        if text is not None:
+            if model is None:
+                plt.title(text['title-shapiro'].format(*shapiroResult))
+            plt.xlabel(text['residuals-predicted.xlabel'])
+            plt.ylabel(text['residuals-predicted.ylabel'])
+        qqAx = fig.add_subplot(nextSubplotNum,1,nextSubplotNum)
+        sm.qqplot(results.resid, fit = True, line = '45', ax = qqAx)
+        plt.axis('equal')
+        if text is not None and 'qqplot.xlabel' in text:
+            plt.xlabel(text['qqplot.xlabel'])
+            plt.ylabel(text['qqplot.ylabel'])
+        plt.tight_layout()
+    if filenamePrefix is not None:
+        from trafficintelligence.storage import openCheck
+        out = openCheck(filenamePrefix+'-coefficients.html', 'w')
+        out.write(results.summary().as_html())
+        plt.savefig(filenamePrefix+'-model-results.'+figureFileType)
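A minimal sketch of displayModelResults for a single fitted model, on made-up data (requires statsmodels; the experiment argument of runModel can be any mapping from variable name to inclusion):

    from numpy.random import rand
    from pandas import DataFrame
    from trafficintelligence import utils
    data = DataFrame({'y': rand(30), 'x1': rand(30), 'x2': rand(30)})
    results = utils.runModel({'x1': True, 'x2': True}, data, 'y', ['x1', 'x2'])
    utils.displayModelResults(results, plotFigures = False)  # prints the summary and the Shapiro-Wilk test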
+
+#########################
+# iterable section
+#########################
+
+def mostCommon(L):
+    '''Returns the most frequent element in an iterable
+
+    taken from http://stackoverflow.com/questions/1518522/python-most-common-element-in-a-list'''
+    from itertools import groupby
+    from operator import itemgetter
+    # get an iterable of (item, iterable) pairs
+    SL = sorted((x, i) for i, x in enumerate(L))
+    # print 'SL:', SL
+    groups = groupby(SL, key=itemgetter(0))
+    # auxiliary function to get "quality" for an item
+    def _auxfun(g):
+        item, iterable = g
+        count = 0
+        min_index = len(L)
+        for _, where in iterable:
+            count += 1
+            min_index = min(min_index, where)
+        # print 'item %r, count %r, minind %r' % (item, count, min_index)
+        return count, -min_index
+    # pick the highest-count/earliest item
+    return max(groups, key=_auxfun)[0]
+
+#########################
+# sequence section
+#########################
+
+class LCSS(object):
+    '''Class that keeps the LCSS parameters
+    and puts together the various computations
+
+    the methods with names starting with _ are not to be shadowed
+    in child classes, which will shadow the other methods,
+    ie the compute and computeXX methods'''
+    def __init__(self, similarityFunc = None, metric = None, epsilon = None, delta = float('inf'), aligned = False, lengthFunc = min):
+        '''One should provide either a similarity function
+        that indicates (returns a bool) whether elements in the compared lists are similar
+
+        eg distance(p1, p2) < epsilon
+
+        or a type of metric usable in scipy.spatial.distance.cdist with an epsilon'''
+        if similarityFunc is None and metric is None:
+            print("No way to compute LCSS, similarityFunc and metric are None. Exiting")
+            import sys
+            sys.exit()
+        elif metric is not None and epsilon is None:
+            print("Please provide a value for epsilon if using a cdist metric. Exiting")
+            import sys
+            sys.exit()
+        else:
+            if similarityFunc is None and metric is not None and not isinf(delta):
+                print('Warning: you are using a cdist metric and a finite delta, which will probably make computation slower than using the equivalent similarityFunc (since all pairwise distances will be computed by cdist).')
+            self.similarityFunc = similarityFunc
+            self.metric = metric
+            self.epsilon = epsilon
+            self.aligned = aligned
+            self.delta = delta
+            self.lengthFunc = lengthFunc
+            self.subSequenceIndices = [(0,0)]
+
+    def similarities(self, l1, l2, jshift=0):
+        n1 = len(l1)
+        n2 = len(l2)
+        self.similarityTable = zeros((n1+1,n2+1), dtype = npint)
+        if self.similarityFunc is not None:
+            for i in range(1,n1+1):
+                for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1):
+                    if self.similarityFunc(l1[i-1], l2[j-1]):
+                        self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1
+                    else:
+                        self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1])
+        elif self.metric is not None:
+            similarElements = distance.cdist(l1, l2, self.metric) <= self.epsilon
+            for i in range(1,n1+1):
+                for j in range(max(1,i-jshift-self.delta),min(n2,i-jshift+self.delta)+1):
+                    if similarElements[i-1, j-1]:
+                        self.similarityTable[i,j] = self.similarityTable[i-1,j-1]+1
+                    else:
+                        self.similarityTable[i,j] = max(self.similarityTable[i-1,j], self.similarityTable[i,j-1])
+
+    def subSequence(self, i, j):
+        '''Returns the subsequence of two sequences
+        http://en.wikipedia.org/wiki/Longest_common_subsequence_problem'''
+        if i == 0 or j == 0:
+            return []
+        elif self.similarityTable[i][j] == self.similarityTable[i][j-1]:
+            return self.subSequence(i, j-1)
+        elif self.similarityTable[i][j] == self.similarityTable[i-1][j]:
+            return self.subSequence(i-1, j)
+        else:
+            return self.subSequence(i-1, j-1) + [(i-1,j-1)]
+
+    def _compute(self, _l1, _l2, computeSubSequence = False):
+        '''returns the longest common subsequence similarity
+        l1 and l2 should be in the right format
+        eg lists of tuple points for cdist
+        or of elements that can be compared using similarityFunc
+
+        if aligned, returns the best matching (if using a finite delta) by shifting the series alignments
+        '''
+        if len(_l2) < len(_l1): # l1 is the shortest
+            l1 = _l2
+            l2 = _l1
+            revertIndices = True
+        else:
+            l1 = _l1
+            l2 = _l2
+            revertIndices = False
+        n1 = len(l1)
+        n2 = len(l2)
+
+        if self.aligned:
+            lcssValues = {}
+            similarityTables = {}
+            for i in range(-n2-self.delta+1, n1+self.delta): # interval such that [i-shift-delta, i-shift+delta] is never empty, which happens when i-shift+delta < 1 or when i-shift-delta > n2
+                self.similarities(l1, l2, i)
+                lcssValues[i] = self.similarityTable.max()
+                similarityTables[i] = self.similarityTable
+                #print self.similarityTable
+            alignmentShift = argmaxDict(lcssValues) # ideally get the medium alignment shift, the one that minimizes distance
+            self.similarityTable = similarityTables[alignmentShift]
+        else:
+            alignmentShift = 0
+            self.similarities(l1, l2)
+
+        # threshold values for the useful part of the similarity table are n2-n1-delta and n1-n2-delta
+        self.similarityTable = self.similarityTable[:min(n1, n2+alignmentShift+self.delta)+1, :min(n2, n1-alignmentShift+self.delta)+1]
+
+        if computeSubSequence:
+            self.subSequenceIndices = self.subSequence(self.similarityTable.shape[0]-1, self.similarityTable.shape[1]-1)
+            if revertIndices:
+                self.subSequenceIndices = [(j,i) for i,j in self.subSequenceIndices]
+        return self.similarityTable[-1,-1]
+
+    def compute(self, l1, l2, computeSubSequence = False):
+        '''compute methods are to be shadowed in child classes'''
+        return self._compute(l1, l2, computeSubSequence)
+
+    def computeAlignment(self):
+        return mean([j-i for i,j in self.subSequenceIndices])
+
+    def _computeNormalized(self, l1, l2, computeSubSequence = False):
+        '''computes the normalized LCSS
+        ie the LCSS divided by the min or mean of the sequence lengths (lengthFunc)
+        eg lengthFunc = lambda x,y: float(x+y)/2 for the mean'''
+        return float(self._compute(l1, l2, computeSubSequence))/self.lengthFunc(len(l1), len(l2))
+
+    def computeNormalized(self, l1, l2, computeSubSequence = False):
+        return self._computeNormalized(l1, l2, computeSubSequence)
+
+    def _computeDistance(self, l1, l2, computeSubSequence = False):
+        '''computes the LCSS distance'''
+        return 1-self._computeNormalized(l1, l2, computeSubSequence)
+
+    def computeDistance(self, l1, l2, computeSubSequence = False):
+        return self._computeDistance(l1, l2, computeSubSequence)
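A minimal sketch of LCSS similarity between two 2D trajectories, using the cdist 'euclidean' metric with a made-up epsilon:

    from trafficintelligence.utils import LCSS
    t1 = [(0., 0.), (1., 0.1), (2., 0.2), (3., 0.1)]
    t2 = [(0., 0.), (1.1, 0.), (2.1, 0.3)]
    lcss = LCSS(metric = 'euclidean', epsilon = 0.5)
    print(lcss.compute(t1, t2))            # length of the longest common subsequence
    print(lcss.computeNormalized(t1, t2))  # normalized by the shorter length (lengthFunc = min)
    print(lcss.computeDistance(t1, t2))    # 1 - normalized similarity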
+
+#########################
+# plotting section
+#########################
+
+def plotPolygon(poly, options = '', **kwargs):
+    'Plots shapely polygon poly'
+    from matplotlib.pyplot import plot
+    x,y = poly.exterior.xy
+    plot(x, y, options, **kwargs)
+
+def stepPlot(X, firstX, lastX, initialCount = 0, increment = 1):
+    '''for each value in X, increment by increment the initial count
+    returns the lists that can be plotted
+    to obtain a step plot increasing by one for each value in x, from first to last value
+    firstX and lastX should be respectively smaller and larger than all elements in X'''
+
+    sortedX = []
+    counts = [initialCount]
+    for x in sorted(X):
+        sortedX += [x,x]
+        counts.append(counts[-1])
+        counts.append(counts[-1]+increment)
+    counts.append(counts[-1])
+    return [firstX]+sortedX+[lastX], counts
+
+class PlottingPropertyValues(object):
+    def __init__(self, values):
+        self.values = values
+
+    def __getitem__(self, i):
+        return self.values[i%len(self.values)]
+
+markers = PlottingPropertyValues(['+', '*', ',', '.', 'x', 'D', 's', 'o'])
+scatterMarkers = PlottingPropertyValues(['s','o','^','>','v','<','d','p','h','8','+','x'])
+
+linestyles = PlottingPropertyValues(['-', '--', '-.', ':'])
+
+colors = PlottingPropertyValues('brgmyck') # 'w'
+
+def monochromeCycler(withMarker = False):
+    from cycler import cycler
+    if withMarker:
+        monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.']) * cycler('marker', ['^',',', '.']))
+    else:
+        monochrome = (cycler('color', ['k']) * cycler('linestyle', ['-', '--', ':', '-.']))
+    plt.rc('axes', prop_cycle=monochrome)
+
+def plotIndicatorMap(indicatorMap, squareSize, masked = True, defaultValue=-1):
+    from matplotlib.pyplot import pcolor
+    coords = array(list(indicatorMap.keys()))
+    minX = min(coords[:,0])
+    minY = min(coords[:,1])
+    X = arange(minX, max(coords[:,0])+1.1)*squareSize
+    Y = arange(minY, max(coords[:,1])+1.1)*squareSize
+    C = defaultValue*ones((len(Y), len(X)))
+    for k,v in indicatorMap.items():
+        C[k[1]-minY,k[0]-minX] = v
+    if masked:
+        pcolor(X, Y, ma.masked_where(C==defaultValue,C))
+    else:
+        pcolor(X, Y, C)
+
+#########################
+# Data download
+#########################
+
+def downloadECWeather(stationID, years, months = [], outputDirectoryname = '.', english = True):
+    '''Downloads monthly weather data from Environment Canada
+    If months is provided (numbers 1 to 12), hourly data are downloaded for each whole month
+    Otherwise, daily data are downloaded for each whole year
+
+    Example: MONTREAL MCTAVISH 10761
+    MONTREAL PIERRE ELLIOTT TRUDEAU INTL A 5415
+    see ftp://client_climate@ftp.tor.ec.gc.ca/Pub/Get_More_Data_Plus_de_donnees/Station%20Inventory%20EN.csv
+
+    To get daily data for 2010 and 2011, downloadECWeather(10761, [2010,2011], [], '/tmp')
+    To get hourly data for 2009 and 2012, January, March and October, downloadECWeather(10761, [2009,2012], [1,3,10], '/tmp')
+
+    for annee in `seq 2016 2017`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&timeframe=2&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done
+    for annee in `seq 2016 2017`;do for mois in `seq 1 12`;do wget --content-disposition "http://climat.meteo.gc.ca/climate_data/bulk_data_f.html?format=csv&stationID=10761&Year=${annee}&Month=${mois}&timeframe=1&submit=++T%C3%A9l%C3%A9charger+%0D%0Ades+donn%C3%A9es" ;done;done
+    '''
+    import urllib.request
+    if english:
+        language = 'e'
+    else:
+        language = 'f'
+    if len(months) == 0:
+        timeFrame = 2
+        months = [1]
+    else:
+        timeFrame = 1
+
+    for year in years:
+        for month in months:
+            outFilename = '{}/{}-{}'.format(outputDirectoryname, stationID, year)
+            if timeFrame == 1:
+                outFilename += '-{}-hourly'.format(month)
+            else:
+                outFilename += '-daily'
+            outFilename += '.csv'
+            url = urllib.request.urlretrieve('http://climate.weather.gc.ca/climate_data/bulk_data_{}.html?format=csv&stationID={}&Year={}&Month={}&Day=1&timeframe={}&submit=Download+Data'.format(language, stationID, year, month, timeFrame), outFilename)
+
+#########################
+# File I/O
+#########################
+
+def removeExtension(filename, delimiter = '.'):
+    '''Returns the filename minus the extension (all characters after last .)'''
+    i = filename.rfind(delimiter)
+    if i>0:
+        return filename[:i]
+    else:
+        return filename
+
+def getExtension(filename, delimiter = '.'):
+    '''Returns the extension of the filename (all characters after last .)'''
+    i = filename.rfind(delimiter)
+    if i>0:
+        return filename[i+1:]
+    else:
+        return ''
+
+def cleanFilename(s):
+    'cleans filenames obtained when concatenating figure characteristics'
+    return s.replace(' ','-').replace('.','').replace('/','-').replace(',','')
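A minimal sketch of the filename helpers (the names are made up):

    from trafficintelligence import utils
    print(utils.removeExtension('video.mp4'))             # 'video'
    print(utils.getExtension('video.mp4'))                # 'mp4'
    print(utils.cleanFilename('speed vs count, site 1'))  # 'speed-vs-count-site-1'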
+
+def getRelativeFilename(parentPath, filename):
+    'Returns filename if absolute, otherwise parentPath/filename as string'
+    filePath = Path(filename)
+    if filePath.is_absolute():
+        return filename
+    else:
+        return str(parentPath/filePath)
+
+def listfiles(dirname, extension, remove = False):
+    '''Returns the list of files with the extension in the directory dirname
+    If remove is True, the filenames are stripped from the extension'''
+    d = Path(dirname)
+    if d.is_dir():
+        tmp = [str(f) for f in d.glob('*.'+extension)]
+        if remove:
+            return [removeExtension(f) for f in tmp]
+        else:
+            return tmp
+    else:
+        print(dirname+' is not a directory')
+        return []
+
+def mkdir(dirname):
+    'Creates a directory if it does not exist'
+    p = Path(dirname)
+    if not p.exists():
+        p.mkdir()
+    else:
+        print(dirname+' already exists')
+
+def removeFile(filename):
+    '''Deletes the file while avoiding raising an error
+    if the file does not exist'''
+    f = Path(filename)
+    if f.exists():
+        f.unlink()
+    else:
+        print(filename+' does not exist')
+
+def line2Floats(l, separator=' '):
+    '''Returns the list of floats corresponding to the string'''
+    return [float(x) for x in l.split(separator)]
+
+def line2Ints(l, separator=' '):
+    '''Returns the list of ints corresponding to the string'''
+    return [int(x) for x in l.split(separator)]
+
+#########################
+# Profiling
+#########################
+
+def analyzeProfile(profileFilename, stripDirs = True):
+    '''Analyzes the file produced by cProfile
+
+    obtained by for example:
+    - call in script (for main() function in script)
+    import cProfile, os
+    cProfile.run('main()', os.path.join(os.getcwd(),'main.profile'))
+
+    - or on the command line:
+    python -m cProfile [-o profile.bin] [-s sort] scriptfile [arg]'''
+    import pstats, os
+    p = pstats.Stats(os.path.join(os.pardir, profileFilename))
+    if stripDirs:
+        p.strip_dirs()
+    p.sort_stats('time')
+    p.print_stats(.2)
+    #p.sort_stats('time')
+    # p.print_callees(.1, 'int_prediction.py:')
+    return p
+
+#########################
+# running tests
+#########################
+
+if __name__ == "__main__":
+    import doctest
+    import unittest
+    suite = doctest.DocFileSuite('tests/utils.txt')
+    #suite = doctest.DocTestSuite()
+    unittest.TextTestRunner().run(suite)
+    #doctest.testmod()
+    #doctest.testfile("example.txt")
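A minimal sketch of profiling an expression and analyzing the result with analyzeProfile; note that analyzeProfile prepends os.pardir, so the profile file is looked up one directory above the current one:

    import cProfile, os
    from trafficintelligence import utils
    cProfile.run('sorted(range(10**6), key = str)', os.path.join(os.pardir, 'test.profile'))
    p = utils.analyzeProfile('test.profile')  # prints the 20% most time-consuming calls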