Mercurial Hosting > traffic-intelligence
comparison python/utils.py @ 668:f8dcf483b296
code to prepare regression variables (remove correlated variables) and record dataset size in experiments
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Tue, 26 May 2015 11:19:03 +0200 |
parents | 179b81faa1f8 |
children | df6be882f325 |
comparison
equal
deleted
inserted
replaced
667:179b81faa1f8 | 668:f8dcf483b296 |
---|---|
1 #! /usr/bin/env python | 1 #! /usr/bin/env python |
2 ''' Generic utilities.''' | 2 ''' Generic utilities.''' |
3 | 3 |
4 import matplotlib.pyplot as plt | |
4 from datetime import time, datetime | 5 from datetime import time, datetime |
5 from math import sqrt | 6 from math import sqrt |
6 | 7 from scipy.stats import kruskal, shapiro |
7 | 8 |
8 datetimeFormat = "%Y-%m-%d %H:%M:%S" | 9 datetimeFormat = "%Y-%m-%d %H:%M:%S" |
9 | 10 |
10 ######################### | 11 ######################### |
11 # Enumerations | 12 # Enumerations |
271 | 272 |
def linearRegression(x, y, deg = 1, plotData = False):
    '''Returns the least-squares polynomial fit of y as a function of x
    (the default deg = 1 is the linear regression y = ax+b)

    x, y: sequences of numbers of equal length
    deg: degree of the fitted polynomial
    plotData: if True, plots the data points ('x' markers) and the fitted curve
    Returns the polynomial coefficients, highest power first (as numpy.polyfit)'''
    # use the public numpy API instead of the private numpy.lib.polynomial/numpy.core.multiarray paths
    from numpy import polyfit, polyval, arange
    coef = polyfit(x, y, deg)
    if plotData:
        plt.plot(x, y, 'x')
        xx = arange(min(x), max(x), (max(x)-min(x))/1000)
        # polyval replaces the hand-rolled Horner-style evaluation loop (same values)
        plt.plot(xx, polyval(coef, xx))
    return coef
289 | 289 |
def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
    '''Computes (and optionally displays) the correlation matrix for a pandas DataFrame

    correlationMethod: any method accepted by DataFrame.corr ('pearson', 'kendall', 'spearman')
    plotFigure: if True, displays the matrix as a color-mapped image
    displayNames: optional dict mapping (stripped) column names to display names; defaults to the column names
    figureFilename: if not None (and plotFigure is True), saves the figure to this file
    Returns the correlation matrix (DataFrame)'''
    c = data.corr(correlationMethod)
    if plotFigure:
        fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0]))
        fig.add_subplot(1,1,1)
        #plt.imshow(np.fabs(c), interpolation='none')
        plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
        # guard against the default displayNames = None (previously crashed with AttributeError)
        if displayNames is not None:
            colnames = [displayNames.get(s.strip(), s.strip()) for s in c.columns.tolist()]
        else:
            colnames = [s.strip() for s in c.columns.tolist()]
        #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
        plt.xticks(range(len(colnames)), colnames, rotation=90)
        plt.yticks(range(len(colnames)), colnames)
        plt.tick_params('both', length=0)
        plt.subplots_adjust(bottom = 0.29)
        plt.colorbar()
        plt.title('Correlation ({})'.format(correlationMethod))
        plt.tight_layout()
        if figureFilename is not None:
            plt.savefig(figureFilename, dpi = 150, transparent = True)
    return c
310 | |
def addDummies(data, variables, allVariables = True):
    '''Adds binary dummy variables for each value of a nominal variable
    in a pandas DataFrame (modifies data in place)

    Only processes columns that exist, have object dtype and more than 2 unique values.
    allVariables: if False, skips the last value (to avoid the dummy variable trap in regression)
    Returns the list of the new dummy variable names'''
    from numpy import dtype
    from pandas import notnull
    newVariables = []
    for var in variables:
        if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
            values = data[var].unique()
            if not allVariables:
                values = values[:-1]
            for val in values:
                # notnull is a robust NaN test: the previous 'val is not NaN' identity check
                # fails for NaN values that are not the numpy NaN singleton object
                if notnull(val):
                    # sanitize the value so the dummy name is a valid formula/identifier token
                    newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
                    data[newVariable] = (data[var] == val)
                    newVariables.append(newVariable)
    return newVariables
327 | |
def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'):
    '''Studies the influence of the (nominal) independent variable over the dependent variable

    Tests whether the conditional distributions are normal
    using the Shapiro-Wilk test (in which case ANOVA could be used)
    and applies the non-parametric Kruskal-Wallis test

    Returns the Kruskal-Wallis result (statistic, p-value),
    or None if the independent variable takes fewer than 2 values'''
    tmp = data[data[independentVariable].notnull()]
    independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
    if len(independentVariableValues) >= 2:
        for x in independentVariableValues:
            sample = tmp.loc[tmp[independentVariable] == x, dependentVariable] # hoisted: was recomputed three times
            print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable, independentVariable, x, len(sample)))
            if len(sample) >= 3: # shapiro requires at least 3 observations
                print(shapiro(sample)) # was a Python 2 print statement, inconsistent with the print() calls above
        if plotFigure:
            plt.figure()
            plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
            #q25, q75 = tmp[dependentVariable].quantile([.25, .75])
            #plt.ylim(ymax = q75+1.5*(q75-q25))
            plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues)
            plt.title('{} vs {}'.format(dependentVariable, independentVariable))
            if figureFilenamePrefix is not None:
                plt.savefig(figureFilenamePrefix+'{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
        #else:
        # TODO format the table (html?)
        # groupby().describe() returns a DataFrame with a '50%' column directly;
        # the previous .unstack().sort(['50%']) relied on pandas APIs that have since been removed
        print(tmp.groupby([independentVariable])[dependentVariable].describe().sort_values('50%', ascending = False))
        return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
    else:
        return None
356 | |
def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
    '''Screens the candidate independent variables for a regression:
    - among each pair of cross-correlated candidates (|correlation| > maxCorrelationThreshold), one is dropped
    - numeric candidates whose correlation with the dependent variable is not significant (p > maxCorrelationP) are dropped
    Returns the list of surviving candidate variables

    correlationFunc is spearmanr or pearsonr from scipy.stats

    TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''
    from numpy import dtype
    from copy import copy
    kept = copy(independentVariables)
    # first pass: drop the second member of every highly cross-correlated pair still kept
    for first in independentVariables:
        if first not in correlations.index:
            continue
        for second in independentVariables:
            if second == first or second not in correlations.index:
                continue
            if abs(correlations.loc[first, second]) > maxCorrelationThreshold and first in kept and second in kept:
                print('Removing {} (correlation {} with {})'.format(second, correlations.loc[first, second], first))
                kept.remove(second)
    # second pass: drop numeric survivors not significantly correlated with the dependent variable
    for candidate in copy(kept):
        if data.dtypes[candidate] != dtype('O'):
            _, p = correlationFunc(data[dependentVariable], data[candidate])
            if p > maxCorrelationP:
                print('Removing {} (no correlation p={})'.format(candidate, p))
                kept.remove(candidate)
    return kept
385 | |
290 | 386 |
291 ######################### | 387 ######################### |
292 # regression analysis using statsmodels (and pandas) | 388 # regression analysis using statsmodels (and pandas) |
293 ######################### | 389 ######################### |
294 | 390 |
295 # TODO make class for experiments? | 391 # TODO make class for experiments? |
392 # TODO add tests with public dataset downloaded from Internet (IRIS et al) | |
def modelString(experiment, dependentVariable, independentVariables):
    '''Builds the formula string "dep ~ var1 + var2 + ..." from the
    independent variables flagged True in the experiment row'''
    includedVariables = [v for v in independentVariables if experiment[v]]
    return '{} ~ {}'.format(dependentVariable, ' + '.join(includedVariables))
298 | 395 |
299 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'): | 396 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'): |
300 import statsmodels.formula.api as smf | 397 import statsmodels.formula.api as smf |
319 if experiment[independentVariables].any(): | 416 if experiment[independentVariables].any(): |
320 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols') | 417 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols') |
321 experiments.loc[i,'r2adj'] = results.rsquared_adj | 418 experiments.loc[i,'r2adj'] = results.rsquared_adj |
322 experiments.loc[i,'condNum'] = results.condition_number | 419 experiments.loc[i,'condNum'] = results.condition_number |
323 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1] | 420 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1] |
421 experiments.loc[i,'nobs'] = int(results.nobs) | |
324 return experiments | 422 return experiments |
325 | 423 |
326 def generateExperiments(independentVariables): | 424 def generateExperiments(independentVariables): |
327 '''Generates all possible models for including or not each independent variable''' | 425 '''Generates all possible models for including or not each independent variable''' |
328 experiments = {} | 426 experiments = {} |
337 experiments[var] = pattern*(2**(nIndependentVariables-i-1)) | 435 experiments[var] = pattern*(2**(nIndependentVariables-i-1)) |
338 experiments = pd.DataFrame(experiments) | 436 experiments = pd.DataFrame(experiments) |
339 experiments['r2adj'] = 0. | 437 experiments['r2adj'] = 0. |
340 experiments['condNum'] = np.nan | 438 experiments['condNum'] = np.nan |
341 experiments['shapiroP'] = -1 | 439 experiments['shapiroP'] = -1 |
440 experiments['nobs'] = -1 | |
342 return experiments | 441 return experiments |
343 | 442 |
344 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): | 443 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): |
345 '''Generates all possible model with the independentVariables | 444 '''Generates all possible model with the independentVariables |
346 and runs them, saving the results in experiments | 445 and runs them, saving the results in experiments |
383 model = modelFunc(modelStr, data = data) | 482 model = modelFunc(modelStr, data = data) |
384 results = model.fit() | 483 results = model.fit() |
385 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj | 484 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj |
386 experiments.loc[rowIdx, 'condNum'] = results.condition_number | 485 experiments.loc[rowIdx, 'condNum'] = results.condition_number |
387 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1] | 486 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1] |
487 experiments.loc[rowIdx, 'nobs'] = int(results.nobs) | |
388 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: | 488 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: |
389 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] | 489 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] |
390 bestModel[currentVarNum] = True | 490 bestModel[currentVarNum] = True |
391 return experiments | 491 return experiments |
392 | 492 |