comparison python/utils.py @ 668:f8dcf483b296

code to prepare regression variables (remove correlated variables) and record dataset size in experiments
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Tue, 26 May 2015 11:19:03 +0200
parents 179b81faa1f8
children df6be882f325
comparison
equal deleted inserted replaced
667:179b81faa1f8 668:f8dcf483b296
1 #! /usr/bin/env python 1 #! /usr/bin/env python
2 ''' Generic utilities.''' 2 ''' Generic utilities.'''
3 3
4 import matplotlib.pyplot as plt
4 from datetime import time, datetime 5 from datetime import time, datetime
5 from math import sqrt 6 from math import sqrt
6 7 from scipy.stats import kruskal, shapiro
7 8
8 datetimeFormat = "%Y-%m-%d %H:%M:%S" 9 datetimeFormat = "%Y-%m-%d %H:%M:%S"
9 10
10 ######################### 11 #########################
11 # Enumerations 12 # Enumerations
271 272
def linearRegression(x, y, deg = 1, plotData = False):
    '''Returns the least squares polynomial fit coefficients of y = f(x)
    (highest degree first, as returned by numpy.polyfit)
    and optionally plots the data points and the fitted curve

    x, y: sequences of numbers of equal length
    deg: degree of the fitted polynomial (1 gives the line y = ax+b)
    plotData: if True, plots the points and the fitted polynomial'''
    # use the public numpy API instead of the private
    # numpy.lib.polynomial / numpy.core.multiarray modules
    from numpy import polyfit, polyval, linspace
    coef = polyfit(x, y, deg)
    if plotData:
        plt.plot(x, y, 'x')
        # linspace replaces arange(min(x), max(x), (max(x)-min(x))/1000),
        # whose step was 0 for integer inputs under Python 2 integer division
        xx = linspace(min(x), max(x), 1000)
        # polyval replaces the hand-rolled polynomial evaluation loop
        plt.plot(xx, polyval(coef, xx))
    return coef
289 289
def correlation(data, correlationMethod = 'pearson', plotFigure = False, displayNames = None, figureFilename = None):
    '''Computes (and optionally displays) the correlation matrix for a pandas DataFrame

    data: pandas DataFrame
    correlationMethod: method passed to DataFrame.corr ('pearson', 'kendall' or 'spearman')
    plotFigure: if True, displays the matrix as a color-mapped image
    displayNames: optional dict mapping (stripped) column names to display labels
    figureFilename: if not None, the figure is saved to this file

    Returns the correlation matrix (pandas DataFrame)'''
    c = data.corr(correlationMethod)
    if plotFigure:
        fig = plt.figure(figsize=(2+0.4*c.shape[0], 0.4*c.shape[0]))
        fig.add_subplot(1,1,1)
        #plt.imshow(np.fabs(c), interpolation='none')
        plt.imshow(c, vmin=-1., vmax = 1., interpolation='none', cmap = 'RdYlBu_r') # coolwarm
        # fix: the default displayNames = None crashed on .get; fall back to the raw names
        if displayNames is None:
            colnames = [s.strip() for s in c.columns.tolist()]
        else:
            colnames = [displayNames.get(s.strip(), s.strip()) for s in c.columns.tolist()]
        #correlation.plot_corr(c, xnames = colnames, normcolor=True, title = filename)
        plt.xticks(range(len(colnames)), colnames, rotation=90)
        plt.yticks(range(len(colnames)), colnames)
        plt.tick_params('both', length=0)
        plt.subplots_adjust(bottom = 0.29)
        plt.colorbar()
        plt.title('Correlation ({})'.format(correlationMethod))
        plt.tight_layout()
        if figureFilename is not None:
            plt.savefig(figureFilename, dpi = 150, transparent = True)
    return c
310
def addDummies(data, variables, allVariables = True):
    '''Adds binary dummy variables for each value of a nominal variable
    in a pandas DataFrame

    data: pandas DataFrame, modified in place (one boolean column added per value)
    variables: names of the nominal (object dtype) columns to expand;
    only columns with more than 2 distinct values are expanded
    allVariables: if False, the last value is skipped (avoids the dummy variable trap)

    Returns the list of the new column names'''
    from numpy import dtype
    newVariables = []
    for var in variables:
        if var in data.columns and data.dtypes[var] == dtype('O') and len(data[var].unique()) > 2:
            values = data[var].unique()
            if not allVariables:
                values = values[:-1]
            for val in values:
                # fix: 'val is not NaN' only caught the numpy NaN singleton
                # (and numpy.NaN was removed in numpy 2.0);
                # val == val is False exactly for NaN values
                if val == val:
                    # sanitize the value so the column name is usable in formulas
                    newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
                    data[newVariable] = (data[var] == val)
                    newVariables.append(newVariable)
    return newVariables
327
def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'):
    '''Studies the influence of the (nominal) independent variable over the dependent variable

    Tests whether the conditional distributions are normal
    using the Shapiro-Wilk test (in which case ANOVA could be used)
    and applies the non-parametric Kruskal-Wallis test

    Returns the scipy kruskal result (statistic, pvalue),
    or None if the independent variable has fewer than 2 distinct values'''
    tmp = data[data[independentVariable].notnull()]
    independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
    if len(independentVariableValues) >= 2:
        for x in independentVariableValues:
            sample = tmp.loc[tmp[independentVariable] == x, dependentVariable]
            print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(sample)))
            if len(sample) >= 3: # Shapiro-Wilk requires at least 3 observations
                print(shapiro(sample)) # fix: Python 2 print statement
        if plotFigure:
            plt.figure()
            plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
            #q25, q75 = tmp[dependentVariable].quantile([.25, .75])
            #plt.ylim(ymax = q75+1.5*(q75-q25))
            plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues)
            plt.title('{} vs {}'.format(dependentVariable, independentVariable))
            if figureFilenamePrefix is not None:
                plt.savefig(figureFilenamePrefix+'{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
        # TODO formatter le tableau (html?)
        # fix: Python 2 print statement; DataFrame.sort was removed in favor of sort_values,
        # and modern pandas groupby describe already returns a DataFrame (no unstack needed)
        print(tmp.groupby([independentVariable])[dependentVariable].describe().sort_values(['50%'], ascending = False))
        return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
    else:
        return None
356
def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
    '''Filters the candidate independent variables for regression:
    - of every pair of independent variables correlated beyond maxCorrelationThreshold, the second is dropped
    - independent variables not correlated with the dependent variable (p > maxCorrelationP) are dropped
    Returns the remaining variables

    correlationFunc is spearmanr or pearsonr from scipy.stats

    TODO: pass the dummies for nominal variables and remove if all dummies are correlated, or none is correlated with the dependentvariable'''
    from numpy import dtype
    from copy import copy
    kept = copy(independentVariables)
    # first pass: drop one variable of every too-correlated pair
    for first in independentVariables:
        if first not in correlations.index:
            continue
        for second in independentVariables:
            if second == first or second not in correlations.index:
                continue
            if abs(correlations.loc[first, second]) > maxCorrelationThreshold and first in kept and second in kept:
                print('Removing {} (correlation {} with {})'.format(second, correlations.loc[first, second], first))
                kept.remove(second)
    # second pass: drop numeric variables not significantly correlated with the dependent variable
    for candidate in copy(kept):
        if data.dtypes[candidate] != dtype('O'):
            _, pValue = correlationFunc(data[dependentVariable], data[candidate])
            if pValue > maxCorrelationP:
                print('Removing {} (no correlation p={})'.format(candidate, pValue))
                kept.remove(candidate)
    return kept
385
290 386
291 ######################### 387 #########################
292 # regression analysis using statsmodels (and pandas) 388 # regression analysis using statsmodels (and pandas)
293 ######################### 389 #########################
294 390
295 # TODO make class for experiments? 391 # TODO make class for experiments?
392 # TODO add tests with public dataset downloaded from Internet (IRIS et al)
def modelString(experiment, dependentVariable, independentVariables):
    '''Builds the statsmodels formula string "dep ~ var1 + var2 + ..."
    from the independent variables whose flag is truthy in experiment'''
    selectedVariables = [variable for variable in independentVariables if experiment[variable]]
    return '{} ~ {}'.format(dependentVariable, ' + '.join(selectedVariables))
298 395
299 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'): 396 def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'):
300 import statsmodels.formula.api as smf 397 import statsmodels.formula.api as smf
319 if experiment[independentVariables].any(): 416 if experiment[independentVariables].any():
320 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols') 417 results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols')
321 experiments.loc[i,'r2adj'] = results.rsquared_adj 418 experiments.loc[i,'r2adj'] = results.rsquared_adj
322 experiments.loc[i,'condNum'] = results.condition_number 419 experiments.loc[i,'condNum'] = results.condition_number
323 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1] 420 experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1]
421 experiments.loc[i,'nobs'] = int(results.nobs)
324 return experiments 422 return experiments
325 423
326 def generateExperiments(independentVariables): 424 def generateExperiments(independentVariables):
327 '''Generates all possible models for including or not each independent variable''' 425 '''Generates all possible models for including or not each independent variable'''
328 experiments = {} 426 experiments = {}
337 experiments[var] = pattern*(2**(nIndependentVariables-i-1)) 435 experiments[var] = pattern*(2**(nIndependentVariables-i-1))
338 experiments = pd.DataFrame(experiments) 436 experiments = pd.DataFrame(experiments)
339 experiments['r2adj'] = 0. 437 experiments['r2adj'] = 0.
340 experiments['condNum'] = np.nan 438 experiments['condNum'] = np.nan
341 experiments['shapiroP'] = -1 439 experiments['shapiroP'] = -1
440 experiments['nobs'] = -1
342 return experiments 441 return experiments
343 442
344 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): 443 def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
345 '''Generates all possible model with the independentVariables 444 '''Generates all possible model with the independentVariables
346 and runs them, saving the results in experiments 445 and runs them, saving the results in experiments
383 model = modelFunc(modelStr, data = data) 482 model = modelFunc(modelStr, data = data)
384 results = model.fit() 483 results = model.fit()
385 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj 484 experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj
386 experiments.loc[rowIdx, 'condNum'] = results.condition_number 485 experiments.loc[rowIdx, 'condNum'] = results.condition_number
387 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1] 486 experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1]
487 experiments.loc[rowIdx, 'nobs'] = int(results.nobs)
388 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: 488 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
389 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] 489 currentR2Adj = experiments.loc[rowIdx, 'r2adj']
390 bestModel[currentVarNum] = True 490 bestModel[currentVarNum] = True
391 return experiments 491 return experiments
392 492