comparison python/utils.py @ 667:179b81faa1f8

added regression analysis functions
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Mon, 25 May 2015 13:54:29 +0200
parents 15e244d2a1b5
children f8dcf483b296
comparison
equal deleted inserted replaced
666:93633ce122c3 667:179b81faa1f8
284 return result 284 return result
285 plot(x, y, 'x') 285 plot(x, y, 'x')
286 xx = arange(min(x), max(x),(max(x)-min(x))/1000) 286 xx = arange(min(x), max(x),(max(x)-min(x))/1000)
287 plot(xx, [poly(z) for z in xx]) 287 plot(xx, [poly(z) for z in xx])
288 return coef 288 return coef
289
290
291 #########################
292 # regression analysis using statsmodels (and pandas)
293 #########################
294
295 # TODO make class for experiments?
def modelString(experiment, dependentVariable, independentVariables):
    '''Builds the statsmodels/patsy formula string for one experiment:
    dependentVariable ~ var1 + var2 + ... using only the independent
    variables whose inclusion flag is truthy in experiment'''
    includedVariables = [variable for variable in independentVariables if experiment[variable]]
    return '{} ~ {}'.format(dependentVariable, ' + '.join(includedVariables))
298
def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'):
    '''Fits one regression model defined by experiment on data with statsmodels

    regressionType selects the estimator: 'ols', 'gls' or 'rlm'
    (any other value prints a message and exits the program)
    Returns the fitted results object'''
    import statsmodels.formula.api as smf
    modelStr = modelString(experiment, dependentVariable, independentVariables)
    # dispatch table replaces the if/elif chain over regression types
    modelConstructors = {'ols': smf.ols,
                         'gls': smf.gls,
                         'rlm': smf.rlm}
    if regressionType not in modelConstructors:
        print('Unknown regression type {}. Exiting'.format(regressionType))
        import sys
        sys.exit()
    model = modelConstructors[regressionType](modelStr, data = data)
    return model.fit()
313
def runModels(experiments, data, dependentVariable, independentVariables, regressionType = 'ols'):
    '''Runs the models defined by the rows of the experiments DataFrame
    and stores 3 statistics per row:
    adjusted R2, condition number (should be small, eg < 1000)
    and p-value for Shapiro-Wilk test of residual normality

    Returns experiments with the statistic columns filled in'''
    for i, experiment in experiments.iterrows():
        # skip the empty model (no independent variable included)
        if experiment[independentVariables].any():
            # bug fix: forward the requested regressionType
            # (was hardcoded to 'ols', silently ignoring the argument)
            results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = regressionType)
            experiments.loc[i, 'r2adj'] = results.rsquared_adj
            experiments.loc[i, 'condNum'] = results.condition_number
            experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1]
    return experiments
325
def generateExperiments(independentVariables):
    '''Generates all 2^n possible models for including or not
    each of the n independent variables

    Row index i encodes the inclusion pattern as a binary number:
    variable j (in input order) is included iff bit j of i is set
    (findBestModelFwd relies on this index encoding).
    The statistic columns are initialized; shapiroP = -1 marks
    a model that has not been fitted yet.
    Exits the program if independentVariables contains duplicates.'''
    experiments = {}
    nIndependentVariables = len(independentVariables)
    if nIndependentVariables != len(np.unique(independentVariables)):
        print("Duplicate variables. Exiting")
        import sys
        sys.exit()
    for i, var in enumerate(independentVariables):
        # variable i alternates in runs of 2^i False then 2^i True
        pattern = [False]*(2**i)+[True]*(2**i)
        experiments[var] = pattern*(2**(nIndependentVariables-i-1))
    experiments = pd.DataFrame(experiments)
    experiments['r2adj'] = 0.
    experiments['condNum'] = np.nan
    experiments['shapiroP'] = -1
    return experiments
343
def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
    '''Generates all possible models with the independentVariables
    and runs them, saving the results in experiments,
    with multiprocess option (the experiments are split in
    nProcesses chunks processed in parallel)

    Returns the experiments DataFrame with the fit statistics filled in'''
    experiments = generateExperiments(independentVariables)
    nModels = len(experiments)
    print("Running {} models with {} processes".format(nModels, nProcesses))
    if nProcesses == 1:
        return runModels(experiments, data, dependentVariable, independentVariables, regressionType)
    else:
        pool = Pool(processes = nProcesses)
        # bug fix: force true division before ceil; with Python 2 integer
        # division, the last rows were dropped when nModels was not a
        # multiple of nProcesses
        chunkSize = int(np.ceil(float(nModels)/nProcesses))
        jobs = [pool.apply_async(runModels, args = (experiments[i*chunkSize:(i+1)*chunkSize], data, dependentVariable, independentVariables, regressionType)) for i in range(nProcesses)]
        return pd.concat([job.get() for job in jobs])
358
def findBestModelFwd(data, dependentVariable, independentVariables, modelFunc, experiments = None):
    '''Forward search for best model (based on adjusted R2)
    Randomly starting with one variable and adding randomly variables
    if they improve the model

    modelFunc is a statsmodels formula constructor, eg smf.ols
    The results are added to experiments if provided as argument
    Storing in experiments relies on the index being the number equal
    to the binary code derived from the independent variables
    (as produced by generateExperiments)'''
    if experiments is None:
        experiments = generateExperiments(independentVariables)
    nIndependentVariables = len(independentVariables)
    # random order in which the variables are tried
    permutation = np.random.permutation(range(nIndependentVariables)).tolist()
    variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)}
    print('Tested variables '+', '.join([variableMapping[i] for i in range(nIndependentVariables)]))
    bestModel = [False]*nIndependentVariables
    currentR2Adj = 0.
    for currentVarNum in range(nIndependentVariables):
        # tentatively add the current variable to the best model so far
        currentModel = [i for i in bestModel]
        currentModel[currentVarNum] = True
        # row index = binary code of the included variables (see generateExperiments)
        rowIdx = sum([0]+[2**i for i in range(nIndependentVariables) if currentModel[permutation[i]]])
        if experiments.loc[rowIdx, 'shapiroP'] < 0: # shapiroP < 0 marks a model not fitted yet
            modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables)
            model = modelFunc(modelStr, data = data)
            results = model.fit()
            experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj
            experiments.loc[rowIdx, 'condNum'] = results.condition_number
            experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1]
        # keep the variable only if it improves the adjusted R2
        if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
            currentR2Adj = experiments.loc[rowIdx, 'r2adj']
            bestModel[currentVarNum] = True
    return experiments
392
def displayModelResults(results, model = None):
    '''Displays some model results: the statsmodels summary and the
    Shapiro-Wilk normality test of the residuals; if the model is
    provided, also plots true vs predicted values, residuals vs
    predicted values, and the residual QQ-plot

    (docstring moved before the import so it is the actual function
    docstring; print converted from Python 2 statement to function call)'''
    import statsmodels.api as sm
    print(results.summary())
    print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid)))
    if model is not None:
        plt.figure()
        plt.plot(results.predict(), model.endog, 'x')
        x = plt.xlim()
        y = plt.ylim()
        # identity line drawn over the overlapping part of the axis ranges
        plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r')
        plt.title('true vs predicted')
        plt.figure()
        plt.plot(results.predict(), results.resid, 'x')
        plt.title('residuals vs predicted')
        sm.qqplot(results.resid, fit = True, line = '45')
409
289 410
290 ######################### 411 #########################
291 # iterable section 412 # iterable section
292 ######################### 413 #########################
293 414