Mercurial Hosting > traffic-intelligence
comparison python/utils.py @ 667:179b81faa1f8
added regression analysis functions
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Mon, 25 May 2015 13:54:29 +0200 |
parents | 15e244d2a1b5 |
children | f8dcf483b296 |
comparison
equal
deleted
inserted
replaced
666:93633ce122c3 | 667:179b81faa1f8 |
---|---|
284 return result | 284 return result |
285 plot(x, y, 'x') | 285 plot(x, y, 'x') |
286 xx = arange(min(x), max(x),(max(x)-min(x))/1000) | 286 xx = arange(min(x), max(x),(max(x)-min(x))/1000) |
287 plot(xx, [poly(z) for z in xx]) | 287 plot(xx, [poly(z) for z in xx]) |
288 return coef | 288 return coef |
289 | |
290 | |
291 ######################### | |
292 # regression analysis using statsmodels (and pandas) | |
293 ######################### | |
294 | |
295 # TODO make class for experiments? | |
def modelString(experiment, dependentVariable, independentVariables):
    '''Builds a statsmodels/patsy formula string such as "y ~ x1 + x3".

    experiment maps each independent variable name to a truthy/falsy flag;
    only the variables flagged true are included in the right-hand side.'''
    selectedVariables = [v for v in independentVariables if experiment[v]]
    return dependentVariable + ' ~ ' + ' + '.join(selectedVariables)
298 | |
def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'):
    '''Fits one regression model described by experiment and returns the
    statsmodels fit results object.

    regressionType selects the estimator: 'ols', 'gls' or 'rlm'.
    An unknown regressionType prints a message and exits the program.'''
    import statsmodels.formula.api as smf
    modelStr = modelString(experiment, dependentVariable, independentVariables)
    estimators = {'ols': smf.ols,
                  'gls': smf.gls,
                  'rlm': smf.rlm}
    if regressionType not in estimators:
        print('Unknown regression type {}. Exiting'.format(regressionType))
        import sys
        sys.exit()
    model = estimators[regressionType](modelStr, data = data)
    return model.fit()
313 | |
def runModels(experiments, data, dependentVariable, independentVariables, regressionType = 'ols'):
    '''Runs several models and stores 3 statistics in experiments:
    adjusted R2, condition number (should be small, eg < 1000)
    and p-value for Shapiro-Wilk test of residual normality

    Rows of experiments with no independent variable selected are skipped
    (their formula would be empty). Returns the updated experiments frame.'''
    for i, experiment in experiments.iterrows():
        if experiment[independentVariables].any():
            # bug fix: forward the requested regressionType to runModel
            # (it was hard-coded to 'ols', silently ignoring the argument)
            results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = regressionType)
            experiments.loc[i, 'r2adj'] = results.rsquared_adj
            experiments.loc[i, 'condNum'] = results.condition_number
            experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1]
    return experiments
325 | |
def generateExperiments(independentVariables):
    '''Generates all possible models for including or not each independent variable.

    Returns a DataFrame with 2**len(independentVariables) rows, one boolean
    column per variable, plus result columns r2adj, condNum and shapiroP
    (shapiroP == -1 marks a model that has not been run yet).
    Row index i encodes the selection in binary: bit j of i selects the
    j-th independent variable.
    Exits the program if independentVariables contains duplicates.'''
    nIndependentVariables = len(independentVariables)
    if nIndependentVariables != len(np.unique(independentVariables)):
        print("Duplicate variables. Exiting")
        import sys
        sys.exit()
    experiments = {}
    for i, var in enumerate(independentVariables):
        # in the binary row index, variable i toggles every 2**i rows
        pattern = [False]*(2**i)+[True]*(2**i)
        experiments[var] = pattern*(2**(nIndependentVariables-i-1))
    experiments = pd.DataFrame(experiments)
    experiments['r2adj'] = 0.
    experiments['condNum'] = np.nan
    experiments['shapiroP'] = -1  # sentinel: model not run yet
    return experiments
343 | |
def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
    '''Generates all possible models with the independentVariables
    and runs them, saving the results in experiments
    with multiprocess option

    Returns the experiments DataFrame with the fit statistics filled in.'''
    experiments = generateExperiments(independentVariables)
    nModels = len(experiments)
    print("Running {} models with {} processes".format(nModels, nProcesses))
    if nProcesses == 1:
        return runModels(experiments, data, dependentVariable, independentVariables, regressionType)
    else:
        pool = Pool(processes = nProcesses)
        # bug fix: force float division before ceil; under Python 2 the
        # integer division floored first, so chunkSize could be too small
        # and the last chunk of models was silently never run
        chunkSize = int(np.ceil(nModels/float(nProcesses)))
        jobs = [pool.apply_async(runModels, args = (experiments[i*chunkSize:(i+1)*chunkSize], data, dependentVariable, independentVariables, regressionType)) for i in range(nProcesses)]
        return pd.concat([job.get() for job in jobs])
358 | |
def findBestModelFwd(data, dependentVariable, independentVariables, modelFunc, experiments = None):
    '''Forward search for best model (based on adjusted R2)
    Randomly starting with one variable and adding randomly variables
    if they improve the model

    modelFunc is a statsmodels-style factory taking (formula, data) and
    returning an object with a fit() method (eg smf.ols).
    The results are added to experiments if provided as argument
    Storing in experiment relies on the index being the number equal
    to the binary code derived from the independent variables'''
    if experiments is None:
        experiments = generateExperiments(independentVariables)
    nIndependentVariables = len(independentVariables)
    # random order in which variables are tried for inclusion
    permutation = np.random.permutation(range(nIndependentVariables)).tolist()
    # NOTE(review): maps permuted position j back to a variable name;
    # the i/j pairing here looks inverted relative to its use below — confirm
    variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)}
    print('Tested variables '+', '.join([variableMapping[i] for i in xrange(nIndependentVariables)]))
    bestModel = [False]*nIndependentVariables  # inclusion flags in permuted order
    currentVarNum = 0
    currentR2Adj = 0.  # best adjusted R2 found so far
    for currentVarNum in xrange(nIndependentVariables):
        # candidate model: best model so far plus the next variable
        currentModel = [i for i in bestModel]
        currentModel[currentVarNum] = True
        # row index in experiments: binary code of the selected variables
        # (presumably must match the encoding used by generateExperiments — verify)
        rowIdx = sum([0]+[2**i for i in xrange(nIndependentVariables) if currentModel[permutation[i]]])
        #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in xrange(nIndependentVariables) if currentModel[permutation[i]]])
        if experiments.loc[rowIdx, 'shapiroP'] < 0:  # -1 sentinel: model not fitted yet
            modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables)
            model = modelFunc(modelStr, data = data)
            results = model.fit()
            experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj
            experiments.loc[rowIdx, 'condNum'] = results.condition_number
            experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1]
        # keep the new variable only if it improves adjusted R2
        if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
            currentR2Adj = experiments.loc[rowIdx, 'r2adj']
            bestModel[currentVarNum] = True
    return experiments
392 | |
def displayModelResults(results, model = None):
    '''Displays some model results: the fit summary and the Shapiro-Wilk
    normality test of the residuals; if the fitted model object is given,
    also plots true vs predicted, residuals vs predicted and a QQ-plot.'''
    # bug fix: the docstring was placed after the import, so it was not
    # actually the function docstring; also use print() for consistency
    # with the rest of the file
    import statsmodels.api as sm
    print(results.summary())
    print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid)))
    if model is not None:
        plt.figure()
        plt.plot(results.predict(), model.endog, 'x')
        x = plt.xlim()
        y = plt.ylim()
        # identity line over the common range of both axes
        plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r')
        plt.title('true vs predicted')
        plt.figure()
        plt.plot(results.predict(), results.resid, 'x')
        plt.title('residuals vs predicted')
        sm.qqplot(results.resid, fit = True, line = '45')
409 | |
289 | 410 |
290 ######################### | 411 ######################### |
291 # iterable section | 412 # iterable section |
292 ######################### | 413 ######################### |
293 | 414 |