Mercurial Hosting > traffic-intelligence
changeset 667:179b81faa1f8
added regression analysis functions
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Mon, 25 May 2015 13:54:29 +0200 |
parents | 93633ce122c3 |
children | f8dcf483b296 |
files | python/utils.py |
diffstat | 1 files changed, 121 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/python/utils.py Wed May 20 14:21:54 2015 +0200 +++ b/python/utils.py Mon May 25 13:54:29 2015 +0200 @@ -287,6 +287,127 @@ plot(xx, [poly(z) for z in xx]) return coef + +######################### +# regression analysis using statsmodels (and pandas) +######################### + +# TODO make class for experiments? +def modelString(experiment, dependentVariable, independentVariables): + return dependentVariable+' ~ '+' + '.join([independentVariable for independentVariable in independentVariables if experiment[independentVariable]]) + +def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'): + import statsmodels.formula.api as smf + modelStr = modelString(experiment, dependentVariable, independentVariables) + if regressionType == 'ols': + model = smf.ols(modelStr, data = data) + elif regressionType == 'gls': + model = smf.gls(modelStr, data = data) + elif regressionType == 'rlm': + model = smf.rlm(modelStr, data = data) + else: + print('Unknown regression type {}. Exiting'.format(regressionType)) + import sys + sys.exit() + return model.fit() + +def runModels(experiments, data, dependentVariable, independentVariables, regressionType = 'ols'): + '''Runs several models and stores 3 statistics + adjusted R2, condition number (should be small, eg < 1000) + and p-value for Shapiro-Wilk test of residual normality''' + for i,experiment in experiments.iterrows(): + if experiment[independentVariables].any(): + results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols') + experiments.loc[i,'r2adj'] = results.rsquared_adj + experiments.loc[i,'condNum'] = results.condition_number + experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1] + return experiments + +def generateExperiments(independentVariables): + '''Generates all possible models for including or not each independent variable''' + experiments = {} + nIndependentVariables = len(independentVariables) + if nIndependentVariables != len(np.unique(independentVariables)): + print("Duplicate variables. Exiting") + import sys + sys.exit() + nModels = 2**nIndependentVariables + for i,var in enumerate(independentVariables): + pattern = [False]*(2**i)+[True]*(2**i) + experiments[var] = pattern*(2**(nIndependentVariables-i-1)) + experiments = pd.DataFrame(experiments) + experiments['r2adj'] = 0. + experiments['condNum'] = np.nan + experiments['shapiroP'] = -1 + return experiments + +def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1): + '''Generates all possible model with the independentVariables + and runs them, saving the results in experiments + with multiprocess option''' + experiments = generateExperiments(independentVariables) + nModels = len(experiments) + print("Running {} models with {} processes".format(nModels, nProcesses)) + if nProcesses == 1: + return runModels(experiments, data, dependentVariable, independentVariables, regressionType) + else: + pool = Pool(processes = nProcesses) + chunkSize = int(np.ceil(nModels/nProcesses)) + jobs = [pool.apply_async(runModels, args = (experiments[i*chunkSize:(i+1)*chunkSize], data, dependentVariable, independentVariables, regressionType)) for i in range(nProcesses)] + return pd.concat([job.get() for job in jobs]) + +def findBestModelFwd(data, dependentVariable, independentVariables, modelFunc, experiments = None): + '''Forward search for best model (based on adjusted R2) + Randomly starting with one variable and adding randomly variables + if they improve the model + + The results are added to experiments if provided as argument + Storing in experiment relies on the index being the number equal + to the binary code derived from the independent variables''' + if experiments is None: + experiments = generateExperiments(independentVariables) + nIndependentVariables = len(independentVariables) + permutation = np.random.permutation(range(nIndependentVariables)).tolist() + variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)} + print('Tested variables '+', '.join([variableMapping[i] for i in xrange(nIndependentVariables)])) + bestModel = [False]*nIndependentVariables + currentVarNum = 0 + currentR2Adj = 0. + for currentVarNum in xrange(nIndependentVariables): + currentModel = [i for i in bestModel] + currentModel[currentVarNum] = True + rowIdx = sum([0]+[2**i for i in xrange(nIndependentVariables) if currentModel[permutation[i]]]) + #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in xrange(nIndependentVariables) if currentModel[permutation[i]]]) + if experiments.loc[rowIdx, 'shapiroP'] < 0: + modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables) + model = modelFunc(modelStr, data = data) + results = model.fit() + experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj + experiments.loc[rowIdx, 'condNum'] = results.condition_number + experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1] + if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: + currentR2Adj = experiments.loc[rowIdx, 'r2adj'] + bestModel[currentVarNum] = True + return experiments + +def displayModelResults(results, model = None): + import statsmodels.api as sm + '''Displays some model results''' + print results.summary() + print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid))) + if model is not None: + plt.figure() + plt.plot(results.predict(), model.endog, 'x') + x=plt.xlim() + y=plt.ylim() + plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r') + plt.title('true vs predicted') + plt.figure() + plt.plot(results.predict(), results.resid, 'x') + plt.title('residuals vs predicted') + sm.qqplot(results.resid, fit = True, line = '45') + + ######################### # iterable section #########################