diff python/utils.py @ 667:179b81faa1f8

added regression analysis functions
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Mon, 25 May 2015 13:54:29 +0200
parents 15e244d2a1b5
children f8dcf483b296
line wrap: on
line diff
--- a/python/utils.py	Wed May 20 14:21:54 2015 +0200
+++ b/python/utils.py	Mon May 25 13:54:29 2015 +0200
@@ -287,6 +287,127 @@
         plot(xx, [poly(z) for z in xx])
     return coef
 
+
+#########################
+# regression analysis using statsmodels (and pandas)
+#########################
+
+# TODO make class for experiments?
def modelString(experiment, dependentVariable, independentVariables):
    '''Builds a statsmodels/patsy formula string "dep ~ var1 + var2 + ..."
    keeping only the independent variables whose flag is truthy in experiment'''
    selected = [variable for variable in independentVariables if experiment[variable]]
    return '{} ~ {}'.format(dependentVariable, ' + '.join(selected))
+
def runModel(experiment, data, dependentVariable, independentVariables, regressionType = 'ols'):
    '''Fits one regression model for the variables selected in experiment
    using statsmodels formula API ('ols', 'gls' or 'rlm')
    and returns the fitted results object
    Exits the process on an unknown regressionType'''
    import statsmodels.formula.api as smf
    formula = modelString(experiment, dependentVariable, independentVariables)
    if regressionType not in ('ols', 'gls', 'rlm'):
        print('Unknown regression type {}. Exiting'.format(regressionType))
        import sys
        sys.exit()
    # smf.ols / smf.gls / smf.rlm all share the (formula, data) signature
    fitter = getattr(smf, regressionType)
    return fitter(formula, data = data).fit()
+
def runModels(experiments, data, dependentVariable, independentVariables, regressionType = 'ols'):
    '''Runs the model described by each row of experiments
    (skipping rows where no independent variable is selected)
    and stores 3 statistics in the row:
    adjusted R2, condition number (should be small, eg < 1000)
    and p-value for Shapiro-Wilk test of residual normality

    Returns the experiments DataFrame with the statistics filled in'''
    for i, experiment in experiments.iterrows():
        # the empty model (no variable selected) cannot be fitted
        if experiment[independentVariables].any():
            # fix: forward the requested regressionType instead of the
            # hardcoded 'ols' that silently ignored the parameter
            results = runModel(experiment, data, dependentVariable, independentVariables, regressionType = regressionType)
            experiments.loc[i, 'r2adj'] = results.rsquared_adj
            experiments.loc[i, 'condNum'] = results.condition_number
            experiments.loc[i, 'shapiroP'] = shapiro(results.resid)[1]
    return experiments
+
def generateExperiments(independentVariables):
    '''Generates all 2^n possible models (one DataFrame row per model)
    for including or not each of the n independent variables

    A row's index, read as a binary number over the variables, encodes
    which variables are included; the statistic columns are initialized
    to sentinel values and filled in later (eg by runModels)
    Exits the process if independentVariables contains duplicates'''
    nIndependentVariables = len(independentVariables)
    # fix: a plain set suffices for the duplicate check (no need for np.unique)
    if nIndependentVariables != len(set(independentVariables)):
        print("Duplicate variables. Exiting")
        import sys
        sys.exit()
    experiments = {}
    for i, var in enumerate(independentVariables):
        # variable i toggles every 2^i rows: 2^i False then 2^i True, repeated
        pattern = [False]*(2**i)+[True]*(2**i)
        experiments[var] = pattern*(2**(nIndependentVariables-i-1))
    experiments = pd.DataFrame(experiments)
    experiments['r2adj'] = 0.
    experiments['condNum'] = np.nan
    experiments['shapiroP'] = -1  # negative is the "not yet run" sentinel
    return experiments
+
def findBestModel(data, dependentVariable, independentVariables, regressionType = 'ols', nProcesses = 1):
    '''Generates all possible models with the independentVariables
    and runs them (see runModels), returning the experiments DataFrame
    with the fit statistics, optionally splitting the work
    over several processes'''
    experiments = generateExperiments(independentVariables)
    nModels = len(experiments)
    print("Running {} models with {} processes".format(nModels, nProcesses))
    if nProcesses == 1:
        return runModels(experiments, data, dependentVariable, independentVariables, regressionType)
    else:
        # fix: int(np.ceil(nModels/nProcesses)) truncated under Python 2
        # integer division and could drop the tail of the experiments;
        # this ceiling division is exact on both Python 2 and 3
        chunkSize = -(-nModels // nProcesses)
        pool = Pool(processes = nProcesses)
        try:
            jobs = [pool.apply_async(runModels, args = (experiments[i*chunkSize:(i+1)*chunkSize], data, dependentVariable, independentVariables, regressionType)) for i in range(nProcesses)]
            return pd.concat([job.get() for job in jobs])
        finally:
            # fix: the pool was never shut down, leaking worker processes
            pool.close()
            pool.join()
+
def findBestModelFwd(data, dependentVariable, independentVariables, modelFunc, experiments = None):
    '''Forward search for best model (based on adjusted R2)
    Randomly starting with one variable and adding randomly variables 
    if they improve the model
    
    The results are added to experiments if provided as argument
    Storing in experiment relies on the index being the number equal 
    to the binary code derived from the independent variables'''
    if experiments is None:
        experiments = generateExperiments(independentVariables)
    nIndependentVariables = len(independentVariables)
    # random order in which the variables are tried, as positions 0..n-1
    permutation = np.random.permutation(range(nIndependentVariables)).tolist()
    # variableMapping[permutation[i]] is independentVariables[i]
    variableMapping = {j: independentVariables[i] for i,j in enumerate(permutation)}
    print('Tested variables '+', '.join([variableMapping[i] for i in xrange(nIndependentVariables)]))
    bestModel = [False]*nIndependentVariables  # inclusion flags, in trial order
    currentVarNum = 0
    currentR2Adj = 0.  # best adjusted R2 seen so far
    for currentVarNum in xrange(nIndependentVariables):
        # tentatively add variable currentVarNum to the best model so far
        currentModel = [i for i in bestModel]
        currentModel[currentVarNum] = True
        # row index in experiments: binary encoding of the selected variables
        # (bit i set when variable i is included, matching generateExperiments)
        # NOTE(review): currentModel is indexed through permutation here while
        # it is written at currentVarNum above — verify the trial-order vs
        # variable-order bookkeeping is consistent with variableMapping
        rowIdx = sum([0]+[2**i for i in xrange(nIndependentVariables) if currentModel[permutation[i]]])
        #print currentVarNum, sum(currentModel), ', '.join([independentVariables[i] for i in xrange(nIndependentVariables) if currentModel[permutation[i]]])
        if experiments.loc[rowIdx, 'shapiroP'] < 0:
            # negative shapiroP is the "not yet run" sentinel: fit this model
            # once and cache its statistics in the experiments row
            modelStr = modelString(experiments.loc[rowIdx], dependentVariable, independentVariables)
            model = modelFunc(modelStr, data = data)
            results = model.fit()
            experiments.loc[rowIdx, 'r2adj'] = results.rsquared_adj
            experiments.loc[rowIdx, 'condNum'] = results.condition_number
            experiments.loc[rowIdx, 'shapiroP'] = shapiro(results.resid)[1]
        # keep the tentative variable only if it improves adjusted R2
        if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
            currentR2Adj = experiments.loc[rowIdx, 'r2adj']
            bestModel[currentVarNum] = True
    return experiments
+
def displayModelResults(results, model = None):
    '''Displays some model results: the textual summary, the Shapiro-Wilk
    normality test of the residuals, and diagnostic plots
    (true vs predicted when model is provided, residuals vs predicted,
    and a QQ-plot of the residuals)'''
    # fix: the docstring was placed after this import, so it was not
    # actually a docstring; it must be the first statement
    import statsmodels.api as sm
    # fix: Python 2 print statement -> print() call, consistent with
    # the print() style used elsewhere in these functions
    print(results.summary())
    print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid)))
    if model is not None:
        plt.figure()
        plt.plot(results.predict(), model.endog, 'x')
        x = plt.xlim()
        y = plt.ylim()
        # identity line over the range common to both axes
        lo = max(x[0], y[0])
        hi = min(x[1], y[1])
        plt.plot([lo, hi], [lo, hi], 'r')
        plt.title('true vs predicted')
    plt.figure()
    plt.plot(results.predict(), results.resid, 'x')
    plt.title('residuals vs predicted')
    sm.qqplot(results.resid, fit = True, line = '45')
+
+
 #########################
 # iterable section
 #########################