repo/traffic-intelligence: python/utils.py comparison

comparison python/utils.py @ 674:01b89182891a

corrected bug for intersection of lines (thanks to Paul for finding)

author	Nicolas Saunier <nicolas.saunier@polymtl.ca>
date	Tue, 26 May 2015 18:16:51 +0200
parents	5473b7460375
children	ab3fdff42624

comparison

equal deleted inserted replaced

-:5505f9dbb28e
+:01b89182891a
 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
 data[newVariable] = (data[var] == val)
 newVariables.append(newVariable)
 return newVariables
-def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'):
def frenchify(s, displayNames):
    '''Placeholder hook applied to text before it is written out

    Currently returns s unchanged; displayNames is accepted but not used'''
    return s
def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False):
    '''Studies the influence of a (nominal) independent variable over the dependent variable

    Rows where the independent variable is null are ignored.
    For each value of the independent variable, prints the number of observations
    and, if there are at least 3, the result of the Shapiro-Wilk normality test
    (if the conditional distributions are normal, ANOVA could be used).
    The test actually performed is the non-parametric Kruskal-Wallis test.

    plotFigure: if True, draws a boxplot of the dependent variable per value of the
    independent variable; it is saved as
    filenamePrefix-dependentVariable-independentVariable.figureFileType
    if filenamePrefix is not None
    saveLatex: if True, the table of descriptive statistics is written to
    filenamePrefix-dependentVariable-independentVariable.tex instead of being printed
    (filenamePrefix is concatenated to the file name, so it must be a string in that case)

    Returns the result of kruskal over the groups,
    or None if the independent variable has fewer than 2 distinct non-null values'''
    tmp = data[data[independentVariable].notnull()]
    independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
    if len(independentVariableValues) < 2:
        return None

    # one sample of the dependent variable per value of the independent variable,
    # in the same order as independentVariableValues (selected once, reused below)
    samples = [tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]

    for x, sample in zip(independentVariableValues, samples):
        print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(sample)))
        if len(sample) >= 3: # the normality test is only run on samples of at least 3 observations
            print(shapiro(sample))

    if plotFigure:
        plt.boxplot(samples)
        #q25, q75 = tmp[dependentVariable].quantile([.25, .75])
        #plt.ylim(ymax = q75+1.5*(q75-q25))
        plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues)
        plt.title('{} vs {}'.format(dependentVariable, independentVariable))
        if filenamePrefix is not None:
            plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))

    # descriptive statistics per group, sorted by decreasing median
    table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False)
    if saveLatex:
        from storage import openCheck
        # NOTE(review): assumes openCheck returns an open, writable file object - confirm
        out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w')
        try:
            # backslashes are escaped: in a plain string literal, '\b' is a backspace character
            out.write('\\begin{table}[htp!]\n')
            # NOTE(review): displayNames is not defined in this function,
            # presumably a module-level name - confirm
            out.write(frenchify(table.to_latex(), displayNames))
            # \end takes no placement options in LaTeX
            out.write('\\end{table}\n')
        finally:
            out.close()
    else:
        print(table)
    return kruskal(*samples)
 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
 from pandas import concat
 from multiprocessing import Pool
 experiments = generateExperiments(independentVariables)
 nModels = len(experiments)
 print("Running {} models with {} processes".format(nModels, nProcesses))
+print("IndependentVariables: {}".format(independentVariables))
 if nProcesses == 1:
 return runModels(experiments, data, dependentVariable, independentVariables, regressionType)
 else:
 pool = Pool(processes = nProcesses)
 chunkSize = int(ceil(nModels/nProcesses))

Mercurial Hosting > traffic-intelligence

comparison python/utils.py @ 674:01b89182891a