Mercurial Hosting > traffic-intelligence
comparison python/utils.py @ 674:01b89182891a
corrected bug for intersection of lines (thanks to Paul for finding)
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Tue, 26 May 2015 18:16:51 +0200 |
parents | 5473b7460375 |
children | ab3fdff42624 |
comparison
equal
deleted
inserted
replaced
673:5505f9dbb28e | 674:01b89182891a |
---|---|
316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') | 316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') |
317 data[newVariable] = (data[var] == val) | 317 data[newVariable] = (data[var] == val) |
318 newVariables.append(newVariable) | 318 newVariables.append(newVariable) |
319 return newVariables | 319 return newVariables |
320 | 320 |
321 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'): | 321 def frenchify(s, displayNames): |
| | 322 return s |
| | 323 |
| | 324 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False): |
322 '''Studies the influence of (nominal) independent variable over the dependent variable | 325 '''Studies the influence of (nominal) independent variable over the dependent variable |
323 | 326 |
324 Makes tests if the conditional distributions are normal | 327 Makes tests if the conditional distributions are normal |
325 using the Shapiro-Wilk test (in which case ANOVA could be used) | 328 using the Shapiro-Wilk test (in which case ANOVA could be used) |
326 Implements uses the non-parametric Kruskal Wallis test''' | 329 Implements uses the non-parametric Kruskal Wallis test''' |
327 tmp = data[data[independentVariable].notnull()] | 330 tmp = data[data[independentVariable].notnull()] |
328 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) | 331 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) |
329 if len(independentVariableValues) >= 2: | 332 if len(independentVariableValues) >= 2: |
| | 333 if saveLatex: |
| | 334 from storage import openCheck |
| | 335 out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w') |
330 for x in independentVariableValues: | 336 for x in independentVariableValues: |
331 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) | 337 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) |
332 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: | 338 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: |
333 print shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]) | 339 print shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]) |
334 if plotFigure: | 340 if plotFigure: |
336 plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | 342 plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) |
337 #q25, q75 = tmp[dependentVariable].quantile([.25, .75]) | 343 #q25, q75 = tmp[dependentVariable].quantile([.25, .75]) |
338 #plt.ylim(ymax = q75+1.5*(q75-q25)) | 344 #plt.ylim(ymax = q75+1.5*(q75-q25)) |
339 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) | 345 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) |
340 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) | 346 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) |
341 if figureFilenamePrefix is not None: | 347 if filenamePrefix is not None: |
342 plt.savefig(figureFilenamePrefix+'{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) | 348 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) |
343 #else: | 349 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) |
344 # TODO formatter le tableau (html?) | 350 if saveLatex: |
345 print tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) | 351 out.write('\begin{table}[htp!]') |
| | 352 out.write(frenchify(table.to_latex(), displayNames)) |
| | 353 out.write('\end{table}[htp!]') |
| | 354 else: |
| | 355 print table |
346 return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | 356 return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) |
347 else: | 357 else: |
348 return None | 358 return None |
349 | 359 |
350 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): | 360 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): |
442 from pandas import concat | 452 from pandas import concat |
443 from multiprocessing import Pool | 453 from multiprocessing import Pool |
444 experiments = generateExperiments(independentVariables) | 454 experiments = generateExperiments(independentVariables) |
445 nModels = len(experiments) | 455 nModels = len(experiments) |
446 print("Running {} models with {} processes".format(nModels, nProcesses)) | 456 print("Running {} models with {} processes".format(nModels, nProcesses)) |
| | 457 print("IndependentVariables: {}".format(independentVariables)) |
447 if nProcesses == 1: | 458 if nProcesses == 1: |
448 return runModels(experiments, data, dependentVariable, independentVariables, regressionType) | 459 return runModels(experiments, data, dependentVariable, independentVariables, regressionType) |
449 else: | 460 else: |
450 pool = Pool(processes = nProcesses) | 461 pool = Pool(processes = nProcesses) |
451 chunkSize = int(ceil(nModels/nProcesses)) | 462 chunkSize = int(ceil(nModels/nProcesses)) |