comparison python/utils.py @ 674:01b89182891a

corrected a bug in the intersection of lines (thanks to Paul for finding it)
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Tue, 26 May 2015 18:16:51 +0200
parents 5473b7460375
children ab3fdff42624
comparison
equal deleted inserted replaced
673:5505f9dbb28e 674:01b89182891a
316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') 316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
317 data[newVariable] = (data[var] == val) 317 data[newVariable] = (data[var] == val)
318 newVariables.append(newVariable) 318 newVariables.append(newVariable)
319 return newVariables 319 return newVariables
320 320
321 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, figureFilenamePrefix = None, figureFileType = 'pdf'): 321 def frenchify(s, displayNames):
322 return s
323
324 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False):
322 '''Studies the influence of (nominal) independent variable over the dependent variable 325 '''Studies the influence of (nominal) independent variable over the dependent variable
323 326
324 Makes tests if the conditional distributions are normal 327 Makes tests if the conditional distributions are normal
325 using the Shapiro-Wilk test (in which case ANOVA could be used) 328 using the Shapiro-Wilk test (in which case ANOVA could be used)
326 Implements uses the non-parametric Kruskal Wallis test''' 329 Implements uses the non-parametric Kruskal Wallis test'''
327 tmp = data[data[independentVariable].notnull()] 330 tmp = data[data[independentVariable].notnull()]
328 independentVariableValues = sorted(tmp[independentVariable].unique().tolist()) 331 independentVariableValues = sorted(tmp[independentVariable].unique().tolist())
329 if len(independentVariableValues) >= 2: 332 if len(independentVariableValues) >= 2:
333 if saveLatex:
334 from storage import openCheck
335 out = openCheck(filenamePrefix+'-{}-{}.tex'.format(dependentVariable, independentVariable), 'w')
330 for x in independentVariableValues: 336 for x in independentVariableValues:
331 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable]))) 337 print('Shapiro-Wilk normality test for {} when {}={}: {} obs'.format(dependentVariable,independentVariable, x, len(tmp.loc[tmp[independentVariable] == x, dependentVariable])))
332 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3: 338 if len(tmp.loc[tmp[independentVariable] == x, dependentVariable]) >= 3:
333 print shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable]) 339 print shapiro(tmp.loc[tmp[independentVariable] == x, dependentVariable])
334 if plotFigure: 340 if plotFigure:
336 plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) 342 plt.boxplot([tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
337 #q25, q75 = tmp[dependentVariable].quantile([.25, .75]) 343 #q25, q75 = tmp[dependentVariable].quantile([.25, .75])
338 #plt.ylim(ymax = q75+1.5*(q75-q25)) 344 #plt.ylim(ymax = q75+1.5*(q75-q25))
339 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) 345 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues)
340 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) 346 plt.title('{} vs {}'.format(dependentVariable, independentVariable))
341 if figureFilenamePrefix is not None: 347 if filenamePrefix is not None:
342 plt.savefig(figureFilenamePrefix+'{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) 348 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
343 #else: 349 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False)
344 # TODO formatter le tableau (html?) 350 if saveLatex:
345 print tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) 351 out.write('\begin{table}[htp!]')
352 out.write(frenchify(table.to_latex(), displayNames))
353 out.write('\end{table}[htp!]')
354 else:
355 print table
346 return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) 356 return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
347 else: 357 else:
348 return None 358 return None
349 359
350 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): 360 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
442 from pandas import concat 452 from pandas import concat
443 from multiprocessing import Pool 453 from multiprocessing import Pool
444 experiments = generateExperiments(independentVariables) 454 experiments = generateExperiments(independentVariables)
445 nModels = len(experiments) 455 nModels = len(experiments)
446 print("Running {} models with {} processes".format(nModels, nProcesses)) 456 print("Running {} models with {} processes".format(nModels, nProcesses))
457 print("IndependentVariables: {}".format(independentVariables))
447 if nProcesses == 1: 458 if nProcesses == 1:
448 return runModels(experiments, data, dependentVariable, independentVariables, regressionType) 459 return runModels(experiments, data, dependentVariable, independentVariables, regressionType)
449 else: 460 else:
450 pool = Pool(processes = nProcesses) 461 pool = Pool(processes = nProcesses)
451 chunkSize = int(ceil(nModels/nProcesses)) 462 chunkSize = int(ceil(nModels/nProcesses))