comparison python/utils.py @ 676:58b9ac2f262f

fine tuning
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Wed, 27 May 2015 04:08:19 +0200
parents ab3fdff42624
children ae07c7b4cf87
comparison
equal deleted inserted replaced
675:ab3fdff42624 676:58b9ac2f262f
316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') 316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','')
317 data[newVariable] = (data[var] == val) 317 data[newVariable] = (data[var] == val)
318 newVariables.append(newVariable) 318 newVariables.append(newVariable)
319 return newVariables 319 return newVariables
320 320
321 def frenchify(s, displayNames): 321 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, translate = lambda s: s, kwCaption = u''):
322 return s
323
324 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, displayNames = {}):
325 '''Studies the influence of (nominal) independent variable over the dependent variable 322 '''Studies the influence of (nominal) independent variable over the dependent variable
326 323
327 Makes tests if the conditional distributions are normal 324 Makes tests if the conditional distributions are normal
328 using the Shapiro-Wilk test (in which case ANOVA could be used) 325 using the Shapiro-Wilk test (in which case ANOVA could be used)
329 Implements uses the non-parametric Kruskal Wallis test''' 326 Implements uses the non-parametric Kruskal Wallis test'''
345 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) 342 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues)
346 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) 343 plt.title('{} vs {}'.format(dependentVariable, independentVariable))
347 if filenamePrefix is not None: 344 if filenamePrefix is not None:
348 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) 345 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType))
349 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) 346 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False)
347 table['count'] = table['count'].astype(int)
348 #table.index.rename(translate(table.index.name), inplace = True)
349 testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues])
350 if saveLatex: 350 if saveLatex:
351 out.write('\\begin{table}[htp!]\n') 351 out.write(translate('\\begin{minipage}{\\linewidth}\n'
352 out.write(frenchify(table.to_latex(), displayNames)) 352 +'\\centering\n'
353 out.write('\caption{Test}\n' 353 +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n'
354 +'\end{table}[htp!]') 354 +table.to_latex(float_format = lambda x: '{:.2f}'.format(x)).encode('ascii')+'\n'
355 +'\\end{minipage}\n'
356 +'\\vspace{0.5cm}\n'))
355 else: 357 else:
356 print table 358 print table
357 return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) 359 return testResult
358 else: 360 else:
359 return None 361 return None
360 362
361 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): 363 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc):
362 '''Removes variables from candidate independent variables if 364 '''Removes variables from candidate independent variables if
498 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: 500 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']:
499 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] 501 currentR2Adj = experiments.loc[rowIdx, 'r2adj']
500 bestModel[currentVarNum] = True 502 bestModel[currentVarNum] = True
501 return experiments 503 return experiments
502 504
503 def displayModelResults(results, model = None): 505 def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf'):
504 import statsmodels.api as sm 506 import statsmodels.api as sm
505 '''Displays some model results''' 507 '''Displays some model results'''
506 print results.summary() 508 print(results.summary())
507 print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid))) 509 print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid)))
508 if model is not None: 510 if plotFigures:
511 if model is not None:
512 plt.figure()
513 plt.plot(results.predict(), model.endog, 'x')
514 x=plt.xlim()
515 y=plt.ylim()
516 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r')
517 plt.title('true vs predicted')
518 if filenamePrefix is not None:
519 plt.savefig(filenamePrefix+'-true-predicted.'+figureFileType)
509 plt.figure() 520 plt.figure()
510 plt.plot(results.predict(), model.endog, 'x') 521 plt.plot(results.predict(), results.resid, 'x')
511 x=plt.xlim() 522 if filenamePrefix is not None:
512 y=plt.ylim() 523 plt.savefig(filenamePrefix+'-residuals.'+figureFileType)
513 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r') 524 plt.title('residuals vs predicted')
514 plt.title('true vs predicted') 525 sm.qqplot(results.resid, fit = True, line = '45')
515 plt.figure() 526 if filenamePrefix is not None:
516 plt.plot(results.predict(), results.resid, 'x') 527 plt.savefig(filenamePrefix+'-qq.'+figureFileType)
517 plt.title('residuals vs predicted')
518 sm.qqplot(results.resid, fit = True, line = '45')
519 528
520 529
521 ######################### 530 #########################
522 # iterable section 531 # iterable section
523 ######################### 532 #########################