Mercurial Hosting > traffic-intelligence
comparison python/utils.py @ 676:58b9ac2f262f
fine tuning
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Wed, 27 May 2015 04:08:19 +0200 |
parents | ab3fdff42624 |
children | ae07c7b4cf87 |
comparison
equal
deleted
inserted
replaced
675:ab3fdff42624 | 676:58b9ac2f262f |
---|---|
316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') | 316 newVariable = (var+'_{}'.format(val)).replace('.','').replace(' ','').replace('-','') |
317 data[newVariable] = (data[var] == val) | 317 data[newVariable] = (data[var] == val) |
318 newVariables.append(newVariable) | 318 newVariables.append(newVariable) |
319 return newVariables | 319 return newVariables |
320 | 320 |
321 def frenchify(s, displayNames): | 321 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, translate = lambda s: s, kwCaption = u''): |
322 return s | |
323 | |
324 def kruskalWallis(data, dependentVariable, independentVariable, plotFigure = False, filenamePrefix = None, figureFileType = 'pdf', saveLatex = False, displayNames = {}): | |
325 '''Studies the influence of (nominal) independent variable over the dependent variable | 322 '''Studies the influence of (nominal) independent variable over the dependent variable |
326 | 323 |
327 Makes tests if the conditional distributions are normal | 324 Makes tests if the conditional distributions are normal |
328 using the Shapiro-Wilk test (in which case ANOVA could be used) | 325 using the Shapiro-Wilk test (in which case ANOVA could be used) |
329 Implements uses the non-parametric Kruskal Wallis test''' | 326 Implements uses the non-parametric Kruskal Wallis test''' |
345 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) | 342 plt.xticks(range(1,len(independentVariableValues)+1), independentVariableValues) |
346 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) | 343 plt.title('{} vs {}'.format(dependentVariable, independentVariable)) |
347 if filenamePrefix is not None: | 344 if filenamePrefix is not None: |
348 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) | 345 plt.savefig(filenamePrefix+'-{}-{}.{}'.format(dependentVariable, independentVariable, figureFileType)) |
349 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) | 346 table = tmp.groupby([independentVariable])[dependentVariable].describe().unstack().sort(['50%'], ascending = False) |
347 table['count'] = table['count'].astype(int) | |
348 #table.index.rename(translate(table.index.name), inplace = True) | |
349 testResult = kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | |
350 if saveLatex: | 350 if saveLatex: |
351 out.write('\\begin{table}[htp!]\n') | 351 out.write(translate('\\begin{minipage}{\\linewidth}\n' |
352 out.write(frenchify(table.to_latex(), displayNames)) | 352 +'\\centering\n' |
353 out.write('\caption{Test}\n' | 353 +'\\captionof{table}{'+(kwCaption.format(dependentVariable, independentVariable, *testResult))+'}\n' |
354 +'\end{table}[htp!]') | 354 +table.to_latex(float_format = lambda x: '{:.2f}'.format(x)).encode('ascii')+'\n' |
355 +'\\end{minipage}\n' | |
356 +'\\vspace{0.5cm}\n')) | |
355 else: | 357 else: |
356 print table | 358 print table |
357 return kruskal(*[tmp.loc[tmp[independentVariable] == x, dependentVariable] for x in independentVariableValues]) | 359 return testResult |
358 else: | 360 else: |
359 return None | 361 return None |
360 | 362 |
361 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): | 363 def prepareRegression(data, dependentVariable, independentVariables, maxCorrelationThreshold, correlations, maxCorrelationP, correlationFunc): |
362 '''Removes variables from candidate independent variables if | 364 '''Removes variables from candidate independent variables if |
498 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: | 500 if currentR2Adj < experiments.loc[rowIdx, 'r2adj']: |
499 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] | 501 currentR2Adj = experiments.loc[rowIdx, 'r2adj'] |
500 bestModel[currentVarNum] = True | 502 bestModel[currentVarNum] = True |
501 return experiments | 503 return experiments |
502 | 504 |
503 def displayModelResults(results, model = None): | 505 def displayModelResults(results, model = None, plotFigures = True, filenamePrefix = None, figureFileType = 'pdf'): |
504 import statsmodels.api as sm | 506 import statsmodels.api as sm |
505 '''Displays some model results''' | 507 '''Displays some model results''' |
506 print results.summary() | 508 print(results.summary()) |
507 print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid))) | 509 print('Shapiro-Wilk normality test for residuals: {}'.format(shapiro(results.resid))) |
508 if model is not None: | 510 if plotFigures: |
511 if model is not None: | |
512 plt.figure() | |
513 plt.plot(results.predict(), model.endog, 'x') | |
514 x=plt.xlim() | |
515 y=plt.ylim() | |
516 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r') | |
517 plt.title('true vs predicted') | |
518 if filenamePrefix is not None: | |
519 plt.savefig(filenamePrefix+'-true-predicted.'+figureFileType) | |
509 plt.figure() | 520 plt.figure() |
510 plt.plot(results.predict(), model.endog, 'x') | 521 plt.plot(results.predict(), results.resid, 'x') |
511 x=plt.xlim() | 522 if filenamePrefix is not None: |
512 y=plt.ylim() | 523 plt.savefig(filenamePrefix+'-residuals.'+figureFileType) |
513 plt.plot([max(x[0], y[0]), min(x[1], y[1])], [max(x[0], y[0]), min(x[1], y[1])], 'r') | 524 plt.title('residuals vs predicted') |
514 plt.title('true vs predicted') | 525 sm.qqplot(results.resid, fit = True, line = '45') |
515 plt.figure() | 526 if filenamePrefix is not None: |
516 plt.plot(results.predict(), results.resid, 'x') | 527 plt.savefig(filenamePrefix+'-qq.'+figureFileType) |
517 plt.title('residuals vs predicted') | |
518 sm.qqplot(results.resid, fit = True, line = '45') | |
519 | 528 |
520 | 529 |
521 ######################### | 530 ######################### |
522 # iterable section | 531 # iterable section |
523 ######################### | 532 ######################### |