comparison python/utils.py @ 76:64fde2b1f96d

simplified intervals in empiricalDistribution
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Thu, 10 Feb 2011 22:15:54 -0500
parents 46ec876ce90e
children 5e6cd36a991c
comparison
equal deleted inserted replaced
75:46ec876ce90e 76:64fde2b1f96d
23 result += ((e-o)*(e-o))/e 23 result += ((e-o)*(e-o))/e
24 return result 24 return result
25 25
26 class empiricalDistribution: 26 class empiricalDistribution:
27 '''Class to represent a sample of a distribution for a continuous random variable 27 '''Class to represent a sample of a distribution for a continuous random variable
28 with the number of observations for each interval''' 28 with the number of observations for each interval
29 intervals (the categories variable) are defined by their left limits, the last element being the right limit of the last interval
30 categories therefore contains one more element than counts'''
29 def __init__(self, categories, counts): 31 def __init__(self, categories, counts):
30 self.categories = categories 32 self.categories = categories
31 self.counts = counts 33 self.counts = counts
32 34
33 def mean(self): 35 def mean(self):
34 result = 0. 36 result = 0.
35 for i,c in zip(self.categories, self.counts): 37 for i in range(len(self.counts)-1):
36 result += c*(i[1]+i[0])/2 38 result += self.counts[i]*(self.categories[i]+self.categories[i+1])/2
37 return result/sum(self.counts) 39 return result/self.nSamples()
38 40
39 def var(self, mean = None): 41 def var(self, mean = None):
40 if not mean: 42 if not mean:
41 m = self.mean() 43 m = self.mean()
42 else: 44 else:
43 m = mean 45 m = mean
44 result = 0. 46 result = 0.
45 for i,c in zip(self.categories, self.counts): 47 for i in range(len(self.counts)-1):
46 mid = (i[1]+i[0])/2 48 mid = (self.categories[i]+self.categories[i+1])/2
47 result += c*(mid - m)*(mid - m) 49 result += self.counts[i]*(mid - m)*(mid - m)
48 return result/(self.nSamples()-1) 50 return result/(self.nSamples()-1)
49 51
50 def nSamples(self): 52 def nSamples(self):
51 return sum(self.counts) 53 return sum(self.counts)
52 54
54 '''cdf is a cumulative distribution function 56 '''cdf is a cumulative distribution function
55 returning the probability of the variable being less than x''' 57 returning the probability of the variable being less than x'''
56 # refCumulativeCounts = [0]#[cdf(self.categories[0][0])] 58 # refCumulativeCounts = [0]#[cdf(self.categories[0][0])]
57 # for inter in self.categories: 59 # for inter in self.categories:
58 # refCumulativeCounts.append(cdf(inter[1])) 60 # refCumulativeCounts.append(cdf(inter[1]))
59 refCumulativeCounts = [cdf(inter[1]) for inter in self.categories[:-1]] 61 refCumulativeCounts = [cdf(x) for x in self.categories[1:-1]]
60 62
61 refProba = [refCumulativeCounts[0]] 63 refProba = [refCumulativeCounts[0]]
62 for i in xrange(1,len(refCumulativeCounts)): 64 for i in xrange(1,len(refCumulativeCounts)):
63 refProba.append(refCumulativeCounts[i]-refCumulativeCounts[i-1]) 65 refProba.append(refCumulativeCounts[i]-refCumulativeCounts[i-1])
64 refProba.append(1-refCumulativeCounts[-1]) 66 refProba.append(1-refCumulativeCounts[-1])