Mercurial Hosting > traffic-intelligence
comparison python/ml.py @ 184:d70e9b36889c
initial work on flow vectors and clustering algorithms
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Fri, 25 Nov 2011 18:38:54 -0500 |
parents | ed944ff45e8c |
children | 5957aa1d69e1 8bafd054cda4 |
comparison
legend: equal, deleted, inserted, replaced
183:ed944ff45e8c | 184:d70e9b36889c |
---|---|
1 #! /usr/bin/env python | 1 #! /usr/bin/env python |
2 '''Libraries for machine learning algorithms''' | 2 '''Libraries for machine learning algorithms''' |
3 | 3 |
4 __metaclass__ = type | 4 __metaclass__ = type |
5 | 5 |
6 def kMeansFixedDistance(data, sameCluster, centroid): | 6 class Centroid: |
| | 7 'Wrapper around instances to add a counter' |
| | 8 |
| | 9 def __init__(self, instance, nInstances = 1): |
| | 10 self.instance = instance |
| | 11 self.nInstances = nInstances |
| | 12 |
| | 13 # def similar(instance2): |
| | 14 # return self.instance.similar(instance2) |
| | 15 |
| | 16 def add(self, instance2): |
| | 17 self.instance = self.instance.multiply(self.nInstances)+instance2 |
| | 18 self.nInstances += 1 |
| | 19 self.instance = self.instance.multiply(1/float(self.nInstances)) |
| | 20 |
| | 21 def average(c): |
| | 22 inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances) |
| | 23 inst.multiply(1/(self.nInstances+instance.nInstances)) |
| | 24 return Centroid(inst, self.nInstances+instance.nInstances) |
| | 25 |
| | 26 def draw(self, options = ''): |
| | 27 from matplotlib.pylab import text |
| | 28 self.instance.draw(options) |
| | 29 text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances)) |
| | 30 |
| | 31 |
| | 32 def clustering(data, similar, initialCentroids = []): |
7 '''k-means algorithm with similarity function | 33 '''k-means algorithm with similarity function |
8 Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the centroid of a set of instances can be computed, using the function. | 34 Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function. |
9 The number of clusters will be determined accordingly | 35 The number of clusters will be determined accordingly |
10 | 36 |
11 data: list of instances | 37 data: list of instances |
12 centroid: ''' | 38 averageCentroid: ''' |
13 | 39 |
14 # todo randomize input | 40 from random import shuffle |
15 centroids = [data[0]] | 41 from copy import copy, deepcopy |
16 for instance in data: | 42 localdata = copy(data) # shallow copy to avoid modifying data |
| | 43 shuffle(localdata) |
| | 44 if initialCentroids: |
| | 45 centroids = deepcopy(initialCentroids) |
| | 46 else: |
| | 47 centroids = [Centroid(localdata[0])] |
| | 48 for instance in localdata[1:]: |
17 i = 0 | 49 i = 0 |
18 while i<len(centroids) and not sameCluster(instance, centroids[i]): | 50 while i<len(centroids) and not similar(centroids[i].instance, instance): |
19 i += 1 | 51 i += 1 |
20 if i == len(centroids): | 52 if i == len(centroids): |
21 centroids.append(instance) | 53 centroids.append(Centroid(instance)) |
22 else: | 54 else: |
23 centroids[i] = centroid(centroids[i], instance) | 55 centroids[i].add(instance) |
24 | 56 |
25 return centroids | 57 return centroids |