repo/traffic-intelligence: python/ml.py comparison

comparison python/ml.py @ 184:d70e9b36889c

initial work on flow vectors and clustering algorithms

author	Nicolas Saunier <nicolas.saunier@polymtl.ca>
date	Fri, 25 Nov 2011 18:38:54 -0500
parents	ed944ff45e8c
children	5957aa1d69e1 8bafd054cda4

comparison

equal deleted inserted replaced

-:ed944ff45e8c
+:d70e9b36889c
 #! /usr/bin/env python
 '''Libraries for machine learning algorithms'''
 __metaclass__ = type
-def kMeansFixedDistance(data, sameCluster, centroid):
+class Centroid:
+'Wrapper around instances to add a counter'
+def __init__(self, instance, nInstances = 1):
+self.instance = instance
+self.nInstances = nInstances
+# def similar(instance2):
+#     return self.instance.similar(instance2)
+def add(self, instance2):
+self.instance = self.instance.multiply(self.nInstances)+instance2
+self.nInstances += 1
+self.instance = self.instance.multiply(1/float(self.nInstances))
+def average(c):
+inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances)
+inst.multiply(1/(self.nInstances+instance.nInstances))
+return Centroid(inst, self.nInstances+instance.nInstances)
+def draw(self, options = ''):
+from matplotlib.pylab import text
+self.instance.draw(options)
+text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances))
+def clustering(data, similar, initialCentroids = []):
 '''k-means algorithm with similarity function
-Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the centroid of a set of instances can be computed, using the function.
+Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function.
 The number of clusters will be determined accordingly
 data: list of instances
-centroid: '''
+averageCentroid: '''
-# todo randomize input
+from random import shuffle
-centroids = [data[0]]
+from copy import copy, deepcopy
-for instance in data:
+localdata = copy(data) # shallow copy to avoid modifying data
+shuffle(localdata)
+if initialCentroids:
+centroids = deepcopy(initialCentroids)
+else:
+centroids = [Centroid(localdata[0])]
+for instance in localdata[1:]:
 i = 0
-while i<len(centroids) and not sameCluster(instance, centroids[i]):
+while i<len(centroids) and not similar(centroids[i].instance, instance):
 i += 1
 if i == len(centroids):
-centroids.append(instance)
+centroids.append(Centroid(instance))
 else:
-centroids[i] = centroid(centroids[i], instance)
+centroids[i].add(instance)
 return centroids

Mercurial Hosting > traffic-intelligence

comparison python/ml.py @ 184:d70e9b36889c