comparison python/ml.py @ 184:d70e9b36889c

initial work on flow vectors and clustering algorithms
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Fri, 25 Nov 2011 18:38:54 -0500
parents ed944ff45e8c
children 5957aa1d69e1 8bafd054cda4
comparison
equal deleted inserted replaced
183:ed944ff45e8c 184:d70e9b36889c
1 #! /usr/bin/env python 1 #! /usr/bin/env python
2 '''Libraries for machine learning algorithms''' 2 '''Libraries for machine learning algorithms'''
3 3
4 __metaclass__ = type 4 __metaclass__ = type
5 5
class Centroid:
    '''Wrapper around an instance that adds a counter of the instances it averages

    The wrapped instance is expected to provide multiply(scalar), whose result
    is used as a new instance, and to support + with another instance
    (both are relied upon by add and average).'''

    def __init__(self, instance, nInstances = 1):
        self.instance = instance
        self.nInstances = nInstances

    def add(self, instance2):
        '''Updates the centroid in place so that it becomes the mean
        of the instances it already represents and instance2'''
        # go back to the sum of the instances, add the new one, then re-normalize
        total = self.instance.multiply(self.nInstances)+instance2
        self.nInstances += 1
        self.instance = total.multiply(1/float(self.nInstances))

    def average(self, c):
        '''Returns a new Centroid that is the average of this centroid and
        centroid c, each weighted by its number of instances

        Only the results of multiply and + are used, so self and c are left
        untouched as long as these operations do not modify their operands.'''
        nInstances = self.nInstances+c.nInstances
        total = self.instance.multiply(self.nInstances)+c.instance.multiply(c.nInstances)
        # float() avoids integer division, and the result of multiply must be kept
        return Centroid(total.multiply(1/float(nInstances)), nInstances)

    def draw(self, options = ''):
        '''Draws the wrapped instance and writes the number of instances next to it'''
        # imported here so that matplotlib is required only when drawing
        from matplotlib.pylab import text
        self.instance.draw(options)
        text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances))
31
def clustering(data, similar, initialCentroids = None):
    '''k-means-like incremental clustering driven by a similarity function

    The instances are visited in random order. Each instance is added to the
    first centroid it is similar to; if there is none, it starts a new cluster.
    The number of clusters is therefore not chosen in advance.

    data: sequence of instances (it is not modified)
    similar: function called as similar(centroid.instance, instance),
    returning true if the instance belongs to the cluster of that centroid
    initialCentroids: optional list of centroids to start from
    (they are copied, the originals are not modified)

    Returns the list of centroids (empty if there is no data and no initial centroid).'''
    from random import shuffle
    from copy import deepcopy

    localdata = list(data) # shallow copy so that shuffling does not reorder the caller's data
    shuffle(localdata)
    if initialCentroids:
        # every instance must be clustered: none is consumed as a seed
        centroids = deepcopy(initialCentroids)
    elif localdata:
        # no starting point: the first (random) instance seeds the first cluster
        centroids = [Centroid(localdata.pop(0))]
    else:
        return []
    for instance in localdata:
        for centroid in centroids:
            if similar(centroid.instance, instance):
                centroid.add(instance)
                break
        else:
            # no existing centroid is similar to the instance
            centroids.append(Centroid(instance))

    return centroids