annotate python/ml.py @ 293:ee3302528cdc

rearranged new code by Paul (works now)
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Fri, 08 Feb 2013 18:13:29 -0500
parents 5957aa1d69e1
children 6c068047edbf
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
1 #! /usr/bin/env python
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
2 '''Libraries for machine learning algorithms'''
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
3
285
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
4 import numpy as np
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
5
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
6 __metaclass__ = type
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
7
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
8 class Centroid:
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
9 'Wrapper around instances to add a counter'
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
10
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
11 def __init__(self, instance, nInstances = 1):
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
12 self.instance = instance
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
13 self.nInstances = nInstances
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
14
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
15 # def similar(instance2):
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
16 # return self.instance.similar(instance2)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
17
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
18 def add(self, instance2):
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
19 self.instance = self.instance.multiply(self.nInstances)+instance2
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
20 self.nInstances += 1
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
21 self.instance = self.instance.multiply(1/float(self.nInstances))
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
22
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
23 def average(c):
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
24 inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
25 inst.multiply(1/(self.nInstances+instance.nInstances))
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
26 return Centroid(inst, self.nInstances+instance.nInstances)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
27
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
28 def draw(self, options = ''):
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
29 from matplotlib.pylab import text
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
30 self.instance.draw(options)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
31 text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances))
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
32
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
33
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
34 def clustering(data, similar, initialCentroids = []):
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
35 '''k-means algorithm with similarity function
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
36 Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function.
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
37 The number of clusters will be determined accordingly
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
38
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
39 data: list of instances
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
40 averageCentroid: '''
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
41
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
42 from random import shuffle
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
43 from copy import copy, deepcopy
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
44 localdata = copy(data) # shallow copy to avoid modifying data
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
45 shuffle(localdata)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
46 if initialCentroids:
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
47 centroids = deepcopy(initialCentroids)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
48 else:
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
49 centroids = [Centroid(localdata[0])]
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
50 for instance in localdata[1:]:
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
51 i = 0
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
52 while i<len(centroids) and not similar(centroids[i].instance, instance):
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
53 i += 1
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
54 if i == len(centroids):
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
55 centroids.append(Centroid(instance))
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
56 else:
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
57 centroids[i].add(instance)
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
58
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
59 return centroids
285
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
60
293
ee3302528cdc rearranged new code by Paul (works now)
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 285
diff changeset
61 def spectralClustering(similarityMatrix, k, iter=20):
285
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
62 '''Spectral Clustering algorithm'''
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
63 n = len(similarityMatrix)
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
64 # create Laplacian matrix
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
65 rowsum = np.sum(similarityMatrix,axis=0)
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
66 D = np.diag(1 / np.sqrt(rowsum))
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
67 I = np.identity(n)
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
68 L = I - np.dot(D,np.dot(similarityMatrix,D))
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
69 # compute eigenvectors of L
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
70 U,sigma,V = np.linalg.svd(L)
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
71 # create feature vector from k first eigenvectors
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
72 # by stacking eigenvectors as columns
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
73 features = np.array(V[:k]).T
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
74 # k-means
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
75 from scipy.cluster.vq import kmeans, whiten, vq
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
76 features = whiten(features)
293
ee3302528cdc rearranged new code by Paul (works now)
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 285
diff changeset
77 centroids,distortion = kmeans(features,k, iter)
285
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
78 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster)
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
79 return code,sigma