annotate python/ml.py @ 786:1f2b2d1f4fbf dev

added script and code to learn POIs
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Fri, 11 Mar 2016 17:38:48 -0500
parents 2472b4d59aea
children 0a428b449b80
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
1 #! /usr/bin/env python
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
2 '''Libraries for machine learning algorithms'''
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
3
786
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
4 from os import path
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
5 from random import shuffle
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
6 from copy import copy, deepcopy
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
7
308
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
8 import numpy as np
786
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
9 from matplotlib.pylab import text
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
10 import matplotlib as mpl
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
11 import matplotlib.pyplot as plt
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
12 from scipy.cluster.vq import kmeans, whiten, vq
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
13 from sklearn import mixture
308
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
14
786
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
15 import utils
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
16
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
17 #####################
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
18 # OpenCV ML models
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
19 #####################
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
20
380
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
21 class Model(object):
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
22 '''Abstract class for loading/saving model'''
680
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
23 def load(self, filename):
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
24 if path.exists(filename):
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
25 self.model.load(filename)
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
26 else:
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
27 print('Provided filename {} does not exist: model not loaded!'.format(filename))
380
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
28
680
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
29 def save(self, filename):
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
30 self.model.save(filename)
380
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
31
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
32 class SVM(Model):
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
33 '''wrapper for OpenCV SimpleVectorMachine algorithm'''
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
34
680
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
35 def __init__(self):
380
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
36 import cv2
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
37 self.model = cv2.SVM()
680
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
38
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
39 def train(self, samples, responses, svm_type, kernel_type, degree = 0, gamma = 1, coef0 = 0, Cvalue = 1, nu = 0, p = 0):
380
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
40 self.params = dict(svm_type = svm_type, kernel_type = kernel_type, degree = degree, gamma = gamma, coef0 = coef0, Cvalue = Cvalue, nu = nu, p = p)
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
41 self.model.train(samples, responses, params = self.params)
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
42
680
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
43 def predict(self, hog):
da1352b89d02 classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 665
diff changeset
44 return self.model.predict(hog)
380
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
45
adfd4f70ee1d added SVM
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 312
diff changeset
46
786
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
47 #####################
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
48 # Clustering
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
49 #####################
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
50
665
15e244d2a1b5 corrected bug with circular import for VideoFilenameAddable, moved to base module
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 636
diff changeset
51 class Centroid(object):
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
52 'Wrapper around instances to add a counter'
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
53
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
54 def __init__(self, instance, nInstances = 1):
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
55 self.instance = instance
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
56 self.nInstances = nInstances
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
57
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
58 # def similar(instance2):
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
59 # return self.instance.similar(instance2)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
60
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
61 def add(self, instance2):
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
62 self.instance = self.instance.multiply(self.nInstances)+instance2
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
63 self.nInstances += 1
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
64 self.instance = self.instance.multiply(1/float(self.nInstances))
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
65
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
66 def average(c):
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
67 inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
68 inst.multiply(1/(self.nInstances+instance.nInstances))
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
69 return Centroid(inst, self.nInstances+instance.nInstances)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
70
515
727e3c529519 renamed all draw functions to plot for consistency
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 501
diff changeset
71 def plot(self, options = ''):
727e3c529519 renamed all draw functions to plot for consistency
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 501
diff changeset
72 self.instance.plot(options)
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
73 text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances))
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
74
386
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 382
diff changeset
75 def kMedoids(similarityMatrix, initialCentroids = None, k = None):
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 382
diff changeset
76 '''Algorithm that clusters any dataset based on a similarity matrix
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 382
diff changeset
77 Either the initialCentroids or k are passed'''
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 382
diff changeset
78 pass
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
79
526
21bdeb29f855 corrected bug in initialization of lists and loading trajectories from vissim files
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 515
diff changeset
80 def assignCluster(data, similarFunc, initialCentroids = None, shuffleData = True):
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
81 '''k-means algorithm with similarity function
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
82 Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function.
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
83 The number of clusters will be determined accordingly
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
84
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
85 data: list of instances
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
86 averageCentroid: '''
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
87 localdata = copy(data) # shallow copy to avoid modifying data
382
ba813f148ade development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 380
diff changeset
88 if shuffleData:
ba813f148ade development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 380
diff changeset
89 shuffle(localdata)
636
3058e00887bc removed all issues because of tests with None, using is instead of == or !=
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 563
diff changeset
90 if initialCentroids is None:
526
21bdeb29f855 corrected bug in initialization of lists and loading trajectories from vissim files
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 515
diff changeset
91 centroids = [Centroid(localdata[0])]
21bdeb29f855 corrected bug in initialization of lists and loading trajectories from vissim files
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 515
diff changeset
92 else:
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
93 centroids = deepcopy(initialCentroids)
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
94 for instance in localdata[1:]:
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
95 i = 0
382
ba813f148ade development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 380
diff changeset
96 while i<len(centroids) and not similarFunc(centroids[i].instance, instance):
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
97 i += 1
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
98 if i == len(centroids):
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
99 centroids.append(Centroid(instance))
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
100 else:
184
d70e9b36889c initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 183
diff changeset
101 centroids[i].add(instance)
183
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
102
ed944ff45e8c first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff changeset
103 return centroids
308
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
104
382
ba813f148ade development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 380
diff changeset
105 # TODO recompute centroids for each cluster: instance that minimizes some measure to all other elements
ba813f148ade development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 380
diff changeset
106
293
ee3302528cdc rearranged new code by Paul (works now)
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 285
diff changeset
107 def spectralClustering(similarityMatrix, k, iter=20):
285
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
108 '''Spectral Clustering algorithm'''
5957aa1d69e1 Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 184
diff changeset
109 n = len(similarityMatrix)
308
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
110 # create Laplacian matrix
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
111 rowsum = np.sum(similarityMatrix,axis=0)
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
112 D = np.diag(1 / np.sqrt(rowsum))
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
113 I = np.identity(n)
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
114 L = I - np.dot(D,np.dot(similarityMatrix,D))
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
115 # compute eigenvectors of L
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
116 U,sigma,V = np.linalg.svd(L)
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
117 # create feature vector from k first eigenvectors
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
118 # by stacking eigenvectors as columns
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
119 features = np.array(V[:k]).T
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
120 # k-means
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
121 features = whiten(features)
293
ee3302528cdc rearranged new code by Paul (works now)
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 285
diff changeset
122 centroids,distortion = kmeans(features,k, iter)
308
8bafd054cda4 Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents: 184
diff changeset
123 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster)
309
80cbafd69109 Added spectral clustering function
Mohamed Gomaa
parents: 308
diff changeset
124 return code,sigma
563
39de5c532559 place holder functions
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 526
diff changeset
125
735
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
126 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = None, randomInitialization = False):
731
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
127 '''Finds exemplar (prototype) instance that represent each cluster
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
128 Returns the prototype indices (in the instances list) and the cluster label of each instance
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
129
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
130 the elements in the instances list must have a length (method __len__), or one can use the random initialization
735
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
131 the positions in the instances list corresponds to the similarities
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
132 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed)
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
133 similarities must still be allocated with the right size
731
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
134
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
135 if an instance is different enough (<minSimilarity),
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
136 it will become a new prototype.
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
137 Non-prototype instances will be assigned to an existing prototype
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
138 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
139 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize'''
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
140
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
141 # sort instances based on length
735
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
142 indices = range(len(instances))
731
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
143 if randomInitialization:
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
144 indices = np.random.permutation(indices)
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
145 else:
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
146 def compare(i, j):
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
147 if len(instances[i]) > len(instances[j]):
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
148 return -1
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
149 elif len(instances[i]) == len(instances[j]):
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
150 return 0
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
151 else:
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
152 return 1
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
153 indices.sort(compare)
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
154 # go through all instances
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
155 prototypeIndices = [indices[0]]
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
156 for i in indices[1:]:
735
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
157 if similarityFunc is not None:
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
158 for j in prototypeIndices:
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
159 if similarities[i][j] < 0:
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
160 similarities[i][j] = similarityFunc(instances[i], instances[j])
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
161 similarities[j][i] = similarities[i][j]
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
162 if similarities[i][prototypeIndices].max() < minSimilarity:
731
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
163 prototypeIndices.append(i)
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
164
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
165 # assignment
735
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
166 indices = [i for i in range(similarities.shape[0]) if i not in prototypeIndices]
731
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
167 assign = True
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
168 while assign:
735
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
169 labels = [-1]*similarities.shape[0]
731
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
170 for i in prototypeIndices:
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
171 labels[i] = i
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
172 for i in indices:
735
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
173 if similarityFunc is not None:
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
174 for j in prototypeIndices:
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
175 if similarities[i][j] < 0:
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
176 similarities[i][j] = similarityFunc(instances[i], instances[j])
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
177 similarities[j][i] = similarities[i][j]
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
178 prototypeIdx = similarities[i][prototypeIndices].argmax()
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
179 if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity:
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
180 labels[i] = prototypeIndices[prototypeIdx]
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
181 else:
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
182 labels[i] = -1 # outlier
731
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
183 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
735
0e875a7f5759 modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 734
diff changeset
184 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get)
731
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
185 assign = (clusterSizes[smallestClusterIndex] < minClusterSize)
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
186 if assign:
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
187 prototypeIndices.remove(smallestClusterIndex)
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
188 indices.append(smallestClusterIndex)
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
189
b02431a8234c made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 680
diff changeset
190 return prototypeIndices, labels
738
2472b4d59aea small function
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 735
diff changeset
191
2472b4d59aea small function
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 735
diff changeset
192 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1):
2472b4d59aea small function
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 735
diff changeset
193 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
786
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
194 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex)
738
2472b4d59aea small function
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 735
diff changeset
195 return clusterSizes
786
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
196
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
197 # Gaussian Mixture Models
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
198 def plotGMMClusters(model, dataset = None, colors = utils.colors):
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
199 '''plot the ellipse corresponding to the Gaussians
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
200 and the predicted classes of the instances in the dataset'''
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
201 fig = plt.figure()
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
202 labels = model.predict(dataset)
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
203 for i in xrange(model.n_components):
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
204 mean = model.means_[i]
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
205 if dataset is not None:
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
206 plt.scatter(dataset[labels == i, 0], dataset[labels == i, 1], .8, color=colors[i])
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
207 plt.annotate(str(i), xy=(mean[0]+1, mean[1]+1))
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
208
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
209 # Plot an ellipse to show the Gaussian component
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
210 v, w = np.linalg.eigh(model.covars_[i])
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
211 angle = np.arctan2(w[0][1], w[0][0])
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
212 angle = 180*angle/np.pi # convert to degrees
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
213 v *= 4
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
214 ell = mpl.patches.Ellipse(mean, v[0], v[1], 180+angle, color=colors[i])
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
215 ell.set_clip_box(fig.bbox)
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
216 ell.set_alpha(.5)
1f2b2d1f4fbf added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents: 738
diff changeset
217 fig.axes[0].add_artist(ell)