Mercurial Hosting > traffic-intelligence
annotate python/ml.py @ 786:1f2b2d1f4fbf dev
added script and code to learn POIs
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Fri, 11 Mar 2016 17:38:48 -0500 |
parents | 2472b4d59aea |
children | 0a428b449b80 |
rev | line source |
---|---|
183
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
1 #! /usr/bin/env python |
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
2 '''Libraries for machine learning algorithms''' |
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
3 |
786
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
4 from os import path |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
5 from random import shuffle |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
6 from copy import copy, deepcopy |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
7 |
308
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
8 import numpy as np |
786
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
9 from matplotlib.pylab import text |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
10 import matplotlib as mpl |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
11 import matplotlib.pyplot as plt |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
12 from scipy.cluster.vq import kmeans, whiten, vq |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
13 from sklearn import mixture |
308
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
14 |
786
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
15 import utils |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
16 |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
17 ##################### |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
18 # OpenCV ML models |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
19 ##################### |
183
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
20 |
380 | 21 class Model(object): |
22 '''Abstract class for loading/saving model''' | |
680
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
23 def load(self, filename): |
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
24 if path.exists(filename): |
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
25 self.model.load(filename) |
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
26 else: |
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
27 print('Provided filename {} does not exist: model not loaded!'.format(filename)) |
380 | 28 |
680
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
29 def save(self, filename): |
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
30 self.model.save(filename) |
380 | 31 |
32 class SVM(Model): | |
33 '''wrapper for OpenCV SimpleVectorMachine algorithm''' | |
34 | |
680
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
35 def __init__(self): |
380 | 36 import cv2 |
37 self.model = cv2.SVM() | |
680
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
38 |
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
39 def train(self, samples, responses, svm_type, kernel_type, degree = 0, gamma = 1, coef0 = 0, Cvalue = 1, nu = 0, p = 0): |
380 | 40 self.params = dict(svm_type = svm_type, kernel_type = kernel_type, degree = degree, gamma = gamma, coef0 = coef0, Cvalue = Cvalue, nu = nu, p = p) |
41 self.model.train(samples, responses, params = self.params) | |
42 | |
680
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
43 def predict(self, hog): |
da1352b89d02
classification is working
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
665
diff
changeset
|
44 return self.model.predict(hog) |
380 | 45 |
46 | |
786
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
47 ##################### |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
48 # Clustering |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
49 ##################### |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
50 |
665
15e244d2a1b5
corrected bug with circular import for VideoFilenameAddable, moved to base module
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
636
diff
changeset
|
51 class Centroid(object): |
184
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
52 'Wrapper around instances to add a counter' |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
53 |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
54 def __init__(self, instance, nInstances = 1): |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
55 self.instance = instance |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
56 self.nInstances = nInstances |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
57 |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
58 # def similar(instance2): |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
59 # return self.instance.similar(instance2) |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
60 |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
61 def add(self, instance2): |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
62 self.instance = self.instance.multiply(self.nInstances)+instance2 |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
63 self.nInstances += 1 |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
64 self.instance = self.instance.multiply(1/float(self.nInstances)) |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
65 |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
66 def average(c): |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
67 inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances) |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
68 inst.multiply(1/(self.nInstances+instance.nInstances)) |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
69 return Centroid(inst, self.nInstances+instance.nInstances) |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
70 |
515
727e3c529519
renamed all draw functions to plot for consistency
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
501
diff
changeset
|
71 def plot(self, options = ''): |
727e3c529519
renamed all draw functions to plot for consistency
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
501
diff
changeset
|
72 self.instance.plot(options) |
184
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
73 text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances)) |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
74 |
386 | 75 def kMedoids(similarityMatrix, initialCentroids = None, k = None): |
76 '''Algorithm that clusters any dataset based on a similarity matrix | |
77 Either the initialCentroids or k are passed''' | |
78 pass | |
184
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
79 |
526
21bdeb29f855
corrected bug in initialization of lists and loading trajectories from vissim files
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
515
diff
changeset
|
80 def assignCluster(data, similarFunc, initialCentroids = None, shuffleData = True): |
183
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
81 '''k-means algorithm with similarity function |
184
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
82 Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function. |
183
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
83 The number of clusters will be determined accordingly |
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
84 |
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
85 data: list of instances |
184
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
86 averageCentroid: ''' |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
87 localdata = copy(data) # shallow copy to avoid modifying data |
382
ba813f148ade
development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
380
diff
changeset
|
88 if shuffleData: |
ba813f148ade
development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
380
diff
changeset
|
89 shuffle(localdata) |
636
3058e00887bc
removed all issues because of tests with None, using is instead of == or !=
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
563
diff
changeset
|
90 if initialCentroids is None: |
526
21bdeb29f855
corrected bug in initialization of lists and loading trajectories from vissim files
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
515
diff
changeset
|
91 centroids = [Centroid(localdata[0])] |
21bdeb29f855
corrected bug in initialization of lists and loading trajectories from vissim files
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
515
diff
changeset
|
92 else: |
184
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
93 centroids = deepcopy(initialCentroids) |
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
94 for instance in localdata[1:]: |
183
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
95 i = 0 |
382
ba813f148ade
development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
380
diff
changeset
|
96 while i<len(centroids) and not similarFunc(centroids[i].instance, instance): |
183
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
97 i += 1 |
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
98 if i == len(centroids): |
184
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
99 centroids.append(Centroid(instance)) |
183
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
100 else: |
184
d70e9b36889c
initial work on flow vectors and clustering algorithms
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
183
diff
changeset
|
101 centroids[i].add(instance) |
183
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
102 |
ed944ff45e8c
first simple clustering algorithm implementation
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
diff
changeset
|
103 return centroids |
308
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
104 |
382
ba813f148ade
development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
380
diff
changeset
|
105 # TODO recompute centroids for each cluster: instance that minimizes some measure to all other elements |
ba813f148ade
development for clustering
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
380
diff
changeset
|
106 |
293
ee3302528cdc
rearranged new code by Paul (works now)
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
285
diff
changeset
|
107 def spectralClustering(similarityMatrix, k, iter=20): |
285
5957aa1d69e1
Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
184
diff
changeset
|
108 '''Spectral Clustering algorithm''' |
5957aa1d69e1
Integrating Mohamed's changes
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
184
diff
changeset
|
109 n = len(similarityMatrix) |
308
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
110 # create Laplacian matrix |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
111 rowsum = np.sum(similarityMatrix,axis=0) |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
112 D = np.diag(1 / np.sqrt(rowsum)) |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
113 I = np.identity(n) |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
114 L = I - np.dot(D,np.dot(similarityMatrix,D)) |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
115 # compute eigenvectors of L |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
116 U,sigma,V = np.linalg.svd(L) |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
117 # create feature vector from k first eigenvectors |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
118 # by stacking eigenvectors as columns |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
119 features = np.array(V[:k]).T |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
120 # k-means |
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
121 features = whiten(features) |
293
ee3302528cdc
rearranged new code by Paul (works now)
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
285
diff
changeset
|
122 centroids,distortion = kmeans(features,k, iter) |
308
8bafd054cda4
Added a function to compute LCSS distance between two indcators
Mohamed Gomaa
parents:
184
diff
changeset
|
123 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) |
309 | 124 return code,sigma |
563
39de5c532559
place holder functions
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
526
diff
changeset
|
125 |
735
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
126 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = None, randomInitialization = False): |
731
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
127 '''Finds exemplar (prototype) instance that represent each cluster |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
128 Returns the prototype indices (in the instances list) and the cluster label of each instance |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
129 |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
130 the elements in the instances list must have a length (method __len__), or one can use the random initialization |
735
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
131 the positions in the instances list corresponds to the similarities |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
132 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
133 similarities must still be allocated with the right size |
731
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
134 |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
135 if an instance is different enough (<minSimilarity), |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
136 it will become a new prototype. |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
137 Non-prototype instances will be assigned to an existing prototype |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
138 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
139 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
140 |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
141 # sort instances based on length |
735
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
142 indices = range(len(instances)) |
731
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
143 if randomInitialization: |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
144 indices = np.random.permutation(indices) |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
145 else: |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
146 def compare(i, j): |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
147 if len(instances[i]) > len(instances[j]): |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
148 return -1 |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
149 elif len(instances[i]) == len(instances[j]): |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
150 return 0 |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
151 else: |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
152 return 1 |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
153 indices.sort(compare) |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
154 # go through all instances |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
155 prototypeIndices = [indices[0]] |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
156 for i in indices[1:]: |
735
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
157 if similarityFunc is not None: |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
158 for j in prototypeIndices: |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
159 if similarities[i][j] < 0: |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
160 similarities[i][j] = similarityFunc(instances[i], instances[j]) |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
161 similarities[j][i] = similarities[i][j] |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
162 if similarities[i][prototypeIndices].max() < minSimilarity: |
731
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
163 prototypeIndices.append(i) |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
164 |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
165 # assignment |
735
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
166 indices = [i for i in range(similarities.shape[0]) if i not in prototypeIndices] |
731
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
167 assign = True |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
168 while assign: |
735
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
169 labels = [-1]*similarities.shape[0] |
731
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
170 for i in prototypeIndices: |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
171 labels[i] = i |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
172 for i in indices: |
735
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
173 if similarityFunc is not None: |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
174 for j in prototypeIndices: |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
175 if similarities[i][j] < 0: |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
176 similarities[i][j] = similarityFunc(instances[i], instances[j]) |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
177 similarities[j][i] = similarities[i][j] |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
178 prototypeIdx = similarities[i][prototypeIndices].argmax() |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
179 if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity: |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
180 labels[i] = prototypeIndices[prototypeIdx] |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
181 else: |
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
182 labels[i] = -1 # outlier |
731
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
183 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} |
735
0e875a7f5759
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
734
diff
changeset
|
184 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) |
731
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
185 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
186 if assign: |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
187 prototypeIndices.remove(smallestClusterIndex) |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
188 indices.append(smallestClusterIndex) |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
189 |
b02431a8234c
made prototypecluster generic, in ml module, and added randominitialization
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
680
diff
changeset
|
190 return prototypeIndices, labels |
738
2472b4d59aea
small function
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
735
diff
changeset
|
191 |
2472b4d59aea
small function
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
735
diff
changeset
|
192 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1): |
2472b4d59aea
small function
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
735
diff
changeset
|
193 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} |
786
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
194 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex) |
738
2472b4d59aea
small function
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
735
diff
changeset
|
195 return clusterSizes |
786
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
196 |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
197 # Gaussian Mixture Models |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
198 def plotGMMClusters(model, dataset = None, colors = utils.colors): |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
199 '''plot the ellipse corresponding to the Gaussians |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
200 and the predicted classes of the instances in the dataset''' |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
201 fig = plt.figure() |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
202 labels = model.predict(dataset) |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
203 for i in xrange(model.n_components): |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
204 mean = model.means_[i] |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
205 if dataset is not None: |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
206 plt.scatter(dataset[labels == i, 0], dataset[labels == i, 1], .8, color=colors[i]) |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
207 plt.annotate(str(i), xy=(mean[0]+1, mean[1]+1)) |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
208 |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
209 # Plot an ellipse to show the Gaussian component |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
210 v, w = np.linalg.eigh(model.covars_[i]) |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
211 angle = np.arctan2(w[0][1], w[0][0]) |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
212 angle = 180*angle/np.pi # convert to degrees |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
213 v *= 4 |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
214 ell = mpl.patches.Ellipse(mean, v[0], v[1], 180+angle, color=colors[i]) |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
215 ell.set_clip_box(fig.bbox) |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
216 ell.set_alpha(.5) |
1f2b2d1f4fbf
added script and code to learn POIs
Nicolas Saunier <nicolas.saunier@polymtl.ca>
parents:
738
diff
changeset
|
217 fig.axes[0].add_artist(ell) |