Mercurial Hosting > traffic-intelligence
comparison trafficintelligence/ml.py @ 1028:cc5cb04b04b0
major update using the trafficintelligence package name and install through pip
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Fri, 15 Jun 2018 11:19:10 -0400 |
parents | python/ml.py@6ba30b259525 |
children | 8ffb3ae9f3d2 |
comparison
equal
deleted
inserted
replaced
1027:6129296848d3 | 1028:cc5cb04b04b0 |
---|---|
1 #! /usr/bin/env python | |
2 '''Libraries for machine learning algorithms''' | |
3 | |
4 from os import path | |
5 from random import shuffle | |
6 from copy import copy, deepcopy | |
7 | |
8 import numpy as np | |
9 from matplotlib.pylab import text | |
10 import matplotlib as mpl | |
11 import matplotlib.pyplot as plt | |
12 from scipy.cluster.vq import kmeans, whiten, vq | |
13 from sklearn import mixture | |
14 try: | |
15 import cv2 | |
16 opencvAvailable = True | |
17 except ImportError: | |
18 print('OpenCV library could not be loaded (video replay functions will not be available)') # TODO change to logging module | |
19 opencvAvailable = False | |
20 | |
21 from trafficintelligence import utils | |
22 | |
23 ##################### | |
24 # OpenCV ML models | |
25 ##################### | |
26 | |
27 def computeConfusionMatrix(model, samples, responses): | |
28 '''computes the confusion matrix of the classifier (model) | |
29 | |
30 samples should be n samples by m variables''' | |
31 classifications = {} | |
32 predictions = model.predict(samples) | |
33 for predicted, y in zip(predictions, responses): | |
34 classifications[(y, predicted)] = classifications.get((y, predicted), 0)+1 | |
35 return classifications | |
36 | |
37 if opencvAvailable: | |
38 class SVM(object): | |
39 '''wrapper for OpenCV SimpleVectorMachine algorithm''' | |
40 def __init__(self, svmType = cv2.ml.SVM_C_SVC, kernelType = cv2.ml.SVM_RBF, degree = 0, gamma = 1, coef0 = 0, Cvalue = 1, nu = 0, p = 0): | |
41 self.model = cv2.ml.SVM_create() | |
42 self.model.setType(svmType) | |
43 self.model.setKernel(kernelType) | |
44 self.model.setDegree(degree) | |
45 self.model.setGamma(gamma) | |
46 self.model.setCoef0(coef0) | |
47 self.model.setC(Cvalue) | |
48 self.model.setNu(nu) | |
49 self.model.setP(p) | |
50 | |
51 def save(self, filename): | |
52 self.model.save(filename) | |
53 | |
54 def train(self, samples, layout, responses, computePerformance = False): | |
55 self.model.train(samples, layout, responses) | |
56 if computePerformance: | |
57 return computeConfusionMatrix(self, samples, responses) | |
58 | |
59 def predict(self, hog): | |
60 retval, predictions = self.model.predict(hog) | |
61 if hog.shape[0] == 1: | |
62 return predictions[0][0] | |
63 else: | |
64 return np.asarray(predictions, dtype = np.int).ravel().tolist() | |
65 | |
66 def SVM_load(filename): | |
67 if path.exists(filename): | |
68 svm = SVM() | |
69 svm.model = cv2.ml.SVM_load(filename) | |
70 return svm | |
71 else: | |
72 print('Provided filename {} does not exist: model not loaded!'.format(filename)) | |
73 | |
74 ##################### | |
75 # Clustering | |
76 ##################### | |
77 | |
78 class Centroid(object): | |
79 'Wrapper around instances to add a counter' | |
80 | |
81 def __init__(self, instance, nInstances = 1): | |
82 self.instance = instance | |
83 self.nInstances = nInstances | |
84 | |
85 # def similar(instance2): | |
86 # return self.instance.similar(instance2) | |
87 | |
88 def add(self, instance2): | |
89 self.instance = self.instance.multiply(self.nInstances)+instance2 | |
90 self.nInstances += 1 | |
91 self.instance = self.instance.multiply(1/float(self.nInstances)) | |
92 | |
93 def average(c): | |
94 inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances) | |
95 inst.multiply(1/(self.nInstances+instance.nInstances)) | |
96 return Centroid(inst, self.nInstances+instance.nInstances) | |
97 | |
98 def plot(self, options = ''): | |
99 self.instance.plot(options) | |
100 text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances)) | |
101 | |
102 def kMedoids(similarityMatrix, initialCentroids = None, k = None): | |
103 '''Algorithm that clusters any dataset based on a similarity matrix | |
104 Either the initialCentroids or k are passed''' | |
105 pass | |
106 | |
107 def assignCluster(data, similarFunc, initialCentroids = None, shuffleData = True): | |
108 '''k-means algorithm with similarity function | |
109 Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function. | |
110 The number of clusters will be determined accordingly | |
111 | |
112 data: list of instances | |
113 averageCentroid: ''' | |
114 localdata = copy(data) # shallow copy to avoid modifying data | |
115 if shuffleData: | |
116 shuffle(localdata) | |
117 if initialCentroids is None: | |
118 centroids = [Centroid(localdata[0])] | |
119 else: | |
120 centroids = deepcopy(initialCentroids) | |
121 for instance in localdata[1:]: | |
122 i = 0 | |
123 while i<len(centroids) and not similarFunc(centroids[i].instance, instance): | |
124 i += 1 | |
125 if i == len(centroids): | |
126 centroids.append(Centroid(instance)) | |
127 else: | |
128 centroids[i].add(instance) | |
129 | |
130 return centroids | |
131 | |
132 # TODO recompute centroids for each cluster: instance that minimizes some measure to all other elements | |
133 | |
134 def spectralClustering(similarityMatrix, k, iter=20): | |
135 '''Spectral Clustering algorithm''' | |
136 n = len(similarityMatrix) | |
137 # create Laplacian matrix | |
138 rowsum = np.sum(similarityMatrix,axis=0) | |
139 D = np.diag(1 / np.sqrt(rowsum)) | |
140 I = np.identity(n) | |
141 L = I - np.dot(D,np.dot(similarityMatrix,D)) | |
142 # compute eigenvectors of L | |
143 U,sigma,V = np.linalg.svd(L) | |
144 # create feature vector from k first eigenvectors | |
145 # by stacking eigenvectors as columns | |
146 features = np.array(V[:k]).T | |
147 # k-means | |
148 features = whiten(features) | |
149 centroids,distortion = kmeans(features,k, iter) | |
150 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) | |
151 return code,sigma | |
152 | |
153 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0): | |
154 '''Assigns instances to prototypes | |
155 if minClusterSize is not 0, the clusters will be refined by removing iteratively the smallest clusters | |
156 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize | |
157 | |
158 labels are indices in the prototypeIndices''' | |
159 if similarityFunc is None: | |
160 print('similarityFunc is None') | |
161 return None | |
162 | |
163 indices = [i for i in range(len(instances)) if i not in prototypeIndices] | |
164 labels = [-1]*len(instances) | |
165 assign = True | |
166 while assign: | |
167 for i in prototypeIndices: | |
168 labels[i] = i | |
169 for i in indices: | |
170 for j in prototypeIndices: | |
171 if similarities[i][j] < 0: | |
172 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
173 similarities[j][i] = similarities[i][j] | |
174 label = similarities[i][prototypeIndices].argmax() | |
175 if similarities[i][prototypeIndices[label]] >= minSimilarity: | |
176 labels[i] = prototypeIndices[label] | |
177 else: | |
178 labels[i] = -1 # outlier | |
179 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | |
180 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) | |
181 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) | |
182 if assign: | |
183 prototypeIndices.remove(smallestClusterIndex) | |
184 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] | |
185 return prototypeIndices, labels | |
186 | |
187 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = False, randomInitialization = False, initialPrototypeIndices = None): | |
188 '''Finds exemplar (prototype) instance that represent each cluster | |
189 Returns the prototype indices (in the instances list) | |
190 | |
191 the elements in the instances list must have a length (method __len__), or one can use the optimizeCentroid | |
192 the positions in the instances list corresponds to the similarities | |
193 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) | |
194 similarities must still be allocated with the right size | |
195 | |
196 if an instance is different enough (<minSimilarity), | |
197 it will become a new prototype. | |
198 Non-prototype instances will be assigned to an existing prototype | |
199 | |
200 if optimizeCentroid is True, each time an element is added, we recompute the centroid trajectory as the most similar to all in the cluster | |
201 | |
202 initialPrototypeIndices are indices in instances | |
203 | |
204 TODO: check how similarity evolves in clusters''' | |
205 if len(instances) == 0: | |
206 print('no instances to cluster (empty list)') | |
207 return None | |
208 if similarityFunc is None: | |
209 print('similarityFunc is None') | |
210 return None | |
211 | |
212 # sort instances based on length | |
213 indices = range(len(instances)) | |
214 if randomInitialization or optimizeCentroid: | |
215 indices = np.random.permutation(indices).tolist() | |
216 else: | |
217 def compare(i, j): | |
218 if len(instances[i]) > len(instances[j]): | |
219 return -1 | |
220 elif len(instances[i]) == len(instances[j]): | |
221 return 0 | |
222 else: | |
223 return 1 | |
224 indices.sort(compare) | |
225 # initialize clusters | |
226 clusters = [] | |
227 if initialPrototypeIndices is None: | |
228 prototypeIndices = [indices[0]] | |
229 else: | |
230 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances | |
231 for i in prototypeIndices: | |
232 clusters.append([i]) | |
233 indices.remove(i) | |
234 # go through all instances | |
235 for i in indices: | |
236 for j in prototypeIndices: | |
237 if similarities[i][j] < 0: | |
238 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
239 similarities[j][i] = similarities[i][j] | |
240 label = similarities[i][prototypeIndices].argmax() # index in prototypeIndices | |
241 if similarities[i][prototypeIndices[label]] < minSimilarity: | |
242 prototypeIndices.append(i) | |
243 clusters.append([]) | |
244 else: | |
245 clusters[label].append(i) | |
246 if optimizeCentroid: | |
247 if len(clusters[label]) >= 2: # no point if only one element in cluster | |
248 for j in clusters[label][:-1]: | |
249 if similarities[i][j] < 0: | |
250 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
251 similarities[j][i] = similarities[i][j] | |
252 clusterIndices = clusters[label] | |
253 clusterSimilarities = similarities[clusterIndices][:,clusterIndices] | |
254 newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()] | |
255 if prototypeIndices[label] != newCentroidIdx: | |
256 prototypeIndices[label] = newCentroidIdx | |
257 elif len(instances[prototypeIndices[label]]) < len(instances[i]): # replace prototype by current instance i if longer # otherwise, possible to test if randomInitialization or initialPrototypes is not None | |
258 prototypeIndices[label] = i | |
259 return prototypeIndices | |
260 | |
261 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1): | |
262 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | |
263 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex) | |
264 return clusterSizes | |
265 | |
266 def computeClusterStatistics(labels, prototypeIndices, instances, similarities, similarityFunc, clusters = None, outlierIndex = -1): | |
267 if clusters is None: | |
268 clusters = {protoId:[] for protoId in prototypeIndices+[-1]} | |
269 for i,l in enumerate(labels): | |
270 clusters[l].append(i) | |
271 clusters = [clusters[protoId] for protoId in prototypeIndices] | |
272 for i, cluster in enumerate(clusters): | |
273 n = len(cluster) | |
274 print('cluster {}: {} elements'.format(prototypeIndices[i], n)) | |
275 if n >=2: | |
276 for j,k in enumerate(cluster): | |
277 for l in cluster[:j]: | |
278 if similarities[k][l] < 0: | |
279 similarities[k][l] = similarityFunc(instances[k], instances[l]) | |
280 similarities[l][k] = similarities[k][l] | |
281 print('Mean similarity to prototype: {}'.format((similarities[prototypeIndices[i]][cluster].sum()+1)/(n-1))) | |
282 print('Mean overall similarity: {}'.format((similarities[cluster][:,cluster].sum()+n)/(n*(n-1)))) | |
283 | |
284 # Gaussian Mixture Models | |
285 def plotGMM(mean, covariance, gmmId, fig, color, alpha = 0.3): | |
286 v, w = np.linalg.eigh(covariance) | |
287 angle = 180*np.arctan2(w[0][1], w[0][0])/np.pi | |
288 v *= 4 | |
289 ell = mpl.patches.Ellipse(mean, v[0], v[1], 180+angle, color=color) | |
290 ell.set_clip_box(fig.bbox) | |
291 ell.set_alpha(alpha) | |
292 fig.axes[0].add_artist(ell) | |
293 plt.plot([mean[0]], [mean[1]], 'x'+color) | |
294 plt.annotate(str(gmmId), xy=(mean[0]+1, mean[1]+1)) | |
295 | |
296 def plotGMMClusters(model, labels = None, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3): | |
297 '''plot the ellipse corresponding to the Gaussians | |
298 and the predicted classes of the instances in the dataset''' | |
299 if fig is None: | |
300 fig = plt.figure() | |
301 if len(fig.get_axes()) == 0: | |
302 fig.add_subplot(111) | |
303 for i in range(model.n_components): | |
304 mean = model.means_[i]/nUnitsPerPixel | |
305 covariance = model.covariances_[i]/nUnitsPerPixel | |
306 # plot points | |
307 if dataset is not None: | |
308 tmpDataset = dataset/nUnitsPerPixel | |
309 plt.scatter(tmpDataset[labels == i, 0], tmpDataset[labels == i, 1], .8, color=colors[i]) | |
310 # plot an ellipse to show the Gaussian component | |
311 plotGMM(mean, covariance, i, fig, colors[i], alpha) | |
312 if dataset is None: # to address issues without points, the axes limits are not redrawn | |
313 minima = model.means_.min(0) | |
314 maxima = model.means_.max(0) | |
315 xwidth = 0.5*(maxima[0]-minima[0]) | |
316 ywidth = 0.5*(maxima[1]-minima[1]) | |
317 plt.xlim(minima[0]-xwidth,maxima[0]+xwidth) | |
318 plt.ylim(minima[1]-ywidth,maxima[1]+ywidth) | |
319 | |
320 if __name__ == "__main__": | |
321 import doctest | |
322 import unittest | |
323 suite = doctest.DocFileSuite('tests/ml.txt') | |
324 unittest.TextTestRunner().run(suite) | |
325 # #doctest.testmod() | |
326 # #doctest.testfile("example.txt") |