comparison trafficintelligence/ml.py @ 1028:cc5cb04b04b0

major update using the trafficintelligence package name and install through pip
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Fri, 15 Jun 2018 11:19:10 -0400
parents python/ml.py@6ba30b259525
children 8ffb3ae9f3d2
comparison
equal deleted inserted replaced
1027:6129296848d3 1028:cc5cb04b04b0
1 #! /usr/bin/env python
2 '''Libraries for machine learning algorithms'''
3
4 from os import path
5 from random import shuffle
6 from copy import copy, deepcopy
7
8 import numpy as np
9 from matplotlib.pylab import text
10 import matplotlib as mpl
11 import matplotlib.pyplot as plt
12 from scipy.cluster.vq import kmeans, whiten, vq
13 from sklearn import mixture
14 try:
15 import cv2
16 opencvAvailable = True
17 except ImportError:
18 print('OpenCV library could not be loaded (video replay functions will not be available)') # TODO change to logging module
19 opencvAvailable = False
20
21 from trafficintelligence import utils
22
23 #####################
24 # OpenCV ML models
25 #####################
26
27 def computeConfusionMatrix(model, samples, responses):
28 '''computes the confusion matrix of the classifier (model)
29
30 samples should be n samples by m variables'''
31 classifications = {}
32 predictions = model.predict(samples)
33 for predicted, y in zip(predictions, responses):
34 classifications[(y, predicted)] = classifications.get((y, predicted), 0)+1
35 return classifications
36
37 if opencvAvailable:
38 class SVM(object):
39 '''wrapper for OpenCV SimpleVectorMachine algorithm'''
40 def __init__(self, svmType = cv2.ml.SVM_C_SVC, kernelType = cv2.ml.SVM_RBF, degree = 0, gamma = 1, coef0 = 0, Cvalue = 1, nu = 0, p = 0):
41 self.model = cv2.ml.SVM_create()
42 self.model.setType(svmType)
43 self.model.setKernel(kernelType)
44 self.model.setDegree(degree)
45 self.model.setGamma(gamma)
46 self.model.setCoef0(coef0)
47 self.model.setC(Cvalue)
48 self.model.setNu(nu)
49 self.model.setP(p)
50
51 def save(self, filename):
52 self.model.save(filename)
53
54 def train(self, samples, layout, responses, computePerformance = False):
55 self.model.train(samples, layout, responses)
56 if computePerformance:
57 return computeConfusionMatrix(self, samples, responses)
58
59 def predict(self, hog):
60 retval, predictions = self.model.predict(hog)
61 if hog.shape[0] == 1:
62 return predictions[0][0]
63 else:
64 return np.asarray(predictions, dtype = np.int).ravel().tolist()
65
66 def SVM_load(filename):
67 if path.exists(filename):
68 svm = SVM()
69 svm.model = cv2.ml.SVM_load(filename)
70 return svm
71 else:
72 print('Provided filename {} does not exist: model not loaded!'.format(filename))
73
74 #####################
75 # Clustering
76 #####################
77
78 class Centroid(object):
79 'Wrapper around instances to add a counter'
80
81 def __init__(self, instance, nInstances = 1):
82 self.instance = instance
83 self.nInstances = nInstances
84
85 # def similar(instance2):
86 # return self.instance.similar(instance2)
87
88 def add(self, instance2):
89 self.instance = self.instance.multiply(self.nInstances)+instance2
90 self.nInstances += 1
91 self.instance = self.instance.multiply(1/float(self.nInstances))
92
93 def average(c):
94 inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances)
95 inst.multiply(1/(self.nInstances+instance.nInstances))
96 return Centroid(inst, self.nInstances+instance.nInstances)
97
98 def plot(self, options = ''):
99 self.instance.plot(options)
100 text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances))
101
102 def kMedoids(similarityMatrix, initialCentroids = None, k = None):
103 '''Algorithm that clusters any dataset based on a similarity matrix
104 Either the initialCentroids or k are passed'''
105 pass
106
107 def assignCluster(data, similarFunc, initialCentroids = None, shuffleData = True):
108 '''k-means algorithm with similarity function
109 Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function.
110 The number of clusters will be determined accordingly
111
112 data: list of instances
113 averageCentroid: '''
114 localdata = copy(data) # shallow copy to avoid modifying data
115 if shuffleData:
116 shuffle(localdata)
117 if initialCentroids is None:
118 centroids = [Centroid(localdata[0])]
119 else:
120 centroids = deepcopy(initialCentroids)
121 for instance in localdata[1:]:
122 i = 0
123 while i<len(centroids) and not similarFunc(centroids[i].instance, instance):
124 i += 1
125 if i == len(centroids):
126 centroids.append(Centroid(instance))
127 else:
128 centroids[i].add(instance)
129
130 return centroids
131
132 # TODO recompute centroids for each cluster: instance that minimizes some measure to all other elements
133
134 def spectralClustering(similarityMatrix, k, iter=20):
135 '''Spectral Clustering algorithm'''
136 n = len(similarityMatrix)
137 # create Laplacian matrix
138 rowsum = np.sum(similarityMatrix,axis=0)
139 D = np.diag(1 / np.sqrt(rowsum))
140 I = np.identity(n)
141 L = I - np.dot(D,np.dot(similarityMatrix,D))
142 # compute eigenvectors of L
143 U,sigma,V = np.linalg.svd(L)
144 # create feature vector from k first eigenvectors
145 # by stacking eigenvectors as columns
146 features = np.array(V[:k]).T
147 # k-means
148 features = whiten(features)
149 centroids,distortion = kmeans(features,k, iter)
150 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster)
151 return code,sigma
152
153 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0):
154 '''Assigns instances to prototypes
155 if minClusterSize is not 0, the clusters will be refined by removing iteratively the smallest clusters
156 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize
157
158 labels are indices in the prototypeIndices'''
159 if similarityFunc is None:
160 print('similarityFunc is None')
161 return None
162
163 indices = [i for i in range(len(instances)) if i not in prototypeIndices]
164 labels = [-1]*len(instances)
165 assign = True
166 while assign:
167 for i in prototypeIndices:
168 labels[i] = i
169 for i in indices:
170 for j in prototypeIndices:
171 if similarities[i][j] < 0:
172 similarities[i][j] = similarityFunc(instances[i], instances[j])
173 similarities[j][i] = similarities[i][j]
174 label = similarities[i][prototypeIndices].argmax()
175 if similarities[i][prototypeIndices[label]] >= minSimilarity:
176 labels[i] = prototypeIndices[label]
177 else:
178 labels[i] = -1 # outlier
179 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
180 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get)
181 assign = (clusterSizes[smallestClusterIndex] < minClusterSize)
182 if assign:
183 prototypeIndices.remove(smallestClusterIndex)
184 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex]
185 return prototypeIndices, labels
186
187 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = False, randomInitialization = False, initialPrototypeIndices = None):
188 '''Finds exemplar (prototype) instance that represent each cluster
189 Returns the prototype indices (in the instances list)
190
191 the elements in the instances list must have a length (method __len__), or one can use the optimizeCentroid
192 the positions in the instances list corresponds to the similarities
193 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed)
194 similarities must still be allocated with the right size
195
196 if an instance is different enough (<minSimilarity),
197 it will become a new prototype.
198 Non-prototype instances will be assigned to an existing prototype
199
200 if optimizeCentroid is True, each time an element is added, we recompute the centroid trajectory as the most similar to all in the cluster
201
202 initialPrototypeIndices are indices in instances
203
204 TODO: check how similarity evolves in clusters'''
205 if len(instances) == 0:
206 print('no instances to cluster (empty list)')
207 return None
208 if similarityFunc is None:
209 print('similarityFunc is None')
210 return None
211
212 # sort instances based on length
213 indices = range(len(instances))
214 if randomInitialization or optimizeCentroid:
215 indices = np.random.permutation(indices).tolist()
216 else:
217 def compare(i, j):
218 if len(instances[i]) > len(instances[j]):
219 return -1
220 elif len(instances[i]) == len(instances[j]):
221 return 0
222 else:
223 return 1
224 indices.sort(compare)
225 # initialize clusters
226 clusters = []
227 if initialPrototypeIndices is None:
228 prototypeIndices = [indices[0]]
229 else:
230 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances
231 for i in prototypeIndices:
232 clusters.append([i])
233 indices.remove(i)
234 # go through all instances
235 for i in indices:
236 for j in prototypeIndices:
237 if similarities[i][j] < 0:
238 similarities[i][j] = similarityFunc(instances[i], instances[j])
239 similarities[j][i] = similarities[i][j]
240 label = similarities[i][prototypeIndices].argmax() # index in prototypeIndices
241 if similarities[i][prototypeIndices[label]] < minSimilarity:
242 prototypeIndices.append(i)
243 clusters.append([])
244 else:
245 clusters[label].append(i)
246 if optimizeCentroid:
247 if len(clusters[label]) >= 2: # no point if only one element in cluster
248 for j in clusters[label][:-1]:
249 if similarities[i][j] < 0:
250 similarities[i][j] = similarityFunc(instances[i], instances[j])
251 similarities[j][i] = similarities[i][j]
252 clusterIndices = clusters[label]
253 clusterSimilarities = similarities[clusterIndices][:,clusterIndices]
254 newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()]
255 if prototypeIndices[label] != newCentroidIdx:
256 prototypeIndices[label] = newCentroidIdx
257 elif len(instances[prototypeIndices[label]]) < len(instances[i]): # replace prototype by current instance i if longer # otherwise, possible to test if randomInitialization or initialPrototypes is not None
258 prototypeIndices[label] = i
259 return prototypeIndices
260
261 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1):
262 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
263 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex)
264 return clusterSizes
265
266 def computeClusterStatistics(labels, prototypeIndices, instances, similarities, similarityFunc, clusters = None, outlierIndex = -1):
267 if clusters is None:
268 clusters = {protoId:[] for protoId in prototypeIndices+[-1]}
269 for i,l in enumerate(labels):
270 clusters[l].append(i)
271 clusters = [clusters[protoId] for protoId in prototypeIndices]
272 for i, cluster in enumerate(clusters):
273 n = len(cluster)
274 print('cluster {}: {} elements'.format(prototypeIndices[i], n))
275 if n >=2:
276 for j,k in enumerate(cluster):
277 for l in cluster[:j]:
278 if similarities[k][l] < 0:
279 similarities[k][l] = similarityFunc(instances[k], instances[l])
280 similarities[l][k] = similarities[k][l]
281 print('Mean similarity to prototype: {}'.format((similarities[prototypeIndices[i]][cluster].sum()+1)/(n-1)))
282 print('Mean overall similarity: {}'.format((similarities[cluster][:,cluster].sum()+n)/(n*(n-1))))
283
284 # Gaussian Mixture Models
285 def plotGMM(mean, covariance, gmmId, fig, color, alpha = 0.3):
286 v, w = np.linalg.eigh(covariance)
287 angle = 180*np.arctan2(w[0][1], w[0][0])/np.pi
288 v *= 4
289 ell = mpl.patches.Ellipse(mean, v[0], v[1], 180+angle, color=color)
290 ell.set_clip_box(fig.bbox)
291 ell.set_alpha(alpha)
292 fig.axes[0].add_artist(ell)
293 plt.plot([mean[0]], [mean[1]], 'x'+color)
294 plt.annotate(str(gmmId), xy=(mean[0]+1, mean[1]+1))
295
296 def plotGMMClusters(model, labels = None, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3):
297 '''plot the ellipse corresponding to the Gaussians
298 and the predicted classes of the instances in the dataset'''
299 if fig is None:
300 fig = plt.figure()
301 if len(fig.get_axes()) == 0:
302 fig.add_subplot(111)
303 for i in range(model.n_components):
304 mean = model.means_[i]/nUnitsPerPixel
305 covariance = model.covariances_[i]/nUnitsPerPixel
306 # plot points
307 if dataset is not None:
308 tmpDataset = dataset/nUnitsPerPixel
309 plt.scatter(tmpDataset[labels == i, 0], tmpDataset[labels == i, 1], .8, color=colors[i])
310 # plot an ellipse to show the Gaussian component
311 plotGMM(mean, covariance, i, fig, colors[i], alpha)
312 if dataset is None: # to address issues without points, the axes limits are not redrawn
313 minima = model.means_.min(0)
314 maxima = model.means_.max(0)
315 xwidth = 0.5*(maxima[0]-minima[0])
316 ywidth = 0.5*(maxima[1]-minima[1])
317 plt.xlim(minima[0]-xwidth,maxima[0]+xwidth)
318 plt.ylim(minima[1]-ywidth,maxima[1]+ywidth)
319
320 if __name__ == "__main__":
321 import doctest
322 import unittest
323 suite = doctest.DocFileSuite('tests/ml.txt')
324 unittest.TextTestRunner().run(suite)
325 # #doctest.testmod()
326 # #doctest.testfile("example.txt")