diff trafficintelligence/ml.py @ 1028:cc5cb04b04b0
major update using the trafficintelligence package name and install through pip
| author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
|---|---|
| date | Fri, 15 Jun 2018 11:19:10 -0400 |
| parents | python/ml.py@6ba30b259525 |
| children | 8ffb3ae9f3d2 |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trafficintelligence/ml.py	Fri Jun 15 11:19:10 2018 -0400
@@ -0,0 +1,326 @@
+#! /usr/bin/env python
+'''Libraries for machine learning algorithms'''
+
+from os import path
+from random import shuffle
+from copy import copy, deepcopy
+
+import numpy as np
+from matplotlib.pylab import text
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+from scipy.cluster.vq import kmeans, whiten, vq
+from sklearn import mixture
+try:
+    import cv2
+    opencvAvailable = True
+except ImportError:
+    print('OpenCV library could not be loaded (video replay functions will not be available)') # TODO change to logging module
+    opencvAvailable = False
+
+from trafficintelligence import utils
+
+#####################
+# OpenCV ML models
+#####################
+
+def computeConfusionMatrix(model, samples, responses):
+    '''computes the confusion matrix of the classifier (model)
+
+    samples should be n samples by m variables'''
+    classifications = {}
+    predictions = model.predict(samples)
+    for predicted, y in zip(predictions, responses):
+        classifications[(y, predicted)] = classifications.get((y, predicted), 0)+1
+    return classifications
+
+if opencvAvailable:
+    class SVM(object):
+        '''wrapper for the OpenCV Support Vector Machine algorithm'''
+        def __init__(self, svmType = cv2.ml.SVM_C_SVC, kernelType = cv2.ml.SVM_RBF, degree = 0, gamma = 1, coef0 = 0, Cvalue = 1, nu = 0, p = 0):
+            self.model = cv2.ml.SVM_create()
+            self.model.setType(svmType)
+            self.model.setKernel(kernelType)
+            self.model.setDegree(degree)
+            self.model.setGamma(gamma)
+            self.model.setCoef0(coef0)
+            self.model.setC(Cvalue)
+            self.model.setNu(nu)
+            self.model.setP(p)
+
+        def save(self, filename):
+            self.model.save(filename)
+
+        def train(self, samples, layout, responses, computePerformance = False):
+            self.model.train(samples, layout, responses)
+            if computePerformance:
+                return computeConfusionMatrix(self, samples, responses)
+
+        def predict(self, hog):
+            retval, predictions = self.model.predict(hog)
+            if hog.shape[0] == 1:
+                return predictions[0][0]
+            else:
+                return np.asarray(predictions, dtype = np.int).ravel().tolist()
+
+    def SVM_load(filename):
+        if path.exists(filename):
+            svm = SVM()
+            svm.model = cv2.ml.SVM_load(filename)
+            return svm
+        else:
+            print('Provided filename {} does not exist: model not loaded!'.format(filename))
+
+#####################
+# Clustering
+#####################
+
+class Centroid(object):
+    'Wrapper around instances to add a counter'
+
+    def __init__(self, instance, nInstances = 1):
+        self.instance = instance
+        self.nInstances = nInstances
+
+    # def similar(instance2):
+    #     return self.instance.similar(instance2)
+
+    def add(self, instance2):
+        self.instance = self.instance.multiply(self.nInstances)+instance2
+        self.nInstances += 1
+        self.instance = self.instance.multiply(1/float(self.nInstances))
+
+    def average(self, c):
+        inst = self.instance.multiply(self.nInstances)+c.instance.multiply(c.nInstances)
+        inst = inst.multiply(1/float(self.nInstances+c.nInstances))
+        return Centroid(inst, self.nInstances+c.nInstances)
+
+    def plot(self, options = ''):
+        self.instance.plot(options)
+        text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances))
+
+def kMedoids(similarityMatrix, initialCentroids = None, k = None):
+    '''Algorithm that clusters any dataset based on a similarity matrix
+    Either initialCentroids or k must be passed'''
+    pass
+
+def assignCluster(data, similarFunc, initialCentroids = None, shuffleData = True):
+    '''k-means algorithm with similarity function
+    Two instances are put in the same cluster if similarFunc returns true for them. It is assumed that the average centroid of a set of instances can be computed.
+    The number of clusters will be determined accordingly
+
+    data: list of instances
+    averageCentroid: '''
+    localdata = copy(data) # shallow copy to avoid modifying data
+    if shuffleData:
+        shuffle(localdata)
+    if initialCentroids is None:
+        centroids = [Centroid(localdata[0])]
+    else:
+        centroids = deepcopy(initialCentroids)
+    for instance in localdata[1:]:
+        i = 0
+        while i<len(centroids) and not similarFunc(centroids[i].instance, instance):
+            i += 1
+        if i == len(centroids):
+            centroids.append(Centroid(instance))
+        else:
+            centroids[i].add(instance)
+
+    return centroids
+
+# TODO recompute centroids for each cluster: instance that minimizes some measure to all other elements
+
+def spectralClustering(similarityMatrix, k, iter=20):
+    '''Spectral Clustering algorithm'''
+    n = len(similarityMatrix)
+    # create Laplacian matrix
+    rowsum = np.sum(similarityMatrix,axis=0)
+    D = np.diag(1 / np.sqrt(rowsum))
+    I = np.identity(n)
+    L = I - np.dot(D,np.dot(similarityMatrix,D))
+    # compute eigenvectors of L
+    U,sigma,V = np.linalg.svd(L)
+    # create feature vector from k first eigenvectors
+    # by stacking eigenvectors as columns
+    features = np.array(V[:k]).T
+    # k-means
+    features = whiten(features)
+    centroids,distortion = kmeans(features,k, iter)
+    code,distance = vq(features,centroids) # code starting from 0 (represents the first cluster) to k-1 (last cluster)
+    return code,sigma
+
+def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0):
+    '''Assigns instances to prototypes
+    if minClusterSize is not 0, the clusters are refined by iteratively removing the smallest cluster
+    and reassigning its elements until no cluster is smaller than minClusterSize
+
+    labels contain the index (in instances) of the assigned prototype, or -1 for outliers'''
+    if similarityFunc is None:
+        print('similarityFunc is None')
+        return None
+
+    indices = [i for i in range(len(instances)) if i not in prototypeIndices]
+    labels = [-1]*len(instances)
+    assign = True
+    while assign:
+        for i in prototypeIndices:
+            labels[i] = i
+        for i in indices:
+            for j in prototypeIndices:
+                if similarities[i][j] < 0:
+                    similarities[i][j] = similarityFunc(instances[i], instances[j])
+                    similarities[j][i] = similarities[i][j]
+            label = similarities[i][prototypeIndices].argmax()
+            if similarities[i][prototypeIndices[label]] >= minSimilarity:
+                labels[i] = prototypeIndices[label]
+            else:
+                labels[i] = -1 # outlier
+        clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
+        smallestClusterIndex = min(clusterSizes, key = clusterSizes.get)
+        assign = (clusterSizes[smallestClusterIndex] < minClusterSize)
+        if assign:
+            prototypeIndices.remove(smallestClusterIndex)
+            indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex]
+    return prototypeIndices, labels
+
+def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = False, randomInitialization = False, initialPrototypeIndices = None):
+    '''Finds exemplar (prototype) instances that represent each cluster
+    Returns the prototype indices (in the instances list)
+
+    the elements in the instances list must have a length (method __len__), or one can use optimizeCentroid
+    the positions in the instances list correspond to the indices in the similarities matrix
+    if similarityFunc is provided, missing similarities (negative entries, i.e. not yet computed) are calculated as needed (this is faster)
+    similarities must still be allocated with the right size
+
+    if an instance is different enough (<minSimilarity),
+    it will become a new prototype.
+    Non-prototype instances will be assigned to an existing prototype
+
+    if optimizeCentroid is True, each time an element is added, the centroid trajectory is recomputed as the one most similar to all others in the cluster
+
+    initialPrototypeIndices are indices in instances
+
+    TODO: check how similarity evolves in clusters'''
+    if len(instances) == 0:
+        print('no instances to cluster (empty list)')
+        return None
+    if similarityFunc is None:
+        print('similarityFunc is None')
+        return None
+
+    # sort instances based on length
+    indices = list(range(len(instances)))
+    if randomInitialization or optimizeCentroid:
+        indices = np.random.permutation(indices).tolist()
+    else:
+        indices.sort(key = lambda i: len(instances[i]), reverse = True) # longest instances first
+    # initialize clusters
+    clusters = []
+    if initialPrototypeIndices is None:
+        prototypeIndices = [indices[0]]
+    else:
+        prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances
+    for i in prototypeIndices:
+        clusters.append([i])
+        indices.remove(i)
+    # go through all instances
+    for i in indices:
+        for j in prototypeIndices:
+            if similarities[i][j] < 0:
+                similarities[i][j] = similarityFunc(instances[i], instances[j])
+                similarities[j][i] = similarities[i][j]
+        label = similarities[i][prototypeIndices].argmax() # index in prototypeIndices
+        if similarities[i][prototypeIndices[label]] < minSimilarity:
+            prototypeIndices.append(i)
+            clusters.append([])
+        else:
+            clusters[label].append(i)
+            if optimizeCentroid:
+                if len(clusters[label]) >= 2: # no point if only one element in cluster
+                    for j in clusters[label][:-1]:
+                        if similarities[i][j] < 0:
+                            similarities[i][j] = similarityFunc(instances[i], instances[j])
+                            similarities[j][i] = similarities[i][j]
+                    clusterIndices = clusters[label]
+                    clusterSimilarities = similarities[clusterIndices][:,clusterIndices]
+                    newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()]
+                    if prototypeIndices[label] != newCentroidIdx:
+                        prototypeIndices[label] = newCentroidIdx
+            elif len(instances[prototypeIndices[label]]) < len(instances[i]): # replace prototype by current instance i if longer # otherwise, possible to test if randomInitialization or initialPrototypes is not None
+                prototypeIndices[label] = i
+    return prototypeIndices
+
+def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1):
+    clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
+    clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex)
+    return clusterSizes
+
+def computeClusterStatistics(labels, prototypeIndices, instances, similarities, similarityFunc, clusters = None, outlierIndex = -1):
+    if clusters is None:
+        clusters = {protoId:[] for protoId in prototypeIndices+[-1]}
+        for i,l in enumerate(labels):
+            clusters[l].append(i)
+        clusters = [clusters[protoId] for protoId in prototypeIndices]
+    for i, cluster in enumerate(clusters):
+        n = len(cluster)
+        print('cluster {}: {} elements'.format(prototypeIndices[i], n))
+        if n >=2:
+            for j,k in enumerate(cluster):
+                for l in cluster[:j]:
+                    if similarities[k][l] < 0:
+                        similarities[k][l] = similarityFunc(instances[k], instances[l])
+                        similarities[l][k] = similarities[k][l]
+            print('Mean similarity to prototype: {}'.format((similarities[prototypeIndices[i]][cluster].sum()+1)/(n-1)))
+            print('Mean overall similarity: {}'.format((similarities[cluster][:,cluster].sum()+n)/(n*(n-1))))
+
+# Gaussian Mixture Models
+def plotGMM(mean, covariance, gmmId, fig, color, alpha = 0.3):
+    v, w = np.linalg.eigh(covariance)
+    angle = 180*np.arctan2(w[0][1], w[0][0])/np.pi
+    v *= 4
+    ell = mpl.patches.Ellipse(mean, v[0], v[1], 180+angle, color=color)
+    ell.set_clip_box(fig.bbox)
+    ell.set_alpha(alpha)
+    fig.axes[0].add_artist(ell)
+    plt.plot([mean[0]], [mean[1]], 'x'+color)
+    plt.annotate(str(gmmId), xy=(mean[0]+1, mean[1]+1))
+
+def plotGMMClusters(model, labels = None, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3):
+    '''plot the ellipse corresponding to the Gaussians
+    and the predicted classes of the instances in the dataset'''
+    if fig is None:
+        fig = plt.figure()
+    if len(fig.get_axes()) == 0:
+        fig.add_subplot(111)
+    for i in range(model.n_components):
+        mean = model.means_[i]/nUnitsPerPixel
+        covariance = model.covariances_[i]/nUnitsPerPixel
+        # plot points
+        if dataset is not None:
+            tmpDataset = dataset/nUnitsPerPixel
+            plt.scatter(tmpDataset[labels == i, 0], tmpDataset[labels == i, 1], .8, color=colors[i])
+        # plot an ellipse to show the Gaussian component
+        plotGMM(mean, covariance, i, fig, colors[i], alpha)
+    if dataset is None: # to address issues without points, the axes limits are not redrawn
+        minima = model.means_.min(0)
+        maxima = model.means_.max(0)
+        xwidth = 0.5*(maxima[0]-minima[0])
+        ywidth = 0.5*(maxima[1]-minima[1])
+        plt.xlim(minima[0]-xwidth,maxima[0]+xwidth)
+        plt.ylim(minima[1]-ywidth,maxima[1]+ywidth)
+
+if __name__ == "__main__":
+    import doctest
+    import unittest
+    suite = doctest.DocFileSuite('tests/ml.txt')
+    unittest.TextTestRunner().run(suite)
+# #doctest.testmod()
+# #doctest.testfile("example.txt")
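
For orientation, a minimal usage sketch of the prototype clustering functions in this file (not part of the changeset): the toy instances, the mean-based similarity function and the 0.5 similarity threshold below are illustrative assumptions, not values taken from the repository.

    import numpy as np
    from trafficintelligence import ml

    # toy "trajectories": plain lists of numbers, which have a length (__len__)
    instances = [[1, 1, 1], [1, 1, 2], [10, 10, 10], [10, 11, 10, 10], [1, 2, 1, 1]]

    def similarity(a, b):
        # illustrative similarity in (0, 1], larger when the means are closer (assumption)
        return 1./(1.+abs(np.mean(a)-np.mean(b)))

    n = len(instances)
    similarities = -np.ones((n, n)) # right size, negative entries mean "not yet computed"

    prototypeIndices = ml.prototypeCluster(instances, similarities, 0.5, similarityFunc = similarity)
    prototypeIndices, labels = ml.assignToPrototypeClusters(instances, prototypeIndices, similarities, 0.5, similarityFunc = similarity)
    print(ml.computeClusterSizes(labels, prototypeIndices))

The same similarities array is passed to both functions so that pairwise similarities computed by prototypeCluster are reused, rather than recomputed, during assignment.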