view python/ml.py @ 949:d6c1c05d11f5

modified multithreading at the interaction level for safety computations
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Fri, 21 Jul 2017 17:52:56 -0400
parents 89cc05867c4c
children a9b2beef0db4
line wrap: on
line source

#! /usr/bin/env python
'''Libraries for machine learning algorithms'''

from os import path
from random import shuffle
from copy import copy, deepcopy

import numpy as np
from matplotlib.pylab import text
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans, whiten, vq
from sklearn import mixture
import cv2

import utils

#####################
# OpenCV ML models
#####################

class StatModel(object):
    '''Abstract class for loading/saving model'''    
    def load(self, filename):
        if path.exists(filename):
            self.model.load(filename)
        else:
            print('Provided filename {} does not exist: model not loaded!'.format(filename))

    def save(self, filename):
        self.model.save(filename)

class SVM(StatModel):
    '''wrapper for OpenCV SimpleVectorMachine algorithm'''
    def __init__(self, svmType = cv2.SVM_C_SVC, kernelType = cv2.SVM_RBF, degree = 0, gamma = 1, coef0 = 0, Cvalue = 1, nu = 0, p = 0):
        self.model = cv2.SVM()
        self.params = dict(svm_type = svmType, kernel_type = kernelType, degree = degree, gamma = gamma, coef0 = coef0, Cvalue = Cvalue, nu = nu, p = p)
        # OpenCV3
        # self.model = cv2.SVM()
        # self.model.setType(svmType)
        # self.model.setKernel(kernelType)
        # self.model.setDegree(degree)
        # self.model.setGamma(gamma)
        # self.model.setCoef0(coef0)
        # self.model.setC(Cvalue)
        # self.model.setNu(nu)
        # self.model.setP(p)

    def train(self, samples, responses):
        self.model.train(samples, responses, params = self.params)

    def predict(self, hog):
        return self.model.predict(hog)


#####################
# Clustering
#####################

class Centroid(object):
    'Wrapper around instances to add a counter'

    def __init__(self, instance, nInstances = 1):
        self.instance = instance
        self.nInstances = nInstances

    # def similar(instance2):
    #     return self.instance.similar(instance2)

    def add(self, instance2):
        self.instance = self.instance.multiply(self.nInstances)+instance2
        self.nInstances += 1
        self.instance = self.instance.multiply(1/float(self.nInstances))

    def average(c):
        inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances)
        inst.multiply(1/(self.nInstances+instance.nInstances))
        return Centroid(inst, self.nInstances+instance.nInstances)

    def plot(self, options = ''):
        self.instance.plot(options)
        text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances))

def kMedoids(similarityMatrix, initialCentroids = None, k = None):
    '''Algorithm that clusters any dataset based on a similarity matrix
    Either the initialCentroids or k are passed'''
    pass

def assignCluster(data, similarFunc, initialCentroids = None, shuffleData = True):
    '''k-means algorithm with similarity function
    Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function. 
    The number of clusters will be determined accordingly

    data: list of instances
    averageCentroid: '''
    localdata = copy(data) # shallow copy to avoid modifying data
    if shuffleData:
        shuffle(localdata)
    if initialCentroids is None:
        centroids = [Centroid(localdata[0])]
    else:
        centroids = deepcopy(initialCentroids)
    for instance in localdata[1:]:
        i = 0
        while i<len(centroids) and not similarFunc(centroids[i].instance, instance):
            i += 1
        if i == len(centroids):
            centroids.append(Centroid(instance))
        else:
            centroids[i].add(instance)

    return centroids

# TODO recompute centroids for each cluster: instance that minimizes some measure to all other elements

def spectralClustering(similarityMatrix, k, iter=20):
    '''Spectral Clustering algorithm'''
    n = len(similarityMatrix)
    # create Laplacian matrix
    rowsum = np.sum(similarityMatrix,axis=0)
    D = np.diag(1 / np.sqrt(rowsum))
    I = np.identity(n)
    L = I - np.dot(D,np.dot(similarityMatrix,D))
    # compute eigenvectors of L
    U,sigma,V = np.linalg.svd(L)
    # create feature vector from k first eigenvectors
    # by stacking eigenvectors as columns
    features = np.array(V[:k]).T
    # k-means
    features = whiten(features)
    centroids,distortion = kmeans(features,k, iter)
    code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster)
    return code,sigma

class Cluster:
    'Represents a cluster, with a prototype id and the list of instances in cluster'
    def __init__(prototypeId, memberIndices = []):
        self.prototypeId = prototypeId
        self.memberIndices = memberIndices

def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = None):
    '''Assigns instances to prototypes 
    if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters
    and reassigning all elements in the cluster until no cluster is smaller than minClusterSize'''
    indices = [i for i in range(len(instances)) if i not in prototypeIndices]
    labels = [-1]*len(instances)
    assign = True
    while assign:
        for i in prototypeIndices:
            labels[i] = i
        for i in indices:
            if similarityFunc is not None:
                for j in prototypeIndices:
                    if similarities[i][j] < 0:
                        similarities[i][j] = similarityFunc(instances[i], instances[j])
                        similarities[j][i] = similarities[i][j]
            prototypeIdx = similarities[i][prototypeIndices].argmax()
            if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity:
                labels[i] = prototypeIndices[prototypeIdx]
            else:
                labels[i] = -1 # outlier
        clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
        smallestClusterIndex = min(clusterSizes, key = clusterSizes.get)
        assign = (clusterSizes[smallestClusterIndex] < minClusterSize)
        if assign:
            prototypeIndices.remove(smallestClusterIndex)
            indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex]
    return prototypeIndices, labels

def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0, optimizeCentroid = True, randomInitialization = False, assign = True, initialPrototypeIndices = None):
    '''Finds exemplar (prototype) instance that represent each cluster
    Returns the prototype indices (in the instances list) and the cluster label of each instance

    the elements in the instances list must have a length (method __len__), or one can use the random initialization
    the positions in the instances list corresponds to the similarities
    if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed)
    similarities must still be allocated with the right size

    if an instance is different enough (<minSimilarity), 
    it will become a new prototype. 
    Non-prototype instances will be assigned to an existing prototype

    if optimizeCentroid is True, each time an element is added, we recompute the centroid trajectory as the most similar to all in the cluster

    initialPrototypeIndices are indices in instances

    TODO: check how similarity evolves in clusters'''
    if len(instances) == 0:
        print('no instances to cluster (empty list)')
        return None
    if similarityFunc is None:
        print('similarityFunc is None')
        return None

    # sort instances based on length
    indices = range(len(instances))
    if randomInitialization or optimizeCentroid:
        indices = np.random.permutation(indices)
    else:
        def compare(i, j):
            if len(instances[i]) > len(instances[j]):
                return -1
            elif len(instances[i]) == len(instances[j]):
                return 0
            else:
                return 1
        indices.sort(compare)
    # go through all instances
    clusters = []
    if initialPrototypeIndices is None:
        prototypeIndices = [indices[0]]
    else:
        prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances
    for i in prototypeIndices:
        clusters.append([i])
        indices.remove(i)
    for i in indices:
        for j in prototypeIndices:
            if similarities[i][j] < 0:
                similarities[i][j] = similarityFunc(instances[i], instances[j])
                similarities[j][i] = similarities[i][j]
        label = similarities[i][prototypeIndices].argmax()
        if similarities[i][prototypeIndices[label]] < minSimilarity:
            prototypeIndices.append(i)
            clusters.append([])
        else:
            clusters[label].append(i)
            if optimizeCentroid:
                if len(clusters[label]) >= 2: # no point if only one element in cluster
                    for j in clusters[label][:-1]:
                        if similarities[i][j] < 0:
                            similarities[i][j] = similarityFunc(instances[i], instances[j])
                            similarities[j][i] = similarities[i][j]
                    clusterIndices = clusters[label]
                    clusterSimilarities = similarities[clusterIndices][:,clusterIndices]
                    newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()]
                    if prototypeIndices[label] != newCentroidIdx:
                        prototypeIndices[label] = newCentroidIdx
            elif randomInitialization: # replace prototype by current instance i if longer
                if len(instances[prototypeIndices[label]]) < len(instances[i]):
                    prototypeIndices[label] = i
                    
    if assign:
        return assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc, minClusterSize)
    else:
        return prototypeIndices, None

def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1):
    clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
    clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex)
    return clusterSizes

def computeClusterStatistics(labels, prototypeIndices, instances, similarities, similarityFunc, clusters = None, outlierIndex = -1):
    if clusters is None:
        clusters = {protoId:[] for protoId in prototypeIndices+[-1]}
        for i,l in enumerate(labels):
            clusters[l].append(i)
        clusters = [clusters[protoId] for protoId in prototypeIndices]
    for i, cluster in enumerate(clusters):
        n = len(cluster)
        print('cluster {}: {} elements'.format(prototypeIndices[i], n))
        if n >=2:
            for j,k in enumerate(cluster):
                for l in cluster[:j]:
                    if similarities[k][l] < 0:
                        similarities[k][l] = similarityFunc(instances[k], instances[l])
                        similarities[l][k] = similarities[k][l]
            print('Mean similarity to prototype: {}'.format((similarities[prototypeIndices[i]][cluster].sum()+1)/(n-1)))
            print('Mean overall similarity: {}'.format((similarities[cluster][:,cluster].sum()+n)/(n*(n-1))))

# Gaussian Mixture Models
def plotGMM(mean, covariance, gmmId, fig, color, alpha = 0.3):
    v, w = np.linalg.eigh(covariance)
    angle = 180*np.arctan2(w[0][1], w[0][0])/np.pi
    v *= 4
    ell = mpl.patches.Ellipse(mean, v[0], v[1], 180+angle, color=color)
    ell.set_clip_box(fig.bbox)
    ell.set_alpha(alpha)
    fig.axes[0].add_artist(ell)
    plt.plot([mean[0]], [mean[1]], 'x'+color)
    plt.annotate(str(gmmId), xy=(mean[0]+1, mean[1]+1))

def plotGMMClusters(model, labels = None, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3):
    '''plot the ellipse corresponding to the Gaussians
    and the predicted classes of the instances in the dataset'''
    if fig is None:
        fig = plt.figure()
    if len(fig.get_axes()) == 0:
        fig.add_subplot(111)
    for i in xrange(model.n_components):
        mean = model.means_[i]/nUnitsPerPixel
        covariance = model.covariances_[i]/nUnitsPerPixel
        # plot points
        if dataset is not None:
            tmpDataset = dataset/nUnitsPerPixel
            plt.scatter(tmpDataset[labels == i, 0], tmpDataset[labels == i, 1], .8, color=colors[i])
        # plot an ellipse to show the Gaussian component
        plotGMM(mean, covariance, i, fig, colors[i], alpha)
    if dataset is None: # to address issues without points, the axes limits are not redrawn
        minima = model.means_.min(0)
        maxima = model.means_.max(0)
        xwidth = 0.5*(maxima[0]-minima[0])
        ywidth = 0.5*(maxima[1]-minima[1])
        plt.xlim(minima[0]-xwidth,maxima[0]+xwidth)
        plt.ylim(minima[1]-ywidth,maxima[1]+ywidth)