view python/ml.py @ 184:d70e9b36889c

initial work on flow vectors and clustering algorithms
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Fri, 25 Nov 2011 18:38:54 -0500
parents ed944ff45e8c
children 5957aa1d69e1 8bafd054cda4
line wrap: on
line source

#! /usr/bin/env python
'''Libraries for machine learning algorithms'''

__metaclass__ = type

class Centroid:
    'Wrapper around instances to add a counter'

    def __init__(self, instance, nInstances = 1):
        self.instance = instance
        self.nInstances = nInstances

    # def similar(instance2):
    #     return self.instance.similar(instance2)

    def add(self, instance2):
        self.instance = self.instance.multiply(self.nInstances)+instance2
        self.nInstances += 1
        self.instance = self.instance.multiply(1/float(self.nInstances))

    def average(c):
        inst = self.instance.multiply(self.nInstances)+c.instance.multiply(instance.nInstances)
        inst.multiply(1/(self.nInstances+instance.nInstances))
        return Centroid(inst, self.nInstances+instance.nInstances)

    def draw(self, options = ''):
        from matplotlib.pylab import text
        self.instance.draw(options)
        text(self.instance.position.x+1, self.instance.position.y+1, str(self.nInstances))


def clustering(data, similar, initialCentroids = []):
    '''k-means algorithm with similarity function
    Two instances should be in the same cluster if the sameCluster function returns true for two instances. It is supposed that the average centroid of a set of instances can be computed, using the function. 
    The number of clusters will be determined accordingly

    data: list of instances
    averageCentroid: '''

    from random import shuffle
    from copy import copy, deepcopy
    localdata = copy(data) # shallow copy to avoid modifying data
    shuffle(localdata)
    if initialCentroids:
        centroids = deepcopy(initialCentroids)
    else:
        centroids = [Centroid(localdata[0])]
    for instance in localdata[1:]:
        i = 0
        while i<len(centroids) and not similar(centroids[i].instance, instance):
            i += 1
        if i == len(centroids):
            centroids.append(Centroid(instance))
        else:
            centroids[i].add(instance)

    return centroids