changeset 908:b297525b2cbf

added options to the prototype cluster algorithm, work in progress
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Mon, 26 Jun 2017 00:10:35 -0400
parents 9fd7b18f75b4
children cd038493f8c6
files python/ml.py python/storage.py scripts/learn-motion-patterns.py
diffstat 3 files changed, 69 insertions(+), 16 deletions(-)
--- a/python/ml.py	Fri Jun 23 23:50:02 2017 -0400
+++ b/python/ml.py	Mon Jun 26 00:10:35 2017 -0400
@@ -132,6 +132,12 @@
     code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster)
     return code,sigma
 
+class Cluster:
+    'Represents a cluster, with a prototype id and the list of the indices of its member instances'
+    def __init__(self, prototypeId, memberIndices = None):
+        self.prototypeId = prototypeId
+        self.memberIndices = memberIndices if memberIndices is not None else []
+
 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = None):
     '''Assigns instances to prototypes 
     if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters
@@ -161,7 +167,7 @@
             indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex]
     return prototypeIndices, labels
 
-def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0, randomInitialization = False, assign = True, initialPrototypeIndices = None):
+def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0, optimizeCentroid = True, randomInitialization = False, assign = True, initialPrototypeIndices = None):
     '''Finds exemplar (prototype) instance that represent each cluster
     Returns the prototype indices (in the instances list) and the cluster label of each instance
 
@@ -174,15 +180,19 @@
     it will become a new prototype. 
     Non-prototype instances will be assigned to an existing prototype
 
-    TODO: at each step, optimize the prototype as the most similar in its current cluster (can be done easily if similarities are already computed)'''
+    if optimizeCentroid is True, the cluster prototype (centroid) is recomputed each time an instance is added, as the instance most similar to all the others in the cluster
 
-    # sort instances based on length
+    TODO: check how similarity evolves in clusters'''
     if len(instances) == 0:
         print('no instances to cluster (empty list)')
         return None
+    if similarityFunc is None:
+        print('no similarity function provided (similarityFunc is None)')
+        return None
 
+    # sort instances based on length
     indices = range(len(instances))
-    if randomInitialization:
+    if randomInitialization or optimizeCentroid:
         indices = np.random.permutation(indices)
     else:
         def compare(i, j):
@@ -194,22 +204,39 @@
                 return 1
         indices.sort(compare)
     # go through all instances
+    clusters = []
     if initialPrototypeIndices is None:
         prototypeIndices = [indices[0]]
     else:
-        prototypeIndices = initialPrototypeIndices
+        prototypeIndices = initialPrototypeIndices # TODO clarify the expected format: if these are indices, they must refer to elements of instances
+    for i in prototypeIndices:
+        clusters.append([i])
     for i in indices[1:]:
-        if similarityFunc is not None:
-            for j in prototypeIndices:
-                if similarities[i][j] < 0:
-                    similarities[i][j] = similarityFunc(instances[i], instances[j])
-                    similarities[j][i] = similarities[i][j]
-        if similarities[i][prototypeIndices].max() < minSimilarity:
+        for j in prototypeIndices:
+            if similarities[i][j] < 0:
+                similarities[i][j] = similarityFunc(instances[i], instances[j])
+                similarities[j][i] = similarities[i][j]
+        label = similarities[i][prototypeIndices].argmax()
+        if similarities[i][prototypeIndices[label]] < minSimilarity:
             prototypeIndices.append(i)
-        elif randomInitialization: # replace prototype by current instance i if longer
-            label = similarities[i][prototypeIndices].argmax()
-            if len(instances[prototypeIndices[label]]) < len(instances[i]):
-                prototypeIndices[label] = i
+            clusters.append([i]) # the new prototype belongs to its own cluster, as for the initial prototypes
+        else:
+            clusters[label].append(i)
+            if optimizeCentroid:
+                if len(clusters[label]) >= 2: # no point if only one element in cluster
+                    for j in clusters[label][:-1]:
+                        if similarities[i][j] < 0:
+                            similarities[i][j] = similarityFunc(instances[i], instances[j])
+                            similarities[j][i] = similarities[i][j]
+                    clusterIndices = clusters[label]
+                    clusterSimilarities = similarities[clusterIndices][:,clusterIndices]
+                    newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()]
+                    if prototypeIndices[label] != newCentroidIdx:
+                        prototypeIndices[label] = newCentroidIdx
+            elif randomInitialization: # replace prototype by current instance i if longer
+                if len(instances[prototypeIndices[label]]) < len(instances[i]):
+                    prototypeIndices[label] = i
+                    
     if assign:
         return assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc, minClusterSize)
     else:
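The centroid update in the loop above selects, as the new prototype, the cluster member whose summed similarity to the other members is largest. A toy numpy illustration of that selection (not project code; the similarity values are made up):

import numpy as np

# three instances, all in the same cluster; uncomputed entries (here only the diagonal)
# are -1 as in prototypeCluster, which shifts every column sum equally and does not change the argmax
similarities = np.array([[-1. ,  0.8,  0.3],
                         [ 0.8, -1. ,  0.6],
                         [ 0.3,  0.6, -1. ]])
clusterIndices = [0, 1, 2] # indices of the cluster members in the instances list
clusterSimilarities = similarities[clusterIndices][:,clusterIndices]
newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()]
print(newCentroidIdx) # 1: instance 1 has the largest summed similarity (0.8+0.6)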
@@ -220,6 +247,24 @@
     clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex)
     return clusterSizes
 
+def computeClusterStatistics(labels, prototypeIndices, instances, similarities, similarityFunc, clusters = None, outlierIndex = -1):
+    if clusters is None:
+        clusters = {protoId:[] for protoId in prototypeIndices+[outlierIndex]}
+        for i,l in enumerate(labels):
+            clusters[l].append(i)
+        clusters = [clusters[protoId] for protoId in prototypeIndices]
+    for i, cluster in enumerate(clusters):
+        n = len(cluster)
+        print('cluster {}: {} elements'.format(prototypeIndices[i], n))
+        if n >=2:
+            for j,k in enumerate(cluster):
+                for l in cluster[:j]:
+                    if similarities[k][l] < 0:
+                        similarities[k][l] = similarityFunc(instances[k], instances[l])
+                        similarities[l][k] = similarities[k][l]
+            print('Mean similarity to prototype: {}'.format((similarities[prototypeIndices[i]][cluster].sum()+1)/(n-1))) # +1 compensates for the uncomputed (-1) self-similarity of the prototype
+            print('Mean overall similarity: {}'.format((similarities[cluster][:,cluster].sum()+n)/(n*(n-1)))) # +n compensates for the n uncomputed (-1) diagonal entries
+                    
 # Gaussian Mixture Models
 def plotGMMClusters(model, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3):
     '''plot the ellipse corresponding to the Gaussians
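A small numeric check (made-up values) of the two averages printed by computeClusterStatistics above: since uncomputed entries of the similarity matrix are -1 and self-similarities are never computed, the sums are corrected by +1 (the prototype's own diagonal entry) and +n (the n diagonal entries) before dividing by the number of actual pairs.

import numpy as np

cluster = [0, 1, 2]             # member indices; assume instance 0 is the prototype
n = len(cluster)
similarities = -np.ones((3, 3)) # -1 marks pairs that were never computed (here, the diagonal)
similarities[0, 1] = similarities[1, 0] = 0.9
similarities[0, 2] = similarities[2, 0] = 0.5
similarities[1, 2] = similarities[2, 1] = 0.6
meanToPrototype = (similarities[cluster[0]][cluster].sum()+1)/(n-1) # (0.9+0.5)/2 = 0.7
meanOverall = (similarities[cluster][:,cluster].sum()+n)/(n*(n-1))  # 2*(0.9+0.5+0.6)/6 = 0.666...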
--- a/python/storage.py	Fri Jun 23 23:50:02 2017 -0400
+++ b/python/storage.py	Mon Jun 26 00:10:35 2017 -0400
@@ -19,6 +19,10 @@
                   'car':2,
                   'truck':3}
 
+tableNames = {'feature':'positions',
+              'object': 'objects',
+              'objectfeatures': 'positions'}
+
 #########################
 # Sqlite
 #########################
@@ -411,6 +415,9 @@
             else:
                 dbfn = filename
             cursor.execute('INSERT INTO prototypes (id, dbfilename, trajectory_type, nMatchings) VALUES ({},\"{}\",\"{}\",{})'.format(protoId, dbfn, trajectoryType, n))
+        cursor.execute('SELECT * from sqlite_master WHERE type = \"table\" and name = \"{}\"'.format(tableNames[trajectoryType]))
+        if len(cursor.fetchall()) == 0:
+            pass # TODO save the prototype trajectory data if the table does not exist yet
     except sqlite3.OperationalError as error:
         printDBError(error)
     connection.commit()
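The storage.py hunk above stops at a placeholder after querying sqlite_master. As a rough sketch of the existence check it is building towards (a generic sqlite3 snippet, not the project's storage API; the function name tableExists and the commented usage are made up):

import sqlite3

def tableExists(connection, tableName):
    'Returns True if a table with the given name exists in the SQLite database'
    cursor = connection.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type = 'table' AND name = ?", (tableName,))
    return len(cursor.fetchall()) > 0

# possible use, mirroring the query in the hunk above:
# if not tableExists(connection, tableNames[trajectoryType]):
#     pass # create the table and save the prototype trajectory data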
--- a/scripts/learn-motion-patterns.py	Fri Jun 23 23:50:02 2017 -0400
+++ b/scripts/learn-motion-patterns.py	Mon Jun 26 00:10:35 2017 -0400
@@ -17,6 +17,7 @@
 parser.add_argument('--metric', dest = 'metric', help = 'metric for the similarity of trajectory points', default = 'cityblock') # default is manhattan distance
 parser.add_argument('-s', dest = 'minSimilarity', help = 'minimum similarity to put a trajectory in a cluster', type = float, required = True)
 parser.add_argument('-c', dest = 'minClusterSize', help = 'minimum cluster size', type = int, default = None)
+parser.add_argument('--optimize', dest = 'optimizeCentroid', help = 'recompute centroid at each assignment', action = 'store_true')
 parser.add_argument('--random', dest = 'randomInitialization', help = 'random initialization of clustering algorithm', action = 'store_true')
 parser.add_argument('--subsample', dest = 'positionSubsamplingRate', help = 'rate of position subsampling (1 every n positions)', type = int)
 parser.add_argument('--display', dest = 'display', help = 'display trajectories', action = 'store_true')
@@ -60,7 +61,7 @@
 
 similarities = -np.ones((nTrajectories, nTrajectories))
 
-prototypeIndices, labels = ml.prototypeCluster(trajectories, similarities, args.minSimilarity, lambda x,y : lcss.computeNormalized(x, y), args.minClusterSize, args.randomInitialization, True, None) # this line can be called again without reinitializing similarities
+prototypeIndices, labels = ml.prototypeCluster(trajectories, similarities, args.minSimilarity, lambda x,y : lcss.computeNormalized(x, y), args.minClusterSize, args.optimizeCentroid, args.randomInitialization, True, None) # this line can be called again without reinitializing similarities
 
 clusterSizes = ml.computeClusterSizes(labels, prototypeIndices, -1)
 print(clusterSizes)
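The new computeClusterStatistics helper added in ml.py is not called from learn-motion-patterns.py yet; a possible way to hook it in, reusing the variables already defined in the script (a guess at the intended usage, not part of this changeset):

# right after the call to ml.prototypeCluster above:
ml.computeClusterStatistics(labels, prototypeIndices, trajectories, similarities, lambda x,y : lcss.computeNormalized(x, y))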