repo/traffic-intelligence: python/ml.py comparison

comparison python/ml.py @ 735:0e875a7f5759 dev

modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities

author	Nicolas Saunier <nicolas.saunier@polymtl.ca>
date	Wed, 12 Aug 2015 00:24:06 -0400
parents	1d4dcb5c8708
children	2472b4d59aea

comparison

equal deleted inserted replaced

-:1d4dcb5c8708
+:0e875a7f5759
 	features = whiten(features)
 	centroids,distortion = kmeans(features,k, iter)
 	code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster)
 	return code,sigma
-def prototypeCluster(instances, similarityMatrix, minSimilarity, minClusterSize = None, randomInitialization = False):
+def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = None, randomInitialization = False):
 '''Finds exemplar (prototype) instance that represent each cluster
 Returns the prototype indices (in the instances list) and the cluster label of each instance
 the elements in the instances list must have a length (method __len__), or one can use the random initialization
-the positions in the instances list corresponds to the similarityMatrix
+the positions in the instances list corresponds to the similarities
+if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed)
+similarities must still be allocated with the right size
 if an instance is different enough (<minSimilarity),
 it will become a new prototype.
 Non-prototype instances will be assigned to an existing prototype
 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters
 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize'''
 # sort instances based on length
-indices = range(similarityMatrix.shape[0])
+indices = range(len(instances))
 if randomInitialization:
 indices = np.random.permutation(indices)
 else:
 def compare(i, j):
 if len(instances[i]) > len(instances[j]):
 return 1
 indices.sort(compare)
 # go through all instances
 prototypeIndices = [indices[0]]
 for i in indices[1:]:
-if similarityMatrix[i][prototypeIndices].max() < minSimilarity:
+if similarityFunc is not None:
+for j in prototypeIndices:
+if similarities[i][j] < 0:
+similarities[i][j] = similarityFunc(instances[i], instances[j])
+similarities[j][i] = similarities[i][j]
+if similarities[i][prototypeIndices].max() < minSimilarity:
 prototypeIndices.append(i)
 # assignment
-indices = [i for i in range(similarityMatrix.shape[0]) if i not in prototypeIndices]
+indices = [i for i in range(similarities.shape[0]) if i not in prototypeIndices]
 assign = True
 while assign:
-labels = [-1]*similarityMatrix.shape[0]
+labels = [-1]*similarities.shape[0]
 for i in prototypeIndices:
 labels[i] = i
 for i in indices:
-prototypeIndex = similarityMatrix[i][prototypeIndices].argmax()
+if similarityFunc is not None:
-labels[i] = prototypeIndices[prototypeIndex]
+for j in prototypeIndices:
+if similarities[i][j] < 0:
+similarities[i][j] = similarityFunc(instances[i], instances[j])
+similarities[j][i] = similarities[i][j]
+prototypeIdx = similarities[i][prototypeIndices].argmax()
+if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity:
+labels[i] = prototypeIndices[prototypeIdx]
+else:
+labels[i] = -1 # outlier
 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get)
 assign = (clusterSizes[smallestClusterIndex] < minClusterSize)
 if assign:
 prototypeIndices.remove(smallestClusterIndex)
 indices.append(smallestClusterIndex)

Mercurial Hosting > traffic-intelligence

comparison python/ml.py @ 735:0e875a7f5759 dev