Mercurial Hosting > traffic-intelligence
diff python/ml.py @ 980:23f98ebb113f
first tests for clustering algo
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Mon, 19 Feb 2018 16:32:59 -0500 |
parents | 184f1dd307f9 |
children | e8eabef7857c |
line wrap: on
line diff
--- a/python/ml.py Mon Feb 19 10:47:19 2018 -0500 +++ b/python/ml.py Mon Feb 19 16:32:59 2018 -0500 @@ -156,8 +156,14 @@ def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0): '''Assigns instances to prototypes - if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters - and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' + if minClusterSize is not 0, the clusters will be refined by removing iteratively the smallest clusters + and reassigning all elements in the cluster until no cluster is smaller than minClusterSize + + labels are indices in the prototypeIndices''' + if similarityFunc is None: + print('similarityFunc is None') + return None + indices = [i for i in range(len(instances)) if i not in prototypeIndices] labels = [-1]*len(instances) assign = True @@ -165,14 +171,13 @@ for i in prototypeIndices: labels[i] = i for i in indices: - if similarityFunc is not None: - for j in prototypeIndices: - if similarities[i][j] < 0: - similarities[i][j] = similarityFunc(instances[i], instances[j]) - similarities[j][i] = similarities[i][j] - prototypeIdx = similarities[i][prototypeIndices].argmax() - if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity: - labels[i] = prototypeIndices[prototypeIdx] + for j in prototypeIndices: + if similarities[i][j] < 0: + similarities[i][j] = similarityFunc(instances[i], instances[j]) + similarities[j][i] = similarities[i][j] + label = similarities[i][prototypeIndices].argmax() + if similarities[i][prototypeIndices[label]] >= minSimilarity: + labels[i] = prototypeIndices[label] else: labels[i] = -1 # outlier clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} @@ -182,11 +187,12 @@ prototypeIndices.remove(smallestClusterIndex) indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] return prototypeIndices, labels -def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = True, randomInitialization = False, initialPrototypeIndices = None): + +def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = False, randomInitialization = False, initialPrototypeIndices = None): '''Finds exemplar (prototype) instance that represent each cluster Returns the prototype indices (in the instances list) - the elements in the instances list must have a length (method __len__), or one can use the random initialization + the elements in the instances list must have a length (method __len__), or one can use the optimizeCentroid the positions in the instances list corresponds to the similarities if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) similarities must still be allocated with the right size @@ -210,7 +216,7 @@ # sort instances based on length indices = range(len(instances)) if randomInitialization or optimizeCentroid: - indices = np.random.permutation(indices) + indices = np.random.permutation(indices).tolist() else: def compare(i, j): if len(instances[i]) > len(instances[j]): @@ -220,7 +226,7 @@ else: return 1 indices.sort(compare) - # go through all instances + # initialize clusters clusters = [] if initialPrototypeIndices is None: prototypeIndices = [indices[0]] @@ -229,12 +235,13 @@ for i in prototypeIndices: clusters.append([i]) indices.remove(i) + # go through all instances for i in indices: for j in prototypeIndices: if similarities[i][j] < 0: similarities[i][j] = similarityFunc(instances[i], instances[j]) similarities[j][i] = similarities[i][j] - label = similarities[i][prototypeIndices].argmax() + label = similarities[i][prototypeIndices].argmax() # index in prototypeIndices if similarities[i][prototypeIndices[label]] < minSimilarity: prototypeIndices.append(i) clusters.append([]) @@ -313,3 +320,11 @@ ywidth = 0.5*(maxima[1]-minima[1]) plt.xlim(minima[0]-xwidth,maxima[0]+xwidth) plt.ylim(minima[1]-ywidth,maxima[1]+ywidth) + +if __name__ == "__main__": + import doctest + import unittest + suite = doctest.DocFileSuite('tests/ml.txt') + unittest.TextTestRunner().run(suite) +# #doctest.testmod() +# #doctest.testfile("example.txt")