Mercurial Hosting > traffic-intelligence
comparison python/ml.py @ 735:0e875a7f5759 dev
modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Wed, 12 Aug 2015 00:24:06 -0400 |
parents | 1d4dcb5c8708 |
children | 2472b4d59aea |
comparison
equal
deleted
inserted
replaced
734:1d4dcb5c8708 | 735:0e875a7f5759 |
---|---|
109 features = whiten(features) | 109 features = whiten(features) |
110 centroids,distortion = kmeans(features,k, iter) | 110 centroids,distortion = kmeans(features,k, iter) |
111 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) | 111 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) |
112 return code,sigma | 112 return code,sigma |
113 | 113 |
114 def prototypeCluster(instances, similarityMatrix, minSimilarity, minClusterSize = None, randomInitialization = False): | 114 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = None, randomInitialization = False): |
115 '''Finds exemplar (prototype) instance that represent each cluster | 115 '''Finds exemplar (prototype) instance that represent each cluster |
116 Returns the prototype indices (in the instances list) and the cluster label of each instance | 116 Returns the prototype indices (in the instances list) and the cluster label of each instance |
117 | 117 |
118 the elements in the instances list must have a length (method __len__), or one can use the random initialization | 118 the elements in the instances list must have a length (method __len__), or one can use the random initialization |
119 the positions in the instances list corresponds to the similarityMatrix | 119 the positions in the instances list corresponds to the similarities |
120 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) | |
121 similarities must still be allocated with the right size | |
120 | 122 |
121 if an instance is different enough (<minSimilarity), | 123 if an instance is different enough (<minSimilarity), |
122 it will become a new prototype. | 124 it will become a new prototype. |
123 Non-prototype instances will be assigned to an existing prototype | 125 Non-prototype instances will be assigned to an existing prototype |
124 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters | 126 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters |
125 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' | 127 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' |
126 | 128 |
127 # sort instances based on length | 129 # sort instances based on length |
128 indices = range(similarityMatrix.shape[0]) | 130 indices = range(len(instances)) |
129 if randomInitialization: | 131 if randomInitialization: |
130 indices = np.random.permutation(indices) | 132 indices = np.random.permutation(indices) |
131 else: | 133 else: |
132 def compare(i, j): | 134 def compare(i, j): |
133 if len(instances[i]) > len(instances[j]): | 135 if len(instances[i]) > len(instances[j]): |
138 return 1 | 140 return 1 |
139 indices.sort(compare) | 141 indices.sort(compare) |
140 # go through all instances | 142 # go through all instances |
141 prototypeIndices = [indices[0]] | 143 prototypeIndices = [indices[0]] |
142 for i in indices[1:]: | 144 for i in indices[1:]: |
143 if similarityMatrix[i][prototypeIndices].max() < minSimilarity: | 145 if similarityFunc is not None: |
146 for j in prototypeIndices: | |
147 if similarities[i][j] < 0: | |
148 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
149 similarities[j][i] = similarities[i][j] | |
150 if similarities[i][prototypeIndices].max() < minSimilarity: | |
144 prototypeIndices.append(i) | 151 prototypeIndices.append(i) |
145 | 152 |
146 # assignment | 153 # assignment |
147 indices = [i for i in range(similarityMatrix.shape[0]) if i not in prototypeIndices] | 154 indices = [i for i in range(similarities.shape[0]) if i not in prototypeIndices] |
148 assign = True | 155 assign = True |
149 while assign: | 156 while assign: |
150 labels = [-1]*similarityMatrix.shape[0] | 157 labels = [-1]*similarities.shape[0] |
151 for i in prototypeIndices: | 158 for i in prototypeIndices: |
152 labels[i] = i | 159 labels[i] = i |
153 for i in indices: | 160 for i in indices: |
154 prototypeIndex = similarityMatrix[i][prototypeIndices].argmax() | 161 if similarityFunc is not None: |
155 labels[i] = prototypeIndices[prototypeIndex] | 162 for j in prototypeIndices: |
163 if similarities[i][j] < 0: | |
164 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
165 similarities[j][i] = similarities[i][j] | |
166 prototypeIdx = similarities[i][prototypeIndices].argmax() | |
167 if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity: | |
168 labels[i] = prototypeIndices[prototypeIdx] | |
169 else: | |
170 labels[i] = -1 # outlier | |
156 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | 171 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} |
157 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) | 172 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) |
158 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) | 173 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) |
159 if assign: | 174 if assign: |
160 prototypeIndices.remove(smallestClusterIndex) | 175 prototypeIndices.remove(smallestClusterIndex) |
161 indices.append(smallestClusterIndex) | 176 indices.append(smallestClusterIndex) |
162 | 177 |