Mercurial Hosting > traffic-intelligence
comparison python/ml.py @ 980:23f98ebb113f
first tests for clustering algo
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Mon, 19 Feb 2018 16:32:59 -0500 |
parents | 184f1dd307f9 |
children | e8eabef7857c |
comparison
equal
deleted
inserted
replaced
979:cc89267b5ff9 | 980:23f98ebb113f |
---|---|
154 self.prototypeId = prototypeId | 154 self.prototypeId = prototypeId |
155 self.memberIndices = memberIndices | 155 self.memberIndices = memberIndices |
156 | 156 |
157 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0): | 157 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0): |
158 '''Assigns instances to prototypes | 158 '''Assigns instances to prototypes |
159 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters | 159 if minClusterSize is not 0, the clusters will be refined by removing iteratively the smallest clusters |
160 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' | 160 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize |
161 | |
162 labels are indices in the prototypeIndices''' | |
163 if similarityFunc is None: | |
164 print('similarityFunc is None') | |
165 return None | |
166 | |
161 indices = [i for i in range(len(instances)) if i not in prototypeIndices] | 167 indices = [i for i in range(len(instances)) if i not in prototypeIndices] |
162 labels = [-1]*len(instances) | 168 labels = [-1]*len(instances) |
163 assign = True | 169 assign = True |
164 while assign: | 170 while assign: |
165 for i in prototypeIndices: | 171 for i in prototypeIndices: |
166 labels[i] = i | 172 labels[i] = i |
167 for i in indices: | 173 for i in indices: |
168 if similarityFunc is not None: | 174 for j in prototypeIndices: |
169 for j in prototypeIndices: | 175 if similarities[i][j] < 0: |
170 if similarities[i][j] < 0: | 176 similarities[i][j] = similarityFunc(instances[i], instances[j]) |
171 similarities[i][j] = similarityFunc(instances[i], instances[j]) | 177 similarities[j][i] = similarities[i][j] |
172 similarities[j][i] = similarities[i][j] | 178 label = similarities[i][prototypeIndices].argmax() |
173 prototypeIdx = similarities[i][prototypeIndices].argmax() | 179 if similarities[i][prototypeIndices[label]] >= minSimilarity: |
174 if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity: | 180 labels[i] = prototypeIndices[label] |
175 labels[i] = prototypeIndices[prototypeIdx] | |
176 else: | 181 else: |
177 labels[i] = -1 # outlier | 182 labels[i] = -1 # outlier |
178 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | 183 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} |
179 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) | 184 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) |
180 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) | 185 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) |
181 if assign: | 186 if assign: |
182 prototypeIndices.remove(smallestClusterIndex) | 187 prototypeIndices.remove(smallestClusterIndex) |
183 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] | 188 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] |
184 return prototypeIndices, labels | 189 return prototypeIndices, labels |
185 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = True, randomInitialization = False, initialPrototypeIndices = None): | 190 |
191 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = False, randomInitialization = False, initialPrototypeIndices = None): | |
186 '''Finds exemplar (prototype) instance that represent each cluster | 192 '''Finds exemplar (prototype) instance that represent each cluster |
187 Returns the prototype indices (in the instances list) | 193 Returns the prototype indices (in the instances list) |
188 | 194 |
189 the elements in the instances list must have a length (method __len__), or one can use the random initialization | 195 the elements in the instances list must have a length (method __len__), or one can use the optimizeCentroid |
190 the positions in the instances list corresponds to the similarities | 196 the positions in the instances list corresponds to the similarities |
191 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) | 197 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) |
192 similarities must still be allocated with the right size | 198 similarities must still be allocated with the right size |
193 | 199 |
194 if an instance is different enough (<minSimilarity), | 200 if an instance is different enough (<minSimilarity), |
208 return None | 214 return None |
209 | 215 |
210 # sort instances based on length | 216 # sort instances based on length |
211 indices = range(len(instances)) | 217 indices = range(len(instances)) |
212 if randomInitialization or optimizeCentroid: | 218 if randomInitialization or optimizeCentroid: |
213 indices = np.random.permutation(indices) | 219 indices = np.random.permutation(indices).tolist() |
214 else: | 220 else: |
215 def compare(i, j): | 221 def compare(i, j): |
216 if len(instances[i]) > len(instances[j]): | 222 if len(instances[i]) > len(instances[j]): |
217 return -1 | 223 return -1 |
218 elif len(instances[i]) == len(instances[j]): | 224 elif len(instances[i]) == len(instances[j]): |
219 return 0 | 225 return 0 |
220 else: | 226 else: |
221 return 1 | 227 return 1 |
222 indices.sort(compare) | 228 indices.sort(compare) |
223 # go through all instances | 229 # initialize clusters |
224 clusters = [] | 230 clusters = [] |
225 if initialPrototypeIndices is None: | 231 if initialPrototypeIndices is None: |
226 prototypeIndices = [indices[0]] | 232 prototypeIndices = [indices[0]] |
227 else: | 233 else: |
228 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances | 234 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances |
229 for i in prototypeIndices: | 235 for i in prototypeIndices: |
230 clusters.append([i]) | 236 clusters.append([i]) |
231 indices.remove(i) | 237 indices.remove(i) |
238 # go through all instances | |
232 for i in indices: | 239 for i in indices: |
233 for j in prototypeIndices: | 240 for j in prototypeIndices: |
234 if similarities[i][j] < 0: | 241 if similarities[i][j] < 0: |
235 similarities[i][j] = similarityFunc(instances[i], instances[j]) | 242 similarities[i][j] = similarityFunc(instances[i], instances[j]) |
236 similarities[j][i] = similarities[i][j] | 243 similarities[j][i] = similarities[i][j] |
237 label = similarities[i][prototypeIndices].argmax() | 244 label = similarities[i][prototypeIndices].argmax() # index in prototypeIndices |
238 if similarities[i][prototypeIndices[label]] < minSimilarity: | 245 if similarities[i][prototypeIndices[label]] < minSimilarity: |
239 prototypeIndices.append(i) | 246 prototypeIndices.append(i) |
240 clusters.append([]) | 247 clusters.append([]) |
241 else: | 248 else: |
242 clusters[label].append(i) | 249 clusters[label].append(i) |
311 maxima = model.means_.max(0) | 318 maxima = model.means_.max(0) |
312 xwidth = 0.5*(maxima[0]-minima[0]) | 319 xwidth = 0.5*(maxima[0]-minima[0]) |
313 ywidth = 0.5*(maxima[1]-minima[1]) | 320 ywidth = 0.5*(maxima[1]-minima[1]) |
314 plt.xlim(minima[0]-xwidth,maxima[0]+xwidth) | 321 plt.xlim(minima[0]-xwidth,maxima[0]+xwidth) |
315 plt.ylim(minima[1]-ywidth,maxima[1]+ywidth) | 322 plt.ylim(minima[1]-ywidth,maxima[1]+ywidth) |
323 | |
324 if __name__ == "__main__": | |
325 import doctest | |
326 import unittest | |
327 suite = doctest.DocFileSuite('tests/ml.txt') | |
328 unittest.TextTestRunner().run(suite) | |
329 # #doctest.testmod() | |
330 # #doctest.testfile("example.txt") |