comparison python/ml.py @ 980:23f98ebb113f

first tests for clustering algo
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Mon, 19 Feb 2018 16:32:59 -0500
parents 184f1dd307f9
children e8eabef7857c
comparison
equal deleted inserted replaced
979:cc89267b5ff9 980:23f98ebb113f
154 self.prototypeId = prototypeId 154 self.prototypeId = prototypeId
155 self.memberIndices = memberIndices 155 self.memberIndices = memberIndices
156 156
157 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0): 157 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0):
158 '''Assigns instances to prototypes 158 '''Assigns instances to prototypes
159 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters 159 if minClusterSize is not 0, the clusters will be refined by removing iteratively the smallest clusters
160 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' 160 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize
161
162 labels are indices in the prototypeIndices'''
163 if similarityFunc is None:
164 print('similarityFunc is None')
165 return None
166
161 indices = [i for i in range(len(instances)) if i not in prototypeIndices] 167 indices = [i for i in range(len(instances)) if i not in prototypeIndices]
162 labels = [-1]*len(instances) 168 labels = [-1]*len(instances)
163 assign = True 169 assign = True
164 while assign: 170 while assign:
165 for i in prototypeIndices: 171 for i in prototypeIndices:
166 labels[i] = i 172 labels[i] = i
167 for i in indices: 173 for i in indices:
168 if similarityFunc is not None: 174 for j in prototypeIndices:
169 for j in prototypeIndices: 175 if similarities[i][j] < 0:
170 if similarities[i][j] < 0: 176 similarities[i][j] = similarityFunc(instances[i], instances[j])
171 similarities[i][j] = similarityFunc(instances[i], instances[j]) 177 similarities[j][i] = similarities[i][j]
172 similarities[j][i] = similarities[i][j] 178 label = similarities[i][prototypeIndices].argmax()
173 prototypeIdx = similarities[i][prototypeIndices].argmax() 179 if similarities[i][prototypeIndices[label]] >= minSimilarity:
174 if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity: 180 labels[i] = prototypeIndices[label]
175 labels[i] = prototypeIndices[prototypeIdx]
176 else: 181 else:
177 labels[i] = -1 # outlier 182 labels[i] = -1 # outlier
178 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} 183 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
179 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) 184 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get)
180 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) 185 assign = (clusterSizes[smallestClusterIndex] < minClusterSize)
181 if assign: 186 if assign:
182 prototypeIndices.remove(smallestClusterIndex) 187 prototypeIndices.remove(smallestClusterIndex)
183 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] 188 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex]
184 return prototypeIndices, labels 189 return prototypeIndices, labels
185 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = True, randomInitialization = False, initialPrototypeIndices = None): 190
191 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, optimizeCentroid = False, randomInitialization = False, initialPrototypeIndices = None):
186 '''Finds exemplar (prototype) instance that represent each cluster 192 '''Finds exemplar (prototype) instance that represent each cluster
187 Returns the prototype indices (in the instances list) 193 Returns the prototype indices (in the instances list)
188 194
189 the elements in the instances list must have a length (method __len__), or one can use the random initialization 195 the elements in the instances list must have a length (method __len__), or one can use the optimizeCentroid
190 the positions in the instances list corresponds to the similarities 196 the positions in the instances list corresponds to the similarities
191 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed) 197 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed)
192 similarities must still be allocated with the right size 198 similarities must still be allocated with the right size
193 199
194 if an instance is different enough (<minSimilarity), 200 if an instance is different enough (<minSimilarity),
208 return None 214 return None
209 215
210 # sort instances based on length 216 # sort instances based on length
211 indices = range(len(instances)) 217 indices = range(len(instances))
212 if randomInitialization or optimizeCentroid: 218 if randomInitialization or optimizeCentroid:
213 indices = np.random.permutation(indices) 219 indices = np.random.permutation(indices).tolist()
214 else: 220 else:
215 def compare(i, j): 221 def compare(i, j):
216 if len(instances[i]) > len(instances[j]): 222 if len(instances[i]) > len(instances[j]):
217 return -1 223 return -1
218 elif len(instances[i]) == len(instances[j]): 224 elif len(instances[i]) == len(instances[j]):
219 return 0 225 return 0
220 else: 226 else:
221 return 1 227 return 1
222 indices.sort(compare) 228 indices.sort(compare)
223 # go through all instances 229 # initialize clusters
224 clusters = [] 230 clusters = []
225 if initialPrototypeIndices is None: 231 if initialPrototypeIndices is None:
226 prototypeIndices = [indices[0]] 232 prototypeIndices = [indices[0]]
227 else: 233 else:
228 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances 234 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances
229 for i in prototypeIndices: 235 for i in prototypeIndices:
230 clusters.append([i]) 236 clusters.append([i])
231 indices.remove(i) 237 indices.remove(i)
238 # go through all instances
232 for i in indices: 239 for i in indices:
233 for j in prototypeIndices: 240 for j in prototypeIndices:
234 if similarities[i][j] < 0: 241 if similarities[i][j] < 0:
235 similarities[i][j] = similarityFunc(instances[i], instances[j]) 242 similarities[i][j] = similarityFunc(instances[i], instances[j])
236 similarities[j][i] = similarities[i][j] 243 similarities[j][i] = similarities[i][j]
237 label = similarities[i][prototypeIndices].argmax() 244 label = similarities[i][prototypeIndices].argmax() # index in prototypeIndices
238 if similarities[i][prototypeIndices[label]] < minSimilarity: 245 if similarities[i][prototypeIndices[label]] < minSimilarity:
239 prototypeIndices.append(i) 246 prototypeIndices.append(i)
240 clusters.append([]) 247 clusters.append([])
241 else: 248 else:
242 clusters[label].append(i) 249 clusters[label].append(i)
311 maxima = model.means_.max(0) 318 maxima = model.means_.max(0)
312 xwidth = 0.5*(maxima[0]-minima[0]) 319 xwidth = 0.5*(maxima[0]-minima[0])
313 ywidth = 0.5*(maxima[1]-minima[1]) 320 ywidth = 0.5*(maxima[1]-minima[1])
314 plt.xlim(minima[0]-xwidth,maxima[0]+xwidth) 321 plt.xlim(minima[0]-xwidth,maxima[0]+xwidth)
315 plt.ylim(minima[1]-ywidth,maxima[1]+ywidth) 322 plt.ylim(minima[1]-ywidth,maxima[1]+ywidth)
323
324 if __name__ == "__main__":
325 import doctest
326 import unittest
327 suite = doctest.DocFileSuite('tests/ml.txt')
328 unittest.TextTestRunner().run(suite)
329 # #doctest.testmod()
330 # #doctest.testfile("example.txt")