comparison python/ml.py @ 908:b297525b2cbf

added options to the prototype cluster algorithm, work in progress
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Mon, 26 Jun 2017 00:10:35 -0400
parents 9fd7b18f75b4
children 1cd878812529
comparison
equal deleted inserted replaced
907:9fd7b18f75b4 908:b297525b2cbf
129 # k-means 129 # k-means
130 features = whiten(features) 130 features = whiten(features)
131 centroids,distortion = kmeans(features,k, iter) 131 centroids,distortion = kmeans(features,k, iter)
132 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) 132 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster)
133 return code,sigma 133 return code,sigma
134
135 class Cluster:
136 'Represents a cluster, with a prototype id and the list of instances in cluster'
137 def __init__(prototypeId, memberIndices = []):
138 self.prototypeId = prototypeId
139 self.memberIndices = memberIndices
134 140
135 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = None): 141 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = None):
136 '''Assigns instances to prototypes 142 '''Assigns instances to prototypes
137 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters 143 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters
138 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' 144 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize'''
159 if assign: 165 if assign:
160 prototypeIndices.remove(smallestClusterIndex) 166 prototypeIndices.remove(smallestClusterIndex)
161 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] 167 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex]
162 return prototypeIndices, labels 168 return prototypeIndices, labels
163 169
164 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0, randomInitialization = False, assign = True, initialPrototypeIndices = None): 170 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0, optimizeCentroid = True, randomInitialization = False, assign = True, initialPrototypeIndices = None):
165 '''Finds exemplar (prototype) instance that represent each cluster 171 '''Finds exemplar (prototype) instance that represent each cluster
166 Returns the prototype indices (in the instances list) and the cluster label of each instance 172 Returns the prototype indices (in the instances list) and the cluster label of each instance
167 173
168 the elements in the instances list must have a length (method __len__), or one can use the random initialization 174 the elements in the instances list must have a length (method __len__), or one can use the random initialization
169 the positions in the instances list corresponds to the similarities 175 the positions in the instances list corresponds to the similarities
172 178
173 if an instance is different enough (<minSimilarity), 179 if an instance is different enough (<minSimilarity),
174 it will become a new prototype. 180 it will become a new prototype.
175 Non-prototype instances will be assigned to an existing prototype 181 Non-prototype instances will be assigned to an existing prototype
176 182
177 TODO: at each step, optimize the prototype as the most similar in its current cluster (can be done easily if similarities are already computed)''' 183 if optimizeCentroid is True, each time an element is added, we recompute the centroid trajectory as the most similar to all in the cluster
178 184
179 # sort instances based on length 185 TODO: check how similarity evolves in clusters'''
180 if len(instances) == 0: 186 if len(instances) == 0:
181 print('no instances to cluster (empty list)') 187 print('no instances to cluster (empty list)')
182 return None 188 return None
183 189 if similarityFunc is None:
190 print('similarityFunc is None')
191 return None
192
193 # sort instances based on length
184 indices = range(len(instances)) 194 indices = range(len(instances))
185 if randomInitialization: 195 if randomInitialization or optimizeCentroid:
186 indices = np.random.permutation(indices) 196 indices = np.random.permutation(indices)
187 else: 197 else:
188 def compare(i, j): 198 def compare(i, j):
189 if len(instances[i]) > len(instances[j]): 199 if len(instances[i]) > len(instances[j]):
190 return -1 200 return -1
192 return 0 202 return 0
193 else: 203 else:
194 return 1 204 return 1
195 indices.sort(compare) 205 indices.sort(compare)
196 # go through all instances 206 # go through all instances
207 clusters = []
197 if initialPrototypeIndices is None: 208 if initialPrototypeIndices is None:
198 prototypeIndices = [indices[0]] 209 prototypeIndices = [indices[0]]
199 else: 210 else:
200 prototypeIndices = initialPrototypeIndices 211 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances
212 for i in prototypeIndices:
213 clusters.append([i])
201 for i in indices[1:]: 214 for i in indices[1:]:
202 if similarityFunc is not None: 215 for j in prototypeIndices:
203 for j in prototypeIndices: 216 if similarities[i][j] < 0:
204 if similarities[i][j] < 0: 217 similarities[i][j] = similarityFunc(instances[i], instances[j])
205 similarities[i][j] = similarityFunc(instances[i], instances[j]) 218 similarities[j][i] = similarities[i][j]
206 similarities[j][i] = similarities[i][j] 219 label = similarities[i][prototypeIndices].argmax()
207 if similarities[i][prototypeIndices].max() < minSimilarity: 220 if similarities[i][prototypeIndices[label]] < minSimilarity:
208 prototypeIndices.append(i) 221 prototypeIndices.append(i)
209 elif randomInitialization: # replace prototype by current instance i if longer 222 clusters.append([])
210 label = similarities[i][prototypeIndices].argmax() 223 else:
211 if len(instances[prototypeIndices[label]]) < len(instances[i]): 224 clusters[label].append(i)
212 prototypeIndices[label] = i 225 if optimizeCentroid:
226 if len(clusters[label]) >= 2: # no point if only one element in cluster
227 for j in clusters[label][:-1]:
228 if similarities[i][j] < 0:
229 similarities[i][j] = similarityFunc(instances[i], instances[j])
230 similarities[j][i] = similarities[i][j]
231 clusterIndices = clusters[label]
232 clusterSimilarities = similarities[clusterIndices][:,clusterIndices]
233 newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()]
234 if prototypeIndices[label] != newCentroidIdx:
235 prototypeIndices[label] = newCentroidIdx
236 elif randomInitialization: # replace prototype by current instance i if longer
237 if len(instances[prototypeIndices[label]]) < len(instances[i]):
238 prototypeIndices[label] = i
239
213 if assign: 240 if assign:
214 return assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc, minClusterSize) 241 return assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc, minClusterSize)
215 else: 242 else:
216 return prototypeIndices, None 243 return prototypeIndices, None
217 244
218 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1): 245 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1):
219 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} 246 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
220 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex) 247 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex)
221 return clusterSizes 248 return clusterSizes
222 249
250 def computeClusterStatistics(labels, prototypeIndices, instances, similarities, similarityFunc, clusters = None, outlierIndex = -1):
251 if clusters is None:
252 clusters = {protoId:[] for protoId in prototypeIndices+[-1]}
253 for i,l in enumerate(labels):
254 clusters[l].append(i)
255 clusters = [clusters[protoId] for protoId in prototypeIndices]
256 for i, cluster in enumerate(clusters):
257 n = len(cluster)
258 print('cluster {}: {} elements'.format(prototypeIndices[i], n))
259 if n >=2:
260 for j,k in enumerate(cluster):
261 for l in cluster[:j]:
262 if similarities[k][l] < 0:
263 similarities[k][l] = similarityFunc(instances[k], instances[l])
264 similarities[l][k] = similarities[k][l]
265 print('Mean similarity to prototype: {}'.format((similarities[prototypeIndices[i]][cluster].sum()+1)/(n-1)))
266 print('Mean overall similarity: {}'.format((similarities[cluster][:,cluster].sum()+n)/(n*(n-1))))
267
223 # Gaussian Mixture Models 268 # Gaussian Mixture Models
224 def plotGMMClusters(model, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3): 269 def plotGMMClusters(model, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3):
225 '''plot the ellipse corresponding to the Gaussians 270 '''plot the ellipse corresponding to the Gaussians
226 and the predicted classes of the instances in the dataset''' 271 and the predicted classes of the instances in the dataset'''
227 if fig is None: 272 if fig is None: