Mercurial Hosting > traffic-intelligence
comparison python/ml.py @ 908:b297525b2cbf
added options to the prototype cluster algorithm, work in progress
author | Nicolas Saunier <nicolas.saunier@polymtl.ca> |
---|---|
date | Mon, 26 Jun 2017 00:10:35 -0400 |
parents | 9fd7b18f75b4 |
children | 1cd878812529 |
comparison
equal
deleted
inserted
replaced
907:9fd7b18f75b4 | 908:b297525b2cbf |
---|---|
129 # k-means | 129 # k-means |
130 features = whiten(features) | 130 features = whiten(features) |
131 centroids,distortion = kmeans(features,k, iter) | 131 centroids,distortion = kmeans(features,k, iter) |
132 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) | 132 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) |
133 return code,sigma | 133 return code,sigma |
134 | |
135 class Cluster: | |
136 'Represents a cluster, with a prototype id and the list of instances in cluster' | |
137 def __init__(prototypeId, memberIndices = []): | |
138 self.prototypeId = prototypeId | |
139 self.memberIndices = memberIndices | |
134 | 140 |
135 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = None): | 141 def assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc = None, minClusterSize = None): |
136 '''Assigns instances to prototypes | 142 '''Assigns instances to prototypes |
137 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters | 143 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters |
138 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' | 144 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' |
159 if assign: | 165 if assign: |
160 prototypeIndices.remove(smallestClusterIndex) | 166 prototypeIndices.remove(smallestClusterIndex) |
161 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] | 167 indices = [i for i in range(similarities.shape[0]) if labels[i] == smallestClusterIndex] |
162 return prototypeIndices, labels | 168 return prototypeIndices, labels |
163 | 169 |
164 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0, randomInitialization = False, assign = True, initialPrototypeIndices = None): | 170 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = 0, optimizeCentroid = True, randomInitialization = False, assign = True, initialPrototypeIndices = None): |
165 '''Finds exemplar (prototype) instance that represent each cluster | 171 '''Finds exemplar (prototype) instance that represent each cluster |
166 Returns the prototype indices (in the instances list) and the cluster label of each instance | 172 Returns the prototype indices (in the instances list) and the cluster label of each instance |
167 | 173 |
168 the elements in the instances list must have a length (method __len__), or one can use the random initialization | 174 the elements in the instances list must have a length (method __len__), or one can use the random initialization |
169 the positions in the instances list corresponds to the similarities | 175 the positions in the instances list corresponds to the similarities |
172 | 178 |
173 if an instance is different enough (<minSimilarity), | 179 if an instance is different enough (<minSimilarity), |
174 it will become a new prototype. | 180 it will become a new prototype. |
175 Non-prototype instances will be assigned to an existing prototype | 181 Non-prototype instances will be assigned to an existing prototype |
176 | 182 |
177 TODO: at each step, optimize the prototype as the most similar in its current cluster (can be done easily if similarities are already computed)''' | 183 if optimizeCentroid is True, each time an element is added, we recompute the centroid trajectory as the most similar to all in the cluster |
178 | 184 |
179 # sort instances based on length | 185 TODO: check how similarity evolves in clusters''' |
180 if len(instances) == 0: | 186 if len(instances) == 0: |
181 print('no instances to cluster (empty list)') | 187 print('no instances to cluster (empty list)') |
182 return None | 188 return None |
183 | 189 if similarityFunc is None: |
190 print('similarityFunc is None') | |
191 return None | |
192 | |
193 # sort instances based on length | |
184 indices = range(len(instances)) | 194 indices = range(len(instances)) |
185 if randomInitialization: | 195 if randomInitialization or optimizeCentroid: |
186 indices = np.random.permutation(indices) | 196 indices = np.random.permutation(indices) |
187 else: | 197 else: |
188 def compare(i, j): | 198 def compare(i, j): |
189 if len(instances[i]) > len(instances[j]): | 199 if len(instances[i]) > len(instances[j]): |
190 return -1 | 200 return -1 |
192 return 0 | 202 return 0 |
193 else: | 203 else: |
194 return 1 | 204 return 1 |
195 indices.sort(compare) | 205 indices.sort(compare) |
196 # go through all instances | 206 # go through all instances |
207 clusters = [] | |
197 if initialPrototypeIndices is None: | 208 if initialPrototypeIndices is None: |
198 prototypeIndices = [indices[0]] | 209 prototypeIndices = [indices[0]] |
199 else: | 210 else: |
200 prototypeIndices = initialPrototypeIndices | 211 prototypeIndices = initialPrototypeIndices # think of the format: if indices, have to be in instances |
212 for i in prototypeIndices: | |
213 clusters.append([i]) | |
201 for i in indices[1:]: | 214 for i in indices[1:]: |
202 if similarityFunc is not None: | 215 for j in prototypeIndices: |
203 for j in prototypeIndices: | 216 if similarities[i][j] < 0: |
204 if similarities[i][j] < 0: | 217 similarities[i][j] = similarityFunc(instances[i], instances[j]) |
205 similarities[i][j] = similarityFunc(instances[i], instances[j]) | 218 similarities[j][i] = similarities[i][j] |
206 similarities[j][i] = similarities[i][j] | 219 label = similarities[i][prototypeIndices].argmax() |
207 if similarities[i][prototypeIndices].max() < minSimilarity: | 220 if similarities[i][prototypeIndices[label]] < minSimilarity: |
208 prototypeIndices.append(i) | 221 prototypeIndices.append(i) |
209 elif randomInitialization: # replace prototype by current instance i if longer | 222 clusters.append([]) |
210 label = similarities[i][prototypeIndices].argmax() | 223 else: |
211 if len(instances[prototypeIndices[label]]) < len(instances[i]): | 224 clusters[label].append(i) |
212 prototypeIndices[label] = i | 225 if optimizeCentroid: |
226 if len(clusters[label]) >= 2: # no point if only one element in cluster | |
227 for j in clusters[label][:-1]: | |
228 if similarities[i][j] < 0: | |
229 similarities[i][j] = similarityFunc(instances[i], instances[j]) | |
230 similarities[j][i] = similarities[i][j] | |
231 clusterIndices = clusters[label] | |
232 clusterSimilarities = similarities[clusterIndices][:,clusterIndices] | |
233 newCentroidIdx = clusterIndices[clusterSimilarities.sum(0).argmax()] | |
234 if prototypeIndices[label] != newCentroidIdx: | |
235 prototypeIndices[label] = newCentroidIdx | |
236 elif randomInitialization: # replace prototype by current instance i if longer | |
237 if len(instances[prototypeIndices[label]]) < len(instances[i]): | |
238 prototypeIndices[label] = i | |
239 | |
213 if assign: | 240 if assign: |
214 return assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc, minClusterSize) | 241 return assignToPrototypeClusters(instances, prototypeIndices, similarities, minSimilarity, similarityFunc, minClusterSize) |
215 else: | 242 else: |
216 return prototypeIndices, None | 243 return prototypeIndices, None |
217 | 244 |
218 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1): | 245 def computeClusterSizes(labels, prototypeIndices, outlierIndex = -1): |
219 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} | 246 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} |
220 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex) | 247 clusterSizes['outlier'] = sum(np.array(labels) == outlierIndex) |
221 return clusterSizes | 248 return clusterSizes |
222 | 249 |
250 def computeClusterStatistics(labels, prototypeIndices, instances, similarities, similarityFunc, clusters = None, outlierIndex = -1): | |
251 if clusters is None: | |
252 clusters = {protoId:[] for protoId in prototypeIndices+[-1]} | |
253 for i,l in enumerate(labels): | |
254 clusters[l].append(i) | |
255 clusters = [clusters[protoId] for protoId in prototypeIndices] | |
256 for i, cluster in enumerate(clusters): | |
257 n = len(cluster) | |
258 print('cluster {}: {} elements'.format(prototypeIndices[i], n)) | |
259 if n >=2: | |
260 for j,k in enumerate(cluster): | |
261 for l in cluster[:j]: | |
262 if similarities[k][l] < 0: | |
263 similarities[k][l] = similarityFunc(instances[k], instances[l]) | |
264 similarities[l][k] = similarities[k][l] | |
265 print('Mean similarity to prototype: {}'.format((similarities[prototypeIndices[i]][cluster].sum()+1)/(n-1))) | |
266 print('Mean overall similarity: {}'.format((similarities[cluster][:,cluster].sum()+n)/(n*(n-1)))) | |
267 | |
223 # Gaussian Mixture Models | 268 # Gaussian Mixture Models |
224 def plotGMMClusters(model, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3): | 269 def plotGMMClusters(model, dataset = None, fig = None, colors = utils.colors, nUnitsPerPixel = 1., alpha = 0.3): |
225 '''plot the ellipse corresponding to the Gaussians | 270 '''plot the ellipse corresponding to the Gaussians |
226 and the predicted classes of the instances in the dataset''' | 271 and the predicted classes of the instances in the dataset''' |
227 if fig is None: | 272 if fig is None: |