comparison python/ml.py @ 735:0e875a7f5759 dev

modified prototypeCluster algorithm to enforce similarity when re-assigning and to compute only the necessary similarities
author Nicolas Saunier <nicolas.saunier@polymtl.ca>
date Wed, 12 Aug 2015 00:24:06 -0400
parents 1d4dcb5c8708
children 2472b4d59aea
comparison
equal deleted inserted replaced
734:1d4dcb5c8708 735:0e875a7f5759
109 features = whiten(features) 109 features = whiten(features)
110 centroids,distortion = kmeans(features,k, iter) 110 centroids,distortion = kmeans(features,k, iter)
111 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster) 111 code,distance = vq(features,centroids) # code starting from 0 (represent first cluster) to k-1 (last cluster)
112 return code,sigma 112 return code,sigma
113 113
114 def prototypeCluster(instances, similarityMatrix, minSimilarity, minClusterSize = None, randomInitialization = False): 114 def prototypeCluster(instances, similarities, minSimilarity, similarityFunc = None, minClusterSize = None, randomInitialization = False):
115 '''Finds exemplar (prototype) instance that represent each cluster 115 '''Finds exemplar (prototype) instance that represent each cluster
116 Returns the prototype indices (in the instances list) and the cluster label of each instance 116 Returns the prototype indices (in the instances list) and the cluster label of each instance
117 117
118 the elements in the instances list must have a length (method __len__), or one can use the random initialization 118 the elements in the instances list must have a length (method __len__), or one can use the random initialization
119 the positions in the instances list corresponds to the similarityMatrix 119 the positions in the instances list corresponds to the similarities
120 if similarityFunc is provided, the similarities are calculated as needed (this is faster) if not in similarities (negative if not computed)
121 similarities must still be allocated with the right size
120 122
121 if an instance is different enough (<minSimilarity), 123 if an instance is different enough (<minSimilarity),
122 it will become a new prototype. 124 it will become a new prototype.
123 Non-prototype instances will be assigned to an existing prototype 125 Non-prototype instances will be assigned to an existing prototype
124 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters 126 if minClusterSize is not None, the clusters will be refined by removing iteratively the smallest clusters
125 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize''' 127 and reassigning all elements in the cluster until no cluster is smaller than minClusterSize'''
126 128
127 # sort instances based on length 129 # sort instances based on length
128 indices = range(similarityMatrix.shape[0]) 130 indices = range(len(instances))
129 if randomInitialization: 131 if randomInitialization:
130 indices = np.random.permutation(indices) 132 indices = np.random.permutation(indices)
131 else: 133 else:
132 def compare(i, j): 134 def compare(i, j):
133 if len(instances[i]) > len(instances[j]): 135 if len(instances[i]) > len(instances[j]):
138 return 1 140 return 1
139 indices.sort(compare) 141 indices.sort(compare)
140 # go through all instances 142 # go through all instances
141 prototypeIndices = [indices[0]] 143 prototypeIndices = [indices[0]]
142 for i in indices[1:]: 144 for i in indices[1:]:
143 if similarityMatrix[i][prototypeIndices].max() < minSimilarity: 145 if similarityFunc is not None:
146 for j in prototypeIndices:
147 if similarities[i][j] < 0:
148 similarities[i][j] = similarityFunc(instances[i], instances[j])
149 similarities[j][i] = similarities[i][j]
150 if similarities[i][prototypeIndices].max() < minSimilarity:
144 prototypeIndices.append(i) 151 prototypeIndices.append(i)
145 152
146 # assignment 153 # assignment
147 indices = [i for i in range(similarityMatrix.shape[0]) if i not in prototypeIndices] 154 indices = [i for i in range(similarities.shape[0]) if i not in prototypeIndices]
148 assign = True 155 assign = True
149 while assign: 156 while assign:
150 labels = [-1]*similarityMatrix.shape[0] 157 labels = [-1]*similarities.shape[0]
151 for i in prototypeIndices: 158 for i in prototypeIndices:
152 labels[i] = i 159 labels[i] = i
153 for i in indices: 160 for i in indices:
154 prototypeIndex = similarityMatrix[i][prototypeIndices].argmax() 161 if similarityFunc is not None:
155 labels[i] = prototypeIndices[prototypeIndex] 162 for j in prototypeIndices:
163 if similarities[i][j] < 0:
164 similarities[i][j] = similarityFunc(instances[i], instances[j])
165 similarities[j][i] = similarities[i][j]
166 prototypeIdx = similarities[i][prototypeIndices].argmax()
167 if similarities[i][prototypeIndices[prototypeIdx]] >= minSimilarity:
168 labels[i] = prototypeIndices[prototypeIdx]
169 else:
170 labels[i] = -1 # outlier
156 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices} 171 clusterSizes = {i: sum(np.array(labels) == i) for i in prototypeIndices}
157 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get) 172 smallestClusterIndex = min(clusterSizes, key = clusterSizes.get)
158 assign = (clusterSizes[smallestClusterIndex] < minClusterSize) 173 assign = (clusterSizes[smallestClusterIndex] < minClusterSize)
159 if assign: 174 if assign:
160 prototypeIndices.remove(smallestClusterIndex) 175 prototypeIndices.remove(smallestClusterIndex)
161 indices.append(smallestClusterIndex) 176 indices.append(smallestClusterIndex)
162 177