# -*- coding: utf-8 -*-
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE', which is part of this source code package.
#
#
import numpy as np
import time
from BasicTools.Helpers.ProgressBar import printProgressBar
[docs]def GetLabelsVectorFromClusters(clusters):
'''
Returns a labels vector "labels".
Parameters
----------
clusters: dict
clusters[k] is an array containing the indices of points belonging to cluster k.
Returns
-------
labels: 1D array of integers
labels[j] = k if example j belongs to cluster k.
'''
nClusters = len(clusters)
nExamples = 0
for i in range(nClusters):
nExamples += len(clusters[i])
labels = np.zeros(nExamples)
for i in range(nClusters):
nMembers = len(clusters[i])
for j in range(nMembers):
labels[clusters[i][j]] = i
return labels.astype(int)
[docs]def GetClustersFromLabelsVector(labels):
'''
Returns a dictionary containing clustering results.
Parameters
----------
labels: 1D array of integers
labels[j] = k if example j belongs to cluster k.
Returns
-------
clusters: dict
clusters[k] is an array containing the indices of points belonging to cluster k.
'''
clusters = {}
nClusters = labels.max()+1
for k in range(nClusters):
clusters[k] = np.nonzero(labels==k)[0]
return clusters
[docs]def GetAdjacentClustersFromLabelsVector(labels, localNbSnapshots = None):
'''
Returns a dictionary with keys the cluster number and values the numbers of cluster adjacent
from the data used in the clustering (through the labels).
Parameters
----------
labels: 1D array of integers
labels[j] = k if example j belongs to cluster k.
localNbSnapshots: 1D array or list of integers
localNbSnapshots[j] = is the size of j-th group of values for which adjence is well-defined.
Returns
-------
adjacentClusters: dict
adjacentClusters[k] is an array containing the indices of the clusters adjacent to cluster k.
snapshotsOfAdjacentClusters: dict
snapshotsOfAdjacentClusters[k] is an array containing the indices of points belonging to cluster k and its adjacent clusters.
'''
if localNbSnapshots is not None:
assert np.sum(np.array(localNbSnapshots)) == len(labels), "localNbSnapshots not consistent with labels"
else:
localNbSnapshots = [len(labels)]
localLabels = []
rank = 0
for l in localNbSnapshots:
localLabels.append(labels[rank:rank+l])
rank += l
#print("localLabels =", localLabels)
adjacentClusters = {}
nClusters = labels.max()+1
for k in range(nClusters):
adjacentClusters[k] = set()
adjacentClusters[k].add(k)
for label in localLabels:
nLabel = len(label)
adjacentClusters[label[0]].add(label[1])
adjacentClusters[label[nLabel-1]].add(label[nLabel-2])
for i in range(1,nLabel-1):
adjacentClusters[label[i]].add(label[i+1])
adjacentClusters[label[i]].add(label[i-1])
snapshotsOfAdjacentClusters = {}
for k in range(nClusters):
tempRes = np.array([])
for label in adjacentClusters[k]:
tempRes = np.union1d(tempRes, np.nonzero(labels==label)[0])
snapshotsOfAdjacentClusters[k] = np.array(tempRes, dtype = int)
return adjacentClusters, snapshotsOfAdjacentClusters
[docs]class KMedoids(object):
"""
Class for k-medoids clustering.
Attributes
----------
nClusters: int
Number of clusters.
nIter: int, default 100
Maximum number of iterations.
init: str, 'k-meds++' or 'random', default 'k-meds++'
Medoids initialization method. Random selection if 'random'.
If 'k-meds++', we use the method described in the following
article: Hae-Sang Park, Chi-Hyuck Jun, "A simple and fast
algorithm for K-medoids clustering", 2009.
If 'multipleRuns', the clustering algorithm is run self.runs
times with random initial medoids. The best solution in terms
of the cost function is returned.
medoids: 1D array of length nClusters
Array of integers containings the ids of the medoids.
algo: 'ParkJun' or 'PAM', default 'PAM'
Algorithm for k-medoids. Park & Jun's algorithm is simpler and
faster but explores a smaller search space than PAM (Partitioning
around medoids).
squaredDist: boolean, default True.
Says whether the cost function and the medoid update rule use squared
dissimilarities.
runs: integer, default 10.
Number of times the clustering algorithm is run when using init='multipleRuns'.
"""
def __init__(self, nClusters, nIter=100, init='k-meds++', algo='PAM', squaredDist=False,
runs=10):
self.nClusters = nClusters
self.nIter = nIter
self.init = init
self.medoids = None
self.algo = algo
self.squaredDist = squaredDist
self.runs = runs
self.costFunction = None
[docs] def InitializeMedoids(self, distanceMatrix):
'''
Initial medoids selection.
Method described in Hae-Sang Park, Chi-Hyuck Jun,
"A simple and fast algorithm for K-medoids clustering", 2009.
Parameters
----------
distanceMatrix: 2D array of shape (n_samples,n_samples)
'''
if self.init == 'k-meds++':
a = np.sum(distanceMatrix, axis=1)
b = np.transpose(np.transpose(distanceMatrix)/a)
v = np.sum(b, axis=0)
self.medoids = np.sort(np.argsort(v)[:self.nClusters]) # array of indices
elif self.init == 'random' or self.init == 'multipleRuns':
self.medoids = np.random.choice(distanceMatrix.shape[0],self.nClusters,
replace=False)
[docs] def SetCostFunction(self, costFunction):
self.costFunction = costFunction
[docs] def GetMedoids(self):
return self.medoids
[docs] def EvalCostFunction(self, medoids, distMatrix, isAlreadySquared=True, **kwargs):
if self.costFunction != None:
return self.costFunction(medoids, distMatrix, **kwargs)
if self.squaredDist:
if isAlreadySquared:
distanceMatrix = distMatrix
else:
distanceMatrix = distMatrix*distMatrix
else:
distanceMatrix = distMatrix
labels = np.argmin(distanceMatrix[:,medoids], axis=1)
#start = time.time()
cost = 0
for i in range(distanceMatrix.shape[0]):
cost += distanceMatrix[i,medoids[labels[i]]]
#print("old", time.time() - start, cost)
#start = time.time()
#cost2 = np.trace(distanceMatrix[:,medoids[labels]])
#print("new", time.time() - start, cost2)
#print("==")
return cost
[docs] def fit_ParkJun(self, distMatrix, printCostFct=False, verbose = False):
'''
Implementation of k-medoids clustering based on the Voronoi
iteration approach (Park and Jun 2009).
This code is a slightly modified version of the code presented
in: "NumPy/SciPy recipes for data science: k-Medoids clustering",
C. Bauckhage.
Parameters
----------
distMatrix: 2D array of shape (n_samples,n_samples)
printCostFct: boolean
'''
# Examples in the dataset are represented by an index.
# medoids is an array of length nClusters, such that medoids[i]
# is the index of the medoid of the i-th cluster.
# clusters is a dictionary. clusters[k] is an array containing
# the indices of points belonging to cluster k.
t0 = time.time()
if self.medoids is None: # pragma: no cover
self.InitializeMedoids(distMatrix)
medoidsNew = np.copy(self.medoids)
clusters = {}
if self.squaredDist: # pragma: no cover
distanceMatrix = distMatrix*distMatrix # we work on squared distance matrix
else:
distanceMatrix = distMatrix
if verbose:
printProgressBar(0, self.nIter, prefix = 'Progress clustering ParkJun:', suffix = 'Complete', length = 50)
if printCostFct:
cost = self.EvalCostFunction(medoidsNew, distanceMatrix, isAlreadySquared=True)
print("Iter: 0 , Cost: %f " % cost)
for i in range(self.nIter):
# Cluster assignment (for each point, determine the closest medoid)
clusterAssignments = np.argmin(distanceMatrix[:,self.medoids], axis=1)
# clusterAssignments[j] = index of the cluster that example j belongs to.
for k in range(self.nClusters):
clusters[k] = np.where(clusterAssignments==k)[0]
# array of indices of points belonging to cluster k
# Update cluster medoids
for k in range(self.nClusters):
clusterMembers1,clusterMembers2 = np.meshgrid(clusters[k],clusters[k])
intraClusterDistances = np.mean(distanceMatrix[clusterMembers1,clusterMembers2],axis=1)
#print("intraClusterDistances =", intraClusterDistances)
#print("clusterMembers1 =", clusterMembers1)
#print("clusterMembers2 =", clusterMembers2)
medoidId = np.argmin(intraClusterDistances)
# the medoid minimizes the sum of squared distances to all other points in the cluster
medoidsNew[k] = clusters[k][medoidId]
np.sort(medoidsNew)
if printCostFct:
cost = self.EvalCostFunction(medoidsNew, distanceMatrix, isAlreadySquared=True)
print("Iter: %d , Cost: %f " %(i+1, cost))
if verbose:
printProgressBar(i, self.nIter, prefix = 'Progress clustering ParkJun:', suffix = 'Complete', length = 50)
# Checking convergence
if np.array_equal(self.medoids, medoidsNew):
print("The algorithm converged after ", i+1, " iterations.")
break
self.medoids = np.copy(medoidsNew)
else:# pragma: no cover
print("The algorithm did not converge after ", i+1, " iterations.")
print("Computation time: ", time.time()-t0, " s")
[docs] def fit_PAM(self, distMatrix, printCostFct=False, verbose = False):
'''
Implementation of Partitioning Around Medoids (PAM) algorithm
for k-medoids.
Parameters
----------
distMatrix: 2D array of shape (n_samples,n_samples)
printCostFct: boolean
'''
t0 = time.time()
if self.medoids is None:# pragma: no cover
self.InitializeMedoids(distMatrix)
medoidsNew = np.copy(self.medoids)
if self.squaredDist:
distanceMatrix = distMatrix*distMatrix # we work on squared distance matrix
else:
distanceMatrix = distMatrix
if verbose:
printProgressBar(0, self.nIter, prefix = 'Progress clustering PAM:', suffix = 'Complete', length = 50)
if printCostFct:
cost = self.EvalCostFunction(medoidsNew, distanceMatrix, isAlreadySquared=True)
print("Iter: 0 , Cost: %f " % cost)
bestMedoidForSwap = 0 # id of cluster with the best swap
bestNonMedoidForSwap = 0 # id of the best point for the swap
for i in range(self.nIter):
costFunction = self.EvalCostFunction(medoidsNew, distanceMatrix, isAlreadySquared=True)
for k in range(self.nClusters):
medoidsTested = np.copy(medoidsNew)
for j in range(distanceMatrix.shape[0]):
if j not in medoidsNew:
# Swap:
medoidsTested[k] = j
cost = self.EvalCostFunction(medoidsTested, distanceMatrix, isAlreadySquared=True)
if cost < costFunction:
costFunction = cost
bestMedoidForSwap = k
bestNonMedoidForSwap = j
medoidsNew[bestMedoidForSwap] = bestNonMedoidForSwap
if printCostFct:
cost = self.EvalCostFunction(medoidsNew, distanceMatrix, isAlreadySquared=True)
print("Iter: %d , Cost: %f " %(i+1, cost))
if verbose:
printProgressBar(i, self.nIter, prefix = 'Progress clustering PAM:', suffix = 'Complete', length = 50)
# Checking convergence
if np.array_equal(self.medoids, medoidsNew):
print("The algorithm converged after ", i+1, " iterations.")
break
self.medoids = np.copy(medoidsNew)
else:# pragma: no cover
print("The algorithm did not converge after ", i+1, " iterations.")
print("Computation time: ", time.time()-t0, " s")
[docs] def fit(self, distMatrix, printCostFct=False, verbose = False):
if self.init == 'multipleRuns':
costFunction = 10000000000
bestMeds = None
for r in range(self.runs):
self.InitializeMedoids(distMatrix)
if self.algo == "ParkJun":# pragma: no cover
self.fit_ParkJun(distMatrix, printCostFct=printCostFct, verbose=verbose)
elif self.algo == "PAM":
self.fit_PAM(distMatrix, printCostFct=printCostFct, verbose=verbose)
else:# pragma: no cover
print("Unknown algorithm name. Please use either 'ParkJun' or 'PAM'.")
newMeds = self.GetMedoids()
cost = self.EvalCostFunction(self.GetMedoids(), distMatrix, isAlreadySquared=False)
if cost < costFunction:
costFunction = cost
bestMeds = np.copy(newMeds)
self.medoids = np.copy(bestMeds)
else:
self.InitializeMedoids(distMatrix)
if self.algo == "ParkJun":
self.fit_ParkJun(distMatrix, printCostFct=printCostFct, verbose=verbose)
elif self.algo == "PAM":
self.fit_PAM(distMatrix, printCostFct=printCostFct, verbose=verbose)
else:# pragma: no cover
print("Unknown algorithm name. Please use either 'ParkJun' or 'PAM'.")
[docs] def predict(self, distMatrix):
'''
Gives cluster assignments for training data.
Parameters
----------
distMatrix: 2D array of shape (n_samples,n_samples)
Returns
---------
labels: 1D array of length n_samples
Array of integers containing the index of the cluster
each sample belongs to.
'''
labels = np.argmin(distMatrix[:,self.medoids], axis=1)
return labels
[docs] def predictTest(self, distMatrix):
'''
Gives cluster assignments for test data.
Parameters
----------
distMatrix: 2D array of shape (n_samples,n_clusters)
such that distMatrix[i,k] is the distance
between the i-th test example with the
k-th medoid.
Returns
---------
labels: 1D array of length n_samples
Array of integers containing the index of the cluster
each sample belongs to.
'''
labels = np.argmin(distMatrix, axis=1)
return labels
[docs] def fit_predict(self, distMatrix, printCostFct=False, verbose = False):
'''
Computes clusters and gives cluster assignments.
Parameters
----------
distMatrix: 2D array of shape (n_samples,n_samples)
Returns
---------
labels: 1D array of length n_samples
Array of integers containing the index of the cluster
each sample belongs to.
'''
self.fit(distMatrix, printCostFct, verbose)
return self.predict(distMatrix)
if __name__ == "__main__":# pragma: no cover
from genericROM import RunTestFile
RunTestFile(__file__)