Source code for genericROM.BasicAlgorithms.DataVisualization

# -*- coding: utf-8 -*-
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE', which is part of this source code package.
#
#

import numpy as np
from sklearn.manifold import MDS, TSNE
import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
from itertools import cycle, islice
from genericROM.BasicAlgorithms import Clustering



class VisualizationToolbox(object):
    """
    Computes a low-dimensional embedding of a dataset with t-SNE or MDS and
    plots the embedded points, optionally colored by cluster.

    Attributes
    ----------
    method: str, 't-SNE' or 'MDS'
        String indicating the visualization method.
    dissimilarity: str, 'precomputed' or 'euclidean'
        Use 'precomputed' if your input is a dissimilarity matrix.
        Use 'euclidean' if your input is an array containing data points.
    tSNEperplexity: float
        Parameter of the t-SNE algorithm, see Scikit-Learn's documentation.
        Must be set (SetTSNEPerplexity) before calling fit with method 't-SNE'.
    embeddedData: 2D array
        Array containing the embedded representation of some data,
        with embedded data points given in rows.
    """

    def __init__(self, method=None, dissimilarity=None):
        self.method = method
        self.dissimilarity = dissimilarity
        self.tSNEperplexity = None
        self.embeddedData = None

    def SetTSNEPerplexity(self, tSNEperplexity):
        '''
        Sets the perplexity used by fit when self.method is 't-SNE'.
        '''
        self.tSNEperplexity = tSNEperplexity

    def SetEmbeddedData(self, X):
        '''
        Stores X as the embedded dataset (no copy is made).
        '''
        self.embeddedData = X

    def GetEmbeddedData(self):
        '''
        Returns the stored embedded dataset (None if nothing was computed or loaded).
        '''
        return self.embeddedData

    def LoadEmbeddedData(self, dataFile):
        '''
        Loads an embedded dataset with numpy.load and stores it.

        Parameters
        ----------
        dataFile: str
            File passed to numpy.load, e.g. a file written by fit through outputName.
        '''
        self.SetEmbeddedData(np.load(dataFile))

    def fit(self, X, dimension=2, outputName=None):
        '''
        Computes the embedded representation of X.

        Parameters
        ----------
        X: 2D array of shape [n_samples, n_features] or [n_samples, n_samples]
            Contains either examples in rows or a precomputed dissimilarity matrix.
            Be sure you set self.dissimilarity = 'euclidean' if X contains examples
            in rows, or self.dissimilarity = 'precomputed' if X is a precomputed
            dissimilarity matrix.
        dimension: int
            Dimension of the embedding space.
        outputName: str
            Name of the npy output file where the embedded dataset is saved.

        Raises
        ------
        ValueError
            If self.method is neither 't-SNE' nor 'MDS', or if self.method is
            't-SNE' and no perplexity has been set.
        '''
        if self.method == 't-SNE':
            # Checked here so the user gets an actionable message instead of a
            # parameter-validation failure from scikit-learn.
            if self.tSNEperplexity is None:
                raise ValueError("tSNEperplexity is not set: call SetTSNEPerplexity before fitting with method 't-SNE'.")
            tsne = TSNE(n_components=dimension, perplexity=self.tSNEperplexity, learning_rate="auto", metric=self.dissimilarity, init="random")
            embeddedData = tsne.fit_transform(X)
        elif self.method == 'MDS':
            # The score returned by ApplyMDS is not needed here.
            embeddedData, _ = ApplyMDS(X, dimension, metric=True, dissimilarity=self.dissimilarity)
        else:
            # Without this branch the code below would fail on an unbound local.
            raise ValueError("Unknown visualization method %r: expected 't-SNE' or 'MDS'." % (self.method,))

        if outputName is not None:
            np.save(outputName, embeddedData)
        self.embeddedData = embeddedData

    def fit_transform(self, X, dimension=2, outputName=None):
        '''
        Computes and returns the embedded representation of X.

        Parameters
        ----------
        X: 2D array of shape [n_samples, n_features] or [n_samples, n_samples]
            Contains either examples in rows or a precomputed dissimilarity matrix.
            Be sure you set self.dissimilarity = 'euclidean' if X contains examples
            in rows, or self.dissimilarity = 'precomputed' if X is a precomputed
            dissimilarity matrix.
        dimension: int
            Dimension of the embedding space.
        outputName: str
            Name of the npy output file where the embedded dataset is saved.

        Returns
        -------
        embeddedData: 2D array of shape [n_samples, dimension]
        '''
        self.fit(X, dimension, outputName)
        return self.embeddedData

    def PlotEmbeddedData(self, fileName, axesNames):
        '''
        Scatter-plots the embedded dataset and saves the figure as fileName + ".png".

        Parameters
        ----------
        fileName: str
            Output path without extension; ".png" is appended.
        axesNames: sequence of str
            Axis labels: two entries are read for a 2D embedding, three for a
            3D embedding.

        Notes
        -----
        If the embedding is neither 2D nor 3D, labelled but empty 2D axes are saved.
        The figure is not closed, so interactive backends can still display it.
        '''
        fig = plt.figure()
        nbColumns = self.embeddedData.shape[1]
        if nbColumns == 3:
            # Create the 3D axes first so that the x/y labels are attached to
            # it rather than to an implicit 2D axes hidden underneath.
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(self.embeddedData[:,0], self.embeddedData[:,1], self.embeddedData[:,2], c='b', marker='.')
            ax.set_zlabel(axesNames[2])
        else:
            ax = fig.add_subplot(111)
            if nbColumns == 2:
                ax.scatter(self.embeddedData[:,0], self.embeddedData[:,1], c='b', marker='.')
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
        ax.set_aspect('auto')
        plt.savefig(fileName+".png")

    def PlotClusteringResultsOnEmbeddedData(self, fileName, clusters, representatives=None, plotLabels=False, axesNames=None, colors=None):
        '''
        Plots the embedded dataset colored by cluster.

        Forwards every argument to Plot2DClusteringResults or
        Plot3DClusteringResults depending on the number of columns of the
        embedded dataset. Nothing is plotted for any other dimension.
        '''
        nbColumns = self.embeddedData.shape[1]
        if nbColumns == 2:
            Plot2DClusteringResults(fileName, self.embeddedData, clusters, representatives, plotLabels, axesNames, colors=colors)
        elif nbColumns == 3:
            Plot3DClusteringResults(fileName, self.embeddedData, clusters, representatives, plotLabels, axesNames, colors=colors)

    def PlotClusteringResultsAndSpecificPointsOnEmbeddedData(self, fileName, clusters, specificPoints, sizeSpecificPts=30, axesNames=None, colors=None):
        '''
        Plots the embedded dataset colored by cluster and highlights some points.

        Forwards every argument to Plot2DClusteringResultsWithSpecificPoints or
        Plot3DClusteringResultsWithSpecificPoints depending on the number of
        columns of the embedded dataset. Nothing is plotted for any other dimension.
        '''
        nbColumns = self.embeddedData.shape[1]
        if nbColumns == 2:
            Plot2DClusteringResultsWithSpecificPoints(fileName, self.embeddedData, clusters, specificPoints, sizeSpecificPts, axesNames, colors=colors)
        elif nbColumns == 3:
            Plot3DClusteringResultsWithSpecificPoints(fileName, self.embeddedData, clusters, specificPoints, sizeSpecificPts, axesNames, colors=colors)
def ApplyMDS(X, dimension, metric=True, dissimilarity='precomputed'):
    '''
    Wrapper of scikit-learn's multidimensional scaling (MDS) function.

    Parameters
    ----------
    X: 2D array
        Dissimilarity matrix if dissimilarity is 'precomputed', data points in
        rows if dissimilarity is 'euclidean'.
    dimension: int
        Dimension of the embedding space.
    metric: bool
        Forwarded to scikit-learn's MDS (metric or non-metric MDS).
    dissimilarity: str, 'precomputed' or 'euclidean'
        Forwarded to scikit-learn's MDS.

    Returns
    -------
    embeddedDataset: 2D array
        Embedded data points given in rows.
    score: float
        sqrt(stress / (0.5 * sum of squared dissimilarities)).
    '''
    mds = MDS(n_components=dimension, metric=metric, n_init=10, max_iter=300, dissimilarity=dissimilarity)
    mds.fit(X)
    embeddedDataset = mds.embedding_
    stress = mds.stress_ # sum for i<j of (distanceMatrix[i,j] - dij(embeddedDataset))^2 where dij is the euclidean distance in the embedded space
    # stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2 in source code https://github.com/scikit-learn/scikit-learn/blob/1495f6924/sklearn/manifold/mds.py#L279

    # Normalize with the dissimilarities MDS actually fitted: this is X itself
    # when dissimilarity='precomputed', but the pairwise euclidean distances
    # (not the raw feature values of X) when dissimilarity='euclidean'.
    # NOTE(review): with metric=False the fitted disparities differ from these
    # dissimilarities, so this normalization may not be meaningful - confirm.
    fittedDissimilarities = np.asarray(mds.dissimilarity_matrix_)
    distSquaredNorm = 0.5*(fittedDissimilarities**2).sum()
    score = np.sqrt(stress/distSquaredNorm)
    print("Dim: ", dimension, " - Metric: ", metric, " - MDS score: ", score)
    return embeddedDataset,score
def Plot2DClusteringResults(fileName, dataset, clusters, representatives=None, plotLabels=False, axesNames=None, colors=None):
    '''
    Scatter-plots a 2D dataset colored by cluster and saves it as fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output path without extension; ".png" is appended.
    dataset: 2D array
        Data points given in rows; columns 0 and 1 are plotted.
    clusters:
        Clusters description, converted to a labels vector by
        Clustering.GetLabelsVectorFromClusters (presumably one integer cluster
        index per row of dataset - confirm against the Clustering module).
    representatives: sequence of int
        Row indices of the cluster representatives, required if plotLabels is True.
    plotLabels: bool
        If True, each representative is marked with its position k in representatives.
    axesNames: sequence of two str
        Axis labels; axes are left unlabelled when None.
    colors: sequence
        One color per cluster label (list or array). A cyclic default palette
        is used when None.

    Raises
    ------
    ValueError
        If plotLabels is True and representatives is None.

    Notes
    -----
    The figure is not closed, so interactive backends can still display it.
    '''
    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        defaultPalette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                          '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493', '#DAA520', '#808000']
        # Repeat the palette as often as needed to get one color per label.
        colors = list(islice(cycle(defaultPalette), int(max(labels) + 1)))
    # A plain list cannot be indexed by a labels vector: make sure user-supplied
    # colors support colors[labels].
    colors = np.asarray(colors)

    fig, ax = plt.subplots(1, 1)
    ax.scatter(dataset[:,0], dataset[:,1], s=10, c=colors[labels])
    ax.set_aspect('auto')

    if plotLabels:#pragma: no cover
        if representatives is None:
            raise ValueError("representatives must be provided when plotLabels is True.")
        # Draw white circles at cluster centers
        ax.scatter(dataset[representatives,0], dataset[representatives,1], marker='o', c="white", alpha=1, s=200, edgecolor='k')
        # Write the index k of each representative inside its circle
        for k, representative in enumerate(representatives):
            ax.scatter(dataset[representative,0], dataset[representative,1], marker='$%d$' % k, alpha=1, s=50, edgecolor='k')

    if axesNames is not None:#pragma: no cover
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
    plt.savefig(fileName+".png")
def Plot3DClusteringResults(fileName, dataset, clusters, representatives=None, plotLabels=False, axesNames=None, colors=None):
    '''
    Scatter-plots a 3D dataset colored by cluster and saves it as fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output path without extension; ".png" is appended.
    dataset: 2D array
        Data points given in rows; columns 0, 1 and 2 are plotted.
    clusters:
        Clusters description, converted to a labels vector by
        Clustering.GetLabelsVectorFromClusters (presumably one integer cluster
        index per row of dataset - confirm against the Clustering module).
    representatives: sequence of int
        Row indices of the cluster representatives, required if plotLabels is True.
    plotLabels: bool
        If True, each representative is marked with its position k in representatives.
    axesNames: sequence of three str
        Axis labels; axes are left unlabelled when None.
    colors: sequence
        One color per cluster label (list or array). A cyclic default palette
        is used when None.

    Raises
    ------
    ValueError
        If plotLabels is True and representatives is None.

    Notes
    -----
    The figure is not closed, so interactive backends can still display it.
    '''
    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        defaultPalette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                          '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493', '#DAA520', '#808000']
        # Repeat the palette as often as needed to get one color per label.
        colors = list(islice(cycle(defaultPalette), int(max(labels) + 1)))
    # A plain list cannot be indexed by a labels vector: make sure user-supplied
    # colors support colors[labels].
    colors = np.asarray(colors)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(dataset[:,0], dataset[:,1], dataset[:,2], s=10, c=colors[labels])
    ax.set_aspect('auto')

    if plotLabels:#pragma: no cover
        if representatives is None:
            raise ValueError("representatives must be provided when plotLabels is True.")
        # Draw white circles at cluster centers
        ax.scatter(dataset[representatives,0], dataset[representatives,1], dataset[representatives,2], marker='o', c="white", alpha=1, s=200, edgecolor='k')
        # Write the index k of each representative inside its circle
        for k, representative in enumerate(representatives):
            ax.scatter(dataset[representative,0], dataset[representative,1], dataset[representative,2], marker='$%d$' % k, alpha=1, s=50, edgecolor='k')

    if axesNames is not None:#pragma: no cover
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
        ax.set_zlabel(axesNames[2])
    plt.savefig(fileName+".png")
def Plot2DClusteringResultsWithSpecificPoints(fileName, dataset, clusters, specificPoints, sizeSpecificPts=30, axesNames=None, colors=None):
    '''
    Scatter-plots a 2D dataset colored by cluster, highlights some points with
    white circles, and saves the figure as fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output path without extension; ".png" is appended.
    dataset: 2D array
        Data points given in rows; columns 0 and 1 are plotted.
    clusters:
        Clusters description, converted to a labels vector by
        Clustering.GetLabelsVectorFromClusters (presumably one integer cluster
        index per row of dataset - confirm against the Clustering module).
    specificPoints: sequence of int
        Row indices of the points to highlight.
    sizeSpecificPts: float
        Marker size of the highlighted points.
    axesNames: sequence of two str
        Axis labels; axes are left unlabelled when None.
    colors: sequence
        One color per cluster label (list or array). A cyclic default palette
        is used when None.

    Notes
    -----
    The figure is not closed, so interactive backends can still display it.
    '''
    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        defaultPalette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                          '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493', '#DAA520', '#808000']
        # Repeat the palette as often as needed to get one color per label.
        colors = list(islice(cycle(defaultPalette), int(max(labels) + 1)))
    # A plain list cannot be indexed by a labels vector: make sure user-supplied
    # colors support colors[labels].
    colors = np.asarray(colors)

    fig, ax = plt.subplots(1, 1)
    ax.scatter(dataset[:,0], dataset[:,1], s=10, c=colors[labels])
    # Highlighted points are drawn last so they stay visible on top of the others.
    ax.scatter(dataset[specificPoints,0], dataset[specificPoints,1], marker='o', c="white", alpha=1, s=sizeSpecificPts, edgecolor='k')
    ax.set_aspect('auto')

    if axesNames is not None:#pragma: no cover
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
    plt.savefig(fileName+".png")
def Plot3DClusteringResultsWithSpecificPoints(fileName, dataset, clusters, specificPoints, sizeSpecificPts=30, axesNames=None, colors=None):
    '''
    Scatter-plots a 3D dataset colored by cluster, highlights some points with
    white circles, and saves the figure as fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output path without extension; ".png" is appended.
    dataset: 2D array
        Data points given in rows; columns 0, 1 and 2 are plotted.
    clusters:
        Clusters description, converted to a labels vector by
        Clustering.GetLabelsVectorFromClusters (presumably one integer cluster
        index per row of dataset - confirm against the Clustering module).
    specificPoints: sequence of int
        Row indices of the points to highlight.
    sizeSpecificPts: float
        Marker size of the highlighted points.
    axesNames: sequence of three str
        Axis labels; axes are left unlabelled when None.
    colors: sequence
        One color per cluster label (list or array). A cyclic default palette
        is used when None.

    Notes
    -----
    The figure is not closed, so interactive backends can still display it.
    '''
    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        defaultPalette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                          '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493', '#DAA520', '#808000']
        # Repeat the palette as often as needed to get one color per label.
        colors = list(islice(cycle(defaultPalette), int(max(labels) + 1)))
    # A plain list cannot be indexed by a labels vector: make sure user-supplied
    # colors support colors[labels].
    colors = np.asarray(colors)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(dataset[:,0], dataset[:,1], dataset[:,2], s=10, c=colors[labels])
    # Highlighted points are drawn last so they stay visible on top of the others.
    ax.scatter(dataset[specificPoints,0], dataset[specificPoints,1], dataset[specificPoints,2], marker='o', c="white", alpha=1, s=sizeSpecificPts, edgecolor='k')
    ax.set_aspect('auto')

    if axesNames is not None:#pragma: no cover
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
        ax.set_zlabel(axesNames[2])
    plt.savefig(fileName+".png")
if __name__ == "__main__":# pragma: no cover
    # Script entry point: when this module is executed directly, hand its own
    # path to genericROM's RunTestFile helper (presumably runs the tests
    # associated with this file - confirm in genericROM).
    from genericROM import RunTestFile
    RunTestFile(__file__)