Source code for genericROM.BasicAlgorithms.DataVisualization

# -*- coding: utf-8 -*-
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE', which is part of this source code package.
#
#

import numpy as np
from sklearn.manifold import MDS, TSNE
import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
from itertools import cycle, islice
from genericROM.BasicAlgorithms import Clustering



class VisualizationToolbox(object):
    """
    Computes a low-dimensional embedding of a dataset with t-SNE or MDS and
    plots it, optionally together with clustering results.

    Attributes
    ----------
    method: str, 't-SNE' or 'MDS'
        String indicating the visualization method.
    dissimilarity: str, 'precomputed' or 'euclidean'
        Use 'precomputed' if your input is a dissimilarity matrix.
        Use 'euclidean' if your input is an array containing data points.
    tSNEperplexity: float
        Parameter of the t-SNE algorithm, see Scikit-Learn's documentation.
        When left to None, the argument is not passed to Scikit-Learn, so that
        its own default value is used.
    embeddedData: 2D array
        Array containing the embedded representation of some data, with embedded
        data points given in rows.
    """
    def __init__(self, method=None, dissimilarity=None):
        self.method         = method
        self.dissimilarity  = dissimilarity
        self.tSNEperplexity = None
        self.embeddedData   = None


    def SetTSNEPerplexity(self, tSNEperplexity):
        """
        Sets the perplexity given to t-SNE by fit.
        """
        self.tSNEperplexity = tSNEperplexity


    def SetEmbeddedData(self, X):
        """
        Stores X as the embedded dataset (no copy is made).
        """
        self.embeddedData = X


    def GetEmbeddedData(self):
        """
        Returns the stored embedded dataset (None if nothing was computed or set).
        """
        return self.embeddedData


    def LoadEmbeddedData(self, dataFile):
        """
        Reads an embedded dataset with numpy.load and stores it.

        Parameters
        ----------
        dataFile: str or file-like object
            Anything accepted by numpy.load, e.g. the npy file written by fit.
        """
        self.SetEmbeddedData(np.load(dataFile))


    def _GetEmbeddingDimension(self):
        """
        Returns the number of columns of the embedded dataset.

        Raises
        ------
        ValueError
            If no embedded dataset has been computed, set or loaded yet.
        """
        if self.embeddedData is None:
            raise ValueError("No embedded data available: call fit, SetEmbeddedData "
                             "or LoadEmbeddedData first.")
        return self.embeddedData.shape[1]


    def fit(self, X, dimension=2, outputName=None):
        """
        Computes the embedded representation of X.

        Parameters
        ----------
        X: 2D array of shape [n_samples, n_features] or [n_samples, n_samples]
            Contains either examples in rows or a precomputed dissimilarity matrix.
            Be sure you set self.dissimilarity = 'euclidean' if X contains examples
            in rows, or self.dissimilarity = 'precomputed' if X is a precomputed
            dissimilarity matrix.
        dimension: int
            Dimension of the embedding space.
        outputName: str
            Name of the npy output file where the embedded dataset is saved.

        Raises
        ------
        ValueError
            If self.method is neither 't-SNE' nor 'MDS'.
        """
        # Reject unknown methods before doing any work: otherwise no branch
        # would define the embedding and the failure would be an obscure
        # unbound-variable error further down.
        if self.method not in ('t-SNE', 'MDS'):
            raise ValueError("Unknown visualization method %r: expected 't-SNE' or 'MDS'."
                             % (self.method,))

        if self.method == 't-SNE':
            tsneOptions = dict(n_components=dimension, learning_rate="auto",
                               metric=self.dissimilarity, init="random")
            # Only forward the perplexity when the user chose one, so that an
            # unset value falls back on Scikit-Learn's default instead of None.
            if self.tSNEperplexity is not None:
                tsneOptions["perplexity"] = self.tSNEperplexity
            embeddedData = TSNE(**tsneOptions).fit_transform(X)
        else:
            # The MDS score returned by ApplyMDS is not needed here.
            embeddedData, _ = ApplyMDS(X, dimension, metric=True,
                                       dissimilarity=self.dissimilarity)

        if outputName is not None:
            np.save(outputName, embeddedData)
        self.embeddedData = embeddedData


    def fit_transform(self, X, dimension=2, outputName=None):
        """
        Computes and returns the embedded representation of X.

        Parameters
        ----------
        X: 2D array of shape [n_samples, n_features] or [n_samples, n_samples]
            Contains either examples in rows or a precomputed dissimilarity matrix.
            Be sure you set self.dissimilarity = 'euclidean' if X contains examples
            in rows, or self.dissimilarity = 'precomputed' if X is a precomputed
            dissimilarity matrix.
        dimension: int
            Dimension of the embedding space.
        outputName: str
            Name of the npy output file where the embedded dataset is saved.

        Returns
        -------
        embeddedData: 2D array of shape [n_samples, dimension]
        """
        self.fit(X, dimension, outputName)
        return self.embeddedData


    def PlotEmbeddedData(self, fileName, axesNames):
        """
        Scatter plot of the embedded dataset, saved as fileName + ".png".

        Points are drawn when the embedding has 2 or 3 columns; for any other
        number of columns only the labelled (empty) 2D axes are saved. The
        figure is left open after saving.

        Parameters
        ----------
        fileName: str
            Output file name, without the ".png" extension.
        axesNames: sequence of str
            Axis labels: two entries for a 2D embedding, three for a 3D one.

        Raises
        ------
        ValueError
            If no embedded dataset is available.
        """
        embeddingDimension = self._GetEmbeddingDimension()
        points = self.embeddedData

        fig = plt.figure()
        # Create a single axes with the right projection and label that one.
        # Labelling through pyplot before adding a 3D axes would leave a hidden
        # 2D axes (carrying the x/y labels) underneath the 3D plot.
        if embeddingDimension == 3:
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(points[:,0], points[:,1], points[:,2], c = 'b', marker = '.')
            ax.set_zlabel(axesNames[2])
        else:
            ax = fig.add_subplot(111)
            if embeddingDimension == 2:
                ax.scatter(points[:,0], points[:,1], c = 'b', marker = '.')
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
        ax.set_aspect('auto')
        fig.savefig(fileName+".png")


    def PlotClusteringResultsOnEmbeddedData(self, fileName, clusters, representatives=None, plotLabels=False,
                                            axesNames=None, colors=None):
        """
        Plots the embedded dataset coloured by cluster, through
        Plot2DClusteringResults or Plot3DClusteringResults depending on the
        number of columns of the embedding. Nothing is plotted for any other
        number of columns. All arguments are forwarded unchanged.

        Raises
        ------
        ValueError
            If no embedded dataset is available.
        """
        embeddingDimension = self._GetEmbeddingDimension()
        if embeddingDimension == 2:
            Plot2DClusteringResults(fileName, self.embeddedData, clusters, representatives, plotLabels,
                                    axesNames, colors=colors)
        elif embeddingDimension == 3:
            Plot3DClusteringResults(fileName, self.embeddedData, clusters, representatives, plotLabels,
                                    axesNames, colors=colors)


    def PlotClusteringResultsAndSpecificPointsOnEmbeddedData(self, fileName, clusters, specificPoints,
                                                             sizeSpecificPts=30, axesNames=None, colors=None):
        """
        Plots the embedded dataset coloured by cluster and highlights some
        points, through Plot2DClusteringResultsWithSpecificPoints or
        Plot3DClusteringResultsWithSpecificPoints depending on the number of
        columns of the embedding. Nothing is plotted for any other number of
        columns. All arguments are forwarded unchanged.

        Raises
        ------
        ValueError
            If no embedded dataset is available.
        """
        embeddingDimension = self._GetEmbeddingDimension()
        if embeddingDimension == 2:
            Plot2DClusteringResultsWithSpecificPoints(fileName, self.embeddedData, clusters,
                                                      specificPoints, sizeSpecificPts, axesNames,
                                                      colors=colors)
        elif embeddingDimension == 3:
            Plot3DClusteringResultsWithSpecificPoints(fileName, self.embeddedData, clusters,
                                                      specificPoints, sizeSpecificPts, axesNames,
                                                      colors=colors)


def ApplyMDS(X, dimension, metric=True, dissimilarity='precomputed'):
    """
    Wrapper of scikit-learn's multidimensional scaling (MDS) function.

    Parameters
    ----------
    X: 2D array of shape [n_samples, n_samples] or [n_samples, n_features]
        Precomputed dissimilarity matrix if dissimilarity is 'precomputed',
        data points in rows if dissimilarity is 'euclidean'.
    dimension: int
        Dimension of the embedding space.
    metric: bool
        Forwarded to scikit-learn's MDS.
    dissimilarity: str, 'precomputed' or 'euclidean'
        Forwarded to scikit-learn's MDS.

    Returns
    -------
    embeddedDataset: 2D array of shape [n_samples, dimension]
        Embedded data points given in rows.
    score: float
        Square root of the stress divided by half the sum of the squared
        entries of the fitted dissimilarity matrix. The value is also printed.
    """
    mds = MDS(n_components=dimension, metric=metric, n_init=10, max_iter=300, dissimilarity=dissimilarity)
    mds.fit(X)
    embeddedDataset = mds.embedding_
    stress = mds.stress_                            # sum for i<j of (distanceMatrix[i,j] - dij(embeddedDataset))^2 where dij is the euclidean distance in the embedded space
    # stress = ((dis.ravel() - disparities.ravel()) ** 2).sum() / 2  in source code https://github.com/scikit-learn/scikit-learn/blob/1495f6924/sklearn/manifold/mds.py#L279
    # NOTE(review): this assumes stress_ is the raw (non-normalized) stress - confirm
    # against the installed scikit-learn version when metric is False.

    # Normalise with the dissimilarities that were actually fitted. They are X
    # itself in the 'precomputed' case, but the pairwise distances between the
    # rows of X in the 'euclidean' case, where using X would be meaningless.
    fittedDissimilarities = np.asarray(mds.dissimilarity_matrix_)
    distSquaredNorm = 0.5*(fittedDissimilarities**2).sum()
    score = np.sqrt(stress/distSquaredNorm)
    print("Dim: ", dimension, " - Metric: ", metric, " - MDS score: ", score)
    return embeddedDataset,score


def Plot2DClusteringResults(fileName, dataset, clusters, representatives=None, plotLabels=False, axesNames=None, colors=None):
    """
    2D scatter plot of a dataset coloured by cluster, saved as fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output file name, without the ".png" extension.
    dataset: 2D array
        Data points given in rows; the first two columns are plotted.
    clusters:
        Clustering result, converted to one label per data point by
        Clustering.GetLabelsVectorFromClusters (labels are used as indices
        into colors).
    representatives: sequence of int
        Row indices in dataset of the points to annotate with their rank in
        this sequence. Required when plotLabels is True.
    plotLabels: bool
        If True, the representatives are circled and numbered.
    axesNames: sequence of str
        Labels of the x and y axes (no labels if None).
    colors: sequence
        One color per label, as a list or an array. If None, a default palette
        of 16 colors is cycled to cover all labels.

    Raises
    ------
    ValueError
        If plotLabels is True and representatives is None.
    """
    # Fail early with a clear message instead of a cryptic error in the middle
    # of the plot.
    if plotLabels and representatives is None:#pragma: no cover
        raise ValueError("representatives must be provided when plotLabels is True")

    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        palette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                   '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493','#DAA520', '#808000']
        colors = list(islice(cycle(palette), int(max(labels) + 1)))
    # A plain list cannot be indexed by a vector of labels: always go through
    # an array so that user-supplied lists work as well as arrays.
    colors = np.asarray(colors)

    fig, ax = plt.subplots(1, 1)
    ax.scatter(dataset[:,0], dataset[:,1],s=10,c=colors[labels])
    ax.set_aspect('auto')
    if plotLabels:#pragma: no cover
        # Draw white circles at cluster centers
        ax.scatter(dataset[representatives,0], dataset[representatives,1], marker='o',
                   c="white", alpha=1, s=200, edgecolor='k')
        # Write the rank of each representative inside its circle
        for k, representative in enumerate(representatives):
            ax.scatter(dataset[representative,0], dataset[representative,1], marker='$%d$' % k, alpha=1,
                       s=50, edgecolor='k')
    if axesNames is not None:#pragma: no cover
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
    fig.savefig(fileName+".png")


def Plot3DClusteringResults(fileName, dataset, clusters, representatives=None, plotLabels=False, axesNames=None, colors=None):
    """
    3D scatter plot of a dataset coloured by cluster, saved as fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output file name, without the ".png" extension.
    dataset: 2D array
        Data points given in rows; the first three columns are plotted.
    clusters:
        Clustering result, converted to one label per data point by
        Clustering.GetLabelsVectorFromClusters (labels are used as indices
        into colors).
    representatives: sequence of int
        Row indices in dataset of the points to annotate with their rank in
        this sequence. Required when plotLabels is True.
    plotLabels: bool
        If True, the representatives are circled and numbered.
    axesNames: sequence of str
        Labels of the x, y and z axes (no labels if None).
    colors: sequence
        One color per label, as a list or an array. If None, a default palette
        of 16 colors is cycled to cover all labels.

    Raises
    ------
    ValueError
        If plotLabels is True and representatives is None.
    """
    # Fail early with a clear message instead of a cryptic error in the middle
    # of the plot.
    if plotLabels and representatives is None:#pragma: no cover
        raise ValueError("representatives must be provided when plotLabels is True")

    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        palette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                   '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493','#DAA520', '#808000']
        colors = list(islice(cycle(palette), int(max(labels) + 1)))
    # A plain list cannot be indexed by a vector of labels: always go through
    # an array so that user-supplied lists work as well as arrays.
    colors = np.asarray(colors)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(dataset[:,0], dataset[:,1], dataset[:,2], s=10,c=colors[labels])
    ax.set_aspect('auto')
    if plotLabels:#pragma: no cover
        # Draw white circles at cluster centers
        ax.scatter(dataset[representatives,0], dataset[representatives,1], dataset[representatives,2], marker='o',
                   c="white", alpha=1, s=200, edgecolor='k')
        # Write the rank of each representative inside its circle
        for k, representative in enumerate(representatives):
            ax.scatter(dataset[representative,0], dataset[representative,1],
                       dataset[representative,2], marker='$%d$' % k, alpha=1,
                       s=50, edgecolor='k')
    if axesNames is not None:#pragma: no cover
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
        ax.set_zlabel(axesNames[2])
    fig.savefig(fileName+".png")


def Plot2DClusteringResultsWithSpecificPoints(fileName, dataset, clusters, specificPoints, sizeSpecificPts=30, axesNames=None, colors=None):
    """
    2D scatter plot of a dataset coloured by cluster, with some points
    highlighted by white circles, saved as fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output file name, without the ".png" extension.
    dataset: 2D array
        Data points given in rows; the first two columns are plotted.
    clusters:
        Clustering result, converted to one label per data point by
        Clustering.GetLabelsVectorFromClusters (labels are used as indices
        into colors).
    specificPoints: sequence of int
        Row indices in dataset of the points to highlight.
    sizeSpecificPts: float
        Marker size of the highlighted points.
    axesNames: sequence of str
        Labels of the x and y axes (no labels if None).
    colors: sequence
        One color per label, as a list or an array. If None, a default palette
        of 16 colors is cycled to cover all labels.
    """
    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        palette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                   '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493','#DAA520', '#808000']
        colors = list(islice(cycle(palette), int(max(labels) + 1)))
    # A plain list cannot be indexed by a vector of labels: always go through
    # an array so that user-supplied lists work as well as arrays.
    colors = np.asarray(colors)

    fig, ax = plt.subplots(1, 1)
    ax.scatter(dataset[:,0], dataset[:,1],s=10,c=colors[labels])
    # Highlighted points are drawn last so that they stay on top of the clusters
    ax.scatter(dataset[specificPoints,0], dataset[specificPoints,1], marker='o',
               c="white", alpha=1, s=sizeSpecificPts, edgecolor='k')
    ax.set_aspect('auto')
    if axesNames is not None:#pragma: no cover
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
    fig.savefig(fileName+".png")


def Plot3DClusteringResultsWithSpecificPoints(fileName, dataset, clusters, specificPoints, sizeSpecificPts=30, axesNames=None, colors=None):
    """
    3D scatter plot of a dataset coloured by cluster, with some points
    highlighted by white circles, saved as fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output file name, without the ".png" extension.
    dataset: 2D array
        Data points given in rows; the first three columns are plotted.
    clusters:
        Clustering result, converted to one label per data point by
        Clustering.GetLabelsVectorFromClusters (labels are used as indices
        into colors).
    specificPoints: sequence of int
        Row indices in dataset of the points to highlight.
    sizeSpecificPts: float
        Marker size of the highlighted points.
    axesNames: sequence of str
        Labels of the x, y and z axes (no labels if None).
    colors: sequence
        One color per label, as a list or an array. If None, a default palette
        of 16 colors is cycled to cover all labels.
    """
    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        palette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                   '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493','#DAA520', '#808000']
        colors = list(islice(cycle(palette), int(max(labels) + 1)))
    # A plain list cannot be indexed by a vector of labels: always go through
    # an array so that user-supplied lists work as well as arrays.
    colors = np.asarray(colors)

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(dataset[:,0], dataset[:,1], dataset[:,2], s=10,c=colors[labels])
    # Highlighted points are drawn last, after the cluster points
    ax.scatter(dataset[specificPoints,0], dataset[specificPoints,1], dataset[specificPoints,2], marker='o',
               c="white", alpha=1, s=sizeSpecificPts, edgecolor='k')
    ax.set_aspect('auto')
    if axesNames is not None:#pragma: no cover
        ax.set_xlabel(axesNames[0])
        ax.set_ylabel(axesNames[1])
        ax.set_zlabel(axesNames[2])
    fig.savefig(fileName+".png")



# Script entry point: when this module is executed directly (not imported),
# its own path is handed to genericROM.RunTestFile.
# NOTE(review): RunTestFile is presumably the project's helper that runs the
# test file associated with a module - confirm against genericROM.
if __name__ == "__main__":# pragma: no cover

    # Imported here so that a plain import of this module does not need it
    from genericROM import RunTestFile
    RunTestFile(__file__)