Source code for genericROM.BasicAlgorithms.DataVisualization
# -*- coding: utf-8 -*-
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE', which is part of this source code package.
#
#
import numpy as np
from sklearn.manifold import MDS, TSNE
import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
from itertools import cycle, islice
from genericROM.BasicAlgorithms import Clustering
class VisualizationToolbox(object):
    """
    Computes a low-dimensional embedding of a dataset (t-SNE or MDS) and
    plots it, optionally together with clustering results.

    Attributes
    ----------
    method: str, 't-SNE' or 'MDS'
        String indicating the visualization method.
    dissimilarity: str, 'precomputed' or 'euclidean'
        Use 'precomputed' if your input is a dissimilarity matrix.
        Use 'euclidean' if your input is an array containing data points.
    tSNEperplexity: float or None
        Parameter of the t-SNE algorithm, see Scikit-Learn's documentation.
        When left to None, Scikit-Learn's default value (30.0) is used.
    embeddedData: 2D array
        Array containing the embedded representation of some data, with embedded
        data points given in rows. None until fit() has been called.
    """

    # Methods accepted by fit().
    _supportedMethods = ('t-SNE', 'MDS')
    # Scikit-Learn's default t-SNE perplexity, used when tSNEperplexity is None.
    _defaultTSNEPerplexity = 30.0

    def __init__(self, method=None, dissimilarity=None):
        self.method = method
        self.dissimilarity = dissimilarity
        self.tSNEperplexity = None
        self.embeddedData = None

    def fit(self, X, dimension=2, outputName=None):
        '''
        Computes the embedded representation of X and stores it in
        self.embeddedData.

        Parameters
        ----------
        X: 2D array of shape [n_samples, n_features] or [n_samples, n_samples]
            Contains either examples in rows or a precomputed dissimilarity matrix.
            Be sure you set self.dissimilarity = 'euclidean' if X contains examples
            in rows, or self.dissimilarity = 'precomputed' if X is a precomputed
            dissimilarity matrix.
        dimension: int
            Dimension of the embedding space.
        outputName: str
            Name of the npy output file where the embedded dataset is saved.
            Nothing is written when None.

        Raises
        ------
        ValueError
            If self.method is neither 't-SNE' nor 'MDS'.
        '''
        # Fail early with an explicit message instead of hitting an unbound
        # local variable further down.
        if self.method not in self._supportedMethods:
            raise ValueError("Unknown visualization method %r: expected 't-SNE' or 'MDS'."
                             % (self.method,))

        if self.method == 't-SNE':
            # TSNE rejects perplexity=None, so fall back to Scikit-Learn's default.
            perplexity = self.tSNEperplexity
            if perplexity is None:
                perplexity = self._defaultTSNEPerplexity
            tsne = TSNE(n_components=dimension, perplexity=perplexity,
                        learning_rate="auto", metric=self.dissimilarity, init="random")
            embeddedData = tsne.fit_transform(X)
        else:
            # ApplyMDS also returns a quality score, which is not needed here.
            embeddedData, _ = ApplyMDS(X, dimension, metric=True,
                                       dissimilarity=self.dissimilarity)

        if outputName is not None:
            np.save(outputName, embeddedData)
        self.embeddedData = embeddedData

    def fit_transform(self, X, dimension=2, outputName=None):
        '''
        Computes and returns the embedded representation of X.

        Parameters
        ----------
        X: 2D array of shape [n_samples, n_features] or [n_samples, n_samples]
            Contains either examples in rows or a precomputed dissimilarity matrix.
            Be sure you set self.dissimilarity = 'euclidean' if X contains examples
            in rows, or self.dissimilarity = 'precomputed' if X is a precomputed
            dissimilarity matrix.
        dimension: int
            Dimension of the embedding space.
        outputName: str
            Name of the npy output file where the embedded dataset is saved.

        Returns
        -------
        embeddedData: 2D array of shape [n_samples, dimension]
        '''
        self.fit(X, dimension, outputName)
        return self.embeddedData

    def _GetEmbeddingDimension(self):
        '''
        Returns the number of columns of self.embeddedData.

        Raises
        ------
        ValueError
            If no embedding has been computed yet.
        '''
        if self.embeddedData is None:
            raise ValueError("No embedded data available: call fit() or fit_transform() first.")
        return self.embeddedData.shape[1]

    def PlotEmbeddedData(self, fileName, axesNames):
        '''
        Saves a scatter plot of the embedded data in fileName + ".png".

        Parameters
        ----------
        fileName: str
            Output file name, without the ".png" extension.
        axesNames: sequence of str
            Axis labels; two entries are read for a 2D embedding, three for a
            3D embedding.

        Raises
        ------
        ValueError
            If no embedding has been computed yet.
        '''
        nDims = self._GetEmbeddingDimension()
        points = self.embeddedData

        fig = plt.figure()
        try:
            # Create exactly one axes of the right kind so that a 3D plot is not
            # drawn on top of a leftover 2D frame.
            if nDims == 3:
                ax = fig.add_subplot(111, projection='3d')
                ax.scatter(points[:, 0], points[:, 1], points[:, 2], c='b', marker='.')
                ax.set_zlabel(axesNames[2])
            else:
                ax = fig.add_subplot(111)
                if nDims == 2:
                    ax.scatter(points[:, 0], points[:, 1], c='b', marker='.')
            ax.set_xlabel(axesNames[0])
            ax.set_ylabel(axesNames[1])
            ax.set_aspect('auto')
            fig.savefig(fileName + ".png")
        finally:
            # Release the figure so repeated calls do not accumulate open figures.
            plt.close(fig)

    def PlotClusteringResultsOnEmbeddedData(self, fileName, clusters, representatives=None, plotLabels=False,
                                            axesNames=None, colors=None):
        '''
        Plots the clustering results on the embedded data, delegating to
        Plot2DClusteringResults or Plot3DClusteringResults depending on the
        embedding dimension. Nothing is plotted for other dimensions.

        Raises
        ------
        ValueError
            If no embedding has been computed yet.
        '''
        nDims = self._GetEmbeddingDimension()
        if nDims == 2:
            Plot2DClusteringResults(fileName, self.embeddedData, clusters, representatives, plotLabels,
                                    axesNames, colors=colors)
        elif nDims == 3:
            Plot3DClusteringResults(fileName, self.embeddedData, clusters, representatives, plotLabels,
                                    axesNames, colors=colors)

    def PlotClusteringResultsAndSpecificPointsOnEmbeddedData(self, fileName, clusters, specificPoints,
                                                             sizeSpecificPts=30, axesNames=None, colors=None):
        '''
        Plots the clustering results and highlights specificPoints on the
        embedded data, delegating to the 2D or 3D plotting function depending
        on the embedding dimension. Nothing is plotted for other dimensions.

        Raises
        ------
        ValueError
            If no embedding has been computed yet.
        '''
        nDims = self._GetEmbeddingDimension()
        if nDims == 2:
            Plot2DClusteringResultsWithSpecificPoints(fileName, self.embeddedData, clusters,
                                                      specificPoints, sizeSpecificPts, axesNames,
                                                      colors=colors)
        elif nDims == 3:
            Plot3DClusteringResultsWithSpecificPoints(fileName, self.embeddedData, clusters,
                                                      specificPoints, sizeSpecificPts, axesNames,
                                                      colors=colors)
def ApplyMDS(X, dimension, metric=True, dissimilarity='precomputed'):
    '''
    Wrapper of scikit-learn's multidimensional scaling (MDS) function.

    Parameters
    ----------
    X: 2D array of shape [n_samples, n_samples] or [n_samples, n_features]
        Precomputed dissimilarity matrix if dissimilarity == 'precomputed',
        data points in rows if dissimilarity == 'euclidean'.
    dimension: int
        Dimension of the embedding space.
    metric: bool
        Forwarded to scikit-learn's MDS (metric or non-metric MDS).
    dissimilarity: str, 'precomputed' or 'euclidean'
        Forwarded to scikit-learn's MDS.

    Returns
    -------
    embeddedDataset: 2D array of shape [n_samples, dimension]
    score: float
        Square root of the MDS stress divided by half the sum of the squared
        dissimilarities; it is also printed.
    '''
    mds = MDS(n_components=dimension, metric=metric, n_init=10, max_iter=300, dissimilarity=dissimilarity)
    mds.fit(X)
    embeddedDataset = mds.embedding_
    # For metric MDS, stress is the sum for i<j of (d[i,j] - dij(embeddedDataset))^2,
    # where d is the dissimilarity matrix and dij the euclidean distance in the embedded space.
    # NOTE(review): recent scikit-learn versions may return a normalized stress
    # for non-metric MDS -- confirm against the installed version.
    stress = mds.stress_
    # Normalise by the dissimilarities actually fitted by MDS. With
    # dissimilarity='euclidean', X holds data points, not distances, so X itself
    # must not be used here; dissimilarity_matrix_ is correct in both modes.
    fittedDissimilarities = np.asarray(mds.dissimilarity_matrix_)
    distSquaredNorm = 0.5 * (fittedDissimilarities ** 2).sum()
    score = np.sqrt(stress / distSquaredNorm)
    print("Dim: ", dimension, " - Metric: ", metric, " - MDS score: ", score)
    return embeddedDataset, score
def Plot2DClusteringResults(fileName, dataset, clusters, representatives=None, plotLabels=False, axesNames=None, colors=None):
    '''
    Saves a 2D scatter plot of dataset coloured by cluster in fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output file name, without the ".png" extension.
    dataset: 2D array
        Data points in rows; the first two columns are plotted.
    clusters:
        Clustering result, converted to a labels vector with
        Clustering.GetLabelsVectorFromClusters (presumably one integer label
        per data point -- confirm against the Clustering module).
    representatives: sequence of int, optional
        Row indices in dataset of the cluster representatives. Required when
        plotLabels is True.
    plotLabels: bool
        If True, each representative is marked with a white disc and its rank
        in representatives.
    axesNames: sequence of str, optional
        Labels of the x and y axes.
    colors: sequence, optional
        One colour per cluster label. A default palette, cycled as needed, is
        used when None.

    Raises
    ------
    ValueError
        If plotLabels is True while representatives is None.
    '''
    if plotLabels and representatives is None:
        raise ValueError("representatives must be provided when plotLabels is True.")

    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        palette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                   '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493', '#DAA520', '#808000']
        colors = list(islice(cycle(palette), int(max(labels) + 1)))
    # An array (rather than a plain list) is required to index by the labels vector.
    colors = np.asarray(colors)

    fig, ax = plt.subplots(1, 1)
    try:
        ax.scatter(dataset[:, 0], dataset[:, 1], s=10, c=colors[labels])
        ax.set_aspect('auto')
        if plotLabels:#pragma: no cover
            # Draw white circles at cluster centers
            ax.scatter(dataset[representatives, 0], dataset[representatives, 1], marker='o',
                       c="white", alpha=1, s=200, edgecolor='k')
            # Write the rank of each representative inside its circle.
            for k, representative in enumerate(representatives):
                ax.scatter(dataset[representative, 0], dataset[representative, 1], marker='$%d$' % k, alpha=1,
                           s=50, edgecolor='k')
        if axesNames is not None:#pragma: no cover
            ax.set_xlabel(axesNames[0])
            ax.set_ylabel(axesNames[1])
        fig.savefig(fileName + ".png")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
def Plot3DClusteringResults(fileName, dataset, clusters, representatives=None, plotLabels=False, axesNames=None, colors=None):
    '''
    Saves a 3D scatter plot of dataset coloured by cluster in fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output file name, without the ".png" extension.
    dataset: 2D array
        Data points in rows; the first three columns are plotted.
    clusters:
        Clustering result, converted to a labels vector with
        Clustering.GetLabelsVectorFromClusters (presumably one integer label
        per data point -- confirm against the Clustering module).
    representatives: sequence of int, optional
        Row indices in dataset of the cluster representatives. Required when
        plotLabels is True.
    plotLabels: bool
        If True, each representative is marked with a white disc and its rank
        in representatives.
    axesNames: sequence of str, optional
        Labels of the x, y and z axes.
    colors: sequence, optional
        One colour per cluster label. A default palette, cycled as needed, is
        used when None.

    Raises
    ------
    ValueError
        If plotLabels is True while representatives is None.
    '''
    if plotLabels and representatives is None:
        raise ValueError("representatives must be provided when plotLabels is True.")

    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        palette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                   '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493', '#DAA520', '#808000']
        colors = list(islice(cycle(palette), int(max(labels) + 1)))
    # An array (rather than a plain list) is required to index by the labels vector.
    colors = np.asarray(colors)

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(dataset[:, 0], dataset[:, 1], dataset[:, 2], s=10, c=colors[labels])
        ax.set_aspect('auto')
        if plotLabels:#pragma: no cover
            # Draw white circles at cluster centers
            ax.scatter(dataset[representatives, 0], dataset[representatives, 1], dataset[representatives, 2],
                       marker='o', c="white", alpha=1, s=200, edgecolor='k')
            # Write the rank of each representative inside its circle.
            for k, representative in enumerate(representatives):
                ax.scatter(dataset[representative, 0], dataset[representative, 1],
                           dataset[representative, 2], marker='$%d$' % k, alpha=1,
                           s=50, edgecolor='k')
        if axesNames is not None:#pragma: no cover
            ax.set_xlabel(axesNames[0])
            ax.set_ylabel(axesNames[1])
            ax.set_zlabel(axesNames[2])
        fig.savefig(fileName + ".png")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
def Plot2DClusteringResultsWithSpecificPoints(fileName, dataset, clusters, specificPoints, sizeSpecificPts=30, axesNames=None, colors=None):
    '''
    Saves a 2D scatter plot of dataset coloured by cluster, with some points
    highlighted by white discs, in fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output file name, without the ".png" extension.
    dataset: 2D array
        Data points in rows; the first two columns are plotted.
    clusters:
        Clustering result, converted to a labels vector with
        Clustering.GetLabelsVectorFromClusters (presumably one integer label
        per data point -- confirm against the Clustering module).
    specificPoints: sequence of int
        Row indices in dataset of the points to highlight.
    sizeSpecificPts: float
        Marker size of the highlighted points.
    axesNames: sequence of str, optional
        Labels of the x and y axes.
    colors: sequence, optional
        One colour per cluster label. A default palette, cycled as needed, is
        used when None.
    '''
    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        palette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                   '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493', '#DAA520', '#808000']
        colors = list(islice(cycle(palette), int(max(labels) + 1)))
    # An array (rather than a plain list) is required to index by the labels vector.
    colors = np.asarray(colors)

    fig, ax = plt.subplots(1, 1)
    try:
        ax.scatter(dataset[:, 0], dataset[:, 1], s=10, c=colors[labels])
        # Highlighted points are drawn last so they stay visible on top.
        ax.scatter(dataset[specificPoints, 0], dataset[specificPoints, 1], marker='o',
                   c="white", alpha=1, s=sizeSpecificPts, edgecolor='k')
        ax.set_aspect('auto')
        if axesNames is not None:#pragma: no cover
            ax.set_xlabel(axesNames[0])
            ax.set_ylabel(axesNames[1])
        fig.savefig(fileName + ".png")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
def Plot3DClusteringResultsWithSpecificPoints(fileName, dataset, clusters, specificPoints, sizeSpecificPts=30, axesNames=None, colors=None):
    '''
    Saves a 3D scatter plot of dataset coloured by cluster, with some points
    highlighted by white discs, in fileName + ".png".

    Parameters
    ----------
    fileName: str
        Output file name, without the ".png" extension.
    dataset: 2D array
        Data points in rows; the first three columns are plotted.
    clusters:
        Clustering result, converted to a labels vector with
        Clustering.GetLabelsVectorFromClusters (presumably one integer label
        per data point -- confirm against the Clustering module).
    specificPoints: sequence of int
        Row indices in dataset of the points to highlight.
    sizeSpecificPts: float
        Marker size of the highlighted points.
    axesNames: sequence of str, optional
        Labels of the x, y and z axes.
    colors: sequence, optional
        One colour per cluster label. A default palette, cycled as needed, is
        used when None.
    '''
    labels = Clustering.GetLabelsVectorFromClusters(clusters)
    if colors is None:
        palette = ['#0000FF', '#008000', '#FF0000', '#FFA500', '#000080', '#FFD700', '#008B8B', '#32CD32',
                   '#808080', '#F08080', '#8B4513', '#00FFFF', '#9400D3', '#FF1493', '#DAA520', '#808000']
        colors = list(islice(cycle(palette), int(max(labels) + 1)))
    # An array (rather than a plain list) is required to index by the labels vector.
    colors = np.asarray(colors)

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(dataset[:, 0], dataset[:, 1], dataset[:, 2], s=10, c=colors[labels])
        # Highlighted points are drawn last so they stay visible on top.
        ax.scatter(dataset[specificPoints, 0], dataset[specificPoints, 1], dataset[specificPoints, 2],
                   marker='o', c="white", alpha=1, s=sizeSpecificPts, edgecolor='k')
        ax.set_aspect('auto')
        if axesNames is not None:#pragma: no cover
            ax.set_xlabel(axesNames[0])
            ax.set_ylabel(axesNames[1])
            ax.set_zlabel(axesNames[2])
        fig.savefig(fileName + ".png")
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
if __name__ == "__main__":# pragma: no cover
    # Script entry point: pass this file to genericROM.RunTestFile
    # (presumably runs the tests associated with this module).
    from genericROM import RunTestFile as _runTestFile
    _runTestFile(__file__)