# Source listing metadata: 583 lines, 26 KiB, Python
# Standard library
import os
import pickle
from collections import Counter, defaultdict
from math import sqrt
from random import shuffle

# Numpy
import numpy as np

# Keras Utility Imports
from keras import backend as K
from keras.utils import plot_model

# Maths
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
try:
    # Prefer the public import path; the private ``t_sne`` module path is kept as a fallback.
    from sklearn.manifold import TSNE
except ImportError:
    from sklearn.manifold.t_sne import TSNE

# Plotting
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from PIL import ImageDraw, Image
"""
Useful links:

The Keras TimeDistributed wrapper applies the SAME layer, which means the same WEIGHTS:
http://machinelearningmastery.com/timedistributed-layer-for-long-short-term-memory-networks-in-python/
"""


class Trainer(object):
    """
    Bundles a Keras auto-encoding model (fitted with its input as target) with its
    encoder and generator parts and offers training plus several plotting helpers
    for the learned latent codes.

    The networks themselves are built elsewhere and handed in through
    :meth:`set_model`, :meth:`set_encoder` and :meth:`set_generator`.
    """

    def __init__(self, mode, trackCollection, classes, categorical_distribution,
                 batchSize=400, timesteps=5, filters=0, rotating=False):
        """
        :param mode: Model flavour; "gumble", "vae" or "refined" (case-insensitive).
        :type mode: str
        :param trackCollection: A list of track collections; any other iterable is turned into a list.
        :param classes: Width of the latent code (multiplied by ``categorical_distribution`` in "gumble" mode).
        :type classes: int
        :param categorical_distribution: Multiplier of the latent width in "gumble" mode.
        :type categorical_distribution: int
        :param batchSize: Batch size used for fitting.
        :type batchSize: int
        :param timesteps: Number of frames per sample sequence.
        :type timesteps: int
        :param filters: Filter count; when 0 it is derived from the frame diagonal.
        :type filters: int
        :param rotating: Forwarded as ``in_walk_dir`` when sampling training data.
        :type rotating: bool
        :raises ValueError: If ``mode`` is unknown or no track collection is given.
        """
        mode = mode.lower()
        if mode not in ['gumble', 'vae', 'refined']:
            raise ValueError('Needs to be eather "gumble", "vae" or "refined"')

        self.mode = mode
        # NOTE(review): list() on a single dict-like collection would yield its keys,
        # not a one-element list - callers presumably always pass a list; confirm.
        self.tc = trackCollection if isinstance(trackCollection, list) else list(trackCollection)
        if not self.tc:
            raise ValueError('At least one TrackCollection is needed')

        self.rotating = rotating
        self.timesteps = timesteps

        self.classes = classes
        self.cD = categorical_distribution
        self.batchSize = batchSize
        self.epochs = 100
        self.epsilon_std = 0.01
        self.tau = K.variable(5.0, name="temperature")
        self.anneal_rate = 0.0003
        self.min_temperature = 0.5

        # Probe one track of the first collection to learn the frame size
        # (3rd and 4th axis of the 5-D sample array).
        firstTrack = list(self.tc[0].keys())[0]
        _, _, height, width, _ = self.tc[0].as_n_sample_4D(self.timesteps, for_track=firstTrack).shape

        self.height = height
        self.width = width
        self.original_dim = self.timesteps * self.width * self.height * 1  # = 5*30*30 = 4500px
        self.filters = filters if filters else int(sqrt(self.width ** 2 + self.height ** 2)) // 2

        self.trained = False
        self.model = None
        self.encoder = None
        self.generator = None

    def set_model(self, model, loss, optimizer='adagrad'):
        """Store ``model``, compile it with ``loss``/``optimizer`` and print its summary."""
        self.model = model
        self.model.compile(optimizer=optimizer, loss=loss)
        self.model.summary()

    def set_generator(self, generator):
        """Store the generator used by :meth:`sample_latent` (it must offer ``predict``)."""
        self.generator = generator

    def set_encoder(self, encoder):
        """Store the encoder callable; it is called with a one-element list of sample batches."""
        self.encoder = encoder

    def load_weights(self, fileName):
        """Load model weights from ``fileName`` and mark the trainer as trained."""
        self.model.load_weights(fileName)
        self.trained = True

    def save_weights(self, fileName):
        """Write the current model weights to ``fileName``."""
        self.model.save_weights(fileName)

    def train(self, fileName=None):
        """
        Fit the model for ``self.epochs`` epochs and anneal the temperature ``self.tau``.

        Per epoch the track keys of all collections are walked in lockstep; for each key
        tuple, samples are drawn until at least ``batchSize`` rows are available, shuffled,
        cut to a multiple of ``batchSize`` and fitted for one Keras epoch.

        :param fileName: If given, weights are written there after every epoch.
        :type fileName: str
        """
        for epoch in range(self.epochs):
            tracklists = [list(collection.keys()) for collection in self.tc]  # TODO: add additional shuffeling!!!
            for keys in zip(*tracklists):
                chunks, nRows = [], 0
                while nRows < self.batchSize:
                    drawn = np.vstack([
                        self.tc[idx].as_n_sample_4D(self.timesteps, in_walk_dir=self.rotating, for_track=key)
                        for idx, key in enumerate(keys)])
                    if drawn.shape[0] == 0:
                        # A track without samples would otherwise spin here forever.
                        break
                    chunks.append(drawn)
                    nRows += drawn.shape[0]
                if not chunks:
                    continue

                data = np.vstack(chunks)
                np.random.shuffle(data)
                smoothing = data.shape[0] // self.batchSize * self.batchSize
                if smoothing:
                    data = data[:smoothing]
                self.model.fit(data, data, shuffle=True, epochs=1, batch_size=self.batchSize)

            annealed = float(K.get_value(self.tau)) * np.exp(-self.anneal_rate * epoch)
            K.set_value(self.tau, max(annealed, self.min_temperature))
            if fileName:
                self.save_weights(fileName)

        # Without this flag every plotting method refuses to run after training.
        self.trained = True

    def plot_model(self, filename, show_shapes=True, show_layer_names=True):
        """Render the model graph to ``filename`` via ``keras.utils.plot_model``."""
        plot_model(self.model, filename, show_shapes=show_shapes, show_layer_names=show_layer_names)

    def color_random_track(self, completeSequence=False, show=True, fileName='', primaryTC=0,
                           nClusters=0, multiPath=False, cMode='kmeans', aMode='none'):
        """
        Fetch a random path from the map of collection ``primaryTC`` and colour it
        with :meth:`color_track`; a ``.tif`` suffix is appended to ``fileName`` if missing.

        :raises EnvironmentError: If the model has not been trained or loaded.
        """
        if not self.trained:
            raise EnvironmentError('Please train this Model first!')

        baseMap = self.tc[primaryTC].map
        baseMap.refresh_random_clock()
        track = baseMap.return_random_path()
        if fileName and not fileName.endswith('.tif'):
            fileName = '%s.tif' % fileName

        return self.color_track(track, show=show, completeSequence=completeSequence, primaryTC=primaryTC,
                                fileName=fileName, nClusters=nClusters, multiPath=multiPath, cMode=cMode,
                                aMode=aMode)

    @staticmethod
    def _stack_test_data(dummyData, sequenceArray, limit=1000):
        """
        Put padding samples in front of ``sequenceArray`` so that about ``limit`` rows
        are clustered, without ever dropping a row of ``sequenceArray`` itself.
        """
        nDummy = min(max(limit - len(sequenceArray), 0), len(dummyData))
        return np.vstack((dummyData[len(dummyData) - nDummy:], sequenceArray))

    def color_track(self, track, completeSequence=False, show=True, fileName='', nClusters=0, multiPath=False,
                    cMode='kmeans', aMode='none', primaryTC=0):
        """
        Cluster the sliding windows of ``track`` in latent space and paint the
        resulting labels onto the base map of collection ``primaryTC``.

        :param track: Sequence of map positions; each element is used as an index into the map image.
        :param completeSequence: Additionally plot every window next to its label colour.
        :type completeSequence: bool
        :param show: Display the figures.
        :type show: bool
        :param fileName: If given, the scatter plot is saved there, the map as ``<stem>_map<ext>``
                         and the optional sequence plot as ``<stem>_sequence.tif``.
        :type fileName: str
        :param multiPath: Return data instead of plotting.
        :type multiPath: bool
        :return: ``(keys, last result row)`` when ``multiPath`` is set, otherwise None.
        :raises EnvironmentError: If the model has not been trained or loaded.
        :raises ValueError: If the track yields no window of ``timesteps`` frames.
        """
        if not self.trained:
            raise EnvironmentError('Please train this Model first!')

        collection = self.tc[primaryTC]
        isoArray = collection.map.isovists.get_items_for_track(track, dim='full', in_walk_dir=True)
        isoArray = isoArray.swapaxes(0, 2)[..., None].transpose((0, 2, 1, 3))
        smoothing = (isoArray.shape[0] // self.timesteps) * self.timesteps
        isoArray = isoArray[:smoothing]

        nWindows = len(isoArray) - self.timesteps
        if nWindows <= 0:
            raise ValueError('Track is too short for %d timesteps' % self.timesteps)
        sequenceArray = np.array([isoArray[i:i + self.timesteps] for i in range(nWindows)])

        # Random samples of the collection pad the clustering input.
        dummyData = collection.as_n_sample_4D(self.timesteps).astype(int)
        dummyData = dummyData.reshape((-1, self.timesteps, self.height, self.width, 1))
        testdata = self._stack_test_data(dummyData, sequenceArray)

        # Each window is attributed to the position in its middle.
        half = self.timesteps // 2
        keys = [track[i + half] for i in range(nWindows)]

        tsneArray = self.reduce_and_color(testData=testdata, nClusters=nClusters,
                                          primaryTC=primaryTC, aMode=aMode,
                                          cMode=cMode)[-nWindows:] + 1  # This is for color correction
        npmax = np.max(tsneArray[:, -1]) + 1
        figure = np.where(collection.map.imgArray > 0, npmax, 0)

        for key, label in zip(keys, tsneArray[:, -1]):
            figure[key] = label

        if multiPath:
            # NOTE(review): this hands back only the LAST row, not one label per key - confirm intent.
            return keys, tsneArray[-1]

        mapFile = ''
        if fileName:
            # Separate file for the map; otherwise the scatter plot below overwrites it.
            stem, ext = os.path.splitext(fileName)
            mapFile = '%s_map%s' % (stem, ext)
        self.print_n_show(figure, 'img', npmax, fileName=mapFile, show=show)
        self.print_n_show(tsneArray, 'scatter', npmax, fileName=fileName, show=show)

        if completeSequence:
            if fileName:
                fileName = '%s_sequence.tif' % os.path.splitext(fileName)[0]
            self.__colored_sequence(tsneArray, isoArray, maxVal=npmax, show=show, fileName=fileName)

    def __colored_sequence(self, tsneArray, isoArray, maxVal=0, show=True, fileName=''):
        """
        Plot all the Isovist sequences for a Track, each next to its class color.

        :param tsneArray: One row per sequence; the last column holds the class label.
        :type tsneArray: np.ndarray
        :param isoArray: Frames of the track; window ``i`` is ``isoArray[i:i + timesteps]``.
        :type isoArray: np.ndarray
        :param maxVal: Upper end of the colour scale; 0 means "largest label in tsneArray".
        :type maxVal: int
        :param show: Display the figure.
        :type show: bool
        :param fileName: If given, the figure is saved there (".tif" is appended if missing).
        :type fileName: str
        :return: None
        :raises EnvironmentError: If the model has not been trained or loaded.
        """
        if not self.trained:
            raise EnvironmentError('Please train this Model first!')
        if maxVal == 0:
            maxVal = np.max(tsneArray[:, -1])

        spacing = 2
        stripWidth = self.width * (self.timesteps + 1)
        figure = np.full((spacing + len(tsneArray) * (self.height + spacing), stripWidth), 2)

        # One strip per sequence: the frames side by side, followed by a tile in the label colour.
        for i in range(len(tsneArray)):
            strip = np.full((self.height, stripWidth), tsneArray[i, -1])
            frames = isoArray[i:i + self.timesteps].swapaxes(0, 1).reshape(
                (self.height, self.timesteps * self.width))
            strip[:, 0:-self.width] = np.where(frames > 0, maxVal, 0)

            top = i * (self.height + spacing)
            figure[top:top + self.height, :] = strip

        if fileName and not fileName.endswith('.tif'):
            fileName = '%s.tif' % fileName
        self.print_n_show(figure, 'img', maxVal, show=show, fileName=fileName)

    @staticmethod
    def _colorbar_ticks(upper, maxValue):
        """Return ``maxValue + 1`` evenly spaced ticks on [0, upper]; float ``maxValue`` is rounded."""
        steps = max(int(round(float(maxValue))), 0) + 1
        return np.linspace(0, upper, steps)

    @staticmethod
    def print_n_show(x, mode, maxValue, show=True, fileName=''):
        """
        Draw ``x`` with matplotlib, optionally save and/or display it.

        :param x: Image data ("img"/"fig"/"rgba"), an array whose first two columns are coordinates
                  and whose last column is the colour ("scatter"), or a mapping label -> value ("bars").
        :param mode: "img", "fig", "rgba", "scatter" or "bars".
        :type mode: str
        :param maxValue: Upper end of the colour scale; also determines the number of colourbar ticks.
        :param show: Display the figure.
        :type show: bool
        :param fileName: If given, the figure is saved there.
        :type fileName: str
        :return: True
        :rtype: bool
        :raises ValueError: If ``mode`` is unknown.
        """
        if mode not in ('img', 'fig', 'rgba', 'scatter', 'bars'):
            raise ValueError('Mode needs to be "img", "fig", "bars", "rgba" or "scatter".')

        fig, ax = plt.subplots()
        if mode in ('img', 'fig'):
            pic = ax.imshow(x, cmap='gist_ncar', vmin=0, vmax=maxValue)
            fig.colorbar(pic, ax=ax, spacing='proportional',
                         ticks=Trainer._colorbar_ticks(maxValue, maxValue))
        elif mode == 'rgba':
            pic = ax.imshow(x, cmap='gist_ncar', vmax=255)
            fig.colorbar(pic, ax=ax, spacing='proportional',
                         ticks=Trainer._colorbar_ticks(255, maxValue))
        elif mode == 'scatter':
            scat = ax.scatter(x[:, 0], x[:, 1], c=x[:, -1])
            fig.colorbar(scat, ax=ax, spacing='proportional',
                         ticks=Trainer._colorbar_ticks(maxValue, maxValue))
        else:  # 'bars'
            objects = list(x.keys())
            y_pos = list(range(len(objects)))
            ax.bar(y_pos, [x[key] for key in objects], align='center')
            ax.set_xticks(y_pos)
            ax.set_xticklabels(objects)

        fig.tight_layout()
        # Save before showing: once a blocking show() returns the figure may already be gone.
        if fileName:
            fig.savefig(fileName)
        if show:
            plt.show()
        else:
            # Nothing will display this figure any more, so release it.
            plt.close(fig)

        return True

    def _encode(self, data, batch=100):
        """
        Run ``data`` through ``self.encoder`` in slices of ``batch`` samples and
        return the first encoder output flattened to one latent row per sample.

        :raises EnvironmentError: If no encoder has been set.
        """
        if self.encoder is None:
            raise EnvironmentError('Please set an encoder first!')
        latentDim = self.cD * self.classes if self.mode == 'gumble' else self.classes
        codes = np.zeros((len(data), latentDim))
        for start in range(0, len(data), batch):
            stop = start + batch
            codes[start:stop] = self.encoder([data[start:stop]])[0].reshape(-1, latentDim)
        return codes

    def reduce_and_color(self, testData=None, aMode='tsne', nClusters=0, cMode='kmeans', eps=5, primaryTC=0):
        """
        Encode samples, optionally reduce the latent codes, cluster them and
        append the cluster label as an extra column.

        :param testData: Numpy array, shape (n, timesteps, height, width, 1); if it is not an
                         ndarray, samples are drawn from collection ``primaryTC``.
        :type testData: np.ndarray
        :param eps: ``eps`` of DBSCAN, only used when cMode='dbscan'.
        :type eps: int
        :param aMode: Dimensional reduction mode, default='tsne', others='pca', 'none'
        :type aMode: str
        :param cMode: Clustering mode, default='kmeans', other='DBSCAN'
        :type cMode: str
        :param nClusters: Number of Clusters for kmeans-clustering if 0 nClusters=self.classes
        :type nClusters: int
        :param primaryTC: Index Number of the TrackCollection used for Basemap etc.
        :type primaryTC: int
        :return: Numpy Array, one row per sample: (reduced coordinates..., label)
        :rtype: np.ndarray
        :raises ValueError: On a wrong sample shape or an unknown aMode / cMode.
        """
        sampleShape = (self.timesteps, self.height, self.width, 1)
        if isinstance(testData, np.ndarray):
            if testData.shape[1:] != sampleShape:
                raise ValueError('Shape must be (n, timesteps, height, width, 1), but was ', testData.shape)
        else:
            testData = self.tc[primaryTC].as_n_sample_4D(self.timesteps)
            testData = testData.reshape((-1,) + sampleShape)

        codes = self._encode(testData)

        aMode = aMode.lower()
        if aMode == 'tsne':
            array = TSNE(metric='hamming').fit_transform(codes)
        elif aMode == 'pca':
            array = PCA(n_components=self.classes).fit_transform(codes)
        elif aMode == 'none':
            array = codes
        else:
            raise ValueError('"aMode" needs to be either "pca", "tsne" or "none".')

        cMode = cMode.lower()
        if cMode == 'dbscan':
            labels = DBSCAN(eps=eps, min_samples=10).fit_predict(array)
        elif cMode in ['kmeans', 'kmean', 'k-mean', 'k-means']:
            nClusters = nClusters if nClusters > 0 else self.classes
            labels = KMeans(n_clusters=nClusters).fit_predict(array)
        else:
            raise ValueError('"cMode" needs to be either "kmeans" or "dbscan".')

        if len(np.unique(labels)) > 1:
            color = labels
        else:
            # Only one cluster was found: derive a colour from the first two coordinates instead.
            # NOTE(review): formula kept as found ("+" and "//" look like an intended
            # min-max normalisation) - confirm before relying on these colours.
            X = array[:, 0] + np.min(array[:, 0])
            Y = (np.min(array[:, 1]) + array[:, 1]) // (np.max(array[:, 1]) + np.min(array[:, 1]))
            color = X * Y

        return np.column_stack((array, color))

    def viz_clusters(self, aMode='pca', cMode='kmeans', testdata=None, fileName=''):
        """Scatter-plot the output of :meth:`reduce_and_color`; always returns True."""
        dataArray = self.reduce_and_color(aMode=aMode, cMode=cMode, testData=testdata)

        self.print_n_show(dataArray, 'scatter', np.max(dataArray[:, -1]), fileName=fileName)
        return True

    @staticmethod
    def _place_tile(figure, tile, index, perRow, spacing):
        """Copy ``tile`` into grid cell ``index`` (row-major, ``perRow`` cells per row) of ``figure``."""
        row, col = divmod(index, perRow)
        tileH, tileW = tile.shape
        top = row * (tileH + spacing)
        left = col * (tileW + spacing)
        figure[top:top + tileH, left:left + tileW] = tile

    # THIS WORKS in all modes
    def show_prediction(self, n, dataArray=None, show=True, fileName='', startI=0, primaryTC=0):
        """
        Plot ``n`` input sequences, each above the model's reconstruction of it.

        :param n: Number of sequences to plot.
        :type n: int
        :param dataArray: Samples to use; if not an ndarray they are drawn from collection ``primaryTC``.
        :type dataArray: np.ndarray
        :param show: Display the figure.
        :type show: bool
        :param fileName: If given, the figure is saved there (".tif" is appended if missing).
        :type fileName: str
        :param startI: Index of the first sample to use.
        :type startI: int
        :return: True
        :rtype: bool
        :raises ValueError: If neither ``show`` nor ``fileName`` is set.
        """
        if not fileName and not show:
            raise ValueError('Why are you doing this? Print smth or show it!')
        if not isinstance(dataArray, np.ndarray):
            dataArray = self.tc[primaryTC].as_n_sample_4D(self.timesteps)

        seqWidth = self.width * self.timesteps
        spacing = 1
        sqrtDim = int(sqrt(n)) + 1
        figure = np.zeros(((self.height * 2 + spacing) * sqrtDim, (seqWidth + spacing) * sqrtDim))

        for i in range(n):
            array = dataArray[i + startI]
            arr_h = self.model.predict(array.reshape((1, self.timesteps, self.height, self.width, 1)))

            # Upper half: input sequence, lower half: reconstruction.
            tile = np.ones((self.height * 2, seqWidth))
            tile[:self.height] = array.reshape(seqWidth, self.height).swapaxes(0, 1)
            tile[self.height:] = arr_h.reshape(seqWidth, self.height).swapaxes(0, 1)
            self._place_tile(figure, tile, i, sqrtDim, spacing)

        if fileName and not fileName.endswith('.tif'):
            fileName = '%s.tif' % fileName
        # A single call covers both saving and showing, so ``show=False`` is honoured.
        self.print_n_show(figure, 'img', maxValue=np.max(figure), show=show, fileName=fileName)

        return True

    def sample_latent(self, nSamples, show=True, fileName=''):
        """
        Draw ``nSamples`` random latent codes, decode them with the generator and
        plot each code above the generated sequence.

        :param nSamples: Number of latent codes to draw.
        :type nSamples: int
        :param show: Display the figure.
        :type show: bool
        :param fileName: If given, the figure is saved there (".tif" is appended if missing).
        :type fileName: str
        :return: True
        :rtype: bool
        :raises ValueError: If ``self.mode`` is unknown.
        :raises NotImplementedError: If the latent code does not fit into one tile.
        """
        if self.mode not in ['gumble', 'vae', 'refined']:
            raise ValueError('Needs to be either of "gumble", "vae", "refined"')
        if self.classes >= self.height:
            raise NotImplementedError('This cannot be shown, edit the Funciton!')

        seqWidth = self.width * self.timesteps
        spacing = 1
        sqrtDim = int(sqrt(nSamples)) + 1

        if self.mode == 'gumble':
            # The code is drawn inside ONE tile, so the tile width is the limit.
            if self.cD >= seqWidth:
                raise NotImplementedError('This cannot be shown, please edit the Function!!!')
            lHSpace = (self.height - self.classes) // 2
            lWSpace = (seqWidth - self.cD) // 2
            lShape = (self.classes, self.cD)
        else:
            if self.classes >= seqWidth:
                raise NotImplementedError('To many Samples, this cannot be displayed, please edit the Function!!!')
            lHSpace = (self.height - 1) // 2
            lWSpace = (seqWidth - self.classes) // 2
            draw = np.random.randn if self.mode == 'vae' else np.random.rand

        figure = np.zeros(((self.height * 2 + spacing) * sqrtDim, (seqWidth + spacing) * sqrtDim))

        for i in range(nSamples):
            tile = np.ones((self.height * 2, seqWidth))

            if self.mode == 'gumble':
                # https://stackoverflow.com/a/42874726/7746808
                oneHot = np.eye(self.classes)[np.random.randint(0, self.classes, self.cD)]
                sample = oneHot.reshape((-1, self.classes * self.cD))
                # NOTE(review): this reshape re-reads the flat code as (classes, cD); it is
                # not a transpose of ``oneHot`` - confirm the intended display layout.
                tile[lHSpace:lHSpace + self.classes, lWSpace:lWSpace + self.cD] = \
                    sample.swapaxes(0, 1).reshape(lShape)
            else:
                # One row is all that is needed per tile.
                sample = draw(1, self.classes)
                tile[lHSpace:lHSpace + 1, lWSpace:lWSpace + self.classes] = sample

            arr_h = self.generator.predict(sample)
            tile[self.height:] = arr_h.reshape(seqWidth, self.height).swapaxes(0, 1)
            self._place_tile(figure, tile, i, sqrtDim, spacing)

        if fileName and not fileName.endswith('.tif'):
            fileName = '%s.tif' % fileName
        # A single call covers both saving and showing, so ``show=False`` is honoured.
        self.print_n_show(figure, 'img', maxValue=np.max(figure), show=show, fileName=fileName)
        return True

    def multi_path_coloring(self, nClusters, fileName='', state='', primaryTC=0, uncertainty=False, rgba=False):
        """
        Encode the moving windows of every track of collection ``primaryTC``, cluster
        all latent codes with k-means and paint the labels per map position.

        :param nClusters: Number of k-means clusters, must be larger than 2.
        :type nClusters: int
        :param fileName: Pickle file used together with ``state``.
        :type fileName: str
        :param state: "load" reads the latent codes from ``fileName`` instead of encoding,
                      "dump" writes them there after encoding; anything else does neither.
        :type state: str
        :param primaryTC: Index Number of the TrackCollection used for Basemap etc.
        :type primaryTC: int
        :param uncertainty: Additionally plot how many distinct labels each position received.
        :type uncertainty: bool
        :param rgba: Overlay all labels semi-transparently instead of painting the most common one.
        :type rgba: bool
        :return: None
        :raises ValueError: If ``nClusters`` <= 2 or no latent codes are available.
        """
        if nClusters <= 2:
            raise ValueError('More than 2 Classes are needed')

        collection = self.tc[primaryTC]
        state = state.lower()

        if fileName and state == 'load':
            # NOTE: unpickling executes code embedded in the file - only load dumps you trust.
            with open(fileName, 'rb') as file:
                patchDict = pickle.load(file)
        else:
            patchDict = defaultdict(list)
            for key in collection.keys():
                tempKeys, tempSequence = collection.as_n_sample_4D(self.timesteps,
                                                                   in_walk_dir=True,
                                                                   keys=True,
                                                                   moving_window=True,
                                                                   for_track=key)
                for tempKey, code in zip(tempKeys, self._encode(tempSequence)):
                    patchDict[tempKey].append(list(code))

            if fileName and state == 'dump':
                with open(fileName, 'wb') as file:
                    pickle.dump(patchDict, file, pickle.HIGHEST_PROTOCOL)

        # Flatten to parallel lists; assumes every key is a (row, col) pair - TODO confirm.
        coords, latents = [], []
        for coord, codes in patchDict.items():
            for code in codes:
                coords.append((int(coord[-2]), int(coord[-1])))
                latents.append(code)
        if not latents:
            raise ValueError('No latent codes available for clustering')

        labels = KMeans(nClusters).fit_predict(np.array(latents)) + 1  # Color Correction

        labelDict = defaultdict(list)
        for coord, label in zip(coords, labels):
            labelDict[coord].append(int(label))

        # How many positions received 1, 2, 3, ... distinct labels.
        disagreement = Counter(len(set(found)) for found in labelDict.values())

        npmax = int(np.max(labels)) + 1
        # npmax = 4
        self.print_n_show(disagreement, 'bars', npmax)

        if rgba:
            # PIL needs an 8 bit array; the plain np.where() result is a platform integer.
            base = np.where(collection.map.imgArray > 0, 255, 0).astype(np.uint8)
            figure = Image.fromarray(base).convert('RGBA')
            draw = ImageDraw.Draw(figure, 'RGBA')
            # At least 12 discrete colors, more if there are more labels than that.
            cmap = plt.get_cmap('gist_ncar', max(12, nClusters + 1))

            for coord, found in labelDict.items():
                for value in found:
                    color = tuple(int(channel * 255) for channel in cmap(value, alpha=0.3))
                    draw.point((coord[1], coord[0]), fill=color)

            self.print_n_show(figure, 'rgba', npmax)
        else:
            figure = np.where(collection.map.imgArray > 0, npmax, 0)
            for coord, found in labelDict.items():
                figure[coord] = Counter(found).most_common(1)[0][0]
            self.print_n_show(figure, 'img', npmax)

        if uncertainty:
            uncertainfig = np.where(collection.map.imgArray > 0, npmax + 1, 0)
            for coord, found in labelDict.items():
                uncertainfig[coord] = len(set(found)) * 2
            self.print_n_show(uncertainfig, 'img', npmax + 1)
        return

    def show_silhouette_score(self, k_list, primaryTC=0):
        """
        Plot a silhouette analysis of k-means on the latent codes of (at most) the
        first 100 tracks of collection ``primaryTC``, one figure per entry of ``k_list``.

        :param k_list: Cluster counts to evaluate.
        :type k_list: list
        :param primaryTC: Index Number of the TrackCollection to encode.
        :type primaryTC: int
        :raises ValueError: If the collection holds no track.
        """
        collection = self.tc[primaryTC]
        encoded = [
            self._encode(collection.as_n_sample_4D(self.timesteps,
                                                   in_walk_dir=True,
                                                   keys=False,
                                                   moving_window=True,
                                                   for_track=key))
            for key in list(collection.keys())[:100]]
        if not encoded:
            raise ValueError('No tracks available for a silhouette analysis')
        X = np.vstack(encoded)

        spectral = plt.get_cmap('nipy_spectral')

        for n_clusters in k_list:
            # Create a subplot with 1 row and 2 columns
            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(18, 7)

            # The 1st subplot is the silhouette plot
            # The silhouette coefficient can range from -1, 1 but in this example all
            # lie within [-0.1, 1]
            ax1.set_xlim([-0.1, 1])
            # The (n_clusters+1)*10 is for inserting blank space between silhouette
            # plots of individual clusters, to demarcate them clearly.
            ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

            # Initialize the clusterer with n_clusters value and a random generator
            # seed of 10 for reproducibility.
            clusterer = KMeans(n_clusters=n_clusters, random_state=10)
            cluster_labels = clusterer.fit_predict(X)

            # The silhouette_score gives the average value for all the samples.
            # This gives a perspective into the density and separation of the formed
            # clusters
            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters =", n_clusters,
                  "The average silhouette_score is :", silhouette_avg)

            # Compute the silhouette scores for each sample
            sample_silhouette_values = silhouette_samples(X, cluster_labels)

            y_lower = 10
            for i in range(n_clusters):
                # Aggregate the silhouette scores for samples belonging to
                # cluster i, and sort them
                ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
                ith_cluster_silhouette_values.sort()

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                color = spectral(float(i) / n_clusters)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0, ith_cluster_silhouette_values,
                                  facecolor=color, edgecolor=color, alpha=0.7)

                # Label the silhouette plots with their cluster numbers at the middle
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Compute the new y_lower for next plot
                y_lower = y_upper + 10  # 10 for the 0 samples

            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # The vertical line for average silhouette score of all the values
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            # 2nd Plot showing the actual clusters formed (first two latent dimensions)
            colors = spectral(cluster_labels.astype(float) / n_clusters)
            ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                        c=colors)

            # Labeling the clusters
            centers = clusterer.cluster_centers_
            # Draw white circles at cluster centers
            ax2.scatter(centers[:, 0], centers[:, 1],
                        marker='o', c="white", alpha=1, s=200)

            for i, center in enumerate(centers):
                ax2.scatter(center[0], center[1], marker='$%d$' % i, alpha=1, s=50)

            ax2.set_title("The visualization of the clustered data.")
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")

            plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                          "with n_clusters = %d" % n_clusters),
                         fontsize=14, fontweight='bold')

        plt.show()