2017-07-22 21:13:27 +02:00

583 lines
26 KiB
Python

# Keras Utility Imports
from keras import backend as K
from keras.utils import plot_model
from collections import defaultdict
from random import shuffle
# Numpy
import numpy as np
# Maths
from math import sqrt
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold.t_sne import TSNE
from sklearn.decomposition import PCA
# Plotting
import matplotlib.pyplot as plt
from PIL import ImageDraw, Image
import matplotlib.cm as cm
"""
Useful links:
The Keras TimeDistributed wrapper applies the SAME layer, i.e. the same WEIGHTS, to every timestep:
http://machinelearningmastery.com/timedistributed-layer-for-long-short-term-memory-networks-in-python/
"""
class Trainer(object):
    """
    Train a sequence auto-encoder on track collections and visualise its latent space.

    The class wraps three externally built objects that are injected through
    :meth:`set_model`, :meth:`set_encoder` and :meth:`set_generator`:
    ``model`` (fitted with its input as its own target), ``encoder`` (called as
    ``encoder([batch])[0]``) and ``generator`` (called via ``generator.predict``).

    Supported modes are ``'gumble'`` (sic), ``'vae'`` and ``'refined'``. In ``'gumble'`` mode
    a latent code has ``categorical_distribution * classes`` entries, otherwise ``classes``.
    """

    # Number of samples pushed through the encoder per call.
    _ENCODER_BATCH = 100

    def __init__(self, mode, trackCollection, classes, categorical_distribution,
                 batchSize=400, timesteps=5, filters=0, rotating=False):
        """
        :param mode: One of "gumble", "vae" or "refined" (case-insensitive).
        :param trackCollection: List (or other iterable) of track collections. Each item must offer
            ``keys()`` and ``as_n_sample_4D(...)``; the latter has to return a 5-D array.
        :param classes: Number of latent classes.
        :param categorical_distribution: Number of categorical distributions (used in "gumble" mode).
        :param batchSize: Batch size handed to ``model.fit``.
        :param timesteps: Number of frames per sequence.
        :param filters: Filter count; if 0 it is derived from the frame size.
        :param rotating: Forwarded as ``in_walk_dir`` when sampling training data.
        :raises ValueError: If ``mode`` is unknown or ``trackCollection`` is empty.
        """
        if mode.lower() not in ['gumble', 'vae', 'refined']:
            raise ValueError('Needs to be eather "gumble", "vae" or "refined"')
        self.mode = mode.lower()
        self.tc = trackCollection if isinstance(trackCollection, list) else list(trackCollection)
        if not self.tc:
            raise ValueError('"trackCollection" must contain at least one track collection.')
        self.rotating = rotating
        self.timesteps = timesteps
        self.classes = classes
        self.cD = categorical_distribution
        self.batchSize = batchSize
        self.epochs = 100
        self.epsilon_std = 0.01
        self.tau = K.variable(5.0, name="temperature")
        self.anneal_rate = 0.0003
        self.min_temperature = 0.5

        # Frame height / width are read from one sample of the first track.
        firstTrack = list(self.tc[0].keys())[0]
        _, _, height, width, _ = self.tc[0].as_n_sample_4D(self.timesteps, for_track=firstTrack).shape
        self.height = height
        self.width = width
        self.original_dim = self.timesteps * self.width * self.height * 1  # e.g. 5*30*30 = 4500px
        # Default filter count: half of the frame diagonal.
        self.filters = int(sqrt(self.width ** 2 + self.height ** 2)) // 2 if not filters else filters

        self.trained = False
        self.model = None
        self.encoder = None
        self.generator = None

    # ------------------------------------------------------------------ helpers

    def _latent_width(self):
        """Return the number of entries of one flattened latent code for the current mode."""
        return self.cD * self.classes if self.mode == 'gumble' else self.classes

    def _encode(self, data):
        """
        Run ``data`` through ``self.encoder`` in chunks and return the flattened codes.

        :param data: Array-like whose first axis enumerates the samples.
        :return: Array of shape ``(len(data), latent width)``.
        :rtype: np.ndarray
        """
        width = self._latent_width()
        codes = np.zeros((len(data), width))
        for start in range(0, len(data), self._ENCODER_BATCH):
            stop = start + self._ENCODER_BATCH
            codes[start:stop] = self.encoder([data[start:stop]])[0].reshape(-1, width)
        return codes

    def _tile_slices(self, index, sqrtDim, seqWidth, spacing):
        """Return the (row, column) slices of tile ``index`` in a ``sqrtDim`` wide grid of double-height tiles."""
        row, col = divmod(index, sqrtDim)
        tileHeight = self.height * 2
        top = row * (tileHeight + spacing)
        left = col * (seqWidth + spacing)
        return slice(top, top + tileHeight), slice(left, left + seqWidth)

    @staticmethod
    def _derived_file_name(fileName, suffix):
        """Insert ``suffix`` before the extension of ``fileName`` (".tif" is used if there is none)."""
        import os
        stem, ext = os.path.splitext(fileName)
        return '%s%s%s' % (stem, suffix, ext or '.tif')

    def _collect_batch(self, keys):
        """
        Gather at least ``batchSize`` samples for one tuple of track keys.

        Samples are drawn repeatedly until enough rows are available, then shuffled and
        trimmed to a multiple of ``batchSize``.

        :param keys: One track key per track collection, in the order of ``self.tc``.
        :return: The training array, or None if the tracks yield no samples at all.
        """
        chunks = []
        rows = 0
        while rows < self.batchSize:
            samples = np.vstack([
                self.tc[idx].as_n_sample_4D(self.timesteps, in_walk_dir=self.rotating, for_track=key)
                for idx, key in enumerate(keys)])
            if samples.shape[0] == 0:
                # Nothing to draw from; looping on would never terminate.
                return None
            chunks.append(samples)
            rows += samples.shape[0]
        data = np.vstack(chunks)
        np.random.shuffle(data)
        return data[:rows // self.batchSize * self.batchSize]

    # ------------------------------------------------------------ model plumbing

    def set_model(self, model, loss, optimizer='adagrad'):
        """Store ``model``, compile it with ``loss`` / ``optimizer`` and print its summary."""
        self.model = model
        self.model.compile(optimizer=optimizer, loss=loss)
        self.model.summary()

    def set_generator(self, generator):
        """Store the generator (decoder) used by :meth:`sample_latent`."""
        self.generator = generator

    def set_encoder(self, encoder):
        """Store the encoder callable used for all latent-space analyses."""
        self.encoder = encoder

    def load_weights(self, fileName):
        """Load model weights from ``fileName`` and mark the trainer as trained."""
        self.model.load_weights(fileName)
        self.trained = True

    def save_weights(self, fileName):
        """Save the model weights to ``fileName``."""
        self.model.save_weights(fileName)

    def train(self, fileName=None):
        """
        Fit the model for ``self.epochs`` epochs and mark the trainer as trained.

        Per epoch the tracks of all collections are walked in lockstep; every key tuple
        yields one ``fit`` call with the data as its own target. After each epoch the
        temperature ``tau`` is annealed (never below ``min_temperature``) and, if
        ``fileName`` is given, the weights are written to it.

        :param fileName: Optional weights file, overwritten after every epoch.
        """
        for epoch in range(self.epochs):
            tracklists = [list(x.keys()) for x in self.tc]  # TODO: add additional shuffling
            for keys in zip(*tracklists):
                data = self._collect_batch(keys)
                if data is None:
                    continue
                self.model.fit(data, data, shuffle=True, epochs=1, batch_size=self.batchSize)
            K.set_value(self.tau,
                        np.max([K.get_value(self.tau) * np.exp(- self.anneal_rate * epoch),
                                self.min_temperature]))
            if fileName:
                self.save_weights(fileName)
        # Without this flag every visualisation method refuses to run after training.
        self.trained = True

    def plot_model(self, filename, show_shapes=True, show_layer_names=True):
        """Render the model graph to ``filename`` via ``keras.utils.plot_model``."""
        plot_model(self.model, filename, show_shapes=show_shapes, show_layer_names=show_layer_names)

    # ------------------------------------------------------------ track colouring

    def color_random_track(self, completeSequence=False, show=True, fileName='', primaryTC=0,
                           nClusters=0, multiPath=False, cMode='kmeans', aMode='none'):
        """
        Colour a random path of the primary collection's map; see :meth:`color_track`.

        :param fileName: Optional output file; ".tif" is appended if missing.
        :raises EnvironmentError: If the model has not been trained / loaded yet.
        """
        if not self.trained:
            raise EnvironmentError('Please train this Model first!')
        self.tc[primaryTC].map.refresh_random_clock()
        track = self.tc[primaryTC].map.return_random_path()
        if fileName:
            fileName = fileName if fileName.endswith('.tif') else '%s.tif' % fileName
        return self.color_track(track, show=show, completeSequence=completeSequence, primaryTC=primaryTC,
                                fileName=fileName, nClusters=nClusters, multiPath=multiPath, cMode=cMode,
                                aMode=aMode)

    def color_track(self, track, completeSequence=False, show=True, fileName='', nClusters=0, multiPath=False,
                    cMode='kmeans', aMode='none', primaryTC=0):
        """
        Cluster the sliding windows of ``track`` and paint the cluster labels onto the map.

        The track windows are clustered together with samples of the primary collection,
        so that the clustering has enough data to work with.

        :param track: Sequence of map positions; each position is used to index the map image.
        :param completeSequence: Additionally render every window next to its colour.
        :param show: Display the figures.
        :param fileName: If set, the map is saved there, the scatter plot to "<name>_scatter<ext>"
            and the optional sequence plot to "<name>_sequence<ext>".
        :param nClusters: Forwarded to :meth:`reduce_and_color`.
        :param multiPath: If True, return instead of plotting.
        :param cMode: Clustering mode, forwarded to :meth:`reduce_and_color`.
        :param aMode: Reduction mode, forwarded to :meth:`reduce_and_color`.
        :param primaryTC: Index of the track collection providing map and samples.
        :return: ``(keys, last row of the result array)`` if ``multiPath`` else None.
        :raises EnvironmentError: If the model has not been trained / loaded yet.
        :raises ValueError: If the track is too short to form a single window.
        """
        if not self.trained:
            raise EnvironmentError('Please train this Model first!')
        collection = self.tc[primaryTC]
        isoArray = collection.map.isovists.get_items_for_track(
            track, dim='full', in_walk_dir=True).swapaxes(0, 2)[..., None].transpose((0, 2, 1, 3))
        smoothing = (isoArray.shape[0] // self.timesteps) * self.timesteps
        isoArray = isoArray[:smoothing]

        nWindows = len(isoArray) - self.timesteps
        if nWindows <= 0:
            raise ValueError('Track is too short to build a sequence of %d timesteps.' % self.timesteps)
        sequenceArray = np.array([isoArray[i:i + self.timesteps] for i in range(nWindows)])

        dummyData = collection.as_n_sample_4D(self.timesteps).astype(int)
        dummyData = dummyData.reshape((-1, self.timesteps, self.height, self.width, 1))
        # Keep at most 1000 rows, but never fewer than the track itself needs.
        testdata = np.vstack((dummyData, sequenceArray))[-max(1000, nWindows):]

        # Each window is attributed to the position at its centre.
        keys = [track[i + self.timesteps // 2] for i in range(nWindows)]
        tsneArray = self.reduce_and_color(testData=testdata, nClusters=nClusters,
                                          primaryTC=primaryTC, aMode=aMode,
                                          cMode=cMode)[-nWindows:] + 1  # This is for color correction
        npmax = np.max(tsneArray[:, -1]) + 1
        figure = np.where(collection.map.imgArray > 0, npmax, 0)
        for key, label in zip(keys, tsneArray[:, -1]):
            figure[key] = label

        if multiPath:
            # NOTE(review): this returns only the LAST row; "tsneArray[:, -1]" (one label per key)
            # may have been intended - kept unchanged for callers.
            return keys, tsneArray[-1]

        scatterName = self._derived_file_name(fileName, '_scatter') if fileName else ''
        self.print_n_show(figure, 'img', npmax, fileName=fileName, show=show)
        self.print_n_show(tsneArray, 'scatter', npmax, fileName=scatterName, show=show)
        if completeSequence:
            sequenceName = self._derived_file_name(fileName, '_sequence') if fileName else ''
            self.__colored_sequence(tsneArray, isoArray, maxVal=npmax, show=show, fileName=sequenceName)

    def __colored_sequence(self, tsneArray, isoArray, maxVal=0, show=True, fileName=''):
        """
        Render all sequences of a track stacked vertically, each next to its class colour.

        :param tsneArray: Result rows whose last column holds the class colour of each sequence.
        :type tsneArray: np.ndarray
        :param isoArray: Frames of the track; sequence ``i`` consists of frames ``i .. i + timesteps``.
        :type isoArray: np.ndarray
        :param maxVal: Upper colour bound; 0 means "use the largest class colour".
        :type maxVal: int or float
        :param show: Display the figure.
        :type show: bool
        :param fileName: Optional output file; ".tif" is appended if missing.
        :type fileName: str
        :return: None
        :raises EnvironmentError: If the model has not been trained / loaded yet.
        """
        if not self.trained:
            raise EnvironmentError('Please train this Model first!')
        if maxVal == 0:
            maxVal = np.max(tsneArray[:, -1])
        spacing = 2
        rowHeight = self.height + spacing
        figure = np.full((spacing + len(tsneArray) * rowHeight, (self.timesteps + 1) * self.width), 2)
        for i in range(len(tsneArray)):
            # The right-most tile keeps the class colour, the others show the frames.
            backGround = np.full((self.height, self.width * (self.timesteps + 1)), tsneArray[i, -1])
            sequence = isoArray[i:i + self.timesteps].swapaxes(0, 1).reshape((
                self.height, self.timesteps * self.width))
            backGround[:, 0:-self.width] = np.where(sequence > 0, maxVal, 0)
            figure[i * rowHeight: i * rowHeight + self.height, :] = backGround
        if fileName:
            fileName = fileName if fileName.endswith('.tif') else '%s.tif' % fileName
        self.print_n_show(figure, 'img', maxVal, show=show, fileName=fileName)

    @staticmethod
    def print_n_show(x, mode, maxValue, show=True, fileName=''):
        """
        Plot ``x`` and optionally save and / or show the figure.

        :param x: Image array ("img" / "fig"), image ("rgba"), array with x, y and colour in
            columns 0, 1 and -1 ("scatter") or a mapping of label -> value ("bars").
        :param mode: One of "img", "fig", "rgba", "scatter" or "bars".
        :param maxValue: Upper colour bound; also determines the number of colourbar ticks.
        :param show: Display the figure.
        :param fileName: If set, the figure is saved to this file.
        :return: True
        :raises ValueError: If ``mode`` is unknown (raised before any figure is created).
        """
        if mode not in ('img', 'fig', 'rgba', 'scatter', 'bars'):
            raise ValueError('Mode needs to be "img", "fig", "bars", "rgba" or "scatter".')

        fig, ax = plt.subplots()
        if mode == 'bars':
            objects = list(x.keys())
            ax.bar(list(range(len(objects))), [x[key] for key in objects], align='center')
        else:
            # np.linspace needs an integer sample count, maxValue often is a numpy float.
            nTicks = int(maxValue) + 1
            if mode == 'rgba':
                mappable = ax.imshow(x, cmap='gist_ncar', vmax=255)
                ticks = np.linspace(0, 255, nTicks)
            elif mode == 'scatter':
                mappable = ax.scatter(x[:, 0], x[:, 1], c=x[:, -1])
                ticks = np.linspace(0, maxValue, nTicks)
            else:
                mappable = ax.imshow(x, cmap='gist_ncar', vmin=0, vmax=maxValue)
                ticks = np.linspace(0, maxValue, nTicks)
            fig.colorbar(mappable, ax=ax, spacing='proportional', ticks=ticks)
        fig.tight_layout()

        # Save BEFORE showing: a blocking show() discards the figure when its window is closed.
        if fileName:
            fig.savefig(fileName)
        if show:
            plt.show()
        else:
            plt.close(fig)
        return True

    # ------------------------------------------------------------ latent analysis

    def reduce_and_color(self, testData=None, aMode='tsne', nClusters=0, cMode='kmeans', eps=5, primaryTC=0):
        """
        Encode the data, optionally reduce its dimensionality and cluster the result.

        :param testData: Numpy array of shape (n, timesteps, height, width, 1); if it is not an
            array, samples of the primary track collection are used instead.
        :type testData: np.ndarray
        :param aMode: Dimensional reduction mode: 'tsne' (default), 'pca' or 'none'.
        :type aMode: str
        :param nClusters: Number of clusters for k-means; if 0, ``self.classes`` is used.
        :type nClusters: int
        :param cMode: Clustering mode, default='kmeans', other='dbscan'.
        :type cMode: str
        :param eps: ``eps`` handed to DBSCAN when cMode='dbscan'.
        :type eps: int
        :param primaryTC: Index of the track collection used when no ``testData`` is given.
        :type primaryTC: int
        :return: The (reduced) codes with one additional, last column holding the colour / label.
        :rtype: np.ndarray
        :raises ValueError: On a wrong ``testData`` shape or an unknown ``aMode`` / ``cMode``.
        """
        if isinstance(testData, np.ndarray):
            if testData.shape[1:] != (self.timesteps, self.height, self.width, 1):
                raise ValueError('Shape must be (n, timesteps, height, width, 1), but was ', testData.shape)
        else:
            testData = self.tc[primaryTC].as_n_sample_4D(self.timesteps)
            testData = testData.reshape((-1, self.timesteps, self.height, self.width, 1))

        codes = self._encode(testData)

        aMode = aMode.lower()
        if aMode == 'tsne':
            array = TSNE(metric='hamming').fit_transform(codes)
        elif aMode == 'pca':
            array = PCA(n_components=self.classes).fit_transform(codes)
        elif aMode == 'none':
            array = codes
        else:
            raise ValueError('"aMode" needs to be either "pca", "tsne" or "none".')

        cMode = cMode.lower()
        if cMode == 'dbscan':
            labels = DBSCAN(eps=eps, min_samples=10).fit_predict(array)
        elif cMode in ['kmeans', 'kmean', 'k-mean', 'k-means']:
            nClusters = nClusters if nClusters > 0 else self.classes
            labels = KMeans(n_clusters=nClusters).fit_predict(array)
        else:
            raise ValueError('"cMode" needs to be either "kmeans" or "dbscan".')

        if len(np.unique(labels)) > 1:
            color = labels
        else:
            # Only one cluster was found: derive a pseudo colour from the first two dimensions.
            # NOTE(review): the formula adds the minimum and floor-divides; a min-max
            # normalisation may have been intended - kept unchanged.
            X = array[:, 0] + np.min(array[:, 0])
            Y = (np.min(array[:, 1]) + array[:, 1]) // (np.max(array[:, 1]) + np.min(array[:, 1]))
            color = X * Y
        return np.column_stack((array, color))

    def viz_clusters(self, aMode='pca', cMode='kmeans', testdata=None, fileName=''):
        """
        Scatter-plot the clustered latent codes; see :meth:`reduce_and_color`.

        :return: True
        """
        dataArray = self.reduce_and_color(aMode=aMode, cMode=cMode, testData=testdata)
        self.print_n_show(dataArray, 'scatter', np.max(dataArray[:, -1]), fileName=fileName)
        return True

    def show_prediction(self, n, dataArray=None, show=True, fileName='', startI=0, primaryTC=0):
        """
        Plot ``n`` input sequences, each above the model's prediction for it, in a square grid.

        :param n: Number of sequences to plot.
        :param dataArray: Sequences to use; defaults to samples of the primary track collection.
        :param show: Display the figure.
        :param fileName: Optional output file; ".tif" is appended if missing.
        :param startI: Index of the first sequence taken from ``dataArray``.
        :param primaryTC: Index of the track collection used when no ``dataArray`` is given.
        :return: True
        :raises ValueError: If neither ``show`` nor ``fileName`` is set, or on an unknown mode.
        """
        if not fileName and not show:
            raise ValueError('Why are you doing this? Print smth or show it!')
        if self.mode not in ['gumble', 'vae', 'refined']:
            raise ValueError('Needs to be either of "gumble", "vae", "refined"')
        if not isinstance(dataArray, np.ndarray):
            dataArray = self.tc[primaryTC].as_n_sample_4D(self.timesteps)

        seqWidth = self.width * self.timesteps
        spacing = 1
        sqrtDim = int(sqrt(n)) + 1
        figure = np.zeros(((self.height * 2 + spacing) * sqrtDim, (seqWidth + spacing) * sqrtDim))
        for i in range(n):
            array = dataArray[i + startI]
            arr_h = self.model.predict(array.reshape((1, self.timesteps, self.height, self.width, 1)))
            # Upper half: input, lower half: reconstruction.
            f = np.ones((self.height * 2, seqWidth))
            f[:self.height, :seqWidth] = array.reshape(seqWidth, self.height).swapaxes(0, 1)
            f[self.height:self.height * 2, :seqWidth] = arr_h.reshape(seqWidth, self.height).swapaxes(0, 1)
            figure[self._tile_slices(i, sqrtDim, seqWidth, spacing)] = f

        if fileName:
            fileName = fileName if fileName.endswith('.tif') else '%s.tif' % fileName
        # One figure only: it is saved and / or shown exactly as requested.
        self.print_n_show(figure, 'img', maxValue=np.max(figure), show=show, fileName=fileName)
        return True

    def sample_latent(self, nSamples, show=True, fileName=''):
        """
        Draw random latent codes and plot each code above the sequence generated from it.

        "gumble" mode draws random one-hot codes, "vae" standard-normal and "refined"
        uniform [0, 1) codes.

        :param nSamples: Number of codes to draw.
        :param show: Display the figure.
        :param fileName: Optional output file; ".tif" is appended if missing.
        :return: True
        :raises ValueError: On an unknown mode.
        :raises NotImplementedError: If the latent code does not fit into one tile.
        """
        if self.mode not in ['gumble', 'vae', 'refined']:
            raise ValueError('Needs to be either of "gumble", "vae", "refined"')
        if self.classes >= self.height:
            raise NotImplementedError('This cannot be shown, edit the Funciton!')
        seqWidth = self.width * self.timesteps
        spacing = 1
        sqrtDim = int(sqrt(nSamples)) + 1
        if self.mode == 'gumble':
            # The code is drawn into a single tile, so the tile width is the real limit.
            if self.cD > seqWidth:
                raise NotImplementedError('This cannot be shown, please edit the Function!!!')
            lHSpace = (self.height - self.classes) // 2
            lWSpace = (seqWidth - self.cD) // 2
            lShape = (self.classes, self.cD)
            sampleSpace = None
        else:
            if self.classes >= seqWidth:
                raise NotImplementedError('To many Samples, this cannot be displayed, please edit the Function!!!')
            lHSpace = (self.height - 1) // 2
            lWSpace = (seqWidth - self.classes) // 2
            lShape = (-1, self.classes)
            # All codes are drawn once up front instead of once per plotted sample.
            if self.mode == 'vae':
                sampleSpace = np.random.randn(nSamples, self.classes)
            else:
                sampleSpace = np.random.rand(nSamples, self.classes)

        figure = np.zeros(((self.height * 2 + spacing) * sqrtDim, (seqWidth + spacing) * sqrtDim))
        for i in range(nSamples):
            f = np.ones((self.height * 2, seqWidth))
            if self.mode == 'gumble':
                # https://stackoverflow.com/a/42874726/7746808
                oneHot = np.eye(self.classes)[np.random.randint(0, self.classes, self.cD)]
                sample = oneHot.reshape((-1, self.classes * self.cD))
                f[lHSpace:lHSpace + self.classes, lWSpace:lWSpace + self.cD] = \
                    sample.swapaxes(0, 1).reshape(lShape)
            else:
                sample = sampleSpace[i].reshape(lShape)
                f[lHSpace:lHSpace + 1, lWSpace:lWSpace + self.classes] = sample
            arr_h = self.generator.predict(sample)
            f[self.height:self.height * 2, :seqWidth] = arr_h.reshape(seqWidth, self.height).swapaxes(0, 1)
            figure[self._tile_slices(i, sqrtDim, seqWidth, spacing)] = f

        if fileName:
            fileName = fileName if fileName.endswith('.tif') else '%s.tif' % fileName
        if fileName or show:
            self.print_n_show(figure, 'img', maxValue=np.max(figure), show=show, fileName=fileName)
        return True

    def multi_path_coloring(self, nClusters, fileName='', state='', primaryTC=0, uncertainty=False, rgba=False):
        """
        Cluster the latent codes of ALL tracks and colour every visited map position.

        A position may be visited by several tracks and therefore receive several labels.
        A bar chart first shows how many positions got 1, 2, ... distinct labels; the map is
        then drawn either with all labels blended ("rgba") or with the most common label.

        :param nClusters: Number of k-means clusters; must be larger than 2.
        :param fileName: Pickle file used together with ``state``.
        :param state: "load" reads the codes from ``fileName``, "dump" writes them after encoding.
        :param primaryTC: Index of the track collection providing tracks and map.
        :param uncertainty: Additionally plot twice the number of distinct labels per position.
        :param rgba: Blend all labels of a position instead of picking the most common one.
        :return: None
        :raises ValueError: If ``nClusters`` is not larger than 2.
        """
        if nClusters <= 2:
            raise ValueError('More than 2 Classes are needed')
        import pickle
        from collections import Counter

        collection = self.tc[primaryTC]
        if fileName and state.lower() == 'load':
            # NOTE: unpickling can execute arbitrary code - only load files you created yourself.
            with open(fileName, 'rb') as handle:
                patchDict = pickle.load(handle)
        else:
            patchDict = defaultdict(list)
            for key in collection.keys():
                tempKeys, tempSequence = collection.as_n_sample_4D(self.timesteps,
                                                                   in_walk_dir=True,
                                                                   keys=True,
                                                                   moving_window=True,
                                                                   for_track=key)
                for tempKey, code in zip(tempKeys, self._encode(tempSequence)):
                    patchDict[tempKey].append(list(code))
            if fileName and state.lower() == 'dump':
                with open(fileName, 'wb') as handle:
                    pickle.dump(patchDict, handle, pickle.HIGHEST_PROTOCOL)

        # Flatten to one feature row per (position, visit) and remember the position.
        features, coords = [], []
        for patchKey, vectors in patchDict.items():
            for vector in vectors:
                features.append(vector)
                coords.append((int(patchKey[-2]), int(patchKey[-1])))
        labels = KMeans(nClusters).fit_predict(np.array(features)) + 1  # Color Correction

        labelsByPatch = defaultdict(list)
        for coord, label in zip(coords, labels):
            labelsByPatch[coord].append(int(label))

        npmax = int(np.max(labels)) + 1
        spread = Counter(len(set(values)) for values in labelsByPatch.values())
        self.print_n_show(spread, 'bars', npmax)

        if rgba:
            # PIL needs an 8 bit array here; np.where alone yields a platform sized integer.
            baseMap = np.where(collection.map.imgArray > 0, 255, 0).astype(np.uint8)
            figure = Image.fromarray(baseMap).convert('RGBA')
            draw = ImageDraw.Draw(figure, 'RGBA')
            # At least 12 discrete colours, more if there are more labels than that.
            cmap = plt.get_cmap('gist_ncar', max(12, npmax + 1))
            for key, values in labelsByPatch.items():
                for value in values:
                    color = tuple(int(channel * 255) for channel in cmap(value, alpha=0.3))
                    draw.point((key[1], key[0]), fill=color)
            self.print_n_show(figure, 'rgba', npmax)
        else:
            figure = np.where(collection.map.imgArray > 0, npmax, 0)
            for key, values in labelsByPatch.items():
                figure[key] = Counter(values).most_common(1)[0][0]
            self.print_n_show(figure, 'img', npmax)

        if uncertainty:
            uncertainfig = np.where(collection.map.imgArray > 0, npmax + 1, 0)
            for key, values in labelsByPatch.items():
                uncertainfig[key] = len(set(values)) * 2
            self.print_n_show(uncertainfig, 'img', npmax + 1)
        return

    def show_silhouette_score(self, k_list, primaryTC=0, maxTracks=100):
        """
        Plot a k-means silhouette analysis of the latent codes for every k in ``k_list``.

        For each k one figure is built: the left plot shows the sorted silhouette values per
        cluster, the right plot the first two latent dimensions coloured by cluster. The
        average score is printed. All figures are shown at the end.

        :param k_list: Iterable of cluster counts to evaluate.
        :param primaryTC: Index of the track collection providing the tracks.
        :param maxTracks: Only the first ``maxTracks`` tracks are encoded.
        :raises ValueError: If the track collection provides no tracks.
        """
        collection = self.tc[primaryTC]
        codes = [self._encode(collection.as_n_sample_4D(self.timesteps,
                                                        in_walk_dir=True,
                                                        keys=False,
                                                        moving_window=True,
                                                        for_track=key))
                 for key in list(collection.keys())[:maxTracks]]
        if not codes:
            raise ValueError('The track collection does not contain any tracks.')
        X = np.vstack(codes)

        for n_clusters in k_list:
            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(18, 7)

            # Silhouette coefficients range from -1 to 1; values below -0.1 are cut off.
            ax1.set_xlim([-0.1, 1])
            # (n_clusters + 1) * 10 leaves blank space between the plots of the clusters.
            ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

            # Fixed seed for reproducibility.
            clusterer = KMeans(n_clusters=n_clusters, random_state=10)
            cluster_labels = clusterer.fit_predict(X)

            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters =", n_clusters,
                  "The average silhouette_score is :", silhouette_avg)
            sample_silhouette_values = silhouette_samples(X, cluster_labels)

            y_lower = 10
            for i in range(n_clusters):
                ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
                ith_cluster_silhouette_values.sort()
                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                # "spectral" was renamed to "nipy_spectral" in matplotlib.
                color = cm.nipy_spectral(float(i) / n_clusters)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0, ith_cluster_silhouette_values,
                                  facecolor=color, edgecolor=color, alpha=0.7)
                # Label each silhouette plot with its cluster number at the middle.
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
                y_lower = y_upper + 10  # 10 for the 0 samples

            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")
            # Vertical line: average silhouette score of all samples.
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            # 2nd plot: the clusters in the first two latent dimensions.
            colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
            ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors)
            centers = clusterer.cluster_centers_
            # White circles with the cluster number mark the cluster centers.
            ax2.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200)
            for i, center in enumerate(centers):
                ax2.scatter(center[0], center[1], marker='$%d$' % i, alpha=1, s=50)
            ax2.set_title("The visualization of the clustered data.")
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")

            plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                          "with n_clusters = %d" % n_clusters),
                         fontsize=14, fontweight='bold')
        plt.show()