UnsupervisedVisualSequenceL…/Trainer.py

# Keras Utility Imports
from keras import backend as K
from keras.utils import plot_model

from collections import defaultdict
from random import shuffle

# Numpy
import numpy as np

# Maths
from math import sqrt
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold.t_sne import TSNE
from sklearn.decomposition import PCA

# Plotting
import matplotlib.pyplot as plt
from PIL import ImageDraw, Image
import matplotlib.cm as cm
"""
UseFull Links:

Keras Timedistributed Wrapper is applysing the SAME Layer, which means same WEIGHTS:
http://machinelearningmastery.com/timedistributed-layer-for-long-short-term-memory-networks-in-python/

"""


class Trainer(object):
    def __init__(self, mode, trackCollection, classes, categorical_distribution,
                 batchSize=400, timesteps=5, filters=0, rotating=False):
        if mode.lower() not in ['gumble', 'vae', 'refined']:
            raise ValueError('Needs to be eather "gumble", "vae" or "refined"')

        self.mode = mode.lower()
        self.tc = trackCollection if isinstance(trackCollection, list) else list(trackCollection)

        self.rotating = rotating
        self.timesteps = timesteps

        self.classes = classes
        self.cD = categorical_distribution
        self.batchSize = batchSize
        self.epochs = 100
        self.epsilon_std = 0.01
        self.tau = K.variable(5.0, name="temperature")
        self.anneal_rate = 0.0003
        self.min_temperature = 0.5
        _, _, height, width, _ = self.tc[0].as_n_sample_4D(
            self.timesteps, for_track=list(self.tc[0].keys())[0]).shape

        self.height = height
        self.width = width
        self.original_dim = self.timesteps * self.width * self.height * 1  # = 5*30*30 = 4500px
        self.filters = int(sqrt(self.width ** 2 + self.height ** 2)) // 2 if not filters else filters

        self.trained = False
        self.model = None
        self.encoder = None
        self.generator = None

    def set_model(self, model, loss, optimizer='adagrad'):
        self.model = model
        self.model.compile(optimizer=optimizer, loss=loss)
        self.model.summary()

    def set_generator(self, generator):
        self.generator = generator

    def set_encoder(self, encoder):
        self.encoder = encoder

    def load_weights(self, fileName):
        self.model.load_weights(fileName)
        self.trained = True

    def save_weights(self, fileName):
        self.model.save_weights(fileName)

    def train(self, fileName=None):
        """
        Fit the model as an auto-encoder (input == target) for self.epochs epochs.

        For every epoch the tracks of all collections are walked in lock-step (zip over the
        per-collection key lists). The samples of one key group are collected repeatedly until at
        least one batch is filled, shuffled, trimmed to a multiple of the batch size and fitted
        for a single Keras epoch. After each epoch the temperature self.tau is multiplied by
        exp(-anneal_rate * epoch), floored at self.min_temperature, and the weights are saved when
        a file name was given.

        Key groups that yield no samples at all are skipped; previously they made the
        batch-filling loop spin forever.

        :param fileName: Optional path the weights are written to after every epoch.
        :type fileName:  str
        """
        for epoch in range(self.epochs):
            tracklists = [list(x.keys()) for x in self.tc]  # TODO: add additional shuffling!!!
            for keys in zip(*tracklists):
                chunks = []
                nRows = 0
                while nRows < self.batchSize:
                    tempData = np.vstack([
                        self.tc[idx].as_n_sample_4D(self.timesteps, in_walk_dir=self.rotating, for_track=key)
                        for idx, key in enumerate(keys)])
                    if tempData.shape[0] == 0:
                        # Nothing to add: without this guard the loop could never reach batchSize.
                        break
                    chunks.append(tempData)
                    nRows += tempData.shape[0]

                if nRows < self.batchSize:
                    # Not even one full batch could be assembled for this key group.
                    continue

                # Stack once instead of re-copying the growing array on every pass.
                data = np.vstack(chunks)
                np.random.shuffle(data)
                # Drop the remainder so that every batch has exactly batchSize rows.
                usableRows = nRows // self.batchSize * self.batchSize
                data = data[:usableRows]
                self.model.fit(data, data, shuffle=True, epochs=1, batch_size=self.batchSize)

            K.set_value(self.tau,
                        np.max([K.get_value(self.tau) * np.exp(- self.anneal_rate * epoch),
                                self.min_temperature]))
            if fileName:
                self.save_weights(fileName)

    def plot_model(self, filename, show_shapes=True, show_layer_names=True):
        """
        Render the model graph to filename via keras.utils.plot_model.

        :param filename:         Target image path.
        :param show_shapes:      Forwarded to keras' plot_model.
        :param show_layer_names: Forwarded to keras' plot_model.
        """
        renderOptions = {'show_shapes': show_shapes, 'show_layer_names': show_layer_names}
        # Inside the method body this name resolves to the module-level keras helper.
        plot_model(self.model, filename, **renderOptions)

    def color_random_track(self, completeSequence=False, show=True, fileName='', primaryTC=0,
                           nClusters=0, multiPath=False, cMode='kmeans', aMode='none'):
        if not self.trained:
            raise EnvironmentError('Please train this Model first!')

        self.tc[primaryTC].map.refresh_random_clock()
        track = self.tc[primaryTC].map.return_random_path()
        if fileName:
            fileName = fileName if fileName.endswith('.tif') else '%s.tif' % fileName

        return self.color_track(track, show=show, completeSequence=completeSequence, primaryTC=primaryTC,
                                fileName=fileName, nClusters=nClusters, multiPath=multiPath, cMode=cMode, aMode=aMode)

    def color_track(self, track, completeSequence=False, show=True, fileName='', nClusters=0, multiPath=False,
                    cMode='kmeans', aMode='none', primaryTC=0):
        if not self.trained:
            raise EnvironmentError('Please train this Model first!')
        isoArray = self.tc[primaryTC].map.isovists.get_items_for_track(track, dim='full', in_walk_dir=True).swapaxes(0, 2)[
            ..., None].transpose((0, 2, 1, 3))
        smoothing = (isoArray.shape[0] // self.timesteps) * self.timesteps
        isoArray = isoArray[:smoothing]
        sequenceArray = np.array([isoArray[i:i+self.timesteps] for i in range(len(isoArray)-self.timesteps)])
        dummyData = self.tc[primaryTC].as_n_sample_4D(self.timesteps).astype(int)
        dummyData = dummyData.reshape((-1, self.timesteps, self.height, self.width, 1))
        testdata = np.row_stack((dummyData, sequenceArray))[-1000:]

        keys = [track[i+self.timesteps//2] for i in range(len(sequenceArray))]

        tsneArray = self.reduce_and_color(testData=testdata, nClusters=nClusters,
                                          primaryTC=primaryTC, aMode=aMode,
                                          cMode=cMode)[-len(sequenceArray):] + 1  # This is for color correction
        npmax = np.max(tsneArray[:, -1])+1
        figure = np.where(self.tc[primaryTC].map.imgArray > 0, npmax, 0)

        for i in range(len(sequenceArray)):
            figure[keys[i]] = tsneArray[i, -1]

        if multiPath:
            return keys, tsneArray[-1]
        else:
            self.print_n_show(figure, 'img', npmax, fileName=fileName, show=show)
            self.print_n_show(tsneArray, 'scatter', npmax, fileName=fileName, show=show)

        if completeSequence:
            if fileName:
                fileName = '%s_sequence.tif' % fileName[:fileName.find('.')]
            self._Trainer__colored_sequence(tsneArray, isoArray, maxVal=npmax, show=show, fileName=fileName)

    def __colored_sequence(self, tsneArray, isoArray, maxVal=0, show=True, fileName=''):
        """
        Returns all the Isovist sequences for a Track, next to its class color.
        :param tsneArray:
        :type tsneArray:
        :param isoArray:
        :type isoArray:
        :param maxVal:
        :type maxVal:
        :param show:
        :type show:
        :param fileName:
        :type fileName:
        :return:
        :rtype:
        """
        if not self.trained:
            raise EnvironmentError('Please train this Model first!')
        if maxVal == 0:
            maxVal, np.max(tsneArray[:, -1])

        spacing = 2
        figure = np.full(((spacing + len(tsneArray) * (self.height + spacing)), (self.timesteps + 1) * self.width), 2)

        # Iterate through a 4 by 4 grid with 100 spacing, to place the image
        for i in range(len(tsneArray)):
            backGround = np.full((self.height, self.width * (self.timesteps + 1)), tsneArray[i, -1])

            sequence = isoArray[i:i + self.timesteps].swapaxes(0, 1).reshape((
                self.height, self.timesteps * self.width))
            sequence = np.where(sequence > 0, maxVal, 0)
            backGround[:, 0:-self.width] = sequence
            figure[i * self.height + i * spacing: (i + 1)*self.height + i*spacing, :] = backGround
        if fileName:
            fileName = fileName if fileName.endswith('.tif') else '%s.tif' % fileName
        self.print_n_show(figure, 'img', maxVal, show=show, fileName=fileName)

    @staticmethod
    def print_n_show(x, mode, maxValue, show=True, fileName=''):
        """
        Plot x with matplotlib and optionally save and/or show the figure.

        :param x:        Data to plot: an image array for "img"/"fig"/"rgba", an array with
                         X, Y in the first two columns and the colour value in the last one for
                         "scatter", or a mapping label -> value for "bars".
        :param mode:     One of "img", "fig", "rgba", "scatter" or "bars".
        :type mode:      str
        :param maxValue: Upper end of the colour scale; also determines the number of colour bar
                         ticks. Not used in "bars" mode.
        :param show:     Open the figure in a window.
        :type show:      bool
        :param fileName: When non-empty, the figure is written to this path.
        :type fileName:  str
        :return:         True
        :rtype:          bool
        :raises ValueError: If mode is unknown.
        """
        fig, ax = plt.subplots()

        if mode == 'bars':
            labels = list(x.keys())
            positions = list(range(len(labels)))
            ax.bar(positions, [x[label] for label in labels], align='center')
        elif mode in ('img', 'fig', 'rgba', 'scatter'):
            # np.linspace needs an integer sample count, but callers usually pass numpy floats.
            nTicks = int(maxValue) + 1
            if mode == 'rgba':
                mappable = ax.imshow(x, cmap='gist_ncar', vmax=255)
                ticks = np.linspace(0, 255, nTicks)
            elif mode == 'scatter':
                mappable = ax.scatter(x[:, 0], x[:, 1], c=x[:, -1])
                ticks = np.linspace(0, maxValue, nTicks)
            else:
                mappable = ax.imshow(x, cmap='gist_ncar', vmin=0, vmax=maxValue)
                ticks = np.linspace(0, maxValue, nTicks)
            fig.colorbar(mappable, ax=ax, spacing='proportional', ticks=ticks)
        else:
            plt.close(fig)  # do not leave an empty figure behind
            raise ValueError('Mode needs to be "img", "fig", "bars", "rgba" or "scatter".')

        fig.tight_layout()
        # Save BEFORE showing: once a blocking show() returns the canvas may already be
        # destroyed and savefig would write an empty image.
        if fileName:
            fig.savefig(fileName)
        if show:
            plt.show()
        else:
            # Nothing will ever display this figure; release it so batch runs do not pile up figures.
            plt.close(fig)

        return True

    def reduce_and_color(self, testData=None, aMode='tsne', nClusters=0, cMode='kmeans', eps=5, primaryTC=0):
        """
        :param  testData:   Numpy Arraym, shape (n, timesteps, height, width, 1)
        :type   testData:   sdf
        :param  eps:        When using cMode=DBSCAN,
        :type   eps:        int
        :param  aMode:      Dimensonal reduction mode, default = 'pca', other='tsne'
        :type   aMode:      str
        :param  cMode:      Clustering mode, default='kmeans', other='DBSCAN'
        :type   cMode:      str
        :param  nClusters:  Number of Clusters for kmeans-clustering if 0 nClusters=self.classes
        :type   nClusters:  int
        :param  primaryTC:  Index Number of the TrackCollection used for Basemap etc.
        :type   primaryTC:  int
        :return:            Numpy Array (n, X, Y, Labels)
        :rtype:             np.ndarray
        """
        if isinstance(testData, np.ndarray):
            if testData.shape[1:] != (self.timesteps, self.height, self.width, 1):
                raise ValueError('Shape must be (n, timesteps, height, width, 1), but was ', testData.shape)

        if not isinstance(testData, np.ndarray):
            testData = self.tc[primaryTC].as_n_sample_4D(self.timesteps)
            testData = testData.reshape((-1, self.timesteps, self.height, self.width, 1))

        n = testData.shape[0]

        C = np.zeros((n, self.cD * self.classes if self.mode == 'gumble' else self.classes))

        for i in range(0, n, 100):
            c = self.encoder([testData[i:i + 100]])[0]
            C[i:i + 100] = c.reshape(-1, self.cD * self.classes if self.mode == 'gumble' else self.classes)

        if aMode == 'tsne':
            array = TSNE(metric='hamming').fit_transform(C.reshape(n, -1))
        elif aMode == 'pca':
            array = PCA(n_components=self.classes).fit_transform(C.reshape(n, -1))
        elif aMode.lower() == 'none':
            array = C.reshape(n, -1)
        else:
            raise ValueError('"aMode" needs to be either "pca" or "tsne".')

        if cMode.lower() == 'dbscan':
            labels = DBSCAN(eps=eps, min_samples=10).fit_predict(array)

        elif cMode.lower() in ['kmeans', 'kmean', 'k-mean', 'k-means']:
            nClusters = nClusters if nClusters > 0 else self.classes
            labels = KMeans(n_clusters=nClusters).fit_predict(array)
        else:
            raise ValueError('"cMode" needs to be either "kmeans" or "dbscan".')

        if len(np.unique(labels)) > 1:
            color = labels
        else:
            # Color Generating
            X = array[:, 0] + np.min(array[:, 0])
            Y = (np.min(array[:, 1]) + array[:, 1]) // (np.max(array[:, 1]) + np.min(array[:, 1]))
            color = X * Y

        return np.column_stack((array, color))

    def viz_clusters(self, aMode='pca', cMode='kmeans', testdata=None, fileName=''):
            dataArray = self.reduce_and_color(aMode=aMode, cMode=cMode, testData=testdata)

            self.print_n_show(dataArray, 'scatter', np.max(dataArray[:, -1]), fileName=fileName)
            return True

    # THIS WORKS in all modes
    def show_prediction(self, n, dataArray=None, show=True, fileName='', startI=0, primaryTC=0):
        if not fileName and not show:
            raise ValueError('Why are you doing this? Print smth or show it!')
        if self.mode in ['gumble', 'vae', 'refined']:
            if not isinstance(dataArray, np.ndarray):
                dataArray = self.tc[primaryTC].as_n_sample_4D(self.timesteps)
            seqWidth = self.width * self.timesteps
            spacing = 1
            sqrtDim = int(sqrt(n)) + 1
            fullwidth = (seqWidth + spacing) * sqrtDim
            fullheight = (self.height*2 + spacing) * sqrtDim

            figure = np.zeros((fullheight, fullwidth))
            for i in range(n):

                array = dataArray[i+startI]
                arr_h = self.model.predict(array.reshape((1, self.timesteps, self.height, self.width, 1)))
                f = np.ones((self.height*2, seqWidth))
                f[:self.height, :seqWidth] = array.reshape(seqWidth, self.height).swapaxes(0, 1)
                f[self.height:self.height*2, : seqWidth] = arr_h.reshape(seqWidth, self.height).swapaxes(0, 1)

                try:
                    y, x = divmod(i, sqrtDim)
                except ZeroDivisionError:
                    x, y = 0, 0

                figure[y*self.height*2 + y*spacing: (y+1)*self.height*2 + y*spacing,
                       x*seqWidth + x*spacing: (x+1)*seqWidth + x*spacing] = f
            if fileName:
                fileName = fileName if fileName.endswith('.tif') else '%s.tif' % fileName
                self.print_n_show(figure, 'img', maxValue=np.max(figure), fileName=fileName)
            if show:
                self.print_n_show(figure, 'img', maxValue=np.max(figure))

            return True

    def sample_latent(self, nSamples, show=True, fileName=''):
        if self.mode not in ['gumble', 'vae', 'refined']:
            raise ValueError('Needs to be either of "gumble", "vae", "refined"')
        if self.classes >= self.height:
            raise NotImplementedError('This cannot be shown, edit the Funciton!')

        seqWidth = self.width * self.timesteps
        spacing = 1
        sqrtDim = int(sqrt(nSamples)) + 1
        fullwidth = (seqWidth + spacing) * sqrtDim
        if self.mode == 'gumble':
            if self.cD >= fullwidth:
                raise NotImplementedError('This cannot be shown, please edit the Function!!!')
            lHSpace = (self.height - self.classes) // 2
            lWSpace = (seqWidth - self.cD) // 2
            lShape = (self.classes, self.cD)

        else:
            if self.classes >= seqWidth:
                raise NotImplementedError('To many Samples, this cannot be displayed, please edit the Function!!!')
            lHSpace = (self.height - 1) // 2
            lWSpace = (seqWidth - self.classes) // 2
            lShape = (-1, self.classes)

        fullheight = (self.height*2 + spacing) * sqrtDim
        figure = np.zeros((fullheight, fullwidth))

        for i in range(nSamples):
            f = np.ones((self.height * 2, seqWidth))

            if self.mode == 'gumble':
                # https://stackoverflow.com/a/42874726/7746808
                oneHot = np.eye(self.classes)[np.random.randint(0, self.classes, self.cD)]
                sample = oneHot.reshape((-1, self.classes*self.cD))
                f[lHSpace:lHSpace + self.classes, lWSpace:lWSpace + self.cD] = sample.swapaxes(0, 1).reshape(lShape)
            else:
                if self.mode == 'vae':
                    sampleSpace = np.random.randn(nSamples, self.classes)
                elif self.mode == 'refined':
                    sampleSpace = np.random.rand(nSamples, self.classes)
                else:
                    raise ValueError('Needs to be either of "gumble", "vae", "refined"')

                sample = sampleSpace[i].reshape(lShape)
                f[lHSpace:lHSpace+1, lWSpace:lWSpace + self.classes] = sample

            arr_h = self.generator.predict(sample)
            f[self.height:self.height*2, : seqWidth] = arr_h.reshape(seqWidth, self.height).swapaxes(0, 1)

            try:
                y, x = divmod(i, sqrtDim)
            except ZeroDivisionError:
                x, y = 0, 0

            figure[y * self.height * 2 + y*spacing: (y + 1) * self.height * 2 + y*spacing,
                   x * seqWidth + x*spacing: (x + 1) * seqWidth + x*spacing] = f

        if fileName:
            fileName = fileName if fileName.endswith('.tif') else '%s.tif' % fileName
            self.print_n_show(figure, 'img', maxValue=np.max(figure), fileName=fileName)
        if show:
            self.print_n_show(figure, 'img', maxValue=np.max(figure))
        return True

    def multi_path_coloring(self, nClusters, fileName='', state='', primaryTC=0, uncertainty=False, rgba=False):
        if nClusters <= 2:
            raise ValueError('More than 2 Classes are needed')

        if fileName and state.lower() == 'load':
            import pickle
            with open(fileName, 'rb') as file:
                patchDict = pickle.load(file)
        else:
            patchDict = defaultdict(list)
            for key in self.tc[primaryTC].keys():

                tempKeys, tempSequence = self.tc[primaryTC].as_n_sample_4D(self.timesteps,
                                                                           in_walk_dir=True,
                                                                           keys=True,
                                                                           moving_window=True,
                                                                           for_track=key)

                C = np.zeros((len(tempSequence), self.cD * self.classes if self.mode == 'gumble' else self.classes))

                for i in range(0, len(tempSequence), 100):
                    c = self.encoder([tempSequence[i:i + 100]])[0]
                    C[i:i + 100] = c.reshape(-1, self.cD * self.classes if self.mode == 'gumble' else self.classes)

                for i, tempKey in enumerate(tempKeys):
                        patchDict[tempKey].append(list(C[i]))

        if fileName and state.lower() == 'dump':
            with open(fileName, 'wb') as f:
                import pickle
                pickle.dump(patchDict, f, pickle.HIGHEST_PROTOCOL)

        l = list()
        for x in patchDict.keys():
            for elem in patchDict[x]:
                l.append(elem + list(x))

        a = np.array(l)
        k = KMeans(nClusters).fit_predict(a[:, :-2]) + 1  # Color Correction

        s = np.zeros((a.shape[0], a.shape[1] + 1))
        s[:, :-1] = a
        s[:, -1] = k

        patchDict = defaultdict(list)
        for i in range(s.shape[0]):
            key = int(s[i][-3]), int(s[i][-2])
            patchDict[key].append(int(s[i][-1]))

        from collections import Counter
        c = Counter()
        for key in patchDict.keys():
            c[len(set(patchDict[key]))] += 1

        npmax = np.max(s[:, -1]) + 1
        # npmax = 4
        self.print_n_show(c, 'bars', npmax)

        if rgba:
            figure = Image.fromarray(np.where(self.tc[primaryTC].map.imgArray > 0, 255, 0)).convert('RGBA')
            draw = ImageDraw.Draw(figure, 'RGBA')
            from matplotlib import cm
            cmap = cm.get_cmap('gist_ncar', 12)  # 12 discrete colors

            for key in patchDict.keys():
                for value in patchDict[key]:
                    color = [int(x*255) for x in cmap(int(value), alpha=0.3)]
                    draw.point((key[1], key[0]), fill=tuple(color))

            self.print_n_show(figure, 'rgba', npmax)
        else:
            figure = np.where(self.tc[primaryTC].map.imgArray > 0, npmax, 0)
            for key in patchDict.keys():
                c = Counter(patchDict[key])
                figure[key] = c.most_common(1)[0][0]
            self.print_n_show(figure, 'img', npmax)

        if uncertainty:
            uncertainfig = np.where(self.tc[primaryTC].map.imgArray > 0, npmax + 1, 0)
            for key in patchDict.keys():
                uncertainfig[key] = len(set(patchDict[key])) * 2
            self.print_n_show(uncertainfig, 'img', npmax+1)
        return

    def show_silhouette_score(self, k_list, primaryTC=0):
        X = None
        for key in list(self.tc[primaryTC].keys())[:100]:
            tempSequence = self.tc[primaryTC].as_n_sample_4D(self.timesteps,
                                                             in_walk_dir=True,
                                                             keys=False,
                                                             moving_window=True,
                                                             for_track=key)
            C = np.zeros((len(tempSequence), self.cD * self.classes if self.mode == 'gumble' else self.classes))

            for i in range(0, len(tempSequence), 100):
                c = self.encoder([tempSequence[i:i + 100]])[0]
                C[i:i + 100] = c.reshape(-1, self.cD * self.classes if self.mode == 'gumble' else self.classes)
            if isinstance(X, np.ndarray):
                X = np.row_stack((X, C))
            else:
                X = C

        for n_clusters in k_list:
            # Create a subplot with 1 row and 2 columns
            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(18, 7)

            # The 1st subplot is the silhouette plot
            # The silhouette coefficient can range from -1, 1 but in this example all
            # lie within [-0.1, 1]
            ax1.set_xlim([-0.1, 1])
            # The (n_clusters+1)*10 is for inserting blank space between silhouette
            # plots of individual clusters, to demarcate them clearly.
            ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

            # Initialize the clusterer with n_clusters value and a random generator
            # seed of 10 for reproducibility.
            clusterer = KMeans(n_clusters=n_clusters, random_state=10)
            cluster_labels = clusterer.fit_predict(X)

            # The silhouette_score gives the average value for all the samples.
            # This gives a perspective into the density and separation of the formed
            # clusters
            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters =", n_clusters,
                  "The average silhouette_score is :", silhouette_avg)

            # Compute the silhouette scores for each sample
            sample_silhouette_values = silhouette_samples(X, cluster_labels)

            y_lower = 10
            for i in range(n_clusters):
                # Aggregate the silhouette scores for samples belonging to
                # cluster i, and sort them
                ith_cluster_silhouette_values = \
                    sample_silhouette_values[cluster_labels == i]

                ith_cluster_silhouette_values.sort()

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                color = cm.spectral(float(i) / n_clusters)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0, ith_cluster_silhouette_values,
                                  facecolor=color, edgecolor=color, alpha=0.7)

                # Label the silhouette plots with their cluster numbers at the middle
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Compute the new y_lower for next plot
                y_lower = y_upper + 10  # 10 for the 0 samples

            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # The vertical line for average silhouette score of all the values
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            # 2nd Plot showing the actual clusters formed
            colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
            ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                        c=colors)

            # Labeling the clusters
            centers = clusterer.cluster_centers_
            # Draw white circles at cluster centers
            ax2.scatter(centers[:, 0], centers[:, 1],
                        marker='o', c="white", alpha=1, s=200)

            for i, c in enumerate(centers):
                ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)

            ax2.set_title("The visualization of the clustered data.")
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")

            plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                          "with n_clusters = %d" % n_clusters),
                         fontsize=14, fontweight='bold')

            plt.show()