"""Principal Convex Hull Analysis (PCHA) / Archetypal Analysis.""" from __future__ import division import numpy as np from numpy.matlib import repmat from scipy.sparse import csr_matrix from datetime import datetime as dt import time #from furthest_sum import furthest_sum def PCHA(X, noc, I=None, U=None, delta=0, verbose=False, conv_crit=1E-6, maxiter=500): """Return archetypes of dataset. Note: Commonly data is formatted to have shape (examples, dimensions). This function takes input and returns output of the transposed shape, (dimensions, examples). Parameters ---------- X : numpy.2darray Data matrix in which to find archetypes noc : int Number of archetypes to find I : 1d-array Entries of X to use for dictionary in C (optional) U : 1d-array Entries of X to model in S (optional) Output ------ XC : numpy.2darray I x noc feature matrix (i.e. XC=X[:,I]*C forming the archetypes) S : numpy.2darray noc x length(U) matrix, S>=0 |S_j|_1=1 C : numpy.2darray noc x length(U) matrix, S>=0 |S_j|_1=1 SSE : float Sum of Squared Errors varexlp : float Percent variation explained by the model """ def S_update(S, XCtX, CtXtXC, muS, SST, SSE, niter): """Update S for one iteration of the algorithm.""" noc, J = S.shape e = np.ones((noc, 1)) for k in range(niter): SSE_old = SSE g = (np.dot(CtXtXC, S) - XCtX) / (SST / J) g = g - e * np.sum(g.A * S.A, axis=0) S_old = S while True: S = (S_old - g * muS).clip(min=0) S = S / np.dot(e, np.sum(S, axis=0)) SSt = S * S.T SSE = SST - 2 * np.sum(XCtX.A * S.A) + np.sum(CtXtXC.A * SSt.A) if SSE <= SSE_old * (1 + 1e-9): muS = muS * 1.2 break else: muS = muS / 2 return S, SSE, muS, SSt def C_update(X, XSt, XC, SSt, C, delta, muC, mualpha, SST, SSE, niter=1): """Update C for one iteration of the algorithm.""" J, nos = C.shape if delta != 0: alphaC = np.sum(C, axis=0).A[0] C = np.dot(C, np.diag(1 / alphaC)) e = np.ones((J, 1)) XtXSt = np.dot(X.T, XSt) for k in range(niter): # Update C SSE_old = SSE g = (np.dot(X.T, np.dot(XC, SSt)) - XtXSt) / SST if delta != 0: g = np.dot(g, np.diag(alphaC)) g = g.A - e * np.sum(g.A * C.A, axis=0) C_old = C while True: C = (C_old - muC * g).clip(min=0) nC = np.sum(C, axis=0) + np.finfo(float).eps C = np.dot(C, np.diag(1 / nC.A[0])) if delta != 0: Ct = C * np.diag(alphaC) else: Ct = C XC = np.dot(X, Ct) CtXtXC = np.dot(XC.T, XC) SSE = SST - 2 * np.sum(XC.A * XSt.A) + np.sum(CtXtXC.A * SSt.A) if SSE <= SSE_old * (1 + 1e-9): muC = muC * 1.2 break else: muC = muC / 2 # Update alphaC SSE_old = SSE if delta != 0: g = (np.diag(CtXtXC * SSt).T / alphaC - np.sum(C.A * XtXSt.A)) / (SST * J) alphaC_old = alphaC while True: alphaC = alphaC_old - mualpha * g alphaC[alphaC < 1 - delta] = 1 - delta alphaC[alphaC > 1 + delta] = 1 + delta XCt = np.dot(XC, np.diag(alphaC / alphaC_old)) CtXtXC = np.dot(XCt.T, XCt) SSE = SST - 2 * np.sum(XCt.A * XSt.A) + np.sum(CtXtXC.A * SSt.A) if SSE <= SSE_old * (1 + 1e-9): mualpha = mualpha * 1.2 XC = XCt break else: mualpha = mualpha / 2 if delta != 0: C = C * np.diag(alphaC) return C, SSE, muC, mualpha, CtXtXC, XC N, M = X.shape if I is None: I = range(M) if U is None: U = range(M) SST = np.sum(X[:, U] * X[:, U]) # Initialize C try: i = furthest_sum(X[:, I], noc, [int(np.ceil(len(I) * np.random.rand()))]) except IndexError: class InitializationException(Exception): pass raise InitializationException("Initialization does not converge. 
    j = range(noc)
    C = csr_matrix((np.ones(len(i)), (i, j)), shape=(len(I), noc)).todense()

    XC = np.dot(X[:, I], C)

    muS, muC, mualpha = 1, 1, 1

    # Initialise S
    XCtX = np.dot(XC.T, X[:, U])
    CtXtXC = np.dot(XC.T, XC)
    S = -np.log(np.random.random((noc, len(U))))
    S = S / np.dot(np.ones((noc, 1)), np.mat(np.sum(S, axis=0)))
    SSt = np.dot(S, S.T)
    SSE = SST - 2 * np.sum(XCtX.A * S.A) + np.sum(CtXtXC.A * SSt.A)
    S, SSE, muS, SSt = S_update(S, XCtX, CtXtXC, muS, SST, SSE, 25)

    # Set PCHA parameters
    iter_ = 0
    dSSE = np.inf
    t1 = dt.now()
    varexpl = (SST - SSE) / SST

    if verbose:
        print('\nPrincipal Convex Hull Analysis / Archetypal Analysis')
        print('A ' + str(noc) + ' component model will be fitted')
        print('To stop algorithm press control C\n')

    dheader = '%10s | %10s | %10s | %10s | %10s | %10s | %10s | %10s' % ('Iteration', 'Expl. var.', 'Cost func.', 'Delta SSEf.', 'muC', 'mualpha', 'muS', ' Time(s) ')
    dline = '-----------+------------+------------+-------------+------------+------------+------------+------------+'

    while np.abs(dSSE) >= conv_crit * np.abs(SSE) and iter_ < maxiter and varexpl < 0.9999:
        if verbose and iter_ % 100 == 0:
            print(dline)
            print(dheader)
            print(dline)
        told = t1
        iter_ += 1
        SSE_old = SSE

        # C (and alpha) update
        XSt = np.dot(X[:, U], S.T)
        C, SSE, muC, mualpha, CtXtXC, XC = C_update(
            X[:, I], XSt, XC, SSt, C, delta, muC, mualpha, SST, SSE, 10
        )

        # S update
        XCtX = np.dot(XC.T, X[:, U])
        S, SSE, muS, SSt = S_update(
            S, XCtX, CtXtXC, muS, SST, SSE, 10
        )

        # Evaluate and display iteration
        dSSE = SSE_old - SSE
        t1 = dt.now()
        if iter_ % 1 == 0:
            time.sleep(0.000001)
            varexpl = (SST - SSE) / SST
            if verbose:
                print('%10.0f | %10.4f | %10.4e | %10.4e | %10.4e | %10.4e | %10.4e | %10.4f \n' % (iter_, varexpl, SSE, dSSE / np.abs(SSE), muC, mualpha, muS, (t1 - told).seconds))

    # Display final iteration
    varexpl = (SST - SSE) / SST
    if verbose:
        print(dline)
        print(dline)
        print('%10.0f | %10.4f | %10.4e | %10.4e | %10.4e | %10.4e | %10.4e | %10.4f \n' % (iter_, varexpl, SSE, dSSE / np.abs(SSE), muC, mualpha, muS, (t1 - told).seconds))

    # Sort components according to importance (total weight in S)
    ind, vals = zip(
        *sorted(enumerate(np.sum(S, axis=1)), key=lambda x: float(x[1]), reverse=True)
    )
    S = S[ind, :]
    C = C[:, ind]
    XC = XC[:, ind]

    return XC, S, C, SSE, varexpl


# FurthestSum algorithm to efficiently generate initial seeds/archetypes.

def furthest_sum(K, noc, i, exclude=[]):
    """Furthest sum algorithm, to efficiently generate initial seeds/archetypes.

    Note: Commonly data is formatted to have shape (examples, dimensions).
    This function takes input and returns output of the transposed shape,
    (dimensions, examples).

    Parameters
    ----------
    K : numpy 2d-array
        Either a data matrix or a kernel matrix.

    noc : int
        Number of candidate archetypes to extract.

    i : list
        Initial observation used to generate the FurthestSum.

    exclude : numpy.1darray
        Entries in K that cannot be used as candidates.

    Output
    ------
    i : list
        Indices of the extracted candidate archetypes.
    """
    def max_ind_val(l):
        return max(zip(range(len(l)), l), key=lambda x: x[1])

    I, J = K.shape
    index = np.array(range(J))
    index[exclude] = 0
    index[i] = -1
    ind_t = i
    sum_dist = np.zeros((1, J), np.complex128)

    if J > noc * I:
        Kt = K
        Kt2 = np.sum(Kt ** 2, axis=0)
        for k in range(1, noc + 11):
            if k > noc - 1:
                Kq = np.dot(Kt[:, i[0]], Kt)
                sum_dist -= np.lib.scimath.sqrt(Kt2 - 2 * Kq + Kt2[i[0]])
                index[i[0]] = i[0]
                i = i[1:]
            t = np.where(index != -1)[0]
            Kq = np.dot(Kt[:, ind_t].T, Kt)
            sum_dist += np.lib.scimath.sqrt(Kt2 - 2 * Kq + Kt2[ind_t])
            ind, val = max_ind_val(sum_dist[:, t][0].real)
            ind_t = t[ind]
            i.append(ind_t)
            index[ind_t] = -1
    else:
        if I != J or np.sum(K - K.T) != 0:  # Generate kernel if K not one
            Kt = K
            K = np.dot(Kt.T, Kt)
            K = np.lib.scimath.sqrt(
                repmat(np.diag(K), J, 1) - 2 * K +
                repmat(np.mat(np.diag(K)).T, 1, J)
            )

        Kt2 = np.diag(K)  # Horizontal
        for k in range(1, noc + 11):
            if k > noc - 1:
                sum_dist -= np.lib.scimath.sqrt(Kt2 - 2 * K[i[0], :] + Kt2[i[0]])
                index[i[0]] = i[0]
                i = i[1:]
            t = np.where(index != -1)[0]
            sum_dist += np.lib.scimath.sqrt(Kt2 - 2 * K[ind_t, :] + Kt2[ind_t])
            ind, val = max_ind_val(sum_dist[:, t][0].real)
            ind_t = t[ind]
            i.append(ind_t)
            index[ind_t] = -1

    return i
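

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original algorithm).
# A minimal example of running PCHA on synthetic data when this module is
# executed directly. It assumes a NumPy version that still supports np.matrix
# and numpy.matlib (both deprecated in recent releases). The variable names
# below (X_demo, n_examples, n_dims) are illustrative only.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    np.random.seed(0)

    # 200 examples in 10 dimensions, transposed to the (dimensions, examples)
    # layout that PCHA expects.
    n_examples, n_dims = 200, 10
    X_demo = np.random.random((n_examples, n_dims)).T

    # Fit a 4-archetype model; delta > 0 relaxes the |C_j|_1 = 1 constraint.
    XC, S, C, SSE, varexpl = PCHA(X_demo, noc=4, delta=0.1, verbose=False)

    print("Archetypes XC shape (dimensions x noc):", XC.shape)
    print("Weights S shape (noc x examples):", S.shape)
    print("Fraction of variance explained: %.4f" % varexpl)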