""" This modules contains several classes to perform non-linear dimension reduction. Each class has 2 methods, 'train' and 'test' - 'train' performs the computation of low-simensional data embedding and the information to generalize to new data - 'test' computes the embedding for new dsamples of data This is done for - Multi-dimensional scaling - Isompap (knn or eps-neighb implementation) - Locality Preseving projections (LPP) - laplacian embedding (train only) Future developpements will include some supervised cases, e.g. LDA,LDE and the estimation of the latent dimension, at least in simple cases. """ import numpy as np import numpy.linalg as nl import numpy.random as nr import nipy.neurospin.graph.graph as fg import nipy.neurospin.graph.field as ff # ------------------------------------------------------------------ # ------------- Auxiliary functions -------------------------------- # ------------------------------------------------------------------ def _linear_dim_criterion_(l,k,dimf,n): """ likelihood = _linear_dim_criterion_(k,l,dimf,n) this function returns the likelihood of a dataset with rank k embedded in gaussian noise of shape(n,dimf) INPUT: - l = spectrum - k = test rank - dimf = maximal rank - n number of inputs OUPUT: The log-likelihood NOTE: This is imlpempented from Minka et al., 2001 """ if k>dimf: raise ValueError, "the dimension cannot exceed dimf" import scipy.special as SP Pu = -k*np.log(2) for i in range(k): Pu += SP.gammaln((dimf-i)/2)-np.log(np.pi)*(dimf-i)/2 pl = np.sum(np.log(l[:k])) Pl = -pl*n/2 if k==dimf: Pv = 0 v=1 else: v = np.sum(l[k:dimf])/(dimf-k) Pv = -np.log(v)*n*(dimf-k)/2 m = dimf*k-k*(k+1)/2 Pp = np.log(2*np.pi)*(m+k+1)/2 Pa = 0 nl = l.copy() nl[k:dimf] = v for i in range(k): for j in range (i+1,dimf): Pa = Pa + np.log((l[i]-l[j])*(1./nl[j]-1./nl[i]))+np.log(n) Pa = -Pa/2 lE = Pu+Pl+Pv+Pp+Pa-k*np.log(n)/2 return lE def infer_latent_dim(X,verbose = 0, maxr = -1): """ r = infer_latent_dim(X,verbose = 0) Infer the latent dimension of an aray assuming data+gaussian noise mixture INPUT: - an array X - verbose=0 : verbositry level - maxr=-1 maximum dimension that can be achieved if maxr = -1, this is equal to rank(X) OUPTUT - r the inferred dimension """ if maxr ==-1: maxr = np.minimum(X.shape[0],X.shape[1]) import numpy.linalg as L U,S,V = nl.svd(X,0) if verbose>1: print "Singular Values", S L = [] for k in range(maxr): L.append(_linear_dim_criterion_(S,k,X.shape[1],X.shape[0])/X.shape[0]) L = np.array(L) rank = np.argmax(L) if verbose: import matplotlib.pylab as mp mp.figure() mp.bar(np.arange(maxr),L-L.mean()) mp.show() return rank def Euclidian_distance(X,Y=None): """ Considering the rows of X (and Y=X) as vectors, compute the distance matrix between each pair of vector """ if Y == None: Y = X if X.shape[1]!=Y.shape[1]: raise ValueError, "incompatible dimension for X and Y matrices" s1 = X.shape[0] s2 = Y.shape[0] NX = np.reshape(np.sum(X*X,1),(s1,1)) NY = np.reshape(np.sum(Y*Y,1),(1,s2)) ED = np.repeat(NX,s2,1) ED = ED + np.repeat(NY,s1,0) ED = ED-2*np.dot(X,np.transpose(Y)) ED = np.maximum(ED,0) ED = np.sqrt(ED) return ED def CCA(X,Y,eps = 1.e-12): """ Canonical corelation analysis of two matrices INPUT: - X and Y are (nbitem,p) and (nbitem,q) arrays that are analysed - eps=1.e-12 is a small biasing constant to grant invertibility of the matrices OUTPUT - ccs: the canconical correlations NOTE - It is expected that nbitem>>max(p,q) - In general it makes more sense if p=q """ from numpy.linalg import cholesky,inv,svd if Y.shape[0]!=X.shape[0]: raise ValueError,"Incompatible dimensions for X and Y" nb = X.shape[0] p = X.shape[1] q = Y.shape[1] sqX = np.dot(np.transpose(X),X) sqY = np.dot(np.transpose(Y),Y) sqX += np.trace(sqX)*eps*np.eye(p) sqY += np.trace(sqY)*eps*np.eye(q) rsqX = cholesky(sqX)# sqX = rsqX*rsQx^T rsqY = cholesky(sqY) iX = np.transpose(inv(rsqX)) iY = np.transpose(inv(rsqY)) Cxy = np.dot(np.transpose(np.dot(X,iX)),np.dot(Y,iY)) uv,ccs,vv = svd(Cxy) return ccs def Euclidian_mds(X,dim,verbose=0): """ returns a dim-dimensional MDS representation of the rows of X using an Euclidian metric """ d = Euclidian_distance(X) return(mds(d,dim,verbose)) def mds(dg,dim=1,verbose=0): """ Multi-dimensional scaling, i.e. derivation of low dimensional representations from distance matrices. INPUT: - dg: a (nbitem,nbitem) distance matrix - dim=1: the dimension of the desired representation - verbose=0: verbosity level """ # take the square distances and center the matrix dg = dg*dg rm0 = dg.mean(0) rm1 = dg.mean(1) mm = dg.mean() dg = dg-rm0 dg = np.transpose(np.transpose(dg)-rm1) dg = dg+mm import numpy.linalg as L U,S,V = nl.svd(dg,0) S = np.sqrt(S) chart = np.dot(U,np.diag(S)) chart = chart[:,:dim] if verbose: import matplotlib.pylab as mp mp.figure() mp.bar(np.arange(np.size(S)),S) mp.show() return chart,np.transpose(V),rm0 def isomap_dev(G,dim=1,p=300,verbose = 0): """ chart,proj,offset =isomap(G,dim=1,p=300,verbose = 0) Isomapping of the data return the dim-dimensional ISOMAP chart that best represents the graph G INPUT: - G : Weighted graph that represents the data - dim=1 : number of dimensions - p=300 : nystrom reduction of the problem - verbose = 0: verbosity level OUTPUT - chart, array of shape(G.V,dim) NOTE: - this 'dev' version is expected to yield more accurate results than the other approximation, because of a better out of samples generalization procedure. """ n = G.V dim = np.minimum(dim,n) p = np.minimum(p,n) # get the geodesic distances in the graph if p0 half space if ((X1[2]>0) & (X2[2]>0)): if X1[1]*X2[1]<0: OK[e]=0 if X1[0]*X2[0]<0: OK[e]=0 G.remove_edges(OK) return G,X def _test_isomap_orange(verbose=0): """ A test of the isomap new look using a swiss roll """ nbsamp = 1000 G,X = _orange(nbsamp) nbseed = 1000#nbsamp rdim = 3 u = isomap_dev(G,rdim,nbseed,1) check_isometry(G,u[:,:2],nseeds =100) #v = local_correction_for_embdedding(G,u[:,:2],sigma = 1.0) K = partial_floyd_graph(G,300) v = sparse_local_correction_for_embedding(K,u[:,:2],sigma = 1.0) check_isometry(G,v,nseeds =100) if verbose: import matplotlib.pylab as mp mp.figure() mp.plot(u[:,0],u[:,1],'.') mp.plot(v[:,0],v[:,1],'.r') mp.show() def _test_cca(): """ Basic (valid) test of the CCA """ X = nr.randn(100,3) Y = nr.randn(100,3) cc1 = CCA(X,Y) A = nr.randn(3,3) Z = np.dot(X,A) cc2 = CCA(X,Z) cc3 = CCA(Y,Z) test = (np.sum((cc1-cc3)**2)<1.e-7)&(np.min(cc2>1.e-7)) return test def _test_isomap_dev(): """ A test of the isomap new look using a swiss roll """ nbsamp = 1000 X,x = _swiss_roll(nbsamp) G = fg.WeightedGraph(nbsamp) G.knn(X,8) nbseed = 300 rdim = 3 u = isomap_dev(G,rdim,nbseed,1) sv = CCA(x,u[:,:3]) check_isometry(G,u[:,:2],nseeds =100) return (sv.sum()>1.9) def _test_mds(): X = nr.randn(10,3) M = MDS(X,rdim=2) u = M.train() x = X[:2,:] a = M.test(x) eps = 1.e-12 test = np.sum(a-u[:2,:])**2