""" Bayesian Gaussian Mixture Model Classes: contains the basic fields and methods of Bayesian GMMs the high level functions are/should be binded in C The base class BGMM relies on an implementation that perfoms Gibbs sampling A derived class VBGMM uses Variational Bayes inference instead A third class is introduces to take advnatge of the old C-bindings, but it is limited to diagonal covariance models fixme: the docs should be rewritten Author : Bertrand Thirion, 2008-2009 """ import numpy as np import numpy.random as nr import nipy.neurospin.clustering.clustering as fc from gmm import GMM # -------------------------------------------- # --- ancillary functions -------------------- # -------------------------------------------- #fixme : this might be put elsewehere def dirichlet_eval(w,alpha): """ Evaluate the probability of a certain discrete draw w from the Dirichlet density with parameters alpha Parameters ---------- w: array of shape (n) alpha: array of shape (n) FIXME : check that the dimensions of x and alpha are compatible """ from scipy.special import gammaln if np.shape(w)!=np.shape(alpha): raise ValueError , "incompatible dimensions" loge = np.sum((alpha-1)*np.log(w)) logb = np.sum(gammaln(alpha))-gammaln(alpha.sum()) loge-= logb return np.exp(loge) def generate_normals(m,P): """ Generate a Gaussian sample with mean m and precision P Parameters ---------- m array of shape n: the mean vector P array of shape (n,n): the precision matrix Returns ------- ng : array of shape(n): a draw from the gaussian density """ from numpy.linalg import cholesky,inv L = inv(cholesky(P)) ng = nr.randn(m.shape[0]) ng = np.dot(ng,L) ng += m return ng def generate_Wishart(n,V): """ Generate a sample from Wishart Parameters ---------- n (scalar) = the number of degrees of freedom (dofs) V = array of shape (n,n) the scale matrix Returns ------- W: array of shape (n,n): the Wishart draw """ from numpy.linalg import cholesky L = cholesky(V) p = V.shape[0] A = nr.randn(p,p) a = np.array([np.sqrt(nr.chisquare(n-i)) for i in range(p)]) for i in range(p): A[i,i:] = 0 A[i,i] = a[i] R = np.dot(L,A) W = np.dot(R,R.T) return W def Wishart_eval(n,V,W): """ Evaluation of the probability of W under Wishart(n,V) Parameters ---------- n (scalar) = the number of degrees of freedom (dofs) V = array of shape (n,n) the scale matrix W: array of shape (n,n): the Wishart draw Returns ------- (float) the density """ # check that shape(V)==shape(W) from numpy.linalg import det,pinv from scipy.special import gammaln p = V.shape[0] ldW = np.log(det(W))*(n-p-1)/2 ltr = - np.trace(np.dot(pinv(V),W))/2 la = ( n*p*np.log(2) + np.log(det(V))*n )/2 lg = np.log(np.pi)*p*(p-1)/4 for j in range(p): lg += gammaln((n-j)/2) lt = ldW + ltr -la -lg return np.exp(lt) def normal_eval(mu,P,x): """ Probability of x under normal(mu,inv(P)) Parameters ---------- mu: array of shape (n): the mean parameter P: array of shape (n,n): the precision matrix x: array of shape (n): the data to be evaluated Returns ------- (float) the density """ from numpy.linalg import det p = np.size(mu) mu = np.reshape(mu,(1,p)) w0 = np.log(det(P/2/np.pi)) w0/=2 x = np.reshape(x,(1,p)) q = np.dot(np.dot(mu-x,P),np.transpose(mu-x)) w = w0 - q/2 L = np.exp(w) return np.squeeze(L) def generate_perm(k,nperm=100): """ returns an array of shape(nbperm, k) representing the permutations of k elements - nperm=100 if k>5: only nperm random draws are generated Returns ------- p: array of shape(nperm,k): each row is permutation of k """ if k==1: return 
def generate_perm(k, nperm=100):
    """
    returns an array of shape (nperm, k) representing
    the permutations of k elements

    Parameters
    ----------
    k: the number of elements to be permuted
    nperm=100: if k>5, only nperm random permutations are generated

    Returns
    -------
    p: array of shape (nperm, k): each row is a permutation of [0..k-1]
    """
    if k==1:
        return np.reshape(np.array([0]),(1,1)).astype(np.int)
    if k<6:
        aux = generate_perm(k-1)
        n = aux.shape[0]
        perm = np.zeros((n*k,k)).astype(np.int)
        for i in range(k):
            perm[i*n:(i+1)*n,:i] = aux[:,:i]
            perm[i*n:(i+1)*n,i] = k-1
            perm[i*n:(i+1)*n,i+1:] = aux[:,i:]
    else:
        from numpy.random import rand
        perm = np.zeros((nperm,k)).astype(np.int)
        for i in range(nperm):
            perm[i,:] = np.argsort(rand(k))
    return perm


def apply_perm(perm, z):
    """
    Permutation of the values of z
    """
    z0 = perm[z.copy()]
    return z0


def multinomial(Likelihood):
    """
    Generate random draws from multinomial densities

    Parameters
    ----------
    Likelihood: array of shape (nelements, nclasses):
                likelihood of each element belonging to each class;
                each row is assumed to sum to 1.
                One sample is drawn from each row.

    Returns
    -------
    z: array of shape (nelements): the draws,
       that take values in [0..nclasses-1]
    """
    nvox = Likelihood.shape[0]
    nclasses = Likelihood.shape[1]
    cuml = np.zeros((nvox,nclasses+1))
    cuml[:,1:] = np.cumsum(Likelihood,1)
    aux = np.random.rand(nvox,1)
    z = np.argmax(aux<cuml,1)-1
    return z


class BGMM(GMM):
    """
    This class implements Bayesian GMMs whose parameters are
    estimated through Gibbs sampling.

    Besides the fields inherited from GMM (k, dim, means, precisions,
    weights), instances hold the parameters of the conjugate priors
    (prior_weights, prior_means, prior_shrinkage, prior_dof, prior_scale)
    and the corresponding posterior quantities (shrinkage, dof)
    that are used by the update functions below.
    """

    def initialize(self, x):
        """
        initialize z using a k-means algorithm, then update the parameters

        Parameters
        ----------
        x: array of shape (nbitems,self.dim)
           the data used in the estimation process
        """
        if self.k>1:
            cent,z,J = fc.cmeans(x,self.k)
        else:
            z = np.zeros(x.shape[0]).astype(np.int)
        self.update(x,z)

    def pop(self, z):
        """
        compute the population, i.e. the statistics of allocation

        Parameters
        ----------
        z: array of shape (nbitems), type = np.int
           the allocation variable

        Returns
        -------
        hist : array of shape (self.k), the count variable
        """
        hist = np.array([np.sum(z==k) for k in range(self.k)])
        return hist

    def update_weights(self, z):
        """
        Given the allocation vector z, resample the weights parameter

        Parameters
        ----------
        z: array of shape (nbitems), type = np.int
           the allocation variable
        """
        pop = self.pop(z)
        weights = pop + self.prior_weights
        self.weights = np.random.dirichlet(weights)

    def update_means(self, x, z):
        """
        Given the allocation vector z, and the corresponding data x,
        resample the means

        Parameters
        ----------
        x: array of shape (nbitems,self.dim)
           the data used in the estimation process
        z: array of shape (nbitems), type = np.int
           the corresponding classification
        """
        pop = self.pop(z)
        self.shrinkage = self.prior_shrinkage + pop
        empmeans = np.zeros(np.shape(self.means))
        prior_shrinkage = np.reshape(self.prior_shrinkage,(self.k,1))
        shrinkage = np.reshape(self.shrinkage,(self.k,1))
        for k in range(self.k):
            empmeans[k] = np.sum(x[z==k],0)
        means = empmeans + self.prior_means*prior_shrinkage
        means /= shrinkage
        for k in range(self.k):
            self.means[k] = generate_normals(
                means[k], self.precisions[k]*self.shrinkage[k])
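    # A worked sketch of the conjugate update performed by update_means,
    # written out for one component k (n_k denotes the number of points
    # currently allocated to component k):
    #
    #   shrinkage_k = prior_shrinkage_k + n_k
    #   mean_k = (prior_shrinkage_k * prior_mean_k + sum_{i: z_i=k} x_i)
    #            / shrinkage_k
    #   means[k] ~ Normal(mean_k, precision = shrinkage_k * precisions[k])
    #
    # i.e. each posterior mean is a shrinkage-weighted compromise between
    # the prior mean and the empirical sum of the component's points.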
    def update_precisions(self, x, z):
        """
        Given the allocation vector z, and the corresponding data x,
        resample the precisions

        Parameters
        ----------
        x: array of shape (nbitems,self.dim)
           the data used in the estimation process
        z: array of shape (nbitems), type = np.int
           the corresponding classification
        """
        from numpy.linalg import pinv
        pop = self.pop(z)
        self.dof = self.prior_dof + pop + 1

        # compute the empirical means and covariances
        empmeans = np.zeros(np.shape(self.means))
        for k in range(self.k):
            empmeans[k] = np.sum(x[z==k],0)
        rpop = (pop+(pop==0)).astype('f')
        empmeans = (empmeans.T/rpop).T
        empcov = np.zeros(np.shape(self.precisions))
        for k in range(self.k):
            dx = np.reshape(x[z==k]-empmeans[k],(pop[k],self.dim))
            empcov[k] += np.dot(dx.T,dx)

        # posterior scale matrices
        covariance = np.array([pinv(self.prior_scale[k])
                               for k in range(self.k)])
        covariance += empcov
        dx = np.reshape(empmeans-self.prior_means,(self.k,self.dim,1))
        addcov = np.array([np.dot(dx[k],dx[k].T) for k in range(self.k)])
        prior_shrinkage = np.reshape(self.prior_shrinkage,(self.k,1,1))
        covariance += addcov*prior_shrinkage
        scale = np.array([pinv(covariance[k]) for k in range(self.k)])
        for k in range(self.k):
            self.precisions[k] = generate_Wishart(self.dof[k],scale[k])

    def update(self, x, z):
        """
        update function (draw a sample of the GMM parameters)

        Parameters
        ----------
        x: array of shape (nbitems,self.dim)
           the data used in the estimation process
        z: array of shape (nbitems), type = np.int
           the corresponding classification
        """
        self.update_weights(z)
        self.update_precisions(x,z)
        self.update_means(x,z)

    def sample_indicator(self, L):
        """
        sample the indicator from the likelihood

        Parameters
        ----------
        L: array of shape (nbitem,self.k)
           component-wise likelihood

        Returns
        -------
        z: array of shape (nbitem): a draw of the membership variable
        """
        tiny = 1 + 1.e-15
        L = np.transpose(np.transpose(L)/L.sum(1))
        L /= tiny
        z = multinomial(L)
        return z

    def sample(self, x, niter=1, mem=0, verbose=0):
        """
        sample the indicator and parameters

        Parameters
        ----------
        x: array of shape (nbitems,self.dim)
           the data used in the estimation process
        niter=1 : the number of iterations to perform
        mem=0: if mem, the best values of the parameters are kept
        verbose=0: verbosity mode

        Returns
        -------
        best_weights: array of shape (self.k)
        best_means: array of shape (self.k,self.dim)
        best_precisions: array of shape (self.k,self.dim,self.dim)
        possibleZ: array of shape (nbitems,niter)
                   the z that give the highest posterior
                   to the data is returned first
        """
        self.check_x(x)
        if mem:
            possibleZ = -np.ones((x.shape[0],niter)).astype(np.int)

        score = -np.infty
        bpz = -np.infty
        for i in range(niter):
            L = self.likelihood(x)
            sll = np.mean(np.log(np.sum(L,1)))
            sll += np.log(self.probability_under_prior())
            if sll>score:
                score = sll
                best_weights = self.weights.copy()
                best_means = self.means.copy()
                best_precisions = self.precisions.copy()

            z = self.sample_indicator(L)
            if mem:
                possibleZ[:,i] = z
            puz = sll  # to save time
            #puz = self._posterior_under_z(x,z)
            self.update(x,z)
            if puz>bpz:
                ibz = i
                bpz = puz

        if mem:
            aux = possibleZ[:,0].copy()
            possibleZ[:,0] = possibleZ[:,ibz].copy()
            possibleZ[:,ibz] = aux
        return best_weights, best_means, best_precisions, possibleZ

    def sample_and_average(self, x, niter=1, verbose=0):
        """
        sample the indicator and parameters;
        the average values for weights, means, precisions are returned

        Parameters
        ----------
        x: array of shape (nbitems,dim)
           the data used in the estimation process
        niter=1: number of iterations

        Returns
        -------
        weights: array of shape (self.k)
        means: array of shape (self.k,self.dim)
        precisions: array of shape (self.k,self.dim,self.dim)
                    or (self.k, self.dim)
                    these are the average parameters across samplings

        Note
        ----
        All this makes sense only if no label switching has occurred,
        so this is wrong in general (asymptotically)

        fix: implement a permutation procedure for components identification
        """
        aprec = np.zeros(np.shape(self.precisions))
        aweights = np.zeros(np.shape(self.weights))
        ameans = np.zeros(np.shape(self.means))
        for i in range(niter):
            L = self.likelihood(x)
            z = self.sample_indicator(L)
            self.update(x,z)
            aprec += self.precisions
            aweights += self.weights
            ameans += self.means
        aprec /= niter
        ameans /= niter
        aweights /= niter
        return aweights, ameans, aprec

    def probability_under_prior(self):
        """
        Compute the probability of the current parameters of self
        given the priors
        """
        p0 = dirichlet_eval(self.weights, self.prior_weights)
        for k in range(self.k):
            mp = self.precisions[k]*self.prior_shrinkage[k]
            p0 *= normal_eval(self.prior_means[k], mp, self.means[k])
            p0 *= Wishart_eval(self.prior_dof[k], self.prior_scale[k],
                               self.precisions[k])
        return p0
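    # Hypothetical usage sketch of the Gibbs sampler implemented above.
    # The prior-setting call (guess_priors) and the data array are
    # assumptions made only for illustration, not part of this section:
    #
    #   x = np.random.randn(1000, 2)        # some (nbitems, dim) data
    #   bg = BGMM(k=3, dim=2)
    #   bg.guess_priors(x)                  # assumed prior-setting helper
    #   bg.initialize(x)                    # k-means start, then one update
    #   w, m, P, z = bg.sample(x, niter=100, mem=1)
    #   bf = bg.Bfactor(x, z)               # Chib's evidence estimate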
    def conditional_posterior_proba(self, x, z):
        """
        Compute the probability of the current parameters of self
        given x and z

        Parameters
        ----------
        x: array of shape (nbitems,dim)
           the data used in the estimation process
        z: array of shape (nbitems), type = np.int
           the corresponding classification
        """
        pop = self.pop(z)

        #0. compute the empirical means
        empmeans = np.zeros(np.shape(self.means))
        for k in range(self.k):
            empmeans[k] = np.sum(x[z==k],0)
        rpop = (pop+(pop==0)).astype(np.float)
        empmeans = (empmeans.T/rpop).T

        #1. the precisions
        dof = self.prior_dof + pop + 1
        empcov = np.zeros(np.shape(self.precisions))
        for k in range(self.k):
            dx = np.reshape(x[z==k]-empmeans[k],(pop[k],self.dim))
            empcov[k] += np.dot(dx.T,dx)

        from numpy.linalg import pinv
        covariance = np.array([pinv(self.prior_scale[k])
                               for k in range(self.k)])
        covariance += empcov
        dx = np.reshape(self.means-self.prior_means,(self.k,self.dim,1))
        addcov = np.array([np.dot(dx[k],dx[k].T) for k in range(self.k)])
        prior_shrinkage = np.reshape(self.prior_shrinkage,(self.k,1,1))
        covariance += addcov*prior_shrinkage
        scale = np.array([pinv(covariance[k]) for k in range(self.k)])

        #2. the means
        empmeans = (empmeans.T*rpop).T
        shrinkage = self.prior_shrinkage + pop
        prior_shrinkage = np.reshape(self.prior_shrinkage,(self.k,1))
        shrinkage = np.reshape(shrinkage,(self.k,1))
        means = empmeans + self.prior_means*prior_shrinkage
        means /= shrinkage

        #3. the weights
        weights = np.array([np.sum(z==k) for k in range(len(self.weights))])
        weights += self.prior_weights

        #4. evaluate the posteriors
        pp = dirichlet_eval(self.weights, weights)
        for k in range(self.k):
            pp *= Wishart_eval(dof[k], scale[k], self.precisions[k])
        for k in range(self.k):
            #mp = self.precisions[k]*shrinkage[k]
            mp = scale[k]*shrinkage[k]
            pp *= normal_eval(means[k], mp, self.means[k])
        return pp

    def evidence(self, x, z, nperm=0, verbose=0):
        """
        See Bfactor(self, x, z, nperm=0, verbose=0)
        """
        return self.Bfactor(x, z, nperm, verbose)
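    # The quantity computed by Bfactor below follows Chib's identity for the
    # marginal likelihood; written as a hedged reading of the code, with
    # theta* the parameters currently held by self and p_hat the
    # permutation-averaged value of conditional_posterior_proba:
    #
    #   log m(x) = log p(x | theta*) + log p(theta*) - log p_hat(theta* | x)
    #
    # The permutations of the component labels are needed because the
    # posterior is invariant under relabelling, so p_hat has to be
    # symmetrized before being plugged into the identity.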
90, 1995 """ niter = z.shape[1] p = [] perm = generate_perm(self.k) if nperm>perm.shape[0]: nperm = perm.shape[0] for i in range(niter): if nperm==0: for j in range(perm.shape[0]): pz = apply_perm(perm[j],z[:,i]) temp = self.conditional_posterior_proba(x,pz) p.append(temp) else: drand = np.argsort(np.random.rand(perm.shape[0]))[:nperm] for j in drand: pz = apply_perm(perm[j],z[:,i]) temp = self.conditional_posterior_proba(x,pz) p.append(temp) p = np.array(p) mp = np.mean(p) p0 = self.probability_under_prior() L = self.likelihood(x) bf = np.log(p0) + np.sum(np.log(np.sum(L,1)))- np.log(mp) if verbose: print np.log(p0), np.sum(np.log(np.sum(L,1))), np.log(mp) return bf def _posterior_under_z(self,x,z=None): """ return the integrated likelihood of the data Using Wishart-Normal model the values are not weighted by the component weights deprecated """ from numpy.linalg import det,inv from scipy.special import gammaln if z==None: z = np.zeros(x.shape[0],np.int) w=0 for k in range(self.k): x = x[z==k,:] if (np.size(x)>0): n = x.shape[0] a = self.prior_dof[k] tau = self.prior_shrinkage[k] tau /= (n+tau) w0 = - n*np.log(np.pi)*self.dim w0 += np.log(tau)*self.dim w0/=2 for j in range(self.dim): w0 += gammaln((a+n-j)/2) w0 -= gammaln((a-j)/2) m = np.reshape(self.prior_means[k],(1,self.dim)) b = self.prior_scale[k] ib = inv(b) Q = np.dot(np.transpose(x),x) M = np.reshape(np.sum(x,0),(1,self.dim)) f = ib + Q + n*tau*np.dot(np.transpose(m),m) f -= np.dot(np.transpose(M),M)/(self.prior_shrinkage[k]+n) f -= tau* (np.dot(np.transpose(m),M)+np.dot(np.transpose(M),m)) w0 += a*np.log(det(ib))/2 w0 -= (a+n)*np.log(det(f))/2 w += w0 return w # --------------------------------------------------------- # --- Variational Bayes inference ------------------------- # --------------------------------------------------------- class VBGMM(BGMM): """ Particular subcalss of Bayesian GMMs (BGMM) that implements Variational bayes estimation of the parameters """ def __init__(self, k=1, dim=1, means=None, precisions=None, weights=None, shrinkage=None, dof=None): BGMM.__init__(self, k, dim, means, precisions, weights,shrinkage, dof) self.scale = self.precisions.copy() def _Estep(self,x): """ VB-E step returns the likelihood of the data for each class Parameters ---------- x array of shape (nbitems,dim) the data used in the estimation process Returns ------- L array of shape(nbitem,self.k) component-wise likelihood """ n = x.shape[0] L = np.zeros((n,self.k)) from scipy.special import psi from numpy.linalg import det spsi = psi(np.sum(self.weights)) for k in range(self.k): # compute the data-independent factor first w0 = psi(self.weights[k])-spsi w0 += 0.5*np.log(det(self.scale[k])) w0 -= self.dim*0.5/self.shrinkage[k] w0 += 0.5*np.log(2)*self.dim for i in range (self.dim): w0 += 0.5*psi((self.dof[k]-i)/2) m = np.reshape(self.means[k],(1,self.dim)) b = self.dof[k]*self.scale[k] q = np.sum(np.dot(m-x,b)*(m-x),1) w = w0 - q/2 w -= 0.5*np.log(2*np.pi)*self.dim L[:,k] = np.exp(w) if L.min()<0: stop return L def evidence(self,x,L = None,verbose=0): """ computation of evidence or integrated likelihood Parameters ---------- x array of shape (nbitems,dim) the data from which bic is computed l=None: array of shape (nbitem,self.k) component-wise likelihood If None, it is recomputed verbose=0: verbosity model Returns ------- ev (float) the computed evidence """ from scipy.special import psi from numpy.linalg import det,pinv tiny = 1.e-15 if L==None: L = self._Estep(x) L = (L.T/np.maximum(L.sum(1),tiny)).T pop = L.sum(0)[:self.k] pop = 
    def evidence(self, x, L=None, verbose=0):
        """
        computation of the evidence (integrated likelihood)

        Parameters
        ----------
        x: array of shape (nbitems,dim)
           the data used in the estimation process
        L=None: array of shape (nbitem,self.k)
                component-wise likelihood;
                if None, it is recomputed
        verbose=0: verbosity mode

        Returns
        -------
        ev (float) the computed evidence
        """
        from scipy.special import psi
        from numpy.linalg import det, pinv
        tiny = 1.e-15
        if L is None:
            L = self._Estep(x)
        L = (L.T/np.maximum(L.sum(1),tiny)).T
        pop = L.sum(0)[:self.k]
        pop = np.reshape(pop,(self.k,1))
        spsi = psi(np.sum(self.weights))
        empmeans = np.dot(L.T[:self.k],x)/np.maximum(pop,tiny)

        F = 0
        # start with the average likelihood term
        for k in range(self.k):
            # compute the data-independent factor first
            Lav = psi(self.weights[k]) - spsi
            Lav -= np.sum(L[:,k]*np.log(np.maximum(L[:,k],tiny)))/pop[k]
            Lav -= 0.5*self.dim*np.log(2*np.pi)
            Lav += 0.5*np.log(det(self.scale[k]))
            Lav += 0.5*np.log(2)*self.dim
            for i in range(self.dim):
                Lav += 0.5*psi((self.dof[k]-i)/2)
            Lav -= self.dim*0.5/self.shrinkage[k]
            Lav *= pop[k]
            empcov = np.zeros((self.dim,self.dim))
            dx = x-empmeans[k]
            empcov = np.dot(dx.T,L[:,k:k+1]*dx)
            Lav -= 0.5*np.trace(np.dot(empcov,self.scale[k]*self.dof[k]))
            F += Lav

        # then the KL divergences
        prior_covariance = np.array([pinv(self.prior_scale[k])
                                     for k in range(self.k)])
        covariance = np.array([pinv(self.scale[k]) for k in range(self.k)])
        Dklw = 0
        Dklg = 0
        Dkld = dkl_dirichlet(self.weights,self.prior_weights)
        for k in range(self.k):
            Dklw += dkl_wishart(self.dof[k],covariance[k],
                                self.prior_dof[k],prior_covariance[k])
            nc = self.scale[k]*(self.dof[k]*self.shrinkage[k])
            nc0 = self.scale[k]*(self.dof[k]*self.prior_shrinkage[k])
            Dklg += dkl_gaussian(self.means[k],nc,self.prior_means[k],nc0)
        Dkl = Dkld + Dklg + Dklw
        if verbose:
            print 'Lav', F, 'Dkl', Dkld, Dklg, Dklw
        return F-Dkl

    def _Mstep(self, x, L):
        """
        VB-M step

        Parameters
        ----------
        x: array of shape (nbitem,self.dim)
           the data from which the model is estimated
        L: array of shape (nbitem,self.k)
           the likelihood of the data under each class
        """
        from numpy.linalg import pinv
        tiny = 1.e-15
        pop = L.sum(0)

        # shrinkage, weights, dof
        self.weights = self.prior_weights + pop
        pop = pop[0:self.k]
        L = L[:,:self.k]
        self.shrinkage = self.prior_shrinkage + pop
        self.dof = self.prior_dof + pop

        # reshape
        pop = np.reshape(pop,(self.k,1))
        prior_shrinkage = np.reshape(self.prior_shrinkage,(self.k,1))
        shrinkage = np.reshape(self.shrinkage,(self.k,1))

        # means
        means = np.dot(L.T,x) + self.prior_means*prior_shrinkage
        self.means = means/shrinkage

        # precisions
        empmeans = np.dot(L.T,x)/np.maximum(pop,tiny)
        empcov = np.zeros(np.shape(self.prior_scale))
        for k in range(self.k):
            dx = x-empmeans[k]
            empcov[k] = np.dot(dx.T,L[:,k:k+1]*dx)

        covariance = np.array([pinv(self.prior_scale[k])
                               for k in range(self.k)])
        covariance += empcov
        dx = np.reshape(empmeans-self.prior_means,(self.k,self.dim,1))
        addcov = np.array([np.dot(dx[k],dx[k].T) for k in range(self.k)])
        apms = np.reshape(prior_shrinkage*pop/shrinkage,(self.k,1,1))
        covariance += addcov*apms
        self.scale = np.array([pinv(covariance[k]) for k in range(self.k)])

        # fixme : compute the MAP of the precisions
        # (not used, but for completeness and interpretation)

    def initialize(self, x):
        """
        initialize z using a k-means algorithm, then update the parameters

        Parameters
        ----------
        x: array of shape (nbitems,self.dim)
           the data used in the estimation process
        """
        n = x.shape[0]
        if self.k>1:
            cent,z,J = fc.cmeans(x,self.k)
        else:
            z = np.zeros(x.shape[0]).astype(np.int)
        l = np.zeros((n,self.k))
        l[np.arange(n),z] = 1
        self._Mstep(x,l)

    def map_label(self, x, L=None):
        """
        return the MAP labelling of x

        Parameters
        ----------
        x: array of shape (nbitem,dim)
           the data under study
        L=None: array of shape (nbitem,self.k)
                component-wise likelihood;
                if None, it is recomputed

        Returns
        -------
        z: array of shape (nbitem): the resulting MAP labelling
           of the rows of x
        """
        if L is None:
            L = self.likelihood(x)
        z = np.argmax(L,1)
        return z
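    # Hypothetical end-to-end usage of the VB estimator defined below.
    # The prior-setting call (guess_priors) and the data array are
    # assumptions used only to make the sequence concrete:
    #
    #   x = np.random.randn(1000, 2)      # some (nbitem, dim) data
    #   vb = VBGMM(k=3, dim=2)
    #   vb.guess_priors(x)                # assumed prior-setting helper
    #   vb.initialize(x)                  # k-means start, then one M-step
    #   vb.estimate(x, niter=100)         # E/M alternation until convergence
    #   z = vb.map_label(x)               # MAP labelling of the data
    #   ev = vb.evidence(x)               # variational lower bound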
    def estimate(self, x, niter=100, delta=1.e-4, verbose=0):
        """
        estimation of self given x

        Parameters
        ----------
        x: array of shape (nbitem,dim)
           the data from which the model is estimated
        niter=100: maximal number of iterations in the estimation process
        delta = 1.e-4: increment of data likelihood at which
                       convergence is declared
        verbose=0: verbosity mode
        """
        # alternation of E/M steps until convergence
        tiny = 1.e-15
        cc = np.zeros(np.shape(self.means))
        allOld = -np.infty
        for i in range(niter):
            cc = self.means.copy()
            L = self._Estep(x)
            all = np.mean(np.log(np.maximum(np.sum(L,1),tiny)))
            if all < allOld + delta:
                # the average log-likelihood no longer increases: stop
                if verbose:
                    print 'converged at iteration', i, 'log-likelihood', all
                break
            allOld = all
            self._Mstep(x,L)