#!/usr/bin/env ccp4-python #edit the sidechains to make polyala, all and reliable import copy from collections import namedtuple import logging import re import os import shutil import mmtbx.superpose from ample.util import ample_util from ample.util import pdb_edit _logger = logging.getLogger() SCORE_MATRIX_NAME = 'score.matrix' FILE_LIST_NAME = 'files.list' RMSD_MAX = 50 QSCORE_MIN = 0.01 class SubClusterer(object): """Base class for clustering pdbs by distance Sub-classes just need to provide a generate_distance_matrix class """ def __init__(self,executable=None): if executable and not os.path.exists(executable) and os.access(executable, os.X_OK): raise RuntimeError,"Cannot find subclusterer executable: {0}".format(executable) self.executable = executable self.distance_matrix = None self.index2pdb = [] return def generate_distance_matrix(self,pdb_list): assert False def cluster_by_radius(self, radius): """Return a list of pdbs clustered by the given radius""" if self.distance_matrix is None: raise RuntimeError,"Need to call generate_distance_matrix before cluster_by_radius!" cluster_indices = self._cluster_indices(radius) if cluster_indices: return [ self.index2pdb[i] for i in cluster_indices ] else: return None def _cluster_indices(self,thresh): """Return the indices of the largest cluster that have distances < thresh. We loop through each row of the distance matrix and for each row (pdb) see how many pdbs are < thresh to this pdb. We return the largest cluster. """ #self.dump_matrix("maxcluster.csv") thresh=float(thresh) max_cluster=[] m=self.distance_matrix for i in range(len(m)): cluster=[i] for j in range(len(m)): if m[i][j] is None or j==i: continue if float(m[i][j]) < thresh: cluster.append(j) if len(cluster) > len(max_cluster): max_cluster=copy.copy(cluster) if len(max_cluster) == 1: return None else: return sorted(max_cluster) def dump_raw_matrix(self,file_name): with open(file_name,'w') as f: for row in self.distance_matrix: f.write(",".join(map(str,row))+"\n") f.write("\n") return def dump_pdb_matrix(self, file_name=SCORE_MATRIX_NAME, offset=0): with open(file_name,'w') as f: l = len(self.distance_matrix) + offset for i in range(offset, l): for j in range(i, l): f.write("{0: > 4d} {1: > 4d} {2: > 8.3F}\n".format(i,j, self.distance_matrix[i-offset][j-offset])) f.write("\n") return os.path.abspath(file_name) class CctbxClusterer(SubClusterer): """Class to cluster files with maxcluster""" def generate_distance_matrix(self, pdb_list): """Run cctbx to generate the distance distance_matrix""" num_models = len(pdb_list) if not num_models: msg = "generate_distance_matrix got empty pdb_list!" logging.critical(msg) raise RuntimeError, msg # Index is just the order of the pdb in the file self.index2pdb=pdb_list # Create a square distance_matrix num_models in size filled with None self.distance_matrix = [[None for col in range(num_models)] for row in range(num_models)] # Set zeros diagonal for i, m1 in enumerate(pdb_list): fixed = mmtbx.superpose.SuperposePDB(m1, preset='ca', log=None, quiet=True) for j, m2 in enumerate(pdb_list): if j <= i: continue moving = mmtbx.superpose.SuperposePDB(m2, preset='ca', log=None, quiet=True) rmsd, lsq = moving.superpose(fixed) self.distance_matrix[i][j]=float(rmsd) # Copy in other half of matrix - we use a full matrix as it's easier to scan for clusters for x in range(len(self.distance_matrix)): for y in range(len(self.distance_matrix)): self.distance_matrix[y][x] = self.distance_matrix[x][y] return class FpcClusterer(SubClusterer): """Class to cluster files with fast_protein_clusterer""" def generate_distance_matrix(self,pdb_list): # Create list of pdb files fname = os.path.join(os.getcwd(), "files.list" ) with open( fname, 'w' ) as f: f.write( "\n".join( pdb_list )+"\n" ) # Index is just the order of the pdb in the file self.index2pdb=pdb_list # Run fast_protein_cluster - this is just to generate the distance matrix, but there # doesn't seem to be a way to stop it clustering as well - not a problem as it just # generates more files log_name = os.path.abspath("fast_protein_cluster.log") matrix_file = "fpc.matrix" cmd = [self.executable, "--cluster_write_text_matrix", matrix_file, "-i", fname] retcode = ample_util.run_command( cmd, logfile=log_name ) if retcode != 0: msg = "non-zero return code for fast_protein_cluster in generate_distance_matrix!\nCheck logfile:{0}".format(log_name) logging.critical( msg ) raise RuntimeError, msg mlen=0 data=[] with open(matrix_file) as f: for l in f: l=l.strip().split() x=int(l[0]) y=int(l[1]) d=float(l[2]) mlen=max(mlen,x+1) # +1 as we want the length data.append((x,y,d)) # create empty matrix - we use None's but this means we need to check for then when # looking through the matrix # use square matrix to make indexing easier as we're unlikely to be very big m=[[None for i in range(mlen)] for j in range(mlen)] # Fill in all values (upper triangle) for i,j,d in data: if i > j: m[j][i]=d else: m[i][j]=d # Copy to lower for x in range(mlen): for y in range(mlen): if x==y: continue m[y][x] = m[x][y] self.distance_matrix=m return class GesamtClusterer(SubClusterer): """Class to cluster files with Gesamt""" def generate_distance_matrix(self, models, purge=True, purge_all=False, metric='qscore', nproc=1): # Make sure all the files are in the same directory otherwise we wont' work mdir = os.path.dirname(models[0]) if not all([ os.path.dirname(p) == mdir for p in models ]): raise RuntimeError("All pdb files are not in the same directory!") # Create list of pdb files fname = os.path.join(os.getcwd(), FILE_LIST_NAME) with open(fname, 'w') as f: f.write("\n".join(models)+"\n") # Index is just the order of the pdb in the file self.index2pdb = models nmodels = len(models) # Make the archive _logger.debug("Generating gesamt archive from models in directory {0}".format(mdir)) garchive = 'gesamt.archive' if not os.path.isdir(garchive): os.mkdir(garchive) logfile = os.path.abspath('gesamt_archive.log') cmd = [ self.executable, '--make-archive', garchive, '-pdb', mdir ] #cmd += [ '-nthreads=auto' ] cmd += [ '-nthreads={0}'.format(nproc) ] # HACK FOR DYLD!!!! env = None #env = {'DYLD_LIBRARY_PATH' : '/opt/ccp4-devtools/install/lib'} rtn = ample_util.run_command(cmd, logfile,env = env ) if rtn != 0: raise RuntimeError("Error running gesamt - check logfile: {0}".format(logfile)) if purge_all: os.unlink(logfile) # Now loop through each file creating the matrix if metric == 'rmsd': parity = 0.0 elif metric == 'qscore': parity = 1 else: raise RuntimeError("Unrecognised metric: {0}".format(metric)) m = [[parity for _ in range(nmodels)] for _ in range(nmodels)] for i, model in enumerate(models): mname = os.path.basename(model) gesamt_out = '{0}_gesamt.out'.format(mname) logfile = '{0}_gesamt.log'.format(mname) cmd = [ self.executable, model, '-archive', garchive, '-o', gesamt_out ] cmd += [ '-nthreads={0}'.format(nproc) ] rtn = ample_util.run_command(cmd, logfile) if rtn != 0: raise RuntimeError("Error running gesamt!") else: if purge: os.unlink(logfile) gdata = self.parse_gesamt_out(gesamt_out) assert gdata[0].file_name == mname, gdata[0].file_name + " " + mname score_dict = { g.file_name: (g.rmsd, g.q_score) for g in gdata } for j in range(i + 1, nmodels): # Try and get the rmsd and qscore for this model. If it's missing we assume the model was # too divergent for gesamt to find it and we set the rmsd and qscore to fixed values model2 = os.path.basename(models[j]) try: rmsd, qscore = score_dict[model2] except KeyError: rmsd = RMSD_MAX qscore = QSCORE_MIN if metric == 'rmsd': score = rmsd elif metric == 'qscore': score = qscore else: raise RuntimeError("Unrecognised metric: {0}".format(metric)) m[i][j] = score if purge_all: os.unlink(gesamt_out) # Copy upper half of matrix to lower for x in range(nmodels): for y in range(nmodels): if x == y: continue m[y][x] = m[x][y] self.distance_matrix = m # Remove the gesamt archive if purge: shutil.rmtree(garchive) # Write out the matrix in a form spicker can use self.dump_pdb_matrix(SCORE_MATRIX_NAME) return def parse_gesamt_out(self, out_file): # Assumption is there are no pdb_codes GesamtData = namedtuple('GesamtData', ['count', 'chain_id', 'q_score', 'rmsd', 'seq_id', 'nalign', 'nres', 'file_name']) data = [] with open(out_file) as f: for i, line in enumerate(f): if i < 2: continue # First 2 lines are headers if not line.strip(): continue # ignore blanks try: tmp = GesamtData(*line.split()) # Convert from strings to correct types data.append(GesamtData(int(tmp.count), tmp.chain_id, float(tmp.q_score), float(tmp.rmsd), tmp.seq_id, int(tmp.nalign), int(tmp.nres), os.path.basename(tmp.file_name))) except Exception as e: msg = 'Error parsing line {0}: {1}\n{2}'.format(i, line, e.message) logging.critical(msg) raise e assert len(data),"Failed to read any data!" return data class LsqkabClusterer(SubClusterer): """Class to cluster files with Lsqkab""" def calc_rmsd(self, model1, model2, nresidues=None, logfile='lsqkab.out', purge=False): if not nresidues: _, nresidues = pdb_edit.num_atoms_and_residues(model1, first=True) stdin = """FIT RESIDUE CA 1 TO {0} CHAIN {1} MATCH 1 to {0} CHAIN {1} output RMS end""".format(nresidues, 'A') cmd = [ 'lsqkab', 'XYZINM', model1, 'XYZINF', model2 ] ample_util.run_command(cmd, logfile=logfile, stdin=stdin) rmsd = self.parse_lsqkab_output(logfile) # cleanup if purge: os.unlink(logfile) os.unlink('RMSTAB') return rmsd def generate_distance_matrix(self, models, purge=True, metric='qscore'): # Index is just the order of the pdb in the file self.index2pdb = models num_models = len(models) # Assume all models are the same size and only have a single chain # We also assume that the chain is called 'A' (not relevant here) _, nresidues = pdb_edit.num_atoms_and_residues(models[0], first=True) # Create a square distance_matrix no_models in size filled with None self.distance_matrix = [[None for col in range(num_models)] for row in range(num_models)] logfile='lsqkab.out' parity = 0.0 for i, fixed in enumerate(models): for j, model2 in enumerate(models): if j < i: continue if j == i: rmsd = parity elif j > i: rmsd = self.calc_rmsd(fixed, model2, nresidues=nresidues, logfile=logfile) self.distance_matrix[i][j] = rmsd # Clean up output files from lsqkab os.unlink(logfile) os.unlink('RMSTAB') # Copy in other half of matrix - we use a full matrix as it's easier to scan for clusters for x in range(len(self.distance_matrix)): for y in range(len(self.distance_matrix)): self.distance_matrix[y][x] = self.distance_matrix[x][y] return def parse_lsqkab_output(self, output_file): with open(output_file) as f: for l in f.readlines(): if l.startswith(" RMS XYZ DISPLACEMENT ="): return float(l.split()[4]) assert False class MaxClusterer(SubClusterer): """Class to cluster files with maxcluster""" def generate_distance_matrix(self, pdb_list): """Run maxcluster to generate the distance distance_matrix""" no_models = len( pdb_list ) if not no_models: msg = "generate_distance_matrix got empty pdb_list!" logging.critical( msg ) raise RuntimeError, msg self.index2pdb=[0]*no_models # Maxcluster arguments # -l [file] File containing a list of PDB model fragments # -L [n] Log level (default is 4 for single MaxSub, 1 for lists) # -d [f] The distance cut-off for search (default auto-calibrate) # -bb Perform RMSD fit using backbone atoms # -C [n] Cluster method: 0 - No clustering # -rmsd ??? #os.system(MAX + ' -l list -L 4 -rmsd -d 1000 -bb -C0 >MAX_LOG ') #print 'MAX Done' # Create the list of files for maxcluster fname = os.path.join(os.getcwd(), FILE_LIST_NAME ) with open( fname, 'w' ) as f: f.write( "\n".join( pdb_list )+"\n" ) #log_name = "maxcluster_radius_{0}.log".format(radius) log_name = os.path.abspath("maxcluster.log") cmd = [ self.executable, "-l", fname, "-L", "4", "-rmsd", "-d", "1000", "-bb", "-C0" ] retcode = ample_util.run_command( cmd, logfile=log_name ) if retcode != 0: msg = "non-zero return code for maxcluster in generate_distance_matrix!\nSee logfile: {0}".format(log_name) logging.critical( msg ) raise RuntimeError, msg # Create a square distance_matrix no_models in size filled with None parity = 0 self.distance_matrix = [[parity for col in range(no_models)] for row in range(no_models)] #jmht Save output for parsing - might make more sense to use one of the dedicated maxcluster output formats #max_log = open(cur_dir+'/MAX_LOG') max_log = open( log_name, 'r') pattern = re.compile('INFO \: Model') for line in max_log: if re.match(pattern, line): # Split so that we get a list with # 0: model 1 index # 1: path to model 1 without .pdb suffix # 2: model 2 index # 3: path to model 2 without .pdb suffix # 4: distance metric split = re.split('INFO \: Model\s*(\d*)\s*(.*)\.pdb\s*vs\. Model\s*(\d*)\s*(.*)\.pdb\s*=\s*(\d*\.\d*)', line) self.distance_matrix[ int(split[1]) -1 ][ int(split[3]) -1] = float(split[5]) if split[2]+'.pdb' not in self.index2pdb: self.index2pdb[int(split[1]) -1] = split[2]+'.pdb' if split[4]+'.pdb' not in self.index2pdb: self.index2pdb[int(split[3]) -1] = split[4]+'.pdb' # Copy in other half of matrix - we use a full matrix as it's easier to scan for clusters for x in range(len(self.distance_matrix)): for y in range(len(self.distance_matrix)): self.distance_matrix[y][x] = self.distance_matrix[x][y] return