"""
Created on Apr 18, 2013

@author: jmht
"""

# system imports
import collections
import copy
import logging
import os
import re
import shutil

# our imports
from ample.ensembler.constants import ALLATOM, POLYALA, RELIABLE, UNMODIFIED
from ample.ensembler import cluster_util
from ample.ensembler import subcluster_util
from ample.ensembler import truncation_util
from ample.util import ample_util
from ample.util import pdb_edit
from ample.util import sequence_util
from ample.util import subcluster
from ample.util import theseus

_logger = logging.getLogger(__name__)

# Data structure to store residue information
ScoreVariances = collections.namedtuple("ScoreVariances", ["idx", "resSeq", "variance"])


def model_core_from_fasta(models, alignment_file, work_dir=None, case_sensitive=False):
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    # Read in the alignment
    align_seq = sequence_util.Sequence(fasta=alignment_file)
    # All sequences in the alignment are assumed to be the same length

    # Get the pdb names from the alignment headers
    seq_names = [h[1:].strip() for h in align_seq.headers]

    # If the alignment file is from gesamt, the names have the chain names appended in brackets
    for i, s in enumerate(seq_names):
        x = re.search(r"\([a-zA-Z]*\)$", s)
        if x:
            seq_names[i] = s.replace(x.group(0), "")

    # Get an array specifying which positions are core. If the positions all align, there will be
    # a capital letter for the residue. Gaps are signified by "-" and non-structurally-aligned
    # residues by lower-case letters.
    GAP = '-'
    # NOTE: the case-sensitive check below can't normally be used, as Theseus ignores lower-case
    # letters in the alignment
    if case_sensitive:
        core = [all(x in pdb_edit.one2three for x in t) for t in zip(*align_seq.sequences)]
    else:
        core = [all(x != GAP for x in t) for t in zip(*align_seq.sequences)]

    if not any(core):
        raise RuntimeError("Cannot generate core for models: {0}".format(models))

    # For each sequence, get a list of which positions are core
    core_positions = []
    for seq in align_seq.sequences:
        p = []
        count = 0
        for i, pos in enumerate(seq):
            if pos != GAP:
                if core[i]:
                    p.append(count)
                count += 1
        core_positions.append(p)

    # Should check that the lengths of the sequences match the number of residues in the pdbs

    # Create a dict mapping seq_names to core positions
    core_dict = dict((s, core_positions[i]) for i, s in enumerate(seq_names))

    # Cut the models down to the core
    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep_idx=core_dict[name])
        core_models.append(pdbout)

    return core_models

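
# Illustrative sketch (not part of the pipeline) of how the core mask above is
# built. For a hypothetical three-sequence alignment, a column is core only if
# no sequence has a gap there (the default, case-insensitive mode):
#
#     seqs = ["AB-DE",
#             "ABCD-",
#             "ABCDE"]
#     core = [all(x != '-' for x in col) for col in zip(*seqs)]
#     # -> [True, True, False, True, False]
#
# model_core_from_fasta then maps each core column back to a per-sequence
# residue index, counting only non-gap positions, before calling
# pdb_edit.select_residues with tokeep_idx.
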
def model_core_from_theseus(models, alignment_file, var_by_res, work_dir=None):
    """Cut each model down to the structurally core residues.

    Only residues from the first protein are listed in the theseus output, and not even all of
    them. We assume the output is based on the original alignment, so that each residue in the
    first protein lines up with either a residue in one of the other proteins or a gap.

    We therefore go through the theseus data and, for each residue in the first protein that is
    core, find the corresponding residues in the other proteins. The resSeq numbers are used to
    match the residues across the alignment.
    """
    if not os.path.isdir(work_dir):
        os.mkdir(work_dir)

    seqalign = sequence_util.Sequence(fasta=alignment_file)
    # We now need to add the list of pdbs, chains and resSeqs of the other models to the Sequence object
    for m in models:
        seqalign.add_pdb_data(m)

    # Sanity check that the names of the pdb files match those from the fasta header
    # Format is expected to be: '>1ujb.pdb(A)'
    names = [h[1:].split('(')[0] for h in seqalign.headers]
    if seqalign.pdbs != names:
        raise RuntimeError("headers and names of pdb files do not match!\n{0}\n{1}".format(seqalign.pdbs, names))

    # Get the name of the first pdb that the alignment is based on
    first = seqalign.pdbs[0]

    # Dictionary mapping model pdb to the resSeqs that are core
    model2core = {}
    for p in seqalign.pdbs:
        model2core[p] = []  # initialise

    # Get the list of core resSeqs in the first sequence
    model2core[first] = [x.resSeq for x in var_by_res if x.core]

    # Now go through the first sequence and get the resSeqs of the corresponding core for the other models
    pointer = 0  # Tracks where we are in the first sequence
    for i, resSeq in enumerate(seqalign.resseqs[0]):
        if model2core[first][pointer] == resSeq:
            # Core residue in the first sequence, so append the corresponding resSeqs for the other proteins
            for j, pdb in enumerate(seqalign.pdbs[1:]):
                model2core[pdb].append(seqalign.resseqs[j + 1][i])
            pointer += 1
            if pointer >= len(model2core[first]):
                break

    core_models = []
    for m in models:
        name = os.path.basename(m)
        pdbout = ample_util.filename_append(m, astr='core', directory=work_dir)
        pdb_edit.select_residues(m, pdbout, tokeep=model2core[name])
        core_models.append(pdbout)

    return core_models


class Ensembler(object):
    """Class to generate ensembles from ab initio models (all models must have the same sequence)"""

    def __init__(self):
        # For all
        self.work_dir = None  # top directory where everything gets done
        self.theseus_exe = None
        self.gesamt_exe = None
        self.lsqkab_exe = os.path.join(os.environ['CCP4'], 'bin', 'lsqkab' + ample_util.EXE_EXT)
        assert ample_util.is_exe(self.lsqkab_exe), "Cannot find lsqkab: {0}".format(self.lsqkab_exe)

        # clustering
        self.cluster_method = "spicker"  # the method for initial clustering
        self.cluster_exe = None
        self.num_clusters = 1

        # truncation
        self.percent_truncation = 5
        self.truncation_levels = None
        self.truncation_method = "percent"
        self.truncation_nresidues = None
        self.truncation_pruning = None
        self.truncation_scorefile = None
        self.truncation_variances = None

        # subclustering
        # self.subcluster_method = 'FLOATING_RADII'
        self.subcluster_method = 'ORIGINAL'
        self.subcluster_program = "maxcluster"
        self.subcluster_exe = None
        self.subclustering_method = "radius"
        self.subcluster_radius_thresholds = [1, 2, 3]
        self.ensemble_max_models = 30

        # ensembles
        self.ensembles_directory = None
        self.ensembles = None
        self.ensembles_data = None
        self.score_matrix = None
        return

    def superpose_models(self, models, basename=None, work_dir=None, homologs=False):
        run_theseus = theseus.Theseus(work_dir=work_dir, theseus_exe=self.theseus_exe)
        try:
            run_theseus.superpose_models(models, basename=basename, homologs=homologs)
        except Exception as e:
            _logger.critical("Error running theseus: {0}".format(e))
            return False
        return run_theseus.superposed_models

    def cluster_models(self, cluster_dir=None, cluster_exe=None, cluster_method=None, models=None,
                       max_cluster_size=200, num_clusters=None, nproc=1):
        """Wrapper function to run clustering of models dependent on the method"""
        # Dispatch table: key is the cluster_method, value is the function implementing it
        # together with the arguments it requires
        switch = {
            'fast_protein_cluster': (cluster_util.fast_protein_cluster,
                                     [cluster_exe, max_cluster_size, models, num_clusters, nproc, self.work_dir]),
            'import': (cluster_util.import_cluster, [cluster_dir]),
            'random': (cluster_util.random_cluster,
                       [cluster_method, max_cluster_size, models, num_clusters]),
            'spicker': (cluster_util.spicker_default,
                        [cluster_exe, cluster_method, max_cluster_size, num_clusters, models,
                         self.work_dir, nproc]),
            'spicker_qscore': (cluster_util.spicker_qscore,
                               [cluster_exe, cluster_method, max_cluster_size, num_clusters, models,
                                self.work_dir, nproc, self.gesamt_exe]),
            'spicker_tmscore': (cluster_util.spicker_tmscore,
                                [cluster_exe, cluster_method, max_cluster_size, num_clusters, models,
                                 self.work_dir, nproc]),
        }

        # Get the function handle and its arguments; check before unpacking so an unknown
        # method raises a helpful error rather than a TypeError
        if cluster_method not in switch:
            raise RuntimeError('Unrecognised clustering method: {0}'.format(cluster_method))
        cluster_function, args = switch[cluster_method]

        # Cluster our protein structures
        _logger.info('Clustering models using method: {0}'.format(cluster_method))
        clusters, clusters_data = cluster_function(*args)
        return clusters, clusters_data

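    # Illustrative use of the dispatch table above (paths hypothetical):
    #
    #     ensembler = Ensembler()
    #     ensembler.work_dir = '/path/to/run_dir'
    #     clusters, clusters_data = ensembler.cluster_models(
    #         cluster_exe='/path/to/spicker',
    #         cluster_method='spicker',
    #         models=list_of_pdb_paths,
    #         num_clusters=1,
    #     )
    #
    # Any cluster_method not present in the table raises a RuntimeError before
    # the table entry is unpacked.
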
    def edit_side_chains(self, raw_ensemble, raw_ensemble_data, side_chain_treatments,
                         ensembles_directory, homologs=False, single_structure=False):
        assert os.path.isdir(ensembles_directory), "Cannot find ensembles directory: {0}".format(ensembles_directory)
        ensembles = []
        ensembles_data = []
        if side_chain_treatments is None:
            side_chain_treatments = [UNMODIFIED]
        for sct in side_chain_treatments:
            ensemble_data = copy.copy(raw_ensemble_data)
            ensemble_data['side_chain_treatment'] = sct
            if homologs:
                ensemble_data['name'] = 'e{0}_{1}'.format(ensemble_data['truncation_level'], sct)
            elif single_structure:
                ensemble_data['name'] = '{0}_t{1}_{2}'.format(ensemble_data['truncation_score_key'],
                                                              ensemble_data['truncation_level'], sct)
            else:
                ensemble_data['name'] = 'c{0}_t{1}_r{2}_{3}'.format(ensemble_data['cluster_num'],
                                                                    ensemble_data['truncation_level'],
                                                                    ensemble_data['subcluster_radius_threshold'],
                                                                    sct)
            # Create the filename based on the name and the side-chain treatment
            # fpath = ample_util.filename_append(raw_ensemble, astr=sct, directory=ensembles_directory)
            fpath = os.path.join(ensembles_directory, "{0}.pdb".format(ensemble_data['name']))

            # Create the files
            if sct == ALLATOM or sct == UNMODIFIED:
                # For all-atom, just copy the file
                shutil.copy2(raw_ensemble, fpath)
            elif sct == RELIABLE:
                pdb_edit.reliable_sidechains(raw_ensemble, fpath)
            elif sct == POLYALA:
                pdb_edit.backbone(raw_ensemble, fpath)
            else:
                raise RuntimeError("Unrecognised side_chain_treatment: {0}".format(sct))

            # Count the number of atoms in the ensemble - only required for benchmark mode
            natoms, nresidues = pdb_edit.num_atoms_and_residues(fpath, first=True)

            # Process ensemble data
            ensemble_data['ensemble_pdb'] = fpath
            ensemble_data['ensemble_num_atoms'] = natoms
            # check
            assert ensemble_data['num_residues'] == nresidues, \
                "Mismatched number of residues: {0} : {1}\n{2}".format(ensemble_data['num_residues'],
                                                                       nresidues, raw_ensemble_data)
            ensembles.append(fpath)
            ensembles_data.append(ensemble_data)

        return ensembles, ensembles_data

    def subcluster_models(self, truncated_models, truncated_models_data, subcluster_program=None,
                          subcluster_exe=None, radius_thresholds=None, ensemble_max_models=None,
                          work_dir=None):
        if self.subcluster_method == "ORIGINAL":
            return self.subcluster_models_fixed_radii(truncated_models, truncated_models_data,
                                                      subcluster_program, subcluster_exe,
                                                      ensemble_max_models,
                                                      radius_thresholds=radius_thresholds,
                                                      work_dir=work_dir)
        elif self.subcluster_method == "FLOATING_RADII":
            # The floating-radii subclusterer determines its own radii, so radius_thresholds
            # is not passed through
            return self.subcluster_models_floating_radii(truncated_models, truncated_models_data,
                                                         subcluster_program, subcluster_exe,
                                                         ensemble_max_models, work_dir=work_dir)
        else:
            raise RuntimeError("Unrecognised subcluster_method: {0}".format(self.subcluster_method))

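    # Naming sketch for edit_side_chains above (values illustrative): ensembles
    # are written to ensembles_directory as '<name>.pdb', where the name encodes
    # the provenance and side-chain treatment, e.g. for cluster 1, truncation
    # level 19, subcluster radius 2 and treatment sct:
    #
    #     cluster-based:    'c1_t19_r2_<sct>'
    #     homolog-based:    'e19_<sct>'
    #     single-structure: '<score_key>_t19_<sct>'
    #
    # where <sct> is the string value of one of the ALLATOM/RELIABLE/POLYALA/
    # UNMODIFIED constants.
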
    def subcluster_models_fixed_radii(self, truncated_models, truncated_models_data,
                                      subcluster_program=None, subcluster_exe=None,
                                      ensemble_max_models=None, radius_thresholds=None, work_dir=None):
        # Theseus requires at least three residues
        if truncated_models_data['num_residues'] <= 2:
            return [], []

        if not radius_thresholds:
            radius_thresholds = self.subcluster_radius_thresholds

        ensembles = []
        ensembles_data = []

        # Use the first model to get data on the level
        cluster_num = truncated_models_data['cluster_num']
        truncation_level = truncated_models_data['truncation_level']

        # Make sure everything happens in the truncation directory
        owd = os.getcwd()
        os.chdir(work_dir)

        # Run the subclusterer to generate the distance matrix
        if subcluster_program == 'maxcluster':
            clusterer = subcluster.MaxClusterer(self.subcluster_exe)
        elif subcluster_program == 'lsqkab':
            clusterer = subcluster.LsqkabClusterer(self.subcluster_exe)
        else:
            raise RuntimeError("Unrecognised subcluster_program: {0}".format(subcluster_program))
        clusterer.generate_distance_matrix(truncated_models)
        # clusterer.dump_matrix(os.path.join(truncation_dir, "subcluster_distance.matrix"))  # for debugging

        # Loop through the radius thresholds
        previous_clusters = []
        for radius in radius_thresholds:
            _logger.debug("subclustering models under radius: {0}".format(radius))

            # Get the list of pdbs clustered according to the radius threshold
            cluster_files = clusterer.cluster_by_radius(radius)
            if not cluster_files:
                _logger.debug("Skipping radius {0} as no files clustered in directory {1}".format(radius, work_dir))
                continue

            _logger.debug("Clustered {0} files".format(len(cluster_files)))
            cluster_files = subcluster_util.slice_subcluster(cluster_files, previous_clusters,
                                                             ensemble_max_models, radius, radius_thresholds)
            if not cluster_files:
                _logger.debug('Could not create different cluster for radius {0} in directory: {1}'.format(radius, work_dir))
                continue

            # Remember this cluster so we don't create duplicate clusters
            previous_clusters.append(cluster_files)

            # Got files, so create the directories
            subcluster_dir = os.path.join(work_dir, 'subcluster_{0}'.format(radius))
            os.mkdir(subcluster_dir)
            os.chdir(subcluster_dir)
            basename = 'c{0}_t{1}_r{2}'.format(cluster_num, truncation_level, radius)

            # Write out a list of the files for reference
            with open(os.path.join(subcluster_dir, "{0}.list".format(basename)), 'w') as f:
                for m in cluster_files:
                    f.write(m + "\n")
                f.write("\n")

            cluster_file = self.superpose_models(cluster_files, work_dir=subcluster_dir)
            if not cluster_file:
                msg = "Error running theseus on ensemble {0} in directory: {1}\nSkipping subcluster: {0}".format(basename, subcluster_dir)
                _logger.critical(msg)
                continue

            ensemble = os.path.join(subcluster_dir, basename + '.pdb')
            shutil.move(cluster_file, ensemble)

            # The data we've collected is the same for all pdbs in this level, so just keep using the first
            ensemble_data = copy.copy(truncated_models_data)
            ensemble_data['subcluster_num_models'] = len(cluster_files)
            ensemble_data['subcluster_radius_threshold'] = radius
            ensemble_data['ensemble_pdb'] = ensemble

            # Get the centroid model name from the list of files given to theseus - we can't parse
            # the pdb file as theseus truncates the filename
            ensemble_data['subcluster_centroid_model'] = os.path.abspath(cluster_files[0])
            ensembles.append(ensemble)
            ensembles_data.append(ensemble_data)

        # back to where we started
        os.chdir(owd)
        return ensembles, ensembles_data

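    # Sketch of the fixed-radii flow above (counts hypothetical): each threshold
    # yields the set of models clustered within that radius, which
    # subcluster_util.slice_subcluster then caps at ensemble_max_models and
    # rejects if it would duplicate an earlier subcluster, e.g.:
    #
    #     radius 1 -> 5 models  -> subcluster_1/c1_t19_r1.pdb
    #     radius 2 -> 14 models -> subcluster_2/c1_t19_r2.pdb
    #     radius 3 -> 30 models -> subcluster_3/c1_t19_r3.pdb
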
    def subcluster_models_floating_radii(self, truncated_models, truncated_models_data,
                                         subcluster_program=None, subcluster_exe=None,
                                         ensemble_max_models=None, work_dir=None):
        _logger.info("subclustering with floating radii")

        # Run the subclusterer to generate the distance matrix
        if subcluster_program == 'maxcluster':
            clusterer = subcluster.MaxClusterer(self.subcluster_exe)
        else:
            raise RuntimeError("Unrecognised subcluster_program: {0}".format(subcluster_program))
        clusterer.generate_distance_matrix(truncated_models)
        # clusterer.dump_matrix(os.path.join(truncation_dir, "subcluster_distance.matrix"))  # for debugging

        subclusters = []
        subclusters_data = []
        clusters = []
        radii = []
        len_truncated_models = len(truncated_models)
        for i in range(len(self.subcluster_radius_thresholds)):
            radius = None
            nmodels = None
            if i > 0 and radii[i - 1] > self.subcluster_radius_thresholds[i]:
                radius = radii[i - 1]
                nmodels = len(clusters[i - 1])
                cluster_files, radius = subcluster_util.subcluster_nmodels(nmodels, radius, clusterer,
                                                                           direction='up', increment=1)
            else:
                radius = self.subcluster_radius_thresholds[i]
                cluster_files = clusterer.cluster_by_radius(radius)

            if cluster_files:
                cluster_files = tuple(sorted(cluster_files))  # Sort so we can check if we've had this set before
                cluster_size = len(cluster_files)
            else:
                cluster_files = []
                cluster_size = 0

            if radius in radii or cluster_size == 0:
                # Increase the radius until we have one more model than the last one
                if cluster_size == 0:
                    nmodels = 2
                else:
                    radius = radii[i - 1]
                    nmodels = len(clusters[i - 1]) + 1
                cluster_files, radius = subcluster_util.subcluster_nmodels(nmodels, radius, clusterer,
                                                                           direction='up', increment=1)
                cluster_files = sorted(cluster_files)
            elif cluster_size >= ensemble_max_models or cluster_files in clusters:
                # Randomly pick ensemble_max_models
                cluster_files = subcluster_util.pick_nmodels(cluster_files, clusters, ensemble_max_models)
                if not cluster_files:
                    _logger.debug('Could not cluster files under radius: {0} - could not find different models'.format(radius))
                    break

            # Need to check in case we couldn't cluster under this radius
            if cluster_size == 0 or radius in radii:
                _logger.debug('Could not cluster files under radius: {0} - got {1} files'.format(radius, len(cluster_files)))
                break

            _logger.debug('Subclustering {0} files under radius {1}'.format(cluster_size, radius))
            try:
                cluster_ensemble, data = subcluster_util.subcluster_radius(list(cluster_files), radius,
                                                                           truncated_models_data)
            except RuntimeError:
                # If theseus fails, we stop subclustering at this truncation level
                _logger.debug('Could not cluster files under radius: {0} as theseus failed'.format(radius))
                break

            subclusters.append(cluster_ensemble)
            subclusters_data.append(data)
            clusters.append(tuple(cluster_files))  # append as a tuple so it is hashable
            radii.append(radius)
            if cluster_size == len_truncated_models:
                break

        return subclusters, subclusters_data

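    # Sketch of the floating-radii behaviour above (numbers hypothetical). Each
    # entry in self.subcluster_radius_thresholds is only a starting point: a
    # radius that repeats an earlier value (or clusters nothing) is grown via
    # subcluster_util.subcluster_nmodels until one more model than the previous
    # subcluster is captured, while a duplicate or oversized model set is
    # thinned by randomly picking ensemble_max_models of them. Thresholds
    # [1, 2, 3] might therefore end up as actual radii such as [1.0, 2.4, 3.0].
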
    def truncate_models(self, models, models_data=None, max_cluster_size=200,
                        truncation_method=None, percent_truncation=None, truncation_pruning=None,
                        residue_scores=None, homologs=False, alignment_file=None, work_dir=None):
        assert len(models) > 1 or residue_scores, "Cannot truncate as < 2 models!"
        assert truncation_method and percent_truncation, "Missing arguments: {0}".format(truncation_method)
        if models_data is None:
            models_data = {}

        # Create the directories we'll be working in
        assert work_dir and os.path.isdir(work_dir), "truncate_models needs a work_dir"
        os.chdir(work_dir)

        # Calculate the variances between the pdbs and align them (we currently only require the
        # aligned models for homologs)
        if truncation_method != "scores":
            run_theseus = theseus.Theseus(work_dir=work_dir, theseus_exe=self.theseus_exe)
            try:
                run_theseus.superpose_models(models, homologs=homologs, alignment_file=alignment_file)
            except RuntimeError as e:
                _logger.critical(e)
                return [], []

        if homologs:
            # If using homologs, now trim down to the core. We only do this here so that we are
            # using the aligned models from theseus, which makes it easier to see what the
            # truncation is doing.
            models = model_core_from_fasta(run_theseus.aligned_models,
                                           alignment_file=alignment_file,
                                           work_dir=os.path.join(work_dir, 'core_models'))
            # Unfortunately Theseus doesn't print all residues in its output format, so we can't
            # use the variances we calculated before and need to calculate the variances of the
            # core models
            try:
                run_theseus.superpose_models(models, homologs=homologs, basename='homologs_core')
            except RuntimeError as e:
                _logger.critical(e)
                return [], []

        # No THESEUS variances are required if scores for each residue are provided
        if truncation_method == "scores":
            var_by_res = self._convert_residue_scores(residue_scores)
        else:
            var_by_res = run_theseus.var_by_res()
        if not var_by_res:
            msg = "Error reading residue variances!"
            _logger.critical(msg)
            raise RuntimeError(msg)

        _logger.info('Using truncation method: {0}'.format(truncation_method))

        # Calculate which residues to keep under the different methods
        if truncation_method in ('percent', 'scores'):
            # 'scores' reuses the percentage machinery on the converted residue scores
            truncation_levels, truncation_variances, truncation_residues, truncation_residue_idxs = \
                truncation_util.calculate_residues_percent(var_by_res, percent_truncation)
        elif truncation_method == 'thresh':
            truncation_levels, truncation_variances, truncation_residues, truncation_residue_idxs = \
                truncation_util.calculate_residues_thresh(var_by_res, percent_truncation)
        elif truncation_method == 'focussed':
            truncation_levels, truncation_variances, truncation_residues, truncation_residue_idxs = \
                truncation_util.calculate_residues_focussed(var_by_res)
        else:
            raise RuntimeError("Unrecognised truncation method: {0}".format(truncation_method))

        self.truncation_levels = truncation_levels  # save so we can put in the results dict
        self.truncation_variances = truncation_variances  # save so we can put in the results dict
        self.truncation_nresidues = [len(r) for r in truncation_residues]  # save so we can put in the results dict

        # Use all models in the cluster to calculate the variance but then slice to max_cluster_size
        # models = self._slice_models(models, 0, max_cluster_size)

        truncated_models = []
        truncated_models_data = []
        truncated_models_dirs = []
        pruned_residues = None
        for tlevel, tvar, tresidues, tresidue_idxs in zip(truncation_levels, truncation_variances,
                                                          truncation_residues, truncation_residue_idxs):
            # Prune singleton/doubleton etc. residues if required
            _logger.debug("truncation_pruning: {0}".format(truncation_pruning))
            if truncation_pruning == 'single':
                tresidue_idxs, pruned_residues = truncation_util.prune_residues(tresidue_idxs,
                                                                                chunk_size=1,
                                                                                allowed_gap=2)
                if pruned_residues:
                    _logger.debug("prune_residues removing: {0}".format(pruned_residues))
            elif truncation_pruning is None:
                pass
            else:
                raise RuntimeError("Unrecognised truncation_pruning: {0}".format(truncation_pruning))
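            # Illustrative example (assumed prune_residues semantics): with
            # chunk_size=1 and allowed_gap=2, a lone residue index such as 7 in
            #
            #     [1, 2, 3, 4, 7, 10, 11, 12]
            #
            # is an isolated singleton and would be pruned, leaving
            # [1, 2, 3, 4, 10, 11, 12] with pruned_residues reporting it.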
            # Skip if there are no residues
            if not tresidue_idxs:
                _logger.debug("Skipping truncation level {0} with variance {1} as no residues".format(tlevel, tvar))
                continue

            trunc_dir = os.path.join(work_dir, 'tlevel_{0}'.format(tlevel))
            os.mkdir(trunc_dir)
            _logger.info('Truncating at: {0} in directory {1}'.format(tlevel, trunc_dir))

            # List of models for this truncation level
            level_models = []
            for infile in models:
                pdbout = ample_util.filename_append(infile, str(tlevel), directory=trunc_dir)
                # Loop through the PDB files and create new ones that only contain the residues
                # left after truncation
                pdb_edit.select_residues(pdbin=infile, pdbout=pdbout, tokeep_idx=tresidue_idxs)
                level_models.append(pdbout)

            # Add the models
            truncated_models.append(level_models)
            truncated_models_dirs.append(trunc_dir)

            # Add the data
            model_data = copy.copy(models_data)
            model_data['truncation_level'] = tlevel
            model_data['truncation_variance'] = tvar
            model_data['truncation_residues'] = tresidues
            model_data['num_residues'] = len(tresidues)
            model_data['truncation_dir'] = trunc_dir
            model_data['percent_truncation'] = percent_truncation
            model_data['truncation_method'] = truncation_method
            model_data['truncation_pruning'] = truncation_pruning
            model_data['pruned_residues'] = pruned_residues
            truncated_models_data.append(model_data)

        return truncated_models, truncated_models_data, truncated_models_dirs

    def _convert_residue_scores(self, residue_scores):
        """Create named tuples storing the residue scores in the same form as the Theseus variance data"""
        scores = [ScoreVariances(idx=int(res) - 1,  # Required to match Theseus
                                 resSeq=int(res),
                                 variance=float(sco))
                  for (res, sco) in residue_scores]
        return scores

    def _slice_models(self, data, start, end):
        """Allows us to slice a list"""
        return data[start:end]
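
# Illustrative input for truncation_method='scores' (values hypothetical):
# residue_scores is an iterable of (residue_number, score) pairs, e.g. as read
# from a truncation scorefile:
#
#     residue_scores = [('1', '0.12'), ('2', '0.30'), ('3', '1.75')]
#
# _convert_residue_scores maps each pair to a ScoreVariances tuple with a
# zero-based idx (to match Theseus), so the percent-truncation machinery can
# treat the scores exactly like Theseus variances.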