#! /usr/bin/env ccp4-python # # Copyright (C) 2005 Ronan Keegan # # This code is distributed under the terms and conditions of the # CCP4 Program Suite Licence Agreement as a CCP4 Application. # A copy of the CCP4 licence can be obtained by writing to the # CCP4 Secretary, Daresbury Laboratory, Warrington WA4 4AD, UK. # # # Setup Ensemble for Phaser MR # Ronan Keegan 12/12/04 # import logging import os, sys, string import shutil import mol_weight import subprocess, shlex if not os.environ.has_key('CCP4'): raise RuntimeError, 'CCP4 must be defined' import Model_struct import MRBUMP_gesamt import glob import collections try: from ample.util.foo import ample_util from ample.ensembler import ensembler_util except ImportError: ample_util = ensembler_util = None class Ensemble: """ A class to construct an ensemble of the top match PDBs for Phaser. """ def __init__(self): if os.name == "nt": self.superposeEXE=os.path.join(os.environ['CBIN'], 'superpose.exe') self.gesamtEXE=os.path.join(os.environ['CBIN'], 'gesamt.exe') self.pdbsetEXE=os.path.join(os.environ['CBIN'], 'pdbset.exe') else: self.superposeEXE=os.path.join(os.environ['CBIN'], 'superpose') self.gesamtEXE=os.path.join(os.environ['CBIN'], 'gesamt') self.pdbsetEXE=os.path.join(os.environ['CBIN'], 'pdbset') self.SPxyz_template='' self.SPxyzin='' self.SPxyzout='' self.SP_logfile='' self.SP_logfile_last='' self.DB_fail=False self.num_per_ensem=0 self.base_MW=0 self.meanMW=0 self.total_MW_ensem=0 try: self.debug=eval(os.environ['MRBUMP_DEBUG']) except: self.debug=False def setDEBUG(self, flag): self.debug=flag def setSPXYZ_TEMPLATE(self, xyzin): self.SPxyz_template=xyzin def setSPXYZIN(self, xyzin): self.SPxyzin=xyzin def setSPXYZOUT(self, xyzout): self.SPxyzout=xyzout def setSPLOGFILE(self, filename): self.SP_logfile=filename def setSPLOGFILELAST(self, filename): self.SP_logfile_last=filename def setNumPerEnsem(self, num_per_ensem): self.num_per_ensem=num_per_ensem def setup_ensemble(self, mstat, init): """ A function 
to setup structure data for running Phaser with ensembles. Calls superpose/gesamt to line up each of the template structures with each other. """ # Assingment of ensemble model to Gesamt ensemble model if init.keywords.GESENSEM: if init.keywords.AMPTRUNC: sys.stdout.write("Making Gesamt/AMPLE Ensembles\n") else: sys.stdout.write("Making Gesamt Ensemble\n") sys.stdout.write("\n") self.makeGesamtEnsemble(init, mstat) if init.keywords.GESTRUNC and len(mstat.sorted_MR_list) >= 2: self.makeGESAMTEnsembles(init, mstat) # If we want to make Gesamt AMPLE ensembles if init.keywords.AMPTRUNC and len(mstat.sorted_MR_list) >= 2: self.makeAMPLEEnsembles(init, mstat) # Create the model dictionary entries for these ensembles self.makeEnsembleDict(init, mstat) sys.stdout.write("Number of Ensembles: %d\n" % mstat.ampleNumModels) sys.stdout.write("Number of search models in each Ensemble: %d\n" % mstat.gesamtNumModels) sys.stdout.write("\n") else: # ensemble count ecount=0 sys.stdout.write("Number of Ensembles: %d\n" % init.keywords.ENSNUM) sys.stdout.write("Number of search models in each Ensemble: %d\n" % init.keywords.ENSMODNUM) sys.stdout.write("\n") while ecount < init.keywords.ENSNUM: # model count mcount=0 # Set up the model list entry m=Model_struct.Models_struct() m.setNumPerEnsem(init.keywords.ENSMODNUM) m.setModelChainSource('ensemble_model') m.setPDBName('ensm') m.setModelName('ensemble_model%d'%ecount) m.setModelType('ENSMBL') m.setModel_directory(os.path.join(init.search_dir, 'data', 'ensemble_model%d'%ecount)) # Create the directory structure os.mkdir(m.model_directory) os.mkdir(os.path.join(m.model_directory, "mr")) os.mkdir(os.path.join(m.model_directory, "mr", "pdbfiles")) os.mkdir(os.path.join(m.model_directory, "mr", "logs")) os.mkdir(os.path.join(m.model_directory, "mr", "tmp")) if "MOLREP" in init.keywords.MR_PROGRAM_LIST: os.mkdir(os.path.join(m.model_directory, 'mr', 'molrep')) os.mkdir(os.path.join(m.model_directory, 'mr', 'molrep', 'refine')) if "PHASER" in 
init.keywords.MR_PROGRAM_LIST: os.mkdir(os.path.join(m.model_directory, 'mr', 'phaser')) os.mkdir(os.path.join(m.model_directory, 'mr', 'phaser', 'refine')) self.setSPLOGFILE(os.path.join(m.model_directory, 'mr', 'logs', 'superpose.log')) self.setSPLOGFILELAST(os.path.join(m.model_directory, 'mr', 'logs', 'superpose_last.log')) full_log=open(self.SP_logfile,'w') if ecount 10.0: sys.stdout.write("WARNING: Deviation of base alignment model MW (%.2f %%) from mean too large\n" % percent_diff) sys.stdout.write("WARNING: Skipping model\n") sys.stdout.write("\n") continue else: if self.debug: sys.stdout.write("Current average molecular weight = %.1f\n" % self.meanMW) sys.stdout.write("Deviation of base alignment model from mean = %.2f %%\n" % percent_diff) self.total_MW_ensem += current_MW self.meanMW=self.total_MW_ensem/(mcount+1) #self.total_MW_ensem += current_MW #mean_MW_ensem = self.total_MW_ensem / (mcount+1) #percent_diff = (self.base_MW - mean_MW_ensem) * 100.0 / mean_MW_ensem #print "Molecular weight of alignment model = %.1f" % current_MW #if self.debug: # print "Current average molecular weight = %.1f" % mean_MW_ensem # print "Deviation of base alignment model from mean = %.2f %%" % percent_diff #if abs(percent_diff) > 10.0: # print "WARNING: Deviation of base alignment model MW (%.2f %%) from mean too large" % percent_diff # print "WARNING: Stopping here with %d models in the ensemble" % mcount # print "" # break self.setSPXYZIN(input_PDBFile) self.setSPXYZOUT(os.path.join(m.model_directory, 'mr', 'pdbfiles', mstat.chain_list[chain].chainName + '_superpose.pdb')) # Set the command line #command_line = self.superposeEXE + " " + self.SPxyzin + " " + self.SPxyz_template + " -p " + self.SPxyzout command_line = self.gesamtEXE + " " + self.SPxyzin + " " + self.SPxyz_template + " " + self.SPxyzout # Launch superpose if os.name == "nt": process_args = shlex.split(command_line, posix=False) p = subprocess.Popen(process_args, shell="True", stdin = subprocess.PIPE, 
                                 # NOTE(review): the source text above is corrupted - "if ecount 10.0:" is
                                 # not valid Python.  The lost span almost certainly contained the selection
                                 # of the alignment base/template (setSPXYZ_TEMPLATE), the loop over candidate
                                 # chains (which binds input_PDBFile, chain and rotated_chain_list, all used
                                 # below), and the molecular-weight screen that computes current_MW and
                                 # percent_diff.  Recover this block from the upstream MRBUMP Ensemble.py.
                                 # NOTE(review): on Windows, Popen is called with shell="True" - a non-empty
                                 # STRING, which is merely truthy; the boolean True was presumably intended.
                                 stdout = subprocess.PIPE, stderr = subprocess.STDOUT) else: process_args = shlex.split(command_line) p = subprocess.Popen(process_args, stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.STDOUT) (o, i) = (p.stdout, p.stdin) i.close() # Capture the output from superpose log=open(self.SP_logfile_last, "w") line=o.readline() while line: log.write(line) line=o.readline() o.close() log.close() last_log=open(self.SP_logfile_last, 'r') line = last_log.readline() while line: # copy to full log file, that is kept full_log.write(line) # extract info from current log file if 'RMSD' in line and "=" in line: rmsd_index = string.split(line).index('RMSD') rmsd = float(string.split(line)[rmsd_index+2].replace(",", "")) align_length = int(string.split(line)[-1]) print "which aligns to base model with RMSD = %.3f over %d residues" % (rmsd,align_length) line = last_log.readline() last_log.close() # Add this pdb code to the ensemble list (along with its seq ID and rms value) m.PDBfile.append(self.SPxyzout) m.seqID.append(mstat.chain_list[chain].seqID) m.rms.append(mstat.chain_list[chain].rms) # When we reach the number of ensemble structures to be used, break out. 
# The gesamt stdout is first written to a per-model "last" log, then copied
# into the cumulative full_log; the "RMSD ... = x, over N residues" line is
# parsed with the Python-2 string module (string.split) and reported via a
# Python-2 print statement - this module runs under ccp4-python (Python 2).
mcount=mcount+1 if mcount >= m.num_per_ensem: print "" break # If there are too many models in each ensemble, reduce if mcount >= len(rotated_chain_list)-1: print "Warning: number of models in each ensemble reduced to %d to ensure ensemble uniqueness\n"%mcount m.num_per_ensem = mcount break full_log.close() if mcount < m.num_per_ensem: m.num_per_ensem = mcount # Add the ensemble to the model list and create an entry in the results_dict mstat.model_list[m.name]=m # Make the results dictionary import make_dictionary mstat.results_dict[m.name]=make_dictionary.makeDict(init.search_dir) # Insert the ensemble at the top of the sorted list mstat.sorted_model_list.insert(0,m.name) ecount += 1 def makeGesamtEnsemble(self, init, mstat): """ Make the Gesamt ensemble """ ############################################################ # ensemblingSortedModels=[] # if init.keywords.DOPHMMER: # for chain in mstat.sorted_MR_list: # if mstat.PHresultsDict[chain].score >= init.keywords.PMRCUTOFF: # print chain, init.keywords.PMRCUTOFF, mstat.PHresultsDict[chain].localSEQID, mstat.PHresultsDict[chain].score # ensemblingSortedModels.append(chain) # elif init.keywords.DOHHPRED: # for chain in mstat.sorted_MR_list: # if mstat.HHresultsDict[chain].score >= init.keywords.HHPCUTOFF: # print chain, init.keywords.HHPCUTOFF, mstat.PHresultsDict[chain].localSEQID, mstat.PHresultsDict[chain].score # ensemblingSortedModels.append(chain) # else: # sys.stdout.write("Ensembling with no search\n\n") # print ensemblingSortedModels ############################################################ gesamtTuple=collections.namedtuple('GesamtModel', ['pdbID', 'chainID', 'modelName', 'pdbFile', 'number']) gesamtModelList=[] count=0 number=len(mstat.sorted_MR_list) if init.keywords.MDLSCULPTOR: for chain in mstat.sorted_MR_list: if os.path.isfile(mstat.chain_list[chain].sculptor_modelPDB): gT=gesamtTuple(mstat.chain_list[chain].PDBName, mstat.chain_list[chain].chainID, mstat.chain_list[chain].chainName + '_SCLPTR', 
# (continuation of makeGesamtEnsemble) Each branch below builds one GesamtModel
# namedtuple per prepared search model found on disk; the branches are
# identical apart from the keyword tested, the chainName suffix, and the
# *_modelPDB attribute read, tried in fixed priority order: SCULPTOR,
# CHAINSAW, MOLREP, UNMOD, PDBCLP, PLYALA, with unmodified models as fallback.
# mstat.EnsTemplate records which preparation style won.
mstat.chain_list[chain].sculptor_modelPDB, number) gesamtModelList.append(gT) count=count+1 number=count mstat.EnsTemplate="SCLPTR" elif init.keywords.MDLCHAINSAW: for chain in mstat.sorted_MR_list: if os.path.isfile(mstat.chain_list[chain].chainsaw_modelPDB): gT=gesamtTuple(mstat.chain_list[chain].PDBName, mstat.chain_list[chain].chainID, mstat.chain_list[chain].chainName + '_CHNSAW', mstat.chain_list[chain].chainsaw_modelPDB, number) gesamtModelList.append(gT) count=count+1 number=count mstat.EnsTemplate="CHNSAW" elif init.keywords.MDLMOLREP: for chain in mstat.sorted_MR_list: if os.path.isfile(mstat.chain_list[chain].molrep_modelPDB): gT=gesamtTuple(mstat.chain_list[chain].PDBName, mstat.chain_list[chain].chainID, mstat.chain_list[chain].chainName + '_MOLREP', mstat.chain_list[chain].molrep_modelPDB, number) gesamtModelList.append(gT) count=count+1 number=count mstat.EnsTemplate="MOLREP" elif init.keywords.MDLUNMOD: for chain in mstat.sorted_MR_list: if os.path.isfile(mstat.chain_list[chain].unmod_modelPDB): gT=gesamtTuple(mstat.chain_list[chain].PDBName, mstat.chain_list[chain].chainID, mstat.chain_list[chain].chainName + '_UNMOD', mstat.chain_list[chain].unmod_modelPDB, number) gesamtModelList.append(gT) count=count+1 number=count mstat.EnsTemplate="UNMOD" elif init.keywords.MDLDPDBCLP: for chain in mstat.sorted_MR_list: if os.path.isfile(mstat.chain_list[chain].pdbclp_modelPDB): gT=gesamtTuple(mstat.chain_list[chain].PDBName, mstat.chain_list[chain].chainID, mstat.chain_list[chain].chainName + '_PDBCLP', mstat.chain_list[chain].pdbclp_modelPDB, number) gesamtModelList.append(gT) count=count+1 number=count mstat.EnsTemplate="PDBCLP" elif init.keywords.MDLPLYALA: for chain in mstat.sorted_MR_list: if os.path.isfile(mstat.chain_list[chain].plyala_modelPDB): gT=gesamtTuple(mstat.chain_list[chain].PDBName, mstat.chain_list[chain].chainID, mstat.chain_list[chain].chainName + '_PLYALA', mstat.chain_list[chain].plyala_modelPDB, number) gesamtModelList.append(gT) 
# "number" appears to be len(sorted_MR_list) for the first accepted model and
# the running count thereafter (number=count after each append) - it is later
# used to map Gesamt's numbered "<basename>_<number>.pdb" outputs back to
# their original filenames; confirm against the un-mangled source.
count=count+1 number=count mstat.EnsTemplate="PLYALA" else: for chain in mstat.sorted_MR_list: if os.path.isfile(mstat.chain_list[chain].unmod_modelPDB): gT=gesamtTuple(mstat.chain_list[chain].PDBName, mstat.chain_list[chain].chainID, mstat.chain_list[chain].chainName + '_UNMOD', mstat.chain_list[chain].unmod_modelPDB, number) gesamtModelList.append(gT) count=count+1 number=count mstat.EnsTemplate="UNMOD" if gesamtModelList==[]: sys.stdout.write("Error: No pdb models found for Gesamt alignment\n") sys.stdout.write("\n") sys.exit() # take the first non-ensemble entry as alignment base and remove from list mstat.gesamtAlignModel = gesamtModelList[0].modelName alignModelPDB = gesamtModelList[0].pdbFile # Set the sequence ID for the ensembles - it should be the seqID of the alignment base model mstat.gesamtEnsembleSeqID=mstat.chain_list[mstat.sorted_MR_list[0]].seqID # Set the Gesamt log file, alignment output file and the script file mstat.gesamtAlnFile=os.path.join(init.sequences_dir, "gesamtMultModelAlign.seq") logfile=os.path.join(init.logs_dir, "gesamtMultModelAlign.log") gesamtScript=os.path.join(init.scripts_dir, "gesamtMultModelAlign.sh") mstat.gesamtNumModels=len(gesamtModelList) # Set the name of the multi-model output pdb from Gesamt (Base Ensemble) mstat.gesamtBaseEnsemble=os.path.join(mstat.models_dir, "ensembles", "gesamtBaseAlignment.pdb") mstat.gesamtCSVFile=os.path.join(init.logs_dir, "gesamtCSVfile.csv") mstat.gesamtCSVCOREfile=os.path.join(init.logs_dir, "gesamtCSVCOREfile.csv") # Loop over the models and align them to the alignModel using Gesamt, then move them to the models directory if len(gesamtModelList) >= 2: pdbList=[] pdbDict=dict([]) for gesamtModel in gesamtModelList[1:]: # Create the pdb list and dictionary for this alignment pdbList.append(gesamtModel.pdbFile) pdbDict[gesamtModel.pdbFile]=gesamtModel.chainID pdbList.append(alignModelPDB) pdbDict[alignModelPDB]=gesamtModelList[0].chainID # Run Gesamt to align to the master and copy to the 
# NOTE(review): the two runGesamt calls below pass
# debug=eval(os.environ['MRBUMP_DEBUG']) with no try/except guard (unlike
# __init__), so an unset MRBUMP_DEBUG raises KeyError here - confirm the
# variable is always exported before this method runs.
# After the merged (MERGE_OUTPUT=True) run, Gesamt's numbered per-model
# outputs are renamed back to their original basenames and copied over the
# source model files.
models directory gesamtRun=MRBUMP_gesamt.Gesamt() gesamtRun.runGesamt(pdbList=pdbList, pdbDict=pdbDict, outputDIR=mstat.models_dir, \ logfile=logfile, alnfile=mstat.gesamtAlnFile, MERGE_OUTPUT=False, \ script=gesamtScript, debug=eval(os.environ['MRBUMP_DEBUG'])) # Run it again but this time output all models to one PDB file gesamtScript=os.path.join(init.scripts_dir, "gesamtMultModelAlignOnePDB.sh") logfile=os.path.join(init.logs_dir, "gesamtMultModelAlignOnePDB.log") gesamtRun.runGesamt(pdbList=pdbList, pdbDict=pdbDict, outputDIR=mstat.models_dir, outputPDB=mstat.gesamtBaseEnsemble, \ logfile=logfile, alnfile=mstat.gesamtAlnFile, csvfile=mstat.gesamtCSVFile, MERGE_OUTPUT=True, \ script=gesamtScript, debug=eval(os.environ['MRBUMP_DEBUG'])) for gesamtModel in gesamtModelList: gesamtAlignFile =os.path.splitext(os.path.basename(gesamtModel.pdbFile))[0] + "_" + str(gesamtModel.number) + ".pdb" gesamtOrigFile =os.path.splitext(os.path.basename(gesamtModel.pdbFile))[0] + ".pdb" shutil.move(os.path.join(mstat.models_dir, gesamtAlignFile), os.path.join(mstat.models_dir, gesamtOrigFile)) shutil.copyfile(os.path.join(mstat.models_dir, gesamtOrigFile), gesamtModel.pdbFile) # If there was only one model found in the search then don't run gesamt and just copy the model to the models directory else: if os.path.isfile(alignModelPDB): shutil.copyfile(alignModelPDB, os.path.join(mstat.models_dir, os.path.basename(alignModelPDB))) else: sys.stdout.write("Warning: could not find model file:\n %s\n" % alignModelPDB) def makeGESAMTEnsembles(self, init, mstat): """ Make Gesamt variance-based truncated ensembles """ slevel=10 struncation="Cbeta" truncationList=[] if init.keywords.GTLEVEL != None: if isinstance(init.keywords.GTLEVEL, float) or isinstance(init.keywords.GTLEVEL, int): sys.stdout.write("GTLEVEL has been set to %s. 
Outputing single truncated Gesamt ensemble at this truncation level\n" % str(init.keywords.GTLEVEL)) truncationList=[str(init.keywords.GTLEVEL)] else: sys.stdout.write("Warning: GTLEVEL keyword has been given but is neither a float nor an integer. Defaulting to 10.0 Anstroms truncation level for Gesamt Ensemble\n") truncationList=["10.0"] else: #truncationList=["20.0", "10.0", "8.0", "6.0", "5.0", "4.0", "3.0", "2.0", "1.0", "0.8", "0.6", "0.5", "0.4", "0.3", "0.2", "0.1"] j=init.keywords.GTPERCENT while j <=100: truncationList.append(j) j=j+init.keywords.GTPERCENT gesamtRun=MRBUMP_gesamt.Gesamt() for i in truncationList: gesamtRun.makeGesTruncEnsemble(mstat.gesamtBaseEnsemble, os.path.join(mstat.models_dir, "ensembles", "gesamtEnsTrunc_%s_Side%s.pdb" % (i,struncation)), variancePercent=init.keywords.GTPERCENT, csvFile=mstat.gesamtCSVFile, csvCOREfile=mstat.gesamtCSVCOREfile, truncation_level=float(i), sidechain_level=slevel) def makeAMPLEEnsembles(self, init, mstat): """ Make AMPLE style truncated ensembles """ ensembles = self.ample_generate_ensembles(init, alignment_file=mstat.gesamtAlnFile + ".gesamt", logdir=os.path.join(init.search_dir,'logs')) def makeEnsembleDict(self, init, mstat): """ Make the dictionary entries for the Gesamt ensemble(s) """ ensFileList=os.listdir(os.path.join(mstat.models_dir, "ensembles")) ensembleTuple=collections.namedtuple('ensembleModel', ['modelName', 'pdbFile', 'truncationLevel', 'pdbName', 'fileName']) for pdbfile in ensFileList: if init.keywords.AMPTRUNC: if "_" in pdbfile and "gesamt" not in pdbfile.lower(): if string.split(pdbfile, "_")[0][0]=="e": truncationLevel=string.split(pdbfile, "_")[0].replace("e", "") else: truncationLevel="110" if int(truncationLevel) < 10: pdbName="e00" + truncationLevel elif int(truncationLevel) < 100: pdbName="e0" + truncationLevel else: pdbName="e" + truncationLevel if init.keywords.GESTRUNC: if "_Side" in pdbfile and "gesamt" in pdbfile.lower(): truncationLevel=string.split(pdbfile, "_")[1] 
# NOTE(review): in makeGESAMTEnsembles above, the GTLEVEL branch fills
# truncationList with STRINGS while the GTPERCENT loop appends NUMBERS; both
# survive only because entries pass through "%s" formatting and float(i)
# downstream - worth normalising to one type.  Also, the "while j <=100"
# accumulator never terminates if GTPERCENT <= 0, and makeAMPLEEnsembles
# discards the ensembles return value (the files are presumably picked up
# from models_dir/ensembles by makeEnsembleDict - confirm).
# In makeEnsembleDict, pdbName is zero-padded (e.g. e005/e050/e110) so
# ensembles sort by truncation level; "110" is the sentinel for AMPLE files
# with no parsable level.
pdbName=os.path.splitext(os.path.basename(pdbfile))[0] elif "gesamtBaseAlignment" in pdbfile: truncationLevel="All" pdbName=os.path.splitext(os.path.basename(pdbfile))[0] if not init.keywords.GESTRUNC and not init.keywords.AMPTRUNC: truncationLevel="All" pdbName=os.path.splitext(os.path.basename(pdbfile))[0] eT=ensembleTuple(os.path.splitext(pdbfile)[0], os.path.join(mstat.models_dir, "ensembles", pdbfile), truncationLevel, pdbName, pdbfile) mstat.ampleEnsembleList.append(eT) # Set up the model list entry m=Model_struct.Models_struct() m.setModelChainSource('ensemble_model') m.setPDBName(eT.pdbName) m.setModelName('ensemble_%s' % eT.pdbName) m.setModelType('ENSMBL') m.setModel_directory(os.path.join(init.search_dir, 'data', 'ensemble_%s'% eT.pdbName)) m.PDBfile=[] m.PDBfile.append(eT.pdbFile) m.seqID.append(mstat.gesamtEnsembleSeqID) m.rms.append(mstat.chain_list[mstat.sorted_MR_list[0]].rms) m.num_per_ensem=1#mstat.gesamtNumModels # Create the directory structure os.mkdir(m.model_directory) os.mkdir(os.path.join(m.model_directory, "mr")) os.mkdir(os.path.join(m.model_directory, "mr", "pdbfiles")) os.mkdir(os.path.join(m.model_directory, "mr", "logs")) os.mkdir(os.path.join(m.model_directory, "mr", "tmp")) if "MOLREP" in init.keywords.MR_PROGRAM_LIST: os.mkdir(os.path.join(m.model_directory, 'mr', 'molrep')) os.mkdir(os.path.join(m.model_directory, 'mr', 'molrep', 'refine')) if "PHASER" in init.keywords.MR_PROGRAM_LIST: os.mkdir(os.path.join(m.model_directory, 'mr', 'phaser')) os.mkdir(os.path.join(m.model_directory, 'mr', 'phaser', 'refine')) shutil.copyfile(eT.pdbFile, os.path.join(m.model_directory, eT.fileName)) # Add the ensemble to the model list and create an entry in the results_dict mstat.model_list[m.name]=m # Make the results dictionary import make_dictionary mstat.results_dict[m.name]=make_dictionary.makeDict(init.search_dir) # Insert the ensemble at the top of the sorted list mstat.sorted_model_list.insert(0,m.name) # Set the number of AMPLE/Gesamt 
# NOTE(review): ample_generate_ensembles (below) attaches a new DEBUG-level
# FileHandler to the ROOT logger on every call and never removes it -
# repeated calls duplicate log output (handler leak).  It also os.chdir()s
# into the work directory without restoring the previous cwd.
ensembles mstat.ampleNumModels=len(mstat.ampleEnsembleList) def ample_generate_ensembles(self, init, alignment_file=None, logdir=None): """ Make the truncated ensembles using AMPLE method """ # TODO: requires treatment for ab initio truncation method. Only homologs work right now if ensembler_util is None: raise ImportError("Cannot import ample ensembler_util") logger = logging.getLogger() logger.setLevel(logging.DEBUG) # create file handler for debug output assert logdir logfile = os.path.join(logdir,'ample.log') fl = logging.FileHandler(logfile) fl.setLevel(logging.DEBUG) formatter = logging.Formatter('%(asctime)s - %(name)s [%(lineno)d] - %(levelname)s - %(message)s') fl.setFormatter(formatter) logger.addHandler(fl) # get the module for homologs ensembling ensemble_module = ensembler_util.find_ensembler_module({"homologs" : True, "single_model_mode" : False}) ensembler = ensemble_module.Ensembler() ensembler.theseus_exe = os.path.join(os.environ['CCP4'], 'bin', 'theseus' + ample_util.EXE_EXT) ensembler.maxcluster_exe = ample_util.find_maxcluster({'maxcluster_exe' : None, 'rcdir' : os.path.join( os.path.expanduser("~"), ".ample") }) ensembler.subcluster_exe = ensembler.maxcluster_exe ensembler.max_ensemble_models = 30 cluster_exe = os.path.join(os.environ['CCP4'], 'bin', 'spicker' + ample_util.EXE_EXT) models_dir = os.path.join(init.search_dir, 'models') ensembles_directory = os.path.join(models_dir, 'ensembles') if not os.path.isdir(ensembles_directory): os.mkdir(ensembles_directory) work_dir = os.path.join(init.search_dir, 'models', 'ensemble_workdir') os.mkdir(work_dir) os.chdir(work_dir) models = glob.glob(os.path.join(models_dir, "*.pdb")) # Works specifically for homologs here. 
# The call below runs AMPLE's homolog ensembler: 5% stepwise 'percent'
# truncation, gesamt as the homolog aligner, single process; the words
# "Kwargs would need to be changed..." on the next line are the tail of the
# comment split above by the text extraction.
Kwargs would need to be changed to figure out otherwise ensembles = ensembler.generate_ensembles(models, percent_truncation=5, truncation_method='percent', ensembles_directory=ensembles_directory, alignment_file=alignment_file, work_dir=work_dir, nproc=1, homolog_aligner='gesamt', mustang_exe=None, gesamt_exe=os.path.join(os.environ['CCP4'], 'bin', 'gesamt' + ample_util.EXE_EXT), side_chain_treatments=None ) return ensembles