""" Fragon place.py Copyright (C) 2017-2018 University of York Author: Huw Jenkins This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 3, modified in accordance with the provisions of the license to address the requirements of UK law. You should have received a copy of the modified GNU Lesser General Public License along with this program. If not, copies may be downloaded from http://www.ccp4.ac.uk/ccp4license.php This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. """ import os import phaser from iotbx import pdb from fragon.utils import write_output class CallbackObject(object): def __init__(self, xml_file=None, xmlroot=None): self.xml_file = xml_file self.xmlroot = xmlroot def startProgressBar (self, label, size): if self.xml_file is None and self.xmlroot is None: pass # for command-line mode else: from fragon.utils import write_output if label != 'Generating Statistics': callback = ['progress', label] write_output({'callback':callback}, json_file=None, xml_file=self.xml_file, xmlroot=self.xmlroot, output=None) def incrementProgressBar (self): pass def endProgressBar (self): pass def warn (self, message): pass def loggraph (self, title, data): pass def call_back (self, message, data): if self.xml_file is None and self.xmlroot is None: pass # for command-line mode else: from fragon.utils import write_output if message == 'summary': if data.startswith('** New Best LLG'): llg = data.split('=')[1].split()[0] callback = ['Best LLG', llg] write_output({'callback':callback}, json_file=None, xml_file=self.xml_file, xmlroot=self.xmlroot, output=None) elif message == 'current best solution': for item in data.split(): if item[0:5] == 'TFZ==': tfz = item.split('==')[1] callback = ['Best TFZ', tfz] write_output({'callback':callback}, json_file=None, xml_file=self.xml_file, xmlroot=self.xmlroot, output=None) def prepare_data(mtzin, i, sigi, fp, sigfp, logfile, log): input = phaser.InputMR_DAT() input.setHKLI(mtzin) if i is not None and sigi is not None: input.setLABI_I_SIGI(i, sigi) else: input.setLABI_F_SIGF(fp, sigfp) output = phaser.Output() data_log = open(logfile, 'w') output_object = CallbackObject(None, None) output.setPhenixPackageCallback(output_object) output.set_file_object(data_log) data = phaser.runMR_DAT(input, output) data_log.close() if data.Success(): return data else: log.critical('Job exit status FAILURE') log.critical('%s ERROR : %s' % (data.ErrorName(), data.ErrorMessage())) def calculate_solvent(root, data, seqin, ncs_copies, highres, logfile, log): input = phaser.InputCCA() input.setSPAC_HALL(data.getSpaceGroupHall()) input.setCELL6(data.getUnitCell()) input.addCOMP_PROT_SEQ_NUM(seqin, ncs_copies) input.setRESO(highres, 10000) output = phaser.Output() cca_log = open(logfile, 'w') output_object = CallbackObject(None, None) output.setPhenixPackageCallback(output_object) output.set_file_object(cca_log) cca = phaser.runCCA(input, output) cca_log.close() # check if solvent content is higher than average if cca.getBestZ() > 1: log.warning ('*** Warning solvent content estimated by Phaser is %0.2f' % (1.0 - 1.232/cca.getBestVM())) if ncs_copies > 1: log.warning(' solvent content calculated from %d copies of sequence is %0.2f\n' % (ncs_copies, (1.0 - 1.232/cca.getVM()[0]))) else: log.warning(' solvent content calculated from 1 copy of sequence is %0.2f\n' % (1.0 - 1.232/cca.getVM()[0])) elif ncs_copies > 1: log.info('Solvent content calculated from %d copies of sequence is %0.2f\n' % (ncs_copies, (1.0 - 1.232/cca.getVM()[0]))) else: log.info('Solvent content calculated from 1 copy of sequence is %0.2f\n' % (1.0 - 1.232/cca.getVM()[0])) return cca.getBestZ(), cca.getBestVM(), cca.getZ()[0], cca.getVM()[0] def run_phaser(root, xml_file, xmlroot, log, logfile, data, pdbin, copies, rms, pdbin_fixed, seqin, ncs_copies, phaser_solvent, sgalternative, sg_list, input_hand, tncs, search_lowres, search_highres, rot_peaks, rot_cluster_off, rot_samp, tra_samp, search_down, tfz_solved, solutions, purge, rescore_strands, rescore_residues, rescore_models, nproc): input = phaser.InputMR_AUTO() input.setRESO(search_highres, search_lowres) # test OpenMP if nproc > 1: input.setJOBS(nproc) else: input.setJOBS(1) if rot_samp > 0: input.setSAMP_ROTA(rot_samp) if tra_samp > 0: input.setSAMP_TRAN(tra_samp) input.setSPAC_HALL(data.getSpaceGroupHall()) input.setCELL6(data.getUnitCell()) input.setREFL_DATA(data.getDATA()) input.addENSE_PDB_RMS(root, pdbin, rms) input.setROOT(root) # fixed solution if pdbin_fixed is not None: fixed_root = pdbin_fixed[:-4] input.addENSE_PDB_RMS(fixed_root, pdbin_fixed, rms) input.addSOLU_6DIM_ENSE(fixed_root, [0.0, 0.0, 0.0], False, [0.0, 0.0, 0.0], 0.0, False, False, False, 1.0, 1.0) if phaser_solvent is not None: input.setCOMP_SOLV() input.setCOMP_PERC(phaser_solvent) else: input.addCOMP_PROT_SEQ_NUM(seqin, ncs_copies) input.addSEAR_ENSE_NUM(root, copies) if rot_peaks is not None: input.setPEAK_ROTA_SELE('NUMBER') input.setPEAK_ROTA_CUTO(rot_peaks) if rot_cluster_off: input.setPEAK_ROTA_CLUS(False) if sgalternative: input.setSGAL_SELE('ALL') elif input_hand: input.setSGAL_SELE('NONE') elif sg_list is not None: input.setSGAL_SELE('LIST') for sg in sg_list.split(','): input.addSGAL_TEST(sg) else: input.setSGAL_SELE('HAND') input.setTOPF(solutions) if rescore_strands or rescore_residues or rescore_models: # we don't need to write out the files here input.setXYZO(False) input.setHKLO(False) input.setXYZO_ENSE(False) # turn off tNCS correction if required if tncs: # to turn restore default MR_AUTO behaviour needs this input.setTNCS_PATT_PERC(99.9) # in desparate cases search further down the RF and TF list if search_down is not None: input.setSEAR_DOWN_PERC(search_down) input.setPURG_TRAN_PERC(75.00 - search_down) # turn purging on if requested for multicopy searches if purge > 0: input.setPURG_RNP_ENAB(True) input.setPURG_RNP_NUMB(purge) # turn it off otherwise (so more solutions are output) else: input.setPURG_RNP_ENAB(False) # these are not default but fragments should never clash. input.setPACK_KEEP_HIGH_TFZ(False) input.setSEAR_PRUN(False) input.setTRAN_PACK_CUTO(0.0) input.setPACK_CUTO(0.0) # end non defaults input.setZSCO_USE(True) input.setZSCO_SOLV(tfz_solved) if xml_file is not None and xmlroot is not None: input.setOUTP_LEVE('LOGFILE') #i2 else: input.setOUTP_LEVE('SUMMARY') #command line output = phaser.Output() phaser_log = open(logfile, 'w') output_object = CallbackObject(xml_file, xmlroot) output.setPhenixPackageCallback(output_object) output.set_file_object(phaser_log) mr = phaser.runMR_AUTO(input, output) phaser_log.close() if mr.Success(): if mr.foundSolutions() : log.info('\nPhaser has found %d solution(s)' % mr.numSolutions()) if not rescore_strands or rescore_residues: log.info('Top solution PDB file = %s' % mr.getTopPdbFile()) log.info('Top solution LLG = %f' % mr.getTopLLG()) return mr.getDotSol() else: log.info('\nPhaser failed to find any solutions') return 'No solutions' else: log.info('\nPhaser failed to find any solutions') return 'No solutions' def split_chains(pdbin, copies): chain_ids =['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] ensemble = pdb.hierarchy.input(pdbin) num_models = len(ensemble.hierarchy.models()) name = pdbin[:-4] selection = pdb.hierarchy.ext.root() models = [] for copy in range(copies): selection = pdb.hierarchy.ext.root() for model_no in range(num_models): selected_model = ensemble.hierarchy.models()[model_no].detached_copy() selection.append_model(selected_model) chain_length = selection.models()[model_no].chains()[0].residue_groups_size() chain1 = selection.models()[model_no].chains()[0] chain1.id = chain_ids[copy*2] for residue in selection.models()[model_no].chains()[0].residue_groups(): if int(residue.resseq) > chain_length/2: selection.models()[model_no].chains()[0].remove_residue_group(residue) chain2 = ensemble.hierarchy.models()[model_no].chains()[0].detached_copy() chain2.id = chain_ids[copy*2 + 1] for residue in chain2.residue_groups(): if int(residue.resseq) <= chain_length/2: chain2.remove_residue_group(residue) selection.models()[model_no].append_chain(chain2) model = name + '.chains%s%s' % (chain_ids[copy*2], chain_ids[copy*2 + 1]) models.append(model) with open(model+'.pdb', 'w') as pdbfile: print >> pdbfile, selection.as_pdb_string(),'END' return models def split_strands(pdbin, copies): chain_ids =['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] ensemble = pdb.hierarchy.input(pdbin) num_models = len(ensemble.hierarchy.models()) chain_length = ensemble.hierarchy.models()[0].chains()[0].residue_groups_size() name = pdbin[:-4] selection = pdb.hierarchy.ext.root() models = [] for copy in range(copies): selection = pdb.hierarchy.ext.root() for model_no in range(num_models): for segment in range(4): atoms_to_keep = [] if segment in [0,2]: for resn in range(1, chain_length + 1): if resn <= int(chain_length/4) + int(segment*(chain_length/4.)) and resn > int(segment*(chain_length/4.)): atoms_to_keep.extend(['%4d N ' % resn, '%4d CB ' % resn, '%4d CA ' % resn, '%4d C ' % resn, '%4d O ' % resn]) elif resn == int(chain_length/4) + int(segment*(chain_length/4.)) + 1: atoms_to_keep.append('%4d N ' % resn) if segment == 0: # first one selected_model = ensemble.hierarchy.models()[model_no].detached_copy() res_groups = selected_model.chains()[0].residue_groups() else: next_chain = ensemble.hierarchy.models()[model_no].chains()[0].detached_copy() next_chain.id = chain_ids[copy*4 + segment] res_groups = next_chain.residue_groups() for residue_group in res_groups: for atom in residue_group.atom_groups()[0].atoms(): if atom.parent().parent().resseq + atom.name not in atoms_to_keep: residue_group.atom_groups()[0].remove_atom(atom) if segment == 0: # first one selection.append_model(selected_model) else: selection.models()[model_no].append_chain(next_chain) elif segment in [1,3]: for resn in range(1, chain_length + 1): if resn >= (int(chain_length/4) + int(segment*(chain_length/4.))) and resn <= int(chain_length/4. + segment*(chain_length/4.)) : atoms_to_keep.extend(['%4d N ' % resn, '%4d CB ' % resn, '%4d CA ' % resn, '%4d C ' % resn, '%4d O ' % resn]) elif resn == int(segment*(chain_length/4.)) + 1: atoms_to_keep.extend(['%4d C ' % resn, '%4d O ' % resn]) next_chain = ensemble.hierarchy.models()[model_no].chains()[0].detached_copy() next_chain.id = chain_ids[copy*4 + segment] for residue_group in next_chain.residue_groups(): for atom in residue_group.atom_groups()[0].atoms(): if atom.parent().parent().resseq + atom.name not in atoms_to_keep: residue_group.atom_groups()[0].remove_atom(atom) selection.models()[model_no].append_chain(next_chain) model = name + '.chains%s%s%s%s' % (chain_ids[copy*4], chain_ids[copy*4 + 1],chain_ids[copy*4 + 2],chain_ids[copy*4 + 3]) models.append(model) with open(model+'.pdb', 'w') as pdbfile: print >> pdbfile, selection.as_pdb_string(),'END' return models def split_models(pdbin): ensemble = pdb.hierarchy.input(pdbin) num_models = len(ensemble.hierarchy.models()) name = pdbin[:-4] models = [] for model_no in range(num_models): selection = pdb.hierarchy.ext.root() selected_model = ensemble.hierarchy.models()[model_no].detached_copy() selection.append_model(selected_model) model = name + '.model%d' % (model_no + 1) models.append(model) with open(model + '.pdb','w') as pdbfile: print >> pdbfile, selection.as_pdb_string(),'END' return models def run_rnp(root, xml_file, xmlroot, logfile, log, data, tncs, pdbin, models, solutions, num_solutions, rescore_all, rescore_models, nproc): input = phaser.InputMR_RNP() # test OpenMP if nproc > 1: input.setJOBS(nproc) else: input.setJOBS(1) input.setSPAC_HALL(data.getSpaceGroupHall()) input.setCELL6(data.getUnitCell()) input.setREFL_DATA(data.getDATA()) for model in models: input.addENSE_PDB_RMS(model, model + '.pdb', solutions[0].VRMS.values()[0][0]) if not rescore_all: solutions = solutions[:num_solutions] for solution in solutions: solution_id = root + '.' + str(solution.NUM) if rescore_models: for model in models: input.addSOLU_SET(solution_id + '_' + model.split('.')[1]) for mr in solution.KNOWN: rot = mr.getEuler() tra = mr.TRA input.addSOLU_6DIM_ENSE(model,[rot[0],rot[1],rot[2]],True,[tra[0],tra[1],tra[2]],mr.BFAC,False,False,False,1.0,1.0) else: input.addSOLU_SET(solution_id) copy = 0 for mr in solution.KNOWN: rot = mr.getEuler() tra = mr.TRA input.addSOLU_6DIM_ENSE(models[copy],[rot[0],rot[1],rot[2]],True,[tra[0],tra[1],tra[2]],mr.BFAC,False,False,False,1.0,1.0) copy += 1 if tncs: # to restore default MR_AUTO behaviour needs this input.setTNCS_PATT_PERC(99.9) input.setPURG_RNP_ENAB(False) input.setOUTP_LEVE('SUMMARY') if rescore_models: input.addMACM(False,False,False,True,False,False,False,50,'BFGS') else: input.addMACM(True,True,True,True,False,False,False,50,'BFGS') input.setMACM_CHAI(True) input.setROOT(root) input.setXYZO_ENSE(False) input.setTOPF(num_solutions) if xml_file is not None and xmlroot is not None: input.setOUTP_LEVE('LOGFILE') #i2 else: input.setOUTP_LEVE('SUMMARY') #command line output = phaser.Output() rescore_log = open(logfile, 'w') output_object = CallbackObject(xml_file, xmlroot) output.setPhenixPackageCallback(output_object) output.set_file_object(rescore_log) rnp = phaser.runMR_RNP(input, output) rescore_log.close() if rnp.Success(): log.info('After rescoring models in ensemble there are %d solution(s)' % rnp.numSolutions()) log.info('Top solution LLG = %f\n' % rnp.getTopLLG()) return rnp.getDotSol() else: return 'No solutions' def place_fragment(root, xml_file, xmlroot, log, data, pdbin, copies, rms, pdbin_fixed, seqin, ncs_copies, phaser_solvent, sgalternative, sg_list, input_hand, tncs, search_lowres, search_highres, rot_peaks, rot_cluster_off, rot_samp, tra_samp, search_down, tfz_solved, solutions, purge, rescore_strands, rescore_residues, rescore_all, rescore_models, nproc): phaser_logfile = root + '_Phaser.log' if nproc > 1: log.info(' Running Phaser with %d threads, logfile is: %s' % (nproc, os.path.abspath(phaser_logfile))) else: log.info(' Running Phaser, logfile is: %s' % (os.path.abspath(phaser_logfile))) if tncs: log.info('\n tNCS correction not applied') phaser_solutions = run_phaser(root=root, xml_file=xml_file, xmlroot=xmlroot, log=log, logfile=phaser_logfile, data=data, pdbin=pdbin, copies=copies, rms=rms, pdbin_fixed=pdbin_fixed, seqin=seqin, ncs_copies=ncs_copies, phaser_solvent=phaser_solvent, sgalternative=sgalternative, sg_list=sg_list, input_hand=input_hand, tncs=tncs, search_lowres=search_lowres, search_highres=search_highres, rot_peaks=rot_peaks, rot_cluster_off=rot_cluster_off, rot_samp=rot_samp, tra_samp=tra_samp, search_down=search_down, tfz_solved=tfz_solved, solutions=solutions, purge=purge, rescore_strands=rescore_strands, rescore_residues=rescore_residues, rescore_models=rescore_models, nproc=nproc) if rescore_strands or rescore_residues or rescore_models: phaser_logfile = root + '_rescore.log' if nproc > 1: log.info(' Rescoring models in ensemble with %d threads, logfile is: %s' % (nproc,os.path.abspath(phaser_logfile))) else: log.info(' Rescoring models in ensemble, logfile is: %s' % (os.path.abspath(phaser_logfile))) if tncs: log.info('\n tNCS correction not applied') if rescore_strands: models = split_chains(pdbin, copies) elif rescore_residues: models = split_strands(pdbin, copies) elif rescore_models: models = split_models(pdbin) rescored_solutions = run_rnp(root=root, xml_file=xml_file, xmlroot=xmlroot, logfile=phaser_logfile, log=log, data=data, tncs=tncs, pdbin=pdbin, models=models, solutions=phaser_solutions, num_solutions=solutions, rescore_all=rescore_all, rescore_models=rescore_models, nproc=nproc) return rescored_solutions else: return phaser_solutions