# Copyright (c) 2013, GlaxoSmithKline Research & Development Ltd. # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above # copyright notice, this list of conditions and the following # disclaimer in the documentation and/or other materials provided # with the distribution. # * Neither the name of GlaxoSmithKline Research & Development Ltd. # nor the names of its contributors may be used to endorse or promote # products derived from this software without specific prior written # permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Created by Jameed Hussain, July 2013 from __future__ import print_function import sys import re from rdkit import Chem def find_correct(f_array): core = "" side_chains = "" for f in f_array: attachments = f.count("*") if (attachments == 1): side_chains = "%s.%s" % (side_chains,f) else: core = f side_chains = side_chains.lstrip('.') #cansmi the side chains temp = Chem.MolFromSmiles(side_chains) side_chains = Chem.MolToSmiles( temp, isomericSmiles=True ) #and cansmi the core temp = Chem.MolFromSmiles(core) core = Chem.MolToSmiles( temp, isomericSmiles=True ) return core,side_chains def delete_bonds(smi,id,mol,bonds,out): #use the same parent mol object and create editable mol em = Chem.EditableMol(mol) #loop through the bonds to delete isotope = 0 isotope_track = {}; for i in bonds: isotope += 1 #remove the bond em.RemoveBond(i[0],i[1]) #now add attachement points newAtomA = em.AddAtom(Chem.Atom(0)) em.AddBond(i[0],newAtomA,Chem.BondType.SINGLE) newAtomB = em.AddAtom(Chem.Atom(0)) em.AddBond(i[1],newAtomB,Chem.BondType.SINGLE) #keep track of where to put isotopes isotope_track[newAtomA] = isotope isotope_track[newAtomB] = isotope #should be able to get away without sanitising mol #as the existing valencies/atoms not changed modifiedMol = em.GetMol() #canonical smiles can be different with and without the isotopes #hence to keep track of duplicates use fragmented_smi_noIsotopes fragmented_smi_noIsotopes = Chem.MolToSmiles(modifiedMol,isomericSmiles=True) valid = True fragments = fragmented_smi_noIsotopes.split(".") #check if its a valid triple cut if(isotope == 3): valid = False for f in fragments: matchObj = re.search( '\*.*\*.*\*', f) if matchObj: valid = True break if valid: if(isotope == 1): fragmented_smi_noIsotopes = re.sub('\[\*\]', '[*:1]', fragmented_smi_noIsotopes) fragments = fragmented_smi_noIsotopes.split(".") #print fragmented_smi_noIsotopes s1 = Chem.MolFromSmiles(fragments[0]) s2 = Chem.MolFromSmiles(fragments[1]) #need to cansmi again as smiles can be different output = '%s,%s,,%s.%s' % (smi,id,Chem.MolToSmiles(s1,isomericSmiles=True),Chem.MolToSmiles(s2,isomericSmiles=True) ) if( (output in out) == False): out.add(output) elif (isotope >= 2): #add the isotope labels for key in isotope_track: #to add isotope lables modifiedMol.GetAtomWithIdx(key).SetIsotope(isotope_track[key]) fragmented_smi = Chem.MolToSmiles(modifiedMol,isomericSmiles=True) #change the isotopes into labels - currently can't add SMARTS or labels to mol fragmented_smi = re.sub('\[1\*\]', '[*:1]', fragmented_smi) fragmented_smi = re.sub('\[2\*\]', '[*:2]', fragmented_smi) fragmented_smi = re.sub('\[3\*\]', '[*:3]', fragmented_smi) fragments = fragmented_smi.split(".") #identify core/side chains and cansmi them core,side_chains = find_correct(fragments) #now change the labels on sidechains and core #to get the new labels, cansmi the dot-disconnected side chains #the first fragment in the side chains has attachment label 1, 2nd: 2, 3rd: 3 #then change the labels accordingly in the core #this is required by the indexing script, as the side-chains are "keys" in the index #this ensures the side-chains always have the same numbering isotope_track = {} side_chain_fragments = side_chains.split(".") for s in xrange( len(side_chain_fragments) ): matchObj = re.search( '\[\*\:([123])\]', side_chain_fragments[s] ) if matchObj: #add to isotope_track with key: old_isotope, value: isotope_track[matchObj.group(1)] = str(s+1) #change the labels if required if(isotope_track['1'] != '1'): core = re.sub('\[\*\:1\]', '[*:XX' + isotope_track['1'] + 'XX]' , core) side_chains = re.sub('\[\*\:1\]', '[*:XX' + isotope_track['1'] + 'XX]' , side_chains) if(isotope_track['2'] != '2'): core = re.sub('\[\*\:2\]', '[*:XX' + isotope_track['2'] + 'XX]' , core) side_chains = re.sub('\[\*\:2\]', '[*:XX' + isotope_track['2'] + 'XX]' , side_chains) if(isotope == 3): if(isotope_track['3'] != '3'): core = re.sub('\[\*\:3\]', '[*:XX' + isotope_track['3'] + 'XX]' , core) side_chains = re.sub('\[\*\:3\]', '[*:XX' + isotope_track['3'] + 'XX]' , side_chains) #now remove the XX core = re.sub('XX', '' , core) side_chains = re.sub('XX', '' , side_chains) output = '%s,%s,%s,%s' % (smi,id,core,side_chains) if( (output in out) == False): out.add(output) def fragment_mol(smi,id): mol = Chem.MolFromSmiles(smi) #different cuts can give the same fragments #to use outlines to remove them outlines = set() if(mol == None): sys.stderr.write("Can't generate mol for: %s\n" % (smi) ) else: #SMARTS for "acyclic and not in a functional group" smarts = Chem.MolFromSmarts("[#6+0;!$(*=,#[!#6])]!@!=!#[*]") #finds the relevant bonds to break #find the atoms maches matching_atoms = mol.GetSubstructMatches(smarts) total = len(matching_atoms) #catch case where there are no bonds to fragment if(total == 0): output = '%s,%s,,' % (smi,id) if( (output in outlines) == False ): outlines.add(output) bonds_selected = [] #loop to generate every single, double and triple cut in the molecule for x in xrange( total ): #print matches[x] bonds_selected.append(matching_atoms[x]) delete_bonds(smi,id,mol,bonds_selected,outlines) bonds_selected = [] for y in xrange(x+1,total): #print matching_atoms[x],matching_atoms[y] bonds_selected.append(matching_atoms[x]) bonds_selected.append(matching_atoms[y]) delete_bonds(smi,id,mol,bonds_selected,outlines) bonds_selected = [] for z in xrange(y+1, total): #print matching_atoms[x],matching_atoms[y],matching_atoms[z] bonds_selected.append(matching_atoms[x]) bonds_selected.append(matching_atoms[y]) bonds_selected.append(matching_atoms[z]) delete_bonds(smi,id,mol,bonds_selected,outlines) bonds_selected = [] #right, we are done. return outlines if __name__=='__main__': if (len(sys.argv) >= 2): print("Program that fragments a user input set of smiles.") print("The program enumerates every single,double and triple acyclic single bond cuts in a molecule.\n") print("USAGE: ./rfrag.py