# -----------------------------------------------------------------------------
# Fetch additional information about PDB BLAST hits using the PDB REST web
# services.  Offer descriptions of all chains, names of molecular components
# and ligands, publication date, literature reference, number of residues...
#
# Identifiers passed to the fetch_info() methods are "PDB chain ids": a PDB
# entry id and a chain id joined by an underscore, e.g. "4hhb_A".
#

# -----------------------------------------------------------------------------
#
def split_pdb_chain_id(pdb_chain_id):
    """Split a PDB chain id such as "4hhb_A" into (pdb_id, chain_id).

    Only the first underscore separates the two parts.  An id containing no
    underscore yields an empty chain id instead of raising, so every
    fetch_info() method treats odd identifiers the same way.
    """
    pdb_id, _sep, chain_id = pdb_chain_id.partition('_')
    return pdb_id, chain_id


# -----------------------------------------------------------------------------
#
class PDB_Entry_Info:
    """Per-entry properties (title, authors, resolution, ...) of PDB hits."""

    # Each column is (column title, property name, options dict, description).
    # The property name is the PDB_Entry attribute that entry_properties()
    # copies into its result.  NOTE(review): the options dict and description
    # are presumably consumed by the table that displays these columns.
    columns = (
        ("Authors", 'citation_authors', {'anchor':"nw"},
         'PDB authors of deposited structure'),
        ("Date", 'deposition_date', {'anchor':"n"},
         'PDB structure publication date'),
        ("Method", 'expMethod', {'anchor':"n"},
         'PDB experiment type used\nfor structure determination'),
        ("PubMed", 'pubmedId', {'anchor':"n"},
         'PDB PubMed literature reference for structure'),
        ("Resolution", 'resolution', {'anchor':"n"},
         'PDB resolution in Angstroms'),
        ("Title", 'title', {'anchor':"nw"},
         'PDB entry title'),
        ("Total atoms", 'nr_atoms', {'anchor':"n"},
         'PDB total number of atoms\nin structure (all chains)'),
        ("Total residues", 'nr_residues', {'anchor':"n"},
         'PDB total number of residues\nin structure (all chains)'),
        )

    def fetch_info(self, pdb_chain_ids):
        """Return a dict mapping each PDB chain id to its entry properties.

        Returns an empty dict when no ids are given or when the fetch fails.
        A chain whose entry is absent from the reply maps to an empty dict.
        """
        if len(pdb_chain_ids) == 0:
            return {}

        pdb_ids = [split_pdb_chain_id(pcid)[0].lower()
                   for pcid in pdb_chain_ids]

        # Fetch PDB entry info
        xml = fetch_from_pdb('describePDB', set(pdb_ids))
        if xml is None:
            # fetch_from_pdb() has already reported the failure.
            return {}

        entries = [self.pdb_info(e) for e in xml.getElementsByTagName('PDB')]
        # Entries lacking a structureId attribute cannot be matched to any
        # requested id, so they are skipped rather than crashing on None.
        by_id = dict((pe.structureId.lower(), self.entry_properties(pe))
                     for pe in entries if pe.structureId)
        pimap = dict((pcid, by_id.get(pdb_id, {}))
                     for pdb_id, pcid in zip(pdb_ids, pdb_chain_ids))
        return pimap

    def pdb_info(self, xml_element):
        """Build a PDB_Entry from the XML attributes of a <PDB> element."""
        pe = PDB_Entry()
        attrs = xml_element.attributes
        for i in range(attrs.length):
            a = attrs.item(i)
            setattr(pe, a.name, a.value)
        strings_to_numeric_values(pe)
        return pe

    def entry_properties(self, pe):
        """Return a dict of the column properties present on entry *pe*."""
        cnames = tuple(c[1] for c in self.columns)
        amap = dict((n, getattr(pe, n)) for n in cnames if hasattr(pe, n))
        return amap


# -----------------------------------------------------------------------------
#
class PDB_Entry:
    """Attributes of one PDB entry; the trailing comments show sample values."""

    def __init__(self):
        self.structureId = None         # "4HHB"
        self.title = None               # "THE CRYSTAL STRUCTURE OF HUMAN DEOXYHAEMOGLOBIN AT 1.74 ANGSTROMS RESOLUTION"
        self.expMethod = None           # "X-RAY DIFFRACTION"
        self.resolution = None          # "1.74"
        self.keywords = None            # "OXYGEN TRANSPORT"
        self.nr_entities = None         # "4"
        self.nr_residues = None         # "574"
        self.nr_atoms = None            # "4779"
        self.publish_date = None        # "1984-03-07"
        self.revision_date = None       # "1984-07-17"
        self.structure_authors = None   # "Fermi, G., Perutz, M.F."
        self.pubmedId = None            # "6726807"
        self.citation_authors = None    # "Fermi, G., Perutz, M.F., Shaanan, B., Fourme, R."
        self.status = None              # "CURRENT"
        self.replaces = None            # "1HHB"

        # Attribute names that strings_to_numeric_values() converts.
        self.integer_fields = ('nr_entities', 'nr_residues', 'nr_atoms',
                               'pubmedId')
        self.float_fields = ('resolution',)


# -----------------------------------------------------------------------------
#
def strings_to_numeric_values(o):
    """Convert the numeric attributes of *o* from strings, in place.

    The attributes named in o.integer_fields are passed through int() and
    those named in o.float_fields through float().  Either tuple may be
    absent.  An attribute that is missing, None or not parseable is left
    untouched; only those conversion failures are suppressed.
    """
    for fname, fcast in (('integer', int), ('float', float)):
        for aname in getattr(o, fname + '_fields', ()):
            try:
                setattr(o, aname, fcast(getattr(o, aname)))
            except (AttributeError, TypeError, ValueError):
                # Best effort: keep the original value.
                pass


# -----------------------------------------------------------------------------
#
class PDB_Chain_Info:
    """Per-chain properties (copies, residues, species, ...) of PDB hits."""

    # Each column is (column title, property name, options dict, description).
    # The property names are the keys produced by chain_properties().
    columns = (
        ('Chain names', 'chain_names', {'anchor':"nw"},
         'PDB chain names'),
        ('Copies', 'chain_copies', {'anchor':"n"},
         'PDB number of copies\nof matched chain'),
        ("Polymers", 'npolymers', {'anchor':"n"},
         'PDB number of different\npolymers in structure'),
        ('Residues', 'chain_residues', {'anchor':"n"},
         'PDB number of residues\nin matched chain'),
        ('Species', 'chain_species', {'anchor':"n"},
         'PDB chain taxonomy'),
        ('UniProt', 'chain_sequence_id', {'anchor':"n"},
         'Uniprot sequence identifier\nof matched PDB chain'),
        ('Weight', 'chain_weight', {'anchor':"n"},
         'PDB molecular weight (Daltons)\nof matched chain'),
        )

    def fetch_info(self, pdb_chain_ids):
        """Return a dict mapping each PDB chain id to its chain properties.

        Returns an empty dict when no ids are given or when the fetch fails.
        """
        if len(pdb_chain_ids) == 0:
            return {}

        pdb_ids = [split_pdb_chain_id(pcid)[0].lower()
                   for pcid in pdb_chain_ids]

        # Fetch chain info.
        xml = fetch_from_pdb('describeMol', set(pdb_ids))
        if xml is None:
            # fetch_from_pdb() has already reported the failure.
            return {}
        pos = [self.polymers_info(e)
               for e in xml.getElementsByTagName('structureId')]

        # Compute derived chain attributes
        pomap = dict((pdb_id.lower(), polys) for pdb_id, polys in pos)
        pimap = {}
        for pcid in pdb_chain_ids:
            pdb_id, cid = split_pdb_chain_id(pcid)
            polys = pomap.get(pdb_id.lower(), [])
            pimap[pcid] = self.chain_properties(polys, cid)
        return pimap

    def polymers_info(self, e):
        """Return (pdb_id, list of Polymer) for a <structureId> element."""
        pdb_id = e.getAttribute('id')
        polymers = [self.polymer_info(p)
                    for p in e.getElementsByTagName('polymer')]
        return (pdb_id, polymers)

    def polymer_info(self, e):
        """Build a Polymer from a <polymer> element and its child tags."""
        p = Polymer()
        for a in ('entityNr', 'length', 'type', 'weight'):
            if e.hasAttribute(a):
                setattr(p, a, e.getAttribute(a))
        p.chainIds = [c.getAttribute('id')
                      for c in e.getElementsByTagName('chain')
                      if c.hasAttribute('id')]
        p.macroMoleculeName = tag_attribute(e, 'macroMolecule', 'name')
        mm = e.getElementsByTagName('macroMolecule')
        if mm.length >= 1:
            # The accession id is looked up under the first macroMolecule.
            p.macroMoleculeId = tag_attribute(mm.item(0), 'accession', 'id')
        p.polymerDescription = tag_attribute(e, 'polymerDescription',
                                             'description')
        p.fragment = tag_attribute(e, 'fragment', 'desc')
        p.details = tag_attribute(e, 'details', 'desc')
        p.taxonomyName = tag_attribute(e, 'Taxonomy', 'name')
        p.taxonomyId = tag_attribute(e, 'Taxonomy', 'id')
        strings_to_numeric_values(p)
        return p

    def chain_properties(self, polymers, chain_id):
        """Return the column properties of chain *chain_id*.

        *polymers* is the list of Polymer objects of one PDB entry.  When no
        polymer contains the chain, the counts are 0 and the text values are
        empty strings; 'npolymers' and 'chain_names' still describe the
        whole entry.
        """
        cp = self.chain_polymer(polymers, chain_id)
        found = cp is not None

        # List the matched polymer first in the chain names.
        polys = list(polymers)
        if found:
            polys.remove(cp)
            polys.insert(0, cp)
        cdesc = ['%s: %s %s' % (''.join(p.chainIds), p.polymerDescription,
                                p.fragment)
                 for p in polys]

        pr = {}
        pr['chain_copies'] = len(cp.chainIds) if found else 0
        pr['chain_residues'] = cp.length if found else 0
        pr['npolymers'] = len(polymers)
        pr['chain_names'] = '\n'.join(cdesc)
        pr['chain_species'] = cp.taxonomyName if found else ''
        pr['chain_sequence_id'] = (cp.macroMoleculeId
                                   if found and cp.macroMoleculeId else '')
        pr['chain_weight'] = cp.weight if found else ''
        return pr

    def chain_polymer(self, polymers, chain_id):
        """Return the first polymer containing *chain_id*, or None."""
        for p in polymers:
            if chain_id in p.chainIds:
                return p
        return None


# -----------------------------------------------------------------------------
#
class Polymer:
    """Attributes of one polymer; the trailing comments show sample values."""

    def __init__(self):
        self.entityNr = None            # "1"
        self.length = None              # "141"
        self.type = None                # "protein"
        self.weight = None              # "15150.5"
        self.chainIds = []              # ["A", "C"]
        self.macroMoleculeName = ''     # "Hemoglobin subunit alpha"
        self.macroMoleculeId = None     # "P69905"
        self.polymerDescription = ''    # "HEMOGLOBIN (DEOXY) (ALPHA CHAIN)"
        self.fragment = ''              # "HEAVY CHAIN 1-219"
        self.details = ''               # "OBTAINED BY PAPAIN CLEAVAGE (FAB)"
        self.taxonomyName = ''          # "Homo sapiens"
        self.taxonomyId = None          # "9606"

        # Attribute names that strings_to_numeric_values() converts.
        self.integer_fields = ('entityNr', 'length')
        self.float_fields = ('weight',)


# -----------------------------------------------------------------------------
#
class PDB_Ligand_Info:
    """Per-entry ligand properties (names, formulas, weights, ...)."""

    # Each column is (column title, property name, options dict, description).
    # The property names are the keys produced by ligand_properties().
    columns = (
        ("Ligand formulas", 'ligandFormulas', {'anchor':"nw"},
         'PDB ligand chemical formulas.'),
        ("Ligand names", 'ligandNames', {'anchor':"nw"},
         'PDB ligand chemical names.'),
        ("Ligand smiles", 'ligandSmiles', {'anchor':"nw"},
         'PDB ligand smile strings.'),
        ("Ligand symbols", 'ligandSymbols', {'anchor':"nw"},
         'PDB ligand symbols.'),
        ("Ligand weights", 'ligandWeights', {'anchor':"nw"},
         'PDB ligand weights (Daltons).'),
        )

    def fetch_info(self, pdb_chain_ids):
        """Return a dict mapping each PDB chain id to its ligand properties.

        Returns an empty dict when no ids are given.  When the fetch fails,
        every chain id still maps to the properties of an empty ligand list.
        """
        if len(pdb_chain_ids) == 0:
            return {}

        pdb_ids = [split_pdb_chain_id(pcid)[0].lower()
                   for pcid in pdb_chain_ids]

        xml = fetch_from_pdb('ligandInfo', set(pdb_ids))
        if xml is None:
            los = []
        else:
            los = [self.ligands_info(e)
                   for e in xml.getElementsByTagName('structureId')]
        lmap = dict((pdb_id.lower(), ligands) for pdb_id, ligands in los)
        pmap = dict((pcid, self.ligand_properties(lmap.get(pdb_id, [])))
                    for pcid, pdb_id in zip(pdb_chain_ids, pdb_ids))
        return pmap

    def ligands_info(self, e):
        """Return (pdb_id, list of Ligand) for a <structureId> element."""
        pdb_id = e.getAttribute('id')
        ligands = [self.ligand_info(p)
                   for p in e.getElementsByTagName('ligand')]
        return (pdb_id, ligands)

    def ligand_info(self, e):
        """Build a Ligand from a <ligand> element and its child tags."""
        lig = Ligand()
        for a in ('chemicalID', 'type', 'molecularWeight'):
            if e.hasAttribute(a):
                setattr(lig, a, e.getAttribute(a))
        for t in ('chemicalName', 'formula', 'smiles', 'InChi', 'InChiKey'):
            te = e.getElementsByTagName(t)
            if te.length >= 1:
                setattr(lig, t, tag_text(te.item(0)))
        strings_to_numeric_values(lig)
        return lig

    def ligand_properties(self, ligands):
        """Return the column properties for a sequence of Ligand objects.

        Each text property holds one line per ligand, in the same order.  A
        value the ligand lacks (None) becomes an empty line so that the
        lines of the different properties stay aligned.
        """
        def lines(attr_name):
            return '\n'.join(getattr(lig, attr_name) or ''
                             for lig in ligands)

        d = {
            'ligandNames': lines('chemicalName'),
            'ligandSmiles': lines('smiles'),
            'ligandSymbols': lines('chemicalID'),
            'ligandWeights': FloatList([lig.molecularWeight
                                        for lig in ligands]),
            'ligandFormulas': lines('formula'),
            }
        return d


# -----------------------------------------------------------------------------
# Used to sort molecular weights table column by largest molecular weight.
#
class FloatList:
    """List of numbers shown one per line and ordered by its largest value.

    Values that are not int or float (for example None for a ligand without
    a weight) are ignored when ordering, and are shown as an empty line
    (None) or via str() instead of raising a formatting error.
    """

    def __init__(self, values, format = '%.3f'):
        self.values = values
        self.format = format

    def __str__(self):
        return '\n'.join(self._format_value(v) for v in self.values)

    def _format_value(self, v):
        # Format one value, tolerating values the format cannot take.
        try:
            return self.format % v
        except TypeError:
            return '' if v is None else str(v)

    def _largest(self):
        # Largest numeric value; 0 when there is none.
        nums = [v for v in self.values if isinstance(v, (int, float))]
        return max(nums) if nums else 0

    def __cmp__(self, fl):
        # A FloatList orders after any object that is not a FloatList.
        if not isinstance(fl, FloatList):
            return 1
        a, b = self._largest(), fl._largest()
        return (a > b) - (a < b)

    def __lt__(self, fl):
        # Same ordering as __cmp__, for sorts that use rich comparison.
        if not isinstance(fl, FloatList):
            return False
        return self._largest() < fl._largest()


# -----------------------------------------------------------------------------
#
class Ligand:
    """Attributes of one ligand; the trailing comments show sample values."""

    def __init__(self):
        self.chemicalID = None          # "1B0"
        self.type = None                # "non-polymer"
        self.molecularWeight = None     # "425.522"
        self.chemicalName = ''          # "PHENYLALANINAMIDE"
        self.formula = ''               # "C27 H27 N3 O2"
        self.smiles = ''                # "Cc1c(c2ccccc2[nH]1)CC(=O)N[C@@H](Cc3ccccc3)C(=O)N(C)c4ccccc4"
        self.InChi = None
        self.InChiKey = None

        # Attribute names that strings_to_numeric_values() converts.
        self.float_fields = ('molecularWeight',)


# -----------------------------------------------------------------------------
#
def fetch_from_pdb(query, pdb_ids):
    """Run a PDB REST *query* for *pdb_ids* and return the parsed XML.

    query is the REST service name ('describePDB', 'describeMol' or
    'ligandInfo' in this file) and pdb_ids an iterable of PDB entry ids.
    Returns an xml.dom.minidom document, or None when the request fails or
    the reply is not well-formed XML; the failure is reported through
    chimera.replyobj.info().

    NOTE(review): confirm that this pdb.org REST endpoint is still served.
    """
    ids = ','.join([pdb_id.lower() for pdb_id in pdb_ids])
    url = 'http://www.pdb.org/pdb/rest/%s?structureId=%s' % (query, ids)
    from urllib2 import urlopen, URLError
    try:
        f = urlopen(url)
        try:
            xml_string = f.read()
        finally:
            # Release the connection even when reading fails.
            f.close()
    except URLError as v:
        from chimera.replyobj import info
        info('Fetching BLAST PDB info using URL:\n %s\nFailed %s\n'
             % (url, str(v)))
        return None

    from xml.dom.minidom import parseString
    from xml.parsers.expat import ExpatError
    try:
        xml = parseString(xml_string)
    except ExpatError as v:
        # A reply that is not XML is handled like a failed fetch.
        from chimera.replyobj import info
        info('Parsing BLAST PDB info from URL:\n %s\nFailed %s\n'
             % (url, str(v)))
        return None
    return xml


# -----------------------------------------------------------------------------
#
def tag_attribute(element, tag_name, attr_name, default = ''):
    """Return *attr_name* of the first *tag_name* descendant that has it.

    Returns *default* when no such descendant carries the attribute.
    """
    for e in element.getElementsByTagName(tag_name):
        if e.hasAttribute(attr_name):
            return e.getAttribute(attr_name)
    return default


# -----------------------------------------------------------------------------
#
def tag_text(element, default = ''):
    """Return the text directly inside *element*, joined together.

    Only direct child text nodes are used.  Returns *default* when the
    element has no text node.
    """
    parts = [n.data for n in element.childNodes
             if n.nodeType == n.TEXT_NODE]
    if not parts:
        return default
    return ''.join(parts)


# -----------------------------------------------------------------------------
#
entry_info = PDB_Entry_Info()
chain_info = PDB_Chain_Info()
ligand_info = PDB_Ligand_Info()