# Use 4-space tab stops when editing. :set ts=4 # ----------------------------------------------------------------------------- # Read mmCIF file using Python mmLib module and make a Chimera Molecule. # # Tested with pymmlib 0.9.7 which must be installed separately into Chimera # Python site-packages directory. # # ----------------------------------------------------------------------------- # def open_mmcif(path): from OpenSave import osOpen p = osOpen(path) # # No bonds are created for standard mmCIF files from the PDB unless the # library_bonds or distance_bonds options are given. # # It appears that mmlib 0.9.7 can only link residues of a polymer using # distance rules. Residue templates are used for intra-residue bonds. # from mmLib.FileIO import LoadStructure from mmLib.mmCIF import mmCIFSyntaxError try: struct = LoadStructure(fil=p, format="CIF", library_bonds=True, auto_sort=False) except mmCIFSyntaxError, e: from chimera import NotABug raise NotABug("%s: %s" % (path, str(e))) except SystemExit: from chimera import LimitationError raise LimitationError("%s: fatal error reading .cif file" % path) mList = build_chimera_molecule(struct) import os.path for m in mList: m.name = os.path.basename(path) return mList # ----------------------------------------------------------------------------- # def build_chimera_molecule(mmlib_structure): from mmLib.Structure import fragment_id_split import chimera import numpy from string import letters from mmLib.CIFBuilder import add_number, add_string modelMap = {} # mmLib structure to Chimera molecule atoms = {} # mmLib atom to chimera atom mapping residues = {} # fragment to chimera residue mapping seq2frag = {} # entity-sequence to fragment mapping sa2r = {} # sequence-asym_id to chimera residue mapping e2c = {} # entity to chain mapping # if cif (rather than mmCIF) get aniso info in there if available... anisoTable = None if hasattr(mmlib_structure, 'cif_data'): cif_data = mmlib_structure.cif_data for t in cif_data.tables: if "atom_site_aniso_label" in t.columns: anisoTable = t anisoNameMap = {} for row in range(len(anisoTable.rows)): temp = {} add_string(temp, "x", anisoTable, "atom_site_aniso_label", row) anisoNameMap[temp["x"]] = row break # Repackage unused data from files into "headers" analogous # to PDB headers if hasattr(mmlib_structure, 'cif_data'): # CIF data are in cif_data (CIF.DataBlock) # We convert tags into a simple dictionary # and each table into list of dictionaries import copy cifTags = copy.copy(mmlib_structure.cif_data.tags) cifTables = dict() for ti, table in enumerate(mmlib_structure.cif_data.tables): tableName = _tableNameFromColumns(table.columns, ti) rowList = list() for row in table.rows: rowList.append(dict(zip(table.columns, row))) cifTables[tableName] = rowList cifHeaders = { "tags": cifTags, "tables": cifTables } def addFileData(m, headers=cifHeaders): m.cifHeaders = headers else: # mmCIF data are in cifdb (mmCIFDB.mmCIFDB) # Each table has a name, which becomes the key into # the mmCIFHeaders dictionary. Each table is turned # into a list of dictionaries, just like CIF above. mmCIFHeaders = dict() for table in mmlib_structure.cifdb: # table is an instance of mmCIFTable tableData = [] for row in table: # row is an instance of mmCIFRow tableData.append(dict(row)) mmCIFHeaders[table.name] = tableData def addFileData(m, headers=mmCIFHeaders): m.mmCIFHeaders = headers # Make molecules for mm in mmlib_structure.iter_models(): m = chimera.Molecule() addFileData(m) m.cif_unitCell = mmlib_structure.unit_cell modelMap[mm] = m seq2frag[m] = {} sa2r[m] = {} e2c[m] = {} # Make atoms and residues as needed name_to_element = element_table() for mm in mmlib_structure.iter_models(): m = modelMap[mm] for ma in mm.iter_all_atoms(): mf = ma.get_fragment() try: chain_id = ma.asym_id.strip() except AttributeError: chain_id = ma.chain_id.strip() try: r = residues[mf] except KeyError: seq, icode = fragment_id_split(mf.fragment_id) if icode is None: rid = chimera.MolResId(chain_id, seq) else: rid = chimera.MolResId(chain_id, seq, insert=icode) r = m.newResidue(mf.res_name, rid) for c in chain_id: if c.islower(): m.lowerCaseChains = True residues[mf] = r if not mf.is_standard_residue(): r.isHet = True sf_key = (ma.label_entity_id, ma.label_seq_id) d = seq2frag[m].setdefault(sf_key, {}) d[chain_id] = mf sa2r[m][(ma.label_seq_id, ma.label_asym_id)] = r s = e2c[m].setdefault(ma.label_entity_id, set([])) s.add(chain_id) if not ma.element: ename = ma.name[0] if len(ma.name) > 1 and ma.name[1] in letters: ename += ma.name[1].lower() elif len(ma.element) > 1: ename = ma.element[0].upper() + ma.element[1:].lower() while len(ename) > 1: if ename[-1].isalpha(): break ename = ename[:-1] else: ename = ma.element try: element = name_to_element[ename] except KeyError: # Don't know what it is, guess carbon element = name_to_element['C'] a = m.newAtom(ma.name.replace('*', "'"), element) a.altLoc = ma.alt_loc c = chimera.Coord() c.x, c.y, c.z = ma.position a.setCoord(c) # a.coord = c does not work if ma.temp_factor is not None: a.bfactor = ma.temp_factor if ma.occupancy is not None: a.occupancy = ma.occupancy if ma.U is not None: a.anisoU = numpy.array(ma.U, 'f') elif anisoTable and ma.name in anisoNameMap: anisoDict = {} anisoRow = anisoNameMap[ma.name] for pos in ["11", "22", "33", "23", "13", "12"]: add_number(anisoDict, "u"+pos, anisoTable, "atom_site_aniso_u_"+pos, anisoRow) a.anisoU = numpy.array([ [anisoDict['u11'], anisoDict['u12'], anisoDict['u13']], [anisoDict['u12'], anisoDict['u22'], anisoDict['u23']], [anisoDict['u13'], anisoDict['u23'], anisoDict['u33']] ], 'f') atoms[ma] = a r.addAtom(a) # Make bonds numBonds = 0 for mm in mmlib_structure.iter_models(): m = modelMap[mm] for ma in mm.iter_all_atoms(): for mb in ma.iter_bonds(): a1 = atoms[mb.get_atom1()] a2 = atoms[mb.get_atom2()] try: m.newBond(a1, a2) except TypeError: pass else: numBonds += 1 # Hack city! mmCIF templates for amino acids do not have # H1 or H3 in them, so the N-terminus amino acid may be # left with two connected atoms. for mm in mmlib_structure.iter_models(): m = modelMap[mm] for mf in mm.iter_fragments(): if not mf.is_amino_acid(): continue r = residues[mf] h1Atoms = r.findRangeAtoms("H1") h3Atoms = r.findRangeAtoms("H3") if not h1Atoms and not h3Atoms: continue h1List = h1Atoms.get("H1", []) h3List = h3Atoms.get("H3", []) nAtoms = r.findRangeAtoms("N") if not nAtoms: continue nList = nAtoms.get("N", []) numBonds += connectHydrogen(m, h1List, nList) numBonds += connectHydrogen(m, h3List, nList) # Make inter-residue bonds from chimera.resCode import res3to1 from chimera import Sequence residueAfter = {} try: table = mmlib_structure.cifdb["entity_poly_seq"] except KeyError: for m in modelMap.itervalues(): connectChain(m) else: for m in modelMap.itervalues(): seqMap = {} prevEntity = None prevDict = None for row in table.iter_rows(): thisEntity = row["entity_id"] try: chains = e2c[m][thisEntity] except KeyError: pass else: try: three = row["mon_id"] except KeyError: monomer = 'X' else: monomer = res3to1(three) for chainId in chains: try: seq = seqMap[chainId] except KeyError: if chainId == " ": name = Sequence.PRINCIPAL else: name = Sequence.CHAIN_FMT % chainId seq = Sequence.Sequence(name) seqMap[chainId] = seq seq.append(monomer) num = row["num"] try: thisDict = seq2frag[m][(thisEntity, num)] except KeyError: # leave the prev variables so we will connect # residues in the same chain. continue if prevDict is not None and prevEntity == thisEntity: for chain_id, prevR in prevDict.iteritems(): try: thisR = thisDict[chain_id] except KeyError: continue isAA = thisR.is_amino_acid() isNA = thisR.is_nucleic_acid() wasAA = prevR.is_amino_acid() wasNA = prevR.is_nucleic_acid() if isAA and wasAA: numBonds += connectResidues(prevR, [ "C" ], thisR, [ "N" ], atoms, residueAfter) if isNA and wasNA: numBonds += connectResidues(prevR, [ "O3'", "O3*" ], thisR, [ "P" ], atoms, residueAfter) prevDict = thisDict prevEntity = thisEntity m.cifPolySeq = seqMap # If no bonds were made, make them if numBonds == 0: import chimera for m in modelMap.itervalues(): chimera.connectMolecule(m) else: # Sometimes CIF files leave off connectivity for # hydrogen atoms. If a molecule consists of a # single residue and it has disconnected atoms, # try to connect them up. for m in modelMap.itervalues(): if len(m.residues) != 1: continue for a in m.atoms: if len(a.bonds) == 0: chimera.connectMolecule(m) break # Assign helix secondary structure try: table = mmlib_structure.cifdb["struct_conf"] except KeyError: pass else: for m in modelMap.itervalues(): for row in table.iter_rows(): typeId = row["conf_type_id"] if typeId.startswith("HELX"): ssType = "isHelix" else: continue bAsymId = row["beg_label_asym_id"] bSeqId = row["beg_label_seq_id"] eAsymId = row["end_label_asym_id"] eSeqId = row["end_label_seq_id"] try: bRes = sa2r[m][(bSeqId, bAsymId)] except KeyError: print "No residue match asym_id=%s seq_id=%s" % (bAsymId, bSeqId) continue try: eRes = sa2r[m][(eSeqId, eAsymId)] except KeyError: print "No residue match asym_id=%s seq_id=%s" % (eAsymId, eSeqId) continue # Make sure we have a clear list of residues # from beginning to end r = bRes while r is not eRes and r is not None: r = residueAfter.get(r, None) if r is None: print "Non-consecutive residues (%s.%s) -> (%s.%s)" % ( bSeqId, bAsymId, eSeqId, eAsymId) continue r = bRes while r is not eRes: setattr(r, ssType, True) r = residueAfter[r] setattr(eRes, ssType, True) # Assign secondary structure (sheet) try: table = mmlib_structure.cifdb["struct_sheet_range"] except KeyError: pass else: for m in modelMap.itervalues(): for row in table.iter_rows(): bAsymId = row["beg_label_asym_id"] bSeqId = row["beg_label_seq_id"] eAsymId = row["end_label_asym_id"] eSeqId = row["end_label_seq_id"] try: bRes = sa2r[m][(bSeqId, bAsymId)] except KeyError: print "No residue match asym_id=%s seq_id=%s" % (bAsymId, bSeqId) continue try: eRes = sa2r[m][(eSeqId, eAsymId)] except KeyError: print "No residue match asym_id=%s seq_id=%s" % (eAsymId, eSeqId) continue # Make sure we have a clear list of residues # from beginning to end r = bRes while r is not eRes and r is not None: r = residueAfter.get(r, None) if r is None: print "Non-consecutive residues (%s.%s) -> (%s.%s)" % ( bSeqId, bAsymId, eSeqId, eAsymId) continue r = bRes while r is not eRes: r.isSheet = True r = residueAfter[r] eRes.isSheet = True # Add any modres information to resCode dictionaries try: table = mmlib_structure.cifdb["struct_conn"] except KeyError: pass else: from chimera import resCode dicts = (resCode.regex3to1, resCode.nucleic3to1, resCode.protein3to1, resCode.standard3to1) for row in table.iter_rows(): if row["conn_type_id"] != "modres": continue try: name = row["ptnr1_auth_comp_id"] except KeyError: name = row["ptnr1_label_comp_id"] stdName = row["pdbx_ptnr1_standard_comp_id"] for d in dicts: if stdName in d: d[name] = d[stdName] return modelMap.values() def connectResidues(prev, prevAtoms, this, thisAtoms, atoms, residueAfter): numAdded = 0 prevList = [ ma for ma in prev.iter_all_atoms() if ma.name in prevAtoms ] thisList = [ ma for ma in this.iter_all_atoms() if ma.name in thisAtoms ] for prevA in prevList: for thisA in thisList: if alt_loc_match(prevA.alt_loc, thisA.alt_loc): a1 = atoms[prevA] m1 = a1.molecule a2 = atoms[thisA] m2 = a2.molecule if m1 != m2: continue try: m1.newBond(a1, a2) except TypeError: # This may happen if the bond was already # specified in struct_conn section pass else: numAdded += 1 residueAfter[a1.residue] = a2.residue return numAdded def connectChain(m): # Called when poly_seq is missing. # Connect residues in model m if they have the same # chain id and are of the same type (amino acid/nucleotide). # We should probably check sequence to make sure they are # adjacent but we will rely on long-bond postprocessing for now. prevType = prevAtoms = prevChain = None for r in m.residues: n = r.findRangeAtoms("N") ca = r.findRangeAtoms("CA") c = r.findRangeAtoms("C") if n and ca and c: rType = "AA" else: o3p = r.findRangeAtoms("O3'").values() p = r.findRangeAtoms("P").values() if o3p and p: rType = "Nuc" else: rType = None if rType is None: # Not AA or nucleotide, break chain prevType = prevAtoms = prevChain = None continue elif rType == "AA": chief = n["N"] link = c["C"] else: chief = p["P"] link = o3p["O3'"] chain = r.id.chainId if prevType == rType and prevAtoms is not None and prevChain == chain: # Same type and chain as previous residue, connect for prevA in prevAtoms: for thisA in chief: if alt_loc_match(prevA.altLoc, thisA.altLoc): m.newBond(prevA, thisA) prevType = rType prevAtoms = link prevChain = chain def _tableNameFromColumns(columns, tableIndex): shortest = min([ len(c) for c in columns ]) class Found(Exception): pass try: for i in range(shortest): ch = columns[0][i] for c in columns[1:]: if c[i] != ch: raise Found(i) except Found, found: n = found.args[0] name = columns[0][:n] while not name[-1].isalnum(): name = name[:-1] return name else: return "table%d" % tableIndex def alt_loc_match(al1, al2): return al1 == al2 or al1 == '' or al2 == '' def connectHydrogen(m, hList, nList): nb = 0 for h in hList: for n in nList: if h.altLoc and n.altLoc and h.altLoc != n.altLoc: continue try: m.newBond(n, h) except TypeError: pass else: nb += 1 return nb def dump_attributes(obj): import pprint for a in dir(obj): if a.startswith("_"): continue v = getattr(obj, a) if callable(v): continue print " %s:" % a, pprint.pprint(v) # ----------------------------------------------------------------------------- # name_to_element_dict = None def element_table(): global name_to_element_dict if name_to_element_dict == None: name_to_element_dict = {} from chimera import elements for name in elements.name: name_to_element_dict[name] = getattr(elements, name) return name_to_element_dict