# --- UCSF Chimera Copyright --- # Copyright (c) 2000 Regents of the University of California. # All rights reserved. This software provided pursuant to a # license agreement containing restrictions on its disclosure, # duplication and use. This notice must be embedded in or # attached to all copies, including partial copies, of the # software or any revisions or derivations thereof. # --- UCSF Chimera Copyright --- """ reads a Stockholm (Pfaat, HMMer, Pfam) format file """ from chimera.Sequence import Sequence from MultAlignViewer.parse import WrongFileTypeError, FormatSyntaxError, \ makeReadable # extensions to look for in file browser extensions = [".sth", ".sto"] # prefix to use on Chimera command line prefixes = ["sth", "hmmer"] # what type of file do we provide parsing for... fileType = "Stockholm" # what format-specific features can be used generically by MAV... # (also used by saveStockholm.py...) genericFileAttrs = { "AC": "accession", "AU": "author", "CC": "comments", "DE": "description" } genericSeqAttrs = { "AC": "accession", "DE": "description", "WT": "weight" } def parse(fileName): from OpenSave import osOpen from chimera import replyobj f = osOpen(fileName, "r") lineNum = 0 fileAttrs = {} fileMarkups = {} seqAttrs = {} seqMarkups = {} sequences = {} seqSequence = [] for line in f: line = line.rstrip() # drop trailing newline/whitespace lineNum += 1 if lineNum == 1: if line.startswith("# STOCKHOLM"): continue raise WrongFileTypeError() if not line: continue if line.startswith('#='): markupType = line[2:4] markup = line[5:].strip() def trySplit(numSplit): fields = markup.split(None, numSplit) if len(fields) == numSplit: # value is empty fields.append("") if len(fields) != numSplit + 1: raise FormatSyntaxError("Not enough" " arguments after #=%s markup" " on line %d" % ( markupType, lineNum)) return fields if markupType == "GF": tag, val = trySplit(1) tag = tag.replace("_", " ") tag = genericFileAttrs.get(tag, "Stockholm " + tag) if tag in fileAttrs: fileAttrs[tag] += '\n' + val else: fileAttrs[tag] = val elif markupType == "GS": seqName, tag, val = trySplit(2) tag = tag.replace("_", " ") attrs = seqAttrs.setdefault(seqName, {}) tag = genericSeqAttrs.get(tag, "Stockholm " + tag) if tag in attrs: attrs[tag] += '\n' + val else: attrs[tag] = val elif markupType == "GC": tag, val = trySplit(1) tag = tag.replace("_", " ") fileMarkups[tag] = fileMarkups.get(tag, "") + val elif markupType == "GR": seqName, tag, val = trySplit(2) tag = tag.replace("_", " ") seqMarkups.setdefault(seqName, {}).setdefault(tag, "") seqMarkups[seqName][tag] += val # ignore other types continue elif line.startswith('#'): # unstructured comment if 'comments' in fileAttrs: fileAttrs['comments'] += "\n" + line[1:] else: fileAttrs['comments'] = line[1:] continue elif line.strip() == "//": # end of sequence alignment blocks, but comments # may follow this, so keep going... continue # sequence info... try: seqName, block = line.split(None, 1) except ValueError: raise FormatSyntaxError("Sequence info not in name/" "contents format on line %d" % lineNum) if seqName not in sequences: sequences[seqName] = Sequence(makeReadable(seqName)) seqSequence.append(seqName) sequences[seqName].extend(block) f.close() if not sequences: raise FormatSyntaxError("No sequences found") for seqName, seq in sequences.items(): if seqName in seqAttrs: seq.attrs = seqAttrs[seqName] if seqName in seqMarkups: seq.markups = seqMarkups[seqName] for tag, markup in seq.markups.items(): if len(markup) != len(seq): replyobj.warning("Markup %s for" " sequence %s is wrong length;" " ignoring\n" % (tag, seqName)) del seq.markups[tag] for seqInfo, label in [(seqAttrs, "sequence"), (seqMarkups, "residue")]: for seqName in seqInfo.keys(): if seqName in sequences: continue # might be sequence name without trailing '/start-end' for fullName in sequences.keys(): if fullName.startswith(seqName) \ and fullName[len(seqName)] == '/' \ and '/' not in fullName[len(seqName)+1:]: break else: raise FormatSyntaxError("%s annotations " "provided for non-existent sequence %s" % (label.capitalize(), seqName)) replyobj.info("Updating %s %s annotions with %s " "annotations\n" % (fullName, label, seqName)) seqInfo[fullName].update(seqInfo[seqName]) del seqInfo[seqName] for tag, markup in fileMarkups.items(): if len(markup) != len(sequences[seqSequence[0]]): raise FormatSyntaxError("Column annotation %s is" " wrong length" % tag) return map(lambda name: sequences[name], seqSequence), \ fileAttrs, fileMarkups