# --- UCSF Chimera Copyright --- # Copyright (c) 2000 Regents of the University of California. # All rights reserved. This software provided pursuant to a # license agreement containing restrictions on its disclosure, # duplication and use. This notice must be embedded in or # attached to all copies, including partial copies, of the # software or any revisions or derivations thereof. # --- UCSF Chimera Copyright --- """ reads a PIR file """ import string from chimera.Sequence import Sequence from MultAlignViewer.parse import WrongFileTypeError, FormatSyntaxError, \ makeReadable # extensions to look for in file browser extensions = [".hssp"] # prefix to use on Chimera command line prefixes = ["hssp"] # what type of file do we provide parsing for... fileType = "HSSP" def parse(fileName): from OpenSave import osOpen f = osOpen(fileName, "r") doing = None sequences = [] headerOK = False lineNum = 0 alignStartIndex = None for line in f: if doing == 'alignments': # don't strip() alignment section since it has significant # leading spaces line = line.rstrip() else: line = line.strip() lineNum += 1 if not headerOK: if line.lower().startswith('hssp'): headerOK = True continue raise WrongFileTypeError("No initial HSSP header line") if line.startswith('##'): if doing == 'proteins' and not sequences: raise FormatSyntaxError("No entries in PROTEINS section") try: doing = line.split()[1].lower() except IndexError: doing = None if doing == 'alignments': try: hashes, alignments, begin, dash, end = line.strip().split() begin = int(begin) end = int(end) except ValueError: raise FormatSyntaxError("ALIGNMENTS line (line #%d) not of " "the form: ## ALIGNMENTS (number) - (number)" % lineNum) continue if doing == 'proteins': if not line[0].isdigit(): continue try: seqName = line.split()[2] except IndexError: raise WrongFormatError("Line %d in PROTEINS section does not " "start with [integer] : [sequence name]" % lineNum) sequences.append(Sequence(makeReadable(seqName))) elif doing == 'alignments': if line.lstrip().lower().startswith('seqno'): try: alignStartIndex = line.index('.') except: raise FormatSyntaxError("No indication of alignment " " starting column ('.' character) in SeqNo line " " in ALIGNMENTS section") continue if alignStartIndex == None: raise FormatSyntaxError("No initial SeqNo line in " "ALIGNMENTS section") block = line[alignStartIndex:] if not block: raise FormatSyntaxError("No alignment block given on line %d" % lineNum) blockLen = end - begin + 1 if len(block) > blockLen: raise FormatSyntaxError("Too many characters (%d, only %d " " sequences) in alignment block given on line %d" % (len(block), blockLen, lineNum)) block = block + ' ' * (blockLen - len(block)) for seq, c in zip(sequences[begin-1:end], block): seq.append(c) f.close() return sequences, {}, {}