# --- UCSF Chimera Copyright ---
# Copyright (c) 2000 Regents of the University of California.
# All rights reserved.  This software provided pursuant to a
# license agreement containing restrictions on its disclosure,
# duplication and use.  This notice must be embedded in or
# attached to all copies, including partial copies, of the
# software or any revisions or derivations thereof.
# --- UCSF Chimera Copyright ---

PRINCIPAL = "principal chain"
CHAIN_FMT = "chain %s"

clustalStrongGroups = ["STA", "NEQK", "NHQK", "NDEQ", "QHRK",
						"MILV", "MILF", "HY", "FYW"]
clustalWeakGroups = ["CSA", "ATV", "SAG", "STNK", "STPA", "SGND",
				"SNDEQK", "NDEQHK", "NEQHRK", "FVLIM", "HFY"]

defHelixColor = "goldenrod"
defStrandColor = "lime green"

# this is imported by chimera.__init__, so minimize global imports
import string

class Sequence(object):
	"""
	Sequence is a single sequence

	internally a list of characters (available in the 'sequence' attribute)
	external interface as a string
	"""

	_name = "sequence"

	SS_HELIX = 'H'
	SS_OTHER = 'O'
	SS_STRAND = 'S'

	TRIG_RENAME = "rename"

	def __init__(self, name=None):
		self.sequence = []
		if name is not None:
			self._name = name
		self.attrs = {} # miscellaneous attributes
		self.markups = {} # per-residue (strings or lists)
		self.numberingStart = None
		self._cache = {}
		from chimera.triggerSet import TriggerSet
		self.triggers = TriggerSet()
		self.triggers.addTrigger(self.TRIG_RENAME)

	def append(self, item):
		self._cache = {}
		try:
			itemList = list(item)
		except TypeError:
			itemList = [item]
		self.sequence.extend(itemList)

	def __copy__(self, copySeq=None):
		from copy import copy
		if copySeq is None:
			copySeq = Sequence(self.name)
		copySeq.attrs = copy(self.attrs)
		copySeq.markups = copy(self.markups)
		copySeq.sequence = copy(self.sequence)
		copySeq.numberingStart = self.numberingStart
		return copySeq

	def __del__(self):
		if hasattr(self, '_removeHandlerID'):
			try:
				from chimera import openModels
			except ImportError:
				return
			openModels.deleteRemoveHandler(self._removeHandlerID)

	def __delitem__(self, key):
		self._cache = {}
		del self.sequence[key]

	def __delslice__(self, i, j):
		self._cache = {}
		del self.sequence[i:j]

	extend = append

	def fullName(self):
		return self.name

	def gapped2ungapped(self, index):
		try:
			g2u = self._cache["g2u"]
		except KeyError:
			self.ungapped()
			g2u = self._cache["g2u"]
		try:
			return g2u[index]
		except KeyError:
			return None

	def __getitem__(self, key):
		return self.sequence[key]

	def __getslice__(self, i, j):
		return "".join(self.sequence[i:j])

	def __hash__(self):
		return id(self)

	def inGap(self, pos):
		return not self.sequence[pos].isalpha()
	isGap = inGap

	def insert(self, pos, insertion):
		self._cache = {}
		self.sequence.insert(pos, insertion)

	def __len__(self):
		return len(self.sequence)
	
	def _getName(self):
		return self._name

	def _setName(self, name):
		if name == self._name:
			return
		oldName = self._name
		self._name = name
		self.triggers.activateTrigger(self.TRIG_RENAME, (self, oldName))

	name = property(_getName, _setName)

	def patternMatch(self, expr):
		"""expr is a pattern compiled with re.compile()"""
		matches = []
		target = self.ungapped()
		startPos = 0
		while startPos < len(target):
			match = expr.search(target, startPos)
			if not match:
				break
			matches.append([self.ungapped2gapped(i)
				for i in [match.start(), match.end()-1]])
			startPos = match.start() + 1
		return matches

	def regexMatch(self, regex):
		"""regex is regular expression compilable with re.compile()"""
		import re
		return self.patternMatch(re.compile(regex))

	def prositeMatch(self, pattern):
		"""PROSITE matching

		   Return a list of start/end tuple for matches to the
		   given PROSITE pattern (see http://us.expasy.org/tools/
		   scanprosite/scanprosite-doc.html)
		"""
		if len(pattern) > 1 and pattern[0] in string.ascii_uppercase \
		and pattern[1] in string.ascii_uppercase and '-' not in pattern:
			pattern = '-'.join(pattern)
		elements = pattern.split('-')
		if not elements:
			return []

		elements = [e.strip() for e in elements]
		try:
			target = self._cache["prositeTarget"]
		except KeyError:
			target = self.ungapped().upper()
			self._cache["prositeTarget"] = target

		matches = []
		for start in range(len(target)):
			matches.extend(self._prositeMatch(target, elements,
								start))
		# due to ranges, there may be multiple different matches 
		# to the exact same characters, so eliminate duplicates
		# while remapping indices into gapped sequence...
		gapped = {}
		for start, end in matches:
			gapped[(self.ungapped2gapped(start),
				self.ungapped2gapped(end))] = 1
		return gapped.keys()

	def _prositeMatch(self, target, elements, start):
		from chimera import UserError
		try:
			partials = self._prositeMatchElement(target,
							elements[0], start)
		except _PrositePatternError:
			raise UserError("Unrecognized PROSITE pattern element:"
							" %s" % elements[0])
		if partials is None:
			return []
		tailElements = elements[1:]
		matches = []
		for end in partials:
			# handle the rare case of '>' inside brackets...
			if (end == len(target)-1 
			and len(tailElements) == 1 
			and tailElements[0][0] == '['
			and tailElements[0][-1] == ']'
			and '>' in tailElements[0]) or not tailElements:
				matches.append((start, end))
			else:
				if end >= len(target)-1:
					continue
				tailMatches = self._prositeMatch(target,
							tailElements, end+1)
				if not tailMatches:
					continue
				matches.extend([ (start, tEnd)
					for tStart, tEnd in tailMatches])
		return matches
				
	def _prositeMatchElement(self, target, element, start):
		if element and element[0] == '<':
			if start > 0:
				return None
			element = element[1:]
		if not element:
			from chimera import UserError
			raise UserError(
			     "Element of PROSITE pattern is empty or lone '<'")
		try:
			simple, minTimes, maxTimes = self._prositeParseRange(
									element)
		except ValueError:
			simple = element
			minTimes = maxTimes = 1

		return self._prositeMatchRange(target, simple, start,
							minTimes, maxTimes)

	def _prositeParseRange(self, element):
		lp = element.rindex('(')
		try:
			comma = element.rindex(',')
		except ValueError:
			comma = None
		rp = element.rindex(')')
		if lp == 0:
			raise ValueError, "No element to repeat"
		if comma is None:
			if not lp < rp:
				raise ValueError, "Range elements disordered"
			minTimes = maxTimes = int(element[lp+1:rp])
		else:
			if not lp < comma < rp:
				raise ValueError, "Range elements disordered"
			minTimes = int(element[lp+1:comma])
			maxTimes = int(element[comma+1:rp])
			if minTimes > maxTimes:
				raise ValueError, \
				   "Minimum range greater than maximum range"
		return element[:lp], minTimes, maxTimes

	def _prositeMatchRange(self, target, element, start,
							minTimes, maxTimes):
		for i in range(minTimes):
			if start+i >= len(target):
				return None
			if not self._prositeMatchSimple(target, element,
								start+i):
				return None

		matches = [start+minTimes-1]

		for i in range(minTimes, maxTimes):
			if start+i >= len(target):
				break
			if not self._prositeMatchSimple(target, element,
								start+i):
				break
			matches.append(start+i)
		return matches

	def _prositeMatchSimple(self, target, element, pos):
		char = target[pos]
		if len(element) == 1:
			if element == 'x' or element == char:
				return [(pos, pos)]
			return None
				
		if element[0] == '[' and element[-1] == ']':
			for matchable in element[1:-1]:
				if matchable == char:
					return [(pos, pos)]
			return None
		if element[0] == '{' and element[-1] == '}':
			for avoid in element[1:-1]:
				if avoid == char:
					return None
			return [(pos, pos)]
				
		raise _PrositePatternError

	def saveInfo(self):
		"""info that can be used with restoreSequence()"""
		saveInfo = {
			'name': self.name,
			'sequence': self.sequence,
			'attrs': self.attrs,
			'markups': self.markups,
			'numberingStart': self.numberingStart
		}
		if hasattr(self, 'circular'):
			saveInfo['circular'] = self.circular
		return saveInfo

	def __setitem__(self, key, val):
		self._cache = {}
		self.sequence[key] = val

	def __setslice__(self, i, j, seq):
		self._cache = {}
		self.sequence[i:j] = list(seq)

	def ssType(self, loc, locIsUngapped=False):
		try:
			ssMarkup = self.markups['SS']
		except KeyError:
			return None
		if not locIsUngapped:
			loc = self.gapped2ungapped(loc)
		if loc is None:
			return None
		ss = ssMarkup[loc]
		if ss in "HGI":
			return self.SS_HELIX
		if ss == "E":
			return self.SS_STRAND
		return self.SS_OTHER

	def __str__(self):
		try:
			return "".join(self.sequence)
		except TypeError:
			return "non-ASCII sequence (%s)" % self.name

	def ungapped(self, _raw=False):
		try:
			return self._cache["ungapped"]
		except KeyError:
			pass
		self._cache["ungapped"] = "".join([c for c in self.sequence
							if c.isalpha() or c == '?'])
		if _raw:
			return self._cache["ungapped"]
		uI = 0
		g2u = {}
		u2g = {}
		seq = self.sequence
		for i, char in enumerate(seq):
			if char.isalpha() or char == '?':
				g2u[i] = uI
				u2g[uI] = i
				uI += 1
		self._cache["g2u"] = g2u
		self._cache["u2g"] = u2g
				
		return self._cache["ungapped"]

	def ungapped2gapped(self, index):
		try:
			return self._cache["u2g"][index]
		except KeyError:
			self.ungapped()
			return self._cache["u2g"][index]

class StructureSequence(Sequence):
	"""
	Sequence of a structure chain (subclass of Sequence)
	"""

	TRIG_DELETE = "delete"
	TRIG_MODIFY = "modify"

	def __init__(self, molecule, *args, **kw):
		Sequence.__init__(self, *args, **kw)
		self.residues = []
		self.resMap = {}
		self.molecule = molecule
		# allow numberingStart to be dynamic
		self._numberingStart = self.numberingStart
		delattr(self, 'numberingStart')
		self.descriptiveName = None

		# since bound methods are "first class" objects, using
		# proxies to them doesn't work as you would expect
		# (the proxy immediately becomes invalid), so...
		from chimera import triggers, update, triggerSet
		from weakref import proxy
		def wrapper1(a1, a2, a3, s=proxy(self)):
			s._delMoleculeCB(a1, a2, a3)
		self._removeHandlerID = triggers.addHandler("Molecule",
					wrapper1, None)
		def wrapper2(a1, a2, a3, s=proxy(self)):
			s._residueCB(a1, a2, a3)
		if update.inTriggerProcessing:
			# avoid handling expensive and redundant pending
			# Residue trigger callback
			def mcCB(*args):
				self._residueHandlerID = triggers.addHandler("Residue",
					wrapper2, None)
				return triggerSet.ONESHOT
			self._delayedResHandlerID = triggers.addHandler(
				"monitor changes", mcCB, None)
		else:
			self._residueHandlerID = triggers.addHandler("Residue",
					wrapper2, None)
		self.triggers.addTrigger(self.TRIG_DELETE)
		self.triggers.addTrigger(self.TRIG_MODIFY)

	def __getattr__(self, attrName):
		if attrName == 'numberingStart':
			if self._numberingStart == None:
				for i, r in enumerate(self.residues):
					if r is None:
						continue
					if r.__destroyed__:
						return getattr(self, '_prevNumberingStart', 1)
					break
				else:
					return getattr(self, '_prevNumberingStart', 1)
				pns = self._prevNumberingStart = r.id.position - i
				return pns
			return self._numberingStart
		if attrName == 'chainID':
			ids = set([r.id.chainId for r in self.residues if r])
			if len(ids) == 1:
				return ids.pop()
			if len(ids) == 0:
				raise ValueError("No residues in chain")
			raise ValueError("Multiple chain IDs in chain")
		raise AttributeError("%s instance has no attribute '%s'"
				% (self.__class__.__name__, attrName))

	def append(self, item):
		try:
			itemList = list(item)
		except TypeError:
			itemList = [item]
		if itemList and isinstance(itemList[0], basestring):
			return Sequence.extend(self, itemList)
		base = len(self.residues)
		for i, res in enumerate(itemList):
			self.resMap[res] = base + i
		self.residues.extend(itemList)
		from resCode import res3to1
		Sequence.extend(self, [res3to1(r.type) for r in itemList])

	def _bestUngapped(self, i):
		end = len(self)
		while i < end:
			un = self.gapped2ungapped(i)
			if un is not None:
				return un
			i += 1
		return end

	def _cleanup(self):
		if not hasattr(self, 'molecule'):
			return
		# invalidate first since TRIG_DELETE will remove
		# sequence from _SequenceSequences
		if not self.molecule.__destroyed__ \
		and hasattr(self.molecule, '_SequenceSequences') \
		and self in self.molecule._SequenceSequences[0]: # not a copy
			invalidate(self.molecule)
		self.triggers.activateTrigger(self.TRIG_DELETE, self)
		delattr(self, 'molecule')
		delattr(self, 'residues')
		delattr(self, 'resMap')
		from chimera import triggers
		triggers.deleteHandler("Molecule", self._removeHandlerID)
		delattr(self, '_removeHandlerID')
		if hasattr(self, '_residueHandlerID'):
			triggers.deleteHandler("Residue", self._residueHandlerID)
			delattr(self, '_residueHandlerID')
		else:
			triggers.deleteHandler("monitor changes", self._delayedResHandlerID)

	def __copy__(self, copySeq=None):
		from copy import copy
		if copySeq is None:
			copySeq = StructureSequence(self.molecule, self.name)
		copySeq = Sequence.__copy__(self, copySeq=copySeq)
		copySeq._numberingStart = self._numberingStart
		delattr(copySeq, 'numberingStart')
		copySeq.residues.extend(self.residues)
		copySeq.resMap.update(self.resMap)
		if hasattr(self, 'circular'):
			copySeq.circular = self.circular
		copySeq.fromSeqres = self.fromSeqres
		copySeq.descriptiveName = self.descriptiveName
		if hasattr(self, 'residueSequence'):
			copySeq.residueSequence = self.residueSequence
		return copySeq

	def __del__(self):
		try:
			from chimera import openModels, triggers
		except ImportError:
			return
		self.destroy()

	def __delitem__(self, key):
		ungappedKey = self.gapped2ungapped(key)
		if ungappedKey is not None:
			res = self.residues[ungappedKey]
			del self.residues[ungappedKey]
			if res:
				del self.resMap[res]
			for res, i in self.resMap.items():
				if i > ungappedKey:
					self.resMap[res] = i-1
		Sequence.__delitem__(self, key)
		self.triggers.activateTrigger(self.TRIG_MODIFY, self)

	def _delMoleculeCB(self, trigName, myData, trigData):
		if self.molecule in trigData.deleted:
			self.demoteToSequence()

	def __delslice__(self, i, j):
		ungappedI = self._bestUngapped(i)
		ungappedJ = self._bestUngapped(j)
		for res in self.residues[ungappedI:ungappedJ]:
			if res:
				del self.resMap[res]
		del self.residues[ungappedI:ungappedJ]
		for res, k in self.resMap.items():
			if k >= j:
				self.resMap[res] = k - (j-i)
		Sequence.__delslice__(self, i, j)
		self.triggers.activateTrigger(self.TRIG_MODIFY, self)

	def demoteToSequence(self):
		numberingStart = self.numberingStart
		self._cleanup()
		self.__class__ = Sequence
		self.numberingStart = numberingStart

	def destroy(self):
		self._cleanup()

	extend = append

	def _getFromSeqres(self):
		return getattr(self, "_fromSeqres", None)

	def _setFromSeqres(self, fsVal):
		if fsVal == self.fromSeqres:
			return
		if self.fromSeqres:
			for pos in range(len(self)-1, -1, -1):
				if not self.residues[pos]:
					Sequence.__delitem__(self, self.ungapped2gapped(pos))
					del self.residues[pos]
		self._fromSeqres = fsVal

	fromSeqres = property(_getFromSeqres, _setFromSeqres)

	def fullName(self):
		rem = self.name
		for part in (self.molecule.name, "(%s)" % self.molecule):
			rem = rem.strip()
			if rem:
				rem = rem.strip()
				if rem.startswith(part):
					rem = rem[len(part):]
					continue
			break
		if rem and not rem.isspace():
			namePart = " " + rem.strip()
		else:
			namePart = ""
		return "%s (%s)%s" % (self.molecule.name, self.molecule, namePart)

	def hasProtein(self):
		"""contains some protein residues"""
		from resCode import protein3to1
		for r in self.residues:
			if r and r.type in protein3to1:
				return True
		return False

	def insert(self, pos, insertion):
		if type(insertion) == str:
			return Sequence.insert(self, pos, insertion)
		for r, i in self.resMap.items():
			if i >= pos:
				self.resMap[r] = i+1
		self.residues.insert(pos, insertion)
		self.resMap[insertion] = pos
		from resCode import res3to1
		Sequence.insert(self, pos, res3to1(insertion.type))
		self.triggers.activateTrigger(self.TRIG_MODIFY, self)

	def _residueCB(self, trigName, ignore, resChanges):
		deletions = [dr for dr in resChanges.deleted
							if dr in self.resMap]
		if len(deletions) == len([r for r in self.residues if r]):
			self.demoteToSequence()
			return
		# sort deletions into descending order so we can easily
		# delete from our residue list
		deletions.sort(lambda r1, r2:
					cmp(self.resMap[r2], self.resMap[r1]))
		for dr in deletions:
			pos = self.resMap[dr]
			del self.resMap[dr]
			if self.fromSeqres:
				self.residues[pos] = None
			else:
				Sequence.__delitem__(self, self.ungapped2gapped(pos))
				del self.residues[pos]

		typesChanged = False
		if "type changed" in resChanges.reasons:
			from resCode import res3to1
			ungapped = self.ungapped()
			for res, pos in self.resMap.items():
				if res3to1(res.type) != ungapped[pos]:
					typesChanged = True
					self[self.ungapped2gapped(pos)
							] = res3to1(res.type)
		if typesChanged and self.fromSeqres:
			self.fromSeqres = False

		# identify new residues that should be added to the sequence
		candidates = {}.fromkeys([cr for cr in resChanges.created
					if cr.molecule == self.molecule
					and cr not in self.resMap])
		addedSome = False
		if candidates:
			# find cross-residue bonds that connect candidates/
			# current residues
			allRes = self.resMap.copy()
			allRes.update(candidates)
			resConn = dict((r, [rb for rb in r.bondedResidues()
					    if rb in allRes]) for r in allRes)
			while True:
				addThese = {}
				for ar in candidates.keys():
					for jr in resConn.get(ar, []):
						if jr not in self.resMap:
							continue
						if len(resConn.get(jr, [])) > 2:
							# throw up hands
							continue
						addThese[ar] = jr
				if addThese:
					addedSome = True
					from resCode import res3to1
					for add, prev in addThese.items():
						after = cmp(add.id,
								prev.id) > 0
						where = self.residues.index(
								prev) + after
						if where == len(self.residues):
							seqWhere = len(self)
						else:
							seqWhere = self.\
							ungapped2gapped(where)
						oneLet = res3to1(add.type)
						if self.fromSeqres:
							if where == 0 or where == len(self.residues) \
							or self.residues[where-1+after] != None \
							or self[seqWhere] != oneLet:
								self.fromSeqres = False
						if self.fromSeqres:
							self.residues[where-1+after] = add
						else:
							self.residues.insert(where, add)
							Sequence.insert(self, seqWhere, oneLet)
						self.resMap[add] = None
						del candidates[add]
				else:
					break
		if deletions or typesChanged or addedSome:
			self.resMap.clear()
			for i, r in enumerate(self.residues):
				if r:
					self.resMap[r] = i
			self.triggers.activateTrigger(self.TRIG_MODIFY, self)

	def saveInfo(self, molEncodeFunc=None):
		"""info that can be used with restoreSequence()"""
		saveInfo = Sequence.saveInfo(self)
		if molEncodeFunc is None:
			from SimpleSession import sessionID as molEncodeFunc
		saveInfo['molecule'] = molEncodeFunc(self.molecule)
		def resSes(r):
			if r is None:
				return None
			return molEncodeFunc(r)
		saveInfo['residues'] = [resSes(r) for r in self.residues]
		saveMap = {}
		for residue, pos in self.resMap.items():
			saveMap[molEncodeFunc(residue)] = pos
		saveInfo['resMap'] = saveMap
		saveInfo['fromSeqres'] = self.fromSeqres
		saveInfo['descriptiveName'] = self.descriptiveName
		return saveInfo

	def __setitem__(self, key, val):
		if isinstance(val, basestring):
			return Sequence.__setitem__(self, key, val)
		ungapped = self.gapped2ungapped(key)
		if ungapped is None:
			raise KeyError("Cannot set gap to non-gap")
		oldRes = self.residues[ungapped]
		if oldRes:
			del self.resMap[oldRes]
		self.resMap[val] = ungapped
		self.residues[ungapped] = val
		from resCode import res3to1
		Sequence.__setitem__(self, key, res3to1(val.type))
		self.triggers.activateTrigger(self.TRIG_MODIFY, self)

	def __setslice__(self, i, j, seq):
		if not isinstance(seq, self.__class__):
			return Sequence.__setslice__(self, i, j, seq)
		ungappedI = self._bestUngapped(i)
		ungappedJ = self._bestUngapped(j)
		for r in self.residues[ungappedI:ungappedJ]:
			del self.resMap[r]
		size = ungappedJ - ungappedI
		for r, ri in self.resMap.items():
			if ri >= ungappedI:
				self.resMap[r] = ri + len(seq) - max(size, 0)
		self.residues[ungappedI:ungappedJ] = seq.residues
		from resCode import res3to1
		Sequence.__setslice__(self, i, j,
						[res3to1(r.type) for r in seq])
		self.triggers.activateTrigger(self.TRIG_MODIFY, self)

	def ssType(self, loc, locIsUngapped=False):
		if not locIsUngapped:
			loc = self.gapped2ungapped(loc)
		if loc is None:
			return None
		r = self.residues[loc]
		if r is None:
			return None
		if r.isHelix:
			return self.SS_HELIX
		if r.isStrand:
			return self.SS_STRAND
		return self.SS_OTHER

	def static(self, start=0, end=-1):
		if end < 0:
			end += len(self)
		return StaticStructureSequence(self, start, end)

class StaticStructureSequence(StructureSequence):
	"""static structure sequence
	   (doesn't change as residues added/deleted)
	"""
	def __init__(self, sseq, start=None, end=None):
		StructureSequence.__init__(self, sseq.molecule, name=sseq.name)
		StructureSequence.__copy__(sseq, copySeq=self)
		if end is not None and end < len(self) - 1:
			del self[end+1:]
		if start is not None and start > 0:
			del self[:start]
		# stop dynamic numbering...
		self.numberingStart = self.numberingStart

	def saveInfo(self, molEncodeFunc=None):
		"""info that can be used with restoreSequence()"""
		saveInfo = StructureSequence.saveInfo(self)
		saveInfo['static'] = True
		return saveInfo

	def _residueCB(self, trigName, ignore, resChanges):
		deletions = [dr for dr in resChanges.deleted
							if dr in self.resMap]
		if len(deletions) == len(self.resMap):
			self.demoteToSequence()
			return

		# sort deletions into descending order so we can easily
		# delete from our residue list
		deletions.sort(lambda r1, r2:
					cmp(self.resMap[r2], self.resMap[r1]))
		for dr in deletions:
			pos = self.resMap[dr]
			del self.resMap[dr]
			self.residues[pos] = None

		typesChanged = False
		if "type changed" in resChanges.reasons:
			from resCode import res3to1
			ungapped = self.ungapped()
			for res, pos in self.resMap.items():
				if res3to1(res.type) != ungapped[pos]:
					typesChanged = True
					self[self.ungapped2gapped(pos)
							] = res3to1(res.type)
		if deletions or typesChanged:
			self.triggers.activateTrigger(self.TRIG_MODIFY, self)

def getSequence(molecule, chainID, **kw):
	"""Get the Sequence of the specified chain

	   Uses the getSequences function (below) and accepts the same
	   keywords.  Throws KeyError if the specified chain isn't found,
	   and AssertionError if there are multiple chains with the
	   specified ID.
	"""
	
	kw['asDict'] = True
	return getSequences(molecule, **kw)[chainID]

def getSequences(molecule, asDict=False):
	"""return all non-trivial sequences in a molecule

	   This function is also available as molecule.sequences(...)

	   returns a list of sequences for the given molecule, one sequence per
	   multi-residue chain.  The sequence name is "Chain X" where X is
	   the chain ID, or "Principal chain" if there is no chain ID.

	   The 'residues' attribute of each sequence is a list of the
	   residues for that sequence, and the attribute 'resmap' is a
	   dictionary that maps residue to sequence position (zero-based).
	   The 'residues' attribute will self-delete if the corresponding
	   model is closed.

	   If 'asDict' is true, return a dictionary of Sequences keyed on
	   chain ID (can throw AssertionError if multiple chains have same ID),
	   otherwise return a list.
	"""
	from chimera import bondsBetween, openModels, triggers
	from copy import copy

	def connected(res1, res2):
		if res1.id.chainId != ' ' \
		and res1.id.chainId == res2.id.chainId \
		and not res1.isHet and not res2.isHet:
			return True
		return bondsBetween(res1, res2, onlyOne=True)

	if hasattr(molecule, '_SequenceSequences'):
		seqList, seqDict, trigIDs = molecule._SequenceSequences
		if asDict:
			if seqDict is not None:
				return copy(seqDict)
			# known to be non-unique chain IDs...
			raise AssertionError("Non-unique chain IDs")
		else:
			return copy(seqList)

	chain = None
	prevRes = None
	firstRes = molecule.residues[0]
	seqs = []
	# don't start a sequence until we've seen two residues in the chain
	for res in molecule.residues:
		# if a residue has only one heavy atom, and that is connected
		# to only one other heavy atom, then don't put the residue
		# in the sequence
		# 
		# if heavy is connected to no other heavy atom (presumably
		# metal or ion), end the chain
		nheavys = res.heavyAtomCount
		chainBreak = prevRes == None
		if nheavys == 0:
			continue
		# heavys more reliably connected than hydrogens
		if prevRes and not connected(prevRes, res):
			didMod = 0
			if seq:
				for sr in seq.residues:
					if connected(sr, res):
						didMod = 1
			if didMod:
				continue
			chainBreak = 1
		elif prevRes and prevRes.id == res.id and nheavys == 1:
			continue
		elif chain is not None:
			# avoid adding a ligand that joins two side chains to
			# the sequence (DTT in 1fvg)...
			if prevRes and res.isHet and prevRes.id.chainId == res.id.chainId \
			and prevRes.id.position < res.id.position - 1:
				btw = bondsBetween(prevRes, res)
				if len(btw) == 1 and set([a.element.name for a in
				btw[0].atoms]) not in [set(['N', 'C']), set(['P', 'O'])]:
					continue
			# HET residues in named chains get the chain name,
			# but in blank chains they get 'het' -- allow for this
			truePrevID = chain
			trueResID = res.id.chainId
			if truePrevID != trueResID:
				if truePrevID in (" ", "het"):
					prevID = " "
				else:
					prevID = truePrevID
				if trueResID in (" ", "het"):
					resID = " "
				else:
					resID = trueResID
				if resID != prevID:
					chainBreak = 1

		if chainBreak or res == firstRes:
			# if chain ID changes in middle of connected chain
			# need to remember new chain ID...
			chain = res.id.chainId
			startRes = res
			prevRes = res
			seq = None
			continue # to avoid starting single-residues chains

		if not seq:
			if not chain or chain == " ":
				name = PRINCIPAL
				chain = " "
			else:
				name = CHAIN_FMT % chain
			seq = StructureSequence(startRes.molecule, name)
			seq.chain = chain
			seqs.append(seq)
			seq.append(startRes)
		seq.append(res)
		prevRes = res
	chain2desc = {}
	pdbHeaders = getattr(molecule, "pdbHeaders", {})
	compndRecs = pdbHeaders.get("COMPND", None)
	if compndRecs and molecule.pdbVersion > 1:
		compndChainIDs = None
		description = ""
		continued = False
		for rec in compndRecs:
			if continued:
				v += " " + rec[10:].strip()
			else:
				try:
					k, v = rec[10:].strip().split(": ", 1)
				except ValueError:
					# bad PDB file
					break
			if v.endswith(';'):
				v = v[:-1]
				continued = False
			elif rec == compndRecs[-1]:
				continued = False
			else:
				continued = True
				continue
			if k == "MOL_ID":
				if compndChainIDs and description:
					for chainID in compndChainIDs:
						chain2desc[chainID] = (description, synonym)
				compndChainIDs = None
				description = ""
			elif k == "MOLECULE":
				if v.startswith("PROTEIN (") and v.endswith(")"):
					description = v[9:-1]
				else:
					description = v
				synonym = False
			elif k == "SYNONYM":
				if ',' not in v:
					# not a long list of synonyms
					description = v
					synonym = True
			elif k == "CHAIN":
				compndChainIDs = v.split(", ")
		if compndChainIDs and description:
			for chainID in compndChainIDs:
				chain2desc[chainID] = (description, synonym)
	mmcifHeaders = getattr(molecule, "mmCIFHeaders", {})
	if not chain2desc and mmcifHeaders:
		scheme = mmcifHeaders.get('pdbx_poly_seq_scheme', [])
		id2index = {}
		for sch in scheme:
			id2index[sch['pdb_strand_id']] = int(sch['entity_id']) - 1
		entity = mmcifHeaders.get('entity', [])
		entity_name_com = mmcifHeaders.get('entity_name_com', [])
		name_com_lookup = {}
		for enc in entity_name_com:
			name_com_lookup[int(enc['entity_id'])-1] = enc['name']
		for chainID, index in id2index.items():
			description = None
			# try SYNONYM equivalent first
			if index in name_com_lookup:
				syn = name_com_lookup[index]
				if syn != '?':
					description = syn
					synonym = True
			if not description and len(entity) > index:
				description = entity[index]['pdbx_description']
				synonym = False
			if description:
				chain2desc[chainID] = (description, synonym)
	if chain2desc:
		from misc import processPdbChemName
		for k, v in chain2desc.items():
			description, synonym = v
			chain2desc[k] = processPdbChemName(description,
								probableAbbrs=synonym)
		seqChainIDs = set()
		for seq in seqs:
			try:
				chainID = seq.chainID
			except ValueError:
				continue
			seqChainIDs.add(chainID)
			seq.descriptiveName = chain2desc.get(chainID, None)
		if not getattr(molecule, '_chainDescriptionsLogged', False):
			for chainID in sorted(list(seqChainIDs)):
				descriptiveName = chain2desc.get(chainID, None)
				if descriptiveName:
					import replyobj
					replyobj.info("%s, chain %s: %s\n" % (molecule, chainID,
						descriptiveName))
			molecule._chainDescriptionsLogged = True
	sd = {}
	for seq in seqs[:]:
		# set 'fromSeqres' in this loop so that all sequences have it
		seq.fromSeqres = None
		if seq.chain in sd:
			# a sequence of all 'X' residues or all het (against non-X/non-het) loses
			source = sd[seq.chain]
			sourceX = str(source).count('X') == len(source)
			seqX = str(seq).count('X') == len(seq)
			sourceHet = len([r for r in source.residues if not r.isHet]) == 0
			seqHet = len([r for r in seq.residues if not r.isHet]) == 0
			if (seqX and not sourceX) or (seqHet and not sourceHet):
				seqs.remove(seq)
				continue
			elif (sourceX and not seqX) or (sourceHet and not seqHet):
				seqs.remove(sd[seq.chain])
				sd[seq.chain] = seq
				continue
			# Used to raise AssertionError here; by not raising the error
			# we can cache the fact that there are non-unique chain IDs 
			# and short-circuit multiple calls to this routine
			sd = None
			break
		sd[seq.chain] = seq

	# use full sequence if available...
	if sd:
		seqresSeqs = seqresSequences(molecule, asDict=True)
		for chain, seq in sd.items():
			try:
				srSeq = seqresSeqs[chain]
			except (KeyError, TypeError):
				continue
			seq.fromSeqres = True
			if len(srSeq) == len(seq):
				# no adjustment needed
				continue
			if len(srSeq) < len(seq):
				seq.fromSeqres = False
				import replyobj
				replyobj.warning("SEQRES record for chain %s of %s is "
					"incomplete.\n" "Ignoring record as basis for sequence."
					% (chain, molecule))
				continue
			# skip PDBs where the standard residues have been removed
			# but the SEQRES records haven't been...
			if str(seq).count('X') == len(seq) and str(seq) not in str(srSeq):
				seq.fromSeqres = False
				import replyobj
				replyobj.warning("Residues corresponding to SEQRES record for"
					" chain %s of %s are missing.\n"
					"Ignoring record as basis for sequence."
					% (chain, molecule))
				continue

			from MultAlignViewer.structAssoc import estimateAssocParams, \
													tryAssoc
			estLen, segments, gaps = estimateAssocParams(seq)
			# UNK residues may be jammed up against the regular sequence
			# in SEQRES records (3dh4, 4gns) despite missing intervening 
			# residues; compensate...
			# can't test aginst estLen since there can be other missing structure
			#
			# leading Xs...
			additionalXs = 0
			xs = ""
			for i, seg in enumerate(segments[:-1]):
				if seg.replace('X', ' ').isspace():
					xs += seg
					additionalXs += gaps[i]
				else:
					break
			if xs and srSeq[:len(xs)] == xs:
				srSeq[:0] =  'X' * additionalXs
			# trailing Xs...
			additionalXs = 0
			xs = ""
			for i, seg in enumerate(list(reversed(segments[1:]))):
				if seg.replace('X', ' ').isspace():
					xs += seg
					additionalXs += gaps[-(i+1)]
				else:
					break
			if xs and srSeq[-len(xs):] == xs:
				srSeq.extend('X' * additionalXs)
			# if a jump in numbering is in an unresolved part of the
			# structure, the estimated length can be too long...
			estLen = min(estLen, len(srSeq))
			# since gapping a structure sequence is considered an
			# "error", need to allow a lot more errors than normal...
			# However, allowing a _lot_ of errors can make it take a very
			# long time to find the answer, so limit the maximum...
			# (1vqn, chain 0 is > 2700 residues)
			seqLen = len(seq)
			maxErrs = int(min(seqLen/2, max(seqLen/10, sum(gaps))))
			try:
				matchMap, numErrors = tryAssoc(srSeq, seq, segments, gaps,
									estLen, maxErrors=maxErrs)
			except ValueError:
				seq.fromSeqres = False
				continue

			for i in range(len(srSeq)):
				if i in matchMap:
					del matchMap[i]
				else:
					seq.residues.insert(i, None)
			seq.resMap = matchMap
			seq[:] = srSeq[:]

	# try to avoid identical names
	baseNames = {}
	for seq in seqs:
		baseNames.setdefault(seq.name, []).append(seq)
	for baseName, sameNamedSeqs in baseNames.items():
		if len(sameNamedSeqs) > 1:
			def fmtID(r):
				if r.id.insertionCode != " ":
					return str(r.id.position) + r.id.insertionCode
				return str(r.id.position)
			for seq in sameNamedSeqs:
				actualResidues = [r for r in seq.residues if r]
				seq.name += " (%s-%s)" % (fmtID(actualResidues[0]),
					fmtID(actualResidues[-1]))

	if hasattr(molecule, '_SequenceSequences'):
		# deregister from previous sequence's triggers...
		# (don't destroy the old sequences, may be in use elsewhere)
		seqList, seqDict, trigIDs = molecule._SequenceSequences
		for i, seq in enumerate(seqList):
			seq.triggers.deleteHandler(seq.TRIG_DELETE, trigIDs[i])
	else:
		# invalidate the sequences cache if residues added/deleted
		molecule.__cacheHandlerID = triggers.addHandler('Residue',
					_invalidateCacheCB, molecule)
		molecule.__removeHandlerID = openModels.addRemoveHandler(
							_removeMolCB, molecule)
	# register for current sequence's triggers
	trigIDs = []
	for seq in seqs:
		trigIDs.append(
			seq.triggers.addHandler(seq.TRIG_DELETE, _delSeq, None))
	molecule._SequenceSequences = (seqs, sd, trigIDs)
	if asDict:
		if sd is not None:
			return copy(sd)
		raise AssertionError("Non-unique chain IDs")
	return copy(seqs)

def invalidate(molecule):
	try:
		seqList, seqDict, trigIDs = molecule._SequenceSequences
	except AttributeError:
		pass
	else:
		for i, seq in enumerate(seqList):
			seq.triggers.deleteHandler(seq.TRIG_DELETE, trigIDs[i])
		del molecule._SequenceSequences

class _PrositePatternError(ValueError):
	pass

def _delSeq(trigName, myData, seq):
	seqList, seqDict, trigIDs = seq.molecule._SequenceSequences
	i = seqList.index(seq)
	del seqList[i]
	del trigIDs[i]
	if seqDict is not None and seq in seqDict:
		del seqDict[seq]
	from chimera.triggerSet import ONESHOT
	return ONESHOT

def _invalidateCacheCB(trigName, molecule, resChanges):
	# we need to wait for the sequences to update themselves, so...
	additions = [r for r in resChanges.created if r.molecule == molecule]
	if additions:
		from chimera import triggers
		triggers.addHandler("monitor changes", _cacheCheck,
						(molecule, additions))

def _removeMolCB(trigName, molecule, removed):
	if molecule in removed:
		_delMolHandlers(molecule)

def _cacheCheck(trigName, info, trigData):
	from chimera.triggerSet import ONESHOT
	molecule, additions = info
	try:
		seqList, seqDict, trigIDs = molecule._SequenceSequences
	except AttributeError:
		# didn't have a cache anyway!
		return ONESHOT
	try:
		resList = molecule.residues
	except:
		# we must be in molecule destructor
		_delMolHandlers(molecule)
		return ONESHOT
	
	# modifications and deletions "take care of themselves", so we
	# only care about additions that are in chains that haven't added
	# themselves into current sequences
	for r in additions:
		if r.isIsolated:
			continue
		incorporated = False
		for seq in seqList:
			if r in seq.resMap:
				incorporated = True
				break
		if not incorporated:
			delattr(molecule, '_SequenceSequences')
			for i, seq in enumerate(seqList):
				seq.triggers.deleteHandler(seq.TRIG_DELETE, trigIDs[i])
			_delMolHandlers(molecule)
			break
	return ONESHOT

def _delMolHandlers(mol):
	from chimera import triggers, openModels
	if hasattr(mol, "__cacheHandlerID"):
		triggers.deleteHandler('Residue', mol.__cacheHandlerID)
		openModels.deleteRemoveHandler(mol.__removeHandlerID)
		delattr(mol, "__cacheHandlerID")
		delattr(mol, "__removeHandlerID")

def seqresSequences(molecule, singletons=False, asDict=False):
	"""return the sequences found in PDB SEQRES records for molecule

	   Returns None if there are no SEQRES records.
	   By default, singleton chains are ignored to match
	   getSequences() behavior.
	"""

	try:
		seqs = molecule.cifPolySeq
	except AttributeError:
		pass
	else:
		return _retSeqres(seqs, singletons, asDict)

	from resCode import res3to1
	try:
		srRecords = molecule.pdbHeaders["SEQRES"]
	except (AttributeError, KeyError):
		return None
	curChain = None
	seqs = {}
	for rec in srRecords:
		chain = rec[11]
		if chain != curChain:
			if chain == " ":
				name = PRINCIPAL
			else:
				name = CHAIN_FMT % chain
			seq = Sequence(name)
			seqs[chain] = seq
			curChain = chain
		seq.extend(map(res3to1, rec[19:70].split()))
	return _retSeqres(seqs, singletons, asDict)

def _retSeqres(seqs, singletons, asDict):
	if asDict:
		if singletons:
			return seqs
		else:
			pruned = {}
			for k, v in seqs.items():
				if len(v) > 1:
					pruned[k] = v
			return pruned
	else:
		if singletons:
			return seqs.values()
		return [s for s in seqs.values() if len(s) > 1]

def restoreSequence(seqInfo, molDecodeFunc=None):
	"""restore a sequence from the string returned by seq.saveString()"""
	if isinstance(seqInfo, basestring):
		seqInfo = eval(seqInfo)
	if 'molecule' in seqInfo:
		if isinstance(seqInfo['molecule'], basestring):
			# backwards compatibility with old sessions
			from SimpleSession import oslLookup as molDecodeFunc
		elif molDecodeFunc is None:
			from SimpleSession import idLookup as molDecodeFunc
		molecule = molDecodeFunc(seqInfo['molecule'])
		seq = StructureSequence(molecule, seqInfo['name'])
		def resDecode(info):
			if info is None:
				return None
			return molDecodeFunc(info)
		seq.residues.extend([resDecode(info)
				for info in seqInfo['residues']])
		for sesID, pos in seqInfo['resMap'].items():
			seq.resMap[molDecodeFunc(sesID)] = pos
		if seqInfo.get('static', False):
			seq = StaticStructureSequence(seq)
		seq.descriptiveName = seqInfo.get('descriptiveName', None)
	else:
		seq = Sequence(seqInfo['name'])
		if 'numberingStart' in seqInfo:
			seq.numberingStart = seqInfo['numberingStart']
	if 'attrs' in seqInfo:
		seq.attrs = seqInfo['attrs']
		seq.markups = seqInfo['markups']
	else:
		seq.attrs = seqInfo['miscAttrs']
		seq.markups = {}
	if 'circular' in seqInfo:
		seq.circular = seqInfo['circular']
	seq.fromSeqres = seqInfo.get('fromSeqres', None)
	seq.sequence.extend(seqInfo['sequence'])
	return seq

def percentIdentity(seq1, seq2, denominator="shorter"):
	"""Compute the percent identity between two gapped sequences

	   'denominator' control what the number of matches is divided by:
	     "shorter":  the ungapped length of the shorter of the two sequences
	     "longer":  converse of "shorter"
	     "in common":  the number of non-gap columns in common
	"""
	if len(seq1) != len(seq2):
		raise ValueError("Cannot compute percent identity for"
					" different-length sequences")

	matches = inCommon = 0

	for i, c1 in enumerate(seq1):
		if not c1.isalnum():
			continue
		c2 = seq2[i]
		if not c2.isalnum():
			continue
		inCommon += 1
		if c1.lower() == c2.lower():
			matches += 1
	try:
		if denominator == "shorter":
			return matches * 100.0 / min(len(seq1.ungapped()),
								len(seq2.ungapped()))
		elif denominator == "longer":
			return matches * 100.0 / max(len(seq1.ungapped()),
								len(seq2.ungapped()))
		return matches * 100.0 / inCommon
	except ZeroDivisionError:
		return 0.0