# --- UCSF Chimera Copyright ---
# Copyright (c) 2000 Regents of the University of California.
# All rights reserved.  This software provided pursuant to a
# license agreement containing restrictions on its disclosure,
# duplication and use.  This notice must be embedded in or
# attached to all copies, including partial copies, of the
# software or any revisions or derivations thereof.
# --- UCSF Chimera Copyright ---
#
# $Id: structAssoc.py 40732 2015-09-03 00:27:47Z pett $

from chimera.Sequence import Sequence
from NeedlemanWunsch import nw

def tryAssoc(alignSeq, mseq, segments, gaps, estLen, maxErrors=6):
	aseq = alignSeq.ungapped().upper()

	valueError = False
	try:
		if len(aseq) >= estLen:
			matchMap, errors = constrainedMatch(aseq, mseq,
							segments, gaps, maxErrors)
		else:
			matchMap, errors = gappedMatch(aseq, mseq, segments,
							gaps, estLen, maxErrors)
	except ValueError:
		valueError = True

	if not valueError and errors == 0:
		return matchMap, errors
	# prune off 'X' residues and see how that works...
	if mseq[0] != 'X' and mseq[-1] != 'X':
		if valueError:
			raise ValueError, "bad assoc"
		return matchMap, errors
	from copy import copy
	noX = copy(mseq)
	while noX[:1] == 'X':
		noX.sequence = noX.sequence[1:]
		noX.residues = noX.residues[1:]
		estLen -= 1
	if len(noX) == 0:
		if valueError:
			raise ValueError, "bad assoc"
		return matchMap, errors
	offset = len(mseq) - len(noX)
	Xsegments = copy(segments)
	Xgaps = copy(gaps)
	while offset and offset >= len(Xsegments[0]):
		offset -= len(Xsegments[0])
		Xsegments = Xsegments[1:]
		estLen -= Xgaps[0]
		Xgaps = Xgaps[1:]
	if offset:
		Xsegments[0] = Xsegments[0][offset:]
	tailLoss = 0
	while noX[-1:] == 'X':
		noX.sequence = noX.sequence[:-1]
		noX.residues = noX.residues[:-1]
		estLen -= 1
		tailLoss += 1
	while tailLoss and tailLoss >= len(Xsegments[-1]):
		tailLoss -= len(Xsegments[-1])
		Xsegments = Xsegments[:-1]
		estLen -= Xgaps[-1]
		Xgaps = Xgaps[:-1]
	if tailLoss:
		Xsegments[-1] = Xsegments[-1][:-tailLoss]
	try:
		if len(aseq) >= estLen:
			noXmatchMap, noXerrors = constrainedMatch(aseq, noX,
							Xsegments, Xgaps, maxErrors)
		else:
			noXmatchMap, noXerrors = gappedMatch(aseq, noX,
					Xsegments, Xgaps, estLen, maxErrors)
	except ValueError:
		if valueError:
			raise
		return matchMap, errors
	if valueError or noXerrors < errors:
		return noXmatchMap, noXerrors
	return matchMap, errors

def constrainedMatch(aseq, mseq, segments, gaps, maxErrors):
	# all the segments should fit in aseq

	offsets, errors = _constrained(aseq, segments, [None] + gaps + [None], maxErrors)
	if errors > maxErrors:
		raise ValueError, "bad assoc"
	if len(offsets) != len(segments):
		raise AssertionError, \
			"Internal match problem: segments (%d) != offsets (%d)"\
			% (len(segments), len(offsets))
	matchMap = {}
	resOffset = 0
	for si in range(len(segments)):
		offset, segment = offsets[si], segments[si]
		for i in range(len(segment)):
			res = mseq.residues[resOffset+i]
			if res:
				matchMap[res] = offset+i
				matchMap[offset+i] = res
		resOffset += len(segment)
	return matchMap, errors

def _constrained(aseq, segments, gaps, maxErrors):
	# find the biggest segment (but non-all-X segments win over all-X segments)
	longest = None
	for i in range(len(segments)):
		seg = segments[i]
		if seg.count('X') == len(seg):
			continue
		if not longest or len(seg) > longest:
			bsi = i
			longest = len(seg)
	if longest is None:
		for i in range(len(segments)):
			seg = segments[i]
			if not longest or len(seg) > longest:
				bsi = i
				longest = len(seg)
	from operator import add
	leftSpace = 0
	for seg in segments[:bsi]:
		leftSpace += 1 + len(seg)
	rightSpace = 0
	for seg in segments[bsi+1:]:
		rightSpace += 1 + len(seg)

	if len(aseq) - leftSpace - rightSpace < longest:
		return [], maxErrors + 1

	errList = []
	seq = segments[bsi]
	minOffset = None
	minErrs = None
	if bsi == 0:
		targetLeftGap = gaps[0]
	else:
		targetLeftGap = None
	if bsi == len(segments)-1:
		targetRightGap = gaps[bsi+1]
	else:
		targetRightGap = None
	for offset in range(leftSpace, len(aseq) - rightSpace - longest + 1):
		errors = 0
		for i in range(longest):
			if seq[i] == aseq[offset+i]:
				continue
			errors += 1
			if errors > maxErrors:
				errList.append(maxErrors+1)
				break
		else:
			errList.append(errors)
			gapErrs = 0
			if targetLeftGap is not None:
				gapErrs += abs((offset+1) - targetLeftGap)
			if targetRightGap is not None:
				gapErrs += abs(1 + (len(aseq) - (offset+longest)) - targetRightGap)
			if minErrs is None or errors < minErrs or (
					errors == minErrs and gapErrs < minGapErrs):
				minErrs = errors
				minOffset = offset
				minGapErrs = gapErrs
	if minOffset is None:
		return [], maxErrors+1

	# leave gaps to left and right
	if bsi > 0:
		leftOffsets, leftErrors = _constrained(aseq[0:minOffset-1],
					segments[:bsi], gaps[:bsi+1], maxErrors - minErrs)
	else:
		leftOffsets, leftErrors = [], 0

	if leftErrors + minErrs > maxErrors or bsi + 1 == len(segments):
		rightOffsets, rightErrors = [], 0
	else:
		rightOffsets, rightErrors = _constrained(
				aseq[minOffset+longest+1:], segments[bsi+1:], gaps[bsi+1:],
				maxErrors - minErrs - leftErrors)
	totErrs = minErrs + leftErrors + rightErrors
	offs = (leftOffsets, minOffset, rightOffsets)

	for i in range(len(errList)):
		baseErrs = errList[i]
		if baseErrs >= min(totErrs, maxErrors+1):
			continue
		offset = leftSpace + i
		if offset == minOffset:
			continue

		if bsi > 0:
			leftOffsets, leftErrors = _constrained(aseq[0:offset-1],
					segments[:bsi], gaps[:bsi+1],
					min(totErrs, maxErrors) - baseErrs)
		else:
			leftOffsets, leftErrors = [], 0
			
		if leftErrors + baseErrs > maxErrors:
			continue

		if bsi+1 < len(segments):
			rightOffsets, rightErrors = _constrained(
				aseq[offset+longest+1:], segments[bsi+1:], gaps[bsi+1:],
				min(totErrs, maxErrors) - baseErrs - leftErrors)
		else:
			rightOffsets, rightErrors = [], 0

		if baseErrs + leftErrors + rightErrors < totErrs:
			totErrs = baseErrs + leftErrors + rightErrors
			offs = (leftOffsets, offset, rightOffsets)

	if totErrs > maxErrors:
		return [], maxErrors+1

	offsets = offs[0]
	offsets.append(offs[1])
	for ro in offs[2]:
		offsets.append(ro + offs[1] + longest + 1)

	return offsets, totErrs

def gappedMatch(aseq, mseq, segments, gaps, estLen, maxErrors):
	gapped = segments[0]
	for i in range(1, len(segments)):
		gapped += "." * gaps[i-1]
		gapped += segments[i]

	# to avoid matching completely in gaps, need to establish a 
	# minimum number of matches
	minMatches = min(len(aseq), len(mseq)) / 2
	bestScore = 0
	totErrs = maxErrors + 1
	for offset in range(len(gapped) - estLen, estLen - len(aseq) + 1):
		matches = 0
		errors = 0
		if offset + len(aseq) < minMatches:
			continue
		if len(gapped) - offset < minMatches:
			continue
		for i in range(len(aseq)):
			if offset + i < 0:
				continue
			try:
				gapChar = gapped[offset+i]
			except IndexError:
				# in ending gap
				continue
			if aseq[i] == gapChar:
				matches += 1
				continue
			if gapChar == ".":
				continue
			errors += 1
			if errors >= totErrs:
				break
		else:
			if matches < minMatches or matches-errors <= bestScore:
				continue
			bestScore = matches - errors
			totErrs = errors
			bestOffset = offset

	if totErrs > maxErrors:
		raise ValueError, "bad assoc"

	matchMap = {}
	mseqIndex = 0
	for i in range(bestOffset + len(aseq)):
		try:
			if gapped[i] == ".":
				continue
		except IndexError:
			# in ending gap
			break
		if i >= bestOffset:
			res = mseq.residues[mseqIndex]
			if res:
				aseqIndex = i - bestOffset
				matchMap[res] = aseqIndex
				matchMap[aseqIndex] = res
		mseqIndex += 1

	return matchMap, totErrs

def oldTryAssoc(alignSeq, molSeq, gappedMseq, maxErrors=6):
	# remove gaps in alignment sequence...
	mseq = molSeq
	aseq = alignSeq.ungapped().upper()

	if len(gappedMseq) > len(aseq):
		return dragMatch(aseq, gappedMseq, maxErrors)

	unmatched = [(len(mseq), ("initial", 0, len(aseq), 0, len(mseq)))]
	matchups = []
	forcedMatches = []
	sumMatched = 0
	forcedErrors = 0

	while len(unmatched) > 0:
		minErrors = len(matchups) - 1 + len(unmatched) + forcedErrors
		if minErrors > maxErrors:
			raise ValueError, "bad assoc"
		# the minimum number of identical matches required
		# for a matchup to possibly work out is...
		allowedErrors = maxErrors - minErrors + 1
		# round down since if number of allowed errors exceeds what
		# we still need to match, the minStretch is zero
		minStretch = int((len(mseq) - sumMatched - minErrors) /
							float(allowedErrors))
		side, aseqStart, aseqLength, mseqStart, mseqLength \
							= unmatched[0][1]
		unmatched = unmatched[1:]
		matchup = Matchup(aseq[aseqStart:aseqStart+aseqLength],
			mseq[mseqStart:mseqStart+mseqLength], minStretch)
		if matchup.best == None:
			if minStretch == 0 and side != "initial":
				fm, errs = makeForcedMatch(side, aseq, mseq,
							aseqStart, aseqLength,
							mseqStart, mseqLength)
				forcedMatches.append(fm)
				forcedErrors += errs
				if aseqLength == mseqLength \
				and mseqStart > 0 \
				and mseqStart + mseqLength < len(mseq):
					# if the forced errors fill an
					# internal gap, then reduce the
					# the errors by one to compensate
					# for the gap being gone
					forcedErrors -= 1
				continue
			raise ValueError, "bad assoc"
		matchup.aseqStart = aseqStart
		matchup.mseqStart = mseqStart
		matchups.append(matchup)
		offset, start, stretch = matchup.best
		sumMatched += stretch
		if start > 0:
			astart = aseqStart
			alen = offset + start
			mstart = mseqStart
			mlen = start
			if mlen == 1:
				forcedMatches.append((astart, mstart, 1))
				if alen > 1 or mstart == 0:
					# don't add an error if filling
					# an internal gap
					forcedErrors += 1
			else:
				unmatched.append((mlen,
					("left", astart, alen, mstart, mlen)))
		if start + stretch < mseqLength:
			astart = aseqStart + offset + start + stretch
			alen = aseqLength - offset - start - stretch
			mstart = mseqStart + start + stretch
			mlen = mseqLength - start - stretch
			if mlen == 1:
				forcedMatches.append((astart, mstart, 1))
				if alen > 1 or mstart + 1 == len(mseq):
					# don't add an error if filling
					# an internal gap
					forcedErrors += 1
			else:
				unmatched.append((mlen,
					("right", astart, alen, mstart, mlen)))
		unmatched.sort()
		unmatched.reverse()

	matchMap = {}
	for matchup in matchups:
		offset, start, stretch = matchup.best
		aStart = matchup.aseqStart + offset + start
		mStart = matchup.mseqStart + start
		for i in range(stretch):
			residue = mseq.residues[mStart + i]
			if residue:
				matchMap[aStart + i] = residue
				matchMap[residue] = aStart + i
	for aStart, mStart, stretch in forcedMatches:
		for i in range(stretch):
			residue = mseq.residues[mStart + i]
			if residue:
				matchMap[aStart + i] = residue
				matchMap[residue] = aStart + i
	return matchMap, len(matchups) - 1 + forcedErrors

def makeForcedMatch(side, aseq, mseq,
			aseqStart, aseqLength, mseqStart, mseqLength):
	if side == "left":
		astart = aseqStart
	else:
		astart = aseqStart + aseqLength - mseqLength
	errors = 0
	for i in range(mseqLength):
		if aseq[astart + i] != mseq[mseqStart + i]:
			errors += 1
	return ((astart, mseqStart, mseqLength), errors)

def dragMatch(aseq, gappedMseq, maxErrors):
	# mseq longer than aseq
	# can still be gaps in mseq

	# scan aseq along mseq looking for longest matching stretch...
	bestStretch = (None, None, 0, 0)
	for i in range(min(2 - len(aseq), 0), len(gappedMseq)):
		start = None
		if len(gappedMseq) - i <= bestStretch[-1]:
			break
		for j in range(min(len(aseq), len(gappedMseq)-i)):
			if i+j < 0:
				continue
			mval = gappedMseq[i+j]
			if mval == aseq[j] or mval == '-':
				if start is None:
					start = j
					matches = mval != '-'
				else:
					matches += mval != '-'
				if matches > bestStretch[-1]:
					bestStretch = (start, i+start,
							j - start + 1, matches)
			else:
				start = None
	a_off, m_off, length, matches = bestStretch
	if length < 2:
		if hasattr(gappedMseq, 'residues'):
			raise ValueError, "bad assoc"
		return []
	unmatched = len(aseq) - matches - aseq.count('-') 
	if unmatched / matches + 1 > maxErrors:
		raise ValueError, "bad assoc"
	matches = [[a_off, m_off, length]]
	furtherErrors = maxErrors - (m_off > 0) - (
					m_off + length < len(gappedMseq))
	if furtherErrors < 0:
		raise ValueError, "bad assoc"

	if a_off > 0 and m_off > 0:
		matches += dragMatch(aseq[:a_off], gappedMseq[:m_off],
								furtherErrors)

	if a_off + length < len(aseq) and m_off + length < len(gappedMseq):
		rMatches = dragMatch(aseq[a_off+length:],
				gappedMseq[m_off+length:], furtherErrors)
		for m in rMatches:
			m[0] += a_off + length
			m[1] += m_off + length
			matches.append(m)

	if not hasattr(gappedMseq, 'residues'):
		return matches

	matchMap = {}
	for m in matches:
		a_off, m_off, length = m
		for i in range(length):
			ungapped = gappedMseq.gapped2ungapped(m_off + i)
			if ungapped is None:
				continue
			res = gappedMseq.residues[ungapped]
			matchMap[res] = a_off + i
			matchMap[a_off + i] = res
	if not matchMap:
		raise ValueError, "bad assoc"

	# find gaps;
	# if mseq gap and aseq gap are same length: force match
	errors = left = gap = None
	for i in range(len(aseq)):
		if matchMap.has_key(i):
			last = i
			if errors is None:
				errors = i # e.g. 0 if first match at i == 0
			if gap is None:
				left = i
			else:
				# possible force
				mseqStart = gappedMseq.residues.index(
								matchMap[left])
				mseqEnd = gappedMseq.residues.index(matchMap[i])
				errors += max(mseqEnd - mseqStart - 1, 1)
				if errors > maxErrors:
					raise ValueError, "bad assoc"
				if mseqEnd - mseqStart != i - left:
					left = i
					gap = None
					continue
				for j in range(1, i - left):
					res = gappedMseq.residues[mseqStart + j]
					matchMap[res] = left + j
					matchMap[left + j] = res
				left = i
				gap = None
		else:
			if left is not None:
				gap = i
	errors += len(aseq) - last - 1
	if errors > maxErrors:
		raise ValueError, "bad assoc"
	return matchMap, errors

def nwAssoc(alignSeq, molSeq):
	mseq = molSeq
	aseq = alignSeq.ungapped()
	score, matchList = nw(mseq, aseq)

	errors = 0
	# matched are in reverse order...
	try:
		mEnd = matchList[0][0]
	except IndexError:
		mEnd = -1
	if mEnd < len(mseq) - 1:
		# trailing unmatched
		errors += len(mseq) - mEnd - 1

	matchMap = {}
	lastMatch = mEnd + 1
	for mIndex, aIndex in matchList:
		if mseq[mIndex] != aseq[aIndex]:
			errors += 1

		if mIndex < lastMatch - 1:
			# gap in structure sequence
			errors += lastMatch - mIndex - 1

		res = mseq.residues[mIndex]
		if res:
			matchMap[res] = aIndex
			matchMap[aIndex] = res

		lastMatch = mIndex
	if lastMatch > 0:
		# beginning unmatched
		errors += lastMatch

	if len(mseq) > len(aseq):
		# unmatched residues forced, reduce errors by that amount...
		errors -= len(mseq) - len(aseq)

	return matchMap, errors

class Matchup:
	""" When done, self.best will either be None or a 3-tuple.
	    If None, then no stretch of at least 'minStretch' matches
	    were found.
	    A 3-tuple indicates the best match match.  The components of
	    the 3-tuple are (offset, start, stretch):
	    
	       offset:  offset into 'aseq' for matchup of 'mseq'

	       start:  offset into 'mseq' where stretch of matches found

	       stretch:  length of consecutive matches found
	"""

	def __init__(self, aseq, mseq, minStretch):
		self.minStretch = minStretch
		self.best = None
		self.aseq = aseq
		self.mseq = mseq
		if self.minStretch > 1:
			self.stride = self.minStretch
		else:
			self.stride = 1
		for offset in range(len(aseq) - len(mseq) + 1):
			if self.stride > 1:
				self.strideLoop(offset)
			else:
				self.simpleLoop(offset)

	def simpleLoop(self, offset):
		mseq = self.mseq
		aseq = self.aseq
		curStretch = None
		best = None
		for i in range(len(mseq)):
			m = mseq[i]
			a = aseq[i + offset]
			if m == a:
				if curStretch:
					curStretch += 1
				else:
					curStretch = 1
					curStart = i
			else:
				if curStretch:
					if not best or curStretch > best[-1]:
						best = (curStart, curStretch)
					if best[-1] > 1:
						self.stride = best[-1]
						return self.strideLoop(offset,
							best=best,
							startIndex=i+1)
					curStretch = None
		if curStretch:
			if not best or curStretch > best[-1]:
				best = (curStart, curStretch)
		if best:
			if (self.best and best[-1] > self.best[-1]) \
			or (not self.best and best[-1] >= self.minStretch):
				start, stretch = best
				self.best = (offset, start, stretch)

	def strideLoop(self, offset, best=None, startIndex=0):
		mseq = self.mseq
		aseq = self.aseq
		for i in range(startIndex + self.stride - 1, len(mseq),
								self.stride):
			m = mseq[i]
			a = aseq[i + offset]
			if m == a:
				# go left to find start of stretch
				for left in range(i-1, -1, -1):
					if mseq[left] != aseq[left + offset]:
						start = left + 1
						break
				else:
					start = 0
				# go right to find end
				for right in range(i+1, len(mseq)):
					if mseq[right] != aseq[right + offset]:
						stretch = right - start
						break
				else:
					stretch = len(mseq) - start

				if not best or stretch > best[-1]:
					best = (start, stretch)
					if stretch > self.stride:
						self.stride = stretch
				return self.strideLoop(offset, best=best,
						startIndex=start+stretch+1)
		if best:
			if (self.best and best[-1] > self.best[-1]) \
			or (not self.best and best[-1] > self.minStretch):
				start, stretch = best
				self.best = (offset, start, stretch)

def estimateAssocParams(mseq):
	# find the apparent gaps in the structure, and estimate total length
	# of structure sequence given these gaps; make a list of the continuous
	# segments
	estLen, segments, gaps = _findGaps(mseq)
	for i in range(len(segments)):
		segments[i] = "".join(segments[i])

	# try to compensate for trailing/leading
	# gaps by looking at SEQRES records
	from chimera.Sequence import seqresSequences
	srSeqs = seqresSequences(mseq.molecule)
	if srSeqs is None:
		srSeqs = []
	matchingSR = None
	for sr in srSeqs:
		if sr.name == mseq.name:
			if matchingSR:
				matchingSR = None
				break
			matchingSR = sr
	if matchingSR and len(matchingSR) > estLen:
		estLen = len(matchingSR)
	return estLen, segments, gaps

def _findGaps(mseq):
	from chimera import bondsBetween
	prevRes = None
	estLen = 0
	segments = []
	gaps = []
	for i in range(len(mseq)):
		estLen += 1
		res = mseq.residues[i]
		if not res:
			# explicit gapping
			return len(mseq), [mseq[:]], []
		char = mseq[i]
		gap = 0
		if prevRes:
			connects = bondsBetween(prevRes, res, onlyOne=1)
			# narrow gap between residues 34 and 61 of chain A in 1jff
			# requires excluding lengths >= 2.678...
			# For consecutively numbered residues we are a lot more
			# loose in distance requirement, so that 3f5g works,
			# where due to some missing atoms the inter-residue bond
			# between residues 109 and 110 is 4.381...
			decimated = False
			if res.id.position - prevRes.id.position - 1 == 0:
				cutoff = 20.0
				# after a zone deletion, the connecting inter-residue
				# bonds may be missing...
				from chimera.resCode import res3to1
				from chimera.misc import principalAtom
				if not connects \
				and res3to1(prevRes.type) != 'X' and res3to1(res.type) != 'X' \
				and (principalAtom(prevRes) == None or principalAtom(res) == None) \
				and len(res.atoms) < 20 and len(prevRes.atoms) < 20:
					for a1 in res.atoms:
						for a2 in prevRes.atoms:
							if a1.coord().sqdistance(a2.coord()) < 100.0:
								decimated = True
								break
						if decimated:
							break
			else:
				cutoff = 6.0
			if not decimated and (not connects or connects[0].sqlength() > cutoff):
				gap = res.id.position - prevRes.id.position - 1
				if gap == -1:
					# 1ton/1bil have gaps in insertion codes...
					prevInsert, insert = \
						prevRes.id.insertionCode, res.id.insertionCode
					if insert != ' ' and prevInsert < insert:
						if prevInsert == ' ':
							gap = ord(insert) - ord('A')
						else:
							gap = ord(insert) - ord(prevInsert)
				# CA-only structures may have longer inter-residue bonds...
				caOnly = len(res.atoms) + len(prevRes.atoms) == 2
				if not (gap == 0 and connects and caOnly \
				and connects[0].sqlength() < 45.0):
					# 3ixy chain B has 6.602 CA-CA length
					# between residues 131 and 132
					if gap < 1:
						# instead of jamming everything together and hoping,
						# use 1 as the gap size, since the gap-placement
						# algorithm doesn't actually care about the size of
						# gap except in tie-breaking cases (placing a segment
						# in two different places has the same score).  Using
						# 1 instead of jamming allows 3oe0 chain A to work,
						# where the gap is between a normal-numbered segment
						# and a long insertion that was numbered much higher
						# [well, between insertion and normal numbering]
						gap = 1
					gaps.append(gap)
					estLen += gap
		if not prevRes or gap:
			curSeg = [char]
			segments.append(curSeg)
		else:
			curSeg.append(char)
		prevRes = res
	if estLen > 0 and mseq.residues[0].id.position > 1:
		estLen += mseq.residues[0].id.position - 1
	return estLen, segments, gaps