# --- UCSF Chimera Copyright ---
# Copyright (c) 2000 Regents of the University of California.
# All rights reserved.  This software provided pursuant to a
# license agreement containing restrictions on its disclosure,
# duplication and use.  This notice must be embedded in or
# attached to all copies, including partial copies, of the
# software or any revisions or derivations thereof.
# --- UCSF Chimera Copyright ---
#
# $Id: __init__.py 39819 2014-05-20 00:16:46Z pett $

import SmithWaterman
import NeedlemanWunsch
from chimera.Sequence import StructureSequence
from chimera import replyobj, UserError, LimitationError

CP_SPECIFIC_SPECIFIC = "ss"
CP_SPECIFIC_BEST = "sb"
CP_BEST = "bb"

SA_NEEDLEMAN_WUNSCH = "Needleman-Wunsch"
SA_SMITH_WATERMAN = "Smith-Waterman"

from prefs import defaults, CHAIN_PAIRING, SEQUENCE_ALGORITHM, MATRIX, \
	GAP_OPEN, GAP_EXTEND, SS_MIXTURE, SS_SCORES, ITER_CUTOFF, HELIX_OPEN, \
	STRAND_OPEN, OTHER_OPEN, COMPUTE_SS
defaultSSMatrix = defaults[SS_SCORES]

# called recursively, so any changes to calling signature need to happen
# in recursive call too...
def align(ref, match, matrix, algorithm, gapOpen, gapExtend, ksdsspCache,
					ssMatrix=defaults[SS_SCORES],
					ssFraction=defaults[SS_MIXTURE],
					gapOpenHelix=defaults[HELIX_OPEN],
					gapOpenStrand=defaults[STRAND_OPEN],
					gapOpenOther=defaults[OTHER_OPEN],
					computeSS=defaults[COMPUTE_SS]):
	similarityMatrix = SmithWaterman.matrices[matrix]
	ssf = ssFraction
	ssm = ssMatrix
	if ssf is not None and ssf is not False and computeSS:
		needCompute = []
		if ref.molecule not in ksdsspCache:
			for r in ref.residues:
				if r and len(r.atoms) > 1:
					# not CA only
					needCompute.append(ref.molecule)
					ksdsspCache.add(ref.molecule)
					break
		if match.molecule not in ksdsspCache:
			for r in match.residues:
				if r and len(r.atoms) > 1:
					# not CA only
					needCompute.append(match.molecule)
					ksdsspCache.add(match.molecule)
					break
		if needCompute:
			from chimera.initprefs import ksdsspPrefs, \
					KSDSSP_ENERGY, KSDSSP_HELIX_LENGTH, \
					KSDSSP_STRAND_LENGTH
			from Midas import ksdssp
			ksdssp(needCompute, energy=ksdsspPrefs[KSDSSP_ENERGY],
				helixLen=ksdsspPrefs[KSDSSP_HELIX_LENGTH],
				strandLen=ksdsspPrefs[KSDSSP_STRAND_LENGTH])
	if algorithm == "nw":
		score, seqs = NeedlemanWunsch.nw(ref, match,
			scoreGap=-gapExtend, scoreGapOpen=0-gapOpen,
			similarityMatrix=similarityMatrix, returnSeqs=True,
			ssMatrix=ssMatrix, ssFraction=ssFraction,
			gapOpenHelix=-gapOpenHelix,
			gapOpenStrand=-gapOpenStrand,
			gapOpenOther=-gapOpenOther)
		gappedRef, gappedMatch = seqs
	elif algorithm =="sw":
		refName = ref.molecule.name
		if not ref.name.startswith("principal"):
			refName += ", " + ref.name
		gappedRef = StructureSequence(ref.molecule, refName)
		matchName = match.molecule.name
		if not match.name.startswith("principal"):
			matchName += ", " + match.name
		gappedMatch = StructureSequence(match.molecule, matchName)
		def ssLet(r):
			if not r:
				return ' '
			if r.isHelix:
				return 'H'
			elif r.isStrand:
				return 'S'
			return 'O'
		if ssf is False or ssf is None:
			ssf = 0.0
			ssm = None
		if ssm:
			# account for missing structure (blank SS letter)
			ssm = ssm.copy()
			for let in "HSO ":
				ssm[(let, ' ')] = 0.0
				ssm[(' ', let)] = 0.0
		score, alignment = SmithWaterman.align(str(ref), str(match),
			similarityMatrix, float(gapOpen), float(gapExtend),
			gapChar=".", ssMatrix=ssm, ssFraction=ssf,
			gapOpenHelix=float(gapOpenHelix),
			gapOpenStrand=float(gapOpenStrand),
			gapOpenOther=float(gapOpenOther),
			ss1="".join([ssLet(r) for r in ref.residues]),
			ss2="".join([ssLet(r) for r in match.residues]))
		gappedRef.extend(alignment[0])
		gappedMatch.extend(alignment[1])
		# Smith-Waterman may not be entirety of sequences...
		for orig, gapped in [(ref, gappedRef), (match, gappedMatch)]:
			ungapped = gapped.ungapped()
			for i in range(len(orig) - len(ungapped) + 1):
				if ungapped == orig[i:i+len(ungapped)]:
					break
			else:
				raise ValueError("Smith-Waterman result not"
					" a subsequence of original sequence")
			gapped.residues = orig.residues[i:i+len(ungapped)]
			resMap = {}
			gapped.resMap = resMap
			for j in range(len(ungapped)):
				gres = gapped.residues[j]
				if gres:
					resMap[gres] = j
	else:
		raise ValueError("Unknown sequence alignment algorithm: %s"
								% algorithm)

	# If the structures are disjoint snippets of the same longer SEQRES,
	# they may be able to be structurally aligned but the SEQRES records
	# will keep them apart.  Try to detect this situation and work around
	# by snipping off sequence ends.
	srDisjoint = False
	if 'SEQRES' in ref.molecule.pdbHeaders and 'SEQRES' in match.molecule.pdbHeaders:
		structMatch = 0
		for i in range(len(gappedRef)):
			uri = gappedRef.gapped2ungapped(i)
			if uri is None:
				continue
			umi = gappedMatch.gapped2ungapped(i)
			if umi is None:
				continue
			if gappedRef.residues[uri] and gappedMatch.residues[umi]:
				structMatch += 1
				if structMatch >= 3:
					break
		if structMatch < 3:
			seqMatch = 0
			for s1, s2 in zip(gappedRef[:], gappedMatch[:]):
				if s1.isalpha() and s2.isalpha():
					seqMatch += 1
					if seqMatch > 3:
						break
			if seqMatch > 3:
				need = 3 - structMatch
				if (ref.residues[:need].count(None) == 3
				or ref.residues[-need:].count(None) == 3) \
				and (match.residues[:need].count(None) == 3
				or match.residues[-need:].count(None) == 3):
					srDisjoint = True
	if srDisjoint:
		from copy import copy
		clippedRef = copy(ref)
		clippedMatch = copy(match)
		for seq in (clippedRef, clippedMatch):
			numNone = 0
			for r in seq.residues:
				if r:
					break
				numNone += 1
			if numNone:
				seq[:] = seq[numNone:]
				seq.residues = seq.residues[numNone:]
				for r, i in seq.resMap.items():
					seq.resMap[r] = i - numNone

			numNone = 0
			for r in reversed(seq.residues):
				if r:
					break
				numNone += 1
			if numNone:
				seq[:] = seq[:-numNone]
				seq.residues = seq.residues[:-numNone]
		return align(clippedRef, clippedMatch, matrix, algorithm, gapOpen,
			gapExtend, ksdsspCache, ssMatrix=ssMatrix, ssFraction=ssFraction,
			gapOpenHelix=gapOpenHelix, gapOpenStrand=gapOpenStrand,
			gapOpenOther=gapOpenOther, computeSS=False)
	for orig, aligned in [(ref, gappedRef), (match, gappedMatch)]:
		if hasattr(orig, '_dmRebuildInfo'):
			aligned._dmRebuildInfo = orig._dmRebuildInfo
			_dmCleanup.append(aligned)
	return score, gappedRef, gappedMatch

def matrixCompatible(chain, matrix):
	proteinMatrix = len(SmithWaterman.matrices[matrix]) >= 400
	return proteinMatrix == chain.hasProtein()

def match(chainPairing, matchItems, matrix, alg, gapOpen, gapExtend, iterate=None,
		showAlignment=False, align=align, domainResidues=(None, None), 
		verbose=False, **alignKw):
	"""Superimpose structures based on sequence alignment

	   'chainPairing' is the method of pairing chains to match:
	   
	   CP_SPECIFIC_SPECIFIC --
	   Each reference chain is paired with a specified match chain
	   
	   CP_SPECIFIC_BEST --
	   Single reference chain is paired with best seq-aligning
	   chain from one or more molecules

	   CP_BEST --
	   Best seq-aligning pair of chains from reference molecule and
	   match molecule(s) is used
	"""
	ksdsspCache = set()
	alg = alg.lower()
	if alg == "nw" or alg.startswith("needle"):
		alg = "nw"
		algName = "Needleman-Wunsch"
	elif alg =="sw" or alg.startswith("smith"):
		alg = "sw"
		algName = "Smith-Waterman"
	else:
		raise ValueError("Unknown sequence alignment algorithm: %s"
									% alg)
	pairings = {}
	smallMolErrMsg = "Reference and/or match model contains no nucleic or"\
		" amino acid chains.\nUse the command-line 'match' command" \
		" to superimpose small molecules/ligands."
	rdRes, mdRes = domainResidues
	if chainPairing == CP_SPECIFIC_SPECIFIC:
		# specific chain(s) in each

		# various sanity checks
		#
		# (1) can't have same chain matched to multiple refs
		# (2) reference molecule can't be a match molecule
		matchChains = {}
		matchMols = {}
		refMols = {}
		for ref, match in matchItems:
			if not matrixCompatible(ref, matrix):
				raise UserError("Reference chain (%s) not"
					" compatible with %s similarity"
					" matrix" % (ref.fullName(), matrix))
			if not matrixCompatible(match, matrix):
				raise UserError("Match chain (%s) not"
					" compatible with %s similarity"
					" matrix" % (match.fullName(), matrix))
			if match in matchChains:
				raise UserError("Cannot match the same chain"
					" to multiple reference chains")
			matchChains[match] = ref
			if match.molecule in refMols \
			or ref.molecule in matchMols \
			or match.molecule == ref.molecule:
				raise UserError("Cannot have same molecule"
					" model provide both reference and"
					" match chains")
			matchMols[match.molecule] = ref
			refMols[ref.molecule] = match

		if not matchChains:
			raise UserError("Must select at least one reference"
								" chain.\n")

		for match, ref in matchChains.items():
			match, ref = [checkDomainMatching([ch], dr)[0] for ch, dr in
				((match, mdRes), (ref, rdRes))]
			score, s1, s2 = align(ref, match, matrix, alg,
						gapOpen, gapExtend,
						ksdsspCache, **alignKw)
			pairings.setdefault(s2.molecule, []).append(
							(score, s1, s2))

	elif chainPairing == CP_SPECIFIC_BEST:
		# specific chain in reference;
		# best seq-aligning chain in match model(s)
		ref, matches = matchItems
		if not ref or not matches:
			raise UserError("Must select at least one reference"
							" and match item.\n")
		if not matrixCompatible(ref, matrix):
			raise UserError("Reference chain (%s) not compatible"
						" with %s similarity matrix"
						% (ref.fullName(), matrix))
		ref = checkDomainMatching([ref], rdRes)[0]
		for match in matches:
			bestScore = None
			seqs = [s for s in match.sequences()
						if matrixCompatible(s, matrix)]
			if not seqs and match.sequences():
				raise UserError("No chains in match structure"
					" %s compatible with %s similarity"
					" matrix" % (match, matrix))
			seqs = checkDomainMatching(seqs, mdRes)
			for seq in seqs:
				score, s1, s2 = align(ref, seq, matrix, alg,
						gapOpen, gapExtend,
						ksdsspCache, **alignKw)
				if bestScore is None or score > bestScore:
					bestScore = score
					pairing = (score, s1, s2)
			if bestScore is None:
				raise LimitationError(smallMolErrMsg)
			pairings[match]= [pairing]

	elif chainPairing == CP_BEST:
		# best seq-aligning pair of chains between
		# reference and match structure(s)
		ref, matches = matchItems
		if not ref or not matches:
			raise UserError("Must select at least one reference"
				" and match item in different models.\n")
		rseqs = [s for s in checkDomainMatching(ref.sequences(), rdRes)
					if matrixCompatible(s, matrix)]
		if not rseqs and ref.sequences():
			raise UserError("No chains in reference structure"
				" %s compatible with %s similarity"
				" matrix" % (ref, matrix))
		for match in matches:
			bestScore = None
			mseqs = [s for s in checkDomainMatching(match.sequences(), mdRes)
						if matrixCompatible(s, matrix)]
			if not mseqs and match.sequences():
				raise UserError("No chains in match structure"
					" %s compatible with %s similarity"
					" matrix" % (match, matrix))
			for mseq in mseqs:
				for rseq in rseqs:
					score, s1, s2 = align(rseq, mseq,
						matrix, alg, gapOpen, gapExtend,
						ksdsspCache, **alignKw)
					if bestScore is None \
					or score > bestScore:
						bestScore = score
						pairing = (score,s1,s2)
			if bestScore is None:
				raise LimitationError(smallMolErrMsg)
			pairings[match]= [pairing]
	else:
		raise ValueError("No such chain-pairing method")

	from chimera.misc import principalAtom
	retVals = []
	for matchMol, pairs in pairings.items():
		refAtoms = []
		matchAtoms = []
		regionInfo = {}
		if verbose:
			seqPairings = []
		for score, s1, s2 in pairs:
			try:
				ssMatrix = alignKw['ssMatrix']
			except KeyError:
				ssMatrix = defaultSSMatrix
			try:
				ssFraction = alignKw['ssFraction']
			except KeyError:
				ssFraction = defaults[SS_MIXTURE]

			replyobj.info("\n")
			replyobj.status("Matchmaker %s (%s) with %s (%s),"
				" sequence alignment score = %g" % (
				s1.name, s1.molecule.oslIdent(), s2.name,
				s2.molecule.oslIdent(), score), log=1)
			replyobj.info("with these parameters:\n"
				"\tchain pairing: %s\n\t%s using %s\n"
				% (chainPairing, algName, matrix))

			if ssFraction is None or ssFraction is False:
				replyobj.info("\tno secondary structure"
							" guidance used\n")
				replyobj.info("\tgap open %g, extend %g\n" % (
							gapOpen, gapExtend))
			else:
				if 'gapOpenHelix' in alignKw:
					gh = alignKw['gapOpenHelix']
				else:
					gh = defaults[HELIX_OPEN]
				if 'gapOpenStrand' in alignKw:
					gs = alignKw['gapOpenStrand']
				else:
					gs = defaults[STRAND_OPEN]
				if 'gapOpenOther' in alignKw:
					go = alignKw['gapOpenOther']
				else:
					go = defaults[OTHER_OPEN]
				replyobj.info("\tss fraction: %g\n"
					"\tgap open (HH/SS/other) %g/%g/%g, "
					"extend %g\n"
					"\tss matrix: " % (ssFraction, gh, gs,
					go, gapExtend))
				for ss1, ss2 in ssMatrix.keys():
					if ss2 < ss1:
						continue
					replyobj.info(" (%s, %s): %g" % (ss1,
						ss2, ssMatrix[(ss1, ss2)]))
				replyobj.info("\n")
			if iterate is None:
				replyobj.info("\tno iteration\n")
			else:
				replyobj.info("\titeration cutoff: %g\n"
								% iterate)
			skip = set()
			if showAlignment:
				from MultAlignViewer.MAViewer import MAViewer
				for s in [s1,s2]:
					if hasattr(s, '_dmRebuildInfo'):
						for i, c, r in s._dmRebuildInfo:
							g = s.ungapped2gapped(i)
							s[g] = c
							s.residues[i] = r
							skip.add(r)
						s.resMap.clear()
						for i, r in enumerate(s.residues):
							if r:
								s.resMap[r] = i
				mav = MAViewer([s1,s2], autoAssociate=None)
				mav.autoAssociate = True
				mav.hideHeaders(mav.headers(shownOnly=True))
				from MAVHeader.ChimeraExtension import CaDistanceSeq
				mav.showHeaders([h for h in mav.headers()
							if h.name == CaDistanceSeq.name])
			for i in range(len(s1)):
				if s1[i] == "." or s2[i] == ".":
					continue
				refRes = s1.residues[s1.gapped2ungapped(i)]
				matchRes = s2.residues[s2.gapped2ungapped(i)]
				if not refRes:
					continue
				refAtom = principalAtom(refRes)
				if not refAtom:
					continue
				if not matchRes:
					continue
				matchAtom = principalAtom(matchRes)
				if not matchAtom:
					continue
				if refRes in skip or matchRes in skip:
					continue
				if refAtom.name != matchAtom.name:
					# nucleic P-only trace vs. full nucleic
					if refAtom.name != "P":
						try:
							refAtom = refAtom.residue.atomsMap["P"][0]
						except KeyError:
							continue
					else:
						try:
							matchAtom = matchAtom.residue.atomsMap["P"][0]
						except KeyError:
							continue
				refAtoms.append(refAtom)
				matchAtoms.append(matchAtom)
				if showAlignment and iterate is not None:
					regionInfo[refAtom] = (mav, i)

			if verbose:
				seqPairings.append((s1, s2))
		import Midas
		if len(matchAtoms) < 3:
			replyobj.error("Fewer than 3 residues aligned; cannot"
				" match %s with %s\n" % (s1.name, s2.name))
			continue
		try:
			retVals.append(Midas.match(matchAtoms, refAtoms,
						iterate=iterate, minPoints=3))
		except Midas.TooFewAtomsError:
			replyobj.error("Iteration produces fewer than 3"
				" residues aligned.\nCannot match %s with %s"
				" satisfying iteration threshold.\n"
				% (s1.name, s2.name))
			continue
		replyobj.info("\n") # separate matches with whitespace
		if regionInfo:
			byMav = {}
			for ra in retVals[-1][1]:
				mav, index = regionInfo[ra]
				byMav.setdefault(mav, []).append(index)
			for mav, indices in byMav.items():
				indices.sort()
				from MultAlignViewer.MAViewer import \
							MATCHED_REGION_INFO
				name, fill, outline = MATCHED_REGION_INFO
				mav.newRegion(name=name, columns=indices,
						fill=fill, outline=outline)
				mav.status("Residues used in final fit"
						" iteration are highlighted")
		if verbose:
			for s1, s2 in seqPairings:
				replyobj.info("Sequences:\n")
				for s in [s1,s2]:
					replyobj.info(s.name + "\t" + str(s) + "\n")
				replyobj.info("Residues:\n")
				for s in [s1, s2]:
					replyobj.info(", ".join([str(r) for r in s.residues]) + "\n")
				replyobj.info("Residue usage in match (1=used, 0=unused):\n")
				matchAtoms1, matchAtoms2 = retVals[-1][:2]
				matchResidues = set([a.residue
					for matched in retVals[-1][:2] for a in matched])
				for s in [s1, s2]:
					replyobj.info(", ".join([str(int(r in matchResidues))
						for r in s.residues]) + "\n")

	global _dmCleanup
	for seq in _dmCleanup:
		delattr(seq, '_dmRebuildInfo')
	_dmCleanup = []
	return retVals

def cmdMatch(refSel, matchSel, pairing=defaults[CHAIN_PAIRING],
		alg=defaults[SEQUENCE_ALGORITHM], verbose=False,
		ssFraction=defaults[SS_MIXTURE], matrix=defaults[MATRIX],
		gapOpen=defaults[GAP_OPEN], hgap=defaults[HELIX_OPEN],
		sgap=defaults[STRAND_OPEN], ogap=defaults[OTHER_OPEN],
		iterate=defaults[ITER_CUTOFF], gapExtend=defaults[GAP_EXTEND],
		showAlignment=False, computeSS=defaults[COMPUTE_SS],
		matHH=defaultSSMatrix[('H', 'H')],
		matSS=defaultSSMatrix[('S', 'S')],
		matOO=defaultSSMatrix[('O', 'O')],
		matHS=defaultSSMatrix[('H', 'S')],
		matHO=defaultSSMatrix[('H', 'O')],
		matSO=defaultSSMatrix[('S', 'O')]):
	"""wrapper for command-line command (friendlier args)"""
	from Midas import MidasError
	if matrix not in SmithWaterman.matrices:
		raise MidasError("No such matrix name: %s" % str(matrix))
	try:
		gapOpen + 1
		gapExtend + 1
		hgap + 1
		sgap + 1
		ogap + 1
	except TypeError:
		raise MidasError("Gap open/extend penalties must be numeric")
	if pairing == CP_SPECIFIC_SPECIFIC:
		matches = matchSel.chains(ordered=True)
	elif pairing == CP_SPECIFIC_BEST:
		matches = matchSel.molecules()
	if pairing == CP_SPECIFIC_SPECIFIC or pairing == CP_SPECIFIC_BEST:
		refs = refSel.chains(ordered=True)
		if not refs:
			raise MidasError("No reference chains specified")
		if pairing == CP_SPECIFIC_BEST:
			if len(refs) > 1:
				raise MidasError("Specify a single reference chain only")
	else:
		refMols = refSel.molecules()
		if not refMols:
			raise MidasError("No reference model specified")
		if len(refMols) > 1:
			raise MidasError("Specify a single reference"
								" model only")
		refs = refMols
		matches = matchSel.molecules()
	if not matches:
		raise MidasError("No molecules/chains to match specified")
	for ref in refs:
		if ref in matches:
			matches.remove(ref)
	if not matches:
		raise MidasError("Must use different reference and match"
								" structures")
	if pairing == CP_SPECIFIC_SPECIFIC:
		if len(refs) != len(matches):
			raise MidasError("Different number of reference/match"
					" chains (%d ref, %d match)" %
					(len(refs), len(matches)))
		matchItems = zip(refs, matches)
	else:
		matchItems = (refs[0], matches)
	ssMatrix = {}
	ssMatrix[('H', 'H')] = float(matHH)
	ssMatrix[('S', 'S')] = float(matSS)
	ssMatrix[('O', 'O')] = float(matOO)
	ssMatrix[('H', 'S')] = ssMatrix[('S', 'H')] = float(matHS)
	ssMatrix[('H', 'O')] = ssMatrix[('O', 'H')] = float(matHO)
	ssMatrix[('S', 'O')] = ssMatrix[('O', 'S')] = float(matSO)
	if type(iterate) == bool and not iterate:
		iterate = None
	try:
		match(pairing, matchItems, matrix, alg, gapOpen, gapExtend,
			ssFraction=ssFraction, ssMatrix=ssMatrix,
			iterate=iterate, showAlignment=showAlignment,
			domainResidues=(refSel.residues(), matchSel.residues()),
			gapOpenHelix=hgap, gapOpenStrand=sgap,
			gapOpenOther=ogap, computeSS=computeSS, verbose=verbose)
	except UserError, v:
		raise MidasError, v

_dmCleanup = []
def checkDomainMatching(chains, selResidues):
	if not selResidues:
		return chains
	chainResidues = set([r for ch in chains for r in ch.residues if r])
	selResidues = set(selResidues)
	if not chainResidues.issubset(selResidues):
		# domain matching
		newChains = []
		for chain in chains:
			thisChain = set([r for r in chain.residues if r])
			if thisChain.issubset(selResidues):
				newChains.append(chain)
				continue
			nc = StructureSequence(chain.molecule, chain.name)
			nc._dmRebuildInfo = []
			_dmCleanup.append(nc)
			newChains.append(nc)
			for c, r in zip(str(chain), chain.residues):
				if r in selResidues:
					nc.append(c)
					nc.resMap[r] = len(nc.residues)
					nc.residues.append(r)
				else:
					nc._dmRebuildInfo.append((len(nc.residues), c, r))
					nc.append('?')
					nc.residues.append(None)
		chains = newChains
	return chains