# --- UCSF Chimera Copyright ---
# Copyright (c) 2000-2009 Regents of the University of California.
# All rights reserved.  This software provided pursuant to a
# license agreement containing restrictions on its disclosure,
# duplication and use.  This notice must be embedded in or
# attached to all copies, including partial copies, of the
# software or any revisions or derivations thereof.
# --- UCSF Chimera Copyright ---
#
# $Id: copyright 26655 2009-01-07 22:02:30Z gregc $

def name2pdbID(name):
	startable = True
	for i in range(0, len(name)-3):
		if startable and name[i].isdigit() and name[i+1:i+4].isalnum() and (
				len(name[i:]) == 4 or not name[i+4].isalnum()
				or (name[i+4].isupper() and not name[i+3].isupper())):
			pdbID = name[i:i+4].upper()
			return pdbID, pdbID, "PDBID"
		startable = not name[i].isalnum()

	# SCOP?
	if len(name) == 7 and name[0] == 'd':
		if name[1].isdigit() and name[2:5].isalnum():
			pdbID = name[1:5].upper()
			return pdbID, name, "SCOP"
	return None, None, None

def chain2pdbID(chain):
	header = getattr(chain.molecule, 'pdbHeaders', {}).get('HEADER', None)
	if header:
		pdbID = header[0].split()[-1]
		if len(pdbID) == 4 and pdbID[0].isdigit() \
		and pdbID[1:].isalnum():
			return pdbID
	pdbID, fetchID, db = name2pdbID(chain.molecule.name)
	return pdbID

def uniprotIDs(pdbCode, chainID, tree=None):
	if tree == None:
		import xml.etree.ElementTree
		tree = parseUniprotAlignment(pdbCode, chainID,
				parser=xml.etree.ElementTree.parse)
	root = tree.getroot()
	ns = root.tag[:root.tag.index('}')+1] # namespace
	alignment = root.find(ns + "alignment")
	uniprotIDs = set()
	for alignObj in alignment.findall(ns + "alignObject"):
		if alignObj.get("dbSource") == "UniProt":
			uniprotIDs.add(alignObj.get('dbAccessionId'))
	for blockObj in alignment.findall(ns + 'block'):
		uniprotSeg = blockObj[-1]
		uniprotIDs.add(uniprotSeg.get("intObjectId"))
	if not uniprotIDs:
		raise AssertionError("No UniProt accession ID(s) found for %s.%s"
			% (pdbCode, chainID))
	return uniprotIDs

class InvalidAccessionError(ValueError):
	pass

def mapUniprotNameID(ident):
	import urllib, urllib2
	params = { 'from': 'ACC+ID', 'to': 'ACC', 'format': 'tab', 'query': ident}
	data = urllib.urlencode(params)
	request = urllib2.Request("http://www.uniprot.org/mapping/", data)
	try:
		response = urllib2.urlopen(request)
	except urllib2.HTTPError, v:
		from chimera import NonChimeraError
		raise NonChimeraError("Error from UniProt web server: %s\n\n"
			"Try again later.  If you then still get the error, you could use"
			"Help->Report A Bug to report the error to the Chimera team."
			"They may be able to help you work around the problem."
			% unicode(v))
	page = response.read(200000)
	fields = page.split() # also split on newlines, not just tabs!
	if len(fields) < 4:
		raise InvalidAccessionError("Invalid UniProt entry name / accession"
			" number: %s" % ident)
	return fields[3]

def uniprotFetch(uniprotID):
	"""can raise CancelOperation"""
	from chimera.fetch import fetch_file
	path, headers = fetch_file("http://www.uniprot.org/"
		"uniprot/%s.xml" % uniprotID, "%s UniProt info" % uniprotID)
	tree = xmlParsePath(path)
	import os
	os.unlink(path)
	try:
		uniprot = [cn for cn in tree.childNodes
			if getattr(cn, 'tagName', None) == "uniprot"][0]
	except IndexError:
		raise InvalidAccessionError(
				"Invalid UniProt accession number: %s" % uniprotID)
	entry = [cn for cn in uniprot.childNodes
		if getattr(cn, 'tagName', None) == "entry"][0]
	try:
		seqNode = [cn for cn in entry.childNodes
			if getattr(cn, 'tagName', None) == "sequence"][0]
	except (KeyError, IndexError):
		raise AssertionError("No sequence for %s in Uniprot"
			" info" % uniprotID)
	protein = [cn for cn in entry.childNodes
		if getattr(cn, 'tagName', None) == "protein"][0]
	recName = [cn for cn in protein.childNodes
		if getattr(cn, 'tagName', None)
		in ("recommendedName", "submittedName")][0]
	fullName = [cn for cn in recName.childNodes
		if getattr(cn, 'tagName', None) == "fullName"][0].firstChild.nodeValue
	features = [cn for cn in entry.childNodes
			if getattr(cn, 'tagName', None) == "feature"]
	return "".join([c for c in seqNode.firstChild.nodeValue
				if not c.isspace()]), fullName, features

def showCddFeatures(mav, seqIndex=-1, hideSS=None):
	"""returns corresponding PSSM Ids

	   can raise CancelOperation
	"""

	seq = mav.seqs[seqIndex]
	from chimera import tasks
	task = tasks.Task("Fetch CDD annotations for %s" % seq.fullName())
	from CGLutil import multipart
	import socket
	cddHost = "www.ncbi.nlm.nih.gov"
	cddPage = "/Structure/bwrpsb/bwrpsb.cgi"
	mav.status("Submitting CDD annotations job for %s" % seq.fullName())
	task.updateStatus("submitting job")
	try:
		# settings documented here:
		# www.ncbi.nlm.nih.gov/Structure/cdd/cdd_help.shtml#BatchRPSBWebAPI
		cddData = multipart.post_multipart(cddHost, cddPage, [
				("queries", None, seq.ungapped().upper()),
				("db", None, "cdd"),
				("smode", None, "live"),
				("evalue", None, "0.01"),
				("tdata", None, "feats"),
				("dmode", None, "rep"),
				("qdefl", None, "false"),
				("cddefl", None, "false")
			])
	except (IOError, socket.error), v:
		task.finished()
		from chimera import NonChimeraError
		raise NonChimeraError("Error posting CDD annotations"
			" from %s: %s" % (cddHost, str(v)))
	except:
		task.finished()
		raise
	cddStatus = cdsid = None
	for line in cddData.splitlines():
		fields = line.split()
		if fields[0] == "#status":
			cddStatus = int(fields[1])
		elif fields[0] == "#cdsid":
			cdsid = fields[1]
	if cddStatus is None or cdsid is None:
		task.finished()
		raise AssertionError("No status info and/or job ID in CDD response")

	mav.status("CDD annotations job submitted; awaiting results")
	waitIncr = waitAmount = 0
	import time
	while cddStatus != 0:
		for i in range(waitAmount):
			task.updateStatus("awaiting job completion")
			time.sleep(1.0)
		try:
			cddData = multipart.post_multipart(cddHost, cddPage,
				[("cdsid", None, cdsid)])
		except (IOError, socket.error), v:
			task.finished()
			from chimera import NonChimeraError
			raise NonChimeraError("Error retrieving CDD annotations"
				" from %s: %s" % (cddHost, str(v)))
		except:
			task.finished()
			raise
		for line in cddData.splitlines():
			fields = line.split()
			if fields[0] == "#status":
				cddStatus = int(fields[1])
				break
		waitIncr += 1
		waitAmount += waitIncr
	task.finished()
	mav.status("CDD job finished; processing results")

	rb = mav.regionBrowser
	msReg = None
	if hideSS == None:
		hideSS = len(mav.seqs) == 1
	if hideSS:
		mav.showSS(show=False)
	for reg in rb.regions[:]:
		if reg.name in ["structure strands", "structure helices"]:
			rb.lowerRegion(reg, rebuildTable=False)
		if reg.name and reg.name.startswith(mav.GAP_REG_NAME_START):
			msReg = reg
	featureInfo = []
	pssmIDs = set()
	for line in cddData.splitlines():
		if not line or line[0] == '#':
			continue
		fields = line.split('\t')
		if len(fields) != 7 or fields[0] == 'Query':
			continue
		query, qtype, title, coordinates, fullSize, mappedSize, pssmID = fields
		ungappedCoords = []
		for coord in coordinates.split(','):
			if '-' in coord:
				# explicit range
				fromCrd, toCrd = coord.split('-')
				fromRC, fromPos = fromCrd[0], int(fromCrd[1:])
				toRC, toPos = toCrd[0], int(toCrd[1:])
				for pos in range(fromPos, toPos+1):
					ungappedCoords.append(seq.ungapped2gapped(pos-1))
			else:
				resCode, pos = coord[0], int(coord[1:])
				ungappedCoords.append(seq.ungapped2gapped(pos-1))
		# convert to ranges
		ranges = []
		start = cur = None
		for uc in ungappedCoords:
			if start is None:
				start = cur = uc
			elif uc == cur + 1:
				cur = uc
			else:
				ranges.append((start, cur))
				start = cur = uc
		ranges.append((start, cur))
		featureInfo.append((title, ranges, pssmID))
		if pssmID != '-':
			pssmIDs.add(pssmID)
	featureInfo.sort()
	featureInfo.reverse()
	# use region colors that differ from each other and from
	# letters (black) and selection region (green)
	usedColors = [(0.0, 1.0, 0.0), (0.0, 0.0, 0.0)]
	from CGLtk.color import distinguishFrom
	for title, ranges, pssmID in featureInfo:
		rebuildTable = (title, ranges, pssmID) == featureInfo[-1]
		fill = distinguishFrom(usedColors, numCandidates=5, seed=len(seq))
		usedColors.append(fill)
		if len(pssmIDs) > 1 and pssmID != '-':
			source = "CDD (PSSM %s)" % pssmID
		else:
			source = "CDD"
		mav.newRegion(name=title, shown=False, fill=fill, blocks=[
			(seq, seq, begin, end) for begin,end in ranges], sequence=seq,
			rebuildTable=rebuildTable, after=msReg, source=source)
	rb.showSeqRegions(seq)
	rb.enter()
	mav.enter() # put the sequence on top
	if featureInfo:
		mav.status("CDD features shown")
	else:
		mav.status("No CDD features found")
	return pssmIDs

from chimera import UserError
class NoUniprotEntryError(UserError):
	pass
def parseUniprotAlignment(pdbCode, chainID, ignoreCache=False,
					dasErrorsOnly=False, parser=None):
	"""can raise CancelOperation"""

	dasArgs = (pdbCode.lower(), chainID)
	url = "http://www.pdb.org/pdb/rest/das/pdb_uniprot_mapping/alignment?"\
		"query=%s.%s" % dasArgs
	from chimera.fetch import fetch_file
	# can raise CancelOperation
	path, headers = fetch_file(url, "%s.%s UniProt mapping"
		% (pdbCode, chainID), save_dir="PDB2UniProt", ignore_cache=ignoreCache,
		save_name="%s.%s.xml" % dasArgs)
	failure = None
	if headers:
		dasStatus = int(headers.getheader("X-DAS-Status", 0))
		if dasStatus:
			if dasStatus >= 400:
				from dasErrors import dasErrors, BadDASCommandArgsError
				dasError = dasErrors.get(dasStatus)
				if dasError == BadDASCommandArgsError:
					if dasErrorsOnly:
						failure = dasError(*dasArgs)
					else:
						failure = NoUniprotEntryError("No Uniprot entry for %s, chain %s"
											% dasArgs)
				else:
					failure = dasError(*dasArgs)
		else:
			if dasErrorsOnly:
				from dasErrors import NoDASResponseError
				failure = NoDASResponseError(*dasArgs)
			else:
				from chimera import NonChimeraError
				failure = NonChimeraError("No DAS server status (X-DAS-Status)"
					" in reply for %s, chain %s" % dasArgs)
	if failure:
		import os
		os.unlink(path)
		raise failure
	tree = xmlParsePath(path, parser)
	return tree

def showUniprotFeatures(mav, features, seqIndex=-1, hideSS=None, mapping=None,
		sourceID=None):
	# adjust existing regions...
	rb = mav.regionBrowser
	msReg = None
	seq = mav.seqs[seqIndex]
	if hideSS == None:
		hideSS = len(mav.seqs) == 1
	if hideSS:
		mav.showSS(show=False)
	for reg in rb.regions[:]:
		if reg.name in ["structure strands", "structure helices"]:
			rb.lowerRegion(reg, rebuildTable=False)
		if reg.name and reg.name.startswith(mav.GAP_REG_NAME_START):
			msReg = reg
			msReg.borderRGBA = (1.0, 0.376, 0.898, 1.0)
			msReg.interiorRGBA = None
			#msReg.borderRGBA = (1.0, 0.753, 0.796, 1.0)
			#msReg.interiorRGBA = (0.947, 0.824, 0.845, 1.0)
			mav.status("Sequence with no 3D structure outlined in pinkish magenta")
	fNameRemap = {
		"strand": "Uniprot strands",
		"helix": "Uniprot helices",
		"turn": "Uniprot turns"
	}
	fTypeColor = {
		"active site": "deep sky blue",
		"binding site": "medium purple",
		"chain": "dark khaki",
		"disulfide bond": "yellow",
		"domain": "sandy brown",
		"glycosylation site": "white",
		"helix": "gold",
		"modified residue": "plum",
		"mutagenesis site": "coral",
		"propeptide": "gray",
		"region of interest": "hot pink",
		"repeat": "slate gray",
		"sequence conflict": "red",
		"sequence variant": "orange",
		"signal peptide": "dark gray",
		"site": "turquoise",
		"splice variant": "magenta",
		"topological domain": "dark cyan",
		"transmembrane region": "salmon",
		"strand": "chartreuse",
		"turn": "medium purple",
	}
	regionInfo = {}
	featureLookup = {}
	for feature in features:
		locs = [cn for cn in feature.childNodes
					if getattr(cn, 'tagName', None) == "location"]
		if not locs:
			continue
		fType = feature.getAttribute("type")
		name = fNameRemap.get(fType, fType)
		# try to coalesce features with the exact
		# same attributes into one region...
		attrMap = {}
		xmlAttrs = feature.attributes
		for attr in [xmlAttrs.item(i) for i in range(xmlAttrs.length)]:
			if attr.localName == "type":
				continue
			attrMap[attr.localName] = attr.value
		regKey = (fType, tuple(sorted(attrMap.items())))
		featureLookup.setdefault(regKey, []).append(feature)
		for loc in locs:
			begin = end = None
			for cn in loc.childNodes:
				tn = getattr(cn, 'tagName', None)
				if tn == "position":
					begin = end = int(cn.getAttribute("position")) - 1
				elif tn == "begin" and cn.getAttribute("status") != "unknown":
					begin = int(cn.getAttribute("position")) - 1
				elif tn == "end" and cn.getAttribute("status") != "unknown":
					end = int(cn.getAttribute("position")) - 1
			if begin is None or end is None:
				continue
			blocks = regionInfo.setdefault(regKey, [])
			if mapping is not None:
				for b in range(begin, end+1):
					if b in mapping:
						break
				else:
					continue
				for e in range(end, begin-1, -1):
					if e in mapping:
						break
				else:
					continue
				# if the mapped range is larger than the unmapped range,
				# then there are one or more insertions in the sequence
				# and we need to add multiple blocks
				while e  - b < mapping[e] - mapping[b]:
					ne = b
					while ne - b == mapping[ne] - mapping[b]:
						ne += 1
						if ne not in mapping:
							break
					blocks.append((mapping[b], mapping[ne-1]))
					b = ne
					while b not in mapping:
						b += 1
				begin, end = mapping[b], mapping[e]
			blocks.append((begin, end))
	sortableRegKeys = list(reversed(sorted(
		[(fNameRemap.get(rk[0], rk[0]).lower(), rk) for rk in regionInfo.keys()])))
	if sourceID is None:
		source = "UniProt"
	else:
		source = "UniProt (%s)" % sourceID
	for sortKey, regKey in sortableRegKeys:
		fType, attrList = regKey
		attrMap = dict(attrList)
		blocks = regionInfo[regKey]
		if len(blocks) > 2:
			defName = fType + 's'
		else:
			defName = fType
		if 'bond' in fType:
			oldBlocks = blocks[:]
			blocks = []
			for block in oldBlocks:
				blocks.extend([(block[0], block[0]), (block[1], block[1])])
		baseName = fNameRemap.get(fType, defName)
		features = featureLookup[regKey]
		if len(features) == 1:
			feature = features[0]
			origs = [cn for cn in feature.childNodes
						if getattr(cn, 'tagName', None) == "original"]
			if len(origs) == 1:
				vars = [cn for cn in feature.childNodes
							if getattr(cn, 'tagName', None) == "variation"]
				if len(vars) == 1:
					baseName += " " + origs[0].firstChild.nodeValue + \
						u"\N{RIGHTWARDS ARROW}" \
						+ vars[0].firstChild.nodeValue
		if attrMap:
			baseName += ":\n\t"
			if 'description' in attrMap:
				baseName += attrMap.pop('description').replace(
						';', '\n\t  ').strip()
				if attrMap:
					baseName += "\n\t"
			regName = "%s%s" % (baseName, "\n\t".join([
				"%s=%s" % (k,v) for k,v in attrMap.items()]))
		else:
			regName = baseName
		if fType.endswith("-binding region"):
			fill = "cornflower blue"
		else:
			fill = fTypeColor.get(fType, "cyan")
		blocks=[(seq, seq, seq.ungapped2gapped(begin), seq.ungapped2gapped(end))
				for begin,end in blocks]
		isLast = regKey == sortableRegKeys[-1][1]
		reg = mav.getRegion(regName, sequence=seq, source=source)
		if reg:
			reg.addBlocks(blocks, makeCB=isLast)
		else:
			mav.newRegion(name=regName, shown=False, fill=fill, blocks=blocks,
				sequence=seq, source=source, rebuildTable=isLast,
				after=msReg)
	rb.showSeqRegions(seq)
	rb.enter()
	mav.enter() # put the sequence on top

def showUniprotSeq(ident):
	from chimera import UserError, CancelOperation, replyobj, Sequence
	try:
		acc = mapUniprotNameID(ident)
		seqString, fullName, features = uniprotFetch(acc)
	except InvalidAccessionError, v:
		raise UserError(unicode(v))
	except CancelOperation:
		replyobj.status("Fetch of %s cancelled" % ident)
		return
	seq = Sequence.Sequence(ident)
	seq.extend(seqString)
	from MultAlignViewer.MAViewer import MAViewer
	mav = MAViewer([seq], title=fullName)
	showUniprotFeatures(mav, features)

def pdbUniprotCorrespondences(chain, status=None,
								ignoreCache=False, pdbID=None):
	"""can raise CancelOperation"""

	from chimera import LimitationError
	try:
		chain.molecule.pdbHeaders["SEQRES"]
	except (AttributeError, KeyError):
		raise LimitationError("Cannot determine UniProt mapping for"
			" structures without PDB SEQRES records.  [Required by"
			u" RCSB PDB\N{RIGHTWARDS ARROW}UniProt web service]")
	if pdbID is None:
		try:
			header = chain.molecule.pdbHeaders["HEADER"]
		except (AttributeError, KeyError):
			raise LimitationError("Cannot determine PDB ID code"
				" without PDB HEADER record.  [PDB ID code required by"
				u" RCSB PDB\N{RIGHTWARDS ARROW}UniProt web service]")
		pdbID = header[0].split()[-1]
		if not (len(pdbID) == 4 and pdbID[0].isdigit() and pdbID[1:].isalnum()):
			from chimera import UserError
			raise UserError("HEADER record does not end with PDB ID code")
	chainID = chain.chainID
	if status:
		status("Fetching PDB data for %s, chain %s" % (pdbID, chainID))
	import xml.etree.ElementTree
	try:
		tree = parseUniprotAlignment(pdbID, chainID, ignoreCache=ignoreCache,
				parser=xml.etree.ElementTree.parse)
	finally:
		if status:
			status("")
	seqresIndex = makeSeqresIndex(chain)
	chain.uniprotIDs = uniprotIDs(pdbID, chainID, tree=tree)
	root = tree.getroot()
	ns = root.tag[:root.tag.index('}')+1] # namespace
	alignment = root.find(ns + "alignment")
	allCorrs = {}
	blocks = alignment.findall(ns + "block")
	if blocks:
		def _blockNum(b):
			return int(b.get("blockOrder"))
		blocks.sort(lambda b1, b2: cmp(_blockNum(b1), _blockNum(b2)))
		for block in blocks:
			pdbSeg = uniprotSeg = None
			curPdbLen = 0
			for segment in block.findall(ns + "segment"):
				objID = segment.get("intObjectId")
				if objID[:4].lower() == pdbID.lower():
					start = seqresIndex[segment.get("start")]
					end = seqresIndex[segment.get("end")]
					length = end - start + 1
					pdbSeg = (start, length)
					prev = uniprotSeg
					if curPdbLen + length > len(chain):
						from chimera import NonChimeraError
						raise NonChimeraError("%s: RCSB reports length"
							" greater than available in currently open"
							" structure.\nCheck to make sure you have"
							" the latest version of the PDB entry"
							u" and RCSB PDB\N{RIGHTWARDS ARROW}UniProt"
							" info (the latter by checking the 'Ignore"
							" any cached data' button in the PDB/UniProt"
							" Info dialog)." % chain.name)
					curPdbLen += length
				else:
					start = int(segment.get("start"))
					end = int(segment.get("end"))
					length = end - start + 1
					uniprotSeg = (start, length)
					prev = pdbSeg
					objCorrs = allCorrs.setdefault(objID, [])
				if prev and prev[1] != length:
					raise AssertionError("%s: UniProt/PDB segment"
						" length mismatch" % pdbID)
			if not pdbSeg or not uniprotSeg:
				raise AssertionError("%s: correspondence info"
					" PDB/UniProt missing" % pdbID)
			objCorrs.append((pdbSeg[0], uniprotSeg[0], pdbSeg[1]))
	return alignment.findall(ns + "alignObject"), allCorrs

def makeSeqresIndex(chain):
	"""make a lookup table from a (string) residue identifier to
	   an index into the (SEQRES-based) chain.residues"""
	lookup = {}
	midNones = initialNones = 0
	lastKey = None
	for i, r in enumerate(chain.residues):
		if r:
			key = str(r.id)[:-2]
			lookup[key] = i
			if initialNones and not lastKey:
				nonei = i - 1
				if key[-1].isalpha():
					insert = key[-1]
					while insert > 'A' and initialNones:
						insert = char(ord(insert)-1)
						initialNones -= 1
						lookup[key[:-1] + insert] = nonei
						nonei -= 1
					pos = int(key[:-1])
				else:
					pos = int(key)
				while initialNones:
					pos = pos - 1
					initialNones -= 1
					lookup[str(pos)] = nonei
					nonei -= 1
			lookup[key] = i
			if midNones:
				if key[-1].isalpha():
					pos = int(key[:-1])
				else:
					pos = int(key)
				ni = i
				while midNones:
					midNones -= 1
					pos -= 1
					ni -= 1
					lookup[str(pos)] = ni
		else:
			if not lastKey:
				initialNones += 1
				continue
			midNones += 1
			if lastKey[-1].isalpha():
				pos = int(lastKey[:-1]) + 1
			else:
				pos = int(lastKey) + 1
			key = str(pos)
			lookup[key] = i
		lastKey = key
	return lookup

def parseSifts(pdbCode, parser=None):
	"""can raise CancelOperation"""

	url = "http://www.rcsb.org/pdb/files/%s.sifts.xml.gz" % pdbCode.lower()
	from chimera.fetch import fetch_file
	# can raise CancelOperation
	path, headers = fetch_file(url, "%s SIFTS data" % pdbCode, uncompress=True,
		save_dir="SIFTS", save_name="%s.xml" % pdbCode.lower())
	tree = xmlParsePath(path, parser)
	return tree

def printElementTree(tree, depth=0):
	from xml.etree.ElementTree import ElementTree
	if isinstance(tree, ElementTree):
		node = tree.getroot()
	else:
		node = tree
	baseIndent = " " * (4 * depth)
	def printableTag(node):
		if node.tag.startswith('{'):
			return node.tag[node.tag.index('}')+1:]
		return node.tag
	if node.text and node.text.strip():
		textVal = " = " + node.text.strip()
	else:
		textVal = ""
	print baseIndent + printableTag(node) + textVal
	for k, v in node.items():
		print baseIndent, "  ", k, v
	for kid in node.getchildren():
		print baseIndent, "   --CHILD--"
		printTree(kid, depth+1)

def xmlParsePath(path, parser=None):
	if parser == None:
		import xml.dom.minidom
		parser = xml.dom.minidom.parse

	from OpenSave import osOpen
	f = osOpen(path)
	# would like to use 'with', but GzipFiles don't have __exit__
	try:
		tree = parser(f)
	finally:
		f.close()
	return tree