#!/usr/local/bin/python
# Copyright (c) 2000 by the Regents of the University of California.
# All rights reserved.  See http://www.cgl.ucsf.edu/chimera/ for
# license details.
#
# $Id: oslLexer.py 40867 2016-01-07 01:16:55Z pett $

import string
import re

#
# Lexical analyzer modes (determines whether some special characters are
# treated as qualifier elements or just normal characters)
#
ModeAbbr = 1
ModeQual = 2
ModeValue = 3

#
# Token type
#
TypeKey = 'key'
TypeAbbr = 'abbreviation'
TypeQual = 'qualifier'
TypeIdent = 'identifier'
TypeOp = 'operator'
TypeEnd = 'end of input'

#
# Operator type
#
OpLP = '('
OpRP = ')'
OpLB = '['
OpRB = ']'
OpMatch = '~'
OpNotMatch = '!~'
OpEQ1 = '='  # case independent
OpEQ2 = '==' # case dependent
OpNE = '!='
OpLT = '<'
OpLE = '<='
OpGT = '>'
OpGE = '>='
OpNot = '!'
OpUndef = '^'
OpAnd = 'and'
OpOr = 'or'

#
# Key type
#
KeyGraph = 'graph'
KeySubgraph = 'subgraph'
KeyVertex = 'vertex'
KeyEdge = 'edge'

#
# Various regular expressions for determining end of sequence of characters
#
white = re.compile('[' + string.whitespace + ']')
nonWhite = re.compile('[^' + string.whitespace + ']')
endAbbr = re.compile('[' + '#:@/]')
endQual = re.compile('[^a-zA-Z0-9_]')
endValue = white

#
# nextToken
#	Skip over whitespace and return a token, which is a dictionary
#	with at least three attributes 'type', 'start' and 'end'.  'start'
#	and 'end' are the indices for the start and end of the token.
#	Depending on the value of 'type', there may be other attributes:
#	  'type' == 'key'
#		'key' is the type of key
#	  'type' == 'abbreviation'
#		'string' is the actual abbreviation string
#	  'type' == 'qualifier'
#		none
#	  'type' == 'identifier'
#		'string is the actual identifier string
#	  'type' == 'operator'
#		'operator' is the operator type
#	  'type' == 'end of input'
#		none
#
def nextToken(s, start, end, mode):
	#
	# Skip over leading whitespace and see if we reached end of input
	#
	m = nonWhite.search(s, start)
	n = m is None and -1 or m.start()
	if n < 0:
		return {'type':TypeEnd, 'start':n, 'end':n}, n

	#
	# Check for key or qualifier symbol
	#
	last = n
	token = {'start':n}
	if s[n] == '#':				# Check for graph key
		token['type'] = TypeKey
		token['key'] = KeyGraph
		last = n + 1
	elif s[n] == ':':			# Check for subgraph key
		token['type'] = TypeKey
		token['key'] = KeySubgraph
		last = n + 1
	elif s[n] == '@':			# Check for vertex key
		token['type'] = TypeKey
		token['key'] = KeyVertex
		last = n + 1
	elif s[n] == '/':			# Check for qualifier
		token['type'] = TypeQual
		last = n + 1
	if last > n:				# Check if any of above found
		token['end'] = last
		return token, last

	#
	# Check for abbreviation, identifier or operators
	#
	if mode == ModeAbbr:
		#
		# In abbreviation mode, we treat the subsequent characters
		# up to a whitespace or key/qualifier symbol as the
		# abbreviation string
		# Quoted strings are handled specially
		#
		if s[n] == '"':
			return lexString(s, n, end, TypeAbbr)
		m = endAbbr.search(s, n + 1)
		last = m is None and -1 or m.start()
		token['type'] = TypeAbbr
		if last < 0:
			token['string'] = s[n:]
		else:
			token['string'] = s[n:last]
	elif mode == ModeQual:			# Qualifer tests mode
		#
		# In qualifier mode, a quoted string is treated as an
		# identifier.  Otherwise, we look for operators first
		# and, if none is found, then assume we got an identifier
		#
		if s[n] == '"':
			return lexString(s, n, end, TypeIdent)
		elif s[n] == '(':
			token['type'] = TypeOp
			token['operator'] = OpLP
			last = n + 1
		elif s[n] == ')':
			token['type'] = TypeOp
			token['operator'] = OpRP
			last = n + 1
		elif s[n] == '[':
			token['type'] = TypeOp
			token['operator'] = OpLB
			last = n + 1
		elif s[n] == ']':
			token['type'] = TypeOp
			token['operator'] = OpRB
			last = n + 1
		elif s[n] == '~':
			token['type'] = TypeOp
			token['operator'] = OpMatch
			last = n + 1
		elif s[n] == '=':
			token['type'] = TypeOp
			if s[n + 1] == '=':
				token['operator'] = OpEQ2
				last = n + 2
			else:
				token['operator'] = OpEQ1
				last = n + 1
		elif s[n] == '!':
			token['type'] = TypeOp
			if s[n + 1] == '=':
				token['operator'] = OpNE
				last = n + 2
			elif s[n + 1] == '~':
				token['operator'] = OpNotMatch
				last = n + 2
			else:
				token['operator'] = OpNot
				last = n + 1
		elif s[n] == '<':
			token['type'] = TypeOp
			if s[n + 1] == '=':
				token['operator'] = OpLE
				last = n + 2
			else:
				token['operator'] = OpLT
				last = n + 1
		elif s[n] == '>':
			token['type'] = TypeOp
			if s[n + 1] == '=':
				token['operator'] = OpGE
				last = n + 2
			else:
				token['operator'] = OpGT
				last = n + 1
		elif s[n] == '^':
			token['type'] = TypeOp
			token['operator'] = OpUndef
			last = n + 1
		else:
			#
			# No operators found.  Must be either an identifier
			# or the conjunctions "and" or "or"
			#
			# _or_ a syntax error
			#
			m = endQual.search(s, n)
			if m and m.start() == n:
				raise SyntaxError, (
				'illegal symbol in identifier', ('', 1, n, s))
			last = m is None and -1 or m.start()
			if last == -1:
				str = s[n:]
			else:
				str = s[n:last]
			if str == 'and':
				token['type'] = TypeOp
				token['operator'] = OpAnd
			elif str == 'or':
				token['type'] = TypeOp
				token['operator'] = OpOr
			else:
				token['type'] = TypeIdent
				token['string'] = str
	elif mode == ModeValue:			# Qualifer value mode
		if s[n] == '"':
			return lexString(s, n, end, TypeIdent)
		m = endValue.search(s, n + 1)
		last = m is None and -1 or m.start()
		if last == -1:
			str = s[n:]
		else:
			str = s[n:last]
		token['type'] = TypeIdent
		token['string'] = str
	else:					# Unknown mode (how?)
		raise ValueError, ('unknown osl lexer mode ', mode)
	if last == -1:
		last = end
	token['end'] = last
	return token, last

#
# lexString
#	Grab a quoted string from the input
#
def lexString(s, start, end, type):
	token = {'type':type, 'start':start}
	last = start + 1
	escape = 0
	while last < end:
		if escape:
			escape = 0
		elif s[last] == '\\':
			escape = 1
		elif s[last] == '"':
			break
		last = last + 1
	if escape or last >= end:
		raise SyntaxError, ('unterminated string', ('', 1, start, s))
	last = last + 1
	str = s[start:last]
	token['string'] = eval(str)
	token['end'] = last
	return token, last

#
# Test code
#
if __name__ == '__main__':
	def testString(s, mode):
		print 'lexing', repr(s)
		start = 0
		end = len(s)
		while start != -1:
			token, start = nextToken(s, start, end, mode)
			print token
	testString('#0@ca #1@cb', ModeAbbr)
	testString('#0@"ca" #1@cb', ModeAbbr)
	testString('#0@"c\\"a" #1@cb', ModeAbbr)
	testString('#/number=0@/name=ca', ModeQual)