"""
qtgui/ParseHTMLTextString.py: CCP4MG Molecular Graphics Program
Copyright (C) 2001-2008 University of York, CCLRC
This library is free software: you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
version 3, modified in accordance with the provisions of the
license to address the requirements of UK law.
You should have received a copy of the modified GNU Lesser General
Public License along with this library. If not, copies may be
downloaded from http://www.ccp4.ac.uk/ccp4license.php
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
"""
#from PyQt4 import QtGui, QtCore
from HTMLParser import HTMLParser
import htmlentitydefs
class MyHTMLParser(HTMLParser):
    """Split a fragment of simple HTML into runs of text with font state.

    While the markup is fed in, the parser records two parallel lists:

    * ``fontList`` - one dictionary per font change (see ``addFontChange``),
      appended once at construction and once for every start/end tag other
      than ``<head>``/``</head>``;
    * ``dataList`` - one dictionary per piece of character data, holding the
      parser position (``"pos"``) and the text with newlines removed
      (``"text"``).

    ``getStrings`` merges the two lists into ``{"font": ..., "text": ...}``
    dictionaries.  Character data between ``<head>`` and ``</head>`` is
    ignored.
    """

    # Sizes assigned to the CSS absolute-size keywords (xx-small ... xx-large).
    font_xx_small = 9
    font_x_small = 11
    font_small = 14
    font_medium = 18
    font_large = 22
    font_x_large = 26
    font_xx_large = 30
    # Amount added/subtracted for the CSS keywords "larger"/"smaller".
    font_css_step = 5

    # HTML void elements: they never have an end tag, so they must not take
    # part in the push/pop bookkeeping of fontStack.
    _VOID_TAGS = frozenset((
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr",
    ))

    # Values stored in currentVAlign for the CSS vertical-align keywords.
    _VALIGN_CODES = {"baseline": 0, "sub": 1, "super": 2}

    # Code point -> one-character string; unichr exists on Python 2 only.
    try:
        _unichr = staticmethod(unichr)
    except NameError:
        _unichr = staticmethod(chr)

    def __init__(self, fontSize=12, fontUnits="pt"):
        """Create a parser whose initial font has size *fontSize*.

        *fontUnits* is stored verbatim in every recorded font dictionary; it
        is switched to ``"px"`` when a CSS ``font-size`` in pixels is seen.
        """
        HTMLParser.__init__(self)
        self.lastTagPos = self.getpos()
        self.lastDataPos = self.getpos()
        self.currentColour = "default"
        self.currentFace = "default"
        self.currentSize = fontSize
        self.currentSuperscript = 0
        self.currentSubscript = 0
        self.currentUnderline = 0
        self.currentOverline = 0
        self.currentLinethrough = 0
        self.currentBold = 0
        self.currentItalic = 0
        self.currentFont = 0
        self.fontList = []
        self.fontStack = []
        self.openTag = None
        self.dataList = []
        self.dataLength = 0
        self.allData = ''
        self.currentText = ''
        self.fontUnits = fontUnits
        self.currentMathMarkup = 0
        self.currentVAlign = -1
        self.currentParagraph = -1
        self.ignoring = False
        # The first fontList entry is the initial font; it is also the
        # fallback used by handle_endtag when fontStack is empty.
        self.addFontChange()

    def addDataChange(self):
        """Record ``self.currentText`` at the current parser position.

        Nothing is recorded while ``self.ignoring`` is set.  The stored text
        has newlines removed, whereas ``dataLength`` and ``allData`` are
        updated with the unmodified text.
        """
        if self.ignoring:
            return
        self.dataList.append({
            "pos": self.getpos(),
            "text": self.currentText.replace('\n', ''),
        })
        self.dataLength = self.dataLength + len(self.currentText)
        self.allData = self.allData + self.currentText

    def handle_charref(self, charref):
        """Record a numeric character reference (``&#NNN;`` or ``&#xHH;``).

        Both ``x`` and ``X`` are accepted as the hexadecimal prefix.  A
        reference that is not a valid code point is recorded as its literal
        source text instead of raising.
        """
        self.lastDataPos = self.getpos()
        try:
            if charref[:1] in ("x", "X"):
                codepoint = int(charref[1:], 16)
            else:
                codepoint = int(charref)
            self.currentText = self._unichr(codepoint)
        except (ValueError, OverflowError):
            self.currentText = "&#" + charref + ";"
        self.addDataChange()

    def handle_entityref(self, entity):
        """Record a named entity reference such as ``&sigma;``.

        Names missing from ``htmlentitydefs.name2codepoint`` are recorded as
        the literal text ``&name;`` instead of raising ``KeyError``.
        """
        self.lastDataPos = self.getpos()
        codepoint = htmlentitydefs.name2codepoint.get(entity)
        if codepoint is None:
            self.currentText = "&" + entity + ";"
        else:
            self.currentText = self._unichr(int(codepoint))
        self.addDataChange()

    def handle_data(self, data):
        """Record a piece of plain character data."""
        self.lastDataPos = self.getpos()
        self.currentText = data
        self.addDataChange()

    # handle_startendtag and handle_pi are deliberately not overridden; the
    # HTMLParser defaults are used for both.

    def _fontSizeFromCss(self, val):
        """Return the size for a CSS ``font-size`` value.

        *val* must already be lower-cased with all spaces removed.  Unit
        suffixes are tried in the order mm, cm, in, pc, px, pt, em, %.  If
        the value cannot be parsed the current size is returned unchanged.
        A successfully parsed pixel value also sets ``fontUnits`` to "px".
        """
        current = self.currentSize
        keyword_sizes = {
            'xx-small': self.font_xx_small,
            'x-small': self.font_x_small,
            'small': self.font_small,
            'medium': self.font_medium,
            'large': self.font_large,
            'x-large': self.font_x_large,
            'xx-large': self.font_xx_large,
        }
        if val in keyword_sizes:
            return keyword_sizes[val]
        if val == 'larger':
            return current + self.font_css_step
        if val == 'smaller':
            return current - self.font_css_step

        conversions = (
            ('mm', lambda x: x / 2.54 / 10 * 72),
            ('cm', lambda x: x / 2.54 * 72),
            ('in', lambda x: 72 * x),
            ('pc', lambda x: 12 * x),
            ('px', lambda x: x),
            ('pt', lambda x: x),
            ('em', lambda x: current * x),
            ('%', lambda x: current * x / 100.),
        )
        for unit, convert in conversions:
            idx = val.find(unit)
            if idx > 0:
                try:
                    # float() so that fractional lengths ("10.5pt") work.
                    size = int(convert(float(val[:idx])))
                except (ValueError, OverflowError):
                    return current
                if unit == 'px':
                    self.fontUnits = "px"
                return size
        try:
            return int(val)
        except ValueError:
            return current

    def parse_style_attr(self, attr):
        """Apply the CSS declarations of a ``style`` attribute.

        *attr* is a ``(name, value)`` pair; only ``attr[1]`` is read.  The
        properties vertical-align, color, font-family, font-size, font-style,
        text-decoration and font-weight are understood; everything else,
        including declarations without a colon, is skipped.  Property names
        and keyword values are matched case-insensitively after stripping
        surrounding whitespace.
        """
        style = attr[1]
        if not style:
            # Attribute given without a value (None) or empty.
            return
        for declaration in style.split(';'):
            # partition keeps colons inside the value, e.g. url(http://...).
            prop, colon, raw_value = declaration.partition(':')
            if not colon:
                continue
            key = prop.strip().lower()
            value = raw_value.strip()
            keyword = value.lower()

            if key == "vertical-align":
                if keyword in self._VALIGN_CODES:
                    self.currentVAlign = self._VALIGN_CODES[keyword]
            elif key == "color":
                self.currentColour = value.replace("'", '').replace('"', '')
            elif key == "font-family":
                self.currentFace = value.replace("'", '').replace('"', '')
            elif key == "font-size":
                self.currentSize = self._fontSizeFromCss(
                    keyword.replace(' ', ''))
            elif key == "font-style":
                if keyword in ('italic', 'oblique'):
                    self.currentItalic = 1
                elif keyword == 'normal':
                    self.currentItalic = max(self.currentItalic - 1, 0)
            elif key == "text-decoration":
                if "line-through" in keyword:
                    self.currentLinethrough = 1
                else:
                    self.currentLinethrough = max(
                        self.currentLinethrough - 1, 0)
                if "overline" in keyword:
                    self.currentOverline = 1
                else:
                    self.currentOverline = max(self.currentOverline - 1, 0)
                if "underline" in keyword:
                    self.currentUnderline = 1
                else:
                    self.currentUnderline = max(self.currentUnderline - 1, 0)
            elif key == "font-weight":
                if keyword in ("bold", "bolder"):
                    is_bold = True
                elif keyword in ("normal", "lighter"):
                    is_bold = False
                else:
                    try:
                        is_bold = int(keyword) >= 600
                    except ValueError:
                        # Unrecognised weight: leave the state untouched.
                        continue
                if is_bold:
                    self.currentBold = 1
                else:
                    self.currentBold = max(self.currentBold - 1, 0)

    def _applyFontSizeAttr(self, value):
        """Apply the ``size`` attribute of a ``<font>`` tag.

        A leading ``+`` or ``-`` changes the current size relatively; any
        other value replaces it.  Values that are not integers are ignored.
        """
        try:
            if value[:1] == '-':
                self.currentSize = self.currentSize - int(value[1:])
            elif value[:1] == '+':
                self.currentSize = self.currentSize + int(value[1:])
            else:
                self.currentSize = int(value)
        except ValueError:
            # Malformed size in the markup: keep the current size.
            pass

    def handle_starttag(self, tag, attrs):
        """Update the font state for an opening tag and record the change.

        ``<head>`` only switches data recording off.  Void elements such as
        ``<br>`` record a font change but neither alter the state nor push
        onto ``fontStack``, because no end tag will ever pop them.  For all
        other tags a ``style`` attribute is applied; ``size``, ``color`` and
        ``face`` are applied for ``<font>`` only.  The font in force before
        the tag is pushed onto ``fontStack`` for ``handle_endtag``.
        """
        self.lastTagPos = self.getpos()
        if tag.lower() in self._VOID_TAGS:
            self.addFontChange()
            return

        name = tag.upper()
        if name == "P":
            self.currentParagraph = self.currentParagraph + 1
        if name == "HEAD":
            self.ignoring = True
            return
        if name == "MGMATH":
            self.currentMathMarkup = self.currentMathMarkup + 1
        elif name == "SUP":
            self.currentSuperscript = self.currentSuperscript + 1
        elif name == "SUB":
            self.currentSubscript = self.currentSubscript + 1
        elif name == "B":
            self.currentBold = 1
        elif name == "I":
            self.currentItalic = 1
        elif name == "U":
            self.currentUnderline = 1
        elif name == "FONT":
            self.currentFont = 1

        is_font = name == "FONT"
        for attr in attrs:
            if len(attr) != 2:
                continue
            attr_name = attr[0].upper()
            value = attr[1]
            if value is None:
                # Attribute written without a value.
                continue
            if attr_name == "STYLE":
                self.parse_style_attr(attr)
            elif is_font:
                if attr_name == "SIZE":
                    self._applyFontSizeAttr(value)
                elif attr_name == "COLOR":
                    self.currentColour = value.lower()
                elif attr_name == "FACE":
                    self.currentFace = value.lower()

        # fontList[-1] is still the font recorded before this tag.
        self.fontStack.append(self.fontList[-1])
        self.addFontChange()

    def addFontChange(self):
        """Append a snapshot of the current font state to ``fontList``."""
        self.fontList.append({
            "pos": self.getpos(),
            "size": self.currentSize,
            "colour": self.currentColour,
            "face": self.currentFace,
            "underline": self.currentUnderline,
            "overline": self.currentOverline,
            "linethrough": self.currentLinethrough,
            "bold": self.currentBold,
            "italic": self.currentItalic,
            "subscript": self.currentSubscript,
            "superscript": self.currentSuperscript,
            "mathmarkup": self.currentMathMarkup,
            "fontUnits": self.fontUnits,
            "valign": self.currentVAlign,
            "paragraph": self.currentParagraph,
        })

    def _restoreFont(self, font):
        """Copy the state stored in a font dictionary back onto the parser.

        The paragraph counter is not restored.
        """
        self.currentSize = font["size"]
        self.currentColour = font["colour"]
        self.currentFace = font["face"]
        self.currentUnderline = font["underline"]
        self.currentOverline = font["overline"]
        self.currentLinethrough = font["linethrough"]
        self.currentBold = font["bold"]
        self.currentItalic = font["italic"]
        self.currentSubscript = font["subscript"]
        self.currentSuperscript = font["superscript"]
        self.currentMathMarkup = font["mathmarkup"]
        self.fontUnits = font["fontUnits"]
        self.currentVAlign = font["valign"]

    def handle_endtag(self, tag):
        """Restore the font saved by the matching start tag and record it.

        ``</head>`` only switches data recording back on.  End tags of void
        elements (as produced for ``<br/>``) record a font change without
        touching ``fontStack``.  If ``fontStack`` is empty the initial font
        (``fontList[0]``) is restored.
        """
        self.lastTagPos = self.getpos()
        name = tag.upper()
        if name == "HEAD":
            self.ignoring = False
            return
        if name == "FONT":
            self.currentFont = max(self.currentFont - 1, 0)
        if tag.lower() not in self._VOID_TAGS:
            # Restoring the saved font resets every per-tag attribute
            # (bold, italic, sub/superscript, ...), so no individual
            # decrement is needed here.
            if self.fontStack:
                font = self.fontStack.pop()
            else:
                font = self.fontList[0]
            self._restoreFont(font)
        self.addFontChange()

    def getFontChanges(self):
        """Return the list of recorded font dictionaries."""
        return self.fontList

    def getDataLength(self):
        """Return the total length of the recorded text, newlines included."""
        return self.dataLength

    def getAllData(self):
        """Return all recorded text concatenated, newlines included."""
        return self.allData

    def getData(self):
        """Return the list of recorded ``{"pos", "text"}`` dictionaries."""
        return self.dataList

    def getStrings(self):
        """Pair the recorded text with the font in force where it occurs.

        Returns a list of ``{"font": font_dict, "text": string}``
        dictionaries.  For every font change except the last there is exactly
        one entry, holding the concatenated text (possibly empty) located at
        or before the position of the next font change.  Data after the last
        font change yields one entry per data item with the last font, unless
        the most recent tag lies after the most recent data, in which case
        the remaining data is dropped.
        """
        changes = self.getFontChanges()
        data = self.getData()
        fontedStrings = []
        total = len(data)
        cursor = 0  # index of the first data item not yet assigned

        for j in range(len(changes) - 1):
            boundary = tuple(changes[j + 1]["pos"])
            pieces = []
            # (line, offset) tuples compare lexicographically.
            while cursor < total and tuple(data[cursor]["pos"]) <= boundary:
                pieces.append(data[cursor]["text"])
                cursor += 1
            fontedStrings.append({"font": changes[j],
                                  "text": "".join(pieces)})

        ends_with_tag = tuple(self.lastTagPos) > tuple(self.lastDataPos)
        if not ends_with_tag:
            last_font = changes[-1]
            for datum in data[cursor:]:
                fontedStrings.append({"font": last_font,
                                      "text": datum["text"]})
        return fontedStrings
if __name__ == '__main__':
    # Manual smoke test: parse one sample label and print every styled run.
    # Other inputs that have been used here:
    #   """hello <<< <
    #   world ÞÞ"""
    #   """helloworld"""
    #   "/1/J/123(PRO)/CA [1] (-12.345) The σσ Waft from the flashy fish."
    sample = "/1/J/123(PRO)/CA [1] (-12.345) The σσ hello"
    demo = MyHTMLParser()
    demo.feed(sample)
    demo.close()
    for run in demo.getStrings():
        print(run)