""" qtgui/ParseHTMLTextString.py: CCP4MG Molecular Graphics Program Copyright (C) 2001-2008 University of York, CCLRC This library is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 3, modified in accordance with the provisions of the license to address the requirements of UK law. You should have received a copy of the modified GNU Lesser General Public License along with this library. If not, copies may be downloaded from http://www.ccp4.ac.uk/ccp4license.php This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. """ #from PyQt4 import QtGui, QtCore from HTMLParser import HTMLParser import htmlentitydefs class MyHTMLParser(HTMLParser): font_xx_small = 9 font_x_small = 11 font_small = 14 font_medium = 18 font_large = 22 font_x_large = 26 font_xx_large = 30 font_css_step = 5 def __init__(self,fontSize=12,fontUnits="pt"): HTMLParser.__init__(self) self.lastTagPos = self.getpos() self.lastDataPos = self.getpos() self.currentColour = "default" self.currentFace = "default" self.currentSize = fontSize self.currentSuperscript = 0 self.currentSubscript = 0 self.currentUnderline = 0 self.currentOverline = 0 self.currentLinethrough = 0 self.currentBold = 0 self.currentItalic = 0 self.currentFont = 0 self.fontList = [] self.fontStack = [] self.openTag = None self.dataList = [] self.dataLength = 0 self.allData = '' self.currentText = '' self.fontUnits = fontUnits self.currentMathMarkup = 0 self.currentVAlign = -1 self.currentParagraph =-1 self.addFontChange() self.ignoring = False def addDataChange(self): if self.ignoring: return data = {} data["pos"] = self.getpos() data["text"] = self.currentText.replace('\n','') self.dataList.append(data) self.dataLength = self.dataLength + len(self.currentText) self.allData = self.allData+self.currentText def handle_charref(self, charref): self.lastDataPos = self.getpos(); if charref[0]=="x": self.currentText = unichr(int("0"+charref,16)) else: self.currentText = unichr(int(charref)) self.addDataChange() def handle_entityref(self, entity): self.lastDataPos = self.getpos(); self.currentText = unichr(int(htmlentitydefs.name2codepoint[entity])) self.addDataChange() def handle_data(self, data): self.lastDataPos = self.getpos(); self.currentText = data self.addDataChange() """ def handle_startendtag(self, tag, attrs): self.lastTagPos = self.getpos() print "Encountered empty %s ,%s tag" % (tag, attrs)) """ def parse_style_attr(self, attr): for element in attr[1].split(';'): if not element.isspace() and len(element)>0: keyp,val = element.split(':') key = keyp.strip() if key=="vertical-align": if val=="baseline": self.currentVAlign = 0 if val=="sub": self.currentVAlign = 1 if val=="super": self.currentVAlign = 2 if key=="color": self.currentColour=val.strip().replace("'",'').replace('"','') if key=="font-family": self.currentFace=val.strip().replace("'",'').replace('"','') if key=="font-size": val = val.strip().replace(' ','') if val=='xx-small': size = self.font_xx_small elif val=='x-small': size = self.font_x_small elif val=='small': size = self.font_small elif val=='medium': size = self.font_medium elif val=='large': size = self.font_large elif val=='x-large': size = self.font_x_large elif val=='xx-large': size = self.font_xx_large elif val=='larger': size = self.currentSize + self.font_css_step elif val=='smaller': size = self.currentSize - self.font_css_step elif val.find('mm')>0: try: size = int(float(val[:val.find('mm')])/2.54/10*72) except: size = self.currentSize elif val.find('cm')>0: try: size = int(float(val[:val.find('cm')])/2.54*72) except: size = self.currentSize elif val.find('in')>0: try: size = 72*int(val[:val.find('in')]) except: size = self.currentSize elif val.find('pc')>0: try: size = 12*int(val[:val.find('pc')]) except: size = self.currentSize elif val.find('px')>0: try: size = int(val[:val.find('px')]) self.fontUnits = "px" except: size = self.currentSize elif val.find('pt')>0: try: size = int(val[:val.find('pt')]) except: size = self.currentSize elif val.find('em')>0: try: size = int(self.currentSize*float(val[:val.find('em')])) except: size = self.currentSize elif val.find('%')>0: try: size = int(self.currentSize*float(val[:val.find('%')])/100.) except: size = self.currentSize else: try: size = int(val) except: size = self.currentSize self.currentSize = size if key=="font-style": if val=='italic' or val=='oblique': self.currentItalic = 1 if val=='normal': self.currentItalic = max(self.currentItalic - 1,0) if key=="text-decoration": if val.find("line-through")>-1: self.currentLinethrough = 1 else: self.currentLinethrough = max(self.currentLinethrough-1,0) if val.find("overline")>-1: self.currentOverline = 1 else: self.currentOverline = max(self.currentOverline-1,0) if val.find("underline")>-1: self.currentUnderline = 1 else: self.currentUnderline = max(self.currentUnderline-1,0) if key=="font-variant": pass if key=="font-weight": if int(val)>=600: self.currentBold = 1 else: self.currentBold = max(self.currentBold - 1,0) def handle_starttag(self, tag, attrs): self.lastTagPos = self.getpos() if tag.upper() == "P": self.currentParagraph = self.currentParagraph + 1 if tag.upper() == "HEAD": self.ignoring = True return if tag.upper() == "MGMATH": self.currentMathMarkup = self.currentMathMarkup + 1 if tag.upper() == "SUP": self.currentSuperscript = self.currentSuperscript + 1 if tag.upper() == "SUB": self.currentSubscript = self.currentSubscript + 1 if tag.upper() == "B": self.currentBold = 1 if tag.upper() == "I": self.currentItalic = 1 if tag.upper() == "U": self.currentUnderline = 1 if tag.upper() == "FONT": self.currentFont = 1 # This stuff needs to go outside the FONT TAG as it is utterly generic for attr in attrs: if len(attr)==2: if attr[0].upper()=="STYLE": self.parse_style_attr(attr) if attr[0].upper()=="SIZE": if attr[1][0]=='-': self.currentSize = self.currentSize - int(attr[1][1:]) elif attr[1][0]=='+': self.currentSize = self.currentSize + int(attr[1][1:]) else: self.currentSize = int(attr[1]) currentColour=attr[1].lower() if attr[0].upper()=="COLOR": self.currentColour=attr[1].lower() if attr[0].upper()=="FACE": self.currentFace=attr[1].lower() else: for attr in attrs: if len(attr)==2: if attr[0].upper()=="STYLE": self.parse_style_attr(attr) self.fontStack.append(self.fontList[len(self.fontList)-1]) self.addFontChange() def addFontChange(self): font = {} font["pos"] = self.getpos() font["size"] = self.currentSize font["colour"] = self.currentColour font["face"] = self.currentFace font["underline"] = self.currentUnderline font["overline"] = self.currentOverline font["linethrough"] = self.currentLinethrough font["bold"] = self.currentBold font["italic"] = self.currentItalic font["subscript"] = self.currentSubscript font["superscript"] = self.currentSuperscript font["mathmarkup"] = self.currentMathMarkup font["fontUnits"] = self.fontUnits font["valign"] = self.currentVAlign font["paragraph"] = self.currentParagraph self.fontList.append(font) """ def handle_pi(self, data): self.lastTagPos = self.getpos() print "Encountered instruction %s" % (data) """ def handle_endtag(self, tag): #print "Encountered the end of a %s tag" % (tag) self.lastTagPos = self.getpos() if tag.upper() == "HEAD": self.ignoring = False return if tag.upper() == "MGMATH": self.currentMathMarkup = max(self.currentMathMarkup - 1,0) if tag.upper() == "SUP": self.currentSuperscript = max(self.currentSuperscript - 1,0) if tag.upper() == "SUB": self.currentSubscript = max(self.currentSubscript - 1,0) if tag.upper() == "B": self.currentBold = max(self.currentBold - 1,0) if tag.upper() == "I": self.currentItalic = max(self.currentItalic - 1,0) if tag.upper() == "U": self.currentUnderline = max(self.currentUnderline - 1,0) if tag.upper() == "FONT": self.currentFont = max(self.currentFont - 1,0) if len(self.fontStack) < 1: font = self.fontList[0] else: font = self.fontStack.pop() self.currentSize = font["size"] self.currentColour = font["colour"] self.currentFace = font["face"] self.currentUnderline = font["underline"] self.currentOverline = font["overline"] self.currentLinethrough = font["linethrough"] self.currentBold = font["bold"] self.currentItalic = font["italic"] self.currentSubscript = font["subscript"] self.currentSuperscript = font["superscript"] self.currentMathMarkup = font["mathmarkup"] self.fontUnits = font["fontUnits"] self.currentVAlign = font["valign"] self.addFontChange() def getFontChanges(self): return self.fontList def getDataLength(self): return self.dataLength def getAllData(self): return self.allData def getData(self): return self.dataList def getStrings(self): changes = self.getFontChanges() data = self.getData() #print self.getpos() fontedStrings = [] left_data = data[:] #print "left_data 0", left_data for j in range(len(changes)-1): cpos = changes[j+1]["pos"] this_data = "" for i,datum in zip(range(len(left_data)), left_data[:]): pos = datum["pos"] #print cpos,pos if pos[0]>cpos[0] or (pos[0]==cpos[0] and pos[1] > cpos[1]): #print pos,"is beyond",cpos left_data = left_data[i:] break else: this_data = this_data+datum["text"] if i == len(left_data)-1: left_data = [] fontedString = {} fontedString["font"] = changes[j] fontedString["text"] = this_data fontedStrings.append(fontedString) if self.lastTagPos[0] > self.lastDataPos[0] or (self.lastTagPos[0] == self.lastDataPos[0] and self.lastTagPos[1] > self.lastDataPos[1]): #print "End with a tag" left_data = [] if len(left_data)>0: #print "left_data final",left_data for lf in left_data: fontedString = {} fontedString["font"] = changes[len(changes)-1] fontedString["text"] = lf["text"] fontedStrings.append(fontedString) return fontedStrings if __name__ == '__main__': parser = MyHTMLParser() text = """hello <<< <
world ÞÞ""" #text = """helloworld""" #text = "/1/J/123(PRO)/CA [1] (-12.345) The σσ Waft from the flashy fish." text = "/1/J/123(PRO)/CA [1] (-12.345) The σσ hello" parser.feed(text) parser.close() #print parser.getData() fontedStrings = parser.getStrings() for fstring in fontedStrings: print fstring