#!/usr/bin/env python # # qtfmt.py v1.10 # v1.10 : Updated to use Python 2.0 Unicode type. # # Read a document in the quotation DTD, converting it to a list of Quotation # objects. The list can then be output in several formats. __doc__ = """Usage: qtfmt.py [options] file1.xml file2.xml ... If no filenames are provided, standard input will be read. Available options: -f or --fortune Produce output for the fortune(1) program -h or --html Produce HTML output -t or --text Produce plain text output -m N or --max N Suppress quotations longer than N lines; defaults to 0, which suppresses no quotations at all. """ import string, re, cgi, types import codecs from xml.sax import saxlib, saxexts def simplify(t, indent="", width=79): """Strip out redundant spaces, and insert newlines to wrap the text at the given width.""" t = string.strip(t) t = re.sub('\s+', " ", t) if t=="": return t t = indent + t t2 = "" while len(t) > width: index = string.rfind(t, ' ', 0, width) if index == -1: t2 = t2 + t[:width] ; t = t[width:] else: t2 = t2 + t[:index] ; t = t[index+1:] t2 = t2 + '\n' return t2 + t class Quotation: """Encapsulates a single quotation. Attributes: stack -- used during construction and then deleted text -- A list of Text() instances, or subclasses of Text(), containing the text of the quotation. source -- A list of Text() instances, or subclasses of Text(), containing the source of the quotation. (Optional) author -- A list of Text() instances, or subclasses of Text(), containing the author of the quotation. (Optional) Methods: as_fortune() -- return the quotation formatted for fortune as_html() -- return an HTML version of the quotation as_text() -- return a plain text version of the quotation """ def __init__(self): self.stack = [ Text() ] self.text = [] def as_text(self): "Convert instance into a pure text form" output = "" def flatten(textobj): "Flatten a list of subclasses of Text into a list of paragraphs" if type(textobj) != types.ListType: textlist=[textobj] else: textlist = textobj paragraph = "" ; paralist = [] for t in textlist: if (isinstance(t, PreformattedText) or isinstance(t, CodeFormattedText) ): paralist.append(paragraph) paragraph = "" paralist.append(t) elif isinstance(t, Break): paragraph = paragraph + t.as_text() paralist.append(paragraph) paragraph = "" else: paragraph = paragraph + t.as_text() paralist.append(paragraph) return paralist # Flatten the list of instances into a list of paragraphs paralist = flatten(self.text) if len(paralist) > 1: indent = 2*" " else: indent = "" for para in paralist: if isinstance(para, PreformattedText) or isinstance(para, CodeFormattedText): output = output + para.as_text() else: output = output + simplify(para, indent) + '\n' attr = "" for i in ['author', 'source']: if hasattr(self, i): paralist = flatten(getattr(self, i)) text = string.join(paralist) if attr: attr = attr + ', ' text = string.lower(text[:1]) + text[1:] attr = attr + text attr=simplify(attr, width = 79 - 4 - 3) if attr: output = output + ' -- '+re.sub('\n', '\n ', attr) return output + '\n' def as_fortune(self): return self.as_text() + '%' def as_html(self): output = "

" def flatten(textobj): if type(textobj) != types.ListType: textlist = [textobj] else: textlist = textobj paragraph = "" ; paralist = [] for t in textlist: paragraph = paragraph + t.as_html() if isinstance(t, Break): paralist.append(paragraph) paragraph = "" paralist.append(paragraph) return paralist paralist = flatten(self.text) for para in paralist: output = output + string.strip(para) + '\n' attr = "" for i in ['author', 'source']: if hasattr(self, i): paralist = flatten(getattr(self, i)) text = string.join(paralist) attr=attr + ('

' % i) + string.strip(text) return output + attr # Text and its subclasses are used to hold chunks of text; instances # know how to display themselves as plain text or as HTML. class Text: "Plain text" def __init__(self, text=""): self.text = text # We need to allow adding a string to Text instances. def __add__(self, val): newtext = self.text + str(val) # __class__ must be used so subclasses create instances of themselves. return self.__class__(newtext) def __str__(self): return self.text def __repr__(self): s = string.strip(self.text) if len(s) > 15: s = s[0:15] + '...' return '<%s: "%s">' % (self.__class__.__name__, s) def as_text(self): return self.text def as_html(self): return cgi.escape(self.text) class PreformattedText(Text): "Text inside

...
" def as_text(self): return str(self.text) def as_html(self): return '
' + cgi.escape(str(self.text)) + '
' class CodeFormattedText(Text): "Text inside ..." def as_text(self): return str(self.text) def as_html(self): return '' + cgi.escape(str(self.text)) + '' class CitedText(Text): "Text inside ..." def as_text(self): return '_' + simplify(str(self.text)) + '_' def as_html(self): return '' + string.strip(cgi.escape(str(self.text))) + '' class ForeignText(Text): "Foreign words, from Latin or French or whatever." def as_text(self): return '_' + simplify(str(self.text)) + '_' def as_html(self): return '' + string.strip(cgi.escape(str(self.text))) + '' class EmphasizedText(Text): "Text inside ..." def as_text(self): return '*' + simplify(str(self.text)) + '*' def as_html(self): return '' + string.strip(cgi.escape(str(self.text))) + '' class Break(Text): def as_text(self): return "" def as_html(self): return "

" # The QuotationDocHandler class is a SAX handler class that will # convert a marked-up document using the quotations DTD into a list of # quotation objects. class QuotationDocHandler(saxlib.HandlerBase): def __init__(self, process_func): self.process_func = process_func self.newqt = None # Errors should be signaled, so we'll output a message and raise # the exception to stop processing def fatalError(self, exception): sys.stderr.write('ERROR: '+ str(exception)+'\n') sys.exit(1) error = fatalError warning = fatalError def characters(self, ch, start, length): if self.newqt != None: s = ch[start:start+length] # Undo the UTF-8 encoding, converting to ISO Latin1, which # is the default character set used for HTML. latin1_encode = codecs.lookup('iso-8859-1') [0] unicode_str = s s, consumed = latin1_encode( unicode_str ) assert consumed == len( unicode_str ) self.newqt.stack[-1] = self.newqt.stack[-1] + s def startDocument(self): self.quote_list = [] def startElement(self, name, attrs): methname = 'start_'+str(name) if hasattr(self, methname): method = getattr(self, methname) method(attrs) else: sys.stderr.write('unknown start tag: <' + name + ' ') for name, value in attrs.items(): sys.stderr.write(name + '=' + '"' + value + '" ') sys.stderr.write('>\n') def endElement(self, name): methname = 'end_'+str(name) if hasattr(self, methname): method = getattr(self, methname) method() else: sys.stderr.write('unknown end tag: \n') # There's nothing to be done for the tag def start_quotations(self, attrs): pass def end_quotations(self): pass def start_quotation(self, attrs): if self.newqt == None: self.newqt = Quotation() def end_quotation(self): st = self.newqt.stack for i in range(len(st)): if type(st[i]) == types.StringType: st[i] = Text(st[i]) self.newqt.text=self.newqt.text + st del self.newqt.stack if self.process_func: self.process_func(self.newqt) else: print "Completed quotation\n ", self.newqt.__dict__ self.newqt=Quotation() # Attributes of a quotation: ... and ... def start_author(self, data): # Add the current contents of the stack to the text of the quotation self.newqt.text = self.newqt.text + self.newqt.stack # Reset the stack self.newqt.stack = [ Text() ] def end_author(self): # Set the author attribute to contents of the stack; you can't # have more than one tag per quotation. self.newqt.author = self.newqt.stack # Reset the stack for more text. self.newqt.stack = [ Text() ] # The code for the tag is exactly parallel to that for def start_source(self, data): self.newqt.text = self.newqt.text + self.newqt.stack self.newqt.stack = [ Text() ] def end_source(self): self.newqt.source = self.newqt.stack self.newqt.stack = [ Text() ] # Text markups:
for breaks,

...
for preformatted # text, ... for emphasis, ... for citations. def start_br(self, data): # Add a Break instance, and a new Text instance. self.newqt.stack.append(Break()) self.newqt.stack.append( Text() ) def end_br(self): pass def start_pre(self, data): self.newqt.stack.append( Text() ) def end_pre(self): self.newqt.stack[-1] = PreformattedText(self.newqt.stack[-1]) self.newqt.stack.append( Text() ) def start_code(self, data): self.newqt.stack.append( Text() ) def end_code(self): self.newqt.stack[-1] = CodeFormattedText(self.newqt.stack[-1]) self.newqt.stack.append( Text() ) def start_em(self, data): self.newqt.stack.append( Text() ) def end_em(self): self.newqt.stack[-1] = EmphasizedText(self.newqt.stack[-1]) self.newqt.stack.append( Text() ) def start_cite(self, data): self.newqt.stack.append( Text() ) def end_cite(self): self.newqt.stack[-1] = CitedText(self.newqt.stack[-1]) self.newqt.stack.append( Text() ) def start_foreign(self, data): self.newqt.stack.append( Text() ) def end_foreign(self): self.newqt.stack[-1] = ForeignText(self.newqt.stack[-1]) self.newqt.stack.append( Text() ) if __name__ == '__main__': import sys, getopt # Process the command-line arguments opts, args = getopt.getopt(sys.argv[1:], 'fthm:r', ['fortune', 'text', 'html', 'max=', 'help', 'randomize'] ) # Set defaults maxlength = 0 ; method = 'as_fortune' randomize = 0 # Process arguments for opt, arg in opts: if opt in ['-f', '--fortune']: method='as_fortune' elif opt in ['-t', '--text']: method = 'as_text' elif opt in ['-h', '--html']: method = 'as_html' elif opt in ['-m', '--max']: maxlength = string.atoi(arg) elif opt in ['-r', '--randomize']: randomize = 1 elif opt == '--help': print __doc__ ; sys.exit(0) # This function will simply output each quotation by calling the # desired method, as long as it's not suppressed by a setting of # --max. qtlist = [] def process_func(qt, qtlist=qtlist, maxlength=maxlength, method=method): func = getattr(qt, method) output = func() length = string.count(output, '\n') if maxlength!=0 and length > maxlength: return qtlist.append(output) # Loop over the input files; use sys.stdin if no files are specified if len(args) == 0: args = [sys.stdin] for file in args: if type(file) == types.StringType: input = open(file, 'r') else: input = file # Enforce the use of the Expat parser, because the code needs to be # sure that the output will be UTF-8 encoded. p=saxexts.XMLParserFactory.make_parser(["xml.sax.drivers.drv_pyexpat"]) dh = QuotationDocHandler(process_func) p.setDocumentHandler(dh) p.setErrorHandler(dh) p.parseFile(input) if type(file) == types.StringType: input.close() p.close() # Randomize the order of the quotations if randomize: import whrandom q2 = [] for i in range(len(qtlist)): qt = whrandom.randint(0,len(qtlist)-1 ) q2.append( qtlist[qt] ) qtlist[qt:qt+1] = [] assert len(qtlist) == 0 qtlist = q2 for quote in qtlist: print quote # We're done!