#!/usr/bin/env python
#
# qtfmt.py v1.10
# v1.10 : Updated to use Python 2.0 Unicode type.
#
# Read a document in the quotation DTD, converting it to a list of Quotation
# objects.  The list can then be output in several formats.

__doc__ = """Usage: qtfmt.py [options] file1.xml file2.xml ...
If no filenames are provided, standard input will be read.
Available options:
  -f or --fortune   Produce output for the fortune(1) program
  -h or --html      Produce HTML output
  -t or --text      Produce plain text output
  -m N or --max N   Suppress quotations longer than N lines;
                    defaults to 0, which suppresses no quotations at all.
"""

import string, re, cgi, types
import codecs
from xml.sax import saxlib, saxexts


def simplify(t, indent="", width=79):
    """Strip out redundant spaces, and insert newlines to wrap the text
    at the given width.

    t      -- the text to normalize; leading/trailing whitespace is removed
              and every internal run of whitespace becomes one space.
    indent -- string prepended once, to the start of the text only
              (continuation lines are not indented).
    width  -- maximum line length.  Lines are broken at the last space
              found before this column; a stretch containing no space is
              split hard after `width` characters.

    Returns the wrapped text, or "" when t is blank.  Raises ValueError for
    a width below 1, which previously made the wrapping loop spin forever.
    """
    t = re.sub(r'\s+', " ", t.strip())
    if t == "":
        return t
    if width < 1:
        raise ValueError("width must be at least 1")

    t = indent + t
    lines = []
    while len(t) > width:
        index = t.rfind(' ', 0, width)
        if index == -1:
            # No space to break at: cut the line at exactly `width`.
            lines.append(t[:width])
            t = t[width:]
        else:
            # Break at the space and drop it.
            lines.append(t[:index])
            t = t[index + 1:]
    lines.append(t)
    return '\n'.join(lines)


class Quotation:
    """Encapsulates a single quotation.

    Attributes:
    stack -- used during construction and then deleted
    text -- A list of Text() instances, or subclasses of Text(),
            containing the text of the quotation.
    source -- A list of Text() instances, or subclasses of Text(),
            containing the source of the quotation.  (Optional)
    author -- A list of Text() instances, or subclasses of Text(),
            containing the author of the quotation.  (Optional)

    Methods:
    as_fortune() -- return the quotation formatted for fortune
    as_html() -- return an HTML version of the quotation
    as_text() -- return a plain text version of the quotation
    """

    def __init__(self):
        self.stack = [Text()]
        self.text = []

    def as_text(self):
        "Convert instance into a pure text form"
        output = ""

        def flatten(textobj):
            "Flatten a list of subclasses of Text into a list of paragraphs"
            if isinstance(textobj, list):
                textlist = textobj
            else:
                textlist = [textobj]

            paragraph = ""
            paralist = []
            for t in textlist:
                if isinstance(t, (PreformattedText, CodeFormattedText)):
                    # Preformatted chunks stay as objects so that they are
                    # not re-wrapped below; close the running paragraph.
                    paralist.append(paragraph)
                    paragraph = ""
                    paralist.append(t)
                elif isinstance(t, Break):
                    paragraph = paragraph + t.as_text()
                    paralist.append(paragraph)
                    paragraph = ""
                else:
                    paragraph = paragraph + t.as_text()
            paralist.append(paragraph)
            return paralist

        # Flatten the list of instances into a list of paragraphs
        paralist = flatten(self.text)
        # Paragraphs are only indented when there is more than one of them.
        if len(paralist) > 1:
            indent = 2 * " "
        else:
            indent = ""

        for para in paralist:
            if isinstance(para, (PreformattedText, CodeFormattedText)):
                output = output + para.as_text()
            else:
                output = output + simplify(para, indent) + '\n'

        # Build the attribution from whichever of author/source is present.
        attr = ""
        for i in ['author', 'source']:
            if hasattr(self, i):
                paralist = flatten(getattr(self, i))
                text = ' '.join(paralist)
                if attr:
                    # Second item: join with a comma and lower-case its
                    # first letter so it reads as a continuation.
                    attr = attr + ', '
                    text = text[:1].lower() + text[1:]
                attr = attr + text

        attr = simplify(attr, width=79 - 4 - 3)
        # NOTE(review): the width arithmetic above reserves 4 + 3 columns,
        # which suggests runs of spaces inside the two literals below were
        # collapsed in this copy of the file -- confirm against upstream.
        if attr:
            output = output + ' -- ' + attr.replace('\n', '\n ')
        return output + '\n'

    def as_fortune(self):
        "Return the plain text form followed by fortune's '%' separator."
        return self.as_text() + '%'

    def as_html(self):
        "Return the quotation, then any author/source, in HTML form."
        # NOTE(review): this literal, and the attribution separator below,
        # contain only a newline in this copy of the file; HTML markup
        # appears to have been lost from them -- confirm against upstream.
        output = "\n"

        def flatten(textobj):
            "Concatenate the HTML of each item, splitting at Break items"
            if isinstance(textobj, list):
                textlist = textobj
            else:
                textlist = [textobj]

            paragraph = ""
            paralist = []
            for t in textlist:
                paragraph = paragraph + t.as_html()
                if isinstance(t, Break):
                    paralist.append(paragraph)
                    paragraph = ""
            paralist.append(paragraph)
            return paralist

        paralist = flatten(self.text)
        for para in paralist:
            output = output + para.strip() + '\n'

        attr = ""
        for i in ['author', 'source']:
            if hasattr(self, i):
                paralist = flatten(getattr(self, i))
                text = ' '.join(paralist)
                # The original applied "% i" to this literal, but it holds
                # no format specifier, so that raised TypeError as soon as
                # an author or source was present.  The interpolation is
                # dropped until the intended markup can be restored.
                attr = attr + '\n' + text.strip()
        return output + attr


# Text and its subclasses are used to hold chunks of text; instances
# know how to display themselves as plain text or as HTML.

class Text:
    "Plain text"

    def __init__(self, text=""):
        self.text = text

    # We need to allow adding a string to Text instances.
    def __add__(self, val):
        newtext = self.text + str(val)
        # __class__ must be used so subclasses create instances of themselves.
        return self.__class__(newtext)

    def __str__(self):
        return self.text

    def __repr__(self):
        # Show at most the first 15 characters of the stripped text.
        s = self.text.strip()
        if len(s) > 15:
            s = s[0:15] + '...'
        return '<%s: "%s">' % (self.__class__.__name__, s)

    def as_text(self):
        "Return the text unchanged."
        return self.text

    def as_html(self):
        "Return the text with HTML special characters escaped."
        return cgi.escape(self.text)


class PreformattedText(Text):
    """Preformatted text; Quotation.as_text() emits it without re-wrapping.

    NOTE(review): the strings wrapped around the escaped text in as_html()
    look like markup that was lost from this copy of the file -- confirm.
    """

    def as_text(self):
        return str(self.text)

    def as_html(self):
        return '\n' + cgi.escape(str(self.text)) + ''


class CodeFormattedText(Text):
    """Code-formatted text; Quotation.as_text() emits it without re-wrapping.

    NOTE(review): the strings wrapped around the escaped text in as_html()
    look like markup that was lost from this copy of the file -- confirm.
    """

    def as_text(self):
        return str(self.text)

    def as_html(self):
        return '' + cgi.escape(str(self.text)) + '\n'
class CitedText(Text):
    """Cited text.

    NOTE(review): the empty strings wrapped around the HTML form look like
    markup that was lost from this copy of the file -- confirm against the
    upstream source.
    """

    def as_text(self):
        "Return the whitespace-normalized text between underscores."
        body = simplify(str(self.text))
        return '_' + body + '_'

    def as_html(self):
        "Return the escaped text with surrounding whitespace stripped."
        escaped = cgi.escape(str(self.text))
        return '' + string.strip(escaped) + ''
class ForeignText(Text):
    """Foreign words, from Latin or French or whatever.

    NOTE(review): the empty strings wrapped around the HTML form look like
    markup that was lost from this copy of the file -- confirm against the
    upstream source.
    """

    def as_text(self):
        "Return the whitespace-normalized text between underscores."
        body = simplify(str(self.text))
        return '_' + body + '_'

    def as_html(self):
        "Return the escaped text with surrounding whitespace stripped."
        escaped = cgi.escape(str(self.text))
        return '' + string.strip(escaped) + ''
class EmphasizedText(Text):
    """Emphasized text.

    NOTE(review): the empty strings wrapped around the HTML form look like
    markup that was lost from this copy of the file -- confirm against the
    upstream source.
    """

    def as_text(self):
        "Return the whitespace-normalized text between asterisks."
        body = simplify(str(self.text))
        return '*' + body + '*'

    def as_html(self):
        "Return the escaped text with surrounding whitespace stripped."
        escaped = cgi.escape(str(self.text))
        return '' + string.strip(escaped) + ''
class Break(Text):
    """Paragraph-break marker.

    Contributes no characters itself; the flatten() helpers inside
    Quotation close the current paragraph when they meet an instance.

    NOTE(review): the empty HTML form may be markup lost from this copy of
    the file -- confirm against the upstream source.
    """

    def as_text(self):
        "Return the empty string."
        return ""

    def as_html(self):
        "Return the empty string."
        return ""
# The QuotationDocHandler class is a SAX handler class that will
# convert a marked-up document using the quotations DTD into a list of
# quotation objects.
class QuotationDocHandler(saxlib.HandlerBase):
def __init__(self, process_func):
    """Store process_func and start with no quotation under construction.

    newqt stays None until a quotation is being built; characters()
    ignores text while it is None.
    """
    self.newqt = None
    self.process_func = process_func
# Errors should be signaled, so we'll output a message and exit with a
# non-zero status to stop processing.
def fatalError(self, exception):
    """Write 'ERROR: <exception>' to stderr and exit with status 1.

    exception -- the error being reported; it is rendered with str().

    Raises SystemExit (via sys.exit).
    """
    # Imported locally: the module-level imports visible at the top of the
    # file do not include sys, so the bare name could raise NameError.
    import sys
    sys.stderr.write('ERROR: ' + str(exception) + '\n')
    sys.exit(1)
# Ordinary errors and warnings are handled exactly like fatal errors.
error = fatalError
warning = fatalError
def characters(self, ch, start, length):
    """Append a run of character data to the quotation being built.

    ch     -- the buffer holding the character data
    start  -- index of the first character of the run
    length -- number of characters in the run

    Does nothing while no quotation is under construction (newqt is None).
    """
    if self.newqt is None:
        return

    unicode_str = ch[start:start + length]
    # Convert the text to ISO Latin1, which the original author notes is
    # the default character set used for HTML.
    latin1_encode = codecs.lookup('iso-8859-1')[0]
    s, consumed = latin1_encode(unicode_str)
    # Sanity check: the encoder must have consumed the whole run.
    assert consumed == len(unicode_str)
    self.newqt.stack[-1] = self.newqt.stack[-1] + s
def startDocument(self):
    "Reset quote_list to an empty list when a document begins."
    self.quote_list = list()
def startElement(self, name, attrs):
    """Dispatch an opening tag to the matching start_<name> method.

    name  -- the element name
    attrs -- the element's attributes; passed on to the start_<name>
             method, and iterated with .items() for the diagnostic

    When no start_<name> method exists, the tag and its attributes are
    reported on stderr and processing continues.
    """
    method = getattr(self, 'start_' + str(name), None)
    if method is not None:
        method(attrs)
        return

    # Imported locally: the module-level imports visible at the top of the
    # file do not include sys, so the bare name could raise NameError.
    import sys
    sys.stderr.write('unknown start tag: <' + name + ' ')
    # A separate loop variable keeps the `name` parameter from being
    # overwritten by the attribute names.
    for attr_name, value in attrs.items():
        sys.stderr.write(attr_name + '=' + '"' + value + '" ')
    sys.stderr.write('>\n')
def endElement(self, name):
    """Dispatch a closing tag to the matching end_<name> method.

    name -- the element name

    When no end_<name> method exists, the tag is reported on stderr and
    processing continues.
    """
    method = getattr(self, 'end_' + str(name), None)
    if method is not None:
        method()
        return

    # Imported locally: the module-level imports visible at the top of the
    # file do not include sys, so the bare name could raise NameError.
    import sys
    sys.stderr.write('unknown end tag: ' + name + '>\n')
# There's nothing to be done for the