#!/usr/bin/env python # # qtfmt.py v1.10 # v1.10 : Updated to use Python 2.0 Unicode type. # # Read a document in the quotation DTD, converting it to a list of Quotation # objects. The list can then be output in several formats. __doc__ = """Usage: qtfmt.py [options] file1.xml file2.xml ... If no filenames are provided, standard input will be read. Available options: -f or --fortune Produce output for the fortune(1) program -h or --html Produce HTML output -t or --text Produce plain text output -m N or --max N Suppress quotations longer than N lines; defaults to 0, which suppresses no quotations at all. """ import string, re, cgi, types import codecs from xml.sax import saxlib, saxexts def simplify(t, indent="", width=79): """Strip out redundant spaces, and insert newlines to wrap the text at the given width.""" t = string.strip(t) t = re.sub('\s+', " ", t) if t=="": return t t = indent + t t2 = "" while len(t) > width: index = string.rfind(t, ' ', 0, width) if index == -1: t2 = t2 + t[:width] ; t = t[width:] else: t2 = t2 + t[:index] ; t = t[index+1:] t2 = t2 + '\n' return t2 + t class Quotation: """Encapsulates a single quotation. Attributes: stack -- used during construction and then deleted text -- A list of Text() instances, or subclasses of Text(), containing the text of the quotation. source -- A list of Text() instances, or subclasses of Text(), containing the source of the quotation. (Optional) author -- A list of Text() instances, or subclasses of Text(), containing the author of the quotation. (Optional) Methods: as_fortune() -- return the quotation formatted for fortune as_html() -- return an HTML version of the quotation as_text() -- return a plain text version of the quotation """ def __init__(self): self.stack = [ Text() ] self.text = [] def as_text(self): "Convert instance into a pure text form" output = "" def flatten(textobj): "Flatten a list of subclasses of Text into a list of paragraphs" if type(textobj) != types.ListType: textlist=[textobj] else: textlist = textobj paragraph = "" ; paralist = [] for t in textlist: if (isinstance(t, PreformattedText) or isinstance(t, CodeFormattedText) ): paralist.append(paragraph) paragraph = "" paralist.append(t) elif isinstance(t, Break): paragraph = paragraph + t.as_text() paralist.append(paragraph) paragraph = "" else: paragraph = paragraph + t.as_text() paralist.append(paragraph) return paralist # Flatten the list of instances into a list of paragraphs paralist = flatten(self.text) if len(paralist) > 1: indent = 2*" " else: indent = "" for para in paralist: if isinstance(para, PreformattedText) or isinstance(para, CodeFormattedText): output = output + para.as_text() else: output = output + simplify(para, indent) + '\n' attr = "" for i in ['author', 'source']: if hasattr(self, i): paralist = flatten(getattr(self, i)) text = string.join(paralist) if attr: attr = attr + ', ' text = string.lower(text[:1]) + text[1:] attr = attr + text attr=simplify(attr, width = 79 - 4 - 3) if attr: output = output + ' -- '+re.sub('\n', '\n ', attr) return output + '\n' def as_fortune(self): return self.as_text() + '%' def as_html(self): output = "
" def flatten(textobj): if type(textobj) != types.ListType: textlist = [textobj] else: textlist = textobj paragraph = "" ; paralist = [] for t in textlist: paragraph = paragraph + t.as_html() if isinstance(t, Break): paralist.append(paragraph) paragraph = "" paralist.append(paragraph) return paralist paralist = flatten(self.text) for para in paralist: output = output + string.strip(para) + '\n' attr = "" for i in ['author', 'source']: if hasattr(self, i): paralist = flatten(getattr(self, i)) text = string.join(paralist) attr=attr + ('
' % i) + string.strip(text) return output + attr # Text and its subclasses are used to hold chunks of text; instances # know how to display themselves as plain text or as HTML. class Text: "Plain text" def __init__(self, text=""): self.text = text # We need to allow adding a string to Text instances. def __add__(self, val): newtext = self.text + str(val) # __class__ must be used so subclasses create instances of themselves. return self.__class__(newtext) def __str__(self): return self.text def __repr__(self): s = string.strip(self.text) if len(s) > 15: s = s[0:15] + '...' return '<%s: "%s">' % (self.__class__.__name__, s) def as_text(self): return self.text def as_html(self): return cgi.escape(self.text) class PreformattedText(Text): "Text inside
..." def as_text(self): return str(self.text) def as_html(self): return '
' + cgi.escape(str(self.text)) + '' class CodeFormattedText(Text): "Text inside
...
"
def as_text(self):
return str(self.text)
def as_html(self):
return '' + cgi.escape(str(self.text)) + '
'
class CitedText(Text):
"Text inside ..."
def as_text(self):
return '_' + simplify(str(self.text)) + '_'
def as_html(self):
return '' + string.strip(cgi.escape(str(self.text))) + ''
class ForeignText(Text):
"Foreign words, from Latin or French or whatever."
def as_text(self):
return '_' + simplify(str(self.text)) + '_'
def as_html(self):
return '' + string.strip(cgi.escape(str(self.text))) + ''
class EmphasizedText(Text):
"Text inside ..."
def as_text(self):
return '*' + simplify(str(self.text)) + '*'
def as_html(self):
return '' + string.strip(cgi.escape(str(self.text))) + ''
class Break(Text):
def as_text(self): return ""
def as_html(self): return ""
# The QuotationDocHandler class is a SAX handler class that will
# convert a marked-up document using the quotations DTD into a list of
# quotation objects.
class QuotationDocHandler(saxlib.HandlerBase):
def __init__(self, process_func):
self.process_func = process_func
self.newqt = None
# Errors should be signaled, so we'll output a message and raise
# the exception to stop processing
def fatalError(self, exception):
sys.stderr.write('ERROR: '+ str(exception)+'\n')
sys.exit(1)
error = fatalError
warning = fatalError
def characters(self, ch, start, length):
if self.newqt != None:
s = ch[start:start+length]
# Undo the UTF-8 encoding, converting to ISO Latin1, which
# is the default character set used for HTML.
latin1_encode = codecs.lookup('iso-8859-1') [0]
unicode_str = s
s, consumed = latin1_encode( unicode_str )
assert consumed == len( unicode_str )
self.newqt.stack[-1] = self.newqt.stack[-1] + s
def startDocument(self):
self.quote_list = []
def startElement(self, name, attrs):
methname = 'start_'+str(name)
if hasattr(self, methname):
method = getattr(self, methname)
method(attrs)
else:
sys.stderr.write('unknown start tag: <' + name + ' ')
for name, value in attrs.items():
sys.stderr.write(name + '=' + '"' + value + '" ')
sys.stderr.write('>\n')
def endElement(self, name):
methname = 'end_'+str(name)
if hasattr(self, methname):
method = getattr(self, methname)
method()
else:
sys.stderr.write('unknown end tag: ' + name + '>\n')
# There's nothing to be done for the
for breaks, ...
for preformatted
# text, ... for emphasis, ... for citations.
def start_br(self, data):
# Add a Break instance, and a new Text instance.
self.newqt.stack.append(Break())
self.newqt.stack.append( Text() )
def end_br(self): pass
def start_pre(self, data):
self.newqt.stack.append( Text() )
def end_pre(self):
self.newqt.stack[-1] = PreformattedText(self.newqt.stack[-1])
self.newqt.stack.append( Text() )
def start_code(self, data):
self.newqt.stack.append( Text() )
def end_code(self):
self.newqt.stack[-1] = CodeFormattedText(self.newqt.stack[-1])
self.newqt.stack.append( Text() )
def start_em(self, data):
self.newqt.stack.append( Text() )
def end_em(self):
self.newqt.stack[-1] = EmphasizedText(self.newqt.stack[-1])
self.newqt.stack.append( Text() )
def start_cite(self, data):
self.newqt.stack.append( Text() )
def end_cite(self):
self.newqt.stack[-1] = CitedText(self.newqt.stack[-1])
self.newqt.stack.append( Text() )
def start_foreign(self, data):
self.newqt.stack.append( Text() )
def end_foreign(self):
self.newqt.stack[-1] = ForeignText(self.newqt.stack[-1])
self.newqt.stack.append( Text() )
if __name__ == '__main__':
import sys, getopt
# Process the command-line arguments
opts, args = getopt.getopt(sys.argv[1:], 'fthm:r',
['fortune', 'text', 'html', 'max=', 'help',
'randomize'] )
# Set defaults
maxlength = 0 ; method = 'as_fortune'
randomize = 0
# Process arguments
for opt, arg in opts:
if opt in ['-f', '--fortune']:
method='as_fortune'
elif opt in ['-t', '--text']:
method = 'as_text'
elif opt in ['-h', '--html']:
method = 'as_html'
elif opt in ['-m', '--max']:
maxlength = string.atoi(arg)
elif opt in ['-r', '--randomize']:
randomize = 1
elif opt == '--help':
print __doc__ ; sys.exit(0)
# This function will simply output each quotation by calling the
# desired method, as long as it's not suppressed by a setting of
# --max.
qtlist = []
def process_func(qt, qtlist=qtlist, maxlength=maxlength, method=method):
func = getattr(qt, method)
output = func()
length = string.count(output, '\n')
if maxlength!=0 and length > maxlength: return
qtlist.append(output)
# Loop over the input files; use sys.stdin if no files are specified
if len(args) == 0: args = [sys.stdin]
for file in args:
if type(file) == types.StringType: input = open(file, 'r')
else: input = file
# Enforce the use of the Expat parser, because the code needs to be
# sure that the output will be UTF-8 encoded.
p=saxexts.XMLParserFactory.make_parser(["xml.sax.drivers.drv_pyexpat"])
dh = QuotationDocHandler(process_func)
p.setDocumentHandler(dh)
p.setErrorHandler(dh)
p.parseFile(input)
if type(file) == types.StringType: input.close()
p.close()
# Randomize the order of the quotations
if randomize:
import whrandom
q2 = []
for i in range(len(qtlist)):
qt = whrandom.randint(0,len(qtlist)-1 )
q2.append( qtlist[qt] )
qtlist[qt:qt+1] = []
assert len(qtlist) == 0
qtlist = q2
for quote in qtlist:
print quote
# We're done!