[XML-SIG] SAX prettyprinter V2 and SGMLOP
Christian Tismer
tismer@appliedbiometrics.com
Fri, 22 Jan 1999 21:27:58 +0100
This is a multi-part message in MIME format.
--------------F46600A1D3B1D2BC0AA2B68F
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Hi again,
the appended version of Indenter.py can use sgmlop to format
large XML files. It then processes a few megabytes in a few seconds.
sgmlop does not report ignorableWhitespace, so I handle it myself
by delaying the writing of character data and stripping it afterwards.
BTW - is sgmlop deprecated?
It still has some flaws, like not allowing "_" in tagnames.
Is Fredrik no longer supporting it, or what is the current
preferred fast parser for all platforms?
ciao - chris
--
Christian Tismer :^) <mailto:tismer@appliedbiometrics.com>
Applied Biometrics GmbH : Have a break! Take a ride on Python's
Kaiserin-Augusta-Allee 101 : *Starship* http://starship.skyport.net
10553 Berlin : PGP key -> http://pgp.ai.mit.edu/
we're tired of banana software - shipped green, ripens at home
--------------F46600A1D3B1D2BC0AA2B68F
Content-Type: text/plain; charset=us-ascii; name="indenter.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="indenter.py"
# pretty printer for SAX
# CT990122
# based upon the saxutils.Canonizer code
# V.0.2 support for sgmlop which doesn't give ignorableWhitespace info
from xml.sax import saxexts, saxlib, saxutils
import string, sys
class Indenter(saxlib.HandlerBase):
    """A SAX document handler that produces indented XML output.

    Character data is not written as it arrives.  It is collected in
    ``self.buffer`` and flushed, with leading and trailing whitespace
    stripped, when the next start tag, end tag or the end of the document
    is reported.  That way layout whitespace is dropped even when the
    parser never calls ignorableWhitespace.
    """

    def __init__(self, writer=sys.stdout, indent=2):
        """Create the handler.

        writer -- object whose write() method receives the output text
        indent -- number of spaces added per nesting level
        """
        self.elem_level = 0   # nesting depth of the next start tag
        self.writer = writer
        self.indent = indent
        # Depth of the most recent start tag; reset to -1 by every end tag.
        self.last_level = -1
        self.buffer = ""      # lazy buffer for whitespace stripping

    def processingInstruction(self, target, remainder):
        """Echo a processing instruction followed by a newline."""
        # Every target is echoed; no filtering on target == "xml" is done.
        self.writer.write("<?" + target + " " + remainder + "?>\n")

    def startElement(self, name, amap):
        """Write the start tag on a new, indented line.

        Pending character data is flushed first.  Attributes are written
        in sorted name order with their values escaped (quotes included).
        """
        self.write_buffer()
        self.writer.write("\n" + self.indent * self.elem_level * " " + "<" + name)
        # Copy into a real list so the in-place sort works whatever
        # sequence type keys() returns.
        a_names = list(amap.keys())
        a_names.sort()
        for a_name in a_names:
            self.writer.write(" " + a_name + "=\"")
            self.write_data(amap[a_name], 1)
            self.writer.write("\"")
        self.writer.write(">")
        self.last_level = self.elem_level
        self.elem_level = self.elem_level + 1

    def endElement(self, name):
        """Write the end tag.

        Pending character data is flushed first.  If no other tag was
        seen since this element's own start tag, the end tag stays on the
        same line; otherwise it goes on a new line at the element's
        indentation.
        """
        self.write_buffer()
        self.elem_level = self.elem_level - 1
        if self.last_level < self.elem_level:
            # A child element was closed in between: break the line.
            self.writer.write("\n" + self.indent * self.elem_level * " "
                              + "</" + name + ">")
        else:
            self.writer.write("</" + name + ">")
        self.last_level = -1

    def ignorableWhitespace(self, data, start_ix, length):
        """Drop whitespace the parser reports as ignorable."""
        pass

    def characters(self, data, start_ix, length):
        """Buffer character data that occurs inside an element."""
        if self.elem_level > 0:
            self.put_buffer(data[start_ix:start_ix + length])

    def put_buffer(self, txt):
        """Append txt to the pending character data."""
        self.buffer = self.buffer + txt

    def write_buffer(self):
        """Write the pending character data, stripped, and clear the buffer."""
        if self.buffer:
            self.write_data(string.strip(self.buffer))
            self.buffer = ""

    def write_data(self, data, quotes=0):
        """Write data to the writer with XML markup characters escaped.

        data   -- text to write
        quotes -- when true, double quotes are escaped as well, as needed
                  inside a double-quoted attribute value
        """
        # "&" must be replaced first, otherwise the ampersands introduced
        # by the following replacements would be escaped a second time.
        data = string.replace(data, "&", "&amp;")
        data = string.replace(data, "<", "&lt;")
        if quotes:
            data = string.replace(data, "\"", "&quot;")
        data = string.replace(data, ">", "&gt;")
        self.writer.write(data)

    def endDocument(self):
        """Flush pending character data and finish with a newline."""
        self.write_buffer()
        self.writer.write("\n")
        # The writer is intentionally not closed here; whoever created it
        # is expected to close it if that is needed.
"""
Example to format a DOM (needs "import cStringIO" and a DOM object named dom):
>>> i=Indenter()
>>> p=saxexts.make_parser()
>>> p.setErrorHandler(saxutils.ErrorPrinter())
>>> p.setDocumentHandler(i)
>>> p.parseFile(cStringIO.StringIO(dom.toxml()))
Example to format a file to a file, with sgmlop as parser:
>>> f=open(r'd:\tmp\test.xml',"w")
>>> i=Indenter(f)
>>> p=saxexts.make_parser("xml.sax.drivers.drv_sgmlop")
>>> p.setErrorHandler(saxutils.ErrorPrinter())
>>> p.setDocumentHandler(i)
>>> p.parseFile(r"h:\pns\projekte\srz\roteli\birgit\sgml\praep.sgm.umgebrochen.xml")
>>> f.close()
"""
--------------F46600A1D3B1D2BC0AA2B68F--