[XML-SIG] updated "quick parser" ... qp_xml.py
Greg Stein
gstein@lyra.org
Wed, 31 Mar 1999 18:51:12 -0800
This is a multi-part message in MIME format.
--------------6625E7BC3519C1A232100425
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Hey there...
From that speed test thing that I posted a few days ago, I extracted an
actual module. At the same time, I also simplified some of the namespace
stuff and corrected several bugs w.r.t. default namespaces.
As I mentioned, this guy is about 12x faster than using the DOM to
parse XML (when both are using pyexpat).
It also handles namespaces and xml:lang properly.
Comments/patches are encouraged.
thx
-g
--
Greg Stein, http://www.lyra.org/
--------------6625E7BC3519C1A232100425
Content-Type: text/plain; charset=us-ascii; name="qp_xml.py"
Content-Disposition: inline; filename="qp_xml.py"
Content-Transfer-Encoding: 7bit
#
# qp_xml: Quick Parsing for XML
#
import string
try:
import pyexpat
except ImportError:
from xml.parsers import pyexpat
class error(Exception):
    """Raised by Parser.parse() when a document cannot be parsed.

    This is an exception class rather than the original string exception,
    because strings cannot be raised on Python 2.6 and later. Callers that
    write ``except error:`` are unaffected.
    """


#
# The parsing class. Instantiate and pass a string/file to .parse()
#
class Parser:
    """Build a lightweight element tree from an XML document using pyexpat.

    Each element object carries ``name``, ``ns`` (namespace URI, '' for
    none), ``lang`` (from xml:lang, inherited from the parent), ``attrs``
    (list of attribute objects with ``name``, ``ns`` and ``value``),
    ``children``, ``first_cdata`` (text before the first child) and
    ``following_cdata`` (text after the element's end tag, up to the next
    sibling or the parent's end tag).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard any state left over from a previous parse."""
        self.root = None
        self.cur_elem = None
        self.error = None

    def find_prefix(self, prefix):
        """Return the namespace URI bound to `prefix` in the current scope.

        The search starts at the element currently being parsed and walks
        up through its ancestors. An unbound default prefix ('') maps to
        '' ("no namespace"); any other unbound prefix yields None.
        """
        elem = self.cur_elem
        while elem is not None:
            if prefix in elem.ns_scope:
                return elem.ns_scope[prefix]
            elem = elem.parent
        if prefix == '':
            return ''  # empty URL for "no namespace"
        return None

    def process_prefix(self, ob, use_default):
        """Split the prefix off `ob.name` and store its namespace in `ob.ns`.

        `ob` is an element or attribute object. `use_default` says whether
        an unprefixed name picks up the default namespace (true for
        elements, false for attributes). If the prefix is not bound,
        `self.error` is set and `ob.ns` is left as None.
        """
        idx = ob.name.find(':')
        if idx == -1:
            if use_default:
                ob.ns = self.find_prefix('')
            else:
                ob.ns = ''  # no namespace
        elif ob.name[:3].lower() == 'xml':
            ob.ns = ''  # name is reserved by XML. don't break out a NS.
        else:
            ob.ns = self.find_prefix(ob.name[:idx])
            ob.name = ob.name[idx + 1:]
            if ob.ns is None:
                self.error = 'namespace prefix not found'
                return

    def start(self, name, attrs):
        """pyexpat StartElementHandler: open a new element under the cursor."""
        if self.error:
            return
        elem = _element(name=name, lang=None, parent=None,
                        children=[], ns_scope={}, attrs=[],
                        first_cdata='', following_cdata='')
        if self.cur_elem is not None:
            elem.parent = self.cur_elem
            elem.parent.children.append(elem)
            self.cur_elem = elem
        else:
            self.cur_elem = self.root = elem

        # pyexpat delivers attributes either as a flat
        # [name, value, name, value, ...] list or as a name -> value
        # mapping; accept both.
        if hasattr(attrs, 'items'):
            pairs = list(attrs.items())
        else:
            pairs = [(attrs[i], attrs[i + 1])
                     for i in range(0, len(attrs), 2)]

        # scan for namespace declarations (and xml:lang while we're at it)
        for attr_name, value in pairs:
            if attr_name == 'xmlns':
                elem.ns_scope[''] = value
            elif attr_name[:6] == 'xmlns:':
                elem.ns_scope[attr_name[6:]] = value
            elif attr_name == 'xml:lang':
                elem.lang = value
            else:
                elem.attrs.append(_attribute(name=attr_name, value=value))

        # inherit xml:lang from parent
        if elem.lang is None and elem.parent is not None:
            elem.lang = elem.parent.lang

        # process prefix of the element name; this must come after the
        # scan above so declarations on this very element are in scope
        self.process_prefix(elem, 1)

        # process attributes' namespace prefixes
        for attr in elem.attrs:
            self.process_prefix(attr, 0)

    def end(self, name):
        """pyexpat EndElementHandler: close the current element."""
        if self.error:
            return
        parent = self.cur_elem.parent
        # the scope table and the back-pointer are only needed while the
        # element is open
        del self.cur_elem.ns_scope
        del self.cur_elem.parent
        self.cur_elem = parent

    def cdata(self, data):
        """pyexpat CharacterDataHandler: attach text to the right node."""
        if self.error:
            return
        elem = self.cur_elem
        if elem.children:
            # text after a child belongs to that child's following_cdata
            last = elem.children[-1]
            last.following_cdata = last.following_cdata + data
        else:
            elem.first_cdata = elem.first_cdata + data

    def parse(self, input):
        """Parse `input` and return the root element of the resulting tree.

        `input` is either the whole document as a string, or an object
        with a ``read(size)`` method that is consumed in `_BLOCKSIZE`
        chunks.

        Raises `error` if expat rejects the document or if a namespace
        prefix is used without being declared.
        """
        self.reset()
        p = pyexpat.ParserCreate()
        p.StartElementHandler = self.start
        p.EndElementHandler = self.end
        p.CharacterDataHandler = self.cdata
        try:
            try:
                if hasattr(input, 'read'):
                    while 1:
                        s = input.read(_BLOCKSIZE)
                        if not s:
                            rv = p.Parse('', 1)
                            break
                        rv = p.Parse(s, 0)
                        if rv == 0 or self.error:
                            break
                else:
                    rv = p.Parse(input, 1)
            except pyexpat.error:
                # pyexpat versions that raise instead of returning 0 are
                # funnelled into the same error path as the old ones
                rv = 0
            if rv == 0:
                s = pyexpat.ErrorString(p.ErrorCode)
                raise error('expat parsing error: ' + s)
            if self.error:
                raise error(self.error)
        finally:
            # root is still None when parsing failed before any element
            # was opened; there is nothing to clean up in that case
            if self.root is not None:
                _clean_tree(self.root)
        return self.root
#
# handy function for dumping a tree that is returned by Parser
#
def dump(f, root):
    """Write the tree rooted at `root` to `f` as an XML document.

    `f` only needs a ``write()`` method. Namespaces are emitted with
    generated ``nsN`` prefixes, all declared on the root element.
    """
    f.write('<?xml version="1.0"?>\n')
    _dump_recurse(f, root, _collect_ns(root), 1)
    f.write('\n')
#
# This function returns the element's CDATA. Note: this is not recursive --
# it only returns the CDATA immediately within the element, excluding the
# CDATA in child elements.
#
def textof(elem):
    """Return the character data found directly inside `elem`.

    This is the text before the first child plus the text following each
    child element. Text nested inside the children themselves is not
    included.
    """
    # join once instead of concatenating in a loop, which copies the
    # accumulated string for every child
    pieces = [elem.first_cdata]
    pieces.extend([child.following_cdata for child in elem.children])
    return ''.join(pieces)
#########################################################################
#
# private stuff for qp_xml
#
# Size argument passed to input.read() on each iteration when Parser.parse()
# is given a file-like object rather than a string.
_BLOCKSIZE = 16384 # chunk size for parsing input
class _blank:
    """Bare attribute holder: each keyword argument becomes an attribute."""

    def __init__(self, **kw):
        for key, value in kw.items():
            setattr(self, key, value)


class _element(_blank):
    """Attribute holder created by Parser.start() for each XML element."""


class _attribute(_blank):
    """Attribute holder created by Parser.start() for each XML attribute."""
def _clean_tree(elem):
    """Remove the `parent` attribute from `elem` and all of its descendants.

    Children are held in their parent's `children` list, so a `parent`
    back-reference forms a reference cycle; presumably it is dropped here
    so the finished tree can be freed by reference counting alone.
    """
    # An explicit loop is used instead of map(): map() is lazy on Python 3,
    # so the children would never be visited. The stack also avoids deep
    # recursion on deeply nested documents.
    pending = [elem]
    while pending:
        node = pending.pop()
        # Parser.end() has already deleted `parent` on closed elements;
        # assign first so the del cannot fail on those.
        node.parent = None
        del node.parent
        pending.extend(node.children)
def _collect_recurse(elem, dict):
    """Record every namespace used at or below `elem` as a key of `dict`.

    The namespace of each element and of each of its attributes is stored
    with the value None. Elements are visited in document order (an
    element first, then each child subtree in turn).
    """
    stack = [elem]
    while stack:
        node = stack.pop()
        dict[node.ns] = None
        for attr in node.attrs:
            dict[attr.ns] = None
        # push children back-to-front so the first child is popped first
        stack.extend(reversed(node.children))
def _collect_ns(elem):
    """Collect all namespaces into a NAMESPACE -> PREFIX mapping.

    Each namespace URI used at or below `elem` is mapped to a distinct
    integer, which _dump_recurse() turns into an ``nsN`` prefix. The empty
    ("no namespace") URI is never included.
    """
    d = {'': None}
    _collect_recurse(elem, d)
    del d['']  # make sure we don't pick up no-namespace entries
    # Snapshot the keys before numbering them: dict.keys() is not
    # indexable on Python 3.
    for index, ns in enumerate(list(d)):
        d[ns] = index
    return d
def _escape_text(text):
    """Escape `text` for output as XML character data."""
    # '&' must be replaced first so the other entities are not re-escaped
    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def _escape_attr(value):
    """Escape `value` for output inside a double-quoted XML attribute."""
    return _escape_text(value).replace('"', '&quot;')


def _dump_recurse(f, elem, namespaces, dump_ns=0):
    """Write `elem` and its subtree to `f` as XML.

    `namespaces` maps namespace URI -> integer, used to build ``nsN``
    prefixes. When `dump_ns` is true, an ``xmlns:nsN`` declaration for
    every entry in `namespaces` is written on this element's start tag.

    The parser stores character data and attribute values unescaped, so
    they are escaped again here; otherwise any '&', '<' or '"' in the
    content would make the output malformed.
    """
    if elem.ns:
        tag = 'ns%d:%s' % (namespaces[elem.ns], elem.name)
    else:
        tag = elem.name
    f.write('<' + tag)

    for attr in elem.attrs:
        value = _escape_attr(attr.value)
        if attr.ns:
            f.write(' ns%d:%s="%s"' % (namespaces[attr.ns], attr.name, value))
        else:
            f.write(' %s="%s"' % (attr.name, value))

    if dump_ns:
        for ns, index in namespaces.items():
            f.write(' xmlns:ns%d="%s"' % (index, _escape_attr(ns)))

    if elem.children or elem.first_cdata:
        f.write('>' + _escape_text(elem.first_cdata))
        for child in elem.children:
            _dump_recurse(f, child, namespaces)
            f.write(_escape_text(child.following_cdata))
        f.write('</%s>' % tag)
    else:
        f.write('/>')
--------------6625E7BC3519C1A232100425--