[XML-SIG] speed question re DOM parsing
Bjorn Pettersen
bjorn@roguewave.com
Mon, 05 Jun 2000 18:03:55 -0600
This is a multi-part message in MIME format.
--------------2165DD6D55CFA19313775CB7
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Lars Marius Garshol wrote:
>
> * Bjorn Pettersen
> |
> | Question: does using StringIO (or perhaps array) and __getattr__
> | sound like the right thing to do?
>
> StringIO sounds like the right thing, at least for that particular
> document. Probably it wouldn't be too bad for the other documents
> either, but I have no experience with its performance.
>
> I'm afraid I don't have the necessary context to answer the
> __getattr__ questions, but: I would definitely like to see your
> sources. If you could post them somewhere, I, at least, would be happy
> to have a look at them.
I've included the patched file as an attachment. My changes are
confined to:
- importing (c)StringIO at the top
- changing the constructor call to _element (line 82) to pass
a StringIO object rather than an empty string.
- hiding the "first_cdata" member in the __init__ method of _element
- adding a __getattr__ method to _element.
With limited performance testing I got:
File Size Original Patched
37K 0.14s 0.07s
968K 103.77s 1.68s
-- bjorn
--------------2165DD6D55CFA19313775CB7
Content-Type: text/plain; charset=us-ascii;
name="qp_xml.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="qp_xml.py"
#
# qp_xml: Quick Parsing for XML
#
# Written by Greg Stein. Public Domain.
# No Copyright, no Rights Reserved, and no Warranties.
#
# This module is maintained by Greg and is available as part of the XML-SIG
# distribution. This module and its changelog can be fetched at:
# http://www.lyra.org/cgi-bin/viewcvs.cgi/xml/xml/utils/qp_xml.py
#
# Additional information can be found on Greg's Python page at:
# http://www.lyra.org/greg/python/
#
# This module was added to the XML-SIG distribution on February 14, 2000.
# As part of that distribution, it falls under the XML distribution license.
#
import string
try:
import cStringIO
_StringIO = cStringIO
except ImportError:
import StringIO
_StringIO = StringIO
try:
import pyexpat
except ImportError:
from xml.parsers import pyexpat
error = __name__ + '.error'
#
# The parsing class. Instantiate and pass a string/file to .parse()
#
class Parser:
def __init__(self):
self.reset()
def reset(self):
self.root = None
self.cur_elem = None
self.error = None
def find_prefix(self, prefix):
elem = self.cur_elem
while elem:
if elem.ns_scope.has_key(prefix):
return elem.ns_scope[prefix]
elem = elem.parent
if prefix == '':
return '' # empty URL for "no namespace"
return None
def process_prefix(self, name, use_default):
idx = string.find(name, ':')
if idx == -1:
if use_default:
return self.find_prefix(''), name
return '', name # no namespace
if string.lower(name[:3]) == 'xml':
return '', name # name is reserved by XML. don't break out a NS.
ns = self.find_prefix(name[:idx])
if ns is None:
self.error = 'namespace prefix not found'
return ns, name[idx+1:]
def start(self, name, attrs):
if self.error:
return
elem = _element(name=name, lang=None, parent=None,
children=[], ns_scope={}, attrs={},
first_cdata=_StringIO.StringIO(), following_cdata='')
if self.cur_elem:
elem.parent = self.cur_elem
elem.parent.children.append(elem)
self.cur_elem = elem
else:
self.cur_elem = self.root = elem
work_attrs = [ ]
# scan for namespace declarations (and xml:lang while we're at it)
for i in range(0, len(attrs), 2):
name = attrs[i]
value = attrs[i+1]
if name == 'xmlns':
elem.ns_scope[''] = value
elif name[:6] == 'xmlns:':
elem.ns_scope[name[6:]] = value
elif name == 'xml:lang':
elem.lang = value
else:
work_attrs.append((name, value))
# inherit xml:lang from parent
if elem.lang is None and elem.parent:
elem.lang = elem.parent.lang
# process prefix of the element name
elem.ns, elem.name = self.process_prefix(elem.name, 1)
# process attributes' namespace prefixes
for name, value in work_attrs:
elem.attrs[self.process_prefix(name, 0)] = value
def end(self, name):
if self.error:
return
parent = self.cur_elem.parent
del self.cur_elem.ns_scope
del self.cur_elem.parent
self.cur_elem = parent
def cdata(self, data):
if self.error:
return
elem = self.cur_elem
if elem.children:
last = elem.children[-1]
last.following_cdata = last.following_cdata + data
else:
# this branch taken ~3 times more than true branch
elem.first_cdata.write(data)
#elem.first_cdata = elem.first_cdata + data
def parse(self, input):
self.reset()
p = pyexpat.ParserCreate()
p.StartElementHandler = self.start
p.EndElementHandler = self.end
p.CharacterDataHandler = self.cdata
exception = None
try:
if type(input) == type(''):
try:
p.Parse(input, 1)
except pyexpat.error, exception:
pass
else:
while 1:
s = input.read(_BLOCKSIZE)
if not s:
try:
p.Parse('', 1)
except pyexpat.error, exception:
pass
break
try:
rv = p.Parse(s, 0)
except pyexpat.error, exception:
pass
if exception or self.error:
break
if exception:
s = pyexpat.ErrorString(p.ErrorCode)
raise error, 'expat parsing error: ' + exception
if self.error:
raise error, self.error
finally:
if self.root:
_clean_tree(self.root)
print 'self.root', self.root
return self.root
#
# handy function for dumping a tree that is returned by Parser
#
def dump(f, root):
f.write('<?xml version="1.0"?>\n')
namespaces = _collect_ns(root)
_dump_recurse(f, root, namespaces, 1)
f.write('\n')
#
# This function returns the element's CDATA. Note: this is not recursive --
# it only returns the CDATA immediately within the element, excluding the
# CDATA in child elements.
#
def textof(elem):
return elem.textof()
#########################################################################
#
# private stuff for qp_xml
#
_BLOCKSIZE = 1024 * 16 # chunk size for parsing input
class _element:
def __init__(self, **kw):
self.__dict__.update(kw)
# changing first_cdata to be a StringIO object and
# handling it transparently in __getattr__ below.
# To make it work, we need to hide it first...
self.__fcd = self.__dict__['first_cdata']
del self.__dict__['first_cdata']
def textof(self):
'''Return the CDATA of this element.
Note: this is not recursive -- it only returns the CDATA immediately
within the element, excluding the CDATA in child elements.
'''
s = self.first_cdata #.getvalue()
for child in self.children:
s = s + child.following_cdata
return s
def find(self, name, ns=''):
for elem in self.children:
if elem.name == name and elem.ns == ns:
return elem
return None
def __getattr__(self, attr):
"""first_cdata used to be a string attribute, but
is now a StringIO object. Preserve the illusion
that it is still a string attribute.
"""
if attr == 'first_cdata':
return self.__fcd.getvalue()
else:
return self.__dict__[attr]
def _clean_tree(elem):
elem.parent = None
del elem.parent
map(_clean_tree, elem.children)
def _collect_recurse(elem, dict):
dict[elem.ns] = None
for ns, name in elem.attrs.keys():
dict[ns] = None
for child in elem.children:
_collect_recurse(child, dict)
def _collect_ns(elem):
"Collect all namespaces into a NAMESPACE -> PREFIX mapping."
d = { '' : None }
_collect_recurse(elem, d)
del d[''] # make sure we don't pick up no-namespace entries
keys = d.keys()
for i in range(len(keys)):
d[keys[i]] = i
return d
def _dump_recurse(f, elem, namespaces, dump_ns=0):
if elem.ns:
f.write('<ns%d:%s' % (namespaces[elem.ns], elem.name))
else:
f.write('<' + elem.name)
for (ns, name), value in elem.attrs.items():
if ns:
f.write(' ns%d:%s="%s"' % (namespaces[ns], name, value))
else:
f.write(' %s="%s"' % (name, value))
if dump_ns:
for ns, id in namespaces.items():
f.write(' xmlns:ns%d="%s"' % (id, ns))
if elem.children or elem.first_cdata:
f.write('>' + elem.first_cdata)
for child in elem.children:
_dump_recurse(f, child, namespaces)
f.write(child.following_cdata)
if elem.ns:
f.write('</ns%d:%s>' % (namespaces[elem.ns], elem.name))
else:
f.write('</%s>' % elem.name)
else:
f.write('/>')
--------------2165DD6D55CFA19313775CB7--