[XML-SIG] speed question re DOM parsing

Mon, 05 Jun 2000 18:03:55 -0600

This is a multi-part message in MIME format.
--------------2165DD6D55CFA19313775CB7
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Lars Marius Garshol wrote:
> 
> * Bjorn Pettersen
> |
> | Question: does using StringIO (or perhaps array) and __getattr__
> | sound like the right thing to do?
> 
> StringIO sounds like the right thing, at least for that particular
> document. Probably it wouldn't be too bad for the other documents
> either, but I have no experience with its performance.
> 
> I'm afraid I don't have the necessary context to answer the
> __getattr__ questions, but: I would definitely like to see your
> sources. If you could post them somewhere, I, at least, would be happy
> to have a look at them.

I've included the patched file as an attachment.  My changes are
confined to:

  - importing (c)StringIO at the top
  - changing the constructor call to _element (line 82) to pass
    a StringIO object rather than an empty string.
  - hiding the "first_cdata" member in the __init__ method of _element
  - adding a __getattr__ method to _element.

With limited performance testing I got:

  File Size  Original  Patched
    37K        0.14s    0.07s
   968K      103.77s    1.68s 

-- bjorn
--------------2165DD6D55CFA19313775CB7
Content-Type: text/plain; charset=us-ascii;
 name="qp_xml.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
 filename="qp_xml.py"

#
# qp_xml: Quick Parsing for XML
#
# Written by Greg Stein. Public Domain.
# No Copyright, no Rights Reserved, and no Warranties.
#
# This module is maintained by Greg and is available as part of the XML-SIG
# distribution. This module and its changelog can be fetched at:
#    http://www.lyra.org/cgi-bin/viewcvs.cgi/xml/xml/utils/qp_xml.py
#
# Additional information can be found on Greg's Python page at:
#    http://www.lyra.org/greg/python/
#
# This module was added to the XML-SIG distribution on February 14, 2000.
# As part of that distribution, it falls under the XML distribution license.
#

import string

try:
	import cStringIO
	_StringIO = cStringIO
except ImportError:
	import StringIO
	_StringIO = StringIO

try:
  import pyexpat
except ImportError:
  from xml.parsers import pyexpat

# Exception raised by Parser.parse() for expat failures and for unresolved
# namespace prefixes.  It is a plain string built from this module's name,
# i.e. an old-style string exception: catch it by referring to this very
# object (<module>.error), not by comparing against a look-alike string.
error = __name__ + '.error'

#
# The parsing class. Instantiate and pass a string/file to .parse()
#
class Parser:
  def __init__(self):
    self.reset()

  def reset(self):
    self.root = None
    self.cur_elem = None
    self.error = None

  def find_prefix(self, prefix):
    elem = self.cur_elem
    while elem:
      if elem.ns_scope.has_key(prefix):
        return elem.ns_scope[prefix]
      elem = elem.parent

    if prefix == '':
      return ''		# empty URL for "no namespace"

    return None

  def process_prefix(self, name, use_default):
    idx = string.find(name, ':')
    if idx == -1:
      if use_default:
        return self.find_prefix(''), name
      return '', name	# no namespace

    if string.lower(name[:3]) == 'xml':
      return '', name	# name is reserved by XML. don't break out a NS.

    ns = self.find_prefix(name[:idx])
    if ns is None:
      self.error = 'namespace prefix not found'

    return ns, name[idx+1:]

  def start(self, name, attrs):
    if self.error:
      return

    elem = _element(name=name, lang=None, parent=None,
                    children=[], ns_scope={}, attrs={},
                    first_cdata=_StringIO.StringIO(), following_cdata='')

    if self.cur_elem:
      elem.parent = self.cur_elem
      elem.parent.children.append(elem)
      self.cur_elem = elem
    else:
      self.cur_elem = self.root = elem

    work_attrs = [ ]

    # scan for namespace declarations (and xml:lang while we're at it)
    for i in range(0, len(attrs), 2):
      name = attrs[i]
      value = attrs[i+1]

      if name == 'xmlns':
        elem.ns_scope[''] = value
      elif name[:6] == 'xmlns:':
        elem.ns_scope[name[6:]] = value
      elif name == 'xml:lang':
        elem.lang = value
      else:
        work_attrs.append((name, value))

    # inherit xml:lang from parent
    if elem.lang is None and elem.parent:
      elem.lang = elem.parent.lang

    # process prefix of the element name
    elem.ns, elem.name = self.process_prefix(elem.name, 1)

    # process attributes' namespace prefixes
    for name, value in work_attrs:
      elem.attrs[self.process_prefix(name, 0)] = value

  def end(self, name):
    if self.error:
      return

    parent = self.cur_elem.parent

    del self.cur_elem.ns_scope
    del self.cur_elem.parent

    self.cur_elem = parent

  def cdata(self, data):
    if self.error:
      return
    elem = self.cur_elem
    if elem.children:
      last = elem.children[-1]
      last.following_cdata = last.following_cdata + data
    else:
      # this branch taken ~3 times more than true branch
      elem.first_cdata.write(data)
      #elem.first_cdata = elem.first_cdata + data

  def parse(self, input):
    self.reset()

    p = pyexpat.ParserCreate()
    p.StartElementHandler = self.start
    p.EndElementHandler = self.end
    p.CharacterDataHandler = self.cdata

    exception = None
    try:
      if type(input) == type(''):
        try:
          p.Parse(input, 1)
        except pyexpat.error, exception:
          pass
      else:
        while 1:
          s = input.read(_BLOCKSIZE)
          if not s:
            try:
              p.Parse('', 1)
            except pyexpat.error, exception:
              pass
            break

          try:
            rv = p.Parse(s, 0)
          except pyexpat.error, exception:
            pass
          if exception or self.error:
            break

      if exception:
        s = pyexpat.ErrorString(p.ErrorCode)
        raise error, 'expat parsing error: ' + exception
      if self.error:
        raise error, self.error
    finally:
      if self.root:
        _clean_tree(self.root)

    print 'self.root', self.root

    return self.root

#
# handy function for dumping a tree that is returned by Parser
#
def dump(f, root):
  """Write the tree rooted at ROOT to F as an XML document.

  F only needs a write() method.  The XML declaration comes first, then
  the serialized tree (namespace declarations are emitted on the root
  element), then a final newline.
  """
  f.write('<?xml version="1.0"?>\n')
  # The trailing 1 asks for the xmlns:nsN declarations on the root element.
  _dump_recurse(f, root, _collect_ns(root), 1)
  f.write('\n')

#
# This function returns the element's CDATA. Note: this is not recursive --
# it only returns the CDATA immediately within the element, excluding the
# CDATA in child elements.
#
def textof(elem):
  """Return the CDATA directly inside ELEM.

  Simply delegates to the element's own textof() method, so text that
  belongs to child elements is not included.
  """
  text = elem.textof()
  return text

#########################################################################
#
# private stuff for qp_xml
#

# Size handed to input.read() on each iteration when Parser.parse() is given
# a file-like object rather than a string.
_BLOCKSIZE = 1024 * 16	# chunk size for parsing input

class _element:
  """A single node of the tree produced by Parser.

  Every keyword argument given to the constructor becomes an instance
  attribute, with one exception: 'first_cdata' must be a buffer object
  offering getvalue() (a StringIO).  The buffer is stored privately and
  the public attribute 'first_cdata' reads back as the buffer's current
  string contents.
  """

  def __init__(self, **kw):
    self.__dict__.update(kw)

    # Move the buffer out of the instance dictionary so that lookups of
    # 'first_cdata' fall through to __getattr__ below, which converts the
    # buffer to a string on demand.
    self.__fcd = self.__dict__['first_cdata']
    del self.__dict__['first_cdata']

  def textof(self):
    '''Return the CDATA of this element.

    Note: this is not recursive -- it only returns the CDATA immediately
    within the element, excluding the CDATA in child elements.
    '''
    s = self.first_cdata
    for child in self.children:
      s = s + child.following_cdata
    return s

  def find(self, name, ns=''):
    """Return the first direct child called NAME in namespace NS.

    Returns None when no child matches.
    """
    for elem in self.children:
      if elem.name == name and elem.ns == ns:
        return elem
    return None

  def __getattr__(self, attr):
    """first_cdata used to be a string attribute, but
       is now a StringIO object. Preserve the illusion
       that it is still a string attribute.

       Only called for names not found by normal lookup.  Anything other
       than 'first_cdata' is reported with AttributeError, which is what
       hasattr(), getattr() with a default, and the interpreter's own
       probing for optional special methods rely on.
    """
    if attr == 'first_cdata':
      return self.__fcd.getvalue()
    raise AttributeError(attr)

def _clean_tree(elem):
  """Remove the 'parent' attribute from ELEM and from all its descendants.

  Works whether or not an element still has the attribute, so it can be
  run on a tree whose parse was abandoned part-way through.
  """
  # Assign first so the del below cannot fail on an element whose
  # 'parent' attribute has already been removed.
  elem.parent = None
  del elem.parent
  # An explicit loop: the recursion is wanted for its side effect only and
  # must run eagerly regardless of how map() evaluates.
  for child in elem.children:
    _clean_tree(child)

def _collect_recurse(elem, dict):
  """Record every namespace used at or below ELEM as a key of DICT.

  The namespace of each element and of each attribute (attribute keys are
  (namespace, name) pairs) is entered with the value None.
  """
  dict[elem.ns] = None
  for attr_key in elem.attrs.keys():
    attr_ns, attr_name = attr_key
    dict[attr_ns] = None
  for kid in elem.children:
    _collect_recurse(kid, dict)

def _collect_ns(elem):
  """Return a mapping from each namespace used under ELEM to a number.

  The numbers run from 0 upwards in the order the mapping lists its keys;
  the empty ("no namespace") entry is never part of the result.
  """
  mapping = { '' : None }   # seeded so the del below always succeeds
  _collect_recurse(elem, mapping)
  del mapping['']           # make sure we don't pick up no-namespace entries
  counter = 0
  for uri in mapping.keys():
    mapping[uri] = counter
    counter = counter + 1
  return mapping

def _dump_recurse(f, elem, namespaces, dump_ns=0):
  """Serialize ELEM and everything below it to F.

  namespaces maps namespace URI -> number; names in a namespace are
  written with the prefix 'ns<number>:'.  When dump_ns is true the
  xmlns:ns<number> declarations for the whole mapping are written on this
  element (recursive calls never pass it).

  NOTE(review): names, attribute values and text are written exactly as
  stored; nothing here escapes markup characters -- confirm that callers
  do not need escaping.
  """
  # Work out the (possibly prefixed) tag once; it is needed for both the
  # opening and the closing tag.
  if elem.ns:
    tag = 'ns%d:%s' % (namespaces[elem.ns], elem.name)
  else:
    tag = elem.name
  f.write('<' + tag)

  for attr_key, attr_value in elem.attrs.items():
    attr_ns, attr_name = attr_key
    if attr_ns:
      f.write(' ns%d:%s="%s"' % (namespaces[attr_ns], attr_name, attr_value))
    else:
      f.write(' %s="%s"' % (attr_name, attr_value))

  if dump_ns:
    for uri, number in namespaces.items():
      f.write(' xmlns:ns%d="%s"' % (number, uri))

  leading_text = elem.first_cdata
  if not (elem.children or leading_text):
    # nothing inside: emit the short, self-closing form
    f.write('/>')
    return

  f.write('>' + leading_text)
  for kid in elem.children:
    _dump_recurse(f, kid, namespaces)
    f.write(kid.following_cdata)
  f.write('</' + tag + '>')

--------------2165DD6D55CFA19313775CB7--