[XML-SIG] updated "quick parser" ... qp_xml.py

Greg Stein gstein@lyra.org
Wed, 31 Mar 1999 18:51:12 -0800


This is a multi-part message in MIME format.

--------------6625E7BC3519C1A232100425
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Hey there...

From that speed test thing that I posted a few days ago, I extracted an
actual module. At the same time, I also simplified some of the namespace
stuff and corrected several bugs w.r.t. default namespaces.

As I mentioned, this guy is about 12x faster than using the DOM to
parse XML (when both are using pyexpat).

It also handles namespaces and xml:lang properly.

Comments/patches are encouraged.

thx
-g

--
Greg Stein, http://www.lyra.org/

--------------6625E7BC3519C1A232100425
Content-Type: text/plain; charset=us-ascii; name="qp_xml.py"
Content-Disposition: inline; filename="qp_xml.py"
Content-Transfer-Encoding: 7bit

#
# qp_xml: Quick Parsing for XML
#

import string

try:
  import pyexpat
except ImportError:
  from xml.parsers import pyexpat

# Exception raised by Parser.parse().  It is a string exception: the object
# raised (and the one callers name in their except clause) is this string,
# built from the module's name.
error = __name__ + '.error'


#
# The parsing class. Instantiate and pass a string/file to .parse()
#
class Parser:
  """Build a tree of lightweight element objects from XML using pyexpat.

  Instantiate, then pass a string or a file-like object to parse().  Each
  element of the returned tree carries these attributes:

    name            -- element name; a namespace prefix, if any, has been
                       stripped (prefixed names starting with 'xml' are
                       left untouched)
    ns              -- namespace URI; '' means "no namespace"
    lang            -- xml:lang value in effect for the element, or None
    attrs           -- list of attribute objects with name, value and ns;
                       xmlns, xmlns:* and xml:lang are consumed by the
                       parser and do not appear here
    children        -- list of child elements, in document order
    first_cdata     -- character data seen before the first child
    following_cdata -- character data seen after this element and before
                       its next sibling (or the end of its parent)

  While an element is still open it also has `parent` and `ns_scope`
  attributes; both are removed once the element has ended.
  """

  def __init__(self):
    self.reset()

  def reset(self):
    "Discard the tree and error state left by any previous parse."
    self.root = None
    self.cur_elem = None
    self.error = None

  def find_prefix(self, prefix):
    """Return the namespace URI bound to PREFIX at the current element.

    Declarations are searched from the current element outward through
    its ancestors.  An undeclared default prefix ('') yields '', meaning
    "no namespace"; any other undeclared prefix yields None.
    """
    elem = self.cur_elem
    while elem is not None:
      if elem.ns_scope.has_key(prefix):
        return elem.ns_scope[prefix]
      elem = elem.parent

    if prefix == '':
      return ''  # empty URL for "no namespace"

    return None

  def process_prefix(self, ob, use_default):
    """Set OB.ns from the prefix in OB.name, stripping the prefix.

    OB is an element or attribute object.  USE_DEFAULT tells whether an
    unprefixed name takes the default namespace (the parser passes 1 for
    elements and 0 for attributes).  If the prefix has no declaration in
    scope, OB.ns is left as None and self.error is set.
    """
    idx = string.find(ob.name, ':')
    if idx == -1:
      if use_default:
        ob.ns = self.find_prefix('')
      else:
        ob.ns = ''  # no namespace
    elif string.lower(ob.name[:3]) == 'xml':
      ob.ns = ''  # name is reserved by XML. don't break out a NS.
    else:
      ob.ns = self.find_prefix(ob.name[:idx])
      ob.name = ob.name[idx+1:]

      if ob.ns is None:
        self.error = 'namespace prefix not found'

  def start(self, name, attrs):
    """Start-element handler: create the element and link it into the tree.

    ATTRS is read as a flat sequence of alternating attribute names and
    values.
    """
    if self.error:
      return

    elem = _element(name=name, lang=None, parent=None,
                    children=[], ns_scope={}, attrs=[],
                    first_cdata='', following_cdata='')

    if self.cur_elem is not None:
      elem.parent = self.cur_elem
      elem.parent.children.append(elem)
      self.cur_elem = elem
    else:
      self.cur_elem = self.root = elem

    # Pull namespace declarations and xml:lang out of the attribute list;
    # everything else becomes an attribute object.  This has to finish
    # before any prefix is resolved, because a declaration may appear
    # after the attribute that uses it.
    for i in range(0, len(attrs), 2):
      attr_name = attrs[i]
      value = attrs[i+1]

      if attr_name == 'xmlns':
        elem.ns_scope[''] = value
      elif attr_name[:6] == 'xmlns:':
        elem.ns_scope[attr_name[6:]] = value
      elif attr_name == 'xml:lang':
        elem.lang = value
      else:
        elem.attrs.append(_attribute(name=attr_name, value=value))

    # inherit xml:lang from parent
    if elem.lang is None and elem.parent is not None:
      elem.lang = elem.parent.lang

    # process prefix of the element name
    self.process_prefix(elem, 1)

    # process attributes' namespace prefixes, stopping at the first
    # failure: the tree is abandoned once self.error is set
    for attr in elem.attrs:
      if self.error:
        break
      self.process_prefix(attr, 0)

  def end(self, name):
    "End-element handler: close the current element and step to its parent."
    if self.error:
      return

    parent = self.cur_elem.parent

    # The finished element needs neither its namespace scope nor the link
    # back to its parent any more.
    del self.cur_elem.ns_scope
    del self.cur_elem.parent

    self.cur_elem = parent

  def cdata(self, data):
    """Character-data handler: attach DATA to the current element.

    Text arriving before any child goes to the element's first_cdata;
    text arriving after a child goes to that child's following_cdata.
    """
    if self.error:
      return
    elem = self.cur_elem
    if elem.children:
      last = elem.children[-1]
      last.following_cdata = last.following_cdata + data
    else:
      elem.first_cdata = elem.first_cdata + data

  def parse(self, input):
    """Parse INPUT and return the root element of the resulting tree.

    INPUT is either a string holding the whole document, or an object
    with a read(size) method, which is read _BLOCKSIZE at a time.

    Raises the module's `error` exception when expat reports a failure or
    when a namespace prefix cannot be resolved.
    """
    self.reset()

    p = pyexpat.ParserCreate()
    p.StartElementHandler = self.start
    p.EndElementHandler = self.end
    p.CharacterDataHandler = self.cdata

    try:
      if type(input) == type(''):
        rv = p.Parse(input, 1)
      else:
        while 1:
          s = input.read(_BLOCKSIZE)
          if not s:
            rv = p.Parse('', 1)
            break
          rv = p.Parse(s, 0)
          if rv == 0 or self.error:
            break

      if rv == 0:
        s = pyexpat.ErrorString(p.ErrorCode)
        raise error, 'expat parsing error: ' + s
      if self.error:
        raise error, self.error
    finally:
      # Parsing can fail before any element has been seen (empty input,
      # for instance), leaving self.root as None.  Cleaning up None would
      # raise an exception of its own and hide the real parse error.
      if self.root is not None:
        _clean_tree(self.root)

    return self.root


#
# handy function for dumping a tree that is returned by Parser
#
def dump(f, root):
  """Write the tree under ROOT to F as an XML document.

  F needs only a write(string) method.  The output is an XML declaration
  line, the serialized tree with its namespace prefixes declared on the
  outermost element, and a final newline.
  """
  f.write('<?xml version="1.0"?>\n')
  # dump_ns is switched on for the outermost element only
  _dump_recurse(f, root, _collect_ns(root), dump_ns=1)
  f.write('\n')


#
# This function returns the element's CDATA. Note: this is not recursive --
# it only returns the CDATA immediately within the element, excluding the
# CDATA in child elements.
#
def textof(elem):
  """Return the character data sitting directly inside ELEM.

  That is the text before the first child followed by the text after
  each child, in order.  Text inside the children themselves is not
  included.
  """
  text = elem.first_cdata
  for node in elem.children:
    text = text + node.following_cdata
  return text


#########################################################################
#
# private stuff for qp_xml
#

# Size argument passed to input.read() on each call when Parser.parse() is
# given a file-like object rather than a string.
_BLOCKSIZE = 16384	# chunk size for parsing input

class _blank:
  "Bare attribute holder: every keyword argument becomes an attribute."
  def __init__(self, **kw):
    for key in kw.keys():
      self.__dict__[key] = kw[key]

class _element(_blank):
  "One XML element of the tree built by Parser."
  pass

class _attribute(_blank):
  "One attribute of an element."
  pass

def _clean_tree(elem):
  """Remove the `parent` attribute from ELEM and from all its descendants.

  Dropping the child -> parent links breaks the reference cycles between
  a node and the entries of its parent's `children` list.

  ELEM may be None (no tree was built); nothing is done in that case.
  """
  if elem is None:
    return

  # An explicit work list is used instead of recursion, so that every node
  # is visited regardless of how deep the tree is.
  pending = [elem]
  while pending:
    node = pending[-1]
    del pending[-1]

    # Assign before deleting, so the del cannot fail on a node whose
    # `parent` attribute has already been removed.
    node.parent = None
    del node.parent

    for child in node.children:
      pending.append(child)


def _collect_recurse(elem, dict):
  """Record every namespace used by ELEM, its attributes and descendants.

  Each namespace found becomes a key of DICT with the value None.  Nodes
  are visited parent first, then children in document order.
  """
  pending = [elem]
  while pending:
    node = pending[-1]
    del pending[-1]

    dict[node.ns] = None
    for attr in node.attrs:
      dict[attr.ns] = None

    # push the children back to front, so the first child is taken next
    kids = list(node.children)
    kids.reverse()
    for kid in kids:
      pending.append(kid)

def _collect_ns(elem):
  """Return a dictionary mapping each namespace used under ELEM to a number.

  The numbers count up from 0 in the order the dictionary lists its keys.
  The empty "no namespace" entry is never part of the result.
  """
  found = { '' : None }
  _collect_recurse(elem, found)
  del found['']  # make sure we don't pick up no-namespace entries

  number = 0
  for ns in found.keys():
    found[ns] = number
    number = number + 1
  return found

def _escape(text, quote=0):
  """Return TEXT with XML markup characters replaced by entity references.

  '&', '<' and '>' are always replaced.  '"' is replaced too when QUOTE is
  true, for text that will sit inside a double-quoted attribute value.
  """
  out = ''
  for ch in text:
    if ch == '&':
      ch = '&amp;'
    elif ch == '<':
      ch = '&lt;'
    elif ch == '>':
      ch = '&gt;'
    elif quote and ch == '"':
      ch = '&quot;'
    out = out + ch
  return out

def _dump_recurse(f, elem, namespaces, dump_ns=0, lang=None):
  """Write ELEM and everything below it to F as XML text.

  F          -- object with a write(string) method
  ELEM       -- the element to serialize
  NAMESPACES -- mapping of namespace URI -> integer; a name in a namespace
                is written with the prefix 'ns<integer>'
  DUMP_NS    -- when true, declare every prefix of NAMESPACES on this
                element's start tag (wanted on the outermost element only)
  LANG       -- xml:lang value already in effect from the enclosing
                element, or None; an xml:lang attribute is written only
                where the element's own value differs from it

  Character data, attribute values and namespace URIs are escaped on the
  way out, so text containing '&', '<' or '"' still yields well-formed XML.
  """
  if elem.ns:
    tag = 'ns%d:%s' % (namespaces[elem.ns], elem.name)
  else:
    tag = elem.name
  f.write('<' + tag)

  for attr in elem.attrs:
    value = _escape(attr.value, 1)
    if attr.ns:
      f.write(' ns%d:%s="%s"' % (namespaces[attr.ns], attr.name, value))
    else:
      f.write(' %s="%s"' % (attr.name, value))

  # Elements that were not built by Parser may carry no `lang` attribute;
  # treat that the same as "no language set".
  try:
    elem_lang = elem.lang
  except AttributeError:
    elem_lang = None
  if elem_lang is None:
    elem_lang = lang
  elif elem_lang != lang:
    f.write(' xml:lang="%s"' % _escape(elem_lang, 1))

  if dump_ns:
    for uri, number in namespaces.items():
      f.write(' xmlns:ns%d="%s"' % (number, _escape(uri, 1)))

  if elem.children or elem.first_cdata:
    f.write('>' + _escape(elem.first_cdata))
    for child in elem.children:
      _dump_recurse(f, child, namespaces, 0, elem_lang)
      f.write(_escape(child.following_cdata))
    f.write('</' + tag + '>')
  else:
    f.write('/>')

--------------6625E7BC3519C1A232100425--