[XML-SIG] quick speed test

Sun, 28 Mar 1999 06:46:23 -0800

This is a multi-part message in MIME format.

--------------3BCDD7BF6676916C1CF1ED3C
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit

Hey gang,

I added some parsing in my DAV client for the server responses. My test
script then started running horribly slowly :-(

To do some quick performance testing, I whipped up the attached script.
The "Parser" class in there is essentially a direct translation of the C
code in mod_dav. It interfaces with Expat and handles xml:lang and
namespace processing. Of course, Python has different/better data
structures, so it is quite a bit simpler than the C equivalent.

My testing shows that the Parser class is about 12 times faster than
going through the DOM code. Some post-processing of the DOM adds another
50%. The post-processing does the namespace handling (no xml:lang
handling or handling of the reserved "xml" prefix). The post-process
*does* do some data extraction which I haven't written for the Parser
thing yet. I figure it would balance out to the Parser being about 15x
the DOM version.

Regardless of the obscure details, the main point is that this script
demonstrates a much faster mechanism for translating Expat output into a
useful tree-based structure, while also performing namespace processing
and miscellaneous XML conformance stuff.

There is also a sample function for dumping the output tree.

To get this to run on your system, you may need to drop the "import
davlib" from the top. It isn't really used. A couple other DAV remnants
are in there, but hey. Exercise for the reader :-)

I'm posting this mostly as an example or aid, in that somebody may find
it useful. It isn't intended to universally replace the DOM stuff.

Cheers,
-g

--
Greg Stein, http://www.lyra.org/

--------------3BCDD7BF6676916C1CF1ED3C
Content-Type: text/plain; charset=us-ascii; name="xmlperf.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="xmlperf.py"

#
# do performance tests on XML parsing variants
#

import xml.sax.saxexts
import xml.dom.sax_builder
import StringIO
import davlib
from xml.parsers import pyexpat
import string
import time

# Sample WebDAV multistatus response (six <response> entries, all in the
# "DAV:" default namespace).  Both use_parser() and use_expat() parse this
# same text, so the two timings are measured against identical input.
msr = '''\
<?xml version="1.0"?>
<multistatus xmlns="DAV:">
<response>
<href>/dav/foo.cgi</href>
<propstat>
<prop>
<creationdate>1999-03-16T20:06:16Z</creationdate>
<getcontentlength>17</getcontentlength>
<getlastmodified>Tue, 16 Mar 1999 20:06:16 GMT</getlastmodified>
<resourcetype/></prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/file1</href>
<propstat>
<prop>
<creationdate>1999-03-16T20:06:17Z</creationdate>
<getcontentlength>14</getcontentlength>
<getlastmodified>Tue, 16 Mar 1999 20:06:17 GMT</getlastmodified>
<resourcetype/></prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/testdata/</href>
<propstat>
<prop>
<creationdate>1999-03-16T20:06:18Z</creationdate>
<getlastmodified>Tue, 16 Mar 1999 20:06:18 GMT</getlastmodified>
<resourcetype><collection/></resourcetype>
</prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/newdir/</href>
<propstat>
<prop>
<creationdate>1999-03-28T12:28:29Z</creationdate>
<getlastmodified>Sun, 28 Mar 1999 12:28:29 GMT</getlastmodified>
<resourcetype><collection/></resourcetype>
</prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/foo/</href>
<propstat>
<prop>
<creationdate>1999-03-16T13:26:07Z</creationdate>
<getlastmodified>Tue, 16 Mar 1999 13:26:07 GMT</getlastmodified>
<resourcetype><collection/></resourcetype>
</prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/</href>
<propstat>
<prop>
<creationdate>1999-03-28T12:28:29Z</creationdate>
<getlastmodified>Sun, 28 Mar 1999 12:28:29 GMT</getlastmodified>
<resourcetype><collection/></resourcetype>
</prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
</multistatus>
'''

def use_parser():
  """Parse the sample `msr` document through the SAX-to-DOM builder.

  A SAX parser is created, a SaxBuilder is installed as its document
  handler, and the text of `msr` is fed to it from an in-memory file.

  Returns the DOM document built by the handler.
  """
  parser = xml.sax.saxexts.make_parser()
  handler = xml.dom.sax_builder.SaxBuilder()
  parser.setDocumentHandler(handler)
  parser.parseFile(StringIO.StringIO(msr))
  # An unreachable second return used to follow here; it wrapped the
  # document as davlib.MultiStatusResponse(handler.document).  Only the
  # plain DOM document has ever been returned.
  return handler.document

class blank:
  """Empty attribute holder; instances get their fields assigned ad hoc."""

# Namespace id given to names whose prefix starts with "xml" (reserved by
# XML).  It is negative, so it never collides with an index into the
# document's `namespaces` list.
DAV_NS_XML = -10

class Parser:
  """Build a lightweight element tree from Expat callbacks.

  parse() returns a document object with two attributes: `root`, the top
  element, and `namespaces`, a list of namespace URIs whose indexes serve
  as namespace ids ('DAV:' is always id 0).

  Each element carries `name` (prefix stripped, unless the prefix is
  reserved by XML), `ns_id`, `lang` (the xml:lang value, inherited from
  the parent), `attrs`, `children`, `first_cdata` (text before the first
  child) and `following_cdata` (text between this element's end tag and
  the next sibling or the parent's end tag).  Attributes carry `name`,
  `value` and `ns_id`.
  """

  def __init__(self):
    self.reset()

  def reset(self):
    """Throw away all parse state and start an empty document."""
    self.doc = doc = blank()
    doc.root = None
    doc.namespaces = [ 'DAV:' ]

    self.cur_elem = None          # innermost element still open
    self.no_namespace_id = None   # id of the '' namespace, once needed
    self.error = None             # message of the first namespace error

  def find_prefix(self, prefix):
    """Return the namespace id bound to `prefix` for the current element.

    The open elements are searched from the innermost outwards.  An
    unbound empty prefix (no default namespace declared) maps to the
    "no namespace" entry, which is added to the namespace list on first
    use.  Any other unbound prefix yields -1.
    """
    elem = self.cur_elem
    while elem is not None:
      if prefix in elem.ns_scope:
        return elem.ns_scope[prefix]
      elem = elem.parent

    if prefix == '':
      if self.no_namespace_id is None:
        self.no_namespace_id = len(self.doc.namespaces)
        self.doc.namespaces.append('')
      return self.no_namespace_id

    return -1

  def process_prefix(self, ob):
    """Set `ob.ns_id` from the prefix of `ob.name` and strip the prefix.

    `ob` is an element or an attribute.  An unknown prefix records an
    error message in `self.error`; parse() reports it.

    NOTE(review): this is applied to attributes as well, so an unprefixed
    attribute receives the default namespace's id.  Namespaces in XML says
    default declarations do not apply to attribute names -- confirm before
    relying on attribute ns_id values.
    """
    name = ob.name
    idx = name.find(':')
    if idx == -1:
      ob.ns_id = self.find_prefix('')
    elif name[:3].lower() == 'xml':
      ob.ns_id = DAV_NS_XML       # name is reserved by XML
    else:
      ob.ns_id = self.find_prefix(name[:idx])
      ob.name = name[idx+1:]

      if ob.ns_id == -1:
        self.error = 'namespace prefix not found'

  def start(self, name, attrs):
    """Expat start-tag callback: create the element and link it in.

    `attrs` may be the flat [name, value, name, value, ...] list or a
    {name: value} mapping; both shapes are accepted because pyexpat
    versions differ in which one they pass.
    """
    if self.error:
      return

    elem = blank()
    elem.name = name
    elem.lang = None
    elem.parent = None
    elem.children = [ ]
    elem.ns_scope = { }
    elem.attrs = [ ]
    elem.first_cdata = ''
    elem.following_cdata = ''

    if self.cur_elem is not None:
      elem.parent = self.cur_elem
      elem.parent.children.append(elem)
      self.cur_elem = elem
    else:
      self.cur_elem = self.doc.root = elem

    # normalize both attribute shapes to a list of (name, value) pairs
    if hasattr(attrs, 'items'):
      pairs = list(attrs.items())
    else:
      pairs = [ ]
      for i in range(0, len(attrs), 2):
        pairs.append((attrs[i], attrs[i+1]))

    # scan for namespace declarations
    namespaces = self.doc.namespaces
    for attr_name, value in pairs:
      if attr_name == 'xmlns' or attr_name[:6] == 'xmlns:':
        # 'xmlns'[6:] is '', which is the key used for the default namespace
        prefix = attr_name[6:]
        try:
          ns_id = namespaces.index(value)
        except ValueError:
          ns_id = len(namespaces)
          namespaces.append(value)
        elem.ns_scope[prefix] = ns_id
      elif attr_name == 'xml:lang':
        elem.lang = value
      else:
        attr = blank()
        attr.name = attr_name
        attr.value = value
        elem.attrs.append(attr)

    # inherit xml:lang from parent
    if elem.lang is None and elem.parent is not None:
      elem.lang = elem.parent.lang

    # process prefix of the element name
    self.process_prefix(elem)

    # process attributes' namespace prefixes (an explicit loop: map() is
    # lazy on some Python versions and would never run the calls)
    for attr in elem.attrs:
      self.process_prefix(attr)

  def end(self, name):
    """Expat end-tag callback: close the current element."""
    if self.error:
      return

    closed = self.cur_elem
    parent = closed.parent

    # parse-time bookkeeping is not part of the finished tree; dropping
    # `parent` also removes the child -> parent back reference
    del closed.ns_scope
    del closed.parent

    self.cur_elem = parent

  def cdata(self, data):
    """Expat character-data callback: append `data` to the right slot."""
    if self.error:
      return
    elem = self.cur_elem
    if elem.children:
      last = elem.children[-1]
      last.following_cdata = last.following_cdata + data
    else:
      elem.first_cdata = elem.first_cdata + data

  def parse(self, s):
    """Parse the complete XML text `s` and return the document object.

    The parser is reset afterwards, whether or not parsing succeeded, so
    one instance can be reused for any number of documents.

    Raises ValueError if Expat reports failure through its return value
    or if an element or attribute uses an undeclared namespace prefix.
    Exceptions raised by pyexpat itself propagate unchanged.
    """
    p = pyexpat.ParserCreate()
    p.StartElementHandler = self.start
    p.EndElementHandler = self.end
    p.CharacterDataHandler = self.cdata
    try:
      rv = p.Parse(s, 1)
      doc = self.doc
      error = self.error
    finally:
      self.reset()
    if rv == 0:
      raise ValueError('expat parsing error')
    if error:
      raise ValueError(error)
    return doc

def use_expat():
  """Parse the sample `msr` document with a fresh Parser; return its tree."""
  return Parser().parse(msr)

def _dump_escape(text, quote=0):
  """Return `text` with XML markup characters replaced by entities.

  With `quote` true, double quotes are escaped too, so the result can sit
  inside a double-quoted attribute value.
  """
  # '&' must be replaced first or the other entities would be re-escaped
  text = text.replace('&', '&amp;')
  text = text.replace('<', '&lt;')
  text = text.replace('>', '&gt;')
  if quote:
    text = text.replace('"', '&quot;')
  return text

def _dump_qname(doc, ob):
  """Return the name to write for element or attribute `ob`.

  Names reserved by XML and names in the "no namespace" entry are written
  without a generated prefix; everything else becomes ns<id>:<name>.
  """
  if ob.ns_id == DAV_NS_XML:
    return ob.name
  if ob.ns_id >= 0 and doc.namespaces[ob.ns_id] == '':
    return ob.name
  return 'ns%d:%s' % (ob.ns_id, ob.name)

def dump(f, doc, elem=None, dump_ns=0):
  """Write the tree held by `doc` to `f` as XML text.

  f       -- object with a write() method
  doc     -- document object with `root` and `namespaces`
  elem    -- element to write; None means "write the whole document",
             starting with the XML declaration
  dump_ns -- if true, declare every namespace of `doc` on this element
             as xmlns:ns<id> (done automatically for the root)

  Character data and attribute values are entity-escaped.  The empty
  "no namespace" entry is never declared, because a prefix cannot be
  bound to an empty namespace name; such names are written unprefixed.
  A document without a root produces only the XML declaration.
  """
  if elem is None:
    f.write('<?xml version="1.0"?>\n')
    # guard: recursing with a None root would re-enter this branch forever
    if doc.root is not None:
      dump(f, doc, doc.root, 1)
    return

  tag = _dump_qname(doc, elem)
  f.write('<' + tag)
  for attr in elem.attrs:
    f.write(' %s="%s"' % (_dump_qname(doc, attr),
                          _dump_escape(attr.value, 1)))
  if dump_ns:
    for i in range(len(doc.namespaces)):
      uri = doc.namespaces[i]
      if uri != '':
        f.write(' xmlns:ns%d="%s"' % (i, _dump_escape(uri, 1)))
  if elem.children or elem.first_cdata:
    f.write('>' + _dump_escape(elem.first_cdata))
    for child in elem.children:
      dump(f, doc, child)
      f.write(_dump_escape(child.following_cdata))
    f.write('</%s>' % tag)
  else:
    f.write('/>')

def timing(n1=10, n2=200):
  """Time both tree builders and print total and per-call seconds.

  n1 -- number of use_parser() runs (SAX parser feeding the DOM builder)
  n2 -- number of use_expat() runs (the Parser class)

  One line is printed per variant, use_parser() first.  A count of zero
  is allowed: the per-call figure is then divided by 1 rather than
  raising ZeroDivisionError.
  """
  # build the iteration sequences up front so their cost is not timed
  l1 = range(n1)
  l2 = range(n2)

  t = time.time()
  for i in l1:
    use_parser()
  t1 = time.time() - t
  # parenthesized so the line is valid both as a print statement and as a
  # print() call
  print("time=%.4f  each=%.4f" % (t1, t1 / (n1 or 1)))

  t = time.time()
  for i in l2:
    use_expat()
  t2 = time.time() - t
  print("time=%.4f  each=%.4f" % (t2, t2 / (n2 or 1)))

--------------3BCDD7BF6676916C1CF1ED3C--