[XML-SIG] quick speed test
Greg Stein
gstein@lyra.org
Sun, 28 Mar 1999 06:46:23 -0800
This is a multi-part message in MIME format.
--------------3BCDD7BF6676916C1CF1ED3C
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Hey gang,
I added some parsing in my DAV client for the server responses. My test
script then started running horribly slow :-(
To do some quick performance testing, I whipped up the attached script.
The "Parser" class in there is essentially a direct translation of the C
code in mod_dav. It interfaces with Expat and handles xml:lang and
namespace processing. Of course, Python has different/better data
structures, so it is quite a bit simpler than the C equivalent.
My testing shows that the Parser class is about 12 times faster than
going thru the DOM code. Some post-processing of the DOM adds another
50%. The post-processing does the namespace handling (no xml:lang
handling or handling of the reserved "xml" prefix). The post-process
*does* do some data extraction which I haven't written for the Parser
thing yet. I figure it would balance out to the Parser being about 15x
faster than the DOM version.
Regardless of the obscure details, the main point is that this script
demonstrates a much faster mechanism for translating Expat output into a
useful tree-based structure, while also performing namespace processing
and miscellaneous XML conformance stuff.
There is also a sample function for dumping the output tree.
To get this to run on your system, you may need to drop the "import
davlib" from the top. It isn't really used. A couple of other DAV remnants
are in there, but hey. Exercise for the reader :-)
I'm posting this mostly as an example or aid, in that somebody may find
it useful. It isn't intended to universally replace the DOM stuff.
Cheers,
-g
--
Greg Stein, http://www.lyra.org/
--------------3BCDD7BF6676916C1CF1ED3C
Content-Type: text/plain; charset=us-ascii; name="xmlperf.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline; filename="xmlperf.py"
#
# do performance tests on XML parsing variants
#
import xml.sax.saxexts
import xml.dom.sax_builder
import StringIO
import davlib
from xml.parsers import pyexpat
import string
import time
# Sample WebDAV multistatus document (six <response> entries in the "DAV:"
# default namespace).  It is the fixed input parsed by both use_parser()
# and use_expat(), so the two code paths are timed on identical data.
msr = '''\
<?xml version="1.0"?>
<multistatus xmlns="DAV:">
<response>
<href>/dav/foo.cgi</href>
<propstat>
<prop>
<creationdate>1999-03-16T20:06:16Z</creationdate>
<getcontentlength>17</getcontentlength>
<getlastmodified>Tue, 16 Mar 1999 20:06:16 GMT</getlastmodified>
<resourcetype/></prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/file1</href>
<propstat>
<prop>
<creationdate>1999-03-16T20:06:17Z</creationdate>
<getcontentlength>14</getcontentlength>
<getlastmodified>Tue, 16 Mar 1999 20:06:17 GMT</getlastmodified>
<resourcetype/></prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/testdata/</href>
<propstat>
<prop>
<creationdate>1999-03-16T20:06:18Z</creationdate>
<getlastmodified>Tue, 16 Mar 1999 20:06:18 GMT</getlastmodified>
<resourcetype><collection/></resourcetype>
</prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/newdir/</href>
<propstat>
<prop>
<creationdate>1999-03-28T12:28:29Z</creationdate>
<getlastmodified>Sun, 28 Mar 1999 12:28:29 GMT</getlastmodified>
<resourcetype><collection/></resourcetype>
</prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/foo/</href>
<propstat>
<prop>
<creationdate>1999-03-16T13:26:07Z</creationdate>
<getlastmodified>Tue, 16 Mar 1999 13:26:07 GMT</getlastmodified>
<resourcetype><collection/></resourcetype>
</prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
<response>
<href>/dav/</href>
<propstat>
<prop>
<creationdate>1999-03-28T12:28:29Z</creationdate>
<getlastmodified>Sun, 28 Mar 1999 12:28:29 GMT</getlastmodified>
<resourcetype><collection/></resourcetype>
</prop>
<status>HTTP/1.1 200 OK</status>
</propstat>
</response>
</multistatus>
'''
def use_parser():
    """Parse the sample document through the SAX driver into a DOM tree.

    A SAX parser obtained from xml.sax.saxexts.make_parser() feeds a
    xml.dom.sax_builder.SaxBuilder document handler; the builder's
    resulting ``document`` is returned.

    The original body carried a second, unreachable
    ``return davlib.MultiStatusResponse(handler.document)`` after the first
    return.  It could never execute and has been removed; wrap the returned
    document in that call yourself if the DAV wrapper should be part of the
    measured work.
    """
    parser = xml.sax.saxexts.make_parser()
    handler = xml.dom.sax_builder.SaxBuilder()
    parser.setDocumentHandler(handler)
    parser.parseFile(StringIO.StringIO(msr))
    return handler.document
class blank:
    """Empty attribute container; callers attach whatever fields they need."""
# Sentinel namespace id given to names that carry the reserved "xml" prefix.
# Real namespace ids are indexes into doc.namespaces (so >= 0) and -1 means
# "prefix not found", hence a distinct negative value here.
DAV_NS_XML = -10
class Parser:
    """Build a lightweight element tree directly from Expat callbacks.

    The resulting document object has two attributes:

      root        the top-level element (None until one has been seen)
      namespaces  list of namespace URIs; an element's or attribute's
                  ``ns_id`` is an index into this list ('DAV:' is always 0)

    Each element carries ``name`` (prefix stripped unless it is a reserved
    xml name), ``ns_id``, ``lang`` (from xml:lang, inherited from the parent),
    ``attrs`` (objects with name/value/ns_id), ``children``, ``first_cdata``
    (text before the first child) and ``following_cdata`` (text after the
    element's end tag, up to the next sibling or the parent's end tag).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Drop all parse state and start over with an empty document."""
        self.doc = doc = blank()
        doc.root = None
        doc.namespaces = ['DAV:']
        self.cur_elem = None
        self.no_namespace_id = None
        self.error = None

    def find_prefix(self, prefix):
        """Return the namespace id bound to *prefix* in the current scope.

        Scopes are searched from the current element outward.  An unbound
        empty prefix maps to a lazily created "no namespace" entry (the
        empty string in doc.namespaces); any other unbound prefix gives -1.
        """
        elem = self.cur_elem
        while elem is not None:
            if prefix in elem.ns_scope:
                return elem.ns_scope[prefix]
            elem = elem.parent

        if prefix == '':
            if self.no_namespace_id is None:
                self.no_namespace_id = len(self.doc.namespaces)
                self.doc.namespaces.append('')
            return self.no_namespace_id

        return -1

    def process_prefix(self, ob):
        """Set ob.ns_id from ob.name and strip a resolved prefix from the name.

        *ob* is an element or attribute object.  On an unknown prefix,
        self.error is set and ob.ns_id is left at -1.
        """
        # NOTE(review): unprefixed attribute names are resolved against the
        # default namespace here, exactly like element names -- confirm that
        # this is the intended treatment.
        idx = ob.name.find(':')
        if idx == -1:
            ob.ns_id = self.find_prefix('')
        elif ob.name[:3].lower() == 'xml':
            ob.ns_id = DAV_NS_XML       # name is reserved by XML
        else:
            ob.ns_id = self.find_prefix(ob.name[:idx])
            ob.name = ob.name[idx + 1:]
            if ob.ns_id == -1:
                self.error = 'namespace prefix not found'

    def _attr_pairs(self, attrs):
        """Return (name, value) pairs from either attribute representation.

        Expat hands StartElementHandler a dict by default and a flat
        [name, value, name, value, ...] list in ordered-attributes mode;
        both forms are accepted.
        """
        if hasattr(attrs, 'items'):
            return list(attrs.items())
        return [(attrs[i], attrs[i + 1]) for i in range(0, len(attrs), 2)]

    def start(self, name, attrs):
        """Expat StartElementHandler: open a new element under the current one."""
        if self.error:
            return

        elem = blank()
        elem.name = name
        elem.lang = None
        elem.parent = None
        elem.children = []
        elem.ns_scope = {}
        elem.attrs = []
        elem.first_cdata = ''
        elem.following_cdata = ''

        if self.cur_elem is not None:
            elem.parent = self.cur_elem
            elem.parent.children.append(elem)
            self.cur_elem = elem
        else:
            self.cur_elem = self.doc.root = elem

        # scan for namespace declarations
        for attr_name, value in self._attr_pairs(attrs):
            if attr_name == 'xmlns' or attr_name[:6] == 'xmlns:':
                prefix = attr_name[6:]      # '' for a bare "xmlns"
                try:
                    ns_id = self.doc.namespaces.index(value)
                except ValueError:
                    ns_id = len(self.doc.namespaces)
                    self.doc.namespaces.append(value)
                elem.ns_scope[prefix] = ns_id
            elif attr_name == 'xml:lang':
                elem.lang = value
            else:
                attr = blank()
                attr.name = attr_name
                attr.value = value
                elem.attrs.append(attr)

        # inherit xml:lang from parent
        if elem.lang is None and elem.parent is not None:
            elem.lang = elem.parent.lang

        # process prefix of the element name
        self.process_prefix(elem)

        # process attributes' namespace prefixes (explicit loop: map() would
        # be lazy, and therefore a no-op, on Python 3)
        for attr in elem.attrs:
            self.process_prefix(attr)

    def end(self, name):
        """Expat EndElementHandler: close the current element."""
        if self.error:
            return

        closed = self.cur_elem
        parent = closed.parent
        # the scope table and the back-reference are only needed while the
        # element is open; dropping them also removes the parent<->child cycle
        del closed.ns_scope
        del closed.parent
        self.cur_elem = parent

    def cdata(self, data):
        """Expat CharacterDataHandler: attach text to the proper slot."""
        if self.error:
            return

        elem = self.cur_elem
        if elem.children:
            last = elem.children[-1]
            last.following_cdata = last.following_cdata + data
        else:
            elem.first_cdata = elem.first_cdata + data

    def parse(self, s):
        """Parse the complete document *s* and return the document object.

        Raises ValueError if Expat reports failure through its return value
        or if a namespace prefix could not be resolved; an Expat build that
        raises its own error for malformed input propagates that error
        unchanged.  The parser is reset afterwards in every case, so the
        instance can be reused.
        """
        p = pyexpat.ParserCreate()
        p.StartElementHandler = self.start
        p.EndElementHandler = self.end
        p.CharacterDataHandler = self.cdata
        try:
            if p.Parse(s, 1) == 0:
                raise ValueError('expat parsing error')
            if self.error:
                # previously this was dropped and a partial tree was returned
                raise ValueError(self.error)
            return self.doc
        finally:
            self.reset()
def use_expat():
    """Parse the sample document with a fresh Parser and return its tree."""
    return Parser().parse(msr)
def _dump_escape(text, is_attr=0):
    """Escape XML markup characters in *text* (and '"' for attribute values)."""
    text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
    if is_attr:
        text = text.replace('"', '&quot;')
    return text


def dump(f, doc, elem=None, dump_ns=0):
    """Write a parsed document, or one element subtree, to *f* as XML.

    f        object with a write() method
    doc      document object as returned by Parser.parse()
    elem     element to serialise; None (the default) writes the XML
             declaration followed by the whole tree from doc.root
    dump_ns  when true, declare every namespace in doc.namespaces on this
             element as xmlns:ns<index>

    Names are written as ns<id>:<name>, except names whose ns_id is
    DAV_NS_XML, which are written as stored.  Character data and attribute
    values are escaped so the output stays well-formed.  A document without
    a root produces only the XML declaration (this used to recurse forever).
    """
    if elem is None:
        f.write('<?xml version="1.0"?>\n')
        if doc.root is not None:
            dump(f, doc, doc.root, 1)
        return

    if elem.ns_id == DAV_NS_XML:
        tag = elem.name
    else:
        tag = 'ns%d:%s' % (elem.ns_id, elem.name)
    f.write('<' + tag)

    for attr in elem.attrs:
        value = _dump_escape(attr.value, 1)
        if attr.ns_id == DAV_NS_XML:
            f.write(' %s="%s"' % (attr.name, value))
        else:
            f.write(' ns%d:%s="%s"' % (attr.ns_id, attr.name, value))

    if dump_ns:
        for i, uri in enumerate(doc.namespaces):
            f.write(' xmlns:ns%d="%s"' % (i, _dump_escape(uri, 1)))

    if elem.children or elem.first_cdata:
        f.write('>' + _dump_escape(elem.first_cdata))
        for child in elem.children:
            dump(f, doc, child)
            f.write(_dump_escape(child.following_cdata))
        f.write('</%s>' % tag)
    else:
        f.write('/>')
def _time_runs(func, count):
    """Call func() *count* times and print the total and per-call seconds."""
    runs = range(count)         # built before the clock starts
    started = time.time()
    for _ in runs:
        func()
    elapsed = time.time() - started
    if count > 0:
        each = elapsed / count
    else:
        each = 0.0              # nothing ran; avoid dividing by zero
    print("time=%.4f each=%.4f" % (elapsed, each))


def timing(n1=10, n2=200):
    """Time both parsing paths and print one result line for each.

    n1  number of use_parser() calls (SAX driver feeding the DOM builder)
    n2  number of use_expat() calls (the Parser class on top of Expat)

    Each line shows the total elapsed wall-clock seconds and the average per
    call.  A count of zero reports an average of 0.0 instead of raising
    ZeroDivisionError.
    """
    _time_runs(use_parser, n1)
    _time_runs(use_expat, n2)
--------------3BCDD7BF6676916C1CF1ED3C--