[XML-SIG] Expat as xmllib
Paul Prescod
paul@prescod.net
Mon, 24 Jan 2000 09:56:47 -0600
This is a multi-part message in MIME format.
--------------E2834FEC56D5F06E9B5E259A
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
The attached library allows expat to be used as a basis for a parser
with the xmllib interface.
Performance:
Without any xmllib-specific optimization, pyexpat runs almost as fast as
sgmlop:
raw sgmlop: 13222 items; 0.426 seconds; 1281.29 kbytes per second
fast xmllib: 13222 items; 1.445 seconds; 378.03 kbytes per second
slow xmllib: 13222 items; 6.651 seconds; 82.11 kbytes per second
pyexpat: 13210 items; 1.527 seconds; 357.68 kbytes per second
I can think of several optimizations that could speed it up quite a bit.
Also if you compare it to the xmllib in the standard distribution, we
are talking night and day so if we bundle expat we're only improving
things for them.
Conformance
Pyexpat caught more errors than xmllib, was more accepting of legal XML
input (e.g. <?foo?>) and handled entities (especially character
entities) in a manner consistent with the XML specification.
These explain the differences in the number of "items" above.
Backwards Compatibility
The only big compatibility difference between xmllib on pyexpat and
xmllib on sgmlop is that expat expands entity references like &amp; to
"&" instead of reporting a separate event. This is actually a feature of expat
because it is doing entity expansion *for you*. The XML spec requires
this behavior.
The library and a test program are attached.
--
Paul Prescod - ISOGEN Consulting Engineer speaking for himself
Earth will soon support only survivor species -- dandelions, roaches,
lizards, thistles, crows, rats. Not to mention 10 billion humans.
- Planet of the Weeds, Harper's Magazine, October 1998
--------------E2834FEC56D5F06E9B5E259A
Content-Type: text/plain; charset=us-ascii;
name="ExpatOp.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="ExpatOp.py"
from xml.parsers import xmllib
import pyexpat
# Each pair is (callback name looked up on the registered object,
# handler attribute set on the pyexpat parser).
handlerMap = [
    ("finish_starttag", "StartElementHandler"),
    ("finish_endtag", "EndElementHandler"),
    ("handle_data", "CharacterDataHandler"),
    ("handle_proc", "ProcessingInstructionHandler"),
]
class ExpatPretendingToBeSGMLOp:
    """Expose a pyexpat parser through feed/parse/close/register methods.

    The underlying expat parser object is kept in the ``pyexpat`` attribute.
    """

    def __init__(self, encoding=None):
        # Only pass an encoding through when the caller gave a truthy one.
        if not encoding:
            expat = pyexpat.ParserCreate()
        else:
            expat = pyexpat.ParserCreate(encoding)
        self.pyexpat = expat

    def feed(self, data):
        """Pass *data* to expat as a non-final chunk."""
        self.pyexpat.Parse(data, 0)

    def parse(self, data):
        """Pass *data* to expat as the final chunk."""
        self.pyexpat.Parse(data, 1)

    def close(self):
        """Finish the parse by sending an empty final chunk."""
        self.pyexpat.Parse("", 1)

    def register(self, obj):
        """Install the callbacks of *obj* listed in handlerMap on the parser.

        A callback that *obj* does not provide is installed as None.
        """
        expat = self.pyexpat
        for pair in handlerMap:
            callback_name, expat_attr = pair
            setattr(expat, expat_attr, getattr(obj, callback_name, None))
class XMLParser(xmllib.FastXMLParser):
    """xmllib.FastXMLParser whose parsing engine is an expat wrapper."""

    def reset(self):
        """Run the base-class reset, then install an expat-backed engine."""
        xmllib.FastXMLParser.reset(self)
        engine = ExpatPretendingToBeSGMLOp()
        self.parser = engine
        # feed is bound directly to expat's Parse, bypassing the wrapper.
        self.feed = engine.pyexpat.Parse
        engine.register(self)
if __name__ == "__main__":
    # Demo driver: parse one XML file (first command-line argument, or
    # hamlet.xml) and write one line per parser event to out.tmp.
    import sys

    junk = open("out.tmp", "w")
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        filename = "hamlet.xml"

    class myparser(XMLParser):
        """XMLParser that logs every callback to the junk file."""

        def handle_proc(self, target, data):
            junk.write("\n?" + target + data)

        def handle_data(self, data):
            junk.write("\n'" + data)

        def finish_starttag(self, gi, attrs):
            # repr() is the portable spelling of the old backtick syntax.
            junk.write("\n<>" + gi + repr(attrs))

        def finish_endtag(self, gi):
            junk.write("\n</>" + gi)

    try:
        source = open(filename)
        try:
            document = source.read()
        finally:
            # The original left the input file open.
            source.close()
        demo = myparser()
        demo.feed(document)
        # Finish the parse, as the benchmark loops in testxml1.py do after
        # their last feed(); without it expat never sees end of input.
        demo.close()
    finally:
        # Close the log even if parsing fails, so buffered output is written.
        junk.close()
--------------E2834FEC56D5F06E9B5E259A
Content-Type: text/plain; charset=us-ascii;
name="testxml1.py"
Content-Transfer-Encoding: 7bit
Content-Disposition: inline;
filename="testxml1.py"
# basic tests
test_sgmlop = 1
import sys
import time, string
from xml.parsers import sgmlop, xmllib, ExpatOp
# Use the first command-line argument as the input file when there is one;
# VERBOSE is 2 in that case and 1 for the default file.
if len(sys.argv) > 1:
    FILE, VERBOSE = sys.argv[1], 2
else:
    FILE, VERBOSE = "hamlet.xml", 1

print("")
print("test collecting parsers on %s" % FILE)
print("")
# --------------------------------------------------------------------
# sgmlop
class myCollector:
    """Collect parser callbacks into a flat list of events.

    Character data is buffered in ``self.text``.  Just before any other
    event is recorded, the buffered text is joined and stored in
    ``self.data`` as its repr() string.  Every other event is stored as a
    tuple whose first element names the event kind.
    """

    def __init__(self):
        self.data = []  # recorded events, in arrival order
        self.text = []  # character-data fragments not yet flushed

    def _record_event(self, *event):
        """Flush pending text, then append *event* to data as one tuple."""
        if self.text:
            self.data.append(repr("".join(self.text)))
            self.text = []
        # list.append() accepts exactly one argument; the original passed
        # the fields separately, which raises TypeError on Python 2.0+.
        self.data.append(event)

    def finish_starttag(self, tag, data):
        """Record a ("start", tag, data) event."""
        self._record_event("start", tag, data)

    def handle_proc(self, tag, data):
        """Record a ("pi", tag, data) event."""
        self._record_event("pi", tag, data)

    def handle_special(self, data):
        """Record a ("special", data) event."""
        self._record_event("special", data)

    def handle_entityref(self, data):
        """Record an ("entity", data) event."""
        self._record_event("entity", data)

    def handle_data(self, data):
        """Buffer a character-data fragment."""
        self.text.append(data)

    def handle_cdata(self, data):
        """Buffer a CDATA fragment, prefixed with the literal "CDATA"."""
        self.text.append("CDATA" + data)
def doRawSGMLOp():
    """Time sgmlop.XMLParser feeding its events into a myCollector.

    Reads FILE in 512-byte chunks, prints the item count, the elapsed
    time and a throughput figure, and leaves the parser in the
    module-level name ``parser``.

    Returns the time.clock() difference measured around the run.
    """
    global parser
    t = time.clock()
    for i in range(1):
        out = myCollector()
        fp = open(FILE)
        try:
            parser = sgmlop.XMLParser()
            parser.register(out)
            b = 0
            while 1:
                data = fp.read(512)
                if not data:
                    break
                parser.feed(data)
                b = b + len(data)
            parser.close()
        finally:
            # The original never closed the input file.
            fp.close()
    t1 = time.clock() - t
    # NOTE(review): the byte count is divided by 512 yet labelled kbytes;
    # 1024 would be the usual divisor.  Left as-is because all four
    # benchmarks share it -- confirm before changing.
    print("raw sgmlop: %s items; %s seconds; %s kbytes per second" % (
        len(out.data), round(t1, 3), round(b / t1 / 512, 2)))
    return t1
# --------------------------------------------------------------------
# xmllib
base = None


def makeparser(basecls):
    """Return an event-collecting parser class derived from *basecls*.

    The returned class buffers character data in ``self.text`` and, just
    before any other event, flushes it into ``self.data`` as the repr() of
    the joined text.  Every other event is stored as a tuple whose first
    element names the event kind.

    As a side effect the module-level name ``base`` is set to *basecls*.
    """
    global base
    # Kept for backward compatibility: ``base`` still tracks the most
    # recently requested base class.
    base = basecls

    class FastXMLParser(basecls):
        def __init__(self):
            # Initialise through this call's own base class.  The original
            # went through the global ``base``, which a later makeparser()
            # call rebinds to a different class.
            basecls.__init__(self)
            self.data = []
            self.text = []

        def _record_event(self, *event):
            """Flush pending text, then append *event* as one tuple."""
            if self.text:
                self.data.append(repr("".join(self.text)))
                self.text = []
            # list.append() accepts exactly one argument; the original
            # passed the fields separately (TypeError on Python 2.0+).
            self.data.append(event)

        def unknown_starttag(self, tag, data):
            self._record_event("start", tag, data)

        def handle_proc(self, tag, data):
            self._record_event("pi", tag, data)

        def handle_special(self, data):
            self._record_event("special", data)

        def handle_entityref(self, data):
            self._record_event("entity", data)

        def handle_data(self, data):
            self.text.append(data)

        def handle_cdata(self, data):
            self.text.append("CDATA" + data)

    return FastXMLParser
def doFastXMLLib():
    """Time the collector class built on xmllib.FastXMLParser.

    Reads FILE in 512-byte chunks, prints the item count, the elapsed
    time and a throughput figure, and leaves the parser in the
    module-level name ``parser2``.

    Returns the time.clock() difference measured around the run.
    """
    global parser2
    FastXMLParser = makeparser(xmllib.FastXMLParser)
    t = time.clock()
    for i in range(1):
        fp = open(FILE)
        try:
            parser2 = FastXMLParser()
            b = 0
            while 1:
                data = fp.read(512)
                if not data:
                    break
                parser2.feed(data)
                b = b + len(data)
            parser2.close()
        finally:
            # The original never closed the input file.
            fp.close()
    t2 = time.clock() - t
    # NOTE(review): the byte count is divided by 512 yet labelled kbytes;
    # 1024 would be the usual divisor.  Left as-is because all four
    # benchmarks share it -- confirm before changing.
    print("fast xmllib: %s items; %s seconds; %s kbytes per second" % (
        len(parser2.data), round(t2, 3), round(b / t2 / 512, 2)))
    return t2
class SlowXMLParser(xmllib.SlowXMLParser):
    """xmllib.SlowXMLParser subclass that records callbacks as events.

    Character data is buffered in ``self.text``.  Just before any other
    event is recorded, the buffered text is joined and stored in
    ``self.data`` as its repr() string.  Every other event is stored as a
    tuple whose first element names the event kind.
    """

    def __init__(self):
        xmllib.SlowXMLParser.__init__(self)
        self.data = []  # recorded events, in arrival order
        self.text = []  # character-data fragments not yet flushed

    def _record_event(self, *event):
        """Flush pending text, then append *event* to data as one tuple."""
        if self.text:
            self.data.append(repr("".join(self.text)))
            self.text = []
        # list.append() accepts exactly one argument; the original passed
        # the fields separately, which raises TypeError on Python 2.0+.
        self.data.append(event)

    def unknown_starttag(self, tag, data):
        self._record_event("start", tag, data)

    def handle_proc(self, tag, data):
        self._record_event("pi", tag, data)

    def handle_special(self, data):
        self._record_event("special", data)

    def handle_entityref(self, data):
        self._record_event("entity", data)

    def handle_data(self, data):
        self.text.append(data)

    def handle_cdata(self, data):
        self.text.append("CDATA" + data)
def doSlowXMLLib():
    """Time the SlowXMLParser collector.

    Reads FILE in 512-byte chunks, prints the item count, the elapsed
    time and a throughput figure, and leaves the parser in the
    module-level name ``parser3``.

    Returns the time.clock() difference measured around the run.
    """
    global parser3
    t = time.clock()
    for i in range(1):
        fp = open(FILE)
        try:
            parser3 = SlowXMLParser()
            b = 0
            while 1:
                data = fp.read(512)
                if not data:
                    break
                parser3.feed(data)
                b = b + len(data)
            parser3.close()
        finally:
            # The original never closed the input file.
            fp.close()
    t3 = time.clock() - t
    # NOTE(review): the byte count is divided by 512 yet labelled kbytes;
    # 1024 would be the usual divisor.  Left as-is because all four
    # benchmarks share it -- confirm before changing.
    print("slow xmllib: %s items; %s seconds; %s kbytes per second" % (
        len(parser3.data), round(t3, 3), round(b / t3 / 512, 2)))
    return t3
def doPyExpat():
    """Time the collector class built on ExpatOp.XMLParser.

    Reads FILE in 512-byte chunks, prints the item count, the elapsed
    time and a throughput figure, and leaves the parser in the
    module-level name ``parser4``.

    Returns the time.clock() difference measured around the run.
    """
    global parser4
    # PyExpat
    FastXMLParser = makeparser(ExpatOp.XMLParser)
    t = time.clock()
    for i in range(1):
        fp = open(FILE)
        try:
            parser4 = FastXMLParser()
            b = 0
            while 1:
                data = fp.read(512)
                if not data:
                    break
                parser4.feed(data)
                b = b + len(data)
            parser4.close()
        finally:
            # The original never closed the input file.
            fp.close()
    t4 = time.clock() - t
    # NOTE(review): the byte count is divided by 512 yet labelled kbytes;
    # 1024 would be the usual divisor.  Left as-is because all four
    # benchmarks share it -- confirm before changing.
    print("pyexpat: %s items; %s seconds; %s kbytes per second" % (
        len(parser4.data), round(t4, 3), round(b / t4 / 512, 2)))
    return t4
def _show(*parts):
    """Print *parts* separated by single spaces, like a print statement."""
    print(" ".join(map(str, parts)))


# Run every benchmark once; each call also leaves its parser in a
# module-level name (parser, parser2, parser3, parser4).
t1 = doRawSGMLOp()
t2 = doFastXMLLib()
t3 = doSlowXMLLib()
t4 = doPyExpat()

# Timings relative to the slow xmllib run, with the speed-up factor.
print("")
print("normalized timing:")
_show("slow xmllib", 1.0)
_show("fast xmllib", round(t2 / t3, 2), "(%sx)" % round(t3 / t2, 1))
_show("sgmlop ", round(t1 / t3, 2), "(%sx)" % round(t3 / t1, 1))
_show("pyexpat ", round(t4 / t3, 2), "(%sx)" % round(t3 / t4, 1))

print("")
print("looking for differences:")
# Bound the scan by the two lists that are actually indexed below.  The
# original used len(parser4.data) here, which could index past the end of
# parser3.data or cut the comparison short.
# NOTE(review): that parser4 bound may mean a comparison against the
# pyexpat results was intended -- confirm.
items = min(len(parser2.data), len(parser3.data))
for i in range(items):
    if parser2.data[i] != parser3.data[i]:
        # Show the rows around the first mismatch, then stop.
        for j in range(max(i - 5, 0), min(i + 5, items)):
            if parser2.data[j] != parser3.data[j]:
                _show("+", j + 1, parser2.data[j])
                _show("*", j + 1, parser3.data[j])
            else:
                _show("=", j + 1, parser2.data[j])
        break
else:
    print("(no differences)")
--------------E2834FEC56D5F06E9B5E259A--