[XML-SIG] Parsing speed
Bjorn Pettersen
bjorn@roguewave.com
Wed, 15 Nov 2000 15:25:58 -0700
With a simple parse of an XML file (size = 1,579,881 bytes):
# Benchmark: time how long qp_xml takes to parse the file named on the
# command line, and print the elapsed wall-clock time in seconds.
from xml.utils.qp_xml import Parser

if __name__ == "__main__":
    import sys
    import time

    if len(sys.argv) != 2:
        # Parenthesized single-argument print produces the same output
        # whether print is a statement or a function.
        print("usage %s <xmlfile>" % sys.argv[0])
    else:
        t1 = time.time()
        p = Parser()
        f = open(sys.argv[1])
        try:
            p.parse(f)
            # Stop the clock before closing so the timed region matches
            # the work being measured (construct, open, parse).
            t2 = time.time()
        finally:
            # Always release the file handle, even if parsing fails.
            f.close()
        print(t2 - t1)
I'm seeing very different speeds with Python 1.5.2 and 2.0:
1.5.2: 79.414 secs
2.0 : 185.606 secs
i.e., Python 2.0 is about 130% slower. Is this all due to Unicode, or is
there something else going on as well?
I've also developed my own XML-to-Python data-structure parser that isn't
so sensitive to the size of the input (it's also really fast on small
inputs). With this module I get:
1.5.2: 1.792 secs
2.0 : 2.954 secs
for the same file, which makes Python 2.0 about 65% slower.
I've copied the parser part of my module below. Is there anything I'm
missing (besides namespace support, which isn't useful to me right now
;-)? Also, is it kosher to get around the differences in pyexpat the way
I'm doing? (I didn't want to do it inside the start method since it is
called over 3000 times for the file above.)
There is of course a cursor-type module to go along with the generated
data structure to make it easy to work with. (Would it be appropriate to
add something like this to the PyXML distribution?)
-- bjorn
import pyexpat
from cStringIO import StringIO
import sys
class Parser:
    """Turn an XML document into nested tuples and lists using pyexpat.

    Each element becomes a node ``(name, attrs, (texts, children))``:

    * ``attrs`` is a dictionary mapping attribute names to values.
    * ``texts`` is a list of ``(index, string)`` pairs, one per run of
      character data directly inside the element.
    * ``children`` is a list of ``(index, node)`` pairs, one per child
      element.

    ``index`` is the position of the entry among all of its parent's
    text runs and child elements, so document order can be rebuilt by
    merging the two lists on it.

    The ``prev`` attribute records which kind of event was seen last:
    0 = nothing yet, 1 = start tag, 2 = end tag, 3 = character data.
    """

    def __init__(self):
        self.debug = 0
        self._reset()

    def _reset(self):
        """Drop all state left over from an earlier parse."""
        # Synthetic container that will hold the document element.
        self.parent = ([], [])
        self.root = self.parent
        # Saved (next index, subnodes) pairs of the enclosing elements.
        self.stack = []
        # Most recent node, or the text buffer while prev == 3.
        self.cur = None
        # Index the next text run / child gets within self.parent.
        self.i = 0
        self.prev = 0  # init

    def parse(self, input):
        """Parse ``input`` and return the document element.

        ``input`` is either a string holding the whole document or an
        object with a ``read(size)`` method, which is consumed in 16 KB
        chunks.  The result is the first ``(index, node)`` entry of the
        root container's child list.  Errors raised by pyexpat for
        malformed input propagate to the caller.
        """
        # Start from a clean slate; otherwise a second call on the same
        # instance would append to, and return, the previous document.
        self._reset()

        p = pyexpat.ParserCreate()
        p.StartElementHandler = self.start
        p.EndElementHandler = self.end
        p.CharacterDataHandler = self.cdata

        # Anything readable is streamed; everything else is handed to
        # expat in one piece (this also admits non-8-bit string types).
        if hasattr(input, 'read'):
            while 1:
                s = input.read(1024 * 16)
                if not s:
                    # Tell expat that the document is complete.
                    p.Parse('', 1)
                    break
                p.Parse(s, 0)
        else:
            p.Parse(input, 1)
        return self.root[1][0]

    def _start_2_0(self, name, attrs):
        """Start-tag handler for a pyexpat that passes ``attrs`` as a dict."""
        subnodes = ([], [])
        node = (name, attrs, subnodes)
        if self.prev == 3:  # cdata
            # Text preceded this tag: store it ahead of the new child.
            self.parent[0].append((self.i, self.cur.getvalue()))
            self.i = self.i + 1
        self.parent[1].append((self.i, node))
        self.i = self.i + 1
        # Remember where to continue in the parent once this element ends.
        self.stack.append((self.i, self.parent))
        self.parent = subnodes
        self.i = 0
        self.cur = node
        self.prev = 1  # start

    def _start_1_5_2(self, name, a):
        """Start-tag handler for a pyexpat that passes a flat attribute list.

        ``a`` is ``[name1, value1, name2, value2, ...]``; it is turned
        into a dictionary and then handled exactly like the dict form.
        """
        attrs = {}
        for i in range(0, len(a), 2):
            attrs[a[i]] = a[i + 1]
        self._start_2_0(name, attrs)

    # Choose the handler once, at class-definition time, so the
    # per-element callback does not have to test the attribute format.
    if sys.version[0] == '1':
        start = _start_1_5_2
    else:
        start = _start_2_0

    def end(self, name):
        """End-tag handler: store pending text and return to the parent."""
        if self.prev == 3:  # cdata
            self.parent[0].append((self.i, self.cur.getvalue()))
        self.i, self.parent = self.stack.pop()
        self.prev = 2  # end

    def cdata(self, data):
        """Character-data handler: collect consecutive chunks in one buffer."""
        if self.prev != 3:  # cdata
            # First chunk of a new text run.
            self.cur = StringIO()
            self.prev = 3  # cdata
        self.cur.write(data)