[XML-SIG] Parsing speed

Bjorn Pettersen bjorn@roguewave.com
Wed, 15 Nov 2000 15:25:58 -0700


With a simple parse of an XML file (size = 1,579,881 bytes):

from xml.utils.qp_xml import Parser
		
if __name__=="__main__":
	import sys, time
	if len(sys.argv) != 2:
		print "usage %s <xmlfile>" % sys.argv[0]
	else:
		t1 = time.time()
		p = Parser()
		p.parse(open(sys.argv[1]))
		t2 = time.time()
		print t2-t1

I'm seeing very different speeds with Python 1.5.2 and 2.0:

 1.5.2:  79.414 secs
 2.0  : 185.606 secs

i.e., Python 2.0 is about 130% slower. Is this all due to Unicode, or is
there something else as well?

I've also developed my own XML-to-Python data-structure parser that isn't
so sensitive to the size of the input (it's also really fast on small
inputs). With this module I get:

 1.5.2: 1.792 secs
 2.0  : 2.954 secs

for the same file, which makes Python 2.0 about 65% slower.

I've copied the parser part of my module below. Is there anything I'm
missing? (besides namespace support, which isn't useful to me right now
;-)  Also, is it kosher to get around the differences in pyexpat the way
I'm doing? (I didn't want to do it inside the start method since it is
called over 3000 times for the file above).

There is of course a cursor-type module to go along with the generated
data structure to make it easy to work with (would it be appropriate to
add something like this to the PyXML distribution?)

-- bjorn

import pyexpat
from cStringIO import StringIO
import sys

class Parser:
   """Build a nested tuple/list tree from an XML document using pyexpat.

   Every element is represented as the tuple

      (name, attrs, (texts, children))

   where attrs is a dictionary, texts is a list of (position, string)
   pairs and children is a list of (position, element) pairs.  position
   is the item's index among its parent's interleaved text runs and
   child elements, so document order can be reconstructed by merging
   the two lists on position.

   A Parser instance may be reused: every call to parse() starts from a
   fresh tree.
   """

   def __init__(self):
      self.debug = 0
      self._reset()

   def _reset(self):
      """Discard any previous tree and return to the initial state."""
      self.parent = ([],[])   # (texts, children) of the open element
      self.root = self.parent # synthetic container holding the root
      self.stack = []         # saved (i, parent) of enclosing elements
      self.cur = None         # StringIO collecting the current text run
      self.i = 0              # next position inside self.parent
      self.prev = 0           # last event: 0 init, 1 start, 2 end, 3 cdata

   def parse(self, input):
      """Parse input and return the root as an (index, element) pair.

      input is either a string holding the whole document or an object
      with a read(size) method, which is consumed in 16 KB chunks.
      Errors raised by pyexpat for malformed input propagate unchanged.
      """
      # Start from a clean slate so that reusing the instance does not
      # append to (and return) the tree of an earlier document.
      self._reset()

      p = pyexpat.ParserCreate()
      p.StartElementHandler = self.start
      p.EndElementHandler = self.end
      p.CharacterDataHandler = self.cdata

      if type(input) == type(''):
         p.Parse(input, 1)
      else:
         while 1:
            s = input.read(1024 * 16)
            if not s:
               # Tell expat the document is complete.
               p.Parse('', 1)
               break

            p.Parse(s, 0)

      return self.root[1][0]

   def _start_2_0(self, name, attrs):
      """Start-element handler; attrs is already a dictionary."""
      subnodes = ([],[])
      node = (name, attrs, subnodes)

      if self.prev == 3: #cdata
         # Text preceded this element: store it first so that it gets
         # the lower position.
         self.parent[0].append( (self.i,self.cur.getvalue()) )
         self.i = self.i + 1
      self.parent[1].append( (self.i, node) )

      self.i = self.i + 1
      # Remember where to continue in the enclosing element, then
      # descend into the new one.
      self.stack.append((self.i,self.parent))
      self.parent = subnodes
      self.i = 0
      self.cur = node
      self.prev = 1 #start

   def _start_1_5_2(self, name, a):
      """Start-element handler for a flat [key, value, key, value, ...] list."""
      attrs = {}
      for k in range(0, len(a), 2):
         attrs[a[k]] = a[k+1]
      self._start_2_0(name, attrs)

   # Pick the handler once, at class-definition time, so the per-element
   # callback does not have to test the interpreter version.
   if sys.version[0] == '1':
      start = _start_1_5_2
   else:
      start = _start_2_0

   def end(self, name):
      """End-element handler: flush pending text and pop to the parent."""
      if self.prev == 3: #cdata
         self.parent[0].append( (self.i,self.cur.getvalue()) )
      self.i, self.parent = self.stack.pop()
      self.prev = 2 #end

   def cdata(self, data):
      """Character-data handler: accumulate consecutive chunks of text."""
      if self.prev != 3: #cdata
         # First chunk of a new text run.
         self.cur = StringIO()
         self.prev = 3 #cdata
      self.cur.write(data)