[XML-SIG] XML parser benchmark

Lars Marius Garshol larsga@ifi.uio.no
08 May 1999 11:59:47 +0200


A quick investigation of the benchmark code shows that roughly 40 % of
its time is spent doing Unicode string counting, using the re module.
Here are some interesting numbers, using the big.xml document:

  Original:                 108 seconds
  Using len to count:       62 seconds
  My optimized version:     92 seconds

If we replace my naive pure-Python character counting with one of the
Unicode C modules we might well have a version of statsexpat.py that
is faster than the XML::Parser application.

In case anyone wants to attack this, here's my version:

import sys,os,time,pyexpat

class Elinfo:
    def __init__(self, name):
        self.name = name
        self.count = 0
        self.minlev = 0
        self.charcount = 0
        self.empty = 1
        self.ptab = {}
        self.ktab = {}
        self.atab = {}

class Docinfo:
    def __init__(self):
        self.root = None
        self.eltab = {}
        self.elstack = []
        self.seenorder = 0

    def strt_handle(self, name, attrs):
        intern(name)
        try:
            inf = self.eltab[name]
        except KeyError:
            inf = Elinfo(name)
            inf.seen = self.seenorder
            self.seenorder = self.seenorder + 1
            self.eltab[name] = inf
        inf.count = inf.count + 1

        if self.elstack!=[]:
            parent = self.elstack[-1]
            try:
                inf.ptab[parent] = inf.ptab[parent] + 1
            except KeyError:
                inf.ptab[parent] = 1
            pinf = self.eltab[parent]
            pinf.empty = 0
            try:
                pinf.ktab[name] = pinf.ktab[name] + 1
            except KeyError:
                pinf.ktab[name] = 1
        else:
            self.root = name

        #Attribute handling
        for i in range(0, len(attrs), 2):
            try:
                inf.atab[attrs[i]] = inf.atab[attrs[i]] + 1
            except KeyError:
                inf.atab[attrs[i]] = 1
        self.elstack.append(name)

    def end_handle(self, name):
        del self.elstack[-1]        

    def char_handle(self, data):
        elname = self.elstack[-1]
        inf = self.eltab[elname]
        inf.empty = 0
        cnt = len(data)
        max = len(data)

        ix=0
        while ix<max:
            if data[ix]<"\300":
                ix=ix+1
            elif data[ix] < "\340":
                cnt = cnt - 1
                ix=ix+2
            elif data[ix] < "\360":
                cnt = cnt - 2
                ix=ix+3
            else:
                cnt = cnt - 3
                ix=ix+4

        inf.charcount = inf.charcount + cnt
    
    def set_minlev(self, name, level):
        inf = self.eltab[name]
        if inf.minlev == 0 or inf.minlev > level:
            newlev = level + 1
            inf.minlev = level
            for kid in inf.ktab.keys():
                self.set_minlev(kid, newlev)

def showtab(label, tab, dosum):
    if not len(tab):
        return
    print '\n  ', label + ':'
    sum = 0

    names = tab.keys()
    names.sort()
    for name in names:
        cnt = tab[name]
        sum = sum + cnt
        print '      %-16s      %5d' % (name, cnt)

    if dosum and len(names) > 1:
        print '                            ====='
        print '                            %5d' % sum

def elcmp(a, b):
    cmpmin = a.minlev - b.minlev
    if cmpmin:
        return cmpmin
    return a.seen - b.seen

doc = Docinfo()

parser = pyexpat.ParserCreate()
parser.StartElementHandler  = doc.strt_handle
parser.EndElementHandler    = doc.end_handle
parser.CharacterDataHandler = doc.char_handle

docstream = open(sys.argv[1])
start=time.clock()

while 1:
    buff = docstream.read(32000)
    if not len(buff):
        break
    status = parser.Parse(buff, 0)
    if status == 0:
        print parser.ErrorCode, ' at line ', parser.ErrorLineNumber,\
              ', column ', parser.ErrorColumnNumber, ', byte ',\
              parser.ErrorByteIndex
        sys.exit(-1)

status = parser.Parse('', 1)
pt=time.clock()-start
if status == 0:
    print parser.ErrorCode, ' at line ', parser.ErrorLineNumber,\
          ', column ', parser.ErrorColumnNumber, ', byte ',\
          parser.ErrorByteIndex
    sys.exit(-1)

print "TIME: "+`pt`
sys.exit()

doc.set_minlev(doc.root, 0)
sortinf = doc.eltab.values()
sortinf.sort(elcmp)
for elinf in sortinf:
    print '\n================'
    print elinf.name + ':', elinf.count
    if elinf.charcount:
        print 'Had', elinf.charcount, 'bytes of character data'
    if elinf.empty:
        print 'Always empty'
    showtab('Parents', elinf.ptab, 0)
    showtab('Children', elinf.ktab, 1)
    showtab('Attributes', elinf.atab, 0)


--Lars M.