[XML-SIG] SAX 2.0 names

Lars Marius Garshol larsga@garshol.priv.no
29 Feb 2000 17:21:06 +0100


* Lars Marius Garshol
|
| - as objects (with __cmp__, __hash__, get_uri, get_local_name and
|   get_rawname methods)
| 
|   - requires a bit of machinery in drivers to be effective
|   - all operations will be slow
|   - a natural way to model this

* Fred L. Drake, Jr.
| 
| If the objects are implemented as a C/Java extension type, it should
| be plenty fast.  A 100% Pure Python implementation can be a fallback
| if the extension isn't available.

Hmmm.  That might be the way to go.  I still wonder about the speed,
though. 
 
| And the convenient tuple unpacking could also be provided using the
| object approach; the objects can easily implement the sequence
| protocol.

Good idea. This makes objects even more attractive.

| I'd be willing to write a C implementation of the object version if
| that's the API we decide on, but I'd also be fine with the third
| option.

Hmmm.  Let's chew on this a little more and hear some more opinions
before deciding. 

I did the benchmark I spoke of, and the results indicate that the
performance differences between strings and tuples are very small.
Also, how you put the strings together (string.join, % formatting or
plain + concatenation) influences the speed a bit. The benchmark was
run with Python 1.5.2 on Debian GNU/Linux on a Pentium II with plenty
of RAM and MHz.




[larsga@pc-larsga python]$ python sax2bench.py 
Pure parsing time: 28.73

---Generic:
__main__.NamespaceFilterString  30.25
__main__.NamespaceFilterInternedString  30.85
 __main__.NamespaceFilterTuple  30.15

---Specific:
__main__.NamespaceFilterString  30.71
__main__.NamespaceFilterInternedString  30.7
 __main__.NamespaceFilterTuple  29.67




# A simple benchmark of various ways to represent namespace-names and
# how this affects performance.
    
# ==================== NAMESPACEFILTER

# This is xmlproc's normal namespace filter, but modified to use
# different name representations

import string
from xml.parsers.xmlproc import xmlapp

# --- Name objects

class SAXName:
    """A namespace-qualified name: a (URI, local name) pair plus the raw name.

    Two names are equal when both their URI and their local name are
    equal.  The raw name takes no part in comparison or hashing.
    """

    def __init__(self, uri, localname, rawname):
        self.__uri = uri
        self.__localname = localname
        self.__rawname = rawname
        # Cached at construction.  Hashing the pair (rather than adding
        # the two hashes) cannot overflow on interpreters that lack
        # automatic int-to-long promotion.
        self.__hash = hash((uri, localname))

    def get_uri(self):
        "Returns the namespace URI of the name."
        return self.__uri

    def get_localname(self):
        "Returns the local part of the name."
        return self.__localname

    def get_rawname(self):
        "Returns the raw name as passed to the constructor."
        return self.__rawname

    def __eq__(self, other):
        "Returns true if other is a SAXName with the same URI and local name."
        # Type check first, so unhashable objects are never hashed.
        if not isinstance(other, SAXName):
            return 0
        # Cheap rejection before comparing the strings themselves.
        if self.__hash != hash(other):
            return 0
        return self.__uri == other.get_uri() and \
               self.__localname == other.get_localname()

    def __ne__(self, other):
        "Returns true unless other is an equal SAXName."
        return not self.__eq__(other)

    def __cmp__(self, other): # NB! Does not sort properly
        # The cmp protocol uses 0 for "equal" and non-zero for "different".
        # Only equality is meaningful here; no ordering is defined.
        if self.__eq__(other):
            return 0
        else:
            return 1

    def __hash__(self):
        return self.__hash

# --- ParserFilter

class ParserFilter(xmlapp.Application):
    """Pass-through filter: every event it receives is handed on, unchanged,
    to the application currently installed with set_application."""

    def __init__(self):
        xmlapp.Application.__init__(self)
        # Until set_application is called, events go to a plain
        # xmlapp.Application instance.
        self.app = xmlapp.Application()

    def set_application(self, app):
        "Installs the application that events are forwarded to."
        self.app = app

    # --- Event methods from xmlapp.Application, each one forwarded

    def set_locator(self, locator):
        "Hands the locator to the base class, then to the application."
        xmlapp.Application.set_locator(self, locator)
        self.app.set_locator(locator)

    def doc_start(self):
        "Forwards the doc_start event."
        self.app.doc_start()

    def doc_end(self):
        "Forwards the doc_end event."
        self.app.doc_end()

    def handle_comment(self, data):
        "Forwards a comment event."
        self.app.handle_comment(data)

    def handle_start_tag(self, name, attrs):
        "Forwards a start-tag event."
        self.app.handle_start_tag(name, attrs)

    def handle_end_tag(self, name):
        "Forwards an end-tag event."
        self.app.handle_end_tag(name)

    def handle_data(self, data, start, end):
        "Forwards a data event."
        self.app.handle_data(data, start, end)

    def handle_ignorable_data(self, data, start, end):
        "Forwards an ignorable-data event."
        self.app.handle_ignorable_data(data, start, end)

    def handle_pi(self, target, data):
        "Forwards a processing-instruction event."
        self.app.handle_pi(target, data)

    def handle_doctype(self, root, pubID, sysID):
        "Forwards a doctype event."
        self.app.handle_doctype(root, pubID, sysID)

    def set_entity_info(self, xmlver, enc, sddecl):
        "Forwards the entity information."
        self.app.set_entity_info(xmlver, enc, sddecl)

# --- NamespaceFilter
        
class NamespaceFilterGeneric(ParserFilter):
    """An xmlproc application that processes qualified names and reports them
    as 'URI local-part' names. It reports errors through the error reporting
    mechanisms of the parser.

    This class only tracks the namespace declarations. The representation
    of a processed name is decided by the _process_name(name) method, which
    is not defined here and must be supplied by a subclass."""

    def __init__(self,parser):
        ParserFilter.__init__(self)
        self.ns_map={}       # Current prefix -> URI map
        self.ns_stack=[]     # Pushed for each element, used to maintain ns_map
        self.rep_ns_attrs=0  # Report xmlns-attributes?
        self.parser=parser   # Used only for report_error

    def set_report_ns_attributes(self,action):
        "Tells the filter whether to report or delete xmlns-attributes."
        self.rep_ns_attrs=action

    # --- Overridden event methods

    def handle_start_tag(self,name,attrs):
        """Records the namespace declarations on this element, rewrites the
        element and attribute names with _process_name and forwards the event.

        Note that attrs is modified in place."""
        old_ns={} # Reset ns_map to these values when we leave this element
        del_ns=[] # Delete these prefixes from ns_map when we leave element

        # attrs=attrs.copy()   Will have to do this if more filters are made

        # Find declarations, update self.ns_map and self.ns_stack.
        # Entries are deleted from attrs inside the loop; this relies on
        # items() returning a list copy (Python 1.x/2.x dictionaries).
        for (a,v) in attrs.items():
            if a[:6]=="xmlns:":
                prefix=a[6:]
                if string.find(prefix,":")!=-1:
                    self.parser.report_error(1900)

                if v=="":
                    self.parser.report_error(1901)
            elif a=="xmlns":
                prefix=""
            else:
                continue

            # Remember how to undo this declaration at the end tag
            if self.ns_map.has_key(prefix):
                old_ns[prefix]=self.ns_map[prefix]
            else:
                del_ns.append(prefix)

            if prefix=="" and v=="":
                # xmlns="" removes the default namespace. There may be no
                # default in scope, in which case there is nothing to remove.
                try:
                    del self.ns_map[prefix]
                except KeyError:
                    pass
            else:
                self.ns_map[prefix]=v

            if not self.rep_ns_attrs:
                del attrs[a]

        self.ns_stack.append((old_ns,del_ns))

        # Process elem and attr names
        name=self._process_name(name)
        for (a,v) in attrs.items():
            del attrs[a]
            attrs[self._process_name(a)]=v

        # Report event
        self.app.handle_start_tag(name,attrs)

    def handle_end_tag(self,name):
        """Rewrites the element name, restores the namespace map to its state
        before the matching start tag and forwards the event."""
        # The name is processed before the map is restored, so that the
        # declarations made on this element still apply to it.
        name=self._process_name(name)

        # Clean up self.ns_map and self.ns_stack
        (old_ns,del_ns)=self.ns_stack[-1]
        del self.ns_stack[-1]

        self.ns_map.update(old_ns)
        for prefix in del_ns:
            # The entry is absent when the declaration was xmlns="" and no
            # default namespace was in scope at the start tag.
            try:
                del self.ns_map[prefix]
            except KeyError:
                pass

        self.app.handle_end_tag(name)

class NamespaceFilterString(NamespaceFilterGeneric):
    "Namespace filter that represents processed names as 'URI local' strings."

    def _process_name(self, name):
        """Returns name rewritten as 'URI local-part', or name itself when it
        is malformed, is an xmlns declaration, uses an unknown prefix or has
        no namespace."""
        pieces = string.split(name, ":")
        count = len(pieces)

        if count > 2:
            # More than one colon in the name
            self.parser.report_error(1900)
            return name

        if count == 2:
            prefix = pieces[0]
            if prefix == "xmlns":
                return name

            try:
                uri = self.ns_map[prefix]
            except KeyError:
                # Prefix has no declaration in scope
                self.parser.report_error(1902)
                return name

            # Plain concatenation; the author's notes mark string.join and
            # "%s %s" formatting as the slower ways of building this.
            return uri + " " + pieces[1]

        # No prefix: use the default namespace, if there is one
        if name != "xmlns" and self.ns_map.has_key(""):
            return "%s %s" % (self.ns_map[""], name)

        return name

class NamespaceFilterInternedString(NamespaceFilterGeneric):
    """Namespace filter that represents processed names as interned
    'URI local' strings."""

    def _process_name(self, name):
        """Returns name rewritten as an interned 'URI local-part' string, or
        name itself (not interned) when it is malformed, is an xmlns
        declaration, uses an unknown prefix or has no namespace."""
        pieces = string.split(name, ":")
        count = len(pieces)

        if count > 2:
            # More than one colon in the name
            self.parser.report_error(1900)
            return name

        if count == 2:
            prefix = pieces[0]
            if prefix == "xmlns":
                return name

            try:
                uri = self.ns_map[prefix]
            except KeyError:
                # Prefix has no declaration in scope
                self.parser.report_error(1902)
                return name

            # Plain concatenation; the author's notes mark string.join and
            # "%s %s" formatting as the slower ways of building this.
            return intern(uri + " " + pieces[1])

        # No prefix: use the default namespace, if there is one
        if name != "xmlns" and self.ns_map.has_key(""):
            return intern("%s %s" % (self.ns_map[""], name))

        return name

class NamespaceFilterTuple(NamespaceFilterGeneric):
    "Namespace filter that represents processed names as (URI, local) tuples."

    def _process_name(self, name):
        """Returns name as a (URI, local-part) tuple, with None as the URI
        when the prefix is unknown or the name has no namespace.

        Names with more than one colon and xmlns:-prefixed names are
        returned as the unchanged string instead of a tuple."""
        pieces = string.split(name, ":")
        count = len(pieces)

        if count > 2:
            # More than one colon in the name
            self.parser.report_error(1900)
            return name

        if count == 2:
            prefix = pieces[0]
            if prefix == "xmlns":
                return name

            try:
                uri = self.ns_map[prefix]
            except KeyError:
                # Prefix has no declaration in scope; the whole raw name
                # is kept as the local part.
                self.parser.report_error(1902)
                return (None, name)

            return (uri, pieces[1])

        # No prefix: use the default namespace, if there is one
        if name != "xmlns" and self.ns_map.has_key(""):
            return (self.ns_map[""], name)

        return (None, name)

class NamespaceFilterObject(NamespaceFilterGeneric):
    """Placeholder for a namespace filter based on name objects.

    Not implemented yet: _process_name still builds tuples, and the
    __objs dictionary created in the constructor is not used."""

    def __init__(self, parser):
        NamespaceFilterGeneric.__init__(self, parser)
        self.__objs = {}

    def _process_name(self, name): # FIXME: implement!
        """Returns name as a (URI, local-part) tuple where a namespace
        applies, (None, name) for an unknown prefix, and the unchanged
        string in every other case."""
        pieces = string.split(name, ":")
        count = len(pieces)

        if count > 2:
            # More than one colon in the name
            self.parser.report_error(1900)
            return name

        if count == 2:
            prefix = pieces[0]
            if prefix == "xmlns":
                return name

            try:
                uri = self.ns_map[prefix]
            except KeyError:
                # Prefix has no declaration in scope
                self.parser.report_error(1902)
                return (None, name)

            return (uri, pieces[1])

        # No prefix: use the default namespace, if there is one
        if name != "xmlns" and self.ns_map.has_key(""):
            return (self.ns_map[""], name)

        return name
        
# ==================== GENERIC BENCHMARK

class GenericStats(xmlapp.Application):
    """Benchmark application that uses the reported names as dictionary keys:
    it counts start tags per element name and occurrences per attribute
    name."""

    def __init__(self):
        self.__elemtypes = {}  # element name -> number of start tags seen
        self.__attrtypes = {}  # attribute name -> number of occurrences seen

    def handle_start_tag(self, name, attrs):
        "Counts the element name and every attribute name on the tag."
        try:
            self.__elemtypes[name] = self.__elemtypes[name] + 1
        except KeyError:
            self.__elemtypes[name] = 1

        for (attr, value) in attrs.items():
            try:
                # Increment, exactly as for the element counter above
                self.__attrtypes[attr] = self.__attrtypes[attr] + 1
            except KeyError:
                self.__attrtypes[attr] = 1

# ==================== SPECIFIC BENCHMARK

# Namespace URIs looked for by the specific benchmark, with their lengths
# (the string-based application compares name[:length] against the URI).
apt_uri = "http://www.megginson.com/exp/ns/airports#"
rdf_uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

apt_len = len(apt_uri)
rdf_len = len(rdf_uri)

# The two element names of interest in 'URI local' string form
apt_airport = intern("http://www.megginson.com/exp/ns/airports# Airport")
apt_latitude = intern("http://www.megginson.com/exp/ns/airports# latitude")

# The same two element names in (URI, local) tuple form
apt_airport2 = ("http://www.megginson.com/exp/ns/airports#", intern("Airport"))
apt_latitude2 = ("http://www.megginson.com/exp/ns/airports#", intern("latitude"))

class SpecificStatsString(xmlapp.Application):
    """Benchmark application for string names: counts two specific element
    names and the elements whose name starts with one of two namespace
    URIs."""

    def __init__(self):
        # Elements in the apt / rdf namespaces
        self.__apt_elems = 0
        self.__rdf_elems = 0
        # Exact matches for apt_airport / apt_latitude
        self.__airports = 0
        self.__with_coords = 0

    def handle_start_tag(self, name, attrs):
        "Updates the counters for one start tag; attrs is not examined."
        # Exact comparison against the interned full names
        if name == apt_airport:
            seen = self.__airports
            self.__airports = seen + 1

        elif name == apt_latitude:
            seen = self.__with_coords
            self.__with_coords = seen + 1

        # Namespace test: the name must start with the URI
        if name[:apt_len] == apt_uri:
            seen = self.__apt_elems
            self.__apt_elems = seen + 1

        elif name[:rdf_len] == rdf_uri:
            seen = self.__rdf_elems
            self.__rdf_elems = seen + 1

class SpecificStatsTuple(xmlapp.Application):
    """Benchmark application for tuple names: counts two specific element
    names and the elements whose first name component is one of two
    namespace URIs."""

    def __init__(self):
        # Elements in the apt / rdf namespaces
        self.__apt_elems = 0
        self.__rdf_elems = 0
        # Exact matches for apt_airport2 / apt_latitude2
        self.__airports = 0
        self.__with_coords = 0

    def handle_start_tag(self, name, attrs):
        "Updates the counters for one start tag; attrs is not examined."
        # Exact comparison against the full (URI, local) tuples
        if name == apt_airport2:
            seen = self.__airports
            self.__airports = seen + 1

        elif name == apt_latitude2:
            seen = self.__with_coords
            self.__with_coords = seen + 1

        # Namespace test: compare the first component with the URI
        if name[0] == apt_uri:
            seen = self.__apt_elems
            self.__apt_elems = seen + 1

        elif name[0] == rdf_uri:
            seen = self.__rdf_elems
            self.__rdf_elems = seen + 1
            
# ==================== MAIN PROGRAM

from xml.parsers.xmlproc import xmlproc
import time

p = xmlproc.XMLProcessor()
start = time.clock()
p.set_application(NamespaceFilterTuple(p))
p.parse_resource("airports.rdf")
used = time.clock() - start

print "Pure parsing time:", used

print
print "---Generic:"
for filter in [NamespaceFilterString, NamespaceFilterInternedString,
               NamespaceFilterTuple]:
    p = xmlproc.XMLProcessor()
    nsfilter = filter(p)
    nsfilter.set_application(GenericStats())
    p.set_application(nsfilter)

    start = time.clock()
    p.parse_resource("airports.rdf")
    used = time.clock() - start

    print "%30s\t%s" % (filter, used)

print
print "---Specific:"
for (Filter, App) in [(NamespaceFilterString, SpecificStatsString),
                      (NamespaceFilterInternedString, SpecificStatsString),
                      (NamespaceFilterTuple, SpecificStatsTuple)]:
    p = xmlproc.XMLProcessor()
    nsfilter = Filter(p)
    nsfilter.set_application(App())
    p.set_application(nsfilter)

    start = time.clock()
    p.parse_resource("airports.rdf")
    used = time.clock() - start

    print "%30s\t%s" % (Filter, used)




#--Lars M.