[XML-SIG] SAX 2.0 names
Lars Marius Garshol
larsga@garshol.priv.no
29 Feb 2000 17:21:06 +0100
* Lars Marius Garshol
|
| - as objects (with __cmp__, __hash__, get_uri, get_local_name and
| get_rawname methods)
|
| - requires a bit of machinery in drivers to be effective
| - all operations will be slow
| - a natural way to model this
* Fred L. Drake, Jr.
|
| If the objects are implemented as a C/Java extension type, it should
| be plenty fast. A 100% Pure Python implementation can be a fallback
| if the extension isn't available.
Hmmm. That might be the way to go. I still wonder about the speed,
though.
| And the convenient tuple unpacking could also be provided using the
| object approach; the objects can easily implement the sequence
| protocol.
Good idea. This makes objects even more attractive.
| I'd be willing to write a C implementation of the object version if
| that's the API we decide on, but I'd also be fine with the third
| option.
Hmmm. Let's chew on this a little more and hear some more opinions
before deciding.
I did the benchmark I spoke of, and the results indicate that the
performance differences are very small between strings and tuples.
Also, how you put together the strings influences the speed a
bit. Benchmark run with Python 1.5.2 on Debian GNU/Linux on a Pentium
II with plenty of RAM and MHz.
[larsga@pc-larsga python]$ python sax2bench.py
Pure parsing time: 28.73
---Generic:
__main__.NamespaceFilterString 30.25
__main__.NamespaceFilterInternedString 30.85
__main__.NamespaceFilterTuple 30.15
---Specific:
__main__.NamespaceFilterString 30.71
__main__.NamespaceFilterInternedString 30.7
__main__.NamespaceFilterTuple 29.67
# A simple benchmark of various ways to represent namespace-names and
# how this affects performance.
# ==================== NAMESPACEFILTER
# This is xmlproc's normal namespace filter, but modified to use
# different name representations
import string
from xml.parsers.xmlproc import xmlapp
# --- Name objects
class SAXName:
    """A namespace-qualified XML name.

    A name carries a namespace URI, a local name and the raw (prefixed)
    name it was written as.  Two names are equal when their URI and local
    name match; the raw name does not take part in equality or hashing, so
    the same name written with different prefixes compares equal.
    """

    def __init__(self, uri, localname, rawname):
        self.__uri = uri
        self.__localname = localname
        self.__rawname = rawname
        # Hash the pair as a tuple.  Adding the two hash values could
        # overflow a plain int on old interpreters, and made (a, b) collide
        # with (b, a).
        self.__hash = hash((uri, localname))

    def get_uri(self):
        "Returns the namespace URI passed to the constructor."
        return self.__uri

    def get_localname(self):
        "Returns the local part of the name."
        return self.__localname

    def get_rawname(self):
        "Returns the raw name passed to the constructor."
        return self.__rawname

    def __eq__(self, other):
        "True when other is a SAXName with the same URI and local name."
        # The cached hash is compared first as a cheap rejection test.
        return isinstance(other, SAXName) and \
               self.__hash == other.__hash and \
               self.__uri == other.get_uri() and \
               self.__localname == other.get_localname()

    def __ne__(self, other):
        "Inverse of __eq__."
        return not self.__eq__(other)

    def __cmp__(self, other):
        """Three-way comparison: 0 when equal, non-zero otherwise.

        SAXName objects order by (uri, localname).  Compared with any other
        kind of object the result is never 0; the sign is arbitrary.
        """
        if self.__eq__(other):
            return 0
        if isinstance(other, SAXName):
            mine = (self.__uri, self.__localname)
            theirs = (other.get_uri(), other.get_localname())
            return (mine > theirs) - (mine < theirs)
        if id(self) < id(other):
            return -1
        return 1

    def __hash__(self):
        return self.__hash
# --- ParserFilter
class ParserFilter(xmlapp.Application):
    """Pass-through application that relays every event to another one.

    Subclasses override individual event methods to change events on their
    way from the parser to the downstream application held in self.app.
    """

    def __init__(self):
        xmlapp.Application.__init__(self)
        # Events go to a plain xmlapp.Application instance until
        # set_application() installs a real receiver.
        self.app = xmlapp.Application()

    def set_application(self, app):
        "Installs the application that events are forwarded to."
        self.app = app

    # --- Event methods from xmlapp.Application, forwarded unchanged

    def set_locator(self, locator):
        # The locator is given both to this filter (through the base class)
        # and to the downstream application.
        xmlapp.Application.set_locator(self, locator)
        self.app.set_locator(locator)

    def doc_start(self):
        self.app.doc_start()

    def doc_end(self):
        self.app.doc_end()

    def handle_comment(self, data):
        self.app.handle_comment(data)

    def handle_start_tag(self, name, attrs):
        self.app.handle_start_tag(name, attrs)

    def handle_end_tag(self, name):
        self.app.handle_end_tag(name)

    def handle_data(self, data, start, end):
        self.app.handle_data(data, start, end)

    def handle_ignorable_data(self, data, start, end):
        self.app.handle_ignorable_data(data, start, end)

    def handle_pi(self, target, data):
        self.app.handle_pi(target, data)

    def handle_doctype(self, root, pubID, sysID):
        self.app.handle_doctype(root, pubID, sysID)

    def set_entity_info(self, xmlver, enc, sddecl):
        self.app.set_entity_info(xmlver, enc, sddecl)
# --- NamespaceFilter
class NamespaceFilterGeneric(ParserFilter):
    """An xmlproc application that resolves namespace prefixes in element
    and attribute names before passing events on.

    The representation of a resolved name is decided by the subclass, which
    must supply _process_name(name).  Errors are reported through the
    parser's report_error method.
    """

    def __init__(self, parser):
        ParserFilter.__init__(self)
        self.ns_map = {}        # Current prefix -> URI map
        self.ns_stack = []      # One (old_ns, del_ns) entry per open element
        self.rep_ns_attrs = 0   # Report xmlns-attributes?
        self.parser = parser

    def set_report_ns_attributes(self, action):
        "Tells the filter whether to report or delete xmlns-attributes."
        self.rep_ns_attrs = action

    # --- Overridden event methods

    def handle_start_tag(self, name, attrs):
        """Applies the element's namespace declarations, rewrites the
        element and attribute names and forwards the event.

        attrs is modified in place.
        """
        old_ns = {}  # Reset ns_map to these values when we leave this element
        del_ns = []  # Delete these prefixes from ns_map when we leave element

        # attrs=attrs.copy() Will have to do this if more filters are made

        # Find declarations, update self.ns_map and self.ns_stack
        for (a, v) in attrs.items():
            if a[:6] == "xmlns:":
                prefix = a[6:]
                if string.find(prefix, ":") != -1:
                    self.parser.report_error(1900)
                if v == "":
                    self.parser.report_error(1901)
            elif a == "xmlns":
                prefix = ""
            else:
                continue

            declared = self.ns_map.has_key(prefix)
            if prefix == "" and v == "":
                # xmlns="" removes the default namespace.  When none is in
                # scope there is nothing to remove now and nothing to undo
                # at the end tag, so nothing is recorded; deleting blindly
                # would raise KeyError here and again at the end tag.
                if declared:
                    old_ns[prefix] = self.ns_map[prefix]
                    del self.ns_map[prefix]
            else:
                if declared:
                    old_ns[prefix] = self.ns_map[prefix]
                else:
                    del_ns.append(prefix)
                self.ns_map[prefix] = v

            if not self.rep_ns_attrs:
                del attrs[a]

        self.ns_stack.append((old_ns, del_ns))

        # Process elem and attr names
        # NOTE(review): attribute names take the same path as element names,
        # so whatever _process_name does for unprefixed names applies to
        # attributes as well -- confirm that this is intended.
        name = self._process_name(name)
        for (a, v) in attrs.items():
            del attrs[a]
            attrs[self._process_name(a)] = v

        # Report event
        self.app.handle_start_tag(name, attrs)

    def handle_end_tag(self, name):
        """Forwards the end tag and restores the namespace map to its state
        before the matching start tag."""
        # Resolve the name while the element's own declarations still apply.
        name = self._process_name(name)

        # Clean up self.ns_map and self.ns_stack
        (old_ns, del_ns) = self.ns_stack[-1]
        del self.ns_stack[-1]

        self.ns_map.update(old_ns)
        for prefix in del_ns:
            del self.ns_map[prefix]

        self.app.handle_end_tag(name)
class NamespaceFilterString(NamespaceFilterGeneric):
    "Namespace filter that reports resolved names as 'URI local-part' strings."

    def _process_name(self, name):
        """Returns name with its prefix replaced by the mapped URI.

        Names that cannot be resolved, and xmlns declarations, are returned
        unchanged.
        """
        parts = string.split(name, ":")
        count = len(parts)

        # More than one colon: report it and leave the name alone.
        if count > 2:
            self.parser.report_error(1900)
            return name

        if count == 2:
            prefix = parts[0]
            if prefix == "xmlns":
                return name
            try:
                # Plain concatenation is used here; the original author
                # noted string.join as slowest and "%s %s" formatting as
                # slower.
                return self.ns_map[prefix] + " " + parts[1]
            except KeyError:
                # Prefix has no declaration in scope.
                self.parser.report_error(1902)
                return name

        # Unprefixed name: apply the default namespace when there is one.
        if name != "xmlns" and self.ns_map.has_key(""):
            return "%s %s" % (self.ns_map[""], name)
        return name
class NamespaceFilterInternedString(NamespaceFilterGeneric):
    """Namespace filter that reports resolved names as interned
    'URI local-part' strings."""

    def _process_name(self, name):
        """Returns the interned 'URI local-part' form of name.

        Names that cannot be resolved, and xmlns declarations, are returned
        unchanged and are not passed through intern().
        """
        parts = string.split(name, ":")
        count = len(parts)

        # More than one colon: report it and leave the name alone.
        if count > 2:
            self.parser.report_error(1900)
            return name

        if count == 2:
            prefix = parts[0]
            if prefix == "xmlns":
                return name
            try:
                # Plain concatenation is used here; the original author
                # noted string.join as slowest and "%s %s" formatting as
                # slower.
                return intern(self.ns_map[prefix] + " " + parts[1])
            except KeyError:
                # Prefix has no declaration in scope.
                self.parser.report_error(1902)
                return name

        # Unprefixed name: apply the default namespace when there is one.
        if name != "xmlns" and self.ns_map.has_key(""):
            return intern("%s %s" % (self.ns_map[""], name))
        return name
class NamespaceFilterTuple(NamespaceFilterGeneric):
    """Namespace filter that reports names as (URI, local-part) tuples.

    Every name is reported as a 2-tuple.  Names that have no namespace or
    that cannot be resolved are reported as (None, name) with the name as
    written.
    """

    def _process_name(self, name):
        "Returns the (URI, local-part) tuple for name."
        n = string.split(name, ":")
        if len(n) > 2:
            # More than one colon: report it, keep the name as written.
            # Returned as a tuple so receivers always get the same type.
            self.parser.report_error(1900)
            return (None, name)
        elif len(n) == 2:
            if n[0] == "xmlns":
                # Namespace declaration attribute: no URI, name as written.
                return (None, name)
            try:
                return (self.ns_map[n[0]], n[1])
            except KeyError:
                # Prefix has no declaration in scope.
                self.parser.report_error(1902)
                return (None, name)
        elif self.ns_map.has_key("") and name != "xmlns":
            return (self.ns_map[""], name)
        else:
            return (None, name)
class NamespaceFilterObject(NamespaceFilterGeneric):
    """Namespace filter that reports names as SAXName objects.

    Objects are cached per filter, so a name that occurs repeatedly is
    represented by one shared SAXName instance.  Names that have no
    namespace or that cannot be resolved get a URI of None and the name as
    written for both local and raw name.
    """

    def __init__(self, parser):
        NamespaceFilterGeneric.__init__(self, parser)
        # (uri, localname, rawname) -> SAXName.  Grows with the number of
        # distinct names seen by this filter.
        self.__objs = {}

    def _get_name(self, uri, localname, rawname):
        "Returns the cached SAXName for the triple, creating it if needed."
        key = (uri, localname, rawname)
        try:
            return self.__objs[key]
        except KeyError:
            obj = SAXName(uri, localname, rawname)
            self.__objs[key] = obj
            return obj

    def _process_name(self, name):
        "Returns the SAXName object for the raw name."
        n = string.split(name, ":")
        if len(n) > 2:
            # More than one colon: report it, keep the name as written.
            self.parser.report_error(1900)
            return self._get_name(None, name, name)
        elif len(n) == 2:
            if n[0] == "xmlns":
                # Namespace declaration attribute: no URI.
                return self._get_name(None, name, name)
            try:
                uri = self.ns_map[n[0]]
            except KeyError:
                # Prefix has no declaration in scope.
                self.parser.report_error(1902)
                return self._get_name(None, name, name)
            return self._get_name(uri, n[1], name)
        elif self.ns_map.has_key("") and name != "xmlns":
            return self._get_name(self.ns_map[""], name, name)
        else:
            return self._get_name(None, name, name)
# ==================== GENERIC BENCHMARK
class GenericStats(xmlapp.Application):
    """Benchmark application that counts how often each element name and
    each attribute name occurs.

    Names are used as dictionary keys, so this exercises hashing and
    comparison of whatever name representation the filter produces.
    """

    def __init__(self):
        xmlapp.Application.__init__(self)
        self.__elemtypes = {}  # element name -> number of start tags
        self.__attrtypes = {}  # attribute name -> number of occurrences

    def handle_start_tag(self, name, attrs):
        "Counts the element name and every attribute name of the tag."
        try:
            self.__elemtypes[name] = self.__elemtypes[name] + 1
        except KeyError:
            self.__elemtypes[name] = 1

        for attr in attrs.keys():
            try:
                # Previously assigned the value to itself, so attribute
                # counts never got past 1.
                self.__attrtypes[attr] = self.__attrtypes[attr] + 1
            except KeyError:
                self.__attrtypes[attr] = 1
# ==================== SPECIFIC BENCHMARK
# Namespace URIs the specific benchmark looks for, with their lengths
# precomputed for the prefix comparisons in SpecificStatsString.
apt_uri = "http://www.megginson.com/exp/ns/airports#"
rdf_uri = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
apt_len = len(apt_uri)
rdf_len = len(rdf_uri)

# The two names of interest in 'URI local-part' string form.
apt_airport = intern("http://www.megginson.com/exp/ns/airports# Airport")
apt_latitude = intern("http://www.megginson.com/exp/ns/airports# latitude")

# The same two names in (URI, local-part) tuple form.
apt_airport2 = (apt_uri, intern("Airport"))
apt_latitude2 = (apt_uri, intern("latitude"))
class SpecificStatsString(xmlapp.Application):
    """Benchmark application for string names: looks for two specific
    element names and counts elements per namespace URI."""

    def __init__(self):
        self.__airports = 0      # elements equal to apt_airport
        self.__with_coords = 0   # elements equal to apt_latitude
        self.__apt_elems = 0     # elements whose name starts with apt_uri
        self.__rdf_elems = 0     # elements whose name starts with rdf_uri

    def handle_start_tag(self, name, attrs):
        "Updates the four counters for one start tag; attrs is not used."
        # Namespace membership is tested by comparing the leading part of
        # the name string with each URI.
        if name[:apt_len] == apt_uri:
            self.__apt_elems = self.__apt_elems + 1
        elif name[:rdf_len] == rdf_uri:
            self.__rdf_elems = self.__rdf_elems + 1

        # Exact match against the two names of interest.
        if name == apt_airport:
            self.__airports = self.__airports + 1
        elif name == apt_latitude:
            self.__with_coords = self.__with_coords + 1
class SpecificStatsTuple(xmlapp.Application):
    """Benchmark application for tuple names: looks for two specific
    element names and counts elements per namespace URI."""

    def __init__(self):
        self.__airports = 0      # elements equal to apt_airport2
        self.__with_coords = 0   # elements equal to apt_latitude2
        self.__apt_elems = 0     # elements whose first item is apt_uri
        self.__rdf_elems = 0     # elements whose first item is rdf_uri

    def handle_start_tag(self, name, attrs):
        "Updates the four counters for one start tag; attrs is not used."
        # Namespace membership is tested on the first item of the name.
        if name[0] == apt_uri:
            self.__apt_elems = self.__apt_elems + 1
        elif name[0] == rdf_uri:
            self.__rdf_elems = self.__rdf_elems + 1

        # Exact match against the two names of interest.
        if name == apt_airport2:
            self.__airports = self.__airports + 1
        elif name == apt_latitude2:
            self.__with_coords = self.__with_coords + 1
# ==================== MAIN PROGRAM
from xml.parsers.xmlproc import xmlproc
import time
p = xmlproc.XMLProcessor()
start = time.clock()
p.set_application(NamespaceFilterTuple(p))
p.parse_resource("airports.rdf")
used = time.clock() - start
print "Pure parsing time:", used
print
print "---Generic:"
for filter in [NamespaceFilterString, NamespaceFilterInternedString,
NamespaceFilterTuple]:
p = xmlproc.XMLProcessor()
nsfilter = filter(p)
nsfilter.set_application(GenericStats())
p.set_application(nsfilter)
start = time.clock()
p.parse_resource("airports.rdf")
used = time.clock() - start
print "%30s\t%s" % (filter, used)
print
print "---Specific:"
for (Filter, App) in [(NamespaceFilterString, SpecificStatsString),
(NamespaceFilterInternedString, SpecificStatsString),
(NamespaceFilterTuple, SpecificStatsTuple)]:
p = xmlproc.XMLProcessor()
nsfilter = Filter(p)
nsfilter.set_application(App())
p.set_application(nsfilter)
start = time.clock()
p.parse_resource("airports.rdf")
used = time.clock() - start
print "%30s\t%s" % (Filter, used)
#--Lars M.