[XML-SIG] DOM notes, and xml.marshal module

Andrew M. Kuchling akuchlin@cnri.reston.va.us
Sun, 19 Apr 1998 16:40:49 -0400 (EDT)


I've written a first cut at a marshal module that converts a simple
Python data structure to and from a simple XML representation, using
the DOM implementation.  The code's included below.  Some notes:

	* There's one problem with xml.marshal at the moment; you
can't marshal multiple objects to the same stream because, when you
read the data again, the parser doesn't read one data item and stop,
but reads them all.

	For example, None is converted to a <none/> tag; if you marshal
None to the same file object twice, you get <none/><none/>.  But when
you parse this, the parser builds a tree containing both tags.  If an
XML document must contain a single top-level element, then I think
parsers should recognize when that top-level element has been
completed and stop.

	Any thoughts on this question?  What's the correct behaviour?

	* The Walker class's walk1() method isn't consistent in
returning values.  walk() does "return self.walk1()", but walk1()
never returns anything; this should probably be fixed.  For
xml.marshal, I therefore overrode the walk1() method, but I'm not sure
that's how Walker is intended to be used.

	On the other hand, unmarshalling using just startElement(),
endElement(), and doText() would have been more complicated, so
overriding was the easiest thing to do.

-- 
A.M. Kuchling				http://starship.skyport.net/crew/amk/
Despair says little, and is patient.
	-- From SANDMAN: "Season of Mists", episode 0



# xml.marshal : Marshals simple Python data types into an XML-based
# format.  The interface is the same as the built-in module of the
# same name, with four functions: 
#   dump(value, file), load(file)
#   dumps(value), loads(string)

#  XXX Should provide a DTD for the XML format here.

from xml.dom.builder import Builder
from xml.dom.writer import XmlWriter, XmlLineariser
from types import *

# Dictionary mapping some of the simple types to the corresponding tag
_mapping = {StringType:'string', IntType:'int', 
	   FloatType:'float'}

# Internal function; recursively marshals a simple Python data type,
# acting on a DOM Builder object.  

def _marshal_text(tree, name, text):
    "Emit a <name> element whose only content is `text`"
    tree.startElement(name, {})
    tree.text(text)
    tree.endElement(name)

def _marshal(value, tree):
    """Recursively marshal `value` by issuing events on the builder `tree`.

    value -- a string, int, long, float, complex, None, code object,
             or a tuple/list/dict built from such values
    tree  -- an object providing startElement(name, attrs), text(data)
             and endElement(name)

    Returns `tree`.  Raises ValueError when `value` (or anything nested
    inside it) is of a type that has no XML representation here, rather
    than silently leaving it out of the output.
    """
    t = type(value)
    if _mapping.has_key(t):
        if t == FloatType:
            # repr() never carries fewer digits than str(), so the
            # float survives a round trip as accurately as possible.
            text = repr(value)
        else:
            text = str(value)
        _marshal_text(tree, _mapping[t], text)

    elif t == LongType:
        # repr() of a long ends in 'L'; str() only does so on some
        # interpreter versions.  Strip the suffix only when present so
        # that a real digit is never chopped off.
        digits = repr(value)
        if digits[-1:] == 'L':
            digits = digits[:-1]
        _marshal_text(tree, 'long', digits)

    elif t == TupleType or t == ListType:
        if t == TupleType:
            name = 'tuple'
        else:
            name = 'list'
        tree.startElement(name, {})
        for elem in value:
            _marshal(elem, tree)
        tree.endElement(name)

    elif t == DictType:
        # A dict is written as a flat sequence: key, value, key, value...
        tree.startElement('dict', {})
        for key, v in value.items():
            _marshal(key, tree)
            _marshal(v, tree)
        tree.endElement('dict')

    elif t == NoneType:
        tree.startElement('none', {})
        tree.endElement('none')

    elif t == ComplexType:
        tree.startElement('complex', {})
        _marshal_text(tree, 'real', repr(value.real))
        _marshal_text(tree, 'imag', repr(value.imag))
        tree.endElement('complex')

    elif t == CodeType:
        # The full information about code objects is only available
        # from the C level, so we'll use the built-in marshal module
        # to convert the code object into a string, and include it in
        # the XML.
        # NOTE(review): marshal.dumps() returns arbitrary bytes, some of
        # which may not be legal XML character data -- an encoding such
        # as base64 may be needed; confirm against the writer and parser.
        import marshal
        _marshal_text(tree, 'code', marshal.dumps(value))

    else:
        # Same behaviour as the built-in marshal module for types it
        # cannot handle.
        raise ValueError("unmarshallable object")

    return tree

# The following class walks over a DOM tree, constructing the Python
# data objects for each node.
# XXX This was done by subclassing Walker and overriding the walk1()
# method; is this the way Walker is supposed to be used?

from xml.dom.walker import Walker
from xml.dom.core import *

class UnmarshallingWalker(Walker):
    """Walker that rebuilds Python values from a marshalled DOM tree.

    walk1() is overridden so that it returns the Python object
    described by the element it is given.
    """

    def _element_children(self, node):
        "Return the children of `node` that are elements; all other children are dropped"
        return filter(lambda x: x.NodeType == ELEMENT, node.getChildren())

    def _text(self, node):
        "Return the concatenated data of the children of `node`, which must all be text nodes"
        data = ""
        for child in node.getChildren():
            assert child.NodeType == TEXT
            data = data + child.data
        return data

    def walk1(self, node):
        """Return the Python value represented by the element `node`.

        Structural problems (a non-element node, an odd number of
        children in a <dict>, a <complex> without exactly two element
        children, non-text content in a scalar) fail an assertion.
        An element with an unrecognised tag name raises ValueError, as
        does text that int(), long() or float() cannot convert.
        """
        assert node.NodeType == ELEMENT
        n = node.tagName
        if n == 'tuple' or n == 'list':
            L = []
            for child in self._element_children(node):
                L.append( self.walk1(child) )
            if n == 'tuple':
                return tuple(L)
            return L

        elif n == 'dict':
            # Children alternate key, value, key, value...
            children = self._element_children(node)
            assert (len(children) % 2) == 0
            d = {}
            for i in range(0, len(children), 2):
                key = self.walk1(children[i])
                value = self.walk1(children[i+1])
                d[key] = value
            return d

        elif n == 'none':
            return None

        elif n == 'complex':
            children = self._element_children(node)
            assert len(children) == 2
            real = self.walk1(children[0])
            imag = self.walk1(children[1])
            return complex(real, imag)

        elif n == 'string':
            return self._text(node)

        elif n == 'code':
            # NOTE: marshal.loads() must not be fed data from an
            # untrusted source; only unmarshal XML you produced yourself.
            import marshal
            return marshal.loads(self._text(node))

        # The remaining scalars are parsed from their text content.  The
        # text is concatenated first, so it may arrive as one text node
        # or as several.
        elif n == 'int':
            return int(self._text(node))
        elif n == 'long':
            return long(self._text(node))
        elif n == 'float' or n == 'real' or n == 'imag':
            return float(self._text(node))

        else:
            # Previously an unknown tag with one text child silently
            # produced None, indistinguishable from a real <none/>.
            raise ValueError("unknown element <%s> in marshalled data" % n)

def dump(value, file):
    """Write the value on the open file

    value -- the Python object to marshal
    file  -- the object handed to XmlWriter to receive the XML text

    The line-break settings are the same as those used by dumps(), so
    that the file form and the string form of a value are formatted
    alike.
    """
    builder = _marshal(value, Builder() )
    w = XmlWriter( file )
    w.newline_after_start = ['list', 'tuple', 'dict']
    w.newline_after_end = ['list', 'tuple', 'dict', 'none', 'int',
                           'long', 'float', 'complex', 'string']
    w.write(builder.document)

def load(file):
    """Read one value from the open file

    The whole of `file` is handed to the parser, and the resulting
    document is then converted with UnmarshallingWalker.  As the
    accompanying notes say, the parser does not stop after one item,
    so several values dumped to the same stream cannot be read back
    one at a time.
    """
    import xml.sax.saxlib
    import xml.sax.drv_xmllib
    from xml.dom.sax_builder import SaxBuilder

    parser = xml.sax.drv_xmllib.SAX_XLParser()
    handler = SaxBuilder()
    parser.setDocumentHandler(handler)
    # NOTE(review): the first argument is passed as an empty string;
    # presumably it is a system identifier -- confirm against the driver.
    parser.parse('', file)

    return UnmarshallingWalker().walk(handler.document)

def dumps(value):
    """Marshal value, returning the resulting string

    The value is marshalled onto a fresh Builder and the resulting
    document is turned into text by an XmlLineariser.
    """
    marshalled = _marshal(value, Builder())

    lineariser = XmlLineariser()
    containers = ['list', 'tuple', 'dict']
    scalars = ['none', 'int', 'long', 'float', 'complex', 'string']
    lineariser.newline_after_start = containers
    # Concatenation builds a new list, so the two settings stay
    # independent objects.
    lineariser.newline_after_end = containers + scalars

    return lineariser.linearise(marshalled.document)

def loads(string):
    """Read one value from the string

    The string is wrapped in a StringIO object and passed to load().
    """
    from StringIO import StringIO
    return load(StringIO(string))

if __name__ == '__main__':
    print "Testing XML marshalling..."
    L=[None, 1, pow(2,123L), 19.72, 1+5j, 
       "here is a string <fake tag>",
       (1,2,3), 
       ['alpha', 'beta', 'gamma'], 
       {'key':'value', 1:2}, 
       dumps.func_code ]

    # Try all the above bits of data
    import StringIO
    print "The second and third numbers in each line should both be 1."

    for item in L + [ L ]:
	s = dumps(item)
	output = loads(s)

	# Try it from a file
	file = StringIO.StringIO()
	dump(item, file)
	file.seek(0)
	output2 = load(file)

	# Verify that the parser only reads as far as is required
	# XXX this test currently fails (see text of posting)
	##file = StringIO.StringIO( 2 * dumps(item) )
	##print file.getvalue()
	##output3 = load( file )
	##output4 = load( file )
 

	print repr(item), item==output, item==output2