[Python-checkins] CVS: python/dist/src/Lib/xml/dom minidom.py,NONE,1.1 pulldom.py,NONE,1.1

Fred L. Drake python-dev@python.org
Thu, 29 Jun 2000 12:39:59 -0700


Update of /cvsroot/python/python/dist/src/Lib/xml/dom
In directory slayer.i.sourceforge.net:/tmp/cvs-serv5599

Added Files:
	minidom.py pulldom.py 
Log Message:

Paul Prescod <paul@prescod.net>:
W3C DOM implementation for Python.


--- NEW FILE ---
import pulldom
import string
from StringIO import StringIO
import types

"""
minidom.py -- a lightweight DOM implementation based on SAX.

Todo:
=====
 * convenience methods for getting elements and text.
 * more testing
 * bring some of the writer and linearizer code into conformance with this
        interface
 * SAX 2 namespaces
"""

class Node:
    """Base class shared by every node type in this module.

    The class attributes below are the numeric node-type codes that the
    subclasses publish through their ``nodeType`` attribute.

    Attribute access falls back to ``__getattr__``, which maps between
    plain attributes and ``_get_<name>`` accessor methods in both
    directions.
    """
    ELEMENT_NODE                = 1
    ATTRIBUTE_NODE              = 2
    TEXT_NODE                   = 3
    CDATA_SECTION_NODE          = 4
    ENTITY_REFERENCE_NODE       = 5
    ENTITY_NODE                 = 6
    PROCESSING_INSTRUCTION_NODE = 7
    COMMENT_NODE                = 8
    DOCUMENT_NODE               = 9
    DOCUMENT_TYPE_NODE          = 10
    DOCUMENT_FRAGMENT_NODE      = 11
    NOTATION_NODE               = 12

    # One string (id + class repr) per node that has been created and not
    # yet unlinked; shared by all nodes.
    allnodes=[]

    def __init__( self ):
        self.childNodes=[]
        Node.allnodes.append( repr( id( self ))+repr( self.__class__ ))

    def __getattr__( self, key ):
        """Resolve attributes that normal lookup did not find.

        ``node._get_foo`` returns a zero-argument callable yielding
        ``node.foo`` when ``foo`` exists; ``node.foo`` returns the result
        of ``node._get_foo()`` when that accessor exists.  Anything else
        raises AttributeError naming the requested key.
        """
        if key[0:2]=="__":
            raise AttributeError(key)
        # getattr should never call getattr!  The "inGetAttr" entry in the
        # instance dict marks a lookup that is already in progress.
        if "inGetAttr" in self.__dict__:
            del self.inGetAttr
            raise AttributeError(key)

        prefix,attrname=key[:5],key[5:]
        if prefix=="_get_":
            self.inGetAttr=1
            found=hasattr( self, attrname )
            # A failed nested lookup has already consumed the guard flag,
            # so only remove it when it is still there.
            if "inGetAttr" in self.__dict__:
                del self.inGetAttr
            if not found:
                raise AttributeError(key)
            return (lambda self=self, attrname=attrname:
                            getattr( self, attrname ))
        else:
            self.inGetAttr=1
            try:
                func = getattr( self, "_get_"+key )
            except AttributeError:
                # Never leave the guard behind, otherwise the next lookup
                # on this node would fail spuriously.
                if "inGetAttr" in self.__dict__:
                    del self.inGetAttr
                raise AttributeError(key)
            del self.inGetAttr
            return func()

    # Nodes are always true, regardless of how many children they have.
    def __nonzero__(self): return 1

    def toxml( self ):
        """Return the text that writexml() produces for this node."""
        writer=StringIO()
        self.writexml( writer )
        return writer.getvalue()

    def hasChildNodes( self ):
        """Return 1 if the node has at least one child, else 0."""
        if self.childNodes: return 1
        else: return 0

    def insertBefore( self, newChild, refChild):
        """Insert newChild in front of refChild and return newChild.

        A refChild of None appends newChild at the end.  Raises ValueError
        when refChild is not one of this node's children.
        """
        if refChild is None:
            self.childNodes.append( newChild )
        else:
            index=self.childNodes.index( refChild )
            self.childNodes.insert( index, newChild )
        return newChild

    def appendChild( self, node ):
        """Append node to the child list and return it."""
        self.childNodes.append( node )
        return node

    def unlink( self ):
        """Drop the references this node holds to other nodes.

        Children and attribute nodes are unlinked recursively, and the
        node's entry is removed from Node.allnodes.  Calling unlink() a
        second time is harmless.
        """
        self.parentNode=None
        while self.childNodes:
            self.childNodes[-1].unlink()
            del self.childNodes[-1] # probably not most efficient!
        self.childNodes=None
        # Not every node defines `attributes`; treat a missing one as None.
        attributes=getattr( self, "attributes", None )
        if attributes:
            for attr in attributes.values():
                attr.unlink()
        self.attributes=None
        try:
            index=Node.allnodes.index( repr( id( self ))+repr( self.__class__ ))
        except ValueError:
            # already unlinked
            pass
        else:
            del Node.allnodes[index]

def _write_data( writer, data):
    "Writes datachars to writer."
    data=string.replace(data,"&","&amp;")
    data=string.replace(data,"<","&lt;")
    data=string.replace(data,"\"","&quot;")
    data=string.replace(data,">","&gt;")
    writer.write(data)

def _closeElement( element ):
    del element.parentNode
    for node in element.elements:
        _closeElement( node )

def _getElementsByTagNameHelper( parent, name, rc ):
    for node in parent.childNodes:
        if node.nodeType==Node.ELEMENT_NODE and\
            (name=="*" or node.tagName==name):
            rc.append( node )
        _getElementsByTagNameHelper( node, name, rc )
    return rc

def _getElementsByTagNameNSHelper( parent, nsURI, localName, rc ):
    for node in parent.childNodes:
        if (node.nodeType==Node.ELEMENT_NODE ):
            if ((localName=="*" or node.tagName==localName) and
            (nsURI=="*" or node.namespaceURI==nsURI)):
                rc.append( node )
            _getElementsByTagNameNSHelper( node, name, rc )

class Attr(Node):
    """An attribute node.

    `name` and `nodeName` both hold the qualified name.  `value` and
    `nodeValue` are kept equal by __setattr__; neither is set by the
    constructor.
    """
    nodeType = Node.ATTRIBUTE_NODE

    def __init__(self, qName, namespaceURI="", prefix="", localName=None):
        Node.__init__(self)
        assert qName
        # Write straight into the instance dict: this skips __setattr__,
        # which is a performance shortcut.
        fields = self.__dict__
        fields["nodeName"] = fields["name"] = qName
        fields["localName"] = localName or qName
        fields["prefix"] = prefix
        fields["namespaceURI"] = namespaceURI
        # nodeValue and value are set elsewhere
        self.attributes = None

    def __setattr__(self, name, value):
        """Store an attribute; "value" and "nodeValue" are set together."""
        fields = self.__dict__
        if name == "value" or name == "nodeValue":
            fields["value"] = fields["nodeValue"] = value
        else:
            fields[name] = value

class AttributeList:
    """Read-only view over an element's two attribute dictionaries.

    attrs maps qualified names to attribute nodes; attrsNS maps
    (namespaceURI, localName) tuples to the same nodes.
    """
    # the attribute list is a transient interface to the underlying dictionaries
    # mutations here will change the underlying element's dictionary
    def __init__( self, attrs, attrsNS ):
        self.__attrs=attrs
        self.__attrsNS=attrsNS
        # Snapshot taken at construction time; it is not refreshed when
        # the underlying dictionary changes afterwards.
        self.length=len( self.__attrs )

    def item( self, index ):
        """Return the value of the index-th attribute, or None if out of range."""
        try:
            return self[self.keys()[index]]
        except IndexError:
            return None

    def items( self ):
        """Return a list of (qualified name, value) pairs."""
        return [(node.nodeName, node.value)
                for node in self.__attrs.values()]

    def itemsNS( self ):
        """Return a list of ((namespaceURI, localName), value) pairs."""
        return [((node.namespaceURI, node.localName), node.value)
                for node in self.__attrs.values()]

    def keys( self ):
        """Return the qualified attribute names as a list."""
        return list( self.__attrs.keys() )

    def keysNS( self ):
        """Return the (namespaceURI, localName) keys as a list."""
        return list( self.__attrsNS.keys() )

    def values( self ):
        """Return the attribute nodes as a list."""
        return list( self.__attrs.values() )

    def __len__( self ):
        return self.length

    def __cmp__( self, other ):
        # Two lists are equal when they wrap the very same dictionary.
        if self.__attrs is other.__attrs:
            return 0
        else:
            return cmp( id( self ), id( other ) )

    #FIXME: is it appropriate to return .value?
    def __getitem__( self, attname_or_tuple ):
        """Return an attribute value.

        A tuple key is looked up as (namespaceURI, localName); any other
        key is looked up as a qualified name.  Raises KeyError if absent.
        """
        if type( attname_or_tuple ) == type( (1,2) ):
            return self.__attrsNS[attname_or_tuple].value
        else:
            return self.__attrs[attname_or_tuple].value

    def __setitem__( self, attname, value ):
        raise TypeError("object does not support item assignment")
        
class Element( Node ):
    """An element node with double-indexed attributes."""
    nodeType=Node.ELEMENT_NODE
    def __init__( self, tagName, namespaceURI="", prefix="",
                  localName=None ):
        Node.__init__( self )
        self.tagName = self.nodeName = tagName
        self.localName=localName or tagName
        self.prefix=prefix
        self.namespaceURI=namespaceURI
        self.nodeValue=None

        self.__attrs={}  # attributes are double-indexed:
        self.__attrsNS={}#    tagName -> Attribute
                #    URI,localName -> Attribute
                # in the future: consider lazy generation of attribute objects
                #                this is too tricky for now because of headaches
                #                with namespaces.

    def getAttribute( self, attname ):
        """Return the value of the named attribute; KeyError if absent."""
        return self.__attrs[attname].value

    def getAttributeNS( self, namespaceURI, localName ):
        """Return an attribute value by namespace; KeyError if absent."""
        return self.__attrsNS[(namespaceURI, localName)].value

    def setAttribute( self, attname, value ):
        """Create or replace the attribute attname with the given value."""
        attr=Attr( attname )
        # for performance
        attr.__dict__["value"]=attr.__dict__["nodeValue"]=value
        self.setAttributeNode( attr )

    def setAttributeNS( self, namespaceURI, qualifiedName, value ):
        """Create or replace a namespaced attribute.

        qualifiedName is split at its first colon into prefix and local
        name; a name without a colon gets an empty prefix.
        """
        if ":" in qualifiedName:
            prefix, localName = qualifiedName.split( ":", 1 )
        else:
            prefix, localName = "", qualifiedName
        attr=Attr( qualifiedName, namespaceURI, prefix, localName )
        # for performance
        attr.__dict__["value"]=attr.__dict__["nodeValue"]=value
        self.setAttributeNode( attr )

    def setAttributeNode( self, attr ):
        """Register attr under both its name and its (URI, localName)."""
        self.__attrs[attr.name]=attr
        self.__attrsNS[(attr.namespaceURI,attr.localName)]=attr

    def removeAttribute( self, name ):
        """Remove the attribute called name; KeyError if absent."""
        attr = self.__attrs[name]
        self.removeAttributeNode( attr )

    def removeAttributeNS( self, namespaceURI, localName ):
        """Remove an attribute by namespace; KeyError if absent."""
        attr = self.__attrsNS[(namespaceURI, localName)]
        self.removeAttributeNode( attr )

    def removeAttributeNode( self, node ):
        """Remove node from both attribute indexes."""
        del self.__attrs[node.name]
        del self.__attrsNS[(node.namespaceURI, node.localName)]

    def getElementsByTagName( self, name ):
        """Return the descendant elements whose tagName is name ("*" = all)."""
        return _getElementsByTagNameHelper( self, name, [] )

    def getElementsByTagNameNS(self,namespaceURI,localName):
        """Return the descendant elements matching namespaceURI and localName."""
        # Keep our own reference to the list so the result does not depend
        # on what the helper returns.
        rc=[]
        _getElementsByTagNameNSHelper( self, namespaceURI, localName, rc )
        return rc

    def __repr__( self ):
        return "<DOM Element:"+self.tagName+" at "+repr( id( self ) ) +" >"

    def writexml(self, writer):
        """Write the element, its attributes (sorted by name) and children."""
        writer.write("<"+self.tagName)

        # Build the attribute view once instead of once per attribute.
        attrs=self._get_attributes()
        a_names=list( attrs.keys() )
        a_names.sort()

        for a_name in a_names:
            writer.write(" "+a_name+"=\"")
            _write_data(writer, attrs[a_name])
            writer.write("\"")
        if self.childNodes:
            writer.write(">")
            for node in self.childNodes:
                node.writexml( writer )
            writer.write("</"+self.tagName+">")
        else:
            writer.write("/>")

    def _get_attributes( self ):
        """Return a new AttributeList over this element's attributes."""
        return AttributeList( self.__attrs, self.__attrsNS )

class Comment(Node):
    """A comment node; `data` and `nodeValue` both hold the comment text."""
    nodeType = Node.COMMENT_NODE

    def __init__(self, data):
        Node.__init__(self)
        self.nodeName = "#comment"
        self.attributes = None
        self.nodeValue = data
        self.data = data

    def writexml(self, writer):
        """Write the comment in one write() call; the text is not escaped."""
        text = "<!--" + self.data
        text = text + "-->"
        writer.write(text)

class ProcessingInstruction(Node):
    """A processing-instruction node.

    `target` doubles as `nodeName`, and `data` doubles as `nodeValue`.
    """
    nodeType = Node.PROCESSING_INSTRUCTION_NODE

    def __init__(self, target, data):
        Node.__init__(self)
        self.nodeName = target
        self.target = target
        self.nodeValue = data
        self.data = data
        self.attributes = None

    def writexml(self, writer):
        """Write the instruction as "<?target data?>" in one write() call."""
        opening = "<?" + self.target + " "
        writer.write(opening + self.data + "?>")

class Text(Node):
    """A text node; `data` and `nodeValue` both hold the character data."""
    nodeType = Node.TEXT_NODE
    nodeName = "#text"

    def __init__(self, data):
        Node.__init__(self)
        self.nodeValue = data
        self.data = data
        self.attributes = None

    def __repr__(self):
        # Show at most the first ten characters, marking any truncation.
        preview = self.data[0:10]
        if len(self.data) > 10:
            tail = "..."
        else:
            tail = ""
        return "<DOM Text node \"" + preview + tail + "\">"

    def writexml(self, writer):
        """Write the text with markup characters escaped by _write_data."""
        _write_data(writer, self.data)

class Document( Node ):
    """The document node: the root of a tree and a factory for other nodes."""
    nodeType=Node.DOCUMENT_NODE
    def __init__( self ):
        Node.__init__( self )
        self.documentElement=None
        self.attributes=None
        self.nodeName="#document"
        self.nodeValue=None

    # The simple factory methods are the node classes themselves.
    createElement=Element

    createTextNode=Text

    createComment=Comment

    createProcessingInstruction=ProcessingInstruction

    createAttribute=Attr

    def createElementNS(self, namespaceURI, qualifiedName):
        """Create an Element for a namespace-qualified name.

        qualifiedName is split at its first colon into prefix and local
        name; a name without a colon gets an empty prefix.
        """
        if ":" in qualifiedName:
            prefix, localName = qualifiedName.split( ":", 1 )
        else:
            prefix, localName = '', qualifiedName
        return Element(qualifiedName, namespaceURI, prefix, localName)

    def createAttributeNS(self, namespaceURI, qualifiedName):
        """Create an Attr for a namespace-qualified name.

        qualifiedName is split at its first colon into prefix and local
        name; a name without a colon gets a prefix of None.
        """
        if ":" in qualifiedName:
            prefix, localName = qualifiedName.split( ":", 1 )
        else:
            prefix, localName = None, qualifiedName
        return Attr(qualifiedName, namespaceURI, prefix, localName)

    def getElementsByTagNameNS(self,namespaceURI,localName):
        """Return the descendant elements matching namespaceURI and localName."""
        # Keep our own reference to the list so the result does not depend
        # on what the helper returns.
        rc=[]
        _getElementsByTagNameNSHelper( self, namespaceURI, localName, rc )
        return rc

    def close( self ):
        """Run _closeElement over the document's children.

        No class in this module defines `elements`, so the child list is
        used when that attribute is missing.
        """
        children=getattr( self, "elements", None )
        if children is None:
            children=self.childNodes or []
        for node in children:
            _closeElement( node )

    def unlink( self ):
        """Clear documentElement, then unlink like any other node."""
        self.documentElement=None
        Node.unlink( self )

    def getElementsByTagName( self, name ):
        """Return the descendant elements whose tagName is name ("*" = all)."""
        rc=[]
        _getElementsByTagNameHelper( self, name, rc )
        return rc

    def writexml( self, writer ):
        """Write every child of the document in order."""
        for node in self.childNodes:
            node.writexml( writer )

def _doparse( func, args, kwargs ):
    events=apply( func, args, kwargs )
    (toktype, rootNode)=events.getEvent()
    events.expandNode( rootNode )
    return rootNode

def parse(*args, **kwargs):
    """Return the tree root built from pulldom.parse(*args, **kwargs)."""
    event_source = pulldom.parse
    return _doparse(event_source, args, kwargs)

def parseString(*args, **kwargs):
    """Return the tree root built from pulldom.parseString(*args, **kwargs)."""
    event_source = pulldom.parseString
    return _doparse(event_source, args, kwargs)

--- NEW FILE ---
import minidom
import types
import string
import sys
import pyexpat
from xml.sax import ExpatParser

#todo: SAX2/namespace handling

# Event-type tokens.  PullDOM stores one of these as the first item of
# every (token, node) event tuple it queues.
START_ELEMENT="START_ELEMENT"
END_ELEMENT="END_ELEMENT"
COMMENT="COMMENT"
START_DOCUMENT="START_DOCUMENT"
END_DOCUMENT="END_DOCUMENT"
PROCESSING_INSTRUCTION="PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE="IGNORABLE_WHITESPACE"
CHARACTERS="CHARACTERS"

class PullDOM:
    """Content handler that turns parser callbacks into queued DOM events.

    Events are (token, node) tuples kept in a singly linked list of
    [event, next] cells.  firstEvent is a dummy head cell and lastEvent
    is the cell new events are appended to.
    """
    def __init__( self ):
        self.firstEvent=[None,None]
        self.lastEvent=self.firstEvent

    def _pushEvent( self, token, node ):
        # Append one (token, node) event to the end of the linked list.
        cell=[(token, node), None ]
        self.lastEvent[1]=cell
        self.lastEvent=cell

    def _linkToParent( self, node ):
        # Make the current node the parent of `node` and, if the parent
        # already has children, link `node` after the last of them.
        parent=self.curNode
        node.parentNode = parent
        if parent.childNodes:
            node.previousSibling=parent.childNodes[-1]
            node.previousSibling.nextSibling=node

    def setDocumentLocator( self, locator ): pass

    def startElement( self, tagName , attrs  ):
        """Queue START_ELEMENT for a new element and make it current."""
        if not hasattr( self, "curNode" ):
            # FIXME: hack!
            self.startDocument( )

        node = self.document.createElement( tagName ) #FIXME namespaces!
        for attr in attrs.keys():
            node.setAttribute( attr, attrs[attr] )

        self._linkToParent( node )
        self.curNode = node
        # FIXME: do I have to screen namespace attributes
        self._pushEvent( START_ELEMENT, node )

    def endElement( self, name ):
        """Queue END_ELEMENT for the current element and pop to its parent."""
        node = self.curNode
        self._pushEvent( END_ELEMENT, node )
        self.curNode = node.parentNode

    def comment( self, s):
        """Queue COMMENT for a new comment node."""
        node = self.document.createComment ( s )
        self._linkToParent( node )
        self._pushEvent( COMMENT, node )

    def processingInstruction( self, target, data ):
        """Queue PROCESSING_INSTRUCTION for a new instruction node."""
        node = self.document.createProcessingInstruction( target, data )
        self._linkToParent( node )
        self._pushEvent( PROCESSING_INSTRUCTION, node )

    def ignorableWhitespace( self, chars ):
        """Queue IGNORABLE_WHITESPACE for a text node holding chars."""
        # The whole string is used; this method receives no start/length.
        node = self.document.createTextNode( chars )
        self._linkToParent( node )
        self._pushEvent( IGNORABLE_WHITESPACE, node )

    def characters( self, chars ):
        """Queue CHARACTERS for a text node holding chars."""
        node = self.document.createTextNode( chars )
        # Unlike the other handlers, no sibling links are set here.
        node.parentNode=self.curNode
        self._pushEvent( CHARACTERS, node )

    def startDocument( self ):
        """Create the Document, make it current and queue START_DOCUMENT."""
        node = self.curNode = self.document = minidom.Document()
        node.parentNode=None
        self._pushEvent( START_DOCUMENT, node )

    def endDocument( self ):
        """Record the document element and queue END_DOCUMENT."""
        assert( not self.curNode.parentNode )
        for node in self.curNode.childNodes:
                if node.nodeType==node.ELEMENT_NODE:
                        self.document.documentElement = node
        #if not self.document.documentElement:
        #        raise Error, "No document element"

        # The event carries the current (top-level) node rather than the
        # loop variable above, which is unbound when there are no children.
        self._pushEvent( END_DOCUMENT, self.curNode )

class ErrorHandler:
    """Error handler that prints warnings and re-raises everything else."""
    def warning( self, exception ):
        """Print the warning on standard output."""
        # Call form: gives the same output whether print is a statement
        # or a function.
        print(exception)
    def error( self, exception ):
        """Raise the reported exception."""
        raise exception
    def fatalError( self, exception ):
        """Raise the reported exception."""
        raise exception

class DOMEventStream:
    """Pulls (token, node) events out of a PullDOM fed from a stream.

    The stream is read bufsize characters at a time and handed to the
    parser only when the event queue is empty.
    """

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        self.reset()

    def reset(self):
        """Install a fresh PullDOM as the parser's content handler."""
        handler = PullDOM()
        self.pulldom = handler
        self.parser.setContentHandler(handler)

    def __getitem__(self, pos):
        # `pos` is ignored: each lookup simply yields the next event, and
        # IndexError signals that there are none left.
        event = self.getEvent()
        if not event:
            raise IndexError
        return event

    def expandNode(self, node):
        """Consume events, attaching each node to its parent's child list.

        Stops as soon as an event carrying `node` itself is seen.  If the
        events run out first and `node` is a document, its documentElement
        is set to its last element child.
        """
        while 1:
            event = self.getEvent()
            if not event:
                break
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                cur_node.parentNode.childNodes.append(cur_node)
        if node.nodeType == minidom.Node.DOCUMENT_NODE:
            for child in node.childNodes:
                if child.nodeType == minidom.Node.ELEMENT_NODE:
                    node.documentElement = child

    def getEvent(self):
        """Return the next queued event, or None when the stream is empty."""
        pulldom = self.pulldom
        if not pulldom.firstEvent[1]:
            # queue is empty: new events are appended right after the head
            pulldom.lastEvent = pulldom.firstEvent
        while not pulldom.firstEvent[1]:
            chunk = self.stream.read(self.bufsize)
            if not chunk:
                #FIXME: why doesn't Expat close work?
                #self.parser.close()
                return None
            self.parser.feed(chunk)
        # unlink the first real cell and hand out its event
        cell = pulldom.firstEvent[1]
        pulldom.firstEvent[1] = cell[1]
        return cell[0]

# FIXME: sax2
#def _getParser( ):
 #   from xml.sax.saxexts import make_parser
    # expat doesn't report errors properly! Figure it out
  #  return make_parser()
   # return make_parser("xml.sax.drivers.drv_xmllib")


        
def _getParser():
    """Return a new ExpatParser, the parser used when none is supplied."""
    parser = ExpatParser()
    return parser

default_bufsize=(2**14)-20
# FIXME: move into sax package for common usage
def parse( stream_or_string, parser=None, bufsize=default_bufsize ):
    """Return a DOMEventStream reading from a file name or an open stream.

    stream_or_string -- a file name (byte or unicode string), which is
                        opened for reading, or any object with read()
    parser           -- parser to use; _getParser() supplies one if omitted
    bufsize          -- number of characters read from the stream at a time

    A file opened here is not closed by this function.
    """
    # isinstance() also accepts unicode file names and string subclasses,
    # which the exact type comparison rejected.
    if isinstance( stream_or_string, (type( "" ), type( u"" )) ):
        stream=open( stream_or_string )
    else:
        stream=stream_or_string
    if not parser:
        parser=_getParser()
    return DOMEventStream( stream, parser, bufsize )

def parseString( string, parser=None ):
    """Return a DOMEventStream that reads its input from `string`.

    string -- the document text
    parser -- parser to use; _getParser() supplies one if omitted

    The buffer size is the length of the string, so the whole text is
    handed to the parser in a single read.
    """
    try:
        import cStringIO
        stringio=cStringIO.StringIO
    except ImportError:
        import StringIO
        stringio=StringIO.StringIO

    bufsize=len( string )
    # Keep the file-like wrapper: it is the stream the events are read from.
    buf=stringio( string )
    # Honour a parser supplied by the caller instead of replacing it.
    if not parser:
        parser=_getParser()
    return DOMEventStream( buf, parser, bufsize )

#FIXME: Use Lars' instead!!!
#FIXME: Use Lars' instead!!!
class SAX_expat:
    "SAX driver for the Pyexpat C module."

    def __init__(self):
        self.parser=pyexpat.ParserCreate()
        self.started=0
        # Handlers are installed later through the set*Handler methods.
        self.doc_handler=None
        self.err_handler=None

    def __bind_handler( self, handler ):
        # Point the pyexpat callbacks of the current parser at `handler`.
        self.parser.StartElementHandler = handler.startElement
        self.parser.EndElementHandler = handler.endElement
        # Accept either spelling of the character-data callback: the
        # original code asked for `datachars`, while the handler class in
        # this module provides `characters`.
        chars = getattr( handler, "datachars", None )
        if chars is None:
            chars = handler.characters
        self.parser.CharacterDataHandler = chars
        self.parser.ProcessingInstructionHandler = handler.processingInstruction

    def setDocumentHandler( self, handler ):
        """Route the parser's callbacks to handler and remember it."""
        self.__bind_handler( handler )
        self.doc_handler=handler

    def setErrorHandler( self, handler ):
        """Remember the handler whose fatalError() receives parse errors."""
        self.err_handler=handler

    # --- Locator methods. Only usable after errors.

    def getLineNumber(self):
        return self.parser.ErrorLineNumber

    def getColumnNumber(self):
        return self.parser.ErrorColumnNumber

    # --- Internal

    def __report_error(self):
        msg=pyexpat.ErrorString(self.parser.ErrorCode)
        self.err_handler.fatalError(msg)

    # --- EXPERIMENTAL PYTHON SAX EXTENSIONS

    def get_parser_name(self):
        return "pyexpat"

    def get_parser_version(self):
        return "Unknown"

    def get_driver_version(self):
        # NOTE(review): the original returned a global `version` that this
        # module never defines; report "Unknown" like get_parser_version().
        return "Unknown"

    def is_validating(self):
        return 0

    def is_dtd_reading(self):
        return 0

    def reset(self):
        """Start over with a new parser, keeping the installed handler."""
        self.parser=pyexpat.ParserCreate()
        # The next feed() must announce a new document.
        self.started=0
        if self.doc_handler is not None:
            self.__bind_handler( self.doc_handler )

    def feed(self,data):
        """Parse one chunk; startDocument() is sent before the first one."""
        if not self.started:
            self.doc_handler.startDocument()
            self.started=1
        if not self.parser.Parse(data):
            self.__report_error()

    def close(self):
        """Finish parsing, send endDocument() and drop the parser."""
        if not self.parser.Parse("",1):
            self.__report_error()
        self.doc_handler.endDocument()
        self.parser = None