[XML-SIG] Canonicalizing XML

Sjoerd Mullender sjoerd.mullender@oratrix.com
Mon, 23 Apr 2001 17:52:35 +0200


------- =_aaaaaaaaaa0
Content-Type: text/plain; charset="us-ascii"
Content-ID: <2357.988041096.1@bireme.oratrix.nl>

I've written a validating XML parser in Python that can produce
Canonical XML.

I'll attach it.  Usage (for getting Canonical XML):

	python fxmllib.py -c file.xml

On Fri, Apr 20 2001 Andrew Kuchling wrote:

> Has anyone written code for producing XML in Canonical XML format?
> (http://www.w3.org/TR/xml-c14n)
> 
> --amk
> 
> 
> 
> 

-- Sjoerd Mullender <sjoerd.mullender@oratrix.com>


------- =_aaaaaaaaaa0
Content-Type: text/plain; charset="us-ascii"
Content-ID: <2357.988041096.2@bireme.oratrix.nl>
Content-Description: Validating XML Parser
Content-Disposition: attachment; filename="fxmllib.py"
Content-Transfer-Encoding: quoted-printable

# Revision identifier in "$Id$" keyword form for this module.
__version__ =3D "$Id: fxmllib.py,v 1.2 2001/04/20 15:12:49 sjoerd Exp $"

import re, string
import sys                              # need for CanonXMLParser

class Error(Exception):
    """Error class; raised when a syntax error is encountered.
       Instance variables are:
       lineno: line at which error was found;
       offset: offset into data where error was found;
       text: data in which error was found.
       If these values are unknown, they are set to None."""
    # Class-level defaults: any attribute not supplied to __init__ reads
    # as None.  filename is handled the same way although the docstring
    # above does not list it.
    lineno =3D offset =3D text =3D filename =3D None
    def __init__(self, *args):
        # args[0] is the message used by __str__; the optional extras
        # are, in order: lineno, text, offset, filename.  Note that text
        # comes before offset here, unlike the order in the docstring.
        self.args =3D args
        if len(args) > 1:
            self.lineno =3D args[1]
            if len(args) > 2:
                self.text =3D args[2]
                if len(args) > 3:
                    self.offset =3D args[3]
                    if len(args) > 4:
                        self.filename =3D args[4]

    def __str__(self):
        # Build 'PREFIXSyntax error: MESSAGE' where PREFIX names the file
        # and/or the line when they are set (a lineno of 0 counts as
        # unset because the tests below are truth tests).
        if self.filename:
            if self.lineno:
                msg =3D '"%s", line %d: ' % (self.filename, self.lineno)
            else:
                msg =3D '"%s": ' % self.filename
        elif self.lineno:
            msg =3D 'line %d: ' % self.lineno
        else:
            msg =3D ''
        return '%sSyntax error: %s' % (msg, self.args[0])

# The character sets below are taken directly from the XML spec.
# Each value is the inside of a regular expression character class:
# single characters and X-Y ranges, meant to be placed between [ and ]
# by the pattern definitions further down.
_BaseChar =3D u'\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8=
-\u00FF' \
            u'\u0100-\u0131\u0134-\u013E\u0141-\u0148\u014A-\u017E' \
            u'\u0180-\u01C3\u01CD-\u01F0\u01F4-\u01F5\u01FA-\u0217' \
            u'\u0250-\u02A8\u02BB-\u02C1\u0386\u0388-\u038A\u038C' \
            u'\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D6\u03DA\u03DC\u03DE' \=

            u'\u03E0\u03E2-\u03F3\u0401-\u040C\u040E-\u044F\u0451-\u045C' =
\
            u'\u045E-\u0481\u0490-\u04C4\u04C7-\u04C8\u04CB-\u04CC' \
            u'\u04D0-\u04EB\u04EE-\u04F5\u04F8-\u04F9\u0531-\u0556\u0559' =
\
            u'\u0561-\u0586\u05D0-\u05EA\u05F0-\u05F2\u0621-\u063A' \
            u'\u0641-\u064A\u0671-\u06B7\u06BA-\u06BE\u06C0-\u06CE' \
            u'\u06D0-\u06D3\u06D5\u06E5-\u06E6\u0905-\u0939\u093D' \
            u'\u0958-\u0961\u0985-\u098C\u098F-\u0990\u0993-\u09A8' \
            u'\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09DC-\u09DD\u09DF-\u09E1' =
\
            u'\u09F0-\u09F1\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28' \
            u'\u0A2A-\u0A30\u0A32-\u0A33\u0A35-\u0A36\u0A38-\u0A39' \
            u'\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8B\u0A8D' \
            u'\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2-\u0AB3' \
            u'\u0AB5-\u0AB9\u0ABD\u0AE0\u0B05-\u0B0C\u0B0F-\u0B10' \
            u'\u0B13-\u0B28\u0B2A-\u0B30\u0B32-\u0B33\u0B36-\u0B39\u0B3D' =
\
            u'\u0B5C-\u0B5D\u0B5F-\u0B61\u0B85-\u0B8A\u0B8E-\u0B90' \
            u'\u0B92-\u0B95\u0B99-\u0B9A\u0B9C\u0B9E-\u0B9F\u0BA3-\u0BA4' =
\
            u'\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9\u0C05-\u0C0C' \
            u'\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39' \
            u'\u0C60-\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8' \
            u'\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CDE\u0CE0-\u0CE1\u0D05-\u0D0C' =
\
            u'\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D60-\u0D61' \
            u'\u0E01-\u0E2E\u0E30\u0E32-\u0E33\u0E40-\u0E45\u0E81-\u0E82' =
\
            u'\u0E84\u0E87-\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F' \=

            u'\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA-\u0EAB\u0EAD-\u0EAE\u0EB0' \=

            u'\u0EB2-\u0EB3\u0EBD\u0EC0-\u0EC4\u0F40-\u0F47\u0F49-\u0F69' =
\
            u'\u10A0-\u10C5\u10D0-\u10F6\u1100\u1102-\u1103\u1105-\u1107' =
\
            u'\u1109\u110B-\u110C\u110E-\u1112\u113C\u113E\u1140\u114C' \
            u'\u114E\u1150\u1154-\u1155\u1159\u115F-\u1161\u1163\u1165' \
            u'\u1167\u1169\u116D-\u116E\u1172-\u1173\u1175\u119E\u11A8' \
            u'\u11AB\u11AE-\u11AF\u11B7-\u11B8\u11BA\u11BC-\u11C2\u11EB' \=

            u'\u11F0\u11F9\u1E00-\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15' \
            u'\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59' =
\
            u'\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE' \=

            u'\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB' \
            u'\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126\u212A-\u212B' =
\
            u'\u212E\u2180-\u2182\u3041-\u3094\u30A1-\u30FA\u3105-\u312C' =
\
            u'\uAC00-\uD7A3'
_Ideographic =3D u'\u4E00-\u9FA5\u3007\u3021-\u3029'
_CombiningChar =3D u'\u0300-\u0345\u0360-\u0361\u0483-\u0486\u0591-\u05A1\=
u05A3-\u05B9' \
                 u'\u05BB-\u05BD\u05BF\u05C1-\u05C2\u05C4\u064B-\u0652\u06=
70' \
                 u'\u06D6-\u06DC\u06DD-\u06DF\u06E0-\u06E4\u06E7-\u06E8' \=

                 u'\u06EA-\u06ED\u0901-\u0903\u093C\u093E-\u094C\u094D' \
                 u'\u0951-\u0954\u0962-\u0963\u0981-\u0983\u09BC\u09BE\u09=
BF' \
                 u'\u09C0-\u09C4\u09C7-\u09C8\u09CB-\u09CD\u09D7\u09E2-\u0=
9E3' \
                 u'\u0A02\u0A3C\u0A3E\u0A3F\u0A40-\u0A42\u0A47-\u0A48' \
                 u'\u0A4B-\u0A4D\u0A70-\u0A71\u0A81-\u0A83\u0ABC\u0ABE-\u0=
AC5' \
                 u'\u0AC7-\u0AC9\u0ACB-\u0ACD\u0B01-\u0B03\u0B3C\u0B3E-\u0=
B43' \
                 u'\u0B47-\u0B48\u0B4B-\u0B4D\u0B56-\u0B57\u0B82-\u0B83' \=

                 u'\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C01-\u0=
C03' \
                 u'\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56' \=

                 u'\u0C82-\u0C83\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD' \=

                 u'\u0CD5-\u0CD6\u0D02-\u0D03\u0D3E-\u0D43\u0D46-\u0D48' \=

                 u'\u0D4A-\u0D4D\u0D57\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0E=
B1' \
                 u'\u0EB4-\u0EB9\u0EBB-\u0EBC\u0EC8-\u0ECD\u0F18-\u0F19\u0=
F35' \
                 u'\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86-\u0F8B' \
                 u'\u0F90-\u0F95\u0F97\u0F99-\u0FAD\u0FB1-\u0FB7\u0FB9' \
                 u'\u20D0-\u20DC\u20E1\u302A-\u302F\u3099\u309A'
_Digit =3D u'\u0030-\u0039\u0660-\u0669\u06F0-\u06F9\u0966-\u096F\u09E6-\u=
09EF' \
         u'\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE7-\u0BEF' \
         u'\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0E50-\u0E59' \
         u'\u0ED0-\u0ED9\u0F20-\u0F29'
_Extender =3D u'\u00B7\u02D0\u02D1\u0387\u0640\u0E46\u0EC6\u3005\u3031-\u3=
035' \
            u'\u309D-\u309E\u30FC-\u30FE'
# A letter is a base character or an ideographic character.
_Letter =3D _BaseChar + _Ideographic
# Everything allowed in a Name after its first character.  The '-' is
# placed first so that it is a literal inside the character class.
_NameChar =3D '-' + _Letter + _Digit + '._:' + _CombiningChar + _Extender

# Pattern fragments (plain strings) that the compiled patterns below
# are assembled from.
_S =3D '[ \t\r\n]+'                       # white space
_opS =3D '[ \t\r\n]*'                     # optional white space
_Name =3D '['+_Letter+'_:]['+_NameChar+']*' # XML Name
_QStr =3D "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
_Char =3D u'\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD' # legal characters

# A comment; the body (group comment) may not contain two dashes in a row.
comment =3D re.compile('<!--(?P<comment>(?:[^-]|-[^-])*)-->')
space =3D re.compile(_S)
# The two characters that end a run of plain character data.
interesting =3D re.compile('[&<]')
amp =3D re.compile('&')
# A whole string that is one Name, or a white space separated list of
# Names.
name =3D re.compile('^'+_Name+'$')
names =3D re.compile('^'+_Name+'(?:'+_S+_Name+')*$')

# A general entity reference (group name) or a decimal or hexadecimal
# character reference (group char).
ref =3D re.compile('&(?:(?P<name>'+_Name+')|#(?P<char>(?:[0-9]+|x[0-9a-fA-=
F]+)));')
# A character reference (group char) or a parameter entity reference
# (group pname).
entref =3D re.compile('(?:&#(?P<char>(?:[0-9]+|x[0-9a-fA-F]+))|%(?P<pname>=
'+_Name+'));')
# One attribute: white space, a name, an equals sign and a quoted value.
_attrre =3D _S+'(?P<attrname>'+_Name+')'+_opS+'=3D'+_opS+'(?P<attrvalue>'+=
_QStr+')'
attrfind =3D re.compile(_attrre)
# A start tag or empty-element tag; group slash is non-empty for the
# empty-element form.
starttag =3D re.compile('<(?P<tagname>'+_Name+')(?P<attrs>(?:'+_attrre+')*=
)'+_opS+'(?P<slash>/?)>')
endtag =3D re.compile('</(?P<tagname>'+_Name+')'+_opS+'>')

# The CDATA end marker, which the parser reports when it finds it in
# character data.
illegal =3D re.compile(r'\]\]>')
# Any single character outside the legal character range _Char.
illegal1 =3D re.compile('[^'+_Char+']')

# A CDATA section; group cdata is the text up to the first end marker.
cdata =3D re.compile('<!\\[CDATA\\[(?P<cdata>(?:[^]]|\\](?!\\]>)|\\]\\](?!=
>))*)\\]\\]>')

# External identifiers.  The quoted literals are captured with their
# quotes in groups syslit and publit.
_SystemLiteral =3D '(?P<syslit>'+_QStr+')'
_PublicLiteral =3D '(?P<publit>"[-\'()+,./:=3D?;!*#@$_%% \n\ra-zA-Z0-9]*"|=
' \
                            "'[-()+,./:=3D?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId =3D '(?:SYSTEM|PUBLIC'+_S+_PublicLiteral+')'+_S+_SystemLiteral=

externalid =3D re.compile(_ExternalId)
ndata =3D re.compile(_S+'NDATA'+_S+'(?P<name>'+_Name+')')
# A complete DOCTYPE declaration: document element name (group docname),
# optional external identifier and optional internal subset (group
# data, the text between the square brackets).
doctype =3D re.compile('<!DOCTYPE'+_S+'(?P<docname>'+_Name+')(?:'+_S+_Exte=
rnalId+')?'+_opS+'(?:\\[(?P<data>(?:'+_S+'|%'+_Name+';|'+comment.pattern+'=
|<(?:![^-]|[^!])(?:[^\'">]|\'[^\']*\'|"[^"]*")*>)*)\\]'+_opS+')?>')

# The XML declaration of a document: version is required, encoding and
# standalone are optional.  Captured values keep their quotes.
xmldecl =3D re.compile('<\?xml'+
                     _S+'version'+_opS+'=3D'+_opS+'(?P<version>'+_QStr+')'=
+
                     '(?:'+_S+'encoding'+_opS+'=3D'+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'=3D'+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
# The text declaration variant: version and encoding are both optional
# in this pattern and there is no standalone part.
textdecl =3D re.compile('<\?xml'
                      '(?:'+_S+'version'+_opS+'=3D'+_opS+'(?P<version>'+_Q=
Str+'))?'+
                      '(?:'+_S+'encoding'+_opS+'=3D'+_opS+
                      "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                      '"[A-Za-z][-A-Za-z0-9._]*"))?'+
                      _opS+'\?>')
# A processing instruction whose target is not "xml" in any letter
# case; group data is the optional text after the target.
pidecl =3D re.compile('<\\?(?![xX][mM][lL][ \t\r\n?])(?P<name>'+_Name+')(?=
:'+_S+'(?P<data>(?:[^?]|\\?(?!>))*))?\\?>')

# XML NAMESPACES
# These patterns are anchored at the end only; callers are expected to
# apply them with match() so that the start is anchored as well.
_NCName =3D '['+_Letter+'_]['+'-' + _Letter + _Digit + '._' + _CombiningCh=
ar + _Extender+']*'    # XML Name, minus the ":"
ncname =3D re.compile(_NCName + '$')
qname =3D re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix=

                   '(?P<local>' + _NCName + ')$')
# A namespace declaration attribute name; group ncname is the declared
# prefix and is unset for the default namespace form.
xmlns =3D re.compile('xmlns(?::(?P<ncname>' + _NCName + '))?$')

# DOCTYPE
# Patterns for the declarations found inside a document type definition.
_Nmtoken =3D '['+_NameChar+']+'
nmtoken =3D re.compile('^'+_Nmtoken+'$')
nmtokens =3D re.compile('^'+_Nmtoken+'(?:'+_S+_Nmtoken+')*$')
# Start of an element declaration; group content is EMPTY, ANY or the
# opening parenthesis of a content model.
element =3D re.compile('<!ELEMENT'+_S+'(?P<name>'+_Name+')'+_S+r'(?P<conte=
nt>EMPTY|ANY|\()')
# Tokens of a content model: an opening parenthesis or a name, then a
# closing parenthesis or a separator, then an occurrence indicator.
dfaelem0 =3D re.compile(_opS+r'(?P<token>\(|'+_Name+')')
dfaelem1 =3D re.compile(_opS+r'(?P<token>[)|,])')
dfaelem2 =3D re.compile(r'(?P<token>[+*?])')
# A mixed content model that starts with #PCDATA.
mixedre =3D re.compile(r'\('+_opS+'#PCDATA'+'(('+_opS+r'\|'+_opS+_Name+')*=
'+_opS+r'\)\*|'+_opS+r'\))')
paren =3D re.compile('[()]')
# One attribute definition: name (atname), type (attype, with the
# NOTATION name list in group notation) and default (atvalue, with any
# quoted default in atstring).
attdef =3D re.compile(_S+'(?P<atname>'+_Name+')'+_S+'(?P<attype>CDATA|ID(?=
:REFS?)?|ENTIT(?:Y|IES)|NMTOKENS?|NOTATION'+_S+r'\((?P<notation>'+_opS+_Na=
me+'(?:'+_opS+r'\|'+_opS+_Name+')*'+_opS+r')\)|\('+_opS+_Nmtoken+'(?:'+_op=
S+r'\|'+_opS+_Nmtoken+')*'+_opS+r'\))'+_S+'(?P<atvalue>#REQUIRED|#IMPLIED|=
(?:#FIXED'+_S+')?(?P<atstring>'+_QStr+'))')
attlist =3D re.compile('<!ATTLIST'+_S+'(?P<elname>'+_Name+')(?P<atdef>(?:'=
+attdef.pattern+')*)'+_opS+'>')
# A quoted entity value; it may contain references and parameter entity
# references but no bare & or %.
_EntityVal =3D '"(?:[^"&%]|'+ref.pattern+'|%'+_Name+';)*"|' \
             "'(?:[^'&%]|"+ref.pattern+"|%"+_Name+";)*'"
# An entity declaration: either a parameter entity (groups pname and
# pvalue) or a general entity (groups ename and value).
entity =3D re.compile('<!ENTITY'+_S+'(?:%'+_S+'(?P<pname>'+_Name+')'+_S+'(=
?P<pvalue>'+_EntityVal+'|'+_ExternalId+')|(?P<ename>'+_Name+')'+_S+'(?P<va=
lue>'+_EntityVal+'|'+_ExternalId+'(?:'+_S+'NDATA'+_S+_Name+')?))'+_opS+'>'=
)
notation =3D re.compile('<!NOTATION'+_S+'(?P<name>'+_Name+')'+_S+'(?P<valu=
e>SYSTEM'+_S+_SystemLiteral+'|PUBLIC'+_S+_PublicLiteral+'(?:'+_S+_SystemLi=
teral+')?)'+_opS+'>')
peref =3D re.compile('%(?P<name>'+_Name+');')
# Opening and closing markers of a conditional section.
ignore =3D re.compile(r'<!\[|\]\]>')
bracket =3D re.compile('[<>\'"%]')
# Head of a conditional section; exactly one of the groups inc and ign
# is set.
conditional =3D re.compile(r'<!\['+_opS+'(?:(?P<inc>INCLUDE)|(?P<ign>IGNOR=
E))'+_opS+r'\[')

class XMLParser:
    """XMLParser([ xmlns ]) -> instance

       XML document parser.
       There is one optional argument:
       xmlns: understand XML Namespaces (default is 1)."""

    def __init__(self, xmlns =3D 1):
        # Store the namespace switch, then set up all other parser state
        # through reset().
        self.__xmlns =3D xmlns            # whether or not to parse namesp=
aces
        self.reset()

    def reset(self):
        """reset()

           Reset parser to pristine state."""
        # The namespace switch stored by __init__ is not touched here.
        self.docname =3D None             # The outermost element in the d=
ocument (according to the DTD)
        self.rawdata =3D []
        # The five predefined entities are stored as character
        # references rather than as the literal characters.
        self.entitydefs =3D {             # & entities defined in DTD (plu=
s the default ones)
            'lt': '&#60;',              # <
            'gt': '&#62;',              # >
            'amp': '&#38;',             # &
            'apos': '&#39;',            # '
            'quot': '&#34;',            # "
            }
        self.pentitydefs =3D {}           # % entities defined in DTD
        self.elems =3D {}                 # elements and their content/att=
rs
        self.baseurl =3D '.'              # base URL for external DTD
        self.ids =3D {}                   # IDs encountered in document
        self.notation =3D {}              # NOTATIONs
        # When set to a true value before parsing and the document has
        # no DOCTYPE of its own, parse() passes it to handle_doctype as
        # the system identifier.
        self.doctype =3D None

    def feed(self, data):
        """feed(data)

           Feed data to parser."""
        # The chunk is only buffered; nothing is parsed until close().
        self.rawdata.append(data)

    def close(self):
        """close()

           End of data, finish up parsing."""
        # Actually, this is where we start parsing.
        # All buffered chunks are joined into one string, the buffer is
        # emptied, and the whole document is parsed in one go.  The
        # value returned by parse() is discarded.
        data =3D string.join(self.rawdata, '')
        self.rawdata =3D []
        self.parse(data)

    def __parse_textdecl(self, data, document =3D 0):
        # Figure out the encoding of a file by looking at the first
        # few bytes and the <?xml?> tag that may come at the very
        # beginning of the file.
        # This will convert the data to unicode from whatever format
        # it was originally.
        #
        # data: the raw text of the document or entity.
        # document: true to look for a full XML declaration (xmldecl,
        #   which may carry standalone); false to look for a text
        #   declaration (textdecl).
        # Returns the text that follows the declaration, as unicode.
        i =3D 0
        # A byte order mark, or the bytes of "<?" in a 16 bit encoding,
        # settle the encoding before any declaration can be read.
        if data[:2] =3D=3D '\376\377':
            # UTF-16, big-endian
            enc =3D 'utf-16-be'
            i =3D 2
        elif data[:2] =3D=3D '\377\376':
            # UTF-16, little-endian
            enc =3D 'utf-16-le'
            i =3D 2
        elif data[:4] =3D=3D '\x00\x3C\x00\x3F':
            # UTF-16, big-endian
            enc =3D 'utf-16-be'
        elif data[:4] =3D=3D '\x3C\x00\x3F\x00':
            # UTF-16, little-endian
            enc =3D 'utf-16-le'
        else:
            enc =3D None                  # unknowns as yet
        if enc:
            # Decode right away (skipping a byte order mark) so that the
            # declaration patterns below run on unicode text.
            try:
                data =3D unicode(data[i:], enc)
            except UnicodeError:
                self.__error("data cannot be converted to Unicode", data, =
i, self.baseurl, fatal =3D 1)
            i =3D 0
        # optional XMLDecl
        if document:
            res =3D xmldecl.match(data, i)
        else:
            res =3D textdecl.match(data, i)
        if res is not None:
            if document:
                version, encoding, standalone =3D res.group('version',
                                                          'encoding',
                                                          'standalone')
            else:
                version, encoding =3D res.group('version', 'encoding')
                standalone =3D None
            # The captured values still carry their quotes, hence the
            # [1:-1] slices below.
            if version is not None and version[1:-1] !=3D '1.0':
                self.__error('only XML version 1.0 supported', data, res.s=
tart('version'), self.baseurl, fatal =3D 1)
            if encoding:
                encoding =3D encoding[1:-1]
                # enc[:6] is 'utf-16' for both detected byte orders, so
                # a declared encoding of plain utf-16 is accepted too.
                if enc and enc !=3D encoding.lower() and \
                   enc[:6] !=3D encoding.lower():
                    self.__error("declared encoding doesn't match actual e=
ncoding", data, res.start('encoding'), self.baseurl, fatal =3D 1)
                enc =3D encoding.lower()
            if standalone:
                standalone =3D standalone[1:-1]
##            self.handle_xml(encoding, standalone)
            i =3D res.end(0)
        if enc is None:
            # default is UTF 8
            enc =3D 'utf-8'
        if type(data) is not type(u'a'):
            # Still a byte string: decode what follows the declaration
            # with the declared (or default) encoding.
            try:
                data =3D unicode(data[i:], enc)
            except UnicodeError:
                self.__error("data cannot be converted to Unicode", data, =
i, self.baseurl, fatal =3D 1)
        else:
            # Already decoded above; just drop the declaration.
            data =3D data[i:]
        return data

    def __normalize_linefeed(self, data):
        # normalize line endings: first \r\n -> \n, then \r -> \n
        # The inner split and join handles the two character sequence,
        # the outer pair the carriage returns that remain.  Returns the
        # converted text; the u'' separators make the result unicode.
        return u'\n'.join(u'\n'.join(data.split(u'\r\n')).split(u'\r'))

    def __normalize_space(self, data):
        # normalize white space: tab, linefeed and carriage return -> spac=
e
        # Each character is replaced one for one, so the length of the
        # text does not change and runs of white space are not
        # collapsed.  Returns the converted text.
        data =3D ' '.join(data.split('\t'))
        data =3D ' '.join(data.split('\n'))
        data =3D u' '.join(data.split('\r'))
        return data

    def parse(self, data):
        """parse(data)

           Parse the data as an XML document."""
        # Returns a (t0, t1, t2) tuple of time() readings taken at the
        # start, after the DOCTYPE step and at the end.  Returns None
        # early when attribute or content parsing gives up.
        from time import time
        t0 =3D time()
        data =3D self.__parse_textdecl(data, 1)
        data =3D self.__normalize_linefeed(data)
        # (Comment | PI | S)*
        i =3D self.__parse_misc(data, 0)
        # doctypedecl?
        res =3D doctype.match(data, i)
        # A DOCTYPE in the document is only used when self.doctype has
        # not been set to something other than None beforehand.
        if res is not None and self.doctype is None:
            docname, publit, syslit, docdata =3D res.group('docname', 'pub=
lit',
                                                        'syslit', 'data')
            self.docname =3D docname
            # Strip the quotes; in the public identifier every run of
            # white space also becomes a single space.
            if publit: publit =3D string.join(string.split(publit[1:-1]))
            if syslit: syslit =3D syslit[1:-1]
            self.handle_doctype(docname, publit, syslit, docdata)
            i =3D res.end(0)
        elif self.doctype:
            # do as if there was a <!DOCTYPE> declaration
            self.handle_doctype(None, '', self.doctype, '')
        else:
            # self.doctype =3D=3D '' or no DOCTYPE
            # ignore DOCTYPE
            self.doctype =3D None
        t1 =3D time()
        # (Comment | PI | S)*
        i =3D self.__parse_misc(data, i)
        # the document itself
        res =3D starttag.match(data, i)
        if res is None:
            self.__error('no elements in document', data, i, self.baseurl,=
 fatal =3D 1)
        i =3D res.end(0)
        tagname, slash =3D res.group('tagname', 'slash')
        if self.docname and tagname !=3D self.docname:
            self.__error('starttag does not match DOCTYPE', data, res.star=
t('tagname'), self.baseurl, fatal =3D 0)
        val =3D self.__parse_attrs(tagname, data, res.start('tagname'), re=
s.span('attrs'), None)
        if val is None:
            return
        nstag, attrs, namespaces =3D val
        self.finish_starttag(nstag, attrs)
        if not slash:
            # __parse_content returns None on failure, the end tag match
            # object when it found one, or an offset into data.
            i =3D self.__parse_content(data, i, tagname, namespaces)
            if i is None:
                return
            if type(i) is type(res):
                res =3D i
            else:
                res =3D endtag.match(data, i)
            if res is None:
                self.__error('end tag missing', data, i, self.baseurl, fat=
al =3D 0)
            elif res.group('tagname') !=3D tagname:
                self.__error("end tag doesn't match start tag", data, res.=
start('tagname'), self.baseurl, fatal =3D 0)
            # NOTE(review): if no end tag matched and __error returns
            # for a non-fatal error (its body is not visible here), res
            # is still None and the next line would raise
            # AttributeError -- confirm.
            i =3D res.end(0)
        self.finish_endtag(nstag)
        i =3D self.__parse_misc(data, i)
        if i !=3D len(data):
            self.__error('garbage at end of document', data, i, self.baseu=
rl, fatal =3D 0)
        t2 =3D time()
        return t0, t1, t2

    def __parse_misc(self, data, i):
        # match any number of whitespace, processing instructions and comm=
ents
        # Starts at offset i in data and returns the offset of the first
        # character that is none of these.  Comments are passed to
        # handle_comment and processing instructions to handle_proc.
        matched =3D 1
        while matched:
            # Keep going round as long as a pass consumed something.
            matched =3D 0
            res =3D comment.match(data, i)
            if res is not None:
                matched =3D 1
                c0, c1 =3D res.span('comment')
                ires =3D illegal1.search(data, c0, c1)
                if ires is not None:
                    self.__error('illegal characters in comment', data, ir=
es.start(0), self.baseurl, fatal =3D 0)
                self.handle_comment(data[c0:c1])
                i =3D res.end(0)
            res =3D pidecl.match(data, i)
            if res is not None:
                matched =3D 1
                c0, c1 =3D res.span('data')
                ires =3D illegal1.search(data, c0, c1)
                if ires is not None:
                    self.__error('illegal characters in Processing Instruc=
tion', data, ires.start(0), self.baseurl, fatal =3D 0)
                # A processing instruction without data is reported with
                # an empty string.
                self.handle_proc(res.group('name'), res.group('data') or '=
')
                i =3D res.end(0)
            res =3D space.match(data, i)
            if res is not None:
                matched =3D 1
                i =3D res.end(0)
        return i

    def __update_state(self, dfa, states, tagname):
        # update the list of states in the dfa.  If tagname is None,
        # we're looking for the final state, so return a list of all
        # states reachable using epsilon transitions
        #
        # dfa is indexed by state; dfa[s] maps a tag name to the list of
        # states reached on that tag, and '' to the epsilon successors.
        # states is used as a work list and is then overwritten in place
        # with the result, so the caller's list object is updated.
        # Nothing is returned.
        nstates =3D []
        seenstates =3D {}
        while states:
            s =3D states[0]
            seenstates[s] =3D 1
            del states[0]
            if tagname is not None and dfa[s].has_key(tagname):
                # This state accepts the tag: its targets become the new
                # state list and its epsilon successors are not followed.
                nstates =3D dfa[s][tagname][:]
            else:
                for s in dfa[s].get('', []):
                    if not seenstates.has_key(s):
                        states.append(s)
        if tagname is None:
            nstates =3D seenstates.keys()
        states[:] =3D nstates # change in-line

    def __check_dfa(self, dfa, initstate, tagname, data, i):
        states =3D [initstate]
        possibles =3D {}
        seenstates =3D {}
        while states:
            s =3D states[0]
            seenstates[s] =3D 1
            del states[0]
            for tag in dfa[s].keys():
                if tag and possibles.has_key(tag):
                    self.__error("non-deterministic content model for `%s'=
" % tagname, data, i, self.baseurl, fatal =3D 0)
                possibles[tag] =3D 1
            for s in dfa[s].get('', []):
                if not seenstates.has_key(s):
                    states.append(s)

    def __parse_content(self, data, i, ptagname, namespaces, states =3D No=
ne):
        # parse the content of an element (i.e. the string between
        # start tag and end tag)
        #
        # data, i: the text and the offset at which the content starts.
        # ptagname: name of the element whose content is being parsed.
        # namespaces: passed on to __parse_attrs for child elements.
        # states: content model states to continue from; passed in when
        #   the replacement text of an entity is parsed as part of the
        #   same element.
        # Returns the end tag match object when an end tag is found, the
        # offset i when data runs out first, or None after giving up.
        datalen =3D len(data)
        if self.elems.has_key(ptagname):
            content, attributes, start, end =3D self.elems[ptagname][:4] #=
 content model
            if states =3D=3D None:
                states =3D [start]
        else:
            content =3D None              # unknown content model
        # content is None, one of the strings tested below, or a list.
        # A list whose first item is a dict is used as a state table
        # with __update_state; any other list is treated as the names
        # allowed in mixed content.
        while i < datalen:
            matched =3D 0
            # Character data runs up to the next & or <.
            res =3D interesting.search(data, i)
            if res is None:
                j =3D datalen
            else:
                j =3D res.start(0)
            if j > i:
                res =3D illegal.search(data, i, j)
                if res is not None:
                    self.__error("illegal data content in element `%s'" % =
ptagname, data, i, self.baseurl, fatal =3D 0)
                skip =3D 0
                complain =3D 0
                if content is not None:
                    res =3D space.match(data, i, j)
                    isspace =3D res is not None and res.span(0) =3D=3D (i,=
j)
                    if content =3D=3D 'EMPTY':
                        complain =3D 1
                        skip =3D 1
                    elif not isspace and  type(content) is type([]) and co=
ntent and type(content[0]) is type({}):
                        complain =3D 1
                    if complain:
                        self.__error("no character data allowed in element=
 `%s'" % ptagname, data, i, self.baseurl, fatal =3D 0)
                matched =3D 1
                if not skip:
                    self.handle_data(data[i:j])
                i =3D j
            # A child element: validate it against the content model,
            # then parse its attributes and, unless it is an empty
            # element tag, its content.
            res =3D starttag.match(data, i)
            if res is not None:
                tagname, slash =3D res.group('tagname', 'slash')
                if content =3D=3D 'EMPTY' or content =3D=3D '#PCDATA':
                    self.__error("empty element `%s' has content" % ptagna=
me, data, res.start(0), self.baseurl, fatal =3D 0)
                elif content =3D=3D 'ANY':
                    # always OK
                    pass
                elif type(content) is type([]) and content and type(conten=
t[0]) is not type({}):
                    # mixed
                    if tagname not in content:
                        self.__error("illegal content in element `%s'" % p=
tagname, data, res.start(0), self.baseurl, fatal =3D 0)
                elif content is not None:
                    self.__update_state(content, states, tagname)
                    if not states:
                        self.__error("illegal content for element `%s'" % =
ptagname, data, i, self.baseurl)
                val =3D self.__parse_attrs(tagname, data, res.start('tagna=
me'), res.span('attrs'), namespaces)
                if val is None:
                    return
                i =3D res.end(0)
                nstag, attrs, subnamespaces =3D val
                self.finish_starttag(nstag, attrs)
                if not slash:
                    i =3D self.__parse_content(data, i, tagname, subnamesp=
aces)
                    if i is None:
                        return
                    if type(i) is type(res):
                        res =3D i
                    else:
                        res =3D endtag.match(data, i)
                    if res is None:
                        self.__error('end tag missing', data, i, self.base=
url, fatal =3D 0)
                    elif res.group('tagname') !=3D tagname:
                        self.__error("end tag doesn't match start tag", da=
ta, res.start('tagname'), self.baseurl, fatal =3D 0)
                    # NOTE(review): if no end tag matched and __error
                    # returns for a non-fatal error (its body is not
                    # visible here), res is None and the next line
                    # would raise AttributeError -- confirm.
                    i =3D res.end(0)
                self.finish_endtag(nstag)
                matched =3D 1
            # An end tag ends this element.  The match object is handed
            # back so that the caller can compare the tag name.
            res =3D endtag.match(data, i)
            if res is not None:
                if type(content) is type([]) and content and type(content[=
0]) is type({}):
                    self.__update_state(content, states, None)
                    if end not in states:
                        self.__error("content of element `%s' doesn't matc=
h content model" % ptagname, data, i, self.baseurl, fatal =3D 0)
                return res
            res =3D comment.match(data, i)
            if res is not None:
                c0, c1 =3D res.span('comment')
                ires =3D illegal1.search(data, c0, c1)
                if ires is not None:
                    self.__error('illegal characters in comment', data, ir=
es.start(0), self.baseurl, fatal =3D 0)
                self.handle_comment(data[c0:c1])
                i =3D res.end(0)
                matched =3D 1
            # An entity reference or a character reference.
            res =3D ref.match(data, i)
            if res is not None:
                name =3D res.group('name')
                if name:
                    if self.entitydefs.has_key(name):
                        sval =3D val =3D self.entitydefs[name]
                        baseurl =3D self.baseurl
                        # A tuple value is not inline text.  Presumably
                        # it describes an external entity, with item 2
                        # set for unparsed (NDATA) entities -- verify
                        # against the code that fills entitydefs.
                        if type(val) is type(()):
                            if val[2] is not None:
                                apply(self.handle_ndata, val)
                                val =3D None
                            else:
                                val =3D self.__read_pentity(val[0], val[1]=
)
                        if val is not None:
                            del self.entitydefs[name] # to break recursion=

                            n =3D self.__parse_content(val, 0, ptagname, n=
amespaces, states)
                            self.entitydefs[name] =3D sval # restore value=

                        if val is not None:
                            if n is None:
                                self.baseurl =3D baseurl
                                return
                            # The replacement text must be consumed
                            # completely; an end tag inside it, or text
                            # left over, is an error.
                            # NOTE(review): n is an offset into val but
                            # is reported against data -- confirm that
                            # this is intended.
                            if type(n) is type(res) or n !=3D len(val):
                                if type(n) is type(res):
                                    n =3D res.start(0)
                                self.__error('misformed entity value', dat=
a, n, self.baseurl, fatal =3D 0)
                        self.baseurl =3D baseurl
                    else:
                        if self.docname:
                            self.__error("unknown entity reference `&%s;' =
in element `%s'" % (name, ptagname), data, i, self.baseurl, fatal =3D 0)
                        # Record where the reference was found before
                        # calling unknown_entityref.  lineno is the
                        # number of newlines before the reference, so
                        # it counts from 0.
                        self.data =3D data
                        self.offset =3D res.start('name')
                        self.lineno =3D string.count(data, '\n', 0, self.o=
ffset)
                        self.unknown_entityref(name)
                else:
                    str =3D self.__parse_charref(res.group('char'), data, =
res.start(0))
                    if str is None:
                        return
                    self.handle_data(str)
                i =3D res.end(0)
                matched =3D 1
            res =3D pidecl.match(data, i)
            if res is not None:
                matched =3D 1
                c0, c1 =3D res.span('data')
                ires =3D illegal1.search(data, c0, c1)
                if ires is not None:
                    self.__error('illegal characters in Processing Instruc=
tion', data, ires.start(0), self.baseurl, fatal =3D 0)
                self.handle_proc(res.group('name'), res.group('data') or '=
')
                i =3D res.end(0)
            res =3D cdata.match(data, i)
            if res is not None:
                matched =3D 1
                c0, c1 =3D res.span('cdata')
                ires =3D illegal1.search(data, c0, c1)
                if ires is not None:
                    self.__error('illegal characters in CDATA section', da=
ta, ires.start(0), self.baseurl, fatal =3D 0)
                self.handle_cdata(res.group('cdata'))
                i =3D res.end(0)
            # Nothing above consumed any input, so give up rather than
            # loop forever at the same offset.
            if not matched:
                self.__error("no valid content in element `%s'" % ptagname=
, data, i, self.baseurl)
                return
        return i

    def __check_attr(self, tagname, attrname, value, attributes, data, att=
rstart):
        # check that the attribute attrname on element tagname is of
        # the correct type with a legal value
        # return the normalized value (i.e. white space collapsed if
        # appropriate)
        # XXX this method needs work to be complete
        attype, atvalue, atstring =3D attributes[attrname]
        if atvalue[:6] =3D=3D '#FIXED':
            if value !=3D atstring:
                self.__error("attribute `%s' in element `%s' does not have=
 correct value" % (attrname, tagname), data, attrstart, self.baseurl, fata=
l =3D 0)
        if attype =3D=3D 'CDATA':
            return value                # always OK and don't change value=

        if type(attype) is type([]):    # enumeration
            if value not in attype:
                self.__error("attribute `%s' in element `%s' not valid" % =
(attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            return value
        if type(attype) is type(()):
            if value not in attype[1]:
                self.__error("attribute `%s' in element `%s' not valid" % =
(attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            return value
        if attype =3D=3D 'ID':
            if name.match(value) is None:
                self.__error("attribute `%s' in element `%s' is not an ID"=
 % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            if self.ids.has_key(value):
                self.__error("attrbute `%s' in element `%s' is not unique"=
 %  (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            self.ids[value] =3D 1
            return value
        if attype =3D=3D 'IDREF':
            if name.match(value) is None:
                self.__error("attrbute `%s' in element `%s' is not an IDRE=
F" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            # XXX should check ID exists
            return value
        if attype =3D=3D 'IDREFS':
            if names.match(value) is None:
                self.__error("attrbute `%s' in element `%s' is not an IDRE=
FS" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            # XXX should check IDs exist
            return value
        if attype =3D=3D 'NMTOKEN':
            if nmtoken.match(value) is None:
                self.__error("attrbute `%s' in element `%s' is not a NMTOK=
EN" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            return value
        if attype =3D=3D 'NMTOKENS':
            if nmtokens.match(value) is None:
                self.__error("attrbute `%s' in element `%s' is not a NMTOK=
ENS" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            return value
        if attype =3D=3D 'ENTITY':
            if name.match(value) is None:
                self.__error("attrbute `%s' in element `%s' is not an ENTI=
TY" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            # XXX should check ENTITY exists
            return value
        if attype =3D=3D 'ENTITIES':
            if names.match(value) is None:
                self.__error("attrbute `%s' in element `%s' is not an ENTI=
TIES" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            # XXX should check ENTITIES exist
            return value
        # XXX other types?
        return value

    def __parse_attrs(self, tagname, data, tagstart, span, namespaces):
        """Parse and validate the attributes of one start tag.

        tagname: element name as written in the document;
        data: the text being parsed;
        tagstart: offset of the tag, used for error reporting;
        span: (start, end) offsets of the text between the tag name
            and the closing bracket;
        namespaces: chain of namespace dictionaries in the form
            (dict, rest-of-chain), or None.

        Returns the tuple (tagname, attrdict, namespaces): the tag
        name with its namespace applied where one was found, a dict
        mapping attribute names to values, and the namespace chain
        extended with any declarations made on this tag.
        All problems are reported as non-fatal errors."""
        # NOTE(review): the "bad attributes" path below does a bare
        # return (None) instead of returning the 3-tuple -- confirm
        # that callers cope with that.
        # parse the string between the tag name and closing bracket
        # for attribute=3Dvalue pairs
        i, dataend =3D span
        attrlist =3D []
        namespace =3D None
        reqattrs =3D {}                   # attributes that are #REQUIRED
        if self.elems.has_key(tagname):
            attributes =3D self.elems[tagname][1]
            for key, (attype, atvalue, atstring) in attributes.items():
                if atvalue =3D=3D '#REQUIRED':
                    reqattrs[key] =3D 1
            attrseen =3D {}               # attributes that we've seen
        else:
            # element not declared: no attribute validation is done
            attributes =3D None
        while i < dataend:
            res =3D attrfind.match(data, i, dataend)
            if res is None:
                # couldn't match any attributes, but there is more
                # string to parse: complain and ignore rest of string
                self.__error('bad attributes', data, i, self.baseurl, fata=
l =3D 0)
                return
            # "name" is a local here; other methods use a non-local
            # object of the same name for matching
            name =3D res.group('attrname')
            if reqattrs.has_key(name):
                del reqattrs[name]      # seen this #REQUIRED attribute
            if attributes is not None and attributes.has_key(name):
                attype =3D attributes[name][0]
            else:
                attype =3D None
            start, end =3D res.span('attrvalue')
            # start+1 and end-1 presumably skip the surrounding quotes
            value =3D self.__parse_attrval(data, attype, span =3D (start+1=
, end-1))
            if value is None:
                # bad attribute value: ignore, but continue parsing
                i =3D res.end(0)
                continue
            attrstart =3D res.start('attrname')
            if attributes is not None:
                if attributes.has_key(name):
                    attrseen[name] =3D 1
                    value =3D self.__check_attr(tagname, name, value, attr=
ibutes, data, attrstart)
                else:
                    self.__error("unknown attribute `%s' on element `%s'" =
% (name, tagname), data, attrstart, self.baseurl, fatal =3D 0)
            i =3D res.end(0)
            if self.__xmlns:
                res =3D xmlns.match(name)
                if res is not None:
                    # namespace declaration
                    ncname =3D res.group('ncname')
                    if namespace is None:
                        namespace =3D {}
                    # a declaration without prefix is stored under '';
                    # an empty value is stored as None
                    namespace[ncname or ''] =3D value or None
                    # declarations are not added to the attribute list
                    continue
            attrlist.append((name, value, attrstart))
        if reqattrs:
            # there are #REQUIRED attributes that we haven't seen
            reqattrs =3D reqattrs.keys()
            reqattrs.sort()
            if len(reqattrs) > 1:
                s =3D 's'
            else:
                s =3D ''
            reqattrs =3D string.join(reqattrs, "', `")
            self.__error("required attribute%s `%s' of element `%s' missin=
g" % (s, reqattrs, tagname), data, dataend, self.baseurl, fatal =3D  0)
        if attributes is not None:
            # fill in missing attributes that have a default value
            for key, (attype, atvalue, atstring) in attributes.items():
                if atstring is not None and not attrseen.has_key(key):
                    attrlist.append((key, atstring, dataend))
        if namespace is not None:
            # put the declarations of this tag in front of the chain
            namespaces =3D (namespace, namespaces)
        if namespaces is not None:
            res =3D qname.match(tagname)
            if res is not None:
                prefix, nstag =3D res.group('prefix', 'local')
                if prefix is None: prefix =3D ''
                ns =3D None
                n =3D namespaces
                # walk the chain, most recent declarations first
                while n is not None:
                    d, n =3D n
                    if d.has_key(prefix):
                        ns =3D d[prefix]
                        break
                if ns is not None:
                    # qualified names are "namespace localname"
                    tagname =3D ns + ' ' + nstag
                elif prefix !=3D '':
                    self.__error("unknown namespace prefix `%s'" % prefix,=
 data, tagstart, self.baseurl, fatal =3D 0)
            else:
                self.__error("badly formed tag name `%s'" % tagname, data,=
 tagstart, self.baseurl, fatal =3D 0)
        attrdict =3D {}                   # collect attributes/values
        for attr, value, attrstart in attrlist:
            if namespaces is not None:
                res =3D qname.match(attr)
                if res is not None:
                    prefix, nsattr =3D res.group('prefix', 'local')
                    # only prefixed attribute names are looked up
                    if prefix:
                        ans =3D None
                        n =3D namespaces
                        while n is not None:
                            d, n =3D n
                            if d.has_key(prefix):
                                ans =3D d[prefix]
                                break
                        if ans is not None:
                            attr =3D ans + ' ' + nsattr
                        elif prefix !=3D '':
                            self.__error("unknown namespace prefix `%s'" %=
 prefix, data, attrstart, self.baseurl, fatal =3D 0)
                else:
                    self.__error("badly formed attribute name `%s'" % attr=
, data, attrstart, self.baseurl, fatal =3D 0)
            if attrdict.has_key(attr):
                self.__error("duplicate attribute name `%s'" % attr, data,=
 attrstart, self.baseurl, fatal =3D 0)
            # on a duplicate the later value replaces the earlier one
            attrdict[attr] =3D value
        return tagname, attrdict, namespaces

    def __parse_attrval(self, data, attype, span =3D None):
        """Expand the references in an attribute value.

        data: text containing the value;
        attype: declared type of the attribute, or None if unknown;
        span: optional (start, end) offsets of the value inside
            data; all of data is used when it is omitted.

        Returns the expanded value as one string.  For undeclared
        and CDATA attributes each literal piece is passed through
        __normalize_space; for every other type the final string is
        split on white space and re-joined with single spaces.
        An external entity in the value is a fatal error, all other
        problems are reported as non-fatal errors."""
        # parse an attribute value, replacing entity and character
        # references with their values
        if span is None:
            i =3D 0
            dataend =3D len(data)
        else:
            i, dataend =3D span
        res =3D illegal1.search(data, i, dataend)
        if res is not None:
            self.__error("illegal characters in attribute value", data, re=
s.start(0), self.baseurl, fatal =3D 0)
        newval =3D []
        while i < dataend:
            res =3D interesting.search(data, i, dataend)
            if res is None:
                # nothing special left: take the rest literally
                str =3D data[i:dataend]
                if attype is None or attype =3D=3D 'CDATA':
                    str =3D self.__normalize_space(str)
                newval.append(str)
                break
            j =3D res.start(0)
            if data[j] =3D=3D '<':
                self.__error("no `<' allowed in attribute value", data, j,=
 self.baseurl, fatal =3D 0)
            if j > i:
                # literal text in front of the special character
                str =3D data[i:j]
                if attype is None or attype =3D=3D 'CDATA':
                    str =3D self.__normalize_space(str)
                newval.append(str)
            res =3D ref.match(data, j, dataend)
            if res is None:
                self.__error('illegal attribute value', data, j, self.base=
url, fatal =3D 0)
                newval.append(data[j])  # the &
                i =3D j + 1               # continue searching after the &=

                continue
            i =3D res.end(0)
            name =3D res.group('name')
            if name:
                # entity reference (e.g. "&lt;")
                if self.entitydefs.has_key(name):
                    val =3D self.entitydefs[name]
                    if type(val) is type(()):
                        self.__error("no external parsed entity allowed in=
 attribute value", data, res.start(0), self.baseurl, fatal =3D 1)
                    # hide the entity while its own text is expanded,
                    # so a recursive reference is reported as unknown
                    # instead of recursing forever
                    # NOTE(review): if the nested call raises, the
                    # definition is not put back
                    del self.entitydefs[name]
                    nval =3D self.__parse_attrval(val, attype)
                    self.entitydefs[name] =3D val
                    if nval is None:
                        return
                    newval.append(nval)
                else:
                    self.__error("reference to unknown entity `%s'" % name=
, data, res.start(0), self.baseurl, fatal =3D 0)
                    # keep the unresolved reference in the value
                    newval.append('&%s;' % name)
            else:
                # character reference
                val =3D self.__parse_charref(res.group('char'), data, res.=
start(0))
                if val is None:
                    # bad reference: keep its text in the value
                    newval.append('&#%s;' % res.group('char'))
                    continue
                newval.append(val)
        str =3D string.join(newval, '')
        if attype is not None and attype !=3D 'CDATA':
            # collapse runs of white space and strip both ends
            str =3D string.join(string.split(str))
        return str

    def __parse_charref(self, name, data, i):
        # parse a character reference (e.g. "%#38;")
        # the "name" arg is just part between # and ;
        if name[0] =3D=3D 'x':
            # e.g. &#x26;
            n =3D int(name[1:], 16)
        else:
            # e.g. &#38;
            n =3D int(name)
        try:
            c =3D unichr(n)
        except ValueError:
            self.__error('bad character reference', data, i, self.baseurl,=
 fatal =3D 0)
            return
        if illegal1.search(c):
            self.__error('bad character reference', data, i, self.baseurl,=
 fatal =3D 0)
        return c

    def __read_pentity(self, publit, syslit):
        import urllib
        syslit =3D urllib.basejoin(self.baseurl, syslit)
        baseurl =3D self.baseurl
        self.baseurl =3D syslit
        val =3D self.read_external(publit, syslit)
        val =3D self.__parse_textdecl(val)
        return self.__normalize_linefeed(val)

    def parse_dtd(self, data, internal =3D 1):
        """parse_dtd(data[, internal ])

           Parse the DTD.
           This method is called by the parse_doctype method and is
           provided so that parse_doctype can be overridden.
           Argument is a string containing the full DTD.
           Optional argument internal is true (default) if the DTD is
           internal."""
        i =3D 0
        matched =3D 1
        ilevel =3D 0                      # nesting level of ignored secti=
ons
        while i < len(data) and matched:
            matched =3D 0
            res =3D peref.match(data, i)
            if res is not None:
                matched =3D 1
                name =3D res.group('name')
                if self.pentitydefs.has_key(name):
                    val =3D self.pentitydefs[name]
                    baseurl =3D self.baseurl
                    if type(val) is type(()):
                        val =3D self.__read_pentity(val[0], val[1])
                    self.parse_dtd(val, internal)
                    self.baseurl =3D baseurl
                else:
                    self.__error("unknown entity `%%%s;'" % name, data, i,=
 self.baseurl, fatal =3D 0)
                i =3D res.end(0)
            res =3D element.match(data, i)
            if res is not None:
                matched =3D 1
                name, content =3D res.group('name', 'content')
                i =3D res.end(0)
                elemval =3D (None, {}, None, None, None)
                if self.elems.has_key(name):
                    elemval =3D self.elems[name]
                    if elemval[0] is not None:
                        # XXX is this an error?
                        self.__error('non-unique element name declaration'=
, data, i, self.baseurl, fatal =3D 0)
                    elif content =3D=3D 'EMPTY':
                        # check for NOTATION on EMPTY element
                        for atname, (attype, atvalue, atstring) in elemval=
[1].items():
                            if type(attype) is type(()) and attype[0] =3D=
=3D 'NOTATION':
                                self.__error("NOTATION not allowed on EMPT=
Y element", data, i, self.baseurl)
                if content[0] =3D=3D '(':
                    i =3D res.start('content')
                    j, content, start, end =3D self.__dfa(data, i)
                    if type(content) is type([]) and content and type(cont=
ent[0]) is type({}):
                        self.__check_dfa(content, start, name, data, i)
                    contentstr =3D data[i:j]
                    i =3D j
                else:
                    contentstr =3D content
                    start =3D end =3D 0
                self.elems[name] =3D (content, elemval[1], start, end, con=
tentstr)
                res =3D space.match(data, i)
                if res is not None:
                    i =3D res.end(0)
                if data[i:i+1] !=3D '>':
                    self.__error('bad DOCTYPE', data, i, self.baseurl)
                    return
                i =3D i+1
            res =3D attlist.match(data, i)
            if res is not None:
                matched =3D 1
                elname, atdef =3D res.group('elname', 'atdef')
                if not self.elems.has_key(elname):
                    self.elems[elname] =3D (None, {}, None, None, None)
                ares =3D attdef.match(atdef)
                while ares is not None:
                    atname, attype, atvalue, atstring =3D ares.group('atna=
me', 'attype', 'atvalue', 'atstring')
                    if attype[0] =3D=3D '(':
                        attype =3D map(string.strip, string.split(attype[1=
:-1], '|'))
                    elif attype[:8] =3D=3D 'NOTATION':
                        if self.elems[elname][0] =3D=3D 'EMPTY':
                            self.__error("NOTATION not allowed on EMPTY el=
ement", data, ares.start('attype'), self.baseurl)
                        atnot =3D map(string.strip, string.split(ares.grou=
p('notation'), '|'))
                        attype =3D ('NOTATION', atnot)
                    if atstring:
                        atstring =3D atstring[1:-1] # remove quotes
                        atstring =3D self.__parse_attrval(atstring, attype=
)
                        if attype !=3D 'CDATA':
                            atstring =3D string.join(string.split(atstring=
))
                        else:
                            atstring =3D string.join(string.split(atstring=
, '\t'), ' ')
                    if type(attype) is type([]):
                        if atstring is not None and atstring not in attype=
:
                            self.__error("default value for attribute `%s'=
 on element `%s' not listed as possible value" % (atname, elname), data, i=
, self.baseurl)
                    elif type(attype) is type(()):
                        if atstring is not None and atstring not in attype=
[1]:
                            self.__error("default value for attribute `%s'=
 on element `%s' not listed as possible value" % (atname, elname), data, i=
, self.baseurl)
                    if not self.elems[elname][1].has_key(atname):
                        # first definition counts
                        self.elems[elname][1][atname] =3D attype, atvalue,=
 atstring
                    ares =3D attdef.match(atdef, ares.end(0))
                i =3D res.end(0)
            res =3D entity.match(data, i)
            if res is not None:
                matched =3D 1
                pname, name =3D res.group('pname', 'ename')
                if pname:
                    pvalue =3D res.group('pvalue')
                    if pvalue[0] in ('"',"'"):
                        c0, c1 =3D res.span('pvalue')
                        ires =3D illegal1.search(data, c0+1, c1-1)
                        if ires is not None:
                            self.__error("illegal characters in entity val=
ue", data, ires.start(0), self.baseurl, fatal =3D 0)
                    if self.pentitydefs.has_key(pname):
                        # first definition counts
                        pass
                    elif pvalue[0] in ('"',"'"):
                        pvalue =3D pvalue[1:-1]
                        pvalue =3D self.__normalize_space(pvalue)
                        cres =3D entref.search(pvalue)
                        while cres is not None:
                            chr, nm =3D cres.group('char', 'pname')
                            if chr:
                                repl =3D self.__parse_charref(cres.group('=
char'), data, i)
                            elif self.pentitydefs.has_key(nm):
                                repl =3D self.pentitydefs[nm]
                            else:
                                self.__error("unknown entity `%s' referenc=
ed" % nm, data, i, self.baseurl)
                                repl =3D '%%%s;' % nm
                            if type(repl) is type(()):
                                baseurl =3D self.baseurl
                                repl =3D self.__read_pentity(repl[0], repl=
[1])
                                self.baseurl =3D baseurl
                            pvalue =3D pvalue[:cres.start(0)] + repl + pva=
lue[cres.end(0):]
                            cres =3D entref.search(pvalue, cres.start(0)+l=
en(repl))
                        self.pentitydefs[pname] =3D pvalue
                    else:
                        r =3D externalid.match(pvalue)
                        publit, syslit =3D r.group('publit', 'syslit')
                        if publit: publit =3D string.join(string.split(pub=
lit[1:-1]))
                        if syslit: syslit =3D syslit[1:-1]
                        self.pentitydefs[pname] =3D publit, syslit
                else:
                    value =3D res.group('value')
                    if value[0] in ('"',"'"):
                        c0, c1 =3D res.span('value')
                        ires =3D illegal1.search(data, c0+1, c1-1)
                        if ires is not None:
                            self.__error("illegal characters in entity val=
ue", data, ires.start(0), self.baseurl, fatal =3D 0)
                    if self.entitydefs.has_key(name):
                        # use first definition
                        pass
                    elif value[0] in ('"',"'"):
                        value =3D value[1:-1]
                        value =3D self.__normalize_space(value)
                        cres =3D entref.search(value)
                        while cres is not None:
                            chr, nm =3D cres.group('char', 'pname')
                            if chr:
                                repl =3D self.__parse_charref(cres.group('=
char'), data, i)
                            elif self.pentitydefs.has_key(nm):
                                repl =3D self.pentitydefs[nm]
                                if type(repl) is type(()):
                                    baseurl =3D self.baseurl
                                    repl =3D self.__read_pentity(repl[0], =
repl[1])
                                    self.baseurl =3D baseurl
                            else:
                                self.__error("unknown entity `%s' referenc=
ed" % nm, data, i, self.baseurl)
                                repl =3D '%%%s;' % nm
                            value =3D value[:cres.start(0)] + repl + value=
[cres.end(0):]
                            cres =3D entref.search(value, cres.start(0)+le=
n(repl))
                        self.entitydefs[name] =3D value
                    else:
                        r =3D externalid.match(value)
                        publit, syslit =3D r.group('publit', 'syslit')
                        if publit: publit =3D string.join(string.split(pub=
lit[1:-1]))
                        if syslit: syslit =3D syslit[1:-1]
                        r1 =3D ndata.match(value, r.end(0))
                        if r1 is not None:
                            ndataname =3D r1.group('name')
                        else:
                            ndataname =3D None
                        self.entitydefs[name] =3D publit, syslit, ndatanam=
e
                i =3D res.end(0)
            res =3D notation.match(data, i)
            if res is not None:
                matched =3D 1
                name, value =3D res.group('name', 'value')
                if not self.notation.has_key(name):
                    self.notation[name] =3D value
                i =3D res.end(0)
            j =3D i                       # remember where we were
            i =3D self.__parse_misc(data, i)
            matched =3D matched or i > j  # matched anything?
            if not internal:
                if data[i:i+1] =3D=3D '<':
                    hlevel =3D 1
                    quote =3D None
                    j =3D i+1
                    while hlevel > 0:
                        res =3D bracket.search(data, j)
                        if res is None:
                            self.__error("unexpected EOF", data, i, self.b=
aseurl, fatal =3D 1)
                        j =3D res.end(0)
                        c =3D data[res.start(0)]
                        if c =3D=3D '<':
                            hlevel =3D hlevel + 1
                        elif quote and c =3D=3D quote:
                            quote =3D None
                        elif c in ('"', "'"):
                            quote =3D c
                        elif c =3D=3D '>':
                            hlevel =3D hlevel - 1
                        elif hlevel =3D=3D 1 and not quote:
                            # only expand parsed entities at lowest level
                            res =3D peref.match(data, res.start(0))
                            if res is not None:
                                pname =3D res.group('name')
                                if self.pentitydefs.has_key(pname):
                                    repl =3D self.pentitydefs[pname]
                                    if type(repl) is type(()):
                                        baseurl =3D self.baseurl
                                        repl =3D self.__read_pentity(repl[=
0], repl[1])
                                        self.baseurl =3D baseurl
                                    data =3D data[:res.start(0)] + ' ' + r=
epl + ' ' + data[res.end(0):]
                                    j =3D res.start(0) + len(repl) + 2
                                else:
                                    j =3D res.end(0)
                res =3D conditional.match(data, i)
                if res is not None:
                    inc, ign =3D res.group('inc', 'ign')
                    i =3D res.end(0)
                    if ign:
                        level =3D 1
                        while level > 0:
                            res =3D ignore.search(data, i)
                            if res.start(0) =3D=3D '<':
                                level =3D level + 1
                            else:
                                level =3D level - 1
                            i =3D res.end(0)
                    elif inc:
                        ilevel =3D ilevel + 1
                if ilevel and data[i:i+3] =3D=3D ']]>':
                    i =3D i+3
                    ilevel =3D ilevel - 1
        if i < len(data):
            self.__error('error while parsing DOCTYPE', data, i, self.base=
url)

    def __dfa(self, data, i):
        res =3D mixedre.match(data, i)
        if res is not None:
            mixed =3D res.group(0)
            if mixed[-1] =3D=3D '*':
                mixed =3D map(string.strip, string.split(mixed[1:-2], '|')=
)
            else:
                mixed =3D '#PCDATA'
            return res.end(0), mixed, 0, 0
        dfa =3D []
        i, start, end =3D self.__dfa1(data, i, dfa)
        return i, dfa, start, end

    def __dfa1(self, data, i, dfa):
        """Parse one content particle and add its states to dfa.

        data, i: text of the content model and offset to start at;
        dfa: list of state dictionaries, extended in place.  A state
            maps an element name to a list of successor state
            indices; the key '' appears to hold the transitions that
            do not consume an element.

        Returns (i, start, end): the offset after the particle and
        the indices of its entry and exit states.  Syntax errors in
        the content model are fatal."""
        res =3D dfaelem0.match(data, i)
        if res is None:
            self.__error("syntax error in element content: `(' or Name exp=
ecter", data, i, self.baseurl, fatal =3D 1)
        token =3D res.group('token')
        if token =3D=3D '(':
            # group: parse the first member, then any further members
            i, start, end =3D self.__dfa1(data, res.end(0), dfa)
            res =3D dfaelem1.match(data, i)
            if res is None:
                self.__error("syntax error in element content: `)', `|', o=
r `,' expected", data, i, self.baseurl, fatal =3D 1)
            token =3D res.group('token')
            # the first separator decides the kind of group; mixing
            # "," and "|" in one group is an error
            sep =3D token
            while token in (',','|'):
                if sep !=3D token:
                    self.__error("syntax error in element content: `%s' or=
 `)' expected" % sep, data, i, self.baseurl, fatal =3D 1)
                i, nstart, nend =3D self.__dfa1(data, res.end(0), dfa)
                res =3D dfaelem1.match(data, i)
                if res is None:
                    self.__error("syntax error in element content: `%s' or=
 `)' expected" % sep, data, i, self.baseurl, fatal =3D 1)
                token =3D res.group('token')
                if sep =3D=3D ',':
                    # concatenate DFAs
                    e =3D dfa[end].get('', [])
                    e.append(nstart)
                    dfa[end][''] =3D e
                    end =3D nend
                else:
                    # make parallel
                    # new entry state s branches to both alternatives
                    s =3D len(dfa)
                    dfa.append({'': [start, nstart]})
                    # len(dfa) is now the index that the new exit
                    # state gets when it is appended below
                    e =3D dfa[end].get('', [])
                    e.append(len(dfa))
                    dfa[end][''] =3D e
                    e =3D dfa[nend].get('', [])
                    e.append(len(dfa))
                    dfa[nend][''] =3D e
                    start =3D s
                    end =3D len(dfa)
                    dfa.append({})
            # token =3D=3D ')'
            i =3D res.end(0)
        else:
            # it's a Name
            start =3D len(dfa)
            dfa.append({token: [start+1]})
            end =3D len(dfa)
            dfa.append({})
            i =3D res.end(0)
        # optional occurrence indicator after the particle
        res =3D dfaelem2.match(data, i)
        if res is not None:
            token =3D res.group('token')
            # wrap the particle in a new entry state s and exit state e
            s =3D len(dfa)
            e =3D s+1
            if token =3D=3D '+':
                # at least once: the particle cannot be skipped
                dfa.append({'': [start]})
            else:
                # may be skipped: entry also leads straight to exit
                dfa.append({'': [start, e]})
            dfa.append({})
            l =3D dfa[end].get('', [])
            dfa[end][''] =3D l
            if token !=3D '?':
                # repeatable: old exit leads back to the old entry
                l.append(start)
            l.append(e)
            start =3D s
            end =3D e
            i =3D res.end(0)
        return i, start, end

    def parse_doctype(self, tag, publit, syslit, data):
        """parse_doctype(tag, publit, syslit, data)

           Parse the DOCTYPE.

           This method is called by the handle_doctype callback method
           and is provided so that handle_doctype can be overridden.
           The arguments are:
           tag: the name of the outermost element of the document;
           publit: the Public Identifier of the DTD (or None);
           syslit: the System Literal of the DTD (or None);
           data: the internal subset of the DTD (or None)."""
        if data:
            self.parse_dtd(data)
        if syslit:
            import urllib
            syslit =3D urllib.basejoin(self.baseurl, syslit)
            baseurl =3D self.baseurl
            self.baseurl =3D syslit
            external =3D self.read_external(publit, syslit)
            external =3D self.__parse_textdecl(external)
            external =3D self.__normalize_linefeed(external)
            self.parse_dtd(external, 0)
            self.baseurl =3D baseurl

    def __error(self, message, data =3D None, i =3D None, filename =3D Non=
e, fatal =3D 1):
        # called for all syntax errors
        # this either raises an exception (Error) or calls
        # self.syntax_error which may be overridden
        if data is not None and i is not None:
            self.lineno =3D lineno =3D string.count(data, '\n', 0, i) + 1
        else:
            self.lineno =3D None
        self.data =3D data
        self.offset =3D i
        self.filename =3D filename
        if fatal:
            raise Error(message, lineno, data, i, filename)
        self.syntax_error(message)

    # Overridable -- handle xml processing instruction
    def handle_xml(self, encoding, standalone):
        """Hook for the xml processing instruction; does nothing here."""
        pass

    # Overridable -- handle DOCTYPE
    def handle_doctype(self, tag, publit, syslit, data):
        if self.doctype is not None:
            syslit =3D self.doctype
        self.parse_doctype(tag, publit, syslit, data)

    # Example -- read external file referenced from DTD with a SystemLiter=
al
    def read_external(self, publit, syslit):
        """Return the text of an external resource; this version
        always returns the empty string."""
        return ''

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        """Hook for comments; does nothing here."""
        pass

    # Example -- handle processing instructions, could be overridden
    def handle_proc(self, name, data):
        """Hook for processing instructions; does nothing here."""
        pass

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        """Hook for character data; does nothing here."""
        pass

    # Example -- handle cdata, should be overridden
    def handle_cdata(self, data):
        """Hook for CDATA sections; does nothing here."""
        pass

    elements =3D {}                       # dict: tagname -> (startfunc, e=
ndfunc)
    def finish_starttag(self, tagname, attrs):
        method =3D self.elements.get(tagname, (None, None))[0]
        if method is None:
            self.unknown_starttag(tagname, attrs)
        else:
            self.handle_starttag(tagname, method, attrs)

    def finish_endtag(self, tagname):
        method =3D self.elements.get(tagname, (None, None))[1]
        if method is None:
            self.unknown_endtag(tagname)
        else:
            self.handle_endtag(tagname, method)

    # Overridable -- handle start tag
    def handle_starttag(self, tagname, method, attrs):
        """Call the start function method with the tag name and the
        attributes."""
        method(tagname, attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tagname, method):
        """Call the end function method with the tag name."""
        method(tagname)

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tagname, attrs):
        """Called by finish_starttag for tags without a start
        function; does nothing here."""
        pass

    def unknown_endtag(self, tagname):
        """Called by finish_endtag for tags without an end function;
        does nothing here."""
        pass

    def unknown_entityref(self, name):
        """Report a reference to an unknown entity as a fatal error,
        using the position stored in self.data and self.offset."""
        self.__error('reference to unknown entity', self.data, self.offset=
, self.baseurl)

    # Example -- handle relatively harmless syntax errors, could be overri=
dden
    def syntax_error(self, message):
        """Called for non-fatal syntax errors; this version raises
        Error with the position stored on the instance."""
        raise Error(message, self.lineno, self.data, self.offset, self.fil=
ename)

class TestXMLParser(XMLParser):

    def __init__(self, xmlns =3D 1):
        self.testdata =3D ""
        XMLParser.__init__(self, xmlns)

    def handle_xml(self, encoding, standalone):
        self.flush()
        print 'xml: encoding =3D %s standalone =3D %s' % (encoding, standa=
lone)

    def read_external(self, publit, syslit):
        print 'reading %s' % name
        try:
            import urllib
            u =3D urllib.urlopen(syslit)
            data =3D u.read()
            u.close()
        except 'x':
            return ''
        return data

    def handle_doctype(self, tag, publit, syslit, data):
        self.flush()
        print 'DOCTYPE: %s %s' % (tag, `data`)
        XMLParser.handle_doctype(self, tag, publit, syslit, data)

    def handle_comment(self, data):
        self.flush()
        r =3D `data`
        if len(r) > 68:
            r =3D r[:32] + '...' + r[-32:]
        print 'comment: %s' % r

    def handle_proc(self, name, data):
        self.flush()
        print 'processing: %s %s' % (name,`data`)

    def handle_data(self, data):
        self.testdata =3D self.testdata + data
        if len(`self.testdata`) >=3D 70:
            self.flush()

    def handle_cdata(self, data):
        self.flush()
        print 'cdata: %s' % `data`

    def flush(self):
        data =3D self.testdata
        if data:
            self.testdata =3D ""
            print 'data: %s ' % `data`

##    def syntax_error(self, message):
##        if self.lineno is not None:
##            print 'Syntax error at line %d: %s' % (self.lineno, message)=

##        else:
##            print 'Syntax error: %s' % message

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print 'start tag: <%s>' % tag
        else:
            print 'start tag: <%s' % tag,
            for name, value in attrs.items():
                print '%s =3D "%s"' % (name.encode('latin-1'), `value`),
            print '>'

    def unknown_endtag(self, tag):
        self.flush()
        print 'end tag: </%s>' % tag

    def unknown_entityref(self, name):
        self.flush()
        print '&%s;' % name

class CanonXMLParser(XMLParser):
    __cache =3D {}

    def read_external(self, publit, syslit):
        if publit and self.__cache.has_key(publit):
            return self.__cache[publit]
        try:
            import urllib
            u =3D urllib.urlopen(syslit)
            data =3D u.read()
            u.close()
        except 'x':
            return ''
        if publit:
            self.__cache[publit] =3D data
        return data

    def handle_data(self, data):
        sys.stdout.write(self.encode(data))

    def handle_cdata(self, data):
        sys.stdout.write(self.encode(data))

    def handle_proc(self, name, data):
        sys.stdout.write('<?%s %s?>' % (name.encode('utf-8'), data.strip()=
.encode('utf-8')))

    def unknown_starttag(self, tag, attrs):
        sys.stdout.write('<%s' % tag.encode('utf-8'))
        attrlist =3D attrs.items()
        attrlist.sort()
        for name, value in attrlist:
            sys.stdout.write(' %s=3D"%s"' % (name.encode('utf-8'), self.en=
code(value)))
        sys.stdout.write('>')

    def unknown_endtag(self, tag):
        sys.stdout.write('</%s>' % tag.encode('utf-8'))

    def unknown_entityref(self, name):
        print '&%s;' % name.encode('utf-8')

    def encode(self, data):
        for c, tr in [('&', '&amp;'),
                      ('>', '&gt;'),
                      ('<', '&lt;'),
                      ('"', '&quot;'),
                      ('\t', '&#9;'),
                      ('\n', '&#10;'),
                      ('\r', '&#13;')]:
            data =3D tr.join(data.split(c))
        return data.encode('utf-8')

class CheckXMLParser(XMLParser):
    """XMLParser subclass that only adds loading of external entities.

    No output handlers are overridden, so apart from fetching external
    entities it behaves like the base parser.
    """

    # Class-level cache of external entities, keyed by public identifier
    # and shared by all instances.
    __cache = {}

    def read_external(self, publit, syslit):
        """Return the external entity at syslit, cached by publit.

        Returns '' when the URL cannot be opened or read.  Only entities
        that have a public identifier are cached.
        """
        if publit and self.__cache.has_key(publit):
            return self.__cache[publit]
        import urllib
        try:
            u = urllib.urlopen(syslit)
            try:
                data = u.read()
            finally:
                # Close the connection even when read() fails.
                u.close()
        except IOError:
            return ''
        if publit:
            self.__cache[publit] = data
        return data

def test(args =3D None):
    import sys, getopt

    if not args:
        args =3D sys.argv[1:]

    opts, args =3D getopt.getopt(args, 'cstnvCd:')
    klass =3D TestXMLParser
    do_time =3D 0
    namespace =3D 1
    verbose =3D 0
    doctype =3D None
    for o, a in opts:
        if o =3D=3D '-c':
            klass =3D CanonXMLParser
        elif o =3D=3D '-C':
            klass =3D CheckXMLParser
        elif o =3D=3D '-s':
            klass =3D XMLParser
        elif o =3D=3D '-t':
            do_time =3D 1
        elif o =3D=3D '-n':
            namespace =3D 0
        elif o =3D=3D '-v':
            verbose =3D 1
        elif o =3D=3D '-d':
            doctype =3D a

    if not args:
        args =3D ['test.xml']

    for file in args:
        if file =3D=3D '-':
            f =3D sys.stdin
            url =3D '.'
        else:
            try:
                f =3D open(file, 'r')
            except IOError, msg:
                print file, ":", msg
                sys.exit(1)
            import urllib
            url =3D urllib.pathname2url(file)

        data =3D f.read()
        if f is not sys.stdin:
            f.close()

        x =3D klass(xmlns =3D namespace)
        x.baseurl =3D url
        x.doctype =3D doctype
        if verbose:
            print '=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D',file
        try:
            t0, t1, t2 =3D x.parse(data)
        except Error, info:
            do_time =3D 0                 # can't print times now
            print str(info)
            if info.text is not None and info.offset is not None:
                i =3D string.rfind(info.text, '\n', 0, info.offset) + 1
                j =3D string.find(info.text, '\n', info.offset)
                if j =3D=3D -1: j =3D len(info.text)
                try:
                    print info.text[i:j]
                except UnicodeError:
                    print `info.text[i:j]`
                else:
                    print ' '*(info.offset-i)+'^'
        if klass is CanonXMLParser and (verbose or len(args) > 1):
            sys.stdout.write('\n')
        if do_time:
            print 'total time: %g' % (t2-t0)
            print 'parse DTD: %g' % (t1-t0)
            print 'parse body: %g' %(t2-t1)

# Run the command-line driver when executed as a script.
if __name__ == '__main__':
    test()

------- =_aaaaaaaaaa0--