[XML-SIG] Canonicalizing XML
Sjoerd Mullender
sjoerd.mullender@oratrix.com
Mon, 23 Apr 2001 17:52:35 +0200
------- =_aaaaaaaaaa0
Content-Type: text/plain; charset="us-ascii"
Content-ID: <2357.988041096.1@bireme.oratrix.nl>
I've written a validating XML parser in Python that can produce
Canonical XML.
I'll attach it. Usage (for getting Canonical XML):
python fxmllib.py -c file.xml
On Fri, Apr 20 2001 Andrew Kuchling wrote:
> Has anyone written code for producing XML in Canonical XML format?
> (http://www.w3.org/TR/xml-c14n)
>
> --amk
>
>
>
>
-- Sjoerd Mullender <sjoerd.mullender@oratrix.com>
------- =_aaaaaaaaaa0
Content-Type: text/plain; charset="us-ascii"
Content-ID: <2357.988041096.2@bireme.oratrix.nl>
Content-Description: Validating XML Parser
Content-Disposition: attachment; filename="fxmllib.py"
Content-Transfer-Encoding: quoted-printable
__version__ =3D "$Id: fxmllib.py,v 1.2 2001/04/20 15:12:49 sjoerd Exp $"
import re, string
import sys # need for CanonXMLParser
class Error(Exception):
    """Error class; raised when a syntax error is encountered.

    Instance variables are:
    lineno: line at which error was found;
    offset: offset into data where error was found;
    text: data in which error was found.
    If these values are unknown, they are set to None."""

    # Class-level defaults so __str__ works even when the positional
    # arguments were not all supplied.
    lineno = offset = text = filename = None

    def __init__(self, *args):
        # args layout: (message[, lineno[, text[, offset[, filename]]]])
        self.args = args
        if len(args) > 1:
            self.lineno = args[1]
        if len(args) > 2:
            self.text = args[2]
        if len(args) > 3:
            self.offset = args[3]
        if len(args) > 4:
            self.filename = args[4]

    def __str__(self):
        # Build a '"file", line N: ' prefix from whatever location
        # information is available, then append the message (args[0]).
        if self.filename:
            if self.lineno:
                msg = '"%s", line %d: ' % (self.filename, self.lineno)
            else:
                msg = '"%s": ' % self.filename
        elif self.lineno:
            msg = 'line %d: ' % self.lineno
        else:
            msg = ''
        return '%sSyntax error: %s' % (msg, self.args[0])
# The character sets below are taken directly from the XML spec.
_BaseChar =3D u'\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8=
-\u00FF' \
u'\u0100-\u0131\u0134-\u013E\u0141-\u0148\u014A-\u017E' \
u'\u0180-\u01C3\u01CD-\u01F0\u01F4-\u01F5\u01FA-\u0217' \
u'\u0250-\u02A8\u02BB-\u02C1\u0386\u0388-\u038A\u038C' \
u'\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D6\u03DA\u03DC\u03DE' \=
u'\u03E0\u03E2-\u03F3\u0401-\u040C\u040E-\u044F\u0451-\u045C' =
\
u'\u045E-\u0481\u0490-\u04C4\u04C7-\u04C8\u04CB-\u04CC' \
u'\u04D0-\u04EB\u04EE-\u04F5\u04F8-\u04F9\u0531-\u0556\u0559' =
\
u'\u0561-\u0586\u05D0-\u05EA\u05F0-\u05F2\u0621-\u063A' \
u'\u0641-\u064A\u0671-\u06B7\u06BA-\u06BE\u06C0-\u06CE' \
u'\u06D0-\u06D3\u06D5\u06E5-\u06E6\u0905-\u0939\u093D' \
u'\u0958-\u0961\u0985-\u098C\u098F-\u0990\u0993-\u09A8' \
u'\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09DC-\u09DD\u09DF-\u09E1' =
\
u'\u09F0-\u09F1\u0A05-\u0A0A\u0A0F-\u0A10\u0A13-\u0A28' \
u'\u0A2A-\u0A30\u0A32-\u0A33\u0A35-\u0A36\u0A38-\u0A39' \
u'\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8B\u0A8D' \
u'\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2-\u0AB3' \
u'\u0AB5-\u0AB9\u0ABD\u0AE0\u0B05-\u0B0C\u0B0F-\u0B10' \
u'\u0B13-\u0B28\u0B2A-\u0B30\u0B32-\u0B33\u0B36-\u0B39\u0B3D' =
\
u'\u0B5C-\u0B5D\u0B5F-\u0B61\u0B85-\u0B8A\u0B8E-\u0B90' \
u'\u0B92-\u0B95\u0B99-\u0B9A\u0B9C\u0B9E-\u0B9F\u0BA3-\u0BA4' =
\
u'\u0BA8-\u0BAA\u0BAE-\u0BB5\u0BB7-\u0BB9\u0C05-\u0C0C' \
u'\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39' \
u'\u0C60-\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8' \
u'\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CDE\u0CE0-\u0CE1\u0D05-\u0D0C' =
\
u'\u0D0E-\u0D10\u0D12-\u0D28\u0D2A-\u0D39\u0D60-\u0D61' \
u'\u0E01-\u0E2E\u0E30\u0E32-\u0E33\u0E40-\u0E45\u0E81-\u0E82' =
\
u'\u0E84\u0E87-\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F' \=
u'\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA-\u0EAB\u0EAD-\u0EAE\u0EB0' \=
u'\u0EB2-\u0EB3\u0EBD\u0EC0-\u0EC4\u0F40-\u0F47\u0F49-\u0F69' =
\
u'\u10A0-\u10C5\u10D0-\u10F6\u1100\u1102-\u1103\u1105-\u1107' =
\
u'\u1109\u110B-\u110C\u110E-\u1112\u113C\u113E\u1140\u114C' \
u'\u114E\u1150\u1154-\u1155\u1159\u115F-\u1161\u1163\u1165' \
u'\u1167\u1169\u116D-\u116E\u1172-\u1173\u1175\u119E\u11A8' \
u'\u11AB\u11AE-\u11AF\u11B7-\u11B8\u11BA\u11BC-\u11C2\u11EB' \=
u'\u11F0\u11F9\u1E00-\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15' \
u'\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59' =
\
u'\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE' \=
u'\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB' \
u'\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126\u212A-\u212B' =
\
u'\u212E\u2180-\u2182\u3041-\u3094\u30A1-\u30FA\u3105-\u312C' =
\
u'\uAC00-\uD7A3'
_Ideographic =3D u'\u4E00-\u9FA5\u3007\u3021-\u3029'
_CombiningChar =3D u'\u0300-\u0345\u0360-\u0361\u0483-\u0486\u0591-\u05A1\=
u05A3-\u05B9' \
u'\u05BB-\u05BD\u05BF\u05C1-\u05C2\u05C4\u064B-\u0652\u06=
70' \
u'\u06D6-\u06DC\u06DD-\u06DF\u06E0-\u06E4\u06E7-\u06E8' \=
u'\u06EA-\u06ED\u0901-\u0903\u093C\u093E-\u094C\u094D' \
u'\u0951-\u0954\u0962-\u0963\u0981-\u0983\u09BC\u09BE\u09=
BF' \
u'\u09C0-\u09C4\u09C7-\u09C8\u09CB-\u09CD\u09D7\u09E2-\u0=
9E3' \
u'\u0A02\u0A3C\u0A3E\u0A3F\u0A40-\u0A42\u0A47-\u0A48' \
u'\u0A4B-\u0A4D\u0A70-\u0A71\u0A81-\u0A83\u0ABC\u0ABE-\u0=
AC5' \
u'\u0AC7-\u0AC9\u0ACB-\u0ACD\u0B01-\u0B03\u0B3C\u0B3E-\u0=
B43' \
u'\u0B47-\u0B48\u0B4B-\u0B4D\u0B56-\u0B57\u0B82-\u0B83' \=
u'\u0BBE-\u0BC2\u0BC6-\u0BC8\u0BCA-\u0BCD\u0BD7\u0C01-\u0=
C03' \
u'\u0C3E-\u0C44\u0C46-\u0C48\u0C4A-\u0C4D\u0C55-\u0C56' \=
u'\u0C82-\u0C83\u0CBE-\u0CC4\u0CC6-\u0CC8\u0CCA-\u0CCD' \=
u'\u0CD5-\u0CD6\u0D02-\u0D03\u0D3E-\u0D43\u0D46-\u0D48' \=
u'\u0D4A-\u0D4D\u0D57\u0E31\u0E34-\u0E3A\u0E47-\u0E4E\u0E=
B1' \
u'\u0EB4-\u0EB9\u0EBB-\u0EBC\u0EC8-\u0ECD\u0F18-\u0F19\u0=
F35' \
u'\u0F37\u0F39\u0F3E\u0F3F\u0F71-\u0F84\u0F86-\u0F8B' \
u'\u0F90-\u0F95\u0F97\u0F99-\u0FAD\u0FB1-\u0FB7\u0FB9' \
u'\u20D0-\u20DC\u20E1\u302A-\u302F\u3099\u309A'
_Digit =3D u'\u0030-\u0039\u0660-\u0669\u06F0-\u06F9\u0966-\u096F\u09E6-\u=
09EF' \
u'\u0A66-\u0A6F\u0AE6-\u0AEF\u0B66-\u0B6F\u0BE7-\u0BEF' \
u'\u0C66-\u0C6F\u0CE6-\u0CEF\u0D66-\u0D6F\u0E50-\u0E59' \
u'\u0ED0-\u0ED9\u0F20-\u0F29'
_Extender =3D u'\u00B7\u02D0\u02D1\u0387\u0640\u0E46\u0EC6\u3005\u3031-\u3=
035' \
u'\u309D-\u309E\u30FC-\u30FE'
_Letter =3D _BaseChar + _Ideographic
_NameChar =3D '-' + _Letter + _Digit + '._:' + _CombiningChar + _Extender
_S =3D '[ \t\r\n]+' # white space
_opS =3D '[ \t\r\n]*' # optional white space
_Name =3D '['+_Letter+'_:]['+_NameChar+']*' # XML Name
_QStr =3D "(?:'[^']*'|\"[^\"]*\")" # quoted XML string
_Char =3D u'\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD' # legal characters
# Compiled regular expressions for the XML grammar productions.
# (Decoded from quoted-printable; "=3D" inside patterns restored to "=".)
comment = re.compile('<!--(?P<comment>(?:[^-]|-[^-])*)-->')
space = re.compile(_S)
interesting = re.compile('[&<]')
amp = re.compile('&')
name = re.compile('^'+_Name+'$')
names = re.compile('^'+_Name+'(?:'+_S+_Name+')*$')
ref = re.compile('&(?:(?P<name>'+_Name+')|#(?P<char>(?:[0-9]+|x[0-9a-fA-F]+)));')
entref = re.compile('(?:&#(?P<char>(?:[0-9]+|x[0-9a-fA-F]+))|%(?P<pname>'+_Name+'));')
_attrre = _S+'(?P<attrname>'+_Name+')'+_opS+'='+_opS+'(?P<attrvalue>'+_QStr+')'
attrfind = re.compile(_attrre)
starttag = re.compile('<(?P<tagname>'+_Name+')(?P<attrs>(?:'+_attrre+')*)'+_opS+'(?P<slash>/?)>')
endtag = re.compile('</(?P<tagname>'+_Name+')'+_opS+'>')
illegal = re.compile(r'\]\]>')
illegal1 = re.compile('[^'+_Char+']')
cdata = re.compile('<!\\[CDATA\\[(?P<cdata>(?:[^]]|\\](?!\\]>)|\\]\\](?!>))*)\\]\\]>')
_SystemLiteral = '(?P<syslit>'+_QStr+')'
_PublicLiteral = '(?P<publit>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                 "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|PUBLIC'+_S+_PublicLiteral+')'+_S+_SystemLiteral
externalid = re.compile(_ExternalId)
ndata = re.compile(_S+'NDATA'+_S+'(?P<name>'+_Name+')')
doctype = re.compile('<!DOCTYPE'+_S+'(?P<docname>'+_Name+')(?:'+_S+_ExternalId+')?'+_opS+'(?:\\[(?P<data>(?:'+_S+'|%'+_Name+';|'+comment.pattern+'|<(?:![^-]|[^!])(?:[^\'">]|\'[^\']*\'|"[^"]*")*>)*)\\]'+_opS+')?>')
xmldecl = re.compile('<\?xml'+
                     _S+'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                     "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                     '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                     '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+'\?>')
textdecl = re.compile('<\?xml'
                      '(?:'+_S+'version'+_opS+'='+_opS+'(?P<version>'+_QStr+'))?'+
                      '(?:'+_S+'encoding'+_opS+'='+_opS+
                      "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                      '"[A-Za-z][-A-Za-z0-9._]*"))?'+
                      _opS+'\?>')
pidecl = re.compile('<\\?(?![xX][mM][lL][ \t\r\n?])(?P<name>'+_Name+')(?:'+_S+'(?P<data>(?:[^?]|\\?(?!>))*))?\\?>')

# XML NAMESPACES
_NCName = '['+_Letter+'_]['+'-' + _Letter + _Digit + '._' + _CombiningChar + _Extender+']*'  # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?'  # optional prefix
                   '(?P<local>' + _NCName + ')$')
xmlns = re.compile('xmlns(?::(?P<ncname>' + _NCName + '))?$')

# DOCTYPE
_Nmtoken = '['+_NameChar+']+'
nmtoken = re.compile('^'+_Nmtoken+'$')
nmtokens = re.compile('^'+_Nmtoken+'(?:'+_S+_Nmtoken+')*$')
element = re.compile('<!ELEMENT'+_S+'(?P<name>'+_Name+')'+_S+r'(?P<content>EMPTY|ANY|\()')
dfaelem0 = re.compile(_opS+r'(?P<token>\(|'+_Name+')')
dfaelem1 = re.compile(_opS+r'(?P<token>[)|,])')
dfaelem2 = re.compile(r'(?P<token>[+*?])')
mixedre = re.compile(r'\('+_opS+'#PCDATA'+'(('+_opS+r'\|'+_opS+_Name+')*'+_opS+r'\)\*|'+_opS+r'\))')
paren = re.compile('[()]')
attdef = re.compile(_S+'(?P<atname>'+_Name+')'+_S+'(?P<attype>CDATA|ID(?:REFS?)?|ENTIT(?:Y|IES)|NMTOKENS?|NOTATION'+_S+r'\((?P<notation>'+_opS+_Name+'(?:'+_opS+r'\|'+_opS+_Name+')*'+_opS+r')\)|\('+_opS+_Nmtoken+'(?:'+_opS+r'\|'+_opS+_Nmtoken+')*'+_opS+r'\))'+_S+'(?P<atvalue>#REQUIRED|#IMPLIED|(?:#FIXED'+_S+')?(?P<atstring>'+_QStr+'))')
attlist = re.compile('<!ATTLIST'+_S+'(?P<elname>'+_Name+')(?P<atdef>(?:'+attdef.pattern+')*)'+_opS+'>')
_EntityVal = '"(?:[^"&%]|'+ref.pattern+'|%'+_Name+';)*"|' \
             "'(?:[^'&%]|"+ref.pattern+"|%"+_Name+";)*'"
entity = re.compile('<!ENTITY'+_S+'(?:%'+_S+'(?P<pname>'+_Name+')'+_S+'(?P<pvalue>'+_EntityVal+'|'+_ExternalId+')|(?P<ename>'+_Name+')'+_S+'(?P<value>'+_EntityVal+'|'+_ExternalId+'(?:'+_S+'NDATA'+_S+_Name+')?))'+_opS+'>')
notation = re.compile('<!NOTATION'+_S+'(?P<name>'+_Name+')'+_S+'(?P<value>SYSTEM'+_S+_SystemLiteral+'|PUBLIC'+_S+_PublicLiteral+'(?:'+_S+_SystemLiteral+')?)'+_opS+'>')
peref = re.compile('%(?P<name>'+_Name+');')
ignore = re.compile(r'<!\[|\]\]>')
bracket = re.compile('[<>\'"%]')
conditional = re.compile(r'<!\['+_opS+'(?:(?P<inc>INCLUDE)|(?P<ign>IGNORE))'+_opS+r'\[')
class XMLParser:
    """XMLParser([ xmlns ]) -> instance

    XML document parser.

    There is one optional argument:
    xmlns: understand XML Namespaces (default is 1)."""

    def __init__(self, xmlns = 1):
        self.__xmlns = xmlns            # whether or not to parse namespaces
        self.reset()
def reset(self):
"""reset()
Reset parser to pristine state."""
self.docname =3D None # The outermost element in the d=
ocument (according to the DTD)
self.rawdata =3D []
self.entitydefs =3D { # & entities defined in DTD (plu=
s the default ones)
'lt': '<', # <
'gt': '>', # >
'amp': '&', # &
'apos': ''', # '
'quot': '"', # "
}
self.pentitydefs =3D {} # % entities defined in DTD
self.elems =3D {} # elements and their content/att=
rs
self.baseurl =3D '.' # base URL for external DTD
self.ids =3D {} # IDs encountered in document
self.notation =3D {} # NOTATIONs
self.doctype =3D None
def feed(self, data):
"""feed(data)
Feed data to parser."""
self.rawdata.append(data)
def close(self):
"""close()
End of data, finish up parsing."""
# Actually, this is where we start parsing.
data =3D string.join(self.rawdata, '')
self.rawdata =3D []
self.parse(data)
def __parse_textdecl(self, data, document =3D 0):
# Figure out the encoding of a file by looking at the first
# few bytes and the <?xml?> tag that may come at the very
# beginning of the file.
# This will convert the data to unicode from whatever format
# it was originally.
i =3D 0
if data[:2] =3D=3D '\376\377':
# UTF-16, big-endian
enc =3D 'utf-16-be'
i =3D 2
elif data[:2] =3D=3D '\377\376':
# UTF-16, little-endian
enc =3D 'utf-16-le'
i =3D 2
elif data[:4] =3D=3D '\x00\x3C\x00\x3F':
# UTF-16, big-endian
enc =3D 'utf-16-be'
elif data[:4] =3D=3D '\x3C\x00\x3F\x00':
# UTF-16, little-endian
enc =3D 'utf-16-le'
else:
enc =3D None # unknowns as yet
if enc:
try:
data =3D unicode(data[i:], enc)
except UnicodeError:
self.__error("data cannot be converted to Unicode", data, =
i, self.baseurl, fatal =3D 1)
i =3D 0
# optional XMLDecl
if document:
res =3D xmldecl.match(data, i)
else:
res =3D textdecl.match(data, i)
if res is not None:
if document:
version, encoding, standalone =3D res.group('version',
'encoding',
'standalone')
else:
version, encoding =3D res.group('version', 'encoding')
standalone =3D None
if version is not None and version[1:-1] !=3D '1.0':
self.__error('only XML version 1.0 supported', data, res.s=
tart('version'), self.baseurl, fatal =3D 1)
if encoding:
encoding =3D encoding[1:-1]
if enc and enc !=3D encoding.lower() and \
enc[:6] !=3D encoding.lower():
self.__error("declared encoding doesn't match actual e=
ncoding", data, res.start('encoding'), self.baseurl, fatal =3D 1)
enc =3D encoding.lower()
if standalone:
standalone =3D standalone[1:-1]
## self.handle_xml(encoding, standalone)
i =3D res.end(0)
if enc is None:
# default is UTF 8
enc =3D 'utf-8'
if type(data) is not type(u'a'):
try:
data =3D unicode(data[i:], enc)
except UnicodeError:
self.__error("data cannot be converted to Unicode", data, =
i, self.baseurl, fatal =3D 1)
else:
data =3D data[i:]
return data
def __normalize_linefeed(self, data):
# normalize line endings: first \r\n -> \n, then \r -> \n
return u'\n'.join(u'\n'.join(data.split(u'\r\n')).split(u'\r'))
def __normalize_space(self, data):
# normalize white space: tab, linefeed and carriage return -> spac=
e
data =3D ' '.join(data.split('\t'))
data =3D ' '.join(data.split('\n'))
data =3D u' '.join(data.split('\r'))
return data
def parse(self, data):
"""parse(data)
Parse the data as an XML document."""
from time import time
t0 =3D time()
data =3D self.__parse_textdecl(data, 1)
data =3D self.__normalize_linefeed(data)
# (Comment | PI | S)*
i =3D self.__parse_misc(data, 0)
# doctypedecl?
res =3D doctype.match(data, i)
if res is not None and self.doctype is None:
docname, publit, syslit, docdata =3D res.group('docname', 'pub=
lit',
'syslit', 'data')
self.docname =3D docname
if publit: publit =3D string.join(string.split(publit[1:-1]))
if syslit: syslit =3D syslit[1:-1]
self.handle_doctype(docname, publit, syslit, docdata)
i =3D res.end(0)
elif self.doctype:
# do as if there was a <!DOCTYPE> declaration
self.handle_doctype(None, '', self.doctype, '')
else:
# self.doctype =3D=3D '' or no DOCTYPE
# ignore DOCTYPE
self.doctype =3D None
t1 =3D time()
# (Comment | PI | S)*
i =3D self.__parse_misc(data, i)
# the document itself
res =3D starttag.match(data, i)
if res is None:
self.__error('no elements in document', data, i, self.baseurl,=
fatal =3D 1)
i =3D res.end(0)
tagname, slash =3D res.group('tagname', 'slash')
if self.docname and tagname !=3D self.docname:
self.__error('starttag does not match DOCTYPE', data, res.star=
t('tagname'), self.baseurl, fatal =3D 0)
val =3D self.__parse_attrs(tagname, data, res.start('tagname'), re=
s.span('attrs'), None)
if val is None:
return
nstag, attrs, namespaces =3D val
self.finish_starttag(nstag, attrs)
if not slash:
i =3D self.__parse_content(data, i, tagname, namespaces)
if i is None:
return
if type(i) is type(res):
res =3D i
else:
res =3D endtag.match(data, i)
if res is None:
self.__error('end tag missing', data, i, self.baseurl, fat=
al =3D 0)
elif res.group('tagname') !=3D tagname:
self.__error("end tag doesn't match start tag", data, res.=
start('tagname'), self.baseurl, fatal =3D 0)
i =3D res.end(0)
self.finish_endtag(nstag)
i =3D self.__parse_misc(data, i)
if i !=3D len(data):
self.__error('garbage at end of document', data, i, self.baseu=
rl, fatal =3D 0)
t2 =3D time()
return t0, t1, t2
def __parse_misc(self, data, i):
# match any number of whitespace, processing instructions and comm=
ents
matched =3D 1
while matched:
matched =3D 0
res =3D comment.match(data, i)
if res is not None:
matched =3D 1
c0, c1 =3D res.span('comment')
ires =3D illegal1.search(data, c0, c1)
if ires is not None:
self.__error('illegal characters in comment', data, ir=
es.start(0), self.baseurl, fatal =3D 0)
self.handle_comment(data[c0:c1])
i =3D res.end(0)
res =3D pidecl.match(data, i)
if res is not None:
matched =3D 1
c0, c1 =3D res.span('data')
ires =3D illegal1.search(data, c0, c1)
if ires is not None:
self.__error('illegal characters in Processing Instruc=
tion', data, ires.start(0), self.baseurl, fatal =3D 0)
self.handle_proc(res.group('name'), res.group('data') or '=
')
i =3D res.end(0)
res =3D space.match(data, i)
if res is not None:
matched =3D 1
i =3D res.end(0)
return i
def __update_state(self, dfa, states, tagname):
# update the list of states in the dfa. If tagname is None,
# we're looking for the final state, so return a list of all
# states reachable using epsilon transitions
nstates =3D []
seenstates =3D {}
while states:
s =3D states[0]
seenstates[s] =3D 1
del states[0]
if tagname is not None and dfa[s].has_key(tagname):
nstates =3D dfa[s][tagname][:]
else:
for s in dfa[s].get('', []):
if not seenstates.has_key(s):
states.append(s)
if tagname is None:
nstates =3D seenstates.keys()
states[:] =3D nstates # change in-line
def __check_dfa(self, dfa, initstate, tagname, data, i):
states =3D [initstate]
possibles =3D {}
seenstates =3D {}
while states:
s =3D states[0]
seenstates[s] =3D 1
del states[0]
for tag in dfa[s].keys():
if tag and possibles.has_key(tag):
self.__error("non-deterministic content model for `%s'=
" % tagname, data, i, self.baseurl, fatal =3D 0)
possibles[tag] =3D 1
for s in dfa[s].get('', []):
if not seenstates.has_key(s):
states.append(s)
def __parse_content(self, data, i, ptagname, namespaces, states =3D No=
ne):
# parse the content of an element (i.e. the string between
# start tag and end tag)
datalen =3D len(data)
if self.elems.has_key(ptagname):
content, attributes, start, end =3D self.elems[ptagname][:4] #=
content model
if states =3D=3D None:
states =3D [start]
else:
content =3D None # unknown content model
while i < datalen:
matched =3D 0
res =3D interesting.search(data, i)
if res is None:
j =3D datalen
else:
j =3D res.start(0)
if j > i:
res =3D illegal.search(data, i, j)
if res is not None:
self.__error("illegal data content in element `%s'" % =
ptagname, data, i, self.baseurl, fatal =3D 0)
skip =3D 0
complain =3D 0
if content is not None:
res =3D space.match(data, i, j)
isspace =3D res is not None and res.span(0) =3D=3D (i,=
j)
if content =3D=3D 'EMPTY':
complain =3D 1
skip =3D 1
elif not isspace and type(content) is type([]) and co=
ntent and type(content[0]) is type({}):
complain =3D 1
if complain:
self.__error("no character data allowed in element=
`%s'" % ptagname, data, i, self.baseurl, fatal =3D 0)
matched =3D 1
if not skip:
self.handle_data(data[i:j])
i =3D j
res =3D starttag.match(data, i)
if res is not None:
tagname, slash =3D res.group('tagname', 'slash')
if content =3D=3D 'EMPTY' or content =3D=3D '#PCDATA':
self.__error("empty element `%s' has content" % ptagna=
me, data, res.start(0), self.baseurl, fatal =3D 0)
elif content =3D=3D 'ANY':
# always OK
pass
elif type(content) is type([]) and content and type(conten=
t[0]) is not type({}):
# mixed
if tagname not in content:
self.__error("illegal content in element `%s'" % p=
tagname, data, res.start(0), self.baseurl, fatal =3D 0)
elif content is not None:
self.__update_state(content, states, tagname)
if not states:
self.__error("illegal content for element `%s'" % =
ptagname, data, i, self.baseurl)
val =3D self.__parse_attrs(tagname, data, res.start('tagna=
me'), res.span('attrs'), namespaces)
if val is None:
return
i =3D res.end(0)
nstag, attrs, subnamespaces =3D val
self.finish_starttag(nstag, attrs)
if not slash:
i =3D self.__parse_content(data, i, tagname, subnamesp=
aces)
if i is None:
return
if type(i) is type(res):
res =3D i
else:
res =3D endtag.match(data, i)
if res is None:
self.__error('end tag missing', data, i, self.base=
url, fatal =3D 0)
elif res.group('tagname') !=3D tagname:
self.__error("end tag doesn't match start tag", da=
ta, res.start('tagname'), self.baseurl, fatal =3D 0)
i =3D res.end(0)
self.finish_endtag(nstag)
matched =3D 1
res =3D endtag.match(data, i)
if res is not None:
if type(content) is type([]) and content and type(content[=
0]) is type({}):
self.__update_state(content, states, None)
if end not in states:
self.__error("content of element `%s' doesn't matc=
h content model" % ptagname, data, i, self.baseurl, fatal =3D 0)
return res
res =3D comment.match(data, i)
if res is not None:
c0, c1 =3D res.span('comment')
ires =3D illegal1.search(data, c0, c1)
if ires is not None:
self.__error('illegal characters in comment', data, ir=
es.start(0), self.baseurl, fatal =3D 0)
self.handle_comment(data[c0:c1])
i =3D res.end(0)
matched =3D 1
res =3D ref.match(data, i)
if res is not None:
name =3D res.group('name')
if name:
if self.entitydefs.has_key(name):
sval =3D val =3D self.entitydefs[name]
baseurl =3D self.baseurl
if type(val) is type(()):
if val[2] is not None:
apply(self.handle_ndata, val)
val =3D None
else:
val =3D self.__read_pentity(val[0], val[1]=
)
if val is not None:
del self.entitydefs[name] # to break recursion=
n =3D self.__parse_content(val, 0, ptagname, n=
amespaces, states)
self.entitydefs[name] =3D sval # restore value=
if val is not None:
if n is None:
self.baseurl =3D baseurl
return
if type(n) is type(res) or n !=3D len(val):
if type(n) is type(res):
n =3D res.start(0)
self.__error('misformed entity value', dat=
a, n, self.baseurl, fatal =3D 0)
self.baseurl =3D baseurl
else:
if self.docname:
self.__error("unknown entity reference `&%s;' =
in element `%s'" % (name, ptagname), data, i, self.baseurl, fatal =3D 0)
self.data =3D data
self.offset =3D res.start('name')
self.lineno =3D string.count(data, '\n', 0, self.o=
ffset)
self.unknown_entityref(name)
else:
str =3D self.__parse_charref(res.group('char'), data, =
res.start(0))
if str is None:
return
self.handle_data(str)
i =3D res.end(0)
matched =3D 1
res =3D pidecl.match(data, i)
if res is not None:
matched =3D 1
c0, c1 =3D res.span('data')
ires =3D illegal1.search(data, c0, c1)
if ires is not None:
self.__error('illegal characters in Processing Instruc=
tion', data, ires.start(0), self.baseurl, fatal =3D 0)
self.handle_proc(res.group('name'), res.group('data') or '=
')
i =3D res.end(0)
res =3D cdata.match(data, i)
if res is not None:
matched =3D 1
c0, c1 =3D res.span('cdata')
ires =3D illegal1.search(data, c0, c1)
if ires is not None:
self.__error('illegal characters in CDATA section', da=
ta, ires.start(0), self.baseurl, fatal =3D 0)
self.handle_cdata(res.group('cdata'))
i =3D res.end(0)
if not matched:
self.__error("no valid content in element `%s'" % ptagname=
, data, i, self.baseurl)
return
return i
def __check_attr(self, tagname, attrname, value, attributes, data, att=
rstart):
# check that the attribute attrname on element tagname is of
# the correct type with a legal value
# return the normalized value (i.e. white space collapsed if
# appropriate)
# XXX this method needs work to be complete
attype, atvalue, atstring =3D attributes[attrname]
if atvalue[:6] =3D=3D '#FIXED':
if value !=3D atstring:
self.__error("attribute `%s' in element `%s' does not have=
correct value" % (attrname, tagname), data, attrstart, self.baseurl, fata=
l =3D 0)
if attype =3D=3D 'CDATA':
return value # always OK and don't change value=
if type(attype) is type([]): # enumeration
if value not in attype:
self.__error("attribute `%s' in element `%s' not valid" % =
(attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
return value
if type(attype) is type(()):
if value not in attype[1]:
self.__error("attribute `%s' in element `%s' not valid" % =
(attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
return value
if attype =3D=3D 'ID':
if name.match(value) is None:
self.__error("attribute `%s' in element `%s' is not an ID"=
% (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
if self.ids.has_key(value):
self.__error("attrbute `%s' in element `%s' is not unique"=
% (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
self.ids[value] =3D 1
return value
if attype =3D=3D 'IDREF':
if name.match(value) is None:
self.__error("attrbute `%s' in element `%s' is not an IDRE=
F" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
# XXX should check ID exists
return value
if attype =3D=3D 'IDREFS':
if names.match(value) is None:
self.__error("attrbute `%s' in element `%s' is not an IDRE=
FS" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
# XXX should check IDs exist
return value
if attype =3D=3D 'NMTOKEN':
if nmtoken.match(value) is None:
self.__error("attrbute `%s' in element `%s' is not a NMTOK=
EN" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
return value
if attype =3D=3D 'NMTOKENS':
if nmtokens.match(value) is None:
self.__error("attrbute `%s' in element `%s' is not a NMTOK=
ENS" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
return value
if attype =3D=3D 'ENTITY':
if name.match(value) is None:
self.__error("attrbute `%s' in element `%s' is not an ENTI=
TY" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
# XXX should check ENTITY exists
return value
if attype =3D=3D 'ENTITIES':
if names.match(value) is None:
self.__error("attrbute `%s' in element `%s' is not an ENTI=
TIES" % (attrname, tagname), data, attrstart, self.baseurl, fatal =3D 0)
# XXX should check ENTITIES exist
return value
# XXX other types?
return value
def __parse_attrs(self, tagname, data, tagstart, span, namespaces):
# parse the string between the tag name and closing bracket
# for attribute=3Dvalue pairs
i, dataend =3D span
attrlist =3D []
namespace =3D None
reqattrs =3D {} # attributes that are #REQUIRED
if self.elems.has_key(tagname):
attributes =3D self.elems[tagname][1]
for key, (attype, atvalue, atstring) in attributes.items():
if atvalue =3D=3D '#REQUIRED':
reqattrs[key] =3D 1
attrseen =3D {} # attributes that we've seen
else:
attributes =3D None
while i < dataend:
res =3D attrfind.match(data, i, dataend)
if res is None:
# couldn't match any attributes, but there is more
# string to parse: complain and ignore rest of string
self.__error('bad attributes', data, i, self.baseurl, fata=
l =3D 0)
return
name =3D res.group('attrname')
if reqattrs.has_key(name):
del reqattrs[name] # seen this #REQUIRED attribute
if attributes is not None and attributes.has_key(name):
attype =3D attributes[name][0]
else:
attype =3D None
start, end =3D res.span('attrvalue')
value =3D self.__parse_attrval(data, attype, span =3D (start+1=
, end-1))
if value is None:
# bad attribute value: ignore, but continue parsing
i =3D res.end(0)
continue
attrstart =3D res.start('attrname')
if attributes is not None:
if attributes.has_key(name):
attrseen[name] =3D 1
value =3D self.__check_attr(tagname, name, value, attr=
ibutes, data, attrstart)
else:
self.__error("unknown attribute `%s' on element `%s'" =
% (name, tagname), data, attrstart, self.baseurl, fatal =3D 0)
i =3D res.end(0)
if self.__xmlns:
res =3D xmlns.match(name)
if res is not None:
# namespace declaration
ncname =3D res.group('ncname')
if namespace is None:
namespace =3D {}
namespace[ncname or ''] =3D value or None
continue
attrlist.append((name, value, attrstart))
if reqattrs:
# there are #REQUIRED attributes that we haven't seen
reqattrs =3D reqattrs.keys()
reqattrs.sort()
if len(reqattrs) > 1:
s =3D 's'
else:
s =3D ''
reqattrs =3D string.join(reqattrs, "', `")
self.__error("required attribute%s `%s' of element `%s' missin=
g" % (s, reqattrs, tagname), data, dataend, self.baseurl, fatal =3D 0)
if attributes is not None:
# fill in missing attributes that have a default value
for key, (attype, atvalue, atstring) in attributes.items():
if atstring is not None and not attrseen.has_key(key):
attrlist.append((key, atstring, dataend))
if namespace is not None:
namespaces =3D (namespace, namespaces)
if namespaces is not None:
res =3D qname.match(tagname)
if res is not None:
prefix, nstag =3D res.group('prefix', 'local')
if prefix is None: prefix =3D ''
ns =3D None
n =3D namespaces
while n is not None:
d, n =3D n
if d.has_key(prefix):
ns =3D d[prefix]
break
if ns is not None:
tagname =3D ns + ' ' + nstag
elif prefix !=3D '':
self.__error("unknown namespace prefix `%s'" % prefix,=
data, tagstart, self.baseurl, fatal =3D 0)
else:
self.__error("badly formed tag name `%s'" % tagname, data,=
tagstart, self.baseurl, fatal =3D 0)
attrdict =3D {} # collect attributes/values
for attr, value, attrstart in attrlist:
if namespaces is not None:
res =3D qname.match(attr)
if res is not None:
prefix, nsattr =3D res.group('prefix', 'local')
if prefix:
ans =3D None
n =3D namespaces
while n is not None:
d, n =3D n
if d.has_key(prefix):
ans =3D d[prefix]
break
if ans is not None:
attr =3D ans + ' ' + nsattr
elif prefix !=3D '':
self.__error("unknown namespace prefix `%s'" %=
prefix, data, attrstart, self.baseurl, fatal =3D 0)
else:
self.__error("badly formed attribute name `%s'" % attr=
, data, attrstart, self.baseurl, fatal =3D 0)
if attrdict.has_key(attr):
self.__error("duplicate attribute name `%s'" % attr, data,=
attrstart, self.baseurl, fatal =3D 0)
attrdict[attr] =3D value
return tagname, attrdict, namespaces
def __parse_attrval(self, data, attype, span =3D None):
# parse an attribute value, replacing entity and character
# references with their values
if span is None:
i =3D 0
dataend =3D len(data)
else:
i, dataend =3D span
res =3D illegal1.search(data, i, dataend)
if res is not None:
self.__error("illegal characters in attribute value", data, re=
s.start(0), self.baseurl, fatal =3D 0)
newval =3D []
while i < dataend:
res =3D interesting.search(data, i, dataend)
if res is None:
str =3D data[i:dataend]
if attype is None or attype =3D=3D 'CDATA':
str =3D self.__normalize_space(str)
newval.append(str)
break
j =3D res.start(0)
if data[j] =3D=3D '<':
self.__error("no `<' allowed in attribute value", data, j,=
self.baseurl, fatal =3D 0)
if j > i:
str =3D data[i:j]
if attype is None or attype =3D=3D 'CDATA':
str =3D self.__normalize_space(str)
newval.append(str)
res =3D ref.match(data, j, dataend)
if res is None:
self.__error('illegal attribute value', data, j, self.base=
url, fatal =3D 0)
newval.append(data[j]) # the &
i =3D j + 1 # continue searching after the &=
continue
i =3D res.end(0)
name =3D res.group('name')
if name:
# entity reference (e.g. "<")
if self.entitydefs.has_key(name):
val =3D self.entitydefs[name]
if type(val) is type(()):
self.__error("no external parsed entity allowed in=
attribute value", data, res.start(0), self.baseurl, fatal =3D 1)
del self.entitydefs[name]
nval =3D self.__parse_attrval(val, attype)
self.entitydefs[name] =3D val
if nval is None:
return
newval.append(nval)
else:
self.__error("reference to unknown entity `%s'" % name=
, data, res.start(0), self.baseurl, fatal =3D 0)
newval.append('&%s;' % name)
else:
val =3D self.__parse_charref(res.group('char'), data, res.=
start(0))
if val is None:
newval.append('&#%s;' % res.group('char'))
continue
newval.append(val)
str =3D string.join(newval, '')
if attype is not None and attype !=3D 'CDATA':
str =3D string.join(string.split(str))
return str
def __parse_charref(self, name, data, i):
# parse a character reference (e.g. "%#38;")
# the "name" arg is just part between # and ;
if name[0] =3D=3D 'x':
# e.g. &
n =3D int(name[1:], 16)
else:
# e.g. &
n =3D int(name)
try:
c =3D unichr(n)
except ValueError:
self.__error('bad character reference', data, i, self.baseurl,=
fatal =3D 0)
return
if illegal1.search(c):
self.__error('bad character reference', data, i, self.baseurl,=
fatal =3D 0)
return c
def __read_pentity(self, publit, syslit):
import urllib
syslit =3D urllib.basejoin(self.baseurl, syslit)
baseurl =3D self.baseurl
self.baseurl =3D syslit
val =3D self.read_external(publit, syslit)
val =3D self.__parse_textdecl(val)
return self.__normalize_linefeed(val)
def parse_dtd(self, data, internal =3D 1):
"""parse_dtd(data[, internal ])
Parse the DTD.
This method is called by the parse_doctype method and is
provided so that parse_doctype can be overridden.
Argument is a string containing the full DTD.
Optional argument internal is true (default) if the DTD is
internal.  Fills in self.elems, self.entitydefs,
self.pentitydefs and self.notation from the declarations
encountered."""
# i is the scan position in data; the loop below repeatedly tries
# each declaration type at position i until nothing matches.
i =3D 0
matched =3D 1
ilevel =3D 0 # nesting level of ignored secti=
ons
while i < len(data) and matched:
matched =3D 0
# --- parameter-entity reference: expand (reading an external
# parameter entity if needed) and parse its text recursively ---
res =3D peref.match(data, i)
if res is not None:
matched =3D 1
name =3D res.group('name')
if self.pentitydefs.has_key(name):
val =3D self.pentitydefs[name]
baseurl =3D self.baseurl
if type(val) is type(()):
val =3D self.__read_pentity(val[0], val[1])
self.parse_dtd(val, internal)
self.baseurl =3D baseurl
else:
self.__error("unknown entity `%%%s;'" % name, data, i,=
self.baseurl, fatal =3D 0)
i =3D res.end(0)
# --- <!ELEMENT ...> declaration: record the content model in
# self.elems[name] as (content, attdict, start, end, contentstr) ---
res =3D element.match(data, i)
if res is not None:
matched =3D 1
name, content =3D res.group('name', 'content')
i =3D res.end(0)
elemval =3D (None, {}, None, None, None)
if self.elems.has_key(name):
elemval =3D self.elems[name]
if elemval[0] is not None:
# XXX is this an error?
self.__error('non-unique element name declaration'=
, data, i, self.baseurl, fatal =3D 0)
elif content =3D=3D 'EMPTY':
# check for NOTATION on EMPTY element
for atname, (attype, atvalue, atstring) in elemval=
[1].items():
if type(attype) is type(()) and attype[0] =3D=
=3D 'NOTATION':
self.__error("NOTATION not allowed on EMPT=
Y element", data, i, self.baseurl)
if content[0] =3D=3D '(':
i =3D res.start('content')
j, content, start, end =3D self.__dfa(data, i)
if type(content) is type([]) and content and type(cont=
ent[0]) is type({}):
self.__check_dfa(content, start, name, data, i)
contentstr =3D data[i:j]
i =3D j
else:
contentstr =3D content
start =3D end =3D 0
self.elems[name] =3D (content, elemval[1], start, end, con=
tentstr)
res =3D space.match(data, i)
if res is not None:
i =3D res.end(0)
if data[i:i+1] !=3D '>':
self.__error('bad DOCTYPE', data, i, self.baseurl)
return
i =3D i+1
# --- <!ATTLIST ...> declaration: record each attribute definition
# in self.elems[elname][1] as atname -> (attype, atvalue, atstring) ---
res =3D attlist.match(data, i)
if res is not None:
matched =3D 1
elname, atdef =3D res.group('elname', 'atdef')
if not self.elems.has_key(elname):
self.elems[elname] =3D (None, {}, None, None, None)
ares =3D attdef.match(atdef)
while ares is not None:
atname, attype, atvalue, atstring =3D ares.group('atna=
me', 'attype', 'atvalue', 'atstring')
if attype[0] =3D=3D '(':
attype =3D map(string.strip, string.split(attype[1=
:-1], '|'))
elif attype[:8] =3D=3D 'NOTATION':
if self.elems[elname][0] =3D=3D 'EMPTY':
self.__error("NOTATION not allowed on EMPTY el=
ement", data, ares.start('attype'), self.baseurl)
atnot =3D map(string.strip, string.split(ares.grou=
p('notation'), '|'))
attype =3D ('NOTATION', atnot)
if atstring:
atstring =3D atstring[1:-1] # remove quotes
atstring =3D self.__parse_attrval(atstring, attype=
)
if attype !=3D 'CDATA':
atstring =3D string.join(string.split(atstring=
))
else:
atstring =3D string.join(string.split(atstring=
, '\t'), ' ')
if type(attype) is type([]):
if atstring is not None and atstring not in attype=
:
self.__error("default value for attribute `%s'=
on element `%s' not listed as possible value" % (atname, elname), data, i=
, self.baseurl)
elif type(attype) is type(()):
if atstring is not None and atstring not in attype=
[1]:
self.__error("default value for attribute `%s'=
on element `%s' not listed as possible value" % (atname, elname), data, i=
, self.baseurl)
if not self.elems[elname][1].has_key(atname):
# first definition counts
self.elems[elname][1][atname] =3D attype, atvalue,=
atstring
ares =3D attdef.match(atdef, ares.end(0))
i =3D res.end(0)
# --- <!ENTITY ...> declaration: parameter entities (pname set) go
# into self.pentitydefs, general entities into self.entitydefs;
# quoted values get nested entity/char references expanded inline,
# external identifiers are stored as (publit, syslit[, ndata]) tuples ---
res =3D entity.match(data, i)
if res is not None:
matched =3D 1
pname, name =3D res.group('pname', 'ename')
if pname:
pvalue =3D res.group('pvalue')
if pvalue[0] in ('"',"'"):
c0, c1 =3D res.span('pvalue')
ires =3D illegal1.search(data, c0+1, c1-1)
if ires is not None:
self.__error("illegal characters in entity val=
ue", data, ires.start(0), self.baseurl, fatal =3D 0)
if self.pentitydefs.has_key(pname):
# first definition counts
pass
elif pvalue[0] in ('"',"'"):
pvalue =3D pvalue[1:-1]
pvalue =3D self.__normalize_space(pvalue)
cres =3D entref.search(pvalue)
while cres is not None:
chr, nm =3D cres.group('char', 'pname')
if chr:
repl =3D self.__parse_charref(cres.group('=
char'), data, i)
elif self.pentitydefs.has_key(nm):
repl =3D self.pentitydefs[nm]
else:
self.__error("unknown entity `%s' referenc=
ed" % nm, data, i, self.baseurl)
repl =3D '%%%s;' % nm
if type(repl) is type(()):
baseurl =3D self.baseurl
repl =3D self.__read_pentity(repl[0], repl=
[1])
self.baseurl =3D baseurl
pvalue =3D pvalue[:cres.start(0)] + repl + pva=
lue[cres.end(0):]
cres =3D entref.search(pvalue, cres.start(0)+l=
en(repl))
self.pentitydefs[pname] =3D pvalue
else:
r =3D externalid.match(pvalue)
publit, syslit =3D r.group('publit', 'syslit')
if publit: publit =3D string.join(string.split(pub=
lit[1:-1]))
if syslit: syslit =3D syslit[1:-1]
self.pentitydefs[pname] =3D publit, syslit
else:
value =3D res.group('value')
if value[0] in ('"',"'"):
c0, c1 =3D res.span('value')
ires =3D illegal1.search(data, c0+1, c1-1)
if ires is not None:
self.__error("illegal characters in entity val=
ue", data, ires.start(0), self.baseurl, fatal =3D 0)
if self.entitydefs.has_key(name):
# use first definition
pass
elif value[0] in ('"',"'"):
value =3D value[1:-1]
value =3D self.__normalize_space(value)
cres =3D entref.search(value)
while cres is not None:
chr, nm =3D cres.group('char', 'pname')
if chr:
repl =3D self.__parse_charref(cres.group('=
char'), data, i)
elif self.pentitydefs.has_key(nm):
repl =3D self.pentitydefs[nm]
if type(repl) is type(()):
baseurl =3D self.baseurl
repl =3D self.__read_pentity(repl[0], =
repl[1])
self.baseurl =3D baseurl
else:
self.__error("unknown entity `%s' referenc=
ed" % nm, data, i, self.baseurl)
repl =3D '%%%s;' % nm
value =3D value[:cres.start(0)] + repl + value=
[cres.end(0):]
cres =3D entref.search(value, cres.start(0)+le=
n(repl))
self.entitydefs[name] =3D value
else:
r =3D externalid.match(value)
publit, syslit =3D r.group('publit', 'syslit')
if publit: publit =3D string.join(string.split(pub=
lit[1:-1]))
if syslit: syslit =3D syslit[1:-1]
r1 =3D ndata.match(value, r.end(0))
if r1 is not None:
ndataname =3D r1.group('name')
else:
ndataname =3D None
self.entitydefs[name] =3D publit, syslit, ndatanam=
e
i =3D res.end(0)
# --- <!NOTATION ...> declaration: first definition wins ---
res =3D notation.match(data, i)
if res is not None:
matched =3D 1
name, value =3D res.group('name', 'value')
if not self.notation.has_key(name):
self.notation[name] =3D value
i =3D res.end(0)
j =3D i # remember where we were
i =3D self.__parse_misc(data, i)
matched =3D matched or i > j # matched anything?
# --- external subset only: skip over an unrecognized declaration,
# tracking bracket nesting and quoting, expanding parameter
# entities in place at the lowest nesting level ---
if not internal:
if data[i:i+1] =3D=3D '<':
hlevel =3D 1
quote =3D None
j =3D i+1
while hlevel > 0:
res =3D bracket.search(data, j)
if res is None:
self.__error("unexpected EOF", data, i, self.b=
aseurl, fatal =3D 1)
j =3D res.end(0)
c =3D data[res.start(0)]
if c =3D=3D '<':
hlevel =3D hlevel + 1
elif quote and c =3D=3D quote:
quote =3D None
elif c in ('"', "'"):
quote =3D c
elif c =3D=3D '>':
hlevel =3D hlevel - 1
elif hlevel =3D=3D 1 and not quote:
# only expand parsed entities at lowest level
res =3D peref.match(data, res.start(0))
if res is not None:
pname =3D res.group('name')
if self.pentitydefs.has_key(pname):
repl =3D self.pentitydefs[pname]
if type(repl) is type(()):
baseurl =3D self.baseurl
repl =3D self.__read_pentity(repl[=
0], repl[1])
self.baseurl =3D baseurl
data =3D data[:res.start(0)] + ' ' + r=
epl + ' ' + data[res.end(0):]
j =3D res.start(0) + len(repl) + 2
else:
j =3D res.end(0)
# --- conditional sections <![INCLUDE[ ... / <![IGNORE[ ... ---
res =3D conditional.match(data, i)
if res is not None:
inc, ign =3D res.group('inc', 'ign')
i =3D res.end(0)
if ign:
level =3D 1
while level > 0:
res =3D ignore.search(data, i)
# NOTE(review): res.start(0) is an int, so comparing it
# to '<' is always false and level only ever decreases;
# this looks like it should be data[res.start(0)] =3D=3D '<'
# -- confirm against the upstream fxmllib.py
if res.start(0) =3D=3D '<':
level =3D level + 1
else:
level =3D level - 1
i =3D res.end(0)
elif inc:
ilevel =3D ilevel + 1
if ilevel and data[i:i+3] =3D=3D ']]>':
i =3D i+3
ilevel =3D ilevel - 1
# anything left over that no declaration matched is a syntax error
if i < len(data):
self.__error('error while parsing DOCTYPE', data, i, self.base=
url)
def __dfa(self, data, i):
    """Parse an element content model starting at data[i].

    Returns a tuple (end position, content, start state, end state).
    For mixed content the content is either the string '#PCDATA'
    (for "(#PCDATA)") or a list of element names (for
    "(#PCDATA|a|b)*"); the states are both 0.  For children
    content, content is the state-transition list built by __dfa1.
    """
    res = mixedre.match(data, i)
    if res is not None:
        # mixed content model
        model = res.group(0)
        if model[-1] == '*':
            # "(#PCDATA|a|b)*" -> list of alternatives
            model = map(string.strip, string.split(model[1:-2], '|'))
        else:
            model = '#PCDATA'
        return res.end(0), model, 0, 0
    # children content model: build the automaton
    dfa = []
    i, start, end = self.__dfa1(data, i, dfa)
    return i, dfa, start, end
def __dfa1(self, data, i, dfa):
# Recursively parse one content particle starting at data[i] and
# append its states to `dfa'.  Each state is a dict mapping an
# element name to a list of successor state indices; the key ''
# holds epsilon (empty) moves.  Returns (new position, start
# state index, end state index) for the particle just parsed.
# NOTE(review): the message below says "expecter" -- a typo for
# "expected"; the runtime string is left untouched here.
res =3D dfaelem0.match(data, i)
if res is None:
self.__error("syntax error in element content: `(' or Name exp=
ecter", data, i, self.baseurl, fatal =3D 1)
token =3D res.group('token')
# a particle is either a parenthesized group or a single Name
if token =3D=3D '(':
i, start, end =3D self.__dfa1(data, res.end(0), dfa)
res =3D dfaelem1.match(data, i)
if res is None:
self.__error("syntax error in element content: `)', `|', o=
r `,' expected", data, i, self.baseurl, fatal =3D 1)
token =3D res.group('token')
sep =3D token
# the first separator seen (`,' or `|') must be used throughout
# the group; mixing separators is an error
while token in (',','|'):
if sep !=3D token:
self.__error("syntax error in element content: `%s' or=
`)' expected" % sep, data, i, self.baseurl, fatal =3D 1)
i, nstart, nend =3D self.__dfa1(data, res.end(0), dfa)
res =3D dfaelem1.match(data, i)
if res is None:
self.__error("syntax error in element content: `%s' or=
`)' expected" % sep, data, i, self.baseurl, fatal =3D 1)
token =3D res.group('token')
if sep =3D=3D ',':
# concatenate DFAs
e =3D dfa[end].get('', [])
e.append(nstart)
dfa[end][''] =3D e
end =3D nend
else:
# make parallel
s =3D len(dfa)
dfa.append({'': [start, nstart]})
e =3D dfa[end].get('', [])
e.append(len(dfa))
dfa[end][''] =3D e
e =3D dfa[nend].get('', [])
e.append(len(dfa))
dfa[nend][''] =3D e
start =3D s
end =3D len(dfa)
dfa.append({})
# token =3D=3D ')'
i =3D res.end(0)
else:
# it's a Name
start =3D len(dfa)
dfa.append({token: [start+1]})
end =3D len(dfa)
dfa.append({})
i =3D res.end(0)
# optional occurrence indicator `?', `*' or `+' after the particle:
# add epsilon moves for repetition (`*', `+') and/or skipping (`?', `*')
res =3D dfaelem2.match(data, i)
if res is not None:
token =3D res.group('token')
s =3D len(dfa)
e =3D s+1
if token =3D=3D '+':
dfa.append({'': [start]})
else:
dfa.append({'': [start, e]})
dfa.append({})
l =3D dfa[end].get('', [])
dfa[end][''] =3D l
if token !=3D '?':
l.append(start)
l.append(e)
start =3D s
end =3D e
i =3D res.end(0)
return i, start, end
def parse_doctype(self, tag, publit, syslit, data):
    """parse_doctype(tag, publit, syslit, data)

    Parse the DOCTYPE declaration.  Called by the handle_doctype
    callback, and provided separately so handle_doctype can be
    overridden.  Arguments:
    tag: name of the outermost element of the document;
    publit: Public Identifier of the DTD (or None);
    syslit: System Literal of the DTD (or None);
    data: internal subset of the DTD (or None).
    """
    # internal subset first, so its definitions take precedence
    if data:
        self.parse_dtd(data)
    if syslit:
        import urllib
        # fetch and parse the external subset, temporarily making
        # its URL the base for relative references
        syslit = urllib.basejoin(self.baseurl, syslit)
        saved = self.baseurl
        self.baseurl = syslit
        ext = self.read_external(publit, syslit)
        ext = self.__parse_textdecl(ext)
        ext = self.__normalize_linefeed(ext)
        self.parse_dtd(ext, 0)
        self.baseurl = saved
def __error(self, message, data = None, i = None, filename = None,
            fatal = 1):
    """Report a syntax error found at offset `i' in `data'.

    Records position information on self (lineno, data, offset,
    filename), then either raises Error (when `fatal' is true) or
    calls self.syntax_error, which may be overridden.
    """
    if data is not None and i is not None:
        self.lineno = lineno = string.count(data, '\n', 0, i) + 1
    else:
        # bug fix: the local `lineno' must be bound here too,
        # otherwise the raise below hits a NameError whenever a
        # fatal error is reported without position information
        self.lineno = lineno = None
    self.data = data
    self.offset = i
    self.filename = filename
    if fatal:
        raise Error(message, lineno, data, i, filename)
    self.syntax_error(message)
# Overridable -- handle the <?xml ...?> declaration
def handle_xml(self, encoding, standalone):
    """Called with the encoding and standalone values from the XML
    declaration; the default implementation ignores them."""
    pass
# Overridable -- handle DOCTYPE
def handle_doctype(self, tag, publit, syslit, data):
    """Default DOCTYPE callback: honour a forced DTD (self.doctype)
    as the system literal, then delegate to parse_doctype."""
    forced = self.doctype
    if forced is not None:
        syslit = forced
    self.parse_doctype(tag, publit, syslit, data)
# Example -- read an external entity referenced from the DTD with a
# SystemLiteral; the default implementation ignores it entirely
def read_external(self, publit, syslit):
    """Return the text of the external entity; default is empty."""
    return ''
# Example -- handle a comment, could be overridden
def handle_comment(self, data):
    """Called with the text of each comment; default does nothing."""
    pass
# Example -- handle a processing instruction, could be overridden
def handle_proc(self, name, data):
    """Called with the target name and data of each processing
    instruction; default does nothing."""
    pass
# Example -- handle character data, should be overridden
def handle_data(self, data):
    """Called with runs of character data; default does nothing."""
    pass
# Example -- handle a CDATA section, should be overridden
def handle_cdata(self, data):
    """Called with the contents of CDATA sections; default does nothing."""
    pass
# Mapping from element tag name to a (start_handler, end_handler)
# pair; subclasses fill this in to get per-element dispatch from
# finish_starttag/finish_endtag (a None handler falls back to the
# unknown_* methods).
elements =3D {} # dict: tagname -> (startfunc, e=
ndfunc)
def finish_starttag(self, tagname, attrs):
    """Dispatch a completed start tag: use the registered handler
    from self.elements when there is one, otherwise fall back to
    unknown_starttag."""
    handler = self.elements.get(tagname, (None, None))[0]
    if handler is not None:
        self.handle_starttag(tagname, handler, attrs)
    else:
        self.unknown_starttag(tagname, attrs)
def finish_endtag(self, tagname):
    """Dispatch a completed end tag: use the registered handler
    from self.elements when there is one, otherwise fall back to
    unknown_endtag."""
    handler = self.elements.get(tagname, (None, None))[1]
    if handler is not None:
        self.handle_endtag(tagname, handler)
    else:
        self.unknown_endtag(tagname)
# Overridable -- handle a start tag with a registered handler
def handle_starttag(self, tagname, method, attrs):
    """Invoke `method' with the tag name and attribute dict."""
    method(tagname, attrs)
# Overridable -- handle an end tag with a registered handler
def handle_endtag(self, tagname, method):
    """Invoke `method' with the tag name."""
    method(tagname)
# To be overridden -- handlers for unknown objects
def unknown_starttag(self, tagname, attrs):
    """Called for a start tag with no registered handler; default
    does nothing."""
    pass
def unknown_endtag(self, tagname):
    """Called for an end tag with no registered handler; default
    does nothing."""
    pass
def unknown_entityref(self, name):
    """Report a reference to an undefined general entity at the
    position recorded by the last error bookkeeping."""
    self.__error('reference to unknown entity',
                 self.data, self.offset, self.baseurl)
# Example -- handle relatively harmless syntax errors, could be overridden
def syntax_error(self, message):
    """Default handling of non-fatal syntax errors: escalate to a
    fatal Error carrying the recorded position information."""
    raise Error(message, self.lineno, self.data, self.offset,
                self.filename)
class TestXMLParser(XMLParser):
def __init__(self, xmlns =3D 1):
self.testdata =3D ""
XMLParser.__init__(self, xmlns)
def handle_xml(self, encoding, standalone):
self.flush()
print 'xml: encoding =3D %s standalone =3D %s' % (encoding, standa=
lone)
def read_external(self, publit, syslit):
print 'reading %s' % name
try:
import urllib
u =3D urllib.urlopen(syslit)
data =3D u.read()
u.close()
except 'x':
return ''
return data
def handle_doctype(self, tag, publit, syslit, data):
self.flush()
print 'DOCTYPE: %s %s' % (tag, `data`)
XMLParser.handle_doctype(self, tag, publit, syslit, data)
def handle_comment(self, data):
self.flush()
r =3D `data`
if len(r) > 68:
r =3D r[:32] + '...' + r[-32:]
print 'comment: %s' % r
def handle_proc(self, name, data):
self.flush()
print 'processing: %s %s' % (name,`data`)
def handle_data(self, data):
self.testdata =3D self.testdata + data
if len(`self.testdata`) >=3D 70:
self.flush()
def handle_cdata(self, data):
self.flush()
print 'cdata: %s' % `data`
def flush(self):
data =3D self.testdata
if data:
self.testdata =3D ""
print 'data: %s ' % `data`
## def syntax_error(self, message):
## if self.lineno is not None:
## print 'Syntax error at line %d: %s' % (self.lineno, message)=
## else:
## print 'Syntax error: %s' % message
def unknown_starttag(self, tag, attrs):
self.flush()
if not attrs:
print 'start tag: <%s>' % tag
else:
print 'start tag: <%s' % tag,
for name, value in attrs.items():
print '%s =3D "%s"' % (name.encode('latin-1'), `value`),
print '>'
def unknown_endtag(self, tag):
self.flush()
print 'end tag: </%s>' % tag
def unknown_entityref(self, name):
self.flush()
print '&%s;' % name
class CanonXMLParser(XMLParser):
__cache =3D {}
def read_external(self, publit, syslit):
if publit and self.__cache.has_key(publit):
return self.__cache[publit]
try:
import urllib
u =3D urllib.urlopen(syslit)
data =3D u.read()
u.close()
except 'x':
return ''
if publit:
self.__cache[publit] =3D data
return data
def handle_data(self, data):
sys.stdout.write(self.encode(data))
def handle_cdata(self, data):
sys.stdout.write(self.encode(data))
def handle_proc(self, name, data):
sys.stdout.write('<?%s %s?>' % (name.encode('utf-8'), data.strip()=
.encode('utf-8')))
def unknown_starttag(self, tag, attrs):
sys.stdout.write('<%s' % tag.encode('utf-8'))
attrlist =3D attrs.items()
attrlist.sort()
for name, value in attrlist:
sys.stdout.write(' %s=3D"%s"' % (name.encode('utf-8'), self.en=
code(value)))
sys.stdout.write('>')
def unknown_endtag(self, tag):
sys.stdout.write('</%s>' % tag.encode('utf-8'))
def unknown_entityref(self, name):
print '&%s;' % name.encode('utf-8')
def encode(self, data):
for c, tr in [('&', '&'),
('>', '>'),
('<', '<'),
('"', '"'),
('\t', '	'),
('\n', ' '),
('\r', ' ')]:
data =3D tr.join(data.split(c))
return data.encode('utf-8')
class CheckXMLParser(XMLParser):
    """XMLParser subclass used for validation only: it fetches
    external entities (with caching) but produces no output."""

    __cache = {}                # class-wide cache: publit -> entity text

    def read_external(self, publit, syslit):
        """Fetch an external entity, caching it by public identifier."""
        if publit and self.__cache.has_key(publit):
            return self.__cache[publit]
        try:
            import urllib
            u = urllib.urlopen(syslit)
            data = u.read()
            u.close()
        except IOError:
            # bug fix: was `except 'x':' -- a string exception that
            # urlopen never raises; urllib signals errors with IOError
            return ''
        if publit:
            self.__cache[publit] = data
        return data
def test(args =3D None):
import sys, getopt
if not args:
args =3D sys.argv[1:]
opts, args =3D getopt.getopt(args, 'cstnvCd:')
klass =3D TestXMLParser
do_time =3D 0
namespace =3D 1
verbose =3D 0
doctype =3D None
for o, a in opts:
if o =3D=3D '-c':
klass =3D CanonXMLParser
elif o =3D=3D '-C':
klass =3D CheckXMLParser
elif o =3D=3D '-s':
klass =3D XMLParser
elif o =3D=3D '-t':
do_time =3D 1
elif o =3D=3D '-n':
namespace =3D 0
elif o =3D=3D '-v':
verbose =3D 1
elif o =3D=3D '-d':
doctype =3D a
if not args:
args =3D ['test.xml']
for file in args:
if file =3D=3D '-':
f =3D sys.stdin
url =3D '.'
else:
try:
f =3D open(file, 'r')
except IOError, msg:
print file, ":", msg
sys.exit(1)
import urllib
url =3D urllib.pathname2url(file)
data =3D f.read()
if f is not sys.stdin:
f.close()
x =3D klass(xmlns =3D namespace)
x.baseurl =3D url
x.doctype =3D doctype
if verbose:
print '=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D',file
try:
t0, t1, t2 =3D x.parse(data)
except Error, info:
do_time =3D 0 # can't print times now
print str(info)
if info.text is not None and info.offset is not None:
i =3D string.rfind(info.text, '\n', 0, info.offset) + 1
j =3D string.find(info.text, '\n', info.offset)
if j =3D=3D -1: j =3D len(info.text)
try:
print info.text[i:j]
except UnicodeError:
print `info.text[i:j]`
else:
print ' '*(info.offset-i)+'^'
if klass is CanonXMLParser and (verbose or len(args) > 1):
sys.stdout.write('\n')
if do_time:
print 'total time: %g' % (t2-t0)
print 'parse DTD: %g' % (t1-t0)
print 'parse body: %g' %(t2-t1)
# script entry point
if __name__ == '__main__':
    test()
------- =_aaaaaaaaaa0--