[Spambayes-checkins] spambayes/spambayes PyMeldLite.py,1.4,1.5

Thu Jan 23 10:28:19 EST 2003

Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv29471

Modified Files:
	PyMeldLite.py 
Log Message:
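Uses expat rather than xmllib when running under Python 2.3 or later
(expat is now included in the standard distribution, and xmllib is
deprecated from 2.3); xmllib is still used on earlier versions.
Improved the bad-XML-characters code to write high characters
(\x80-\xff) as numeric charrefs (e.g. &#150;) rather than replacing
them with '?'.  Meaningless low control characters are still replaced
with '?'.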

Index: PyMeldLite.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/PyMeldLite.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** PyMeldLite.py	22 Jan 2003 18:29:11 -0000	1.4
--- PyMeldLite.py	23 Jan 2003 18:28:15 -0000	1.5
***************
*** 195,209 ****

  # Entrian.Coverage: Pragma Stop
! try:
!     # XXX Take this seriously before 2.4 comes out...
!     import warnings
!     warnings.filterwarnings(action='ignore',
!                             message='.*xmllib',
!                             category=DeprecationWarning)
! except ImportError:
!     pass
! 
! import re, xmllib
! 
  try:
      True, False, bool
--- 195,199 ----

  # Entrian.Coverage: Pragma Stop
! import sys, re, string
  try:
      True, False, bool
***************
*** 223,230 ****
  nonSelfClose = {'textarea': None}

! # Map characters not allowed in XML content to '?'
! import string
! badxml_chars = ''.join([chr(c) for c in range(0, 32) + range(128, 160)
!                                if c not in [9, 10, 13]])
  badxml_map = string.maketrans(badxml_chars, '?' * len(badxml_chars))

--- 213,222 ----
  nonSelfClose = {'textarea': None}

! # Map high characters to charrefs.
! def replaceHighCharacters(match):
!     return "&#%d;" % ord(match.group(1))
! 
! # Map meaningless low characters to '?'
! badxml_chars = ''.join([chr(c) for c in range(0, 32) if c not in [9, 10, 13]])
  badxml_map = string.maketrans(badxml_chars, '?' * len(badxml_chars))

***************
*** 359,454 ****

! class _TreeGenerator(xmllib.XMLParser):
!     """An XML parser that generates a lightweight DOM tree.  Call `feed()`
!     with XML source, then `close()`, then `getTree()` will give you the
!     tree's `_RootNode`:

!     >>> g = _TreeGenerator()
!     >>> g.feed("<xml>Stuff. ")
!     >>> g.feed("More stuff.</xml>")
!     >>> g.close()
!     >>> tree = g.getTree()
!     >>> print tree.toText()
!     <xml>Stuff. More stuff.</xml>
!     """

!     def __init__(self):
!         xmllib.XMLParser.__init__(self, translate_attribute_references=False)
!         self.entitydefs = {}    # entitydefs is an xmllib.XMLParser attribute.
!         self._tree = _RootNode()
!         self._currentNode = self._tree
!         self._pendingText = []

!     def getTree(self):
!         """Returns the generated tree; call `feed()` then `close()` first."""
!         return self._tree

!     def _collapsePendingText(self):
!         """Text (any content that isn't an open/close element) is built up
!         in `self._pendingText` until an open/close element is seen, at which
!         point it gets collapsed into a `_TextNode`."""

!         data = ''.join(self._pendingText)
!         self._currentNode.children.append(_TextNode(data))
!         self._pendingText = []

!     def handle_xml(self, encoding, standalone):
!         xml = '<?xml version="1.0"'
!         if encoding:
!             xml += ' encoding="%s"' % encoding
!         if standalone:
!             xml += ' standalone="%s"' % standalone
!         xml += '?>'
!         self._pendingText.append(xml)

!     def handle_doctype(self, tag, pubid, syslit, data):
!         doctype = '<!DOCTYPE %s' % tag
!         if pubid:
!             doctype += ' PUBLIC "%s"' % pubid
!         elif syslit:
!             doctype += ' SYSTEM'
!         if syslit:
!             doctype += ' "%s"' % syslit
!         if data:
!             doctype += ' [%s]>' % data
!         else:
!             doctype += '>'
!         self._pendingText.append(doctype)

!     def handle_comment(self, data):
!         self._pendingText.append('<!--%s-->' % data)

!     def handle_proc(self, name, data):
!         self._pendingText.append('<?%s %s ?>' % (name, data.strip()))

!     def handle_data(self, data):
!         self._pendingText.append(data)

!     def handle_charref(self, ref):
!         self._pendingText.append('&#%s;' % ref)

!     unknown_charref = handle_charref

!     def handle_entityref(self, ref):
!         self._pendingText.append('&%s;' % ref)

!     unknown_entityref = handle_entityref

!     def handle_cdata(self, data):
!         if self._pendingText:
!             self._collapsePendingText()
!         self._pendingText.append('<![CDATA[%s]]>' % data)

!     def unknown_starttag(self, tag, attributes):
!         if self._pendingText:
!             self._collapsePendingText()
!         newNode = _ElementNode(self._currentNode, tag, attributes)
!         self._currentNode.children.append(newNode)
!         self._currentNode = newNode

!     def unknown_endtag(self, tag):
!         if self._pendingText:
!             self._collapsePendingText()
!         self._currentNode = self._currentNode.parent

--- 351,540 ----

! # For XML parsing we use xmllib in versions prior to 2.3, because we can't
! # be sure that expat will be there, or that it will be a decent version.
! # We use expat in versions 2.3 and above, because we can be sure it will
! # be there and xmllib is deprecated from 2.3.

! # The slightly odd Entrian.Coverage pragmas in this section make sure that
! # whichever branch is taken, we get code coverage for that branch and no
! # coverage failures for the other.
! if sys.hexversion >> 16 < 0x203:
!     # Entrian.Coverage: Pragma Stop
!     import xmllib
!     class _TreeGenerator(xmllib.XMLParser):
!         # Entrian.Coverage: Pragma Start
!         """An XML parser that generates a lightweight DOM tree.  Call `feed()`
!         with XML source, then `close()`, then `getTree()` will give you the
!         tree's `_RootNode`:

!         >>> g = _TreeGenerator()
!         >>> g.feed("<xml>Stuff. ")
!         >>> g.feed("More stuff.</xml>")
!         >>> g.close()
!         >>> tree = g.getTree()
!         >>> print tree.toText()
!         <xml>Stuff. More stuff.</xml>
!         """

!         def __init__(self):
!             xmllib.XMLParser.__init__(self,
!                                       translate_attribute_references=False)
!             self.entitydefs = {}    # This is an xmllib.XMLParser attribute.
!             self._tree = _RootNode()
!             self._currentNode = self._tree
!             self._pendingText = []

!         def getTree(self):
!             """Returns the generated tree; call `feed` then `close` first."""
!             return self._tree

!         def _collapsePendingText(self):
!             """Text (any content that isn't an open/close element) is built up
!             in `self._pendingText` until an open/close element is seen, at
!             which point it gets collapsed into a `_TextNode`."""

!             data = ''.join(self._pendingText)
!             self._currentNode.children.append(_TextNode(data))
!             self._pendingText = []

!         def handle_xml(self, encoding, standalone):
!             xml = '<?xml version="1.0"'
!             if encoding:
!                 xml += ' encoding="%s"' % encoding
!             if standalone:
!                 xml += ' standalone="%s"' % standalone
!             xml += '?>'
!             self._pendingText.append(xml)

!         def handle_doctype(self, tag, pubid, syslit, data):
!             doctype = '<!DOCTYPE %s' % tag
!             if pubid:
!                 doctype += ' PUBLIC "%s"' % pubid
!             elif syslit:
!                 doctype += ' SYSTEM'
!             if syslit:
!                 doctype += ' "%s"' % syslit
!             if data:
!                 doctype += ' [%s]>' % data
!             else:
!                 doctype += '>'
!             self._pendingText.append(doctype)

!         def handle_comment(self, data):
!             self._pendingText.append('<!--%s-->' % data)

!         def handle_proc(self, name, data):
!             self._pendingText.append('<?%s %s ?>' % (name, data.strip()))

!         def handle_data(self, data):
!             self._pendingText.append(data)

!         def handle_charref(self, ref):
!             self._pendingText.append('&#%s;' % ref)

!         unknown_charref = handle_charref

!         def handle_entityref(self, ref):
!             self._pendingText.append('&%s;' % ref)

!         unknown_entityref = handle_entityref

!         def handle_cdata(self, data):
!             if self._pendingText:
!                 self._collapsePendingText()
!             self._pendingText.append('<![CDATA[%s]]>' % data)

!         def unknown_starttag(self, tag, attributes):
!             if self._pendingText:
!                 self._collapsePendingText()
!             newNode = _ElementNode(self._currentNode, tag, attributes)
!             self._currentNode.children.append(newNode)
!             self._currentNode = newNode
! 
!         def unknown_endtag(self, tag):
!             if self._pendingText:
!                 self._collapsePendingText()
!             self._currentNode = self._currentNode.parent
! 
! else:
!     # Entrian.Coverage: Pragma Stop
!     import xml.parsers.expat
!     class _TreeGenerator:
!         # Entrian.Coverage: Pragma Start
!         """An XML parser that generates a lightweight DOM tree.  Call `feed()`
!         with XML source, then `close()`, then `getTree()` will give you the
!         tree's `_RootNode`:
! 
!         >>> g = _TreeGenerator()
!         >>> g.feed("<xml>Stuff. ")
!         >>> g.feed("More stuff.</xml>")
!         >>> g.close()
!         >>> tree = g.getTree()
!         >>> print tree.toText()
!         <xml>Stuff. More stuff.</xml>
!         """
! 
!         def __init__(self):
!             self._tree = _RootNode()
!             self._currentNode = self._tree
!             self._pendingText = []
!             self._parser = xml.parsers.expat.ParserCreate()
!             self._parser.buffer_text = True
!             self._parser.DefaultHandler = self.DefaultHandler
!             self._parser.StartElementHandler = self.StartElementHandler
!             self._parser.EndElementHandler = self.EndElementHandler
! 
!         # All entities and charrefs, like &bull; and &#160;, are considered
!         # valid - who are we to argue?  Expat thinks it knows better, so we
!         # fool it here.
!         def _mungeEntities(self, data):
!             return re.sub(r'&(\w+);', r':PyMeldEntity:\1:', data)
! 
!         def _unmungeEntities(self, data):
!             return re.sub(r':PyMeldEntity:(\w+):', r'&\1;', data)
! 
!         def feed(self, data):
!             """Call this with XML content to be parsed."""
!             data = self._mungeEntities(data)
!             self._parser.Parse(data)
! 
!         def close(self):
!             """Call this when you've passed all your XML content to `feed`."""
!             self._parser.Parse("", True)
! 
!         def getTree(self):
!             """Returns the generated tree; call `feed` then `close` first."""
!             return self._tree
! 
!         def _collapsePendingText(self):
!             """Text (any content that isn't an open/close element) is built up
!             in `self._pendingText` until an open/close element is seen, at
!             which point it gets collapsed into a `_TextNode`."""
! 
!             data = ''.join(self._pendingText)
!             data = self._unmungeEntities(data)
!             self._currentNode.children.append(_TextNode(data))
!             self._pendingText = []
! 
!         def DefaultHandler(self, data):
!             """Expat handler."""
!             self._pendingText.append(str(data))
! 
!         def StartElementHandler(self, tag, attributes):
!             """Expat handler."""
!             if self._pendingText:
!                 self._collapsePendingText()
!             newAttributes = {}
!             for name, value in attributes.iteritems():
!                 newAttributes[str(name)] = self._unmungeEntities(str(value))
!             newNode = _ElementNode(self._currentNode, str(tag), newAttributes)
!             self._currentNode.children.append(newNode)
!             self._currentNode = newNode
! 
!         def EndElementHandler(self, tag):
!             """Expat handler."""
!             if self._pendingText:
!                 self._collapsePendingText()
!             self._currentNode = self._currentNode.parent

***************
*** 480,485 ****
                   source[match.end(1):]

!     # Map characters not allowed in XML content to '?'
      source = source.translate(badxml_map)

      # Parse the XML and generate the tree.
--- 566,572 ----
                   source[match.end(1):]

!     # Map characters not allowed in XML content to sensible things.
      source = source.translate(badxml_map)
+     source = re.sub('([\x80-\xff])', replaceHighCharacters, source)

      # Parse the XML and generate the tree.
***************
*** 889,897 ****

  'XML proc': """
! >>> print Meld('''<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
  ... <?codewarrior exportversion="1.0.1" ideversion="4.2" ?>
  ... <!DOCTYPE PROJECT [
  ... <!ELEMENT PROJECT (TARGETLIST, TARGETORDER, GROUPLIST, DESIGNLIST?)>
- ... (...etc...)
  ... ]>
  ... <PROJECT>Stuff</PROJECT>''')
--- 976,983 ----

  'XML proc': """
! >>> print Meld('''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
  ... <?codewarrior exportversion="1.0.1" ideversion="4.2" ?>
  ... <!DOCTYPE PROJECT [
  ... <!ELEMENT PROJECT (TARGETLIST, TARGETORDER, GROUPLIST, DESIGNLIST?)>
  ... ]>
  ... <PROJECT>Stuff</PROJECT>''')
***************
*** 900,904 ****
  <!DOCTYPE PROJECT [
  <!ELEMENT PROJECT (TARGETLIST, TARGETORDER, GROUPLIST, DESIGNLIST?)>
- (...etc...)
  ]>
  <PROJECT>Stuff</PROJECT>
--- 986,989 ----
***************
*** 913,923 ****
  'entities and charrefs': """
  >>> page = Meld('''<html><body>&bull; This "and&#160;that"...
! ... <span id="s" title="&quot;Quoted&quot; & Not">x</span></body></html>''')
  >>> print page.s.title
  "Quoted" & Not
- >>> page.s.title = page.s.title     # Accept liberally, produce strictly.
- >>> print page
- <html><body>&bull; This "and&#160;that"...
- <span id="s" title="&quot;Quoted&quot; &amp; Not">x</span></body></html>
  >>> page.s.title = page.s.title + " <>"
  >>> print page.s.title
--- 998,1004 ----
  'entities and charrefs': """
  >>> page = Meld('''<html><body>&bull; This "and&#160;that"...
! ... <span id="s" title="&quot;Quoted&quot; &amp; Not">x</span></body></html>''')
  >>> print page.s.title
  "Quoted" & Not
  >>> page.s.title = page.s.title + " <>"
  >>> print page.s.title
***************
*** 1068,1076 ****
  'bad XML characters': """
  >>> page = Meld('''<x>
! ... Valentines Day Special \x96 2 bikinis for the price of one
  ... </x>''')    # No exception.
  >>> print page
  <x>
! Valentines Day Special ? 2 bikinis for the price of one
  </x>
  """
--- 1149,1157 ----
  'bad XML characters': """
  >>> page = Meld('''<x>
! ... Valentines Day Special \x96 2 bikinis for the price of one \x01
  ... </x>''')    # No exception.
  >>> print page
  <x>
! Valentines Day Special &#150; 2 bikinis for the price of one ?
  </x>
  """