[Python-checkins] CVS: python/dist/src/Lib HTMLParser.py,1.3,1.4

Fred L. Drake fdrake@users.sourceforge.net
Fri, 03 Aug 2001 12:51:01 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv30083

Modified Files:
	HTMLParser.py 
Log Message:

Change some comments into docstrings.

Fix handling of hexadecimal character references such as "&#x41;" (legal
in XHTML) so that they are recognized as character references and passed
to handle_charref().
This fixes SF bug #445196.


Index: HTMLParser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/HTMLParser.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** HTMLParser.py	2001/05/23 04:53:44	1.3
--- HTMLParser.py	2001/08/03 19:50:59	1.4
***************
*** 1,3 ****
! """A parser for HTML."""
  
  # This file is based on sgmllib.py, but the API is slightly different.
--- 1,3 ----
! """A parser for HTML and XHTML."""
  
  # This file is based on sgmllib.py, but the API is slightly different.
***************
*** 19,23 ****
  
  entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
! charref = re.compile('&#([0-9]+)[^0-9]')
  
  starttagopen = re.compile('<[a-zA-Z]')
--- 19,23 ----
  
  entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
! charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  
  starttagopen = re.compile('<[a-zA-Z]')
***************
*** 74,103 ****
  
  
! # HTML parser class -- find tags and call handler functions.
! # Usage:
! #
! #     p = HTMLParser(); p.feed(data); ...; p.close()
  
! # Start tags are handled by calling self.handle_starttag() or
! # self.handle_startendtag(); end tags by self.handle_endtag().  The
! # data between tags is passed from the parser to the derived class by
! # calling self.handle_data() with the data as argument (the data may
! # be split up in arbitrary chunks).  Entity references are passed by
! # calling self.handle_entityref() with the entity reference as the
! # argument.  Numeric character references are passed to
! # self.handle_charref() with the string containing the reference as
! # the argument.
  
! class HTMLParser:
  
      CDATA_CONTENT_ELEMENTS = ("script", "style")
  
  
-     # Interface -- initialize and reset this instance
      def __init__(self):
          self.reset()
  
-     # Interface -- reset this instance.  Loses all unprocessed data
      def reset(self):
          self.rawdata = ''
          self.stack = []
--- 74,106 ----
  
  
! class HTMLParser:
!     """Find tags and other markup and call handler functions.
  
!     Usage:
!         p = HTMLParser()
!         p.feed(data)
!         ...
!         p.close()
  
!     Start tags are handled by calling self.handle_starttag() or
!     self.handle_startendtag(); end tags by self.handle_endtag().  The
!     data between tags is passed from the parser to the derived class
!     by calling self.handle_data() with the data as argument (the data
!     may be split up in arbitrary chunks).  Entity references are
!     passed by calling self.handle_entityref() with the entity
!     reference as the argument.  Numeric character references are
!     passed to self.handle_charref() with the string containing the
!     reference as the argument.
!     """
  
      CDATA_CONTENT_ELEMENTS = ("script", "style")
  
  
      def __init__(self):
+         """Initialize and reset this instance."""
          self.reset()
  
      def reset(self):
+         """Reset this instance.  Loses all unprocessed data."""
          self.rawdata = ''
          self.stack = []
***************
*** 107,120 ****
          self.interesting = interesting_normal
  
-     # Interface -- feed some data to the parser.  Call this as
-     # often as you want, with as little or as much text as you
-     # want (may include '\n').  (This just saves the text, all the
-     # processing is done by goahead().)
      def feed(self, data):
          self.rawdata = self.rawdata + data
          self.goahead(0)
  
-     # Interface -- handle the remaining data
      def close(self):
          self.goahead(1)
  
--- 110,124 ----
          self.interesting = interesting_normal
  
      def feed(self, data):
+         """Feed data to the parser.
+ 
+         Call this as often as you want, with as little or as much text
+         as you want (may include '\n').
+         """
          self.rawdata = self.rawdata + data
          self.goahead(0)
  
      def close(self):
+         """Handle any buffered data."""
          self.goahead(1)
  
***************
*** 136,147 ****
          return j
  
-     # Interface -- return current line number and offset.
      def getpos(self):
          return self.lineno, self.offset
  
      __starttag_text = None
  
-     # Interface -- return full source of start tag: "<...>"
      def get_starttag_text(self):
          return self.__starttag_text
  
--- 140,151 ----
          return j
  
      def getpos(self):
+         """Return current line number and offset."""
          return self.lineno, self.offset
  
      __starttag_text = None
  
      def get_starttag_text(self):
+         """Return full source of start tag: '<...>'."""
          return self.__starttag_text
  
***************
*** 196,200 ****
                  match = charref.match(rawdata, i)
                  if match:
!                     name = match.group(1)
                      self.handle_charref(name)
                      k = match.end()
--- 200,204 ----
                  match = charref.match(rawdata, i)
                  if match:
!                     name = match.group()[2:-1]
                      self.handle_charref(name)
                      k = match.end()