[Python-checkins] CVS: python/dist/src/Lib HTMLParser.py,1.4,1.5

Fred L. Drake fdrake@users.sourceforge.net
Mon, 20 Aug 2001 14:24:21 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv22193/Lib

Modified Files:
	HTMLParser.py 
Log Message:

Deal more appropriately with bare ampersands and pointy brackets; this
module has to deal with "classic" HTML-as-deployed as well as XHTML, so we
cannot be as strict as XHTML allows.

This closes SF bug #453059, but uses a different fix than suggested in
the bug comments.


Index: HTMLParser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/HTMLParser.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** HTMLParser.py	2001/08/03 19:50:59	1.4
--- HTMLParser.py	2001/08/20 21:24:19	1.5
***************
*** 16,20 ****
  interesting_normal = re.compile('[&<]')
  interesting_cdata = re.compile(r'<(/|\Z)')
! incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
  
  entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
--- 16,21 ----
  interesting_normal = re.compile('[&<]')
  interesting_cdata = re.compile(r'<(/|\Z)')
! incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*'
!                         '|#([0-9]*|[xX][0-9a-fA-F]*))?')
  
  entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
***************
*** 186,194 ****
                      k = self.parse_declaration(i)
                  else:
!                     if i < n-1:
!                         raise HTMLParseError(
!                             "invalid '<' construct: %s" % `rawdata[i:i+2]`,
!                             self.getpos())
!                     k = -1
                  if k < 0:
                      if end:
--- 187,192 ----
                      k = self.parse_declaration(i)
                  else:
!                     self.handle_data("<")
!                     k = i + 1
                  if k < 0:
                      if end:
***************
*** 204,208 ****
                      k = match.end()
                      if rawdata[k-1] != ';':
!                         k = k-1
                      i = self.updatepos(i, k)
                      continue
--- 202,206 ----
                      k = match.end()
                      if rawdata[k-1] != ';':
!                         k = k - 1
                      i = self.updatepos(i, k)
                      continue
***************
*** 213,227 ****
                      k = match.end()
                      if rawdata[k-1] != ';':
!                         k = k-1
                      i = self.updatepos(i, k)
                      continue
!                 if incomplete.match(rawdata, i):
!                     if end:
                          raise HTMLParseError(
                              "EOF in middle of entity or char ref",
                              self.getpos())
                      return -1 # incomplete
!                 raise HTMLParseError("'&' not part of entity or char ref",
!                                      self.getpos())
              else:
                  assert 0, "interesting.search() lied"
--- 211,227 ----
                      k = match.end()
                      if rawdata[k-1] != ';':
!                         k = k - 1
                      i = self.updatepos(i, k)
                      continue
!                 match = incomplete.match(rawdata, i)
!                 if match:
!                     rest = rawdata[i:]
!                     if end and rest != "&" and match.group() == rest:
                          raise HTMLParseError(
                              "EOF in middle of entity or char ref",
                              self.getpos())
                      return -1 # incomplete
!                 self.handle_data("&")
!                 i = self.updatepos(i, i + 1)
              else:
                  assert 0, "interesting.search() lied"