[Python-checkins] CVS: python/dist/src/Lib HTMLParser.py,1.6,1.7

Fred L. Drake fdrake@users.sourceforge.net
Tue, 04 Sep 2001 09:26:05 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv17166

Modified Files:
	HTMLParser.py 
Log Message:
HTMLParser is allowed to be more strict than sgmllib, so let's not
change either module's basic behavior: when parsing something that cannot
possibly be valid in either HTML or XHTML, HTMLParser raises an exception.


Index: HTMLParser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/HTMLParser.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** HTMLParser.py	2001/09/04 15:10:16	1.6
--- HTMLParser.py	2001/09/04 16:26:03	1.7
***************
*** 270,275 ****
          # in practice, this should look like: ((name|stringlit) S*)+ '>'
          n = len(rawdata)
!         decltype = None
!         extrachars = ""
          while j < n:
              c = rawdata[j]
--- 270,279 ----
          # in practice, this should look like: ((name|stringlit) S*)+ '>'
          n = len(rawdata)
!         decltype, j = self.scan_name(j, i)
!         if j < 0:
!             return j
!         if decltype.lower() != "doctype":
!             raise HTMLParseError("unknown declaration: '%s'" % decltype,
!                                  self.getpos())
          while j < n:
              c = rawdata[j]
***************
*** 277,284 ****
                  # end of declaration syntax
                  data = rawdata[i+2:j]
!                 if decltype == "doctype":
!                     self.handle_decl(data)
!                 else:
!                     self.unknown_decl(data)
                  return j + 1
              if c in "\"'":
--- 281,285 ----
                  # end of declaration syntax
                  data = rawdata[i+2:j]
!                 self.handle_decl(data)
                  return j + 1
              if c in "\"'":
***************
*** 288,315 ****
                  j = m.end()
              elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
!                 m = declname.match(rawdata, j)
!                 if not m:
!                     return -1 # incomplete
!                 j = m.end()
!                 if decltype is None:
!                     decltype = m.group(0).rstrip().lower()
!                     if decltype != "doctype":
!                         extrachars = "="
              elif c == "[" and decltype == "doctype":
                  j = self.parse_doctype_subset(j + 1, i)
-                 if j < 0:
-                     return j
-             elif c in extrachars:
-                 j = j + 1
-                 while j < n and rawdata[j] in string.whitespace:
-                     j = j + 1
-                 if j == n:
-                     # end of buffer while in declaration
-                     return -1
              else:
                  raise HTMLParseError(
                      "unexpected char in declaration: %s" % `rawdata[j]`,
                      self.getpos())
!             decltype = decltype or ''
          return -1 # incomplete
  
--- 289,301 ----
                  j = m.end()
              elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
!                 name, j = self.scan_name(j, i)
              elif c == "[" and decltype == "doctype":
                  j = self.parse_doctype_subset(j + 1, i)
              else:
                  raise HTMLParseError(
                      "unexpected char in declaration: %s" % `rawdata[j]`,
                      self.getpos())
!             if j < 0:
!                 return j
          return -1 # incomplete
  
***************
*** 360,368 ****
                      # end of buffer; incomplete
                      return -1
!                 m = declname.match(rawdata, j + 1)
!                 s = m.group()
!                 if s == rawdata[j+1:]:
!                     return -1
!                 j = j + 1 + len(s.rstrip())
                  if rawdata[j] == ";":
                      j = j + 1
--- 346,352 ----
                      # end of buffer; incomplete
                      return -1
!                 s, j = self.scan_name(j + 1, declstartpos)
!                 if j < 0:
!                     return j
                  if rawdata[j] == ";":
                      j = j + 1
***************
*** 384,389 ****
              else:
                  self.updatepos(declstartpos, j)
!                 raise HTMLParseError("unexpected char in internal subset",
!                                      self.getpos())
          # end of buffer reached
          return -1
--- 368,374 ----
              else:
                  self.updatepos(declstartpos, j)
!                 raise HTMLParseError(
!                     "unexpected char %s in internal subset" % `c`,
!                     self.getpos())
          # end of buffer reached
          return -1