[Python-checkins] CVS: python/dist/src/Lib sgmllib.py,1.37,1.38

Fred L. Drake fdrake@users.sourceforge.net
Mon, 24 Sep 2001 13:15:53 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv14044

Modified Files:
	sgmllib.py 
Log Message:
Re-factor the SGMLParser class to use the new markupbase.ParserBase class.
Use a new internal method, error(), consistently to raise parse errors;
the new base class also uses this.
Adjust the parse_comment() method to return the new offset into the buffer
instead of the number of characters scanned; this was the only helper
method that did it this way, so we have better consistency now.  This change
is required in order to share the new base class.
This fixes SF bugs #448482 and #453706.


Index: sgmllib.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sgmllib.py,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -d -r1.37 -r1.38
*** sgmllib.py	2001/08/02 07:15:29	1.37
--- sgmllib.py	2001/09/24 20:15:51	1.38
***************
*** 10,13 ****
--- 10,14 ----
  
  
+ import markupbase
  import re
  
***************
*** 28,41 ****
  shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
  shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
- piopen = re.compile('<\?')
  piclose = re.compile('>')
- endtagopen = re.compile('</[<>a-zA-Z]')
  endbracket = re.compile('[<>]')
- special = re.compile('<![^<>]*>')
- commentopen = re.compile('<!--')
  commentclose = re.compile(r'--\s*>')
- declopen = re.compile('<!')
- declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
- declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
  tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
  attrfind = re.compile(
--- 29,35 ----
***************
*** 43,49 ****
      r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
  
- decldata = re.compile(r'[^>\'\"]+')
- declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
- 
  
  class SGMLParseError(RuntimeError):
--- 37,40 ----
***************
*** 63,67 ****
  # self.handle_entityref() with the entity reference as argument.
  
! class SGMLParser:
  
      def __init__(self, verbose=0):
--- 54,58 ----
  # self.handle_entityref() with the entity reference as argument.
  
! class SGMLParser(markupbase.ParserBase):
  
      def __init__(self, verbose=0):
***************
*** 77,80 ****
--- 68,72 ----
          self.nomoretags = 0
          self.literal = 0
+         markupbase.ParserBase.reset(self)
  
      def setnomoretags(self):
***************
*** 107,110 ****
--- 99,105 ----
          self.goahead(1)
  
+     def error(self, message):
+         raise SGMLParseError(message)
+ 
      # Internal -- handle data as far as reasonable.  May leave state
      # and data to be processed by a subsequent call.  If 'end' is
***************
*** 120,126 ****
                  break
              match = interesting.search(rawdata, i)
!             if match: j = match.start(0)
              else: j = n
!             if i < j: self.handle_data(rawdata[i:j])
              i = j
              if i == n: break
--- 115,122 ----
                  break
              match = interesting.search(rawdata, i)
!             if match: j = match.start()
              else: j = n
!             if i < j:
!                 self.handle_data(rawdata[i:j])
              i = j
              if i == n: break
***************
*** 135,168 ****
                      i = k
                      continue
!                 if endtagopen.match(rawdata, i):
                      k = self.parse_endtag(i)
                      if k < 0: break
!                     i =  k
                      self.literal = 0
                      continue
!                 if commentopen.match(rawdata, i):
!                     if self.literal:
!                         self.handle_data(rawdata[i])
                          i = i+1
!                         continue
                      k = self.parse_comment(i)
                      if k < 0: break
!                     i = i+k
                      continue
!                 if piopen.match(rawdata, i):
!                     if self.literal:
!                         self.handle_data(rawdata[i])
!                         i = i+1
!                         continue
                      k = self.parse_pi(i)
                      if k < 0: break
                      i = i+k
                      continue
!                 match = special.match(rawdata, i)
!                 if match:
!                     if self.literal:
!                         self.handle_data(rawdata[i])
!                         i = i+1
!                         continue
                      # This is some sort of declaration; in "HTML as
                      # deployed," this should only be the document type
--- 131,159 ----
                      i = k
                      continue
!                 if rawdata.startswith("</", i):
                      k = self.parse_endtag(i)
                      if k < 0: break
!                     i = k
                      self.literal = 0
                      continue
!                 if self.literal:
!                     if n > (i + 1):
!                         self.handle_data("<")
                          i = i+1
!                     else:
!                         # incomplete
!                         break
!                     continue
!                 if rawdata.startswith("<!--", i):
                      k = self.parse_comment(i)
                      if k < 0: break
!                     i = k
                      continue
!                 if rawdata.startswith("<?", i):
                      k = self.parse_pi(i)
                      if k < 0: break
                      i = i+k
                      continue
!                 if rawdata.startswith("<!", i):
                      # This is some sort of declaration; in "HTML as
                      # deployed," this should only be the document type
***************
*** 192,196 ****
                      continue
              else:
!                 raise SGMLParseError('neither < nor & ??')
              # We get here only if incomplete matches but
              # nothing else
--- 183,187 ----
                      continue
              else:
!                 self.error('neither < nor & ??')
              # We get here only if incomplete matches but
              # nothing else
***************
*** 213,263 ****
  
      # Internal -- parse comment, return length or -1 if not terminated
!     def parse_comment(self, i):
          rawdata = self.rawdata
          if rawdata[i:i+4] != '<!--':
!             raise SGMLParseError('unexpected call to parse_comment()')
          match = commentclose.search(rawdata, i+4)
          if not match:
              return -1
!         j = match.start(0)
!         self.handle_comment(rawdata[i+4: j])
!         j = match.end(0)
!         return j-i
  
!     # Internal -- parse declaration.
!     def parse_declaration(self, i):
!         # This is some sort of declaration; in "HTML as
!         # deployed," this should only be the document type
!         # declaration ("<!DOCTYPE html...>").
!         rawdata = self.rawdata
!         j = i + 2
!         assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
!         if rawdata[j:j+1] in ("-", ""):
!             # Start of comment followed by buffer boundary,
!             # or just a buffer boundary.
!             return -1
!         # in practice, this should look like: ((name|stringlit) S*)+ '>'
!         n = len(rawdata)
!         while j < n:
!             c = rawdata[j]
!             if c == ">":
!                 # end of declaration syntax
!                 self.handle_decl(rawdata[i+2:j])
!                 return j + 1
!             if c in "\"'":
!                 m = declstringlit.match(rawdata, j)
!                 if not m:
!                     return -1 # incomplete
!                 j = m.end()
!             elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
!                 m = declname.match(rawdata, j)
!                 if not m:
!                     return -1 # incomplete
!                 j = m.end()
!             else:
!                 raise SGMLParseError(
!                     "unexpected char in declaration: %s" % `rawdata[j]`)
!         # end of buffer between tokens
!         return -1
  
      # Internal -- parse processing instr, return length or -1 if not terminated
--- 204,221 ----
  
      # Internal -- parse comment, return length or -1 if not terminated
!     def parse_comment(self, i, report=1):
          rawdata = self.rawdata
          if rawdata[i:i+4] != '<!--':
!             self.error('unexpected call to parse_comment()')
          match = commentclose.search(rawdata, i+4)
          if not match:
              return -1
!         if report:
!             j = match.start(0)
!             self.handle_comment(rawdata[i+4: j])
!         return match.end(0)
  
!     # Extensions for the DOCTYPE scanner:
!     _decl_otherchars = '='
  
      # Internal -- parse processing instr, return length or -1 if not terminated
***************
*** 265,269 ****
          rawdata = self.rawdata
          if rawdata[i:i+2] != '<?':
!             raise SGMLParseError('unexpected call to parse_pi()')
          match = piclose.search(rawdata, i+2)
          if not match:
--- 223,227 ----
          rawdata = self.rawdata
          if rawdata[i:i+2] != '<?':
!             self.error('unexpected call to parse_pi()')
          match = piclose.search(rawdata, i+2)
          if not match:
***************
*** 312,316 ****
              match = tagfind.match(rawdata, i+1)
              if not match:
!                 raise SGMLParseError('unexpected call to parse_starttag')
              k = match.end(0)
              tag = rawdata[i+1:k].lower()
--- 270,274 ----
              match = tagfind.match(rawdata, i+1)
              if not match:
!                 self.error('unexpected call to parse_starttag')
              k = match.end(0)
              tag = rawdata[i+1:k].lower()
***************
*** 466,469 ****
--- 424,428 ----
      def unknown_charref(self, ref): pass
      def unknown_entityref(self, ref): pass
+     def unknown_decl(self, data): pass