[Python-checkins] python/dist/src/Lib markupbase.py,1.6,1.7 sgmllib.py,1.42,1.43

Sun, 30 Mar 2003 06:25:41 -0800

Update of /cvsroot/python/python/dist/src/Lib
In directory sc8-pr-cvs1:/tmp/cvs-serv1474/Lib

Modified Files:
	markupbase.py sgmllib.py 
Log Message:
Patch #545300: Support marked sections.


Index: markupbase.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/markupbase.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** markupbase.py	3 Jun 2002 15:58:31 -0000	1.6
--- markupbase.py	30 Mar 2003 14:25:39 -0000	1.7
***************
*** 5,8 ****
--- 5,15 ----
  _declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
  _declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
+ _commentclose = re.compile(r'--\s*>')
+ _markedsectionclose = re.compile(r']\s*]\s*>')
+ 
+ # An analysis of the MS-Word extensions is available at
+ # http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
+ 
+ _msmarkedsectionclose = re.compile(r']\s*>')
  
  del re
***************
*** 54,57 ****
--- 61,71 ----
          # deployed," this should only be the document type
          # declaration ("<!DOCTYPE html...>").
+         # ISO 8879:1986, however, has more complex 
+         # declaration syntax for elements in <!...>, including:
+         # --comment--
+         # [marked section]
+         # name in the following list: ENTITY, DOCTYPE, ELEMENT, 
+         # ATTLIST, NOTATION, SHORTREF, USEMAP, 
+         # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
          rawdata = self.rawdata
          j = i + 2
***************
*** 61,67 ****
              # or just a buffer boundary.
              return -1
!         # in practice, this should look like: ((name|stringlit) S*)+ '>'
          n = len(rawdata)
!         decltype, j = self._scan_name(j, i)
          if j < 0:
              return j
--- 75,91 ----
              # or just a buffer boundary.
              return -1
!         # A simple, practical version could look like: ((name|stringlit) S*)+ '>'
          n = len(rawdata)
!         if rawdata[j:j+1] == '--': #comment
!             # Locate --.*-- as the body of the comment
!             return self.parse_comment(i)
!         elif rawdata[j] == '[': #marked section
!             # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
!             # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
!             # Note that this is extended by Microsoft Office "Save as Web" function
!             # to include [if...] and [endif].
!             return self.parse_marked_section(i)
!         else: #all other declaration elements
!             decltype, j = self._scan_name(j, i)
          if j < 0:
              return j
***************
*** 88,93 ****
--- 112,124 ----
                  j = j + 1
              elif c == "[":
+                 # this could be handled in a separate doctype parser
                  if decltype == "doctype":
                      j = self._parse_doctype_subset(j + 1, i)
+                 elif decltype in ("attlist", "linktype", "link", "element"):
+                     # must tolerate []'d groups in a content model in an element declaration
+                     # also in data attribute specifications of attlist declaration
+                     # also link type declaration subsets in linktype declarations
+                     # also link attribute specification lists in link declarations
+                     self.error("unsupported '[' char in %s declaration" % decltype)
                  else:
                      self.error("unexpected '[' char in declaration")
***************
*** 98,101 ****
--- 129,168 ----
                  return j
          return -1 # incomplete
+ 
+     # Internal -- parse a marked section
+     # Override this to handle MS-Word extension syntax <![if word]>content<![endif]>
+     def parse_marked_section( self, i, report=1 ):
+         rawdata= self.rawdata
+         assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
+         sectName, j = self._scan_name( i+3, i )
+         if j < 0:
+             return j
+         if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
+             # look for standard ]]> ending
+             match= _markedsectionclose.search(rawdata, i+3)
+         elif sectName in ("if", "else", "endif"):
+             # look for MS Office ]> ending
+             match= _msmarkedsectionclose.search(rawdata, i+3)
+         else:
+             self.error('unknown status keyword %s in marked section' % `rawdata[i+3:j]`)
+         if not match:
+             return -1
+         if report:
+             j = match.start(0)
+             self.unknown_decl(rawdata[i+3: j])
+         return match.end(0)
+             
+     # Internal -- parse comment, return length or -1 if not terminated
+     def parse_comment(self, i, report=1):
+         rawdata = self.rawdata
+         if rawdata[i:i+4] != '<!--':
+             self.error('unexpected call to parse_comment()')
+         match = _commentclose.search(rawdata, i+4)
+         if not match:
+             return -1
+         if report:
+             j = match.start(0)
+             self.handle_comment(rawdata[i+4: j])
+         return match.end(0)
  
      # Internal -- scan past the internal subset in a <!DOCTYPE declaration,

Index: sgmllib.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sgmllib.py,v
retrieving revision 1.42
retrieving revision 1.43
diff -C2 -d -r1.42 -r1.43
*** sgmllib.py	14 Mar 2003 16:21:55 -0000	1.42
--- sgmllib.py	30 Mar 2003 14:25:39 -0000	1.43
***************
*** 31,35 ****
  piclose = re.compile('>')
  endbracket = re.compile('[<>]')
- commentclose = re.compile(r'--\s*>')
  tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
  attrfind = re.compile(
--- 31,34 ----
***************
*** 146,149 ****
--- 145,152 ----
                      continue
                  if rawdata.startswith("<!--", i):
+                     # Strictly speaking, a comment is --.*--
+                     # within a declaration tag <!...>.
+                     # This special case should be removed,
+                     # and comments handled only in parse_declaration.
                      k = self.parse_comment(i)
                      if k < 0: break
***************
*** 203,219 ****
          # XXX if end: check for empty stack
  
-     # Internal -- parse comment, return length or -1 if not terminated
-     def parse_comment(self, i, report=1):
-         rawdata = self.rawdata
-         if rawdata[i:i+4] != '<!--':
-             self.error('unexpected call to parse_comment()')
-         match = commentclose.search(rawdata, i+4)
-         if not match:
-             return -1
-         if report:
-             j = match.start(0)
-             self.handle_comment(rawdata[i+4: j])
-         return match.end(0)
- 
      # Extensions for the DOCTYPE scanner:
      _decl_otherchars = '='
--- 206,209 ----
***************
*** 471,474 ****
--- 461,468 ----
          self.flush()
          print '*** unknown char ref: &#' + ref + ';'
+ 
+     def unknown_decl(self, data):
+         self.flush()
+         print '*** unknown decl: [' + data + ']'
  
      def close(self):