[Python-checkins] CVS: python/dist/src/Lib HTMLParser.py,1.9,1.10

Fred L. Drake fdrake@users.sourceforge.net
Mon, 03 Dec 2001 09:09:53 -0800


Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv6260

Modified Files:
	HTMLParser.py 
Log Message:
Convert to string methods instead of the string module.
In goahead(), bind the rawdata.startswith method to a local name, since we
call the same method all the time and never change the value of rawdata.
This can save a lot of bound-method creation.


Index: HTMLParser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/HTMLParser.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** HTMLParser.py	2001/09/24 20:10:28	1.9
--- HTMLParser.py	2001/12/03 17:09:50	1.10
***************
*** 11,15 ****
  import markupbase
  import re
- import string
  
  # Regular expressions used for parsing
--- 11,14 ----
***************
*** 24,28 ****
  starttagopen = re.compile('<[a-zA-Z]')
  piclose = re.compile('>')
- endtagopen = re.compile('</')
  commentclose = re.compile(r'--\s*>')
  tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
--- 23,26 ----
***************
*** 97,101 ****
          """Reset this instance.  Loses all unprocessed data."""
          self.rawdata = ''
-         self.stack = []
          self.lasttag = '???'
          self.interesting = interesting_normal
--- 95,98 ----
***************
*** 146,161 ****
              i = self.updatepos(i, j)
              if i == n: break
!             if rawdata[i] == '<':
                  if starttagopen.match(rawdata, i): # < + letter
                      k = self.parse_starttag(i)
!                 elif endtagopen.match(rawdata, i): # </
                      k = self.parse_endtag(i)
                      if k >= 0:
                          self.clear_cdata_mode()
!                 elif rawdata.startswith("<!--", i): # <!--
                      k = self.parse_comment(i)
!                 elif rawdata.startswith("<?", i): # <?
                      k = self.parse_pi(i)
!                 elif rawdata.startswith("<!", i): # <!
                      k = self.parse_declaration(i)
                  elif (i + 1) < n:
--- 143,159 ----
              i = self.updatepos(i, j)
              if i == n: break
!             startswith = rawdata.startswith
!             if startswith('<', i):
                  if starttagopen.match(rawdata, i): # < + letter
                      k = self.parse_starttag(i)
!                 elif startswith("</", i):
                      k = self.parse_endtag(i)
                      if k >= 0:
                          self.clear_cdata_mode()
!                 elif startswith("<!--", i):
                      k = self.parse_comment(i)
!                 elif startswith("<?", i):
                      k = self.parse_pi(i)
!                 elif startswith("<!", i):
                      k = self.parse_declaration(i)
                  elif (i + 1) < n:
***************
*** 169,173 ****
                      break
                  i = self.updatepos(i, k)
!             elif rawdata[i:i+2] == "&#":
                  match = charref.match(rawdata, i)
                  if match:
--- 167,171 ----
                      break
                  i = self.updatepos(i, k)
!             elif startswith("&#", i):
                  match = charref.match(rawdata, i)
                  if match:
***************
*** 175,179 ****
                      self.handle_charref(name)
                      k = match.end()
!                     if rawdata[k-1] != ';':
                          k = k - 1
                      i = self.updatepos(i, k)
--- 173,177 ----
                      self.handle_charref(name)
                      k = match.end()
!                     if not startswith(';', k-1):
                          k = k - 1
                      i = self.updatepos(i, k)
***************
*** 181,185 ****
                  else:
                      break
!             elif rawdata[i] == '&':
                  match = entityref.match(rawdata, i)
                  if match:
--- 179,183 ----
                  else:
                      break
!             elif startswith('&', i):
                  match = entityref.match(rawdata, i)
                  if match:
***************
*** 187,191 ****
                      self.handle_entityref(name)
                      k = match.end()
!                     if rawdata[k-1] != ';':
                          k = k - 1
                      i = self.updatepos(i, k)
--- 185,189 ----
                      self.handle_entityref(name)
                      k = match.end()
!                     if not startswith(';', k-1):
                          k = k - 1
                      i = self.updatepos(i, k)
***************
*** 194,199 ****
                  if match:
                      # match.group() will contain at least 2 chars
!                     rest = rawdata[i:]
!                     if end and match.group() == rest:
                          self.error("EOF in middle of entity or char ref")
                      # incomplete
--- 192,196 ----
                  if match:
                      # match.group() will contain at least 2 chars
!                     if end and match.group() == rawdata[i:]:
                          self.error("EOF in middle of entity or char ref")
                      # incomplete
***************
*** 253,257 ****
          assert match, 'unexpected call to parse_starttag()'
          k = match.end()
!         self.lasttag = tag = string.lower(rawdata[i+1:k])
  
          while k < endpos:
--- 250,254 ----
          assert match, 'unexpected call to parse_starttag()'
          k = match.end()
!         self.lasttag = tag = rawdata[i+1:k].lower()
  
          while k < endpos:
***************
*** 266,284 ****
                  attrvalue = attrvalue[1:-1]
                  attrvalue = self.unescape(attrvalue)
!             attrs.append((string.lower(attrname), attrvalue))
              k = m.end()
  
!         end = string.strip(rawdata[k:endpos])
          if end not in (">", "/>"):
              lineno, offset = self.getpos()
              if "\n" in self.__starttag_text:
!                 lineno = lineno + string.count(self.__starttag_text, "\n")
                  offset = len(self.__starttag_text) \
!                          - string.rfind(self.__starttag_text, "\n")
              else:
                  offset = offset + len(self.__starttag_text)
              self.error("junk characters in start tag: %s"
                         % `rawdata[k:endpos][:20]`)
!         if end[-2:] == '/>':
              # XHTML-style empty tag: <span attr="value" />
              self.handle_startendtag(tag, attrs)
--- 263,281 ----
                  attrvalue = attrvalue[1:-1]
                  attrvalue = self.unescape(attrvalue)
!             attrs.append((attrname.lower(), attrvalue))
              k = m.end()
  
!         end = rawdata[k:endpos].strip()
          if end not in (">", "/>"):
              lineno, offset = self.getpos()
              if "\n" in self.__starttag_text:
!                 lineno = lineno + self.__starttag_text.count("\n")
                  offset = len(self.__starttag_text) \
!                          - self.__starttag_text.rfind("\n")
              else:
                  offset = offset + len(self.__starttag_text)
              self.error("junk characters in start tag: %s"
                         % `rawdata[k:endpos][:20]`)
!         if end.endswith('/>'):
              # XHTML-style empty tag: <span attr="value" />
              self.handle_startendtag(tag, attrs)
***************
*** 300,307 ****
                  return j + 1
              if next == "/":
!                 s = rawdata[j:j+2]
!                 if s == "/>":
                      return j + 2
!                 if s == "/":
                      # buffer boundary
                      return -1
--- 297,303 ----
                  return j + 1
              if next == "/":
!                 if rawdata.startswith("/>", j):
                      return j + 2
!                 if rawdata.startswith("/", j):
                      # buffer boundary
                      return -1
***************
*** 333,337 ****
              self.error("bad end tag: %s" % `rawdata[i:j]`)
          tag = match.group(1)
!         self.handle_endtag(string.lower(tag))
          return j
  
--- 329,333 ----
              self.error("bad end tag: %s" % `rawdata[i:j]`)
          tag = match.group(1)
!         self.handle_endtag(tag.lower())
          return j
  
***************
*** 380,387 ****
          if '&' not in s:
              return s
!         s = string.replace(s, "&lt;", "<")
!         s = string.replace(s, "&gt;", ">")
!         s = string.replace(s, "&apos;", "'")
!         s = string.replace(s, "&quot;", '"')
!         s = string.replace(s, "&amp;", "&") # Must be last
          return s
--- 376,383 ----
          if '&' not in s:
              return s
!         s = s.replace("&lt;", "<")
!         s = s.replace("&gt;", ">")
!         s = s.replace("&apos;", "'")
!         s = s.replace("&quot;", '"')
!         s = s.replace("&amp;", "&") # Must be last
          return s