[Python-checkins] CVS: python/dist/src/Lib HTMLParser.py,1.9,1.10
Fred L. Drake
fdrake@users.sourceforge.net
Mon, 03 Dec 2001 09:09:53 -0800
Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv6260
Modified Files:
HTMLParser.py
Log Message:
Convert to using string methods instead of the string module.
In goahead(), use a bound version of rawdata.startswith() since we use the
same method all the time and never change the value of rawdata. This can
save a lot of bound method creation.
Index: HTMLParser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/HTMLParser.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** HTMLParser.py 2001/09/24 20:10:28 1.9
--- HTMLParser.py 2001/12/03 17:09:50 1.10
***************
*** 11,15 ****
import markupbase
import re
- import string
# Regular expressions used for parsing
--- 11,14 ----
***************
*** 24,28 ****
starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
- endtagopen = re.compile('</')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
--- 23,26 ----
***************
*** 97,101 ****
"""Reset this instance. Loses all unprocessed data."""
self.rawdata = ''
- self.stack = []
self.lasttag = '???'
self.interesting = interesting_normal
--- 95,98 ----
***************
*** 146,161 ****
i = self.updatepos(i, j)
if i == n: break
! if rawdata[i] == '<':
if starttagopen.match(rawdata, i): # < + letter
k = self.parse_starttag(i)
! elif endtagopen.match(rawdata, i): # </
k = self.parse_endtag(i)
if k >= 0:
self.clear_cdata_mode()
! elif rawdata.startswith("<!--", i): # <!--
k = self.parse_comment(i)
! elif rawdata.startswith("<?", i): # <?
k = self.parse_pi(i)
! elif rawdata.startswith("<!", i): # <!
k = self.parse_declaration(i)
elif (i + 1) < n:
--- 143,159 ----
i = self.updatepos(i, j)
if i == n: break
! startswith = rawdata.startswith
! if startswith('<', i):
if starttagopen.match(rawdata, i): # < + letter
k = self.parse_starttag(i)
! elif startswith("</", i):
k = self.parse_endtag(i)
if k >= 0:
self.clear_cdata_mode()
! elif startswith("<!--", i):
k = self.parse_comment(i)
! elif startswith("<?", i):
k = self.parse_pi(i)
! elif startswith("<!", i):
k = self.parse_declaration(i)
elif (i + 1) < n:
***************
*** 169,173 ****
break
i = self.updatepos(i, k)
! elif rawdata[i:i+2] == "&#":
match = charref.match(rawdata, i)
if match:
--- 167,171 ----
break
i = self.updatepos(i, k)
! elif startswith("&#", i):
match = charref.match(rawdata, i)
if match:
***************
*** 175,179 ****
self.handle_charref(name)
k = match.end()
! if rawdata[k-1] != ';':
k = k - 1
i = self.updatepos(i, k)
--- 173,177 ----
self.handle_charref(name)
k = match.end()
! if not startswith(';', k-1):
k = k - 1
i = self.updatepos(i, k)
***************
*** 181,185 ****
else:
break
! elif rawdata[i] == '&':
match = entityref.match(rawdata, i)
if match:
--- 179,183 ----
else:
break
! elif startswith('&', i):
match = entityref.match(rawdata, i)
if match:
***************
*** 187,191 ****
self.handle_entityref(name)
k = match.end()
! if rawdata[k-1] != ';':
k = k - 1
i = self.updatepos(i, k)
--- 185,189 ----
self.handle_entityref(name)
k = match.end()
! if not startswith(';', k-1):
k = k - 1
i = self.updatepos(i, k)
***************
*** 194,199 ****
if match:
# match.group() will contain at least 2 chars
! rest = rawdata[i:]
! if end and match.group() == rest:
self.error("EOF in middle of entity or char ref")
# incomplete
--- 192,196 ----
if match:
# match.group() will contain at least 2 chars
! if end and match.group() == rawdata[i:]:
self.error("EOF in middle of entity or char ref")
# incomplete
***************
*** 253,257 ****
assert match, 'unexpected call to parse_starttag()'
k = match.end()
! self.lasttag = tag = string.lower(rawdata[i+1:k])
while k < endpos:
--- 250,254 ----
assert match, 'unexpected call to parse_starttag()'
k = match.end()
! self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
***************
*** 266,284 ****
attrvalue = attrvalue[1:-1]
attrvalue = self.unescape(attrvalue)
! attrs.append((string.lower(attrname), attrvalue))
k = m.end()
! end = string.strip(rawdata[k:endpos])
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
! lineno = lineno + string.count(self.__starttag_text, "\n")
offset = len(self.__starttag_text) \
! - string.rfind(self.__starttag_text, "\n")
else:
offset = offset + len(self.__starttag_text)
self.error("junk characters in start tag: %s"
% `rawdata[k:endpos][:20]`)
! if end[-2:] == '/>':
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
--- 263,281 ----
attrvalue = attrvalue[1:-1]
attrvalue = self.unescape(attrvalue)
! attrs.append((attrname.lower(), attrvalue))
k = m.end()
! end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
! lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
! - self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
self.error("junk characters in start tag: %s"
% `rawdata[k:endpos][:20]`)
! if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
***************
*** 300,307 ****
return j + 1
if next == "/":
! s = rawdata[j:j+2]
! if s == "/>":
return j + 2
! if s == "/":
# buffer boundary
return -1
--- 297,303 ----
return j + 1
if next == "/":
! if rawdata.startswith("/>", j):
return j + 2
! if rawdata.startswith("/", j):
# buffer boundary
return -1
***************
*** 333,337 ****
self.error("bad end tag: %s" % `rawdata[i:j]`)
tag = match.group(1)
! self.handle_endtag(string.lower(tag))
return j
--- 329,333 ----
self.error("bad end tag: %s" % `rawdata[i:j]`)
tag = match.group(1)
! self.handle_endtag(tag.lower())
return j
***************
*** 380,387 ****
if '&' not in s:
return s
! s = string.replace(s, "&lt;", "<")
! s = string.replace(s, "&gt;", ">")
! s = string.replace(s, "&apos;", "'")
! s = string.replace(s, "&quot;", '"')
! s = string.replace(s, "&amp;", "&") # Must be last
return s
--- 376,383 ----
if '&' not in s:
return s
! s = s.replace("&lt;", "<")
! s = s.replace("&gt;", ">")
! s = s.replace("&apos;", "'")
! s = s.replace("&quot;", '"')
! s = s.replace("&amp;", "&") # Must be last
return s