[Python-checkins] CVS: python/dist/src/Lib HTMLParser.py,1.8,1.9
Fred L. Drake
fdrake@users.sourceforge.net
Mon, 24 Sep 2001 13:10:30 -0700
Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv12855
Modified Files:
HTMLParser.py
Log Message:
Refactor the HTMLParser class to use the new markupbase.ParserBase class.
Use a new internal method, error(), consistently to raise parse errors;
the new base class also calls this error() method.
Index: HTMLParser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/HTMLParser.py,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** HTMLParser.py 2001/09/18 02:26:38 1.8
--- HTMLParser.py 2001/09/24 20:10:28 1.9
***************
*** 9,12 ****
--- 9,13 ----
+ import markupbase
import re
import string
***************
*** 22,31 ****
starttagopen = re.compile('<[a-zA-Z]')
- piopen = re.compile(r'<\?')
piclose = re.compile('>')
endtagopen = re.compile('</')
- declopen = re.compile('<!')
- special = re.compile('<![^<>]*>')
- commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
--- 23,28 ----
***************
*** 48,58 ****
\s* # trailing whitespace
""", re.VERBOSE)
- endstarttag = re.compile(r"\s*/?>")
endendtag = re.compile('>')
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
- declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
- declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
-
class HTMLParseError(Exception):
--- 45,51 ----
***************
*** 74,78 ****
! class HTMLParser:
"""Find tags and other markup and call handler functions.
--- 67,71 ----
! class HTMLParser(markupbase.ParserBase):
"""Find tags and other markup and call handler functions.
***************
*** 106,112 ****
self.stack = []
self.lasttag = '???'
- self.lineno = 1
- self.offset = 0
self.interesting = interesting_normal
def feed(self, data):
--- 99,104 ----
self.stack = []
self.lasttag = '???'
self.interesting = interesting_normal
+ markupbase.ParserBase.reset(self)
def feed(self, data):
***************
*** 123,146 ****
self.goahead(1)
! # Internal -- update line number and offset. This should be
! # called for each piece of data exactly once, in order -- in other
! # words the concatenation of all the input strings to this
! # function should be exactly the entire input.
! def updatepos(self, i, j):
! if i >= j:
! return j
! rawdata = self.rawdata
! nlines = string.count(rawdata, "\n", i, j)
! if nlines:
! self.lineno = self.lineno + nlines
! pos = string.rindex(rawdata, "\n", i, j) # Should not fail
! self.offset = j-(pos+1)
! else:
! self.offset = self.offset + j-i
! return j
!
! def getpos(self):
! """Return current line number and offset."""
! return self.lineno, self.offset
__starttag_text = None
--- 115,120 ----
self.goahead(1)
! def error(self, message):
! raise HTMLParseError(message, self.getpos())
__starttag_text = None
***************
*** 179,187 ****
if k >= 0:
self.clear_cdata_mode()
! elif commentopen.match(rawdata, i): # <!--
k = self.parse_comment(i)
! elif piopen.match(rawdata, i): # <?
k = self.parse_pi(i)
! elif declopen.match(rawdata, i): # <!
k = self.parse_declaration(i)
elif (i + 1) < n:
--- 153,161 ----
if k >= 0:
self.clear_cdata_mode()
! elif rawdata.startswith("<!--", i): # <!--
k = self.parse_comment(i)
! elif rawdata.startswith("<?", i): # <?
k = self.parse_pi(i)
! elif rawdata.startswith("<!", i): # <!
k = self.parse_declaration(i)
elif (i + 1) < n:
***************
*** 192,197 ****
if k < 0:
if end:
! raise HTMLParseError("EOF in middle of construct",
! self.getpos())
break
i = self.updatepos(i, k)
--- 166,170 ----
if k < 0:
if end:
! self.error("EOF in middle of construct")
break
i = self.updatepos(i, k)
***************
*** 223,229 ****
rest = rawdata[i:]
if end and match.group() == rest:
! raise HTMLParseError(
! "EOF in middle of entity or char ref",
! self.getpos())
# incomplete
break
--- 196,200 ----
rest = rawdata[i:]
if end and match.group() == rest:
! self.error("EOF in middle of entity or char ref")
# incomplete
break
***************
*** 256,516 ****
return j
- # Internal -- parse declaration.
- def parse_declaration(self, i):
- # This is some sort of declaration; in "HTML as
- # deployed," this should only be the document type
- # declaration ("<!DOCTYPE html...>").
- rawdata = self.rawdata
- j = i + 2
- assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
- if rawdata[j:j+1] in ("-", ""):
- # Start of comment followed by buffer boundary,
- # or just a buffer boundary.
- return -1
- # in practice, this should look like: ((name|stringlit) S*)+ '>'
- n = len(rawdata)
- decltype, j = self.scan_name(j, i)
- if j < 0:
- return j
- if decltype.lower() != "doctype":
- raise HTMLParseError("unknown declaration: '%s'" % decltype,
- self.getpos())
- while j < n:
- c = rawdata[j]
- if c == ">":
- # end of declaration syntax
- data = rawdata[i+2:j]
- self.handle_decl(data)
- return j + 1
- if c in "\"'":
- m = declstringlit.match(rawdata, j)
- if not m:
- return -1 # incomplete
- j = m.end()
- elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
- name, j = self.scan_name(j, i)
- elif c == "[" and decltype == "doctype":
- j = self.parse_doctype_subset(j + 1, i)
- else:
- raise HTMLParseError(
- "unexpected char in declaration: %s" % `rawdata[j]`,
- self.getpos())
- if j < 0:
- return j
- return -1 # incomplete
-
- # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
- # returning the index just past any whitespace following the trailing ']'.
- def parse_doctype_subset(self, i, declstartpos):
- rawdata = self.rawdata
- n = len(rawdata)
- j = i
- while j < n:
- c = rawdata[j]
- if c == "<":
- s = rawdata[j:j+2]
- if s == "<":
- # end of buffer; incomplete
- return -1
- if s != "<!":
- self.updatepos(declstartpos, j + 1)
- raise HTMLParseError("unexpect char in internal subset",
- self.getpos())
- if (j + 2) == n:
- # end of buffer; incomplete
- return -1
- if (j + 4) > n:
- # end of buffer; incomplete
- return -1
- if rawdata[j:j+4] == "<!--":
- j = self.parse_comment(j, report=0)
- if j < 0:
- return j
- continue
- name, j = self.scan_name(j + 2, declstartpos)
- if j == -1:
- return -1
- if name not in ("attlist", "element", "entity", "notation"):
- self.updatepos(declstartpos, j + 2)
- raise HTMLParseError(
- "unknown declaration %s in internal subset" % `name`,
- self.getpos())
- # handle the individual names
- meth = getattr(self, "parse_doctype_" + name)
- j = meth(j, declstartpos)
- if j < 0:
- return j
- elif c == "%":
- # parameter entity reference
- if (j + 1) == n:
- # end of buffer; incomplete
- return -1
- s, j = self.scan_name(j + 1, declstartpos)
- if j < 0:
- return j
- if rawdata[j] == ";":
- j = j + 1
- elif c == "]":
- j = j + 1
- while j < n and rawdata[j] in string.whitespace:
- j = j + 1
- if j < n:
- if rawdata[j] == ">":
- return j
- self.updatepos(declstartpos, j)
- raise HTMLParseError(
- "unexpected char after internal subset",
- self.getpos())
- else:
- return -1
- elif c in string.whitespace:
- j = j + 1
- else:
- self.updatepos(declstartpos, j)
- raise HTMLParseError(
- "unexpected char %s in internal subset" % `c`,
- self.getpos())
- # end of buffer reached
- return -1
-
- def parse_doctype_element(self, i, declstartpos):
- rawdata = self.rawdata
- n = len(rawdata)
- name, j = self.scan_name(i, declstartpos)
- if j == -1:
- return -1
- # style content model; just skip until '>'
- if '>' in rawdata[j:]:
- return string.find(rawdata, ">", j) + 1
- return -1
-
- def parse_doctype_attlist(self, i, declstartpos):
- rawdata = self.rawdata
- name, j = self.scan_name(i, declstartpos)
- c = rawdata[j:j+1]
- if c == "":
- return -1
- if c == ">":
- return j + 1
- while 1:
- # scan a series of attribute descriptions; simplified:
- # name type [value] [#constraint]
- name, j = self.scan_name(j, declstartpos)
- if j < 0:
- return j
- c = rawdata[j:j+1]
- if c == "":
- return -1
- if c == "(":
- # an enumerated type; look for ')'
- if ")" in rawdata[j:]:
- j = string.find(rawdata, ")", j) + 1
- else:
- return -1
- while rawdata[j:j+1] in string.whitespace:
- j = j + 1
- if not rawdata[j:]:
- # end of buffer, incomplete
- return -1
- else:
- name, j = self.scan_name(j, declstartpos)
- c = rawdata[j:j+1]
- if not c:
- return -1
- if c in "'\"":
- m = declstringlit.match(rawdata, j)
- if m:
- j = m.end()
- else:
- return -1
- c = rawdata[j:j+1]
- if not c:
- return -1
- if c == "#":
- if rawdata[j:] == "#":
- # end of buffer
- return -1
- name, j = self.scan_name(j + 1, declstartpos)
- if j < 0:
- return j
- c = rawdata[j:j+1]
- if not c:
- return -1
- if c == '>':
- # all done
- return j + 1
-
- def parse_doctype_notation(self, i, declstartpos):
- name, j = self.scan_name(i, declstartpos)
- if j < 0:
- return j
- rawdata = self.rawdata
- while 1:
- c = rawdata[j:j+1]
- if not c:
- # end of buffer; incomplete
- return -1
- if c == '>':
- return j + 1
- if c in "'\"":
- m = declstringlit.match(rawdata, j)
- if not m:
- return -1
- j = m.end()
- else:
- name, j = self.scan_name(j, declstartpos)
- if j < 0:
- return j
-
- def parse_doctype_entity(self, i, declstartpos):
- rawdata = self.rawdata
- if rawdata[i:i+1] == "%":
- j = i + 1
- while 1:
- c = rawdata[j:j+1]
- if not c:
- return -1
- if c in string.whitespace:
- j = j + 1
- else:
- break
- else:
- j = i
- name, j = self.scan_name(j, declstartpos)
- if j < 0:
- return j
- while 1:
- c = self.rawdata[j:j+1]
- if not c:
- return -1
- if c in "'\"":
- m = declstringlit.match(rawdata, j)
- if m:
- j = m.end()
- else:
- return -1 # incomplete
- elif c == ">":
- return j + 1
- else:
- name, j = self.scan_name(j, declstartpos)
- if j < 0:
- return j
-
- def scan_name(self, i, declstartpos):
- rawdata = self.rawdata
- n = len(rawdata)
- if i == n:
- return None, -1
- m = declname.match(rawdata, i)
- if m:
- s = m.group()
- name = s.strip()
- if (i + len(s)) == n:
- return None, -1 # end of buffer
- return name.lower(), m.end()
- else:
- self.updatepos(declstartpos, i)
- raise HTMLParseError("expected name token", self.getpos())
-
# Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i):
--- 227,230 ----
***************
*** 564,570 ****
else:
offset = offset + len(self.__starttag_text)
! raise HTMLParseError("junk characters in start tag: %s"
! % `rawdata[k:endpos][:20]`,
! (lineno, offset))
if end[-2:] == '/>':
# XHTML-style empty tag: <span attr="value" />
--- 278,283 ----
else:
offset = offset + len(self.__starttag_text)
! self.error("junk characters in start tag: %s"
! % `rawdata[k:endpos][:20]`)
if end[-2:] == '/>':
# XHTML-style empty tag: <span attr="value" />
***************
*** 595,600 ****
# else bogus input
self.updatepos(i, j + 1)
! raise HTMLParseError("malformed empty start tag",
! self.getpos())
if next == "":
# end of input
--- 308,312 ----
# else bogus input
self.updatepos(i, j + 1)
! self.error("malformed empty start tag")
if next == "":
# end of input
***************
*** 606,611 ****
return -1
self.updatepos(i, j)
! raise HTMLParseError("malformed start tag", self.getpos())
! raise AssertionError("we should not gt here!")
# Internal -- parse endtag, return end or -1 if incomplete
--- 318,323 ----
return -1
self.updatepos(i, j)
! self.error("malformed start tag")
! raise AssertionError("we should not get here!")
# Internal -- parse endtag, return end or -1 if incomplete
***************
*** 619,624 ****
match = endtagfind.match(rawdata, i) # </ + tag + >
if not match:
! raise HTMLParseError("bad end tag: %s" % `rawdata[i:j]`,
! self.getpos())
tag = match.group(1)
self.handle_endtag(string.lower(tag))
--- 331,335 ----
match = endtagfind.match(rawdata, i) # </ + tag + >
if not match:
! self.error("bad end tag: %s" % `rawdata[i:j]`)
tag = match.group(1)
self.handle_endtag(string.lower(tag))
***************
*** 661,664 ****
--- 372,378 ----
def handle_pi(self, data):
pass
+
+ def unknown_decl(self, data):
+ self.error("unknown declaration: " + `data`)
# Internal -- helper to remove special character quoting