[Python-checkins] CVS: python/dist/src/Lib HTMLParser.py,1.3,1.4
Fred L. Drake
fdrake@users.sourceforge.net
Fri, 03 Aug 2001 12:51:01 -0700
Update of /cvsroot/python/python/dist/src/Lib
In directory usw-pr-cvs1:/tmp/cvs-serv30083
Modified Files:
HTMLParser.py
Log Message:
Change some comments into docstrings.
Fix handling of hexadecimal character references (legal in XHTML) so that
they are properly interpreted as character references.
This fixes SF bug #445196.
Index: HTMLParser.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/HTMLParser.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** HTMLParser.py 2001/05/23 04:53:44 1.3
--- HTMLParser.py 2001/08/03 19:50:59 1.4
***************
*** 1,3 ****
! """A parser for HTML."""
# This file is based on sgmllib.py, but the API is slightly different.
--- 1,3 ----
! """A parser for HTML and XHTML."""
# This file is based on sgmllib.py, but the API is slightly different.
***************
*** 19,23 ****
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
! charref = re.compile('&#([0-9]+)[^0-9]')
starttagopen = re.compile('<[a-zA-Z]')
--- 19,23 ----
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
! charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
starttagopen = re.compile('<[a-zA-Z]')
***************
*** 74,103 ****
! # HTML parser class -- find tags and call handler functions.
! # Usage:
! #
! # p = HTMLParser(); p.feed(data); ...; p.close()
! # Start tags are handled by calling self.handle_starttag() or
! # self.handle_startendtag(); end tags by self.handle_endtag(). The
! # data between tags is passed from the parser to the derived class by
! # calling self.handle_data() with the data as argument (the data may
! # be split up in arbitrary chunks). Entity references are passed by
! # calling self.handle_entityref() with the entity reference as the
! # argument. Numeric character references are passed to
! # self.handle_charref() with the string containing the reference as
! # the argument.
! class HTMLParser:
CDATA_CONTENT_ELEMENTS = ("script", "style")
- # Interface -- initialize and reset this instance
def __init__(self):
self.reset()
- # Interface -- reset this instance. Loses all unprocessed data
def reset(self):
self.rawdata = ''
self.stack = []
--- 74,106 ----
! class HTMLParser:
! """Find tags and other markup and call handler functions.
! Usage:
! p = HTMLParser()
! p.feed(data)
! ...
! p.close()
! Start tags are handled by calling self.handle_starttag() or
! self.handle_startendtag(); end tags by self.handle_endtag(). The
! data between tags is passed from the parser to the derived class
! by calling self.handle_data() with the data as argument (the data
! may be split up in arbitrary chunks). Entity references are
! passed by calling self.handle_entityref() with the entity
! reference as the argument. Numeric character references are
! passed to self.handle_charref() with the string containing the
! reference as the argument.
! """
CDATA_CONTENT_ELEMENTS = ("script", "style")
def __init__(self):
+ """Initialize and reset this instance."""
self.reset()
def reset(self):
+ """Reset this instance. Loses all unprocessed data."""
self.rawdata = ''
self.stack = []
***************
*** 107,120 ****
self.interesting = interesting_normal
- # Interface -- feed some data to the parser. Call this as
- # often as you want, with as little or as much text as you
- # want (may include '\n'). (This just saves the text, all the
- # processing is done by goahead().)
def feed(self, data):
self.rawdata = self.rawdata + data
self.goahead(0)
- # Interface -- handle the remaining data
def close(self):
self.goahead(1)
--- 110,124 ----
self.interesting = interesting_normal
def feed(self, data):
+ """Feed data to the parser.
+
+ Call this as often as you want, with as little or as much text
+ as you want (may include '\n').
+ """
self.rawdata = self.rawdata + data
self.goahead(0)
def close(self):
+ """Handle any buffered data."""
self.goahead(1)
***************
*** 136,147 ****
return j
- # Interface -- return current line number and offset.
def getpos(self):
return self.lineno, self.offset
__starttag_text = None
- # Interface -- return full source of start tag: "<...>"
def get_starttag_text(self):
return self.__starttag_text
--- 140,151 ----
return j
def getpos(self):
+ """Return current line number and offset."""
return self.lineno, self.offset
__starttag_text = None
def get_starttag_text(self):
+ """Return full source of start tag: '<...>'."""
return self.__starttag_text
***************
*** 196,200 ****
match = charref.match(rawdata, i)
if match:
! name = match.group(1)
self.handle_charref(name)
k = match.end()
--- 200,204 ----
match = charref.match(rawdata, i)
if match:
! name = match.group()[2:-1]
self.handle_charref(name)
k = match.end()