[Python-checkins] cpython (2.7): #13358: HTMLParser now calls handle_data only once for each CDATA.
ezio.melotti
python-checkins at python.org
Fri Nov 18 17:03:10 CET 2011
http://hg.python.org/cpython/rev/91163aa3d5b4
changeset: 73608:91163aa3d5b4
branch: 2.7
parent: 73605:7262f8f276ff
user: Ezio Melotti <ezio.melotti at gmail.com>
date: Fri Nov 18 18:00:40 2011 +0200
summary:
#13358: HTMLParser now calls handle_data only once for each CDATA.
files:
Lib/HTMLParser.py | 7 ++++---
Lib/test/test_htmlparser.py | 21 +++++++++++++++++++++
Misc/NEWS | 2 ++
3 files changed, 27 insertions(+), 3 deletions(-)
diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -14,7 +14,6 @@
# Regular expressions used for parsing
interesting_normal = re.compile('[&<]')
-interesting_cdata = re.compile(r'<(/|\Z)')
incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@@ -125,8 +124,8 @@
return self.__starttag_text
def set_cdata_mode(self, elem):
- self.interesting = interesting_cdata
self.cdata_elem = elem.lower()
+ self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
def clear_cdata_mode(self):
self.interesting = interesting_normal
@@ -144,6 +143,8 @@
if match:
j = match.start()
else:
+ if self.cdata_elem:
+ break
j = n
if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j)
@@ -212,7 +213,7 @@
else:
assert 0, "interesting.search() lied"
# end while
- if end and i < n:
+ if end and i < n and not self.cdata_elem:
self.handle_data(rawdata[i:n])
i = self.updatepos(i, n)
self.rawdata = rawdata[i:]
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -286,6 +286,27 @@
("data", content),
("endtag", element_lower)])
+ def test_cdata_with_closing_tags(self):
+ # see issue #13358
+ # make sure that HTMLParser calls handle_data only once for each CDATA.
+ # The normal event collector normalizes the events in get_events,
+ # so we override it to return the original list of events.
+ class Collector(EventCollector):
+ def get_events(self):
+ return self.events
+
+ content = """<!-- not a comment --> &not-an-entity-ref;
+ <a href="" /> </p><p> &amp; <span></span></style>
+ '</script' + '>' </html> </head> </scripter>!"""
+ for element in [' script', 'script ', ' script ',
+ '\nscript', 'script\n', '\nscript\n']:
+ s = u'<script>{content}</{element}>'.format(element=element,
+ content=content)
+ self._run_check(s, [("starttag", "script", []),
+ ("data", content),
+ ("endtag", "script")],
+ collector=Collector)
+
def test_malformatted_charref(self):
self._run_check("<p>&#bad;</p>", [
("starttag", "p", []),
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -79,6 +79,8 @@
Library
-------
+- Issue #13358: HTMLParser now calls handle_data only once for each CDATA.
+
- Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
node when it is the only child of an element. Initial patch by Dan
Kenigsberg.
--
Repository URL: http://hg.python.org/cpython
More information about the Python-checkins
mailing list