[Python-checkins] cpython: #15156: HTMLParser now uses the new "html.entities.html5" dictionary.
ezio.melotti
python-checkins at python.org
Sun Jun 24 22:04:09 CEST 2012
http://hg.python.org/cpython/rev/0d53703b1a99
changeset: 77742:0d53703b1a99
parent: 77740:1120041f2df4
user: Ezio Melotti <ezio.melotti at gmail.com>
date: Sun Jun 24 22:02:56 2012 +0200
summary:
#15156: HTMLParser now uses the new "html.entities.html5" dictionary.
files:
Doc/library/html.entities.rst | 4 ---
Lib/html/parser.py | 30 ++++++++++------------
Lib/test/test_htmlparser.py | 7 ++++-
Misc/NEWS | 2 +
4 files changed, 22 insertions(+), 21 deletions(-)
diff --git a/Doc/library/html.entities.rst b/Doc/library/html.entities.rst
--- a/Doc/library/html.entities.rst
+++ b/Doc/library/html.entities.rst
@@ -11,10 +11,6 @@
This module defines four dictionaries, :data:`html5`,
:data:`name2codepoint`, :data:`codepoint2name`, and :data:`entitydefs`.
-:data:`entitydefs` is used to provide the :attr:`entitydefs`
-attribute of the :class:`html.parser.HTMLParser` class. The definition provided
-here contains all the entities defined by XHTML 1.0 that can be handled using
-simple textual substitution in the Latin-1 character set (ISO-8859-1).
.. data:: html5
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -500,7 +500,6 @@
self.error("unknown declaration: %r" % (data,))
# Internal -- helper to remove special character quoting
- entitydefs = None
def unescape(self, s):
if '&' not in s:
return s
@@ -510,24 +509,23 @@
if s[0] == "#":
s = s[1:]
if s[0] in ['x','X']:
- c = int(s[1:], 16)
+ c = int(s[1:].rstrip(';'), 16)
else:
- c = int(s)
+ c = int(s.rstrip(';'))
return chr(c)
except ValueError:
- return '&#'+ s +';'
+ return '&#' + s
else:
- # Cannot use name2codepoint directly, because HTMLParser
- # supports apos, which is not part of HTML 4
- import html.entities
- if HTMLParser.entitydefs is None:
- entitydefs = HTMLParser.entitydefs = {'apos':"'"}
- for k, v in html.entities.name2codepoint.items():
- entitydefs[k] = chr(v)
- try:
- return self.entitydefs[s]
- except KeyError:
- return '&'+s+';'
+ from html.entities import html5
+ if s in html5:
+ return html5[s]
+ elif s.endswith(';'):
+ return '&' + s
+ for x in range(2, len(s)):
+ if s[:x] in html5:
+ return html5[s[:x]] + s[x:]
+ else:
+ return '&' + s
- return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
+ return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
replaceEntities, s, flags=re.ASCII)
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -456,7 +456,7 @@
self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
'method="post">', [
('starttag', 'form',
- [('action', '/xxx.php?a=1&b=2&amp'),
+ [('action', '/xxx.php?a=1&b=2&'),
(',', None), ('method', 'post')])])
def test_weird_chars_in_unquoted_attribute_values(self):
@@ -541,6 +541,11 @@
self.assertEqual(p.unescape('&#0038;'),'&')
# see #12888
self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
+ # see #15156
+ self.assertEqual(p.unescape('&Eacuteric&Eacute;ric'
+ '&alphacentauri&alpha;centauri'),
+ 'ÉricÉric&alphacentauriαcentauri')
+ self.assertEqual(p.unescape('&amp;co;'), '&co;')
def test_broken_comments(self):
html = ('<! not really a comment >'
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -76,6 +76,8 @@
It is used automatically on platforms supporting the necessary os.openat()
and os.unlinkat() functions. Main code by Martin von Löwis.
+- Issue #15156: HTMLParser now uses the new "html.entities.html5" dictionary.
+
- Issue #11113: add a new "html5" dictionary containing the named character
references defined by the HTML5 standard and the equivalent Unicode
character(s) to the html.entities module.
--
Repository URL: http://hg.python.org/cpython
More information about the Python-checkins
mailing list