[Python-checkins] bpo-41748: Handles unquoted attributes with commas (GH-24072)
miss-islington
webhook-mailer at python.org
Mon Feb 1 15:54:47 EST 2021
https://github.com/python/cpython/commit/0874491bcc392f7bd9c394ec2fdab183e3f320dd
commit: 0874491bcc392f7bd9c394ec2fdab183e3f320dd
branch: 3.9
author: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
committer: miss-islington <31488909+miss-islington at users.noreply.github.com>
date: 2021-02-01T12:54:43-08:00
summary:
bpo-41748: Handles unquoted attributes with commas (GH-24072)
* bpo-41748: Adds tests for unquoted attributes with comma
* bpo-41748: Handles unquoted attributes with comma
* bpo-41748: Addresses review comments
* bpo-41748: Addresses review comments
* Adds more test cases
* Simplifies the regex for handling spaces
* bpo-41748: Moves attributes tests under the right class
* bpo-41748: Addresses review about duplicate attributes
* bpo-41748: Adds NEWS.d entry for this patch
(cherry picked from commit 9eb11a139fac5514d8456626806a68b3e3b7eafb)
Co-authored-by: Karl Dubost <karl+github at la-grange.net>
files:
A Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst
M Lib/html/parser.py
M Lib/test/test_htmlparser.py
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 60830779816a0..9e49effca1fcc 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -46,7 +46,7 @@
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
- (?:\s*,)* # possibly followed by a comma
+ \s* # possibly followed by a space
)?(?:\s|/(?!>))*
)*
)?
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index a2bfb39d16a57..12917755a5601 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -452,42 +452,6 @@ def test_illegal_declarations(self):
self._run_check('<!spacer type="block" height="25">',
[('comment', 'spacer type="block" height="25"')])
- def test_with_unquoted_attributes(self):
- # see #12008
- html = ("<html><body bgcolor=d0ca90 text='181008'>"
- "<table cellspacing=0 cellpadding=1 width=100% ><tr>"
- "<td align=left><font size=-1>"
- "- <a href=/rabota/><span class=en> software-and-i</span></a>"
- "- <a href='/1/'><span class=en> library</span></a></table>")
- expected = [
- ('starttag', 'html', []),
- ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
- ('starttag', 'table',
- [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
- ('starttag', 'tr', []),
- ('starttag', 'td', [('align', 'left')]),
- ('starttag', 'font', [('size', '-1')]),
- ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
- ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
- ('endtag', 'span'), ('endtag', 'a'),
- ('data', '- '), ('starttag', 'a', [('href', '/1/')]),
- ('starttag', 'span', [('class', 'en')]), ('data', ' library'),
- ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
- ]
- self._run_check(html, expected)
-
- def test_comma_between_attributes(self):
- self._run_check('<form action="/xxx.php?a=1&b=2&", '
- 'method="post">', [
- ('starttag', 'form',
- [('action', '/xxx.php?a=1&b=2&'),
- (',', None), ('method', 'post')])])
-
- def test_weird_chars_in_unquoted_attribute_values(self):
- self._run_check('<form action=bogus|&#()value>', [
- ('starttag', 'form',
- [('action', 'bogus|&#()value')])])
-
def test_invalid_end_tags(self):
# A collection of broken end tags. <br> is used as separator.
# see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
@@ -766,6 +730,62 @@ def test_end_tag_in_attribute_value(self):
[("href", "http://www.example.org/\">;")]),
("data", "spam"), ("endtag", "a")])
+ def test_with_unquoted_attributes(self):
+ # see #12008
+ html = ("<html><body bgcolor=d0ca90 text='181008'>"
+ "<table cellspacing=0 cellpadding=1 width=100% ><tr>"
+ "<td align=left><font size=-1>"
+ "- <a href=/rabota/><span class=en> software-and-i</span></a>"
+ "- <a href='/1/'><span class=en> library</span></a></table>")
+ expected = [
+ ('starttag', 'html', []),
+ ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
+ ('starttag', 'table',
+ [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
+ ('starttag', 'tr', []),
+ ('starttag', 'td', [('align', 'left')]),
+ ('starttag', 'font', [('size', '-1')]),
+ ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
+ ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
+ ('endtag', 'span'), ('endtag', 'a'),
+ ('data', '- '), ('starttag', 'a', [('href', '/1/')]),
+ ('starttag', 'span', [('class', 'en')]), ('data', ' library'),
+ ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
+ ]
+ self._run_check(html, expected)
+
+ def test_comma_between_attributes(self):
+ # see bpo-41748
+ # HTMLParser preserves duplicate attributes, leaving the task of
+ # removing duplicate attributes to a conformant html tree builder
+ html = ('<div class=bar,baz=asd>' # between attrs (unquoted)
+ '<div class="bar",baz="asd">' # between attrs (quoted)
+ '<div class=bar, baz=asd,>' # after values (unquoted)
+ '<div class="bar", baz="asd",>' # after values (quoted)
+ '<div class="bar",>' # one comma values (quoted)
+ '<div class=,bar baz=,asd>' # before values (unquoted)
+ '<div class=,"bar" baz=,"asd">' # before values (quoted)
+ '<div ,class=bar ,baz=asd>' # before names
+ '<div class,="bar" baz,="asd">' # after names
+ )
+ expected = [
+ ('starttag', 'div', [('class', 'bar,baz=asd'),]),
+ ('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]),
+ ('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]),
+ ('starttag', 'div', [('class', 'bar'), (',', None),
+ ('baz', 'asd'), (',', None)]),
+ ('starttag', 'div', [('class', 'bar'), (',', None)]),
+ ('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]),
+ ('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]),
+ ('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]),
+ ('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]),
+ ]
+ self._run_check(html, expected)
+
+ def test_weird_chars_in_unquoted_attribute_values(self):
+ self._run_check('<form action=bogus|&#()value>', [
+ ('starttag', 'form',
+ [('action', 'bogus|&#()value')])])
if __name__ == "__main__":
unittest.main()
diff --git a/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst b/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst
new file mode 100644
index 0000000000000..52efa3ac3d40e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst
@@ -0,0 +1,2 @@
+Fix HTMLParser parsing rules for element attributes containing
+commas with spaces. Patch by Karl Dubost.
\ No newline at end of file
More information about the Python-checkins mailing list