[Python-checkins] cpython (merge 3.2 -> default): Merge the HTMLParser fix with 3.2.

ezio.melotti python-checkins at python.org
Tue Feb 21 08:29:24 CET 2012


http://hg.python.org/cpython/rev/3a701916ba8a
changeset:   75118:3a701916ba8a
parent:      75110:e51951b4fcc3
parent:      75117:2d16048b10cd
user:        Ezio Melotti <ezio.melotti at gmail.com>
date:        Tue Feb 21 09:29:10 2012 +0200
summary:
  Merge the HTMLParser fix with 3.2.

files:
  Lib/html/parser.py          |  18 +++++++++++-------
  Lib/test/test_htmlparser.py |  21 +++++++++++++++++++++
  Misc/NEWS                   |   2 ++
  3 files changed, 34 insertions(+), 7 deletions(-)


diff --git a/Lib/html/parser.py b/Lib/html/parser.py
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -26,14 +26,18 @@
 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
-# Note, the strict one of this pair isn't really strict, but we can't
-# make it correctly strict without breaking backward compatibility.
+# Note:
+#  1) the strict attrfind isn't really strict, but we can't make it
+#     correctly strict without breaking backward compatibility;
+#  2) if you change attrfind remember to update locatestarttagend too;
+#  3) if you change attrfind and/or locatestarttagend the parser will
+#     explode, so don't do it.
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
-    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
-    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
+    r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
+    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
   (?:\s+                             # whitespace before attribute name
@@ -50,15 +54,15 @@
 """, re.VERBOSE)
 locatestarttagend_tolerant = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s*                             # optional whitespace before attribute name
-    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
+  (?:[\s/]*                          # optional whitespace or slashes before attribute name
+    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
       (?:\s*=+\s*                    # value indicator
         (?:'[^']*'                   # LITA-enclosed value
           |"[^"]*"                   # LIT-enclosed value
           |(?!['"])[^>\s]*           # bare value
          )
          (?:\s*,)*                   # possibly followed by a comma
-       )?\s*
+       )?(?:\s|/(?!>))*
      )*
    )?
   \s*                                # trailing whitespace
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -389,6 +389,27 @@
         self._run_check("<a foo='>'", [('data', "<a foo='>'")])
         self._run_check("<a foo='>", [('data', "<a foo='>")])
 
+    def test_slashes_in_starttag(self):
+        self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
+        html = ('<img width=902 height=250px '
+                'src="/sites/default/files/images/homepage/foo.jpg" '
+                '/*what am I doing here*/ />')
+        expected = [(
+            'startendtag', 'img',
+            [('width', '902'), ('height', '250px'),
+             ('src', '/sites/default/files/images/homepage/foo.jpg'),
+             ('*what', None), ('am', None), ('i', None),
+             ('doing', None), ('here*', None)]
+        )]
+        self._run_check(html, expected)
+        html = ('<a / /foo/ / /=/ / /bar/ / />'
+                '<a / /foo/ / /=/ / /bar/ / >')
+        expected = [
+            ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
+            ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
+        ]
+        self._run_check(html, expected)
+
     def test_declaration_junk_chars(self):
         self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
 
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -479,6 +479,8 @@
 Library
 -------
 
+- HTMLParser is now able to handle slashes in the start tag.
+
 - Issue #13641: Decoding functions in the base64 module now accept ASCII-only
   unicode strings.  Patch by Catalin Iacob.
 

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list