[Python-checkins] r43532 - in python/trunk: Doc/lib/libsgmllib.tex Lib/sgmllib.py Lib/test/test_sgmllib.py Misc/NEWS

georg.brandl python-checkins at python.org
Sat Apr 1 10:35:20 CEST 2006


Author: georg.brandl
Date: Sat Apr  1 10:35:18 2006
New Revision: 43532

Modified:
   python/trunk/Doc/lib/libsgmllib.tex
   python/trunk/Lib/sgmllib.py
   python/trunk/Lib/test/test_sgmllib.py
   python/trunk/Misc/NEWS
Log:
patch #1462498: handle entityrefs in attribute values.



Modified: python/trunk/Doc/lib/libsgmllib.tex
==============================================================================
--- python/trunk/Doc/lib/libsgmllib.tex	(original)
+++ python/trunk/Doc/lib/libsgmllib.tex	Sat Apr  1 10:35:18 2006
@@ -95,12 +95,15 @@
 should be used to support semantic interpretation of the start tag.
 The \var{attributes} argument is a list of \code{(\var{name},
 \var{value})} pairs containing the attributes found inside the tag's
-\code{<>} brackets.  The \var{name} has been translated to lower case
-and double quotes and backslashes in the \var{value} have been interpreted.
+\code{<>} brackets.  The \var{name} has been translated to lower case.
+Double quotes and backslashes in the \var{value} have been interpreted,
+as well as known entity and character references.
 For instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this
 method would be called as \samp{unknown_starttag('a', [('href',
 'http://www.cwi.nl/')])}.  The base implementation simply calls
 \var{method} with \var{attributes} as the only argument.
+\versionadded[Handling of entity and character references within
+              attribute values]{2.5}
 \end{methoddesc}
 
 \begin{methoddesc}{handle_endtag}{tag, method}

Modified: python/trunk/Lib/sgmllib.py
==============================================================================
--- python/trunk/Lib/sgmllib.py	(original)
+++ python/trunk/Lib/sgmllib.py	Sat Apr  1 10:35:18 2006
@@ -269,9 +269,37 @@
             attrname, rest, attrvalue = match.group(1, 2, 3)
             if not rest:
                 attrvalue = attrname
-            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
-                 attrvalue[:1] == '"' == attrvalue[-1:]:
-                attrvalue = attrvalue[1:-1]
+            else:
+                if (attrvalue[:1] == "'" == attrvalue[-1:] or 
+                    attrvalue[:1] == '"' == attrvalue[-1:]):
+                    # strip quotes
+                    attrvalue = attrvalue[1:-1]
+                l = 0
+                new_attrvalue = ''
+                while l < len(attrvalue):
+                    av_match = entityref.match(attrvalue, l)
+                    if (av_match and av_match.group(1) in self.entitydefs and
+                        attrvalue[av_match.end(1)] == ';'):
+                        # only substitute entityrefs ending in ';' since
+                        # otherwise we may break <a href='?p=x&q=y'>
+                        # which is very common
+                        new_attrvalue += self.entitydefs[av_match.group(1)]
+                        l = av_match.end(0)
+                        continue
+                    ch_match = charref.match(attrvalue, l)
+                    if ch_match:
+                        try:
+                            char = chr(int(ch_match.group(1)))
+                            new_attrvalue += char
+                            l = ch_match.end(0)
+                            continue
+                        except ValueError:
+                            # invalid character reference, don't substitute
+                            pass
+                    # all other cases
+                    new_attrvalue += attrvalue[l]
+                    l += 1
+                attrvalue = new_attrvalue
             attrs.append((attrname.lower(), attrvalue))
             k = match.end(0)
         if rawdata[j] == '>':

Modified: python/trunk/Lib/test/test_sgmllib.py
==============================================================================
--- python/trunk/Lib/test/test_sgmllib.py	(original)
+++ python/trunk/Lib/test/test_sgmllib.py	Sat Apr  1 10:35:18 2006
@@ -214,6 +214,20 @@
             ("starttag", "e", [("a", "rgb(1,2,3)")]),
             ])
 
+    def test_attr_values_entities(self):
+        """Substitution of entities and charrefs in attribute values"""
+        # SF bug #1452246
+        self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
+                                f="&xxx;" g='&#32;&#33;' h='&#500;' i='x?a=b&c=d;'>""",
+            [("starttag", "a", [("b", "<"),
+                                ("c", "<>"),
+                                ("d", "&lt->"),
+                                ("e", "< "),
+                                ("f", "&xxx;"),
+                                ("g", " !"),
+                                ("h", "&#500;"),
+                                ("i", "x?a=b&c=d;"), ])])
+
     def test_attr_funky_names(self):
         self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
             ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),

Modified: python/trunk/Misc/NEWS
==============================================================================
--- python/trunk/Misc/NEWS	(original)
+++ python/trunk/Misc/NEWS	Sat Apr  1 10:35:18 2006
@@ -489,6 +489,9 @@
 Library
 -------
 
+- Patch #1462498: sgmllib now handles entity and character references
+  in attribute values.
+
 - Added the sqlite3 package. This is based on pysqlite2.1.3, and provides
   a DB-API interface in the standard library. You'll need sqlite 3.2.2 or
   later to build this - if you have an earlier version, the C extension 


More information about the Python-checkins mailing list