[Python-checkins] r43532 - in python/trunk: Doc/lib/libsgmllib.tex Lib/sgmllib.py Lib/test/test_sgmllib.py Misc/NEWS
georg.brandl
python-checkins at python.org
Sat Apr 1 10:35:20 CEST 2006
Author: georg.brandl
Date: Sat Apr 1 10:35:18 2006
New Revision: 43532
Modified:
python/trunk/Doc/lib/libsgmllib.tex
python/trunk/Lib/sgmllib.py
python/trunk/Lib/test/test_sgmllib.py
python/trunk/Misc/NEWS
Log:
patch #1462498: handle entityrefs in attribute values.
Modified: python/trunk/Doc/lib/libsgmllib.tex
==============================================================================
--- python/trunk/Doc/lib/libsgmllib.tex (original)
+++ python/trunk/Doc/lib/libsgmllib.tex Sat Apr 1 10:35:18 2006
@@ -95,12 +95,15 @@
should be used to support semantic interpretation of the start tag.
The \var{attributes} argument is a list of \code{(\var{name},
\var{value})} pairs containing the attributes found inside the tag's
-\code{<>} brackets. The \var{name} has been translated to lower case
-and double quotes and backslashes in the \var{value} have been interpreted.
+\code{<>} brackets. The \var{name} has been translated to lower case.
+Double quotes and backslashes in the \var{value} have been interpreted,
+as well as known entity and character references.
For instance, for the tag \code{<A HREF="http://www.cwi.nl/">}, this
method would be called as \samp{unknown_starttag('a', [('href',
'http://www.cwi.nl/')])}. The base implementation simply calls
\var{method} with \var{attributes} as the only argument.
+\versionadded[Handling of entity and character references within
+ attribute values]{2.5}
\end{methoddesc}
\begin{methoddesc}{handle_endtag}{tag, method}
Modified: python/trunk/Lib/sgmllib.py
==============================================================================
--- python/trunk/Lib/sgmllib.py (original)
+++ python/trunk/Lib/sgmllib.py Sat Apr 1 10:35:18 2006
@@ -269,9 +269,37 @@
attrname, rest, attrvalue = match.group(1, 2, 3)
if not rest:
attrvalue = attrname
- elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
- attrvalue[:1] == '"' == attrvalue[-1:]:
- attrvalue = attrvalue[1:-1]
+ else:
+ if (attrvalue[:1] == "'" == attrvalue[-1:] or
+ attrvalue[:1] == '"' == attrvalue[-1:]):
+ # strip quotes
+ attrvalue = attrvalue[1:-1]
+ l = 0
+ new_attrvalue = ''
+ while l < len(attrvalue):
+ av_match = entityref.match(attrvalue, l)
+ if (av_match and av_match.group(1) in self.entitydefs and
+ attrvalue[av_match.end(1)] == ';'):
+ # only substitute entityrefs ending in ';' since
+ # otherwise we may break <a href='?p=x&q=y'>
+ # which is very common
+ new_attrvalue += self.entitydefs[av_match.group(1)]
+ l = av_match.end(0)
+ continue
+ ch_match = charref.match(attrvalue, l)
+ if ch_match:
+ try:
+ char = chr(int(ch_match.group(1)))
+ new_attrvalue += char
+ l = ch_match.end(0)
+ continue
+ except ValueError:
+ # invalid character reference, don't substitute
+ pass
+ # all other cases
+ new_attrvalue += attrvalue[l]
+ l += 1
+ attrvalue = new_attrvalue
attrs.append((attrname.lower(), attrvalue))
k = match.end(0)
if rawdata[j] == '>':
Modified: python/trunk/Lib/test/test_sgmllib.py
==============================================================================
--- python/trunk/Lib/test/test_sgmllib.py (original)
+++ python/trunk/Lib/test/test_sgmllib.py Sat Apr 1 10:35:18 2006
@@ -214,6 +214,20 @@
("starttag", "e", [("a", "rgb(1,2,3)")]),
])
+ def test_attr_values_entities(self):
+ """Substitution of entities and charrefs in attribute values"""
+ # SF bug #1452246
+ self.check_events("""<a b=&lt; c=&lt;&gt; d=&lt-&gt; e='&lt; '
+ f="&xxx;" g='&#32;&#33;' h='&#500;' i='x?a=b&c=d;'>""",
+ [("starttag", "a", [("b", "<"),
+ ("c", "<>"),
+ ("d", "&lt-&gt"),
+ ("e", "< "),
+ ("f", "&xxx;"),
+ ("g", " !"),
+ ("h", "&#500;"),
+ ("i", "x?a=b&c=d;"), ])])
+
def test_attr_funky_names(self):
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),
Modified: python/trunk/Misc/NEWS
==============================================================================
--- python/trunk/Misc/NEWS (original)
+++ python/trunk/Misc/NEWS Sat Apr 1 10:35:18 2006
@@ -489,6 +489,9 @@
Library
-------
+- Patch #1462498: sgmllib now handles entity and character references
+ in attribute values.
+
- Added the sqlite3 package. This is based on pysqlite2.1.3, and provides
a DB-API interface in the standard library. You'll need sqlite 3.2.2 or
later to build this - if you have an earlier version, the C extension
More information about the Python-checkins
mailing list