[Python-checkins] cpython (2.7): Issue #13333: The UTF-7 decoder now accepts lone surrogates

antoine.pitrou python-checkins at python.org
Tue Nov 15 01:54:57 CET 2011


http://hg.python.org/cpython/rev/050772822bde
changeset:   73554:050772822bde
branch:      2.7
parent:      73550:555871844962
user:        Antoine Pitrou <solipsis at pitrou.net>
date:        Tue Nov 15 01:49:40 2011 +0100
summary:
  Issue #13333: The UTF-7 decoder now accepts lone surrogates
(the encoder already accepts them).

files:
  Lib/test/test_unicode.py |  14 +++++++++++---
  Misc/NEWS                |   3 +++
  Objects/unicodeobject.c  |  14 +++++---------
  3 files changed, 19 insertions(+), 12 deletions(-)


diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -771,10 +771,18 @@
         for (x, y) in utfTests:
             self.assertEqual(x.encode('utf-7'), y)
 
-        # Unpaired surrogates not supported
-        self.assertRaises(UnicodeError, unicode, '+3ADYAA-', 'utf-7')
+        # Unpaired surrogates are passed through
+        self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-')
+        self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x')
+        self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-')
+        self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x')
+        self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801')
+        self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x')
+        self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01')
+        self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x')
 
-        self.assertEqual(unicode('+3ADYAA-', 'utf-7', 'replace'), u'\ufffd\ufffd')
+        self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-')
+        self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde')
 
         # Direct encoded characters
         set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
diff --git a/Misc/NEWS b/Misc/NEWS
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -9,6 +9,9 @@
 Core and Builtins
 -----------------
 
+- Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder
+  already accepts them).
+
 - Remove Py3k warning for callable.
 
 - Issue #10519: Avoid unnecessary recursive function calls in
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1628,21 +1628,17 @@
                             *p++ = outCh;
 #endif
                             surrogate = 0;
+                            continue;
                         }
                         else {
+                            *p++ = surrogate;
                             surrogate = 0;
-                            errmsg = "second surrogate missing";
-                            goto utf7Error;
                         }
                     }
-                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                         /* first surrogate */
                         surrogate = outCh;
                     }
-                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
-                        errmsg = "unexpected second surrogate";
-                        goto utf7Error;
-                    }
                     else {
                         *p++ = outCh;
                     }
@@ -1652,8 +1648,8 @@
                 inShift = 0;
                 s++;
                 if (surrogate) {
-                    errmsg = "second surrogate missing at end of shift sequence";
-                    goto utf7Error;
+                    *p++ = surrogate;
+                    surrogate = 0;
                 }
                 if (base64bits > 0) { /* left-over bits */
                     if (base64bits >= 6) {

-- 
Repository URL: http://hg.python.org/cpython


More information about the Python-checkins mailing list