[issue14648] Attempt to format ascii and non-ascii strings together fails with "... UCS2 ..."

STINNER Victor report at bugs.python.org
Mon Apr 23 23:16:08 CEST 2012


STINNER Victor <victor.stinner at gmail.com> added the comment:

> >>> "{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412")
> python: Objects/unicodeobject.c:1223: _copy_characters: Assertion `ch <= to_maxchar' failed.

Attached patch fixes this issue.

----------
keywords: +patch
Added file: http://bugs.python.org/file25327/format_nonascii.patch

_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue14648>
_______________________________________
-------------- next part --------------
diff -r 6762b943ee59 Lib/test/test_unicode.py
--- a/Lib/test/test_unicode.py	Tue Apr 17 21:42:07 2012 -0400
+++ b/Lib/test/test_unicode.py	Mon Apr 23 16:25:13 2012 +0200
@@ -924,6 +924,14 @@ class UnicodeTest(string_tests.CommonTes
         self.assertRaises(ValueError, format, '', '#')
         self.assertRaises(ValueError, format, '', '#20')
 
+        # Non-ASCII
+        self.assertEqual("{0:s}{1:s}".format("ABC", "\u0410\u0411\u0412"),
+                         'ABC\u0410\u0411\u0412')
+        self.assertEqual("{0:.3s}".format("ABC\u0410\u0411\u0412"),
+                         'ABC')
+        self.assertEqual("{0:.0s}".format("ABC\u0410\u0411\u0412"),
+                         '')
+
     def test_format_map(self):
         self.assertEqual(''.format_map({}), '')
         self.assertEqual('a'.format_map({}), 'a')
diff -r 6762b943ee59 Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Tue Apr 17 21:42:07 2012 -0400
+++ b/Objects/unicodeobject.c	Mon Apr 23 16:25:13 2012 +0200
@@ -1957,6 +1957,37 @@ PyUnicode_FromKindAndData(int kind, cons
     }
 }
 
+Py_UCS4  /* Max character value for the range [start, end) of unicode. */
+_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
+{
+    enum PyUnicode_Kind kind;
+    void *startptr, *endptr;
+
+    assert(PyUnicode_IS_READY(unicode));  /* preconditions, checked in debug builds only */
+    assert(0 <= start);
+    assert(end <= PyUnicode_GET_LENGTH(unicode));
+    assert(start <= end);
+
+    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))  /* whole string: no scan */
+        return PyUnicode_MAX_CHAR_VALUE(unicode);
+
+    if (start == end)  /* empty range: nothing to scan */
+        return 127;  /* 127 is the highest ASCII code point */
+
+    kind = PyUnicode_KIND(unicode);
+    startptr = PyUnicode_DATA(unicode);
+    endptr = (char*)startptr + end * kind;  /* kind is used as the byte width of one character */
+    if (start)  /* startptr already addresses index 0 when start == 0 */
+        startptr = (char*)startptr + start * kind;
+    switch(kind)  /* scan the sub-range with the helper matching the storage kind */
+    {
+    case PyUnicode_1BYTE_KIND: return ucs1lib_find_max_char(startptr, endptr);
+    case PyUnicode_2BYTE_KIND: return ucs2lib_find_max_char(startptr, endptr);
+    default:  /* any other kind value is handled as 4-byte storage */
+    case PyUnicode_4BYTE_KIND: return ucs4lib_find_max_char(startptr, endptr);
+    }
+}
+
 /* Ensure that a string uses the most efficient storage, if it is not the
    case: create a new string with of the right kind. Write NULL into *p_unicode
    on error. */
diff -r 6762b943ee59 Python/formatter_unicode.c
--- a/Python/formatter_unicode.c	Tue Apr 17 21:42:07 2012 -0400
+++ b/Python/formatter_unicode.c	Mon Apr 23 16:25:13 2012 +0200
@@ -713,10 +713,10 @@ format_string_internal(PyObject *value, 
     Py_ssize_t lpad;
     Py_ssize_t rpad;
     Py_ssize_t total;
-    Py_ssize_t pos;
+    Py_ssize_t i, pos;
     Py_ssize_t len = PyUnicode_GET_LENGTH(value);
     PyObject *result = NULL;
-    Py_UCS4 maxchar = 127;
+    Py_UCS4 ch, maxchar = 127;
 
     /* sign is not allowed on strings */
     if (format->sign != '\0') {
@@ -752,8 +752,12 @@ format_string_internal(PyObject *value, 
     if (lpad != 0 || rpad != 0)
         maxchar = Py_MAX(maxchar, format->fill_char);
 
+    ch = _PyUnicode_FindMaxChar(value, 0, len);
+    maxchar = Py_MAX(maxchar, ch);
+
     /* allocate the resulting string */
     result = PyUnicode_New(total, maxchar);
+    printf("maxchar = 0x%x\n", maxchar);
     if (result == NULL)
         goto done;
 


More information about the Python-bugs-list mailing list