[issue14339] Optimizing bin, oct and hex

Sun Apr 15 22:32:12 CEST 2012

Serhiy Storchaka <storchaka at gmail.com> added the comment:

> (1) The patch appears to assume that a Unicode string created with PyUnicode_New(size, 127) will have 'kind' PyUnicode_1BYTE_KIND.  While this might be true in the current implementation, I don't know whether this is guaranteed in general.  Martin, any comments on this?

The same assumption is used above in long_to_decimal_string(). I added
the same assert and changed maxchar to 'f'.

> (2) The patch doesn't compile with '--with-pydebug':  there's a reference to the (now) undefined variable 'buffer' in one of the asserts.

Fixed.

> (3) The overflow check looks as though it needs to be reworked.

I was left an old constraint (it is enough), but it really can be
weakened (in fact, it can be so weakened in the current code).

Thank you for the comments and the found error.

----------
Added file: http://bugs.python.org/file25227/long_to_binary_base_2.patch

_______________________________________
Python tracker <report at bugs.python.org>
<http://bugs.python.org/issue14339>
_______________________________________
-------------- next part --------------
diff -r 13307eb5bf47 Objects/longobject.c

--- a/Objects/longobject.c	Sat Apr 14 14:20:29 2012 -0500
+++ b/Objects/longobject.c	Sun Apr 15 23:29:34 2012 +0300
@@ -1672,11 +1672,10 @@
 {
     register PyLongObject *a = (PyLongObject *)aa;
     PyObject *v;
-    Py_ssize_t i, sz;
+    Py_ssize_t sz;
     Py_ssize_t size_a;
-    char *p;
-    char sign = '\0';
-    char *buffer;
+    Py_UCS1 *p;
+    int negative;
     int bits;
 
     assert(base == 2 || base == 8 || base == 10 || base == 16);
@@ -1688,6 +1687,7 @@
         return NULL;
     }
     size_a = ABS(Py_SIZE(a));
+    negative = Py_SIZE(a) < 0;
 
     /* Compute a rough upper bound for the length of the string */
     switch (base) {
@@ -1706,31 +1706,37 @@
     }
     /* compute length of output string: allow 2 characters for prefix and
        1 for possible '-' sign. */
-    if (size_a > (PY_SSIZE_T_MAX - 3) / PyLong_SHIFT / sizeof(Py_UCS4)) {
+    if (size_a > (PY_SSIZE_T_MAX - 3) / PyLong_SHIFT) {
         PyErr_SetString(PyExc_OverflowError,
                         "int is too large to format");
         return NULL;
     }
     /* now size_a * PyLong_SHIFT + 3 <= PY_SSIZE_T_MAX, so the RHS below
        is safe from overflow */
-    sz = 3 + (size_a * PyLong_SHIFT + (bits - 1)) / bits;
-    assert(sz >= 0);
-    buffer = PyMem_Malloc(sz);
-    if (buffer == NULL) {
+    if (size_a == 0) {
+        sz = 3;
+    }
+    else {
+        sz = 2 + negative + ((size_a - 1) * PyLong_SHIFT +
+                             bits_in_digit(a->ob_digit[size_a - 1]) +
+                             (bits - 1)) / bits;
+    }
+    v = PyUnicode_New(sz, 'f');
+    if (v == NULL) {
         PyErr_NoMemory();
         return NULL;
     }
-    p = &buffer[sz];
-    if (Py_SIZE(a) < 0)
-        sign = '-';
-
-    if (Py_SIZE(a) == 0) {
+    assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
+    p = PyUnicode_1BYTE_DATA(v) + sz;
+
+    if (size_a == 0) {
         *--p = '0';
     }
     else {
         /* JRH: special case for power-of-2 bases */
         twodigits accum = 0;
         int accumbits = 0;              /* # of bits in accum */
+        Py_ssize_t i;
         for (i = 0; i < size_a; ++i) {
             accum |= (twodigits)a->ob_digit[i] << accumbits;
             accumbits += PyLong_SHIFT;
@@ -1739,7 +1745,7 @@
                 char cdigit;
                 cdigit = (char)(accum & (base - 1));
                 cdigit += (cdigit < 10) ? '0' : 'a'-10;
-                assert(p > buffer);
+                assert(p > PyUnicode_1BYTE_DATA(v));
                 *--p = cdigit;
                 accumbits -= bits;
                 accum >>= bits;
@@ -1747,6 +1753,7 @@
         }
     }
 
+    assert(p == PyUnicode_1BYTE_DATA(v) + 2 + negative);
     if (base == 16)
         *--p = 'x';
     else if (base == 8)
@@ -1754,10 +1761,8 @@
     else /* (base == 2) */
         *--p = 'b';
     *--p = '0';
-    if (sign)
-        *--p = sign;
-    v = PyUnicode_DecodeASCII(p, &buffer[sz] - p, NULL);
-    PyMem_Free(buffer);
+    if (negative)
+        *--p = '-';
     return v;
 }