[Python-checkins] bpo-34749: Improved performance of binascii.a2b_base64(). (GH-9444)

Miss Islington (bot) webhook-mailer at python.org
Sun Jul 14 08:15:36 EDT 2019


https://github.com/python/cpython/commit/1c5e68e7145f0825f9b952389141edb9436eb43d
commit: 1c5e68e7145f0825f9b952389141edb9436eb43d
branch: master
author: Sergey Fedoseev <fedoseev.sergey at gmail.com>
committer: Miss Islington (bot) <31488909+miss-islington at users.noreply.github.com>
date: 2019-07-14T05:15:32-07:00
summary:

bpo-34749: Improved performance of binascii.a2b_base64(). (GH-9444)



https://bugs.python.org/issue34749

files:
A Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst
M Modules/binascii.c

diff --git a/Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst b/Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst
new file mode 100644
index 000000000000..5a5e5b492c0b
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-09-21-13-23-29.bpo-34749.B0k819.rst
@@ -0,0 +1,2 @@
+:func:`binascii.a2b_base64` is now up to 2 times faster. Patch by Sergey
+Fedoseev.
diff --git a/Modules/binascii.c b/Modules/binascii.c
index 1c7dc35882de..94b0732c12c8 100644
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -130,7 +130,7 @@ static const unsigned char table_a2b_hqx[256] = {
 static const unsigned char table_b2a_hqx[] =
 "!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr";
 
-static const char table_a2b_base64[] = {
+static const unsigned char table_a2b_base64[] = {
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
@@ -138,7 +138,16 @@ static const char table_a2b_base64[] = {
     -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
     15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
     -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
-    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
+    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
+
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
 };
 
 #define BASE64_PAD '='
@@ -413,32 +422,6 @@ binascii_b2a_uu_impl(PyObject *module, Py_buffer *data, int backtick)
     return _PyBytesWriter_Finish(&writer, ascii_data);
 }
 
-
-static int
-binascii_find_valid(const unsigned char *s, Py_ssize_t slen, int num)
-{
-    /* Finds & returns the (num+1)th
-    ** valid character for base64, or -1 if none.
-    */
-
-    int ret = -1;
-    unsigned char c, b64val;
-
-    while ((slen > 0) && (ret == -1)) {
-        c = *s;
-        b64val = table_a2b_base64[c & 0x7f];
-        if ( ((c <= 0x7f) && (b64val != (unsigned char)-1)) ) {
-            if (num == 0)
-                ret = *s;
-            num--;
-        }
-
-        s++;
-        slen--;
-    }
-    return ret;
-}
-
 /*[clinic input]
 binascii.a2b_base64
 
@@ -452,88 +435,74 @@ static PyObject *
 binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
 /*[clinic end generated code: output=0628223f19fd3f9b input=5872acf6e1cac243]*/
 {
-    const unsigned char *ascii_data;
-    unsigned char *bin_data;
-    unsigned char *bin_data_start;
-    int leftbits = 0;
-    unsigned char this_ch;
-    unsigned int leftchar = 0;
-    Py_ssize_t ascii_len, bin_len;
-    int quad_pos = 0;
-    _PyBytesWriter writer;
-    binascii_state *state;
-
-    ascii_data = data->buf;
-    ascii_len = data->len;
+    assert(data->len >= 0);
 
-    assert(ascii_len >= 0);
-
-    if (ascii_len > PY_SSIZE_T_MAX - 3)
-        return PyErr_NoMemory();
-
-    bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
-
-    _PyBytesWriter_Init(&writer);
+    const unsigned char *ascii_data = data->buf;
+    size_t ascii_len = data->len;
 
     /* Allocate the buffer */
-    bin_data = _PyBytesWriter_Alloc(&writer, bin_len);
+    Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
+    _PyBytesWriter writer;
+    _PyBytesWriter_Init(&writer);
+    unsigned char *bin_data = _PyBytesWriter_Alloc(&writer, bin_len);
     if (bin_data == NULL)
         return NULL;
-    bin_data_start = bin_data;
-
-    for( ; ascii_len > 0; ascii_len--, ascii_data++) {
-        this_ch = *ascii_data;
+    unsigned char *bin_data_start = bin_data;
 
-        if (this_ch > 0x7f ||
-            this_ch == '\r' || this_ch == '\n' || this_ch == ' ')
-            continue;
+    int quad_pos = 0;
+    unsigned char leftchar = 0;
+    int pads = 0;
+    for (size_t i = 0; i < ascii_len; i++) {
+        unsigned char this_ch = ascii_data[i];
 
         /* Check for pad sequences and ignore
         ** the invalid ones.
         */
         if (this_ch == BASE64_PAD) {
-            if ( (quad_pos < 2) ||
-                 ((quad_pos == 2) &&
-                  (binascii_find_valid(ascii_data, ascii_len, 1)
-                   != BASE64_PAD)) )
-            {
-                continue;
-            }
-            else {
+            if (quad_pos >= 2 && quad_pos + ++pads >= 4) {
                 /* A pad sequence means no more input.
                 ** We've already interpreted the data
                 ** from the quad at this point.
                 */
-                leftbits = 0;
-                break;
+                goto done;
             }
+            continue;
         }
 
-        this_ch = table_a2b_base64[*ascii_data];
-        if ( this_ch == (unsigned char) -1 )
+        this_ch = table_a2b_base64[this_ch];
+        if (this_ch >= 64) {
             continue;
+        }
+        pads = 0;
 
-        /*
-        ** Shift it in on the low end, and see if there's
-        ** a byte ready for output.
-        */
-        quad_pos = (quad_pos + 1) & 0x03;
-        leftchar = (leftchar << 6) | (this_ch);
-        leftbits += 6;
-
-        if ( leftbits >= 8 ) {
-            leftbits -= 8;
-            *bin_data++ = (leftchar >> leftbits) & 0xff;
-            leftchar &= ((1 << leftbits) - 1);
+        switch (quad_pos) {
+            case 0:
+                quad_pos = 1;
+                leftchar = this_ch;
+                break;
+            case 1:
+                quad_pos = 2;
+                *bin_data++ = (leftchar << 2) | (this_ch >> 4);
+                leftchar = this_ch & 0x0f;
+                break;
+            case 2:
+                quad_pos = 3;
+                *bin_data++ = (leftchar << 4) | (this_ch >> 2);
+                leftchar = this_ch & 0x03;
+                break;
+            case 3:
+                quad_pos = 0;
+                *bin_data++ = (leftchar << 6) | (this_ch);
+                leftchar = 0;
+                break;
         }
     }
 
-    if (leftbits != 0) {
-        state = PyModule_GetState(module);
+    if (quad_pos != 0) {
+        binascii_state *state = PyModule_GetState(module);
         if (state == NULL) {
-            return NULL;
-        }
-        if (leftbits == 6) {
+            /* error already set, from PyModule_GetState */
+        } else if (quad_pos == 1) {
             /*
             ** There is exactly one extra valid, non-padding, base64 character.
             ** This is an invalid length, as there is no possible input that
@@ -551,6 +520,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data)
         return NULL;
     }
 
+done:
     return _PyBytesWriter_Finish(&writer, bin_data);
 }
 



More information about the Python-checkins mailing list