Adding info to Python Buglist

Sat Oct 16 21:11:03 EDT 1999

I've located the problem.  It's a bug in the "binascii.c" module.

The binascii base64 decoding function assumes that, for every pad
character ('='), there is one less byte present in the output.  There
are only two valid quad sequences with the pad symbol: 'XXX=' and
'XX=='. Using the binascii assumption with '====', 4 characters are
removed from a string with 3 characters.  _PyString_Resize() is called
without checking whether the calling argument is a positive, and odd
behavior can follow.

The RFC* says that invalid sequences and characters should be ignored. 
This includes invalid pad sequences and invalid characters.  The
"binascii.c" file that I got for Python 1.5.2 does not ignore valid
characters or invalid pad sequences.  Characters are masked with 0x7f;
the binary value 0xc1 is mapped to the letter 'A' in this case.

I've fixed the problem in my copy of Python, and have included the patch
to "Modules/binascii.c" with this message.  To apply the patch:

  1) Go into the directory containing source code for Python 1.5.2.
  2) Go into the Modules directory.
  3) Apply the patch: patch -N binascii.c binascii.patch
  4) Recompile.

*The RFC I'm talking about is RFC 1521.  Section 5.2 details base64
encoding.

	Jason Trowbridge
	ratman at nmt.edu
-------------- next part --------------
diff -Naur older/Modules/binascii.c newer/Modules/binascii.c

--- older/Modules/binascii.c	Sat Oct 16 18:44:21 1999
+++ newer/Modules/binascii.c	Sat Oct 16 18:49:46 1999
@@ -75,6 +75,21 @@
 ** Jack Jansen, CWI, July 1995.
 */
 
+/*  I don't know about the other encodings, but base64 is detailed in the MIME
+ *  RFC.  Take a look at Section 5.2 of RFC 1521 at
+ *  http://www.faqs.org/rfcs/rfc1521.html for details on what's valid
+ *  and what's not.
+ *
+ *  Fixed a bug in base64 decoding involving illegal pad sequences.  Illegal
+ *  pad sequences would actually shorten the result string.  If there were
+ *  too many illegal pad sequences, the string could end up with a negative
+ *  length, resulting in Bad Things.
+ *
+ *  Fixed a bug where illegal characters could be remapped into legal
+ *  characters (instead of being ignored).
+ * 
+ *  --Jason Trowbridge
+ */
 
 #include "Python.h"
 
@@ -328,6 +343,29 @@
 	return rv;
 }
 
+
+int find_valid( char *s, int slen, int num )
+{
+	/* Finds & returns the (num+1)th valid character for base64, or -1 if none. */
+
+	int ret = -1, tmp;
+	unsigned char c, b64val;
+
+	while ((slen > 0) && (ret == -1)) {
+		c = *s;
+		b64val = table_a2b_base64[c & 0x7f];
+		if ( ((c <= 0x7f) && (b64val != (unsigned char)-1)) ) {
+			num--;
+			if (num == 0)
+				ret = *s;
+		}
+
+		s++;
+		slen--;
+	}
+	return ret;
+}
+
 static char doc_a2b_base64[] = "(ascii) -> bin. Decode a line of base64 data";
 
 static PyObject *
@@ -339,9 +377,10 @@
 	int leftbits = 0;
 	unsigned char this_ch;
 	unsigned int leftchar = 0;
-	int npad = 0;
+	int npadfills = 0;
 	PyObject *rv;
 	int ascii_len, bin_len;
+	int quad_pos = 0;
 	
 	if ( !PyArg_ParseTuple(args, "t#", &ascii_data, &ascii_len) )
 		return NULL;
@@ -353,37 +392,54 @@
 		return NULL;
 	bin_data = (unsigned char *)PyString_AsString(rv);
 	bin_len = 0;
-	for( ; ascii_len > 0 ; ascii_len--, ascii_data++ ) {
-		/* Skip some punctuation */
-		this_ch = (*ascii_data & 0x7f);
-		if ( this_ch == '\r' || this_ch == '\n' || this_ch == ' ' )
+
+	for( ; ascii_len > 0 ; ascii_len--, ascii_data++) {
+		this_ch = *ascii_data;
+
+		if (this_ch > 0x7f || this_ch == '\r' || this_ch == '\n' || this_ch == ' ')
 			continue;
-		
-		if ( this_ch == BASE64_PAD )
-			npad++;
-		this_ch = table_a2b_base64[(*ascii_data) & 0x7f];
-		if ( this_ch == (unsigned char) -1 ) continue;
+
+		if (this_ch == BASE64_PAD) {  // Ignore invalid pad sequences
+			if ( (quad_pos < 2) || 
+					 (quad_pos == 2) && (find_valid(ascii_data, ascii_len, 1) != '=') ) {
+				continue;
+			} else {
+				npadfills = 4 - quad_pos;
+			}
+		}
+
+		this_ch = table_a2b_base64[*ascii_data];
+		if ( this_ch == (unsigned char) -1 )
+			continue;
+
 		/*
 		** Shift it in on the low end, and see if there's
 		** a byte ready for output.
 		*/
+		quad_pos = (quad_pos + 1) & 0x03;
 		leftchar = (leftchar << 6) | (this_ch);
 		leftbits += 6;
+
 		if ( leftbits >= 8 ) {
 			leftbits -= 8;
-			*bin_data++ = (leftchar >> leftbits) & 0xff;
+			
+			if (npadfills == 0) {
+				*bin_data++ = (leftchar >> leftbits) & 0xff;
+				bin_len++;
+			} else {
+				npadfills--;
+			}
+
 			leftchar &= ((1 << leftbits) - 1);
-			bin_len++;
 		}
-	}
-	/* Check that no bits are left */
-	if ( leftbits ) {
+ 	}
+
+	if (leftbits != 0) {
 		PyErr_SetString(Error, "Incorrect padding");
 		Py_DECREF(rv);
 		return NULL;
 	}
-	/* and remove any padding */
-	bin_len -= npad;
+
 	/* and set string size correctly */
 	_PyString_Resize(&rv, bin_len);
 	return rv;