[Jython-checkins] jython: Tolerate unicode arguments to binascii methods (fixes #2826).

Tue Dec 24 06:10:11 EST 2019

https://hg.python.org/jython/rev/7fe475b0fea2
changeset:   8315:7fe475b0fea2
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Mon Dec 23 17:51:51 2019 +0000
summary:
  Tolerate unicode arguments to binascii methods (fixes #2826).

Where a unicode argument is given, it is interpreted as bytes through
the default encoding. A test suite is added (missing from CPython 2) to
extend test_binascii to unicode arguments (ASCII decoding only).

There is some tidying up in binascii.java, but more is needed.

files:
  Lib/test/test_binascii_jy.py         |   72 +++++
  NEWS                                 |    1 +
  src/org/python/modules/binascii.java |  197 +++++++-------
  3 files changed, 169 insertions(+), 101 deletions(-)

diff --git a/Lib/test/test_binascii_jy.py b/Lib/test/test_binascii_jy.py
new file mode 100644
--- /dev/null
+++ b/Lib/test/test_binascii_jy.py
@@ -0,0 +1,72 @@
+"""Test unicode handling in the binascii Java module."""
+
+from test import test_support
+from test.test_binascii import BinASCIITest
+import unittest
+import binascii
+
+
+class UnicodeBinASCIITest(BinASCIITest):
+
+    type2test = unicode
+
+    # Create binary test data, but only 7-bit data to survive implicit unicode to str conversion.
+    rawdata = "The quick brown fox jumps over the lazy dog.\r\n"
+    rawdata += "".join(map(chr, xrange(128)))
+    rawdata += "\r\nHello world.\n"
+
+    def test_base64invalid(self):
+        # Test base64 with random invalid characters sprinkled throughout.
+        # This is a copy of BinASCIITest.test_base64invalid with 256 changed to 128 where we
+        # generate "fillers".
+
+        # Creating the modified test reveals a latent bug in the test as written, which is that the
+        # padding character "=" is/was inserted as a filler. In the original test, the location of
+        # that is harmless. With the change 256 to 128, it causes early termination of the
+        # a2b_base64 conversion (both CPython and Jython). We therefore make padding a valid
+        # character, excluding it from the fillers.
+
+        MAX_BASE64 = 57
+        lines = []
+        for i in range(0, len(self.data), MAX_BASE64):
+            b = self.type2test(self.rawdata[i:i+MAX_BASE64])
+            a = binascii.b2a_base64(b)
+            lines.append(a)
+
+        fillers = ""
+        valid = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/"
+        valid += "=" # pad character also valid
+        for i in xrange(128): # not 256 as in BinASCIITest.test_base64invalid
+            c = chr(i)
+            if c not in valid:
+                fillers += c
+
+        def addnoise(line):
+            noise = fillers
+            ratio = len(line) // len(noise)
+            res = ""
+            while line and noise:
+                if len(line) // len(noise) > ratio:
+                    c, line = line[0], line[1:]
+                else:
+                    c, noise = noise[0], noise[1:]
+                res += c
+            return res + noise + line
+
+        res = ""
+        for line in map(addnoise, lines):
+            a = self.type2test(line)
+            b = binascii.a2b_base64(a)
+            res += b
+        self.assertEqual(res, self.rawdata)
+
+        # Test base64 with just invalid characters, which should return
+        # empty strings. TBD: shouldn't it raise an exception instead ?
+        self.assertEqual(binascii.a2b_base64(self.type2test(fillers)), '')
+
+
+def test_main():
+    test_support.run_unittest(UnicodeBinASCIITest)
+
+if __name__ == "__main__":
+    test_main()
diff --git a/NEWS b/NEWS
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,7 @@
 Jython 2.7.2b3
   Bugs fixed
     - [ 2820 ] Import fails with UnicodeDecodeError if sys.path contains invalid UTF-8 bytes
+    - [ 2826 ] Unicode hex string decode failure
     - [ 2836 ] Java Swing library works only in interactive jython session
 
 Jython 2.7.2b2
diff --git a/src/org/python/modules/binascii.java b/src/org/python/modules/binascii.java
--- a/src/org/python/modules/binascii.java
+++ b/src/org/python/modules/binascii.java
@@ -1,5 +1,6 @@
 /*
- * Copyright 1998 Finn Bock.
+ * Copyright 2019 Jython Developers
+ * Original conversion from CPython source copyright 1998 Finn Bock.
  *
  * This program contains material copyrighted by:
  * Copyright (c) 1991, 1992, 1993, 1994 by Stichting Mathematisch Centrum,
@@ -145,6 +146,7 @@
     private static short SKIP = 0x7E;
     private static short FAIL = 0x7D;
 
+    //@formatter:off
     private static short[] table_a2b_hqx = {
         /*       ^@    ^A    ^B    ^C    ^D    ^E    ^F    ^G   */
         /* 0*/  FAIL, FAIL, FAIL, FAIL, FAIL, FAIL, FAIL, FAIL,
@@ -195,13 +197,14 @@
                 FAIL, FAIL, FAIL, FAIL, FAIL, FAIL, FAIL, FAIL,
                 FAIL, FAIL, FAIL, FAIL, FAIL, FAIL, FAIL, FAIL,
     };
+    //@formatter:on
 
     private static byte[] table_b2a_hqx =
         StringUtil.toBytes("!\"#$%&'()*+,-012345689@ABCDEFGHIJKLMNPQRSTUVXYZ[`abcdefhijklmpqr");
 
 
 
-
+    //@formatter:off
     private static short table_a2b_base64[] = {
         -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
         -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
@@ -212,6 +215,7 @@
         -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
         41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
     };
+    //@formatter:on
 
     private static char BASE64_PAD = '=';
 
@@ -222,7 +226,7 @@
         StringUtil.toBytes("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
 
 
-
+    //@formatter:off
     private static int[] crctab_hqx = {
         0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
         0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef,
@@ -257,6 +261,7 @@
         0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8,
         0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0,
     };
+    //@formatter:on
 
 
 
@@ -270,7 +275,7 @@
      * binary data. Lines normally contain 45 (binary) bytes, except for the
      * last line. Line data may be followed by whitespace.
      */
-    public static PyString a2b_uu(BufferProtocol bp) {
+    public static PyString a2b_uu(PyObject bp) {
         int leftbits = 0;
         int leftchar = 0;
 
@@ -349,20 +354,22 @@
      * is the converted line, including a newline char. The length of
      * <i>data</i> should be at most 45.
      */
-    public static PyString b2a_uu(BufferProtocol bp) {
+    public static PyString b2a_uu(PyObject bp) {
         int leftbits = 0;
         char this_ch;
         int leftchar = 0;
 
-        PyBuffer bin_data = bp.getBuffer(PyBUF.SIMPLE);
+        try (PyBuffer bin_data = getTextBuffer(bp)) {
 
-        StringBuilder ascii_data = new StringBuilder();
-        try {
             int bin_len = bin_data.getLen();
             if (bin_len > 45) {
                 // The 45 is a limit that appears in all uuencode's
                 throw new PyException(Error, "At most 45 bytes at once");
             }
+            // Each 3 bytes in (rounded up) produces 4 characters out.
+            int ascii_len = 4 * ((bin_len + 2) / 3);
+            // Plus a 1 byte length and '\n'
+            StringBuilder ascii_data = new StringBuilder(ascii_len + 2);
 
             // Store the length
             ascii_data.append((char)(' ' + (bin_len & 077)));
@@ -383,15 +390,15 @@
                     ascii_data.append((char)(this_ch + ' '));
                 }
             }
-        } finally {
-            bin_data.release();
+
+            ascii_data.append('\n'); // Append a courtesy newline
+            return new PyString(ascii_data.toString());
+
+        } catch (ClassCastException e) {
+            throw argMustBeBytes("b2a_uu", bp);
         }
-        ascii_data.append('\n'); // Append a courtesy newline
-
-        return new PyString(ascii_data.toString());
     }
 
-
     private static int binascii_find_valid(PyBuffer b, int offset, int num) {
         int blen = b.getLen() - offset;
 
@@ -427,7 +434,7 @@
      * Convert a block of base64 data back to binary and return the
      * binary data. More than one line may be passed at a time.
      */
-    public static PyString a2b_base64(BufferProtocol bp) {
+    public static PyString a2b_base64(PyObject bp) {
         int leftbits = 0;
         char this_ch;
         int leftchar = 0;
@@ -437,7 +444,9 @@
             int ascii_len = ascii_data.getLen();
 
             int bin_len = 0;
-            StringBuilder bin_data = new StringBuilder();
+
+            // Every 4 characters (rounded up) maps to 3 bytes
+            StringBuilder bin_data = new StringBuilder(3 * ((ascii_len + 3) / 4));
 
             for (int i = 0; ascii_len > 0; ascii_len--, i++) {
                 // Skip some punctuation
@@ -498,18 +507,18 @@
      * Convert binary data to a line of ASCII characters in base64 coding.
      * The return value is the converted line, including a newline char.
      */
-    public static PyString b2a_base64(BufferProtocol bp) {
-        int leftbits = 0;
+    public static PyString b2a_base64(PyObject bp) {
+        int leftbits = 0;   // how many bits waiting
         char this_ch;
-        int leftchar = 0;
-
-        StringBuilder ascii_data = new StringBuilder();
+        int leftchar = 0;   // store bits not yet emitted (max 12 bits)
 
         try (PyBuffer bin_data = getTextBuffer(bp)) {
             int bin_len = bin_data.getLen();
             if (bin_len > BASE64_MAXBIN) {
                 throw new PyException(Error, "Too much data for base64 line");
             }
+            // Every 3 bytes (rounded up) maps to 4 characters (and there's a newline)
+            StringBuilder ascii_data = new StringBuilder(4 * ((bin_len + 2) / 3) + 1);
 
             for (int i = 0; bin_len > 0; bin_len--, i++) {
                 // Shift the data into our buffer
@@ -524,6 +533,7 @@
                 }
             }
 
+            // Emit the balance of bits and append a newline
             if (leftbits == 2) {
                 ascii_data.append((char) table_b2a_base64[(leftchar & 3) << 4]);
                 ascii_data.append(BASE64_PAD);
@@ -551,7 +561,7 @@
      * binary bytes, or (in case of the last portion of the binhex4 data)
      * have the remaining bits zero.
      */
-    public static PyTuple a2b_hqx(BufferProtocol bp) {
+    public static PyTuple a2b_hqx(PyObject bp) {
         int leftbits = 0;
         char this_ch;
         int leftchar = 0;
@@ -606,13 +616,13 @@
      * Perform binhex4 style RLE-compression on <i>data</i> and return the
      * result.
      */
-    static public PyString rlecode_hqx(BufferProtocol bp) {
-        PyBuffer in_data = bp.getBuffer(PyBUF.SIMPLE);
-        int len = in_data.getLen();
+    static public PyString rlecode_hqx(PyObject bp) {
+
+        try (PyBuffer in_data = getTextBuffer(bp)) {
 
-        StringBuilder out_data = new StringBuilder();
+            int len = in_data.getLen();
+            StringBuilder out_data = new StringBuilder();
 
-        try {
             for (int in=0; in < len; in++) {
                 char ch = (char) in_data.intAt(in);
                 if (ch == RUNCHAR) {
@@ -639,10 +649,10 @@
                     }
                 }
             }
-        } finally {
-            in_data.release();
+            return new PyString(out_data.toString());
+        } catch (ClassCastException e) {
+            throw argMustBeBytes("rlecode_hqx", bp);
         }
-        return new PyString(out_data.toString());
     }
 
 
@@ -655,7 +665,7 @@
      * resulting string. The argument should already be RLE-coded, and have a
      * length divisible by 3 (except possibly the last fragment).
      */
-    public static PyString b2a_hqx(BufferProtocol bp) {
+    public static PyString b2a_hqx(PyObject bp) {
         int leftbits = 0;
         char this_ch;
         int leftchar = 0;
@@ -702,21 +712,20 @@
      * unless data input data ends in an orphaned repeat indicator, in which
      * case the <tt>Incomplete</tt> exception is raised.
      */
-    static public PyString rledecode_hqx(BufferProtocol bp) {
+    static public PyString rledecode_hqx(PyObject bp) {
         char in_byte, in_repeat;
 
-        PyBuffer in_data = bp.getBuffer(PyBUF.SIMPLE);
-        int in_len = in_data.getLen();
-        int i = 0;
+        try (PyBuffer in_data = getTextBuffer(bp)) {
+            int in_len = in_data.getLen();
+            int i = 0;
 
-        StringBuilder out_data = new StringBuilder();
-        try {
+            StringBuilder out_data = new StringBuilder();
+
             // Empty string is a special case
             if (in_len == 0) {
                 return Py.EmptyString;
             }
 
-
             // Handle first byte separately (since we have to get angry
             // in case of an orphaned RLE code).
             if (--in_len < 0) {
@@ -767,14 +776,14 @@
                     out_data.append(in_byte);
                 }
             }
-        } finally {
-            in_data.release();
+
+            return new PyString(out_data.toString());
+
+        } catch (ClassCastException e) {
+            throw argMustBeBytes("rledecode_hqx", bp);
         }
-        return new PyString(out_data.toString());
     }
 
-
-
     public static PyString __doc__crc_hqx = new PyString(
         "(data, oldcrc) -> newcrc. Compute hqx CRC incrementally"
     );
@@ -784,26 +793,25 @@
      * Compute the binhex4 crc value of <i>data</i>, starting with an initial
      * <i>crc</i> and returning the result.
      */
-    public static int crc_hqx(BufferProtocol bp, int crc) {
-        PyBuffer bin_data = bp.getBuffer(PyBUF.SIMPLE);
-        int len = bin_data.getLen();
-        int i = 0;
+    public static int crc_hqx(PyObject bp, int crc) {
+        try (PyBuffer bin_data = getTextBuffer(bp)) {
+            int len = bin_data.getLen();
+            int i = 0;
 
-        try {
             while(len-- > 0) {
                 crc=((crc<<8)&0xff00) ^
                            crctab_hqx[((crc>>8)&0xff)^ (char) bin_data.intAt(i++)];
             }
-        } finally {
-            bin_data.release();
+            return crc;
+
+        } catch (ClassCastException e) {
+            throw argMustBeBytes("crc_hqx", bp);
         }
-
-        return crc;
     }
 
 
 
-
+//@formatter:off
 static long[] crc_32_tab = new long[] {
 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
@@ -858,27 +866,29 @@
 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
 0x2d02ef8dL
 };
+//@formatter:on
 
-    public static int crc32(BufferProtocol bp) {
+    public static int crc32(PyObject bp) {
         return crc32(bp, 0);
     }
 
-    public static int crc32(BufferProtocol bp, long crc) {
-        PyBuffer bin_data = bp.getBuffer(PyBUF.SIMPLE);
-        int len = bin_data.getLen();
+    public static int crc32(PyObject bp, long crc) {
 
         crc &= 0xFFFFFFFFL;
         crc = crc ^ 0xFFFFFFFFL;
-        try {
+
+        try (PyBuffer bin_data = getTextBuffer(bp)) {
+            int len = bin_data.getLen();
             for (int i = 0; i < len; i++) {
                 char ch = (char) bin_data.intAt(i);
                 crc = (int)crc_32_tab[(int) ((crc ^ ch) & 0xffL)] ^ (crc >> 8);
                 /* Note:  (crc >> 8) MUST zero fill on left */
                 crc &= 0xFFFFFFFFL;
             }
-        } finally {
-            bin_data.release();
+        } catch (ClassCastException e) {
+            throw argMustBeBytes("crc32", bp);
         }
+
         if (crc >= 0x80000000) {
             return -(int)(crc+1 & 0xFFFFFFFF);
         } else {
@@ -886,7 +896,6 @@
         }
     }
 
-
     private static char[] hexdigit = "0123456789abcdef".toCharArray();
 
     public static PyString __doc__b2a_hex = new PyString(
@@ -895,7 +904,7 @@
         "This function is also available as \"hexlify()\"."
     );
 
-    public static PyString b2a_hex(BufferProtocol bp) {
+    public static PyString b2a_hex(PyObject bp) {
 
         try (PyBuffer argbuf = getTextBuffer(bp)) {
 
@@ -916,7 +925,7 @@
         }
     }
 
-    public static PyString hexlify(BufferProtocol argbuf) {
+    public static PyString hexlify(PyObject argbuf) {
         return b2a_hex(argbuf);
     }
 
@@ -929,9 +938,9 @@
         "This function is also available as \"unhexlify()\""
     );
 
-    public static PyString a2b_hex(BufferProtocol bp) {
+    public static PyString a2b_hex(PyObject bp) {
 
-        try (PyBuffer argbuf = bp.getBuffer(PyBUF.SIMPLE)) {
+        try (PyBuffer argbuf = getTextBuffer(bp)) {
 
             int arglen = argbuf.getLen();
             StringBuilder retbuf = new StringBuilder(arglen / 2);
@@ -959,7 +968,7 @@
         }
     }
 
-    public static PyString unhexlify(BufferProtocol argbuf) {
+    public static PyString unhexlify(PyObject argbuf) {
         return a2b_hex(argbuf);
     }
 
@@ -994,20 +1003,12 @@
     {
         ArgParser ap = new ArgParser("a2b_qp", arg, kws, new String[] {"s", "header"});
 
-        PyObject pyObject = ap.getPyObject(0);
-        BufferProtocol bp;
-        if (pyObject instanceof BufferProtocol) {
-            bp = (BufferProtocol) pyObject;
-        } else {
-            throw Py.TypeError("expected something conforming to the buffer protocol, got "
-                    + pyObject.getType().fastGetName());
-        }
+        PyObject bp = ap.getPyObject(0);
 
         StringBuilder sb = new StringBuilder();
         boolean header = getIntFlagAsBool(ap, 1, 0, "an integer is required");
 
-        PyBuffer ascii_data = bp.getBuffer(PyBUF.SIMPLE);
-        try {
+        try (PyBuffer ascii_data = getTextBuffer((PyObject)bp)) {
             for (int i=0, m=ascii_data.getLen(); i<m;) {
                     char c = (char) ascii_data.intAt(i++);
                     if (header && c == '_') {
@@ -1034,10 +1035,10 @@
                             sb.append(c);
                     }
             }
-        } finally {
-            ascii_data.release();
+        return new PyString(sb.toString());
+        } catch (ClassCastException e) {
+            throw argMustBeBytes("a2b_qp", bp);
         }
-        return new PyString(sb.toString());
     }
 
     final private static Pattern RN_TO_N = Pattern.compile("\r\n");
@@ -1055,21 +1056,14 @@
         boolean istext = getIntFlagAsBool(ap, 2, 1, "an integer is required");
         boolean header = getIntFlagAsBool(ap, 3, 0, "an integer is required");
 
-        PyObject pyObject = ap.getPyObject(0);
-        BufferProtocol bp;
-        if (pyObject instanceof BufferProtocol) {
-            bp = (BufferProtocol) pyObject;
-        } else {
-            throw Py.TypeError("expected something conforming to the buffer protocol, got "
-                    + pyObject.getType().fastGetName());
-        }
+        PyObject bp = ap.getPyObject(0);
+
+        try (PyBuffer bin_data = getTextBuffer(bp)) {
 
-        PyBuffer bin_data = bp.getBuffer(PyBUF.SIMPLE);
-        int datalen = bin_data.getLen();
-        StringBuilder sb = new StringBuilder(datalen);
-        try {
+            int datalen = bin_data.getLen();
+            StringBuilder sb = new StringBuilder(datalen);
+            String lineEnd = "\n";
 
-            String lineEnd = "\n";
             // Work out if line endings should be crlf.
             for (int i = 0, m = bin_data.getLen(); i < m; i++) {
                 if ('\n' == bin_data.intAt(i)) {
@@ -1149,11 +1143,13 @@
                     }
                 }
             }
-        } finally {
-            bin_data.release();
+
+            return new PyString(sb.toString());
+
+        } catch (ClassCastException e) {
+            throw argMustBeBytes("b2a_qp", bp);
         }
 
-        return new PyString(sb.toString());
     }
 
     /**
@@ -1162,17 +1158,16 @@
      * may be a {@code PyUnicode}, in which case it will be encoded to bytes using the default
      * encoding ({@code sys.getdefaultencoding()}).
      *
-     * @param text an object with the buffer protocol
-     * @return a byte-buffer view of the ASCII text
+     * @param text an object with the buffer protocol (or {@code unicode})
+     * @return a byte-buffer view of the argument (or of its default encoding if {@code unicode})
      * @throws ClassCastException where the text object does not implement the buffer protocol
      */
-    private static PyBuffer getTextBuffer(BufferProtocol text)
-            throws ClassCastException {
+    private static PyBuffer getTextBuffer(PyObject text) throws ClassCastException {
         if (text instanceof PyUnicode) {
             String s = ((PyUnicode) text).encode();
             return new SimpleStringBuffer(PyBUF.SIMPLE, null, s);
         } else {
-            return text.getBuffer(PyBUF.SIMPLE);
+            return ((BufferProtocol) text).getBuffer(PyBUF.SIMPLE);
         }
     }
 
@@ -1184,7 +1179,7 @@
      * @param arg argument provided from which actual type will be reported
      * @return TypeError to throw
      */
-    private static PyException argMustBeBytes(String f, BufferProtocol arg) {
+    private static PyException argMustBeBytes(String f, PyObject arg) {
         String fmt = "%s() argument 1 must bytes or unicode, not %s";
         String type = "null";
         if (arg instanceof PyObject) {

-- 
Repository URL: https://hg.python.org/jython