[Jython-checkins] jython: Add PYTHONIOENCODING env var addressing issue #1876, and -E option to suppress.

jeff.allen jython-checkins at python.org
Sun Feb 9 21:22:03 CET 2014


http://hg.python.org/jython/rev/6e438088c0e3
changeset:   7181:6e438088c0e3
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Sun Feb 09 19:26:34 2014 +0000
summary:
  Add PYTHONIOENCODING env var addressing issue #1876, and -E option to suppress.
Also introduces registry items python.io.encoding and python.io.errors, with appropriate sequence
of priority for site, user, environment variable and command-line values.
Additions to test.test_sys (from CPython 2.7) and test.test_sys_jy for registry items.

files:
  Lib/test/test_sys.py                   |   20 ++
  Lib/test/test_sys_jy.py                |   63 ++++++-
  NEWS                                   |    4 +
  src/org/python/core/Console.java       |    8 +
  src/org/python/core/Options.java       |    1 -
  src/org/python/core/PlainConsole.java  |    9 +-
  src/org/python/core/PyFile.java        |   21 +-
  src/org/python/core/PySystemState.java |   54 ++++-
  src/org/python/core/StdoutWrapper.java |    5 +-
  src/org/python/util/jython.java        |  125 ++++++++++--
  10 files changed, 256 insertions(+), 54 deletions(-)


diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py
--- a/Lib/test/test_sys.py
+++ b/Lib/test/test_sys.py
@@ -251,6 +251,26 @@
         self.assert_(vi[3] in ("alpha", "beta", "candidate", "final"))
         self.assert_(isinstance(vi[4], int))
 
+    def test_ioencoding(self):  # from v2.7 test
+        import subprocess,os
+        env = dict(os.environ)
+
+        # Test character: cent sign, encoded as 0x4A (ASCII J) in CP424,
+        # not representable in ASCII.
+
+        env["PYTHONIOENCODING"] = "cp424"
+        p = subprocess.Popen([sys.executable, "-c", 'print unichr(0xa2)'],
+                             stdout = subprocess.PIPE, env=env)
+        out = p.stdout.read().strip()
+        self.assertEqual(out, unichr(0xa2).encode("cp424"))
+
+        env["PYTHONIOENCODING"] = "ascii:replace"
+        p = subprocess.Popen([sys.executable, "-c", 'print unichr(0xa2)'],
+                             stdout = subprocess.PIPE, env=env)
+        out = p.stdout.read().strip()
+        self.assertEqual(out, '?')
+
+
 def test_main():
     if test.test_support.is_jython:
         del SysModuleTest.test_lost_displayhook
diff --git a/Lib/test/test_sys_jy.py b/Lib/test/test_sys_jy.py
--- a/Lib/test/test_sys_jy.py
+++ b/Lib/test/test_sys_jy.py
@@ -1,3 +1,4 @@
+# -*- coding: iso-8859-1 -*-
 from __future__ import with_statement
 import os
 import re
@@ -185,13 +186,69 @@
         finally:
             os.rmdir(moduleDir)
         self.assertFalse(os.path.exists(moduleDir))        
-        
+
+class SysEncodingTest(unittest.TestCase):
+
+    # Adapted from CPython 2.7 test_sys to exercise setting Jython registry
+    # values related to encoding and error policy.
+
+    def test_ioencoding(self):  # adapted from CPython v2.7 test_sys
+        import subprocess, os
+        env = dict(os.environ)
+
+        def check(code, encoding=None, errors=None):
+            # Execute with encoding and errors optionally set via Java properties
+            command = [sys.executable]
+            if (encoding):
+                command.append('-Dpython.io.encoding={}'.format(encoding))
+            if (errors):
+                command.append('-Dpython.io.errors={}'.format(errors))
+            command.append('-c')
+            command.append('print unichr({:#x})'.format(code))
+            #print "\n   ", " ".join(command), " ... ",
+            p = subprocess.Popen(command, stdout = subprocess.PIPE, env=env)
+            return p.stdout.read().strip()
+
+        env.pop("PYTHONIOENCODING", None)
+        self.assertEqual(check(ord(u'A')), b"A")
+
+        # Test character: U+00a2 cent sign (¢) is:
+        # not representable in ASCII.
+        # xml: &#162
+        # cp1252: a2
+        # cp850: bd
+        # cp424: 4a
+        # utf-8: c2 a2
+
+        self.assertEqual(check(0xa2, "iso-8859-1"), "¢") # same as this file
+
+        # self.assertEqual(check(0xa2, "ascii"), "") # and an error message
+        self.assertEqual(check(0xa2, "ascii", "ignore"),"")
+        self.assertEqual(check(0xa2, "ascii", "replace"), "?")
+        self.assertEqual(check(0xa2, "ascii", "backslashreplace"), r"\xa2")
+        self.assertEqual(check(0xa2, "ascii", "xmlcharrefreplace"), "&#162;")
+
+        self.assertEqual(check(0xa2, "Cp1252"), "\xa2")
+        self.assertEqual(check(0xa2, "Cp424"), "\x4a")
+        self.assertEqual(check(0xa2, "utf-8"), "\xc2\xa2")
+
+        self.assertEqual(check(0xa2, "iso8859-5", "backslashreplace"), r"\xa2")
+
+        # Now check that PYTHONIOENCODING can be superseded piecemeal
+        env["PYTHONIOENCODING"] = "ascii:xmlcharrefreplace"
+        self.assertEqual(check(0xa2, "iso8859-5"), "&#162;")
+        self.assertEqual(check(0xa2, None, "backslashreplace"), r"\xa2")
+        self.assertEqual(check(0xa2, "cp850"), "\xbd")
+
 
 def test_main():
-    test_support.run_unittest(SysTest,
+    test_support.run_unittest(
+                              SysTest,
                               ShadowingTest,
                               SyspathResourceTest,
-                              SyspathUnicodeTest)
+                              SyspathUnicodeTest,
+                              SysEncodingTest,
+                             )
 
 if __name__ == "__main__":
     test_main()
diff --git a/NEWS b/NEWS
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,7 @@
     - [ 1753 ] zlib doesn't call end() on compress and decompress
     - [ 1860 ] test failures in test_array.py
     - [ 1862 ] cStringIO does not support arrays as arguments
+    - [ 1876 ] PYTHONIOENCODING unsupported, used (among others) by PyDev
     - [ 1926 ] Adjust MutableSet.pop test so we do not need to skip it
     - [ 1964 ] time.strptime() does not support %f in format
     - [ 2005 ] threading.Event object's wait([timeout]) function returns null instead of True/False.
@@ -18,6 +19,9 @@
     - [ 2075 ] Incorrect padding for hex format strings
     - [ 2082 ] Unexpected (Pdb) prompt during regression tests
     - [ 2083 ] os.unlink() can delete directories
+  New Features
+  	- Command line option -E (ignore environment variables)
+  	- Environment variable PYTHONIOENCODING, and corresponding registry items
 
 Jython 2.7b1
   Bugs Fixed
diff --git a/src/org/python/core/Console.java b/src/org/python/core/Console.java
--- a/src/org/python/core/Console.java
+++ b/src/org/python/core/Console.java
@@ -2,6 +2,7 @@
 package org.python.core;
 
 import java.io.IOException;
+import java.nio.charset.Charset;
 
 /**
  * A class named in configuration as the value of <code>python.console</code> must implement this
@@ -29,4 +30,11 @@
      */
     public void uninstall() throws UnsupportedOperationException;
 
+    /**
+     * Accessor for encoding to use for line input as a <code>Charset</code>.
+     *
+     * @return Charset of the encoding in use.
+     */
+    public Charset getEncodingCharset();
+
 }
diff --git a/src/org/python/core/Options.java b/src/org/python/core/Options.java
--- a/src/org/python/core/Options.java
+++ b/src/org/python/core/Options.java
@@ -83,7 +83,6 @@
     public static boolean dont_write_bytecode = false;
 
     /** Whether -E (ignore environment) was enabled via the command line. */
-    //XXX: place holder, not implemented yet.
     public static boolean ignore_environment = false;
 
     //XXX: place holder, not implemented yet.
diff --git a/src/org/python/core/PlainConsole.java b/src/org/python/core/PlainConsole.java
--- a/src/org/python/core/PlainConsole.java
+++ b/src/org/python/core/PlainConsole.java
@@ -37,7 +37,7 @@
      * must be one supported by the JVM. The PlainConsole does not replace <code>System.in</code> or
      * <code>System.out</code>, and does not add any line-editing capability to what is standard for
      * your OS console.
-     * 
+     *
      * @param encoding name of a supported encoding or <code>null</code> for
      *            <code>Charset.defaultCharset()</code>
      */
@@ -59,7 +59,7 @@
      * A <code>PlainConsole</code> may be uninstalled. This method assumes any sub-class may not be
      * uninstalled. Sub-classes that permit themselves to be uninstalled <b>must</b> override (and
      * not call) this method.
-     * 
+     *
      * @throws UnsupportedOperationException unless this class is exactly <code>PlainConsole</code>
      */
     @Override
@@ -71,4 +71,9 @@
         }
     }
 
+    @Override
+    public Charset getEncodingCharset() {
+        return encodingCharset;
+    }
+
 }
diff --git a/src/org/python/core/PyFile.java b/src/org/python/core/PyFile.java
--- a/src/org/python/core/PyFile.java
+++ b/src/org/python/core/PyFile.java
@@ -48,6 +48,9 @@
     @ExposedGet(doc = BuiltinDocs.file_encoding_doc)
     public String encoding;
 
+    @ExposedGet(doc = BuiltinDocs.file_errors_doc)
+    public String errors;
+
     /** Indicator dictating whether a space should be written to this
      * file on the next print statement (not currently implemented in
      * print ) */
@@ -170,6 +173,18 @@
     }
 
     /**
+     * Set the strings defining the encoding and error handling policy. Setting these strings
+     * affects behaviour of the {@link #writelines(PyObject)} when passed a {@link PyUnicode} value.
+     *
+     * @param encoding the <code>encoding</code> property of <code>file</code>.
+     * @param errors the <code>errors</code> property of <code>file</code> (or <code>null</code>).
+     */
+    void setEncoding(String encoding, String errors) {
+        this.encoding = encoding;
+        this.errors = errors;
+    }
+
+    /**
      * Wrap the given RawIOBase with a BufferedIOBase according to the
      * mode and given bufsize.
      *
@@ -446,13 +461,13 @@
      *
      * @param obj to write
      * @param message for TypeError if raised (or null for default message)
-     * @return bytes representing tha value (as a String in the Jython convention)
+     * @return bytes representing the value (as a String in the Jython convention)
      */
     private String asWritable(PyObject obj, String message) {
 
         if (obj instanceof PyUnicode) {
-            // By convention, use platform default encoding to bytes
-            return ((PyUnicode)obj).encode();
+            // Unicode must be encoded into bytes (null arguments here invoke the default values)
+            return ((PyUnicode)obj).encode(encoding, errors);
 
         } else if (obj instanceof PyString) {
             // Take a short cut
diff --git a/src/org/python/core/PySystemState.java b/src/org/python/core/PySystemState.java
--- a/src/org/python/core/PySystemState.java
+++ b/src/org/python/core/PySystemState.java
@@ -53,6 +53,8 @@
     public static final String PYTHON_CACHEDIR = "python.cachedir";
     public static final String PYTHON_CACHEDIR_SKIP = "python.cachedir.skip";
     public static final String PYTHON_CONSOLE_ENCODING = "python.console.encoding";
+    public static final String PYTHON_IO_ENCODING = "python.io.encoding";
+    public static final String PYTHON_IO_ERRORS = "python.io.errors";
     protected static final String CACHEDIR_DEFAULT_NAME = "cachedir";
 
     public static final String JYTHON_JAR = "jython.jar";
@@ -256,18 +258,25 @@
         }
     }
 
+    /**
+     * Initialise the encoding of <code>sys.stdin</code>, <code>sys.stdout</code>, and
+     * <code>sys.stderr</code>, and their error handling policy, from registry variables.
+     * Under the console app util.jython, values reflect PYTHONIOENCODING if not overridden.
+     * Note that the encoding must name a Python codec, as in <code>codecs.encode()</code>.
+     */
     private void initEncoding() {
-        String encoding = registry.getProperty(PYTHON_CONSOLE_ENCODING);
-        if (encoding == null) {
-            return;
+        // Two registry variables, counterparts to PYTHONIOENCODING = [encoding][:errors]
+        String encoding = registry.getProperty(PYTHON_IO_ENCODING);
+        String errors = registry.getProperty(PYTHON_IO_ERRORS);
+
+        if (encoding==null) {
+            // We still don't have an explicit selection for this: match the console.
+            encoding = Py.getConsole().getEncodingCharset().name();
         }
 
-        for (PyFile stdStream : new PyFile[] {(PyFile)this.stdin, (PyFile)this.stdout,
-                (PyFile)this.stderr}) {
-            if (stdStream.isatty()) {
-                stdStream.encoding = encoding;
-            }
-        }
+        ((PyFile)stdin).setEncoding(encoding, errors);
+        ((PyFile)stdout).setEncoding(encoding, errors);
+        ((PyFile)stderr).setEncoding(encoding, "backslashreplace");
     }
 
     // might be nice to have something general here, but for now these
@@ -683,6 +692,8 @@
         } catch (SecurityException e) {
             // Continue
         }
+
+        // Now the post properties (possibly set by custom JythonInitializer).
         registry.putAll(postProperties);
         if (standalone) {
             // set default standalone property (if not yet set)
@@ -690,24 +701,34 @@
                 registry.put(PYTHON_CACHEDIR_SKIP, "true");
             }
         }
+
+        /*
+         *  The console encoding is the one used by line-editing consoles to decode on the OS side and
+         *  encode on the Python side. It must be a Java codec name, so any relationship to
+         *  python.io.encoding is dubious.
+         */
         if (!registry.containsKey(PYTHON_CONSOLE_ENCODING)) {
             String encoding = getPlatformEncoding();
             if (encoding != null) {
                 registry.put(PYTHON_CONSOLE_ENCODING, encoding);
             }
         }
+
         // Set up options from registry
         Options.setFromRegistry();
     }
 
     /**
-     * @return the encoding of the underlying platform; can be <code>null</code>
+     * Return the encoding of the underlying platform, if we can work it out by any means at all.
+     *
+     * @return the encoding of the underlying platform
      */
     private static String getPlatformEncoding() {
         // first try to grab the Console encoding
         String encoding = getConsoleEncoding();
         if (encoding == null) {
             try {
+                // Not quite the console encoding (differs on Windows)
                 encoding = System.getProperty("file.encoding");
             } catch (SecurityException se) {
                 // ignore, can't do anything about it
@@ -722,7 +743,7 @@
     private static String getConsoleEncoding() {
         String encoding = null;
         try {
-            Method encodingMethod = Console.class.getDeclaredMethod("encoding");
+            Method encodingMethod = java.io.Console.class.getDeclaredMethod("encoding");
             encodingMethod.setAccessible(true); // private static method
             encoding = (String)encodingMethod.invoke(Console.class);
         } catch (Exception e) {
@@ -731,6 +752,12 @@
         return encoding;
     }
 
+    /**
+     * Merge the contents of a property file into the registry without overriding any values already
+     * set there.
+     *
+     * @param file
+     */
     private static void addRegistryFile(File file) {
         if (file.exists()) {
             if (!file.isDirectory()) {
@@ -922,9 +949,6 @@
         }
         Py.initClassExceptions(getDefaultBuiltins());
 
-        // defaultSystemState can't init its own encoding, see its constructor
-        Py.defaultSystemState.initEncoding();
-
         // Make sure that Exception classes have been loaded
         new PySyntaxError("", 1, 1, "", "");
 
@@ -1077,7 +1101,7 @@
                 Class<?> consoleClass = Class.forName(consoleName);
 
                 // Ensure it can be cast to the interface type of all consoles
-                if (! consoleType.isAssignableFrom(consoleClass)) {
+                if (!consoleType.isAssignableFrom(consoleClass)) {
                     throw new ClassCastException();
                 }
 
diff --git a/src/org/python/core/StdoutWrapper.java b/src/org/python/core/StdoutWrapper.java
--- a/src/org/python/core/StdoutWrapper.java
+++ b/src/org/python/core/StdoutWrapper.java
@@ -103,8 +103,9 @@
 
     private String printToFile(PyFile file, PyObject o) {
         String s;
-        if (o instanceof PyUnicode && file.encoding != null) {
-            s = ((PyUnicode)o).encode(file.encoding, "strict");
+        if (o instanceof PyUnicode) {
+            // Use the encoding and policy defined for the stream. (Each may be null.)
+            s = ((PyUnicode)o).encode(file.encoding, file.errors);
         } else {
             s = o.__str__().toString();
         }
diff --git a/src/org/python/util/jython.java b/src/org/python/util/jython.java
--- a/src/org/python/util/jython.java
+++ b/src/org/python/util/jython.java
@@ -55,8 +55,7 @@
             + "-c cmd   : program passed in as string (terminates option list)\n"
             // + "-d       : debug output from parser (also PYTHONDEBUG=x)\n"
             + "-Dprop=v : Set the property `prop' to value `v'\n"
-            // + "-E       : ignore environment variables (such as PYTHONPATH)\n"
-            + "-C codec : Use a different codec when reading from the console.\n"
+            + "-E       : ignore environment variables (such as JYTHONPATH)\n"
             + "-h       : print this help message and exit (also --help)\n"
             + "-i       : inspect interactively after running script\n"
             // + ", (also PYTHONINSPECT=x)\n"
@@ -83,9 +82,11 @@
             + "file     : program read from script file\n"
             + "-        : program read from stdin (default; interactive mode if a tty)\n"
             + "arg ...  : arguments passed to program in sys.argv[1:]\n" + "\n"
-            + "Other environment variables:\n" + "JYTHONPATH: '" + File.pathSeparator
+            + "Other environment variables:\n" //
+            + "JYTHONPATH: '" + File.pathSeparator
             + "'-separated list of directories prefixed to the default module\n"
-            + "            search path.  The result is sys.path.";
+            + "            search path.  The result is sys.path.\n"
+            + "PYTHONIOENCODING: Encoding[:errors] used for stdin/stdout/stderr.";
 
     public static boolean shouldRestart;
 
@@ -94,7 +95,7 @@
      * root of the JAR archive. Note that the __name__ is set to the base name of the JAR file and
      * not to "__main__" (for historic reasons). This method do NOT handle exceptions. the caller
      * SHOULD handle any (Py)Exceptions thrown by the code.
-     * 
+     *
      * @param filename The path to the filename to run.
      */
     public static void runJar(String filename) {
@@ -211,16 +212,22 @@
         // Get system properties (or empty set if we're prevented from accessing them)
         Properties preProperties = PySystemState.getBaseProperties();
 
+        // Read environment variable PYTHONIOENCODING into properties (registry)
+        String pythonIoEncoding = getenv("PYTHONIOENCODING");
+        if (pythonIoEncoding != null) {
+            String[] spec = splitString(pythonIoEncoding, ':', 2);
+            // Note that if encoding or errors is blank (=null), the registry value wins.
+            addDefault(preProperties, PySystemState.PYTHON_IO_ENCODING, spec[0]);
+            addDefault(preProperties, PySystemState.PYTHON_IO_ERRORS, spec[1]);
+        }
+
         // Decide if System.in is interactive
         if (!opts.fixInteractive || opts.interactive) {
             // The options suggest System.in is interactive: but only if isatty() agrees
             opts.interactive = Py.isInteractive();
             if (opts.interactive) {
                 // Set the default console type if nothing else has
-                String consoleClassName = preProperties.getProperty("python.console");
-                if (consoleClassName==null) {
-                    preProperties.setProperty("python.console", PYTHON_CONSOLE_CLASS);
-                }
+                addDefault(preProperties, "python.console", PYTHON_CONSOLE_CLASS);
             }
         }
 
@@ -230,7 +237,9 @@
 
         PyList warnoptions = new PyList();
         addWarnings(opts.warnoptions, warnoptions);
-        addWarnings(warnOptionsFromEnv(), warnoptions);
+        if (!Options.ignore_environment) {
+            addWarnings(warnOptionsFromEnv(), warnoptions);
+        }
         systemState.setWarnoptions(warnoptions);
 
         // Make sure warnings module is loaded if there are warning options
@@ -378,24 +387,18 @@
         }
 
         if (opts.fixInteractive || (opts.filename == null && opts.command == null)) {
-            if (opts.encoding == null) {
-                opts.encoding = PySystemState.registry.getProperty("python.console.encoding");
-            }
-            if (opts.encoding != null) {
-                if (!Charset.isSupported(opts.encoding)) {
-                    System.err.println(opts.encoding
-                            + " is not a supported encoding on this JVM, so it can't "
-                            + "be used in python.console.encoding.");
-                    System.exit(1);
-                }
-                interp.cflags.encoding = opts.encoding;
-            }
+            // Go interactive with the console: the parser needs to know the encoding.
+            String encoding = Py.getConsole().getEncodingCharset().name();
+
+            // Run the interpreter interactively
             try {
+                interp.cflags.encoding = encoding;
                 interp.interact(null, null);
             } catch (Throwable t) {
                 Py.printException(t);
             }
         }
+
         interp.cleanup();
     }
 
@@ -414,9 +417,79 @@
             // continue
         }
     }
+
+    /**
+     * Return an array of trimmed strings by splitting the argument at each occurrence of a
+     * separator character. (Helper for configuration variable processing.) Segments of zero length
+     * after trimming emerge as <code>null</code>. If there are more than the specified number of
+     * segments the last element of the array contains all of the source string after the
+     * <code>(n-1)</code>th occurrence of <code>sep</code>.
+     *
+     * @param spec to split
+     * @param sep character on which to split
+     * @param n number of parts to split into
+     * @return <code>n</code>-element array of strings (or <code>null</code>s)
+     */
+    private static String[] splitString(String spec, char sep, int n) {
+        String[] list = new String[n];
+        int p = 0, i = 0, L = spec.length();
+        while (p < L) {
+            int c = spec.indexOf(sep, p);
+            if (c < 0 || i >= n - 1) {
+                // No more seps, or no more space: i.th piece is the rest of spec.
+                c = L;
+            }
+            String s = spec.substring(p, c).trim();
+            list[i++] = (s.length() > 0) ? s : null;
+            p = c + 1;
+        }
+        return list;
+    }
+
+    /**
+     * If the key is not currently present and the passed value is not <code>null</code>, sets the
+     * <code>key</code> to the <code>value</code> in the given <code>Properties</code> object. Thus,
+     * it provides a default value for a subsequent <code>getProperty()</code>.
+     *
+     * @param registry to be (possibly) updated
+     * @param key at which to set value
+     * @param value to set (or <code>null</code> for no setting)
+     * @return true iff a value was set
+     */
+    private static boolean addDefault(Properties registry, String key, String value) {
+        // Set value at key if nothing else has set it
+        if (value == null || registry.containsKey(key)) {
+            return false;
+        } else {
+            registry.setProperty(key, value);
+            return true;
+        }
+    }
+
+    /**
+     * Get the value of an environment variable, if we are allowed to and it exists; otherwise
+     * return <code>null</code>. We are allowed to access the environment variable if the -E flag
+     * was not given and the application has permission to read environment variables. The -E flag
+     * is reflected in {@link Options#ignore_environment}, and will be set automatically if it turns
+     * out we do not have permission.
+     *
+     * @param varname name to access in the environment
+     * @return the value or <code>null</code>.
+     */
+    private static String getenv(String varname) {
+        if (!Options.ignore_environment) {
+            try {
+                return System.getenv(varname);
+            } catch (SecurityException e) {
+                // We're not allowed to access them after all
+                Options.ignore_environment = true;
+            }
+        }
+        return null;
+    }
+
 }
 
-
 class CommandLineOptions {
 
     public String filename;
@@ -515,12 +588,8 @@
                 } else {
                     return argumentExpected(arg);
                 }
-            } else if (arg.equals("-C")) {
-                encoding = args[++index];
-                setProperty("python.console.encoding", encoding);
             } else if (arg.equals("-E")) {
-                // XXX: accept -E (ignore environment variables) to be compatible with
-                // CPython. do nothing for now (we could ignore the registry)
+                // -E (ignore environment variables)
                 Options.ignore_environment = true;
             } else if (arg.startsWith("-D")) {
                 String key = null;

-- 
Repository URL: http://hg.python.org/jython


More information about the Jython-checkins mailing list