[Jython-checkins] jython: Improvements to PyString.__format__ and in StringFormatter related to text.

Sun Jun 8 14:13:05 CEST 2014

http://hg.python.org/jython/rev/324e1138e1f3
changeset:   7284:324e1138e1f3
user:        Jeff Allen <ja.py at farowl.co.uk>
date:        Tue Jun 03 21:30:16 2014 +0100
summary:
  Improvements to PyString.__format__ and in StringFormatter related to text.
Brings %s, %r, %c, %% into the new framework. Quite some rationalisation of
StringFormatter made possible. Small improvement in test conformity.

files:
  Lib/test/test_format_jy.py                          |   57 +-
  Lib/test/test_unicode.py                            |    7 +-
  src/org/python/core/PyInteger.java                  |    2 +-
  src/org/python/core/PyString.java                   |  482 ++++-----
  src/org/python/core/stringlib/IntegerFormatter.java |    1 +
  src/org/python/core/stringlib/InternalFormat.java   |  132 +-
  src/org/python/core/stringlib/TextFormatter.java    |  105 ++
  tests/java/org/python/core/StringFormatTest.java    |   17 +-
  8 files changed, 480 insertions(+), 323 deletions(-)

diff --git a/Lib/test/test_format_jy.py b/Lib/test/test_format_jy.py
--- a/Lib/test/test_format_jy.py
+++ b/Lib/test/test_format_jy.py
@@ -5,8 +5,9 @@
 from test import test_support
 import unittest
 
-class FormatTestCase(unittest.TestCase):
-    # Tests that %d converts values for custom classes implementing __int__
+class FormatSubclass(unittest.TestCase):
+    # Custom __int__ and __float__ should be respected by %-formatting
+
     def test_int_conversion_support(self):
         class Foo(object):
             def __init__(self, x): self.x = x
@@ -21,9 +22,59 @@
             def __float__(self): return self. x
         self.assertEqual('1.0', '%.1f' % Foo(1.0))
 
+class FormatUnicodeBase(unittest.TestCase):
+
+    # Test padding non-BMP result
+    def test_pad_string(self):
+        self.padcheck(u"architect")
+        self.padcheck(u'a\U00010001cde')
+
+class FormatUnicodeClassic(FormatUnicodeBase):
+    # Check using %-formatting
+
+    def padcheck(self, s):
+        self.assertEqual(10, len('%10.4s' % s))
+        self.assertEqual(u' '*6 + s[0:4], '%10.4s' % s)
+        self.assertEqual(u' '*6 + s[0:4], '% 10.4s' % s)
+        self.assertEqual(u' '*6 + s[0:4], '%010.4s' % s)
+        self.assertEqual(s[0:3] + u' '*5, '%-8.3s' % s)
+
+class FormatUnicodeModern(FormatUnicodeBase):
+    # Check using __format__
+
+    def padcheck(self, s):
+        self.assertEqual(10, len(format(s, '10.4s')))
+        self.assertEqual(s[0:3] + u' '*7, format(s, '10.3s'))
+        self.assertEqual(s[0:3] + u'~'*7, format(s, '~<10.3s'))
+        self.assertEqual(s[0:3] + u'~'*7, format(s, '~<10.3'))
+        self.assertEqual(u' '*6 + s[0:4], format(s, '>10.4s'))
+        self.assertEqual(u'*'*6 + s[0:4], format(s, '*>10.4s'))
+        self.assertEqual(u'*'*6 + s[0:4], format(s, '*>10.4'))
+
+
+class FormatMisc(unittest.TestCase):
+    # Odd tests Jython used to fail
+
+    def test_percent_padded(self) :
+        self.assertEqual('%hello', '%%%s' % 'hello')
+        self.assertEqual(u'     %hello', '%6%%s' % u'hello')
+        self.assertEqual(u'%     hello', u'%-6%%s' % 'hello')
+
+        self.assertEqual('     %', '%6%' % ())
+        self.assertEqual('     %', '%06%' % ())
+        self.assertEqual('   %', '%*%' % 4)
+        self.assertEqual('%     ', '%-6%' % ())
+        self.assertEqual('%     ', '%-06%' % ())
+        self.assertEqual('%   ', '%*%' % -4)
+
 
 def test_main():
-    test_support.run_unittest(FormatTestCase)
+    test_support.run_unittest(
+            FormatSubclass,
+            FormatUnicodeClassic,
+            FormatUnicodeModern,
+            FormatMisc,
+    )
 
 if __name__ == '__main__':
     test_main()
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -357,13 +357,12 @@
         self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
         self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
         self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
-        if not sys.platform.startswith('java'):
-            self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
+        self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
         self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
         self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')
 
-        # self.assertEqual(u'%c' % 0x1234, u'\u1234')
-        # self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
+        self.assertEqual(u'%c' % 0x1234, u'\u1234')
+        self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
 
         # formatting jobs delegated from the string implementation:
         self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
diff --git a/src/org/python/core/PyInteger.java b/src/org/python/core/PyInteger.java
--- a/src/org/python/core/PyInteger.java
+++ b/src/org/python/core/PyInteger.java
@@ -1074,7 +1074,7 @@
             case 'c':
                 // Character data: specific prohibitions.
                 if (Spec.specified(spec.sign)) {
-                    throw IntegerFormatter.notAllowed("Sign", "integer", spec.type);
+                    throw IntegerFormatter.signNotAllowed("integer", spec.type);
                 } else if (spec.alternate) {
                     throw IntegerFormatter.alternateFormNotAllowed("integer", spec.type);
                 }
diff --git a/src/org/python/core/PyString.java b/src/org/python/core/PyString.java
--- a/src/org/python/core/PyString.java
+++ b/src/org/python/core/PyString.java
@@ -10,10 +10,11 @@
 import org.python.core.stringlib.FieldNameIterator;
 import org.python.core.stringlib.FloatFormatter;
 import org.python.core.stringlib.IntegerFormatter;
+import org.python.core.stringlib.InternalFormat;
+import org.python.core.stringlib.InternalFormat.Formatter;
 import org.python.core.stringlib.InternalFormat.Spec;
-import org.python.core.stringlib.InternalFormatSpec;
-import org.python.core.stringlib.InternalFormatSpecParser;
 import org.python.core.stringlib.MarkupIterator;
+import org.python.core.stringlib.TextFormatter;
 import org.python.core.util.StringUtil;
 import org.python.expose.ExposedMethod;
 import org.python.expose.ExposedNew;
@@ -3898,50 +3899,68 @@
 
     @ExposedMethod(doc = BuiltinDocs.str___format___doc)
     final PyObject str___format__(PyObject formatSpec) {
-        if (!(formatSpec instanceof PyString)) {
-            throw Py.TypeError("__format__ requires str or unicode");
+
+        // Parse the specification
+        Spec spec = InternalFormat.fromText(formatSpec, "__format__");
+
+        // Get a formatter for the specification
+        TextFormatter f = prepareFormatter(spec);
+        if (f == null) {
+            // The type code was not recognised
+            throw Formatter.unknownFormat(spec.type, "string");
         }
 
-        PyString formatSpecStr = (PyString)formatSpec;
-        String result;
-        try {
-            String specString = formatSpecStr.getString();
-            InternalFormatSpec spec = new InternalFormatSpecParser(specString).parse();
-            result = formatString(getString(), spec);
-        } catch (IllegalArgumentException e) {
-            throw Py.ValueError(e.getMessage());
+        // Bytes mode if neither this nor formatSpec argument is Unicode.
+        boolean unicode = this instanceof PyUnicode || formatSpec instanceof PyUnicode;
+        f.setBytes(!unicode);
+
+        // Convert as per specification.
+        f.format(getString());
+
+        // Return a result that has the same type (str or unicode) as the formatSpec argument.
+        return f.pad().getPyResult();
+    }
+
+    /**
+     * Common code for {@link PyString} and {@link PyUnicode} to prepare a {@link TextFormatter}
+     * from a parsed specification. The object returned has format method
+     * {@link TextFormatter#format(String)} that treats its argument as UTF-16 encoded unicode (not
+     * just <code>char</code>s). That method will format its argument ( <code>str</code> or
+     * <code>unicode</code>) according to the PEP 3101 formatting specification supplied here. This
+     * would be used during <code>text.__format__(".5s")</code> or
+     * <code>"{:.5s}".format(text)</code> where <code>text</code> is this Python string.
+     *
+     * @param spec a parsed PEP-3101 format specification.
+     * @return a formatter ready to use, or null if the type is not a string format type.
+     * @throws PyException(ValueError) if the specification is faulty.
+     */
+    @SuppressWarnings("fallthrough")
+    static TextFormatter prepareFormatter(Spec spec) throws PyException {
+        // Slight differences between format types
+        switch (spec.type) {
+
+            case Spec.NONE:
+            case 's':
+                // Check for disallowed parts of the specification
+                if (spec.grouping) {
+                    throw Formatter.notAllowed("Grouping", "string", spec.type);
+                } else if (Spec.specified(spec.sign)) {
+                    throw Formatter.signNotAllowed("string", '\0');
+                } else if (spec.alternate) {
+                    throw Formatter.alternateFormNotAllowed("string");
+                } else if (spec.align == '=') {
+                    throw Formatter.alignmentNotAllowed('=', "string");
+                }
+                // spec may be incomplete. The defaults are those commonly used for string formats.
+                spec = spec.withDefaults(Spec.STRING);
+                // Get a formatter for the specification
+                return new TextFormatter(spec);
+
+            default:
+                // The type code was not recognised
+                return null;
         }
-        return formatSpecStr.createInstance(result);
-    }
-
-    /**
-     * Format the given text according to a parsed PEP 3101 formatting specification, as during
-     * <code>text.__format__(format_spec)</code> or <code>"{:s}".format(text)</code> where
-     * <code>text</code> is a Python string.
-     *
-     * @param text to format
-     * @param spec the parsed PEP 3101 formatting specification
-     * @return the result of the formatting
-     */
-    public static String formatString(String text, InternalFormatSpec spec) {
-        if (spec.sign != '\0') {
-            throw new IllegalArgumentException("Sign not allowed in string format specifier");
-        }
-        if (spec.alternate) {
-            throw new IllegalArgumentException(
-                    "Alternate form (#) not allowed in string format specifier");
-        }
-        if (spec.align == '=') {
-            throw new IllegalArgumentException(
-                    "'=' alignment not allowed in string format specifier");
-        }
-        if (spec.precision >= 0 && text.length() > spec.precision) {
-            text = text.substring(0, spec.precision);
-        }
-        return spec.pad(text, '<', 0);
-    }
-
-    /* arguments' conversion helper */
+    }
 
     @Override
     public String asString(int index) throws PyObject.ConversionException {
@@ -4006,10 +4025,6 @@
     String format;
     /** Where the output is built. */
     StringBuilder buffer;
-    /** Remembers that the value currently converted is negative */
-    boolean negative;
-    /** Precision from format specification. */
-    int precision;
     /**
      * Index into args of argument currently being worked, or special values indicating -1: a single
      * item that has not yet been used, -2: a single item that has already been used, -3: a mapping.
@@ -4018,7 +4033,7 @@
     /** Arguments supplied to {@link #format(PyObject)} method. */
     PyObject args;
     /** Indicate a <code>PyUnicode</code> result is expected. */
-    boolean unicodeCoercion;
+    boolean needUnicode;
 
     final char pop() {
         try {
@@ -4054,7 +4069,7 @@
     public StringFormatter(String format, boolean unicodeCoercion) {
         index = 0;
         this.format = format;
-        this.unicodeCoercion = unicodeCoercion;
+        this.needUnicode = unicodeCoercion;
         buffer = new StringBuilder(format.length() + 100);
     }
 
@@ -4155,9 +4170,9 @@
     }
 
     /**
-     * Return the argument as either a {@link PyFloat} according to its <code>__float__</code>
-     * method. If the argument has no such method, or it raises an exception, we return the argument
-     * itself. The caller must check the return type.
+     * Return the argument as a {@link PyFloat} according to its <code>__float__</code> method. If
+     * the argument has no such method, or it raises an exception, we return the argument itself.
+     * The caller must check the return type.
      *
      * @param arg to convert
      * @return PyFloat if possible
@@ -4171,7 +4186,7 @@
         } else {
             // use __float__ to get a float.
             if (arg.getClass() == PyFloat.class) {
-                // A common case where it is safe to return arg.__int__()
+                // A common case where it is safe to return arg.__float__()
                 return arg.__float__();
 
             } else {
@@ -4194,6 +4209,46 @@
     }
 
     /**
+     * Return the argument as either a {@link PyString} or a {@link PyUnicode}, and set the
+     * {@link #needUnicode} member accordingly. If we already know we are building a Unicode string
+     * (<code>needUnicode==true</code>), then any argument that is not already a
+     * <code>PyUnicode</code> will be converted by calling its <code>__unicode__</code> method.
+     * Conversely, if we are not yet building a Unicode string (<code>needUnicode==false</code> ),
+     * then a PyString will pass unchanged, a <code>PyUnicode</code> will switch us to Unicode mode
+     * (<code>needUnicode=true</code>), and any other type will be converted by calling its
+     * <code>__str__</code> method, which will return a <code>PyString</code>, or possibly a
+     * <code>PyUnicode</code>, which will switch us to Unicode mode.
+     *
+     * @param arg to convert
+     * @return PyString or PyUnicode equivalent
+     */
+    private PyString asText(PyObject arg) {
+
+        if (arg instanceof PyUnicode) {
+            // arg is already acceptable.
+            needUnicode = true;
+            return (PyUnicode)arg;
+
+        } else if (needUnicode) {
+            // The string being built is unicode, so we need that version of the arg.
+            return arg.__unicode__();
+
+        } else if (arg instanceof PyString) {
+            // The string being built is not unicode, so arg is already acceptable.
+            return (PyString)arg;
+
+        } else {
+            // The string being built is not unicode, so use __str__ to get a PyString.
+            PyString s = arg.__str__();
+            // But __str__ might return PyUnicode, and we have to notice that.
+            if (s instanceof PyUnicode) {
+                needUnicode = true;
+            }
+            return s;
+        }
+    }
+
+    /**
      * Main service of this class: format one or more arguments with the format string supplied at
      * construction.
      *
@@ -4204,7 +4259,7 @@
     public PyString format(PyObject args) {
         PyObject dict = null;
         this.args = args;
-        boolean needUnicode = unicodeCoercion;
+
         if (args instanceof PyTuple) {
             // We will simply work through the tuple elements
             argIndex = 0;
@@ -4220,16 +4275,6 @@
 
         while (index < format.length()) {
 
-            // Attributes to be parsed from the next format specifier
-            boolean ljustFlag = false;
-            boolean signFlag = false;
-            boolean blankFlag = false;
-            boolean altFlag = false;
-            boolean zeroFlag = false;
-
-            int width = -1;
-            precision = -1;
-
             // Read one character from the format string
             char c = pop();
             if (c != '%') {
@@ -4239,6 +4284,14 @@
 
             // It's a %, so the beginning of a conversion specifier. Parse it.
 
+            // Attributes to be parsed from the next format specifier
+            boolean altFlag = false;
+            char sign = Spec.NONE;
+            char fill = ' ';
+            char align = '>';
+            int width = Spec.UNSPECIFIED;
+            int precision = Spec.UNSPECIFIED;
+
             // A conversion specifier contains the following components, in this order:
             // + The '%' character, which marks the start of the specifier.
             // + Mapping key (optional), consisting of a parenthesised sequence of characters.
@@ -4278,19 +4331,22 @@
             while (true) {
                 switch (c = pop()) {
                     case '-':
-                        ljustFlag = true;
+                        align = '<';
                         continue;
                     case '+':
-                        signFlag = true;
+                        sign = '+';
                         continue;
                     case ' ':
-                        blankFlag = true;
+                        if (!Spec.specified(sign)) {
+                            // Blank sign only wins if '+' not specified.
+                            sign = ' ';
+                        }
                         continue;
                     case '#':
                         altFlag = true;
                         continue;
                     case '0':
-                        zeroFlag = true;
+                        fill = '0';
                         continue;
                 }
                 break;
@@ -4307,7 +4363,7 @@
             width = getNumber();
             if (width < 0) {
                 width = -width;
-                ljustFlag = true;
+                align = '<';
             }
 
             /*
@@ -4330,103 +4386,105 @@
                 c = pop();
             }
 
-            // c is now the conversion type.
-            if (c == '%') {
-                // It was just a percent sign after all
-                buffer.append(c);
-                continue;
+            /*
+             * As a function of the conversion type (currently in c) override some of the formatting
+             * flags we read from the format specification.
+             */
+            switch (c) {
+                case 's':
+                case 'r':
+                case 'c':
+                case '%':
+                    // These have string-like results: fill, if needed, is always blank.
+                    fill = ' ';
+                    break;
+
+                default:
+                    if (fill == '0' && align == '>') {
+                        // Zero-fill comes after the sign in right-justification.
+                        align = '=';
+                    } else {
+                        // If left-justifying, the fill is always blank.
+                        fill = ' ';
+                    }
             }
 
             /*
+             * Encode as an InternalFormat.Spec. The values in the constructor always have specified
+             * values, except for sign, width and precision.
+             */
+            Spec spec = new Spec(fill, align, sign, altFlag, width, false, precision, c);
+
+            /*
              * Process argument according to format specification decoded from the string. It is
-             * important we don't read the argumnent from the list until this point because of the
+             * important we don't read the argument from the list until this point because of the
              * possibility that width and precision were specified via the argument list.
              */
-            PyObject arg = getarg();
-            String string = null;
-            negative = false;
-
-            // Independent of type, decide the padding character based on decoded flags.
-            char fill = ' ';
-            if (zeroFlag) {
-                fill = '0';
-            } else {
-                fill = ' ';
-            }
-
-            // Encode as an InternalFormat.Spec
-            char fill2 = ' ';
-            char align = ljustFlag ? '<' : '>';
-            if (zeroFlag && !ljustFlag) {
-                // We only actually fill with zero if right-justifying
-                fill2 = '0';
-                // And then the fill comes after the sign.
-                align = '=';
-            }
-            char sign = signFlag ? '+' : (blankFlag ? ' ' : Spec.NONE);
-            int w = width;
-            Spec spec = new Spec(fill2, align, sign, altFlag, w, false, precision, c);
-
-            // Signal that the padding, sign, base prefix etc. have all been taken care of
-            boolean jobDone = false;
-
-            // Perform the type-specific formatting
-            switch (c) {
-
-                case 's':
-                    // String (converts any Python object using str()).
-                    if (arg instanceof PyUnicode) {
-                        needUnicode = true;
-                    }
-                    // fall through ...
-
-                case 'r':
-                    // String (converts any Python object using repr()).
-                    fill = ' ';
-                    if (c == 's') {
-                        if (needUnicode) {
-                            string = arg.__unicode__().toString();
-                        } else {
-                            string = arg.__str__().toString();
-                        }
-                    } else {
-                        string = arg.__repr__().toString();
-                    }
-                    if (precision >= 0 && string.length() > precision) {
-                        string = string.substring(0, precision);
-                    }
-
+
+            // Depending on the type of conversion, we use one of these formatters:
+            FloatFormatter ff;
+            IntegerFormatter fi;
+            TextFormatter ft;
+            Formatter f; // = ff, fi or ft, whichever we actually use.
+
+            switch (spec.type) {
+
+                case 's': // String: converts any object using __str__(), __unicode__() ...
+                case 'r': // ... or repr().
+                    PyObject arg = getarg();
+
+                    // Get hold of the actual object to display (may set needUnicode)
+                    PyString argAsString = asText(spec.type == 's' ? arg : arg.__repr__());
+                    // Format the str/unicode form of the argument using this Spec.
+                    f = ft = new TextFormatter(spec);
+                    ft.setBytes(!needUnicode);
+                    ft.format(argAsString.getString());
                     break;
 
                 case 'd': // All integer formats (+case for X).
                 case 'o':
                 case 'x':
                 case 'X':
+                case 'c': // Single character (accepts integer or single character string).
                 case 'u': // Obsolete type identical to 'd'.
                 case 'i': // Compatibility with scanf().
 
-                    // Format using this Spec the double form of the argument.
-                    IntegerFormatter fi = new IntegerFormatter.Traditional(spec);
-
-                    // Note various types accepted here as long as they have an __int__ method.
-                    PyObject argAsNumber = asNumber(arg);
-
-                    // We have to check what we got back..
-                    if (argAsNumber instanceof PyInteger) {
-                        fi.format(((PyInteger)argAsNumber).getValue());
-                    } else if (argAsNumber instanceof PyLong) {
-                        fi.format(((PyLong)argAsNumber).getValue());
+                    // Format the argument using this Spec.
+                    f = fi = new IntegerFormatter.Traditional(spec);
+                    // If not producing PyUnicode, disallow codes >255.
+                    fi.setBytes(!needUnicode);
+
+                    arg = getarg();
+
+                    if (arg instanceof PyString && spec.type == 'c') {
+                        if (arg.__len__() != 1) {
+                            throw Py.TypeError("%c requires int or char");
+                        } else {
+                            if (!needUnicode && arg instanceof PyUnicode) {
+                                // Change of mind forced by encountering unicode object.
+                                needUnicode = true;
+                                fi.setBytes(false);
+                            }
+                            fi.format(((PyString)arg).getString().codePointAt(0));
+                        }
+
                     } else {
-                        // It couldn't be converted, raise the error here
-                        throw Py.TypeError("%" + c + " format: a number is required, not "
-                                + arg.getType().fastGetName());
+                        // Note various types accepted here as long as they have an __int__ method.
+                        PyObject argAsNumber = asNumber(arg);
+
+                        // We have to check what we got back.
+                        if (argAsNumber instanceof PyInteger) {
+                            fi.format(((PyInteger)argAsNumber).getValue());
+                        } else if (argAsNumber instanceof PyLong) {
+                            fi.format(((PyLong)argAsNumber).getValue());
+                        } else {
+                            // It couldn't be converted, raise the error here
+                            throw Py.TypeError("%" + spec.type
+                                    + " format: a number is required, not "
+                                    + arg.getType().fastGetName());
+                        }
                     }
 
-                    fi.pad();
-                    string = fi.getResult();
-
-                    // Suppress subsequent attempts to insert a correct sign, done already.
-                    jobDone = true;
                     break;
 
                 case 'e': // All floating point formats (+case).
@@ -4437,9 +4495,11 @@
                 case 'G':
 
                     // Format using this Spec the double form of the argument.
-                    FloatFormatter ff = new FloatFormatter(spec);
+                    f = ff = new FloatFormatter(spec);
+                    ff.setBytes(!needUnicode);
 
                     // Note various types accepted here as long as they have a __float__ method.
+                    arg = getarg();
                     PyObject argAsFloat = asFloat(arg);
 
                     // We have to check what we got back..
@@ -4451,128 +4511,24 @@
                                 + arg.getType().fastGetName());
                     }
 
-                    ff.pad();
-                    string = ff.getResult();
-
-                    // Suppress subsequent attempts to insert a correct sign, done already.
-                    // signFlag = blankFlag = negative = false;
-                    jobDone = true;
                     break;
 
-                case 'c':
-                    // Single character (accepts integer or single character string).
-                    fill = ' ';
-                    if (arg instanceof PyString) {
-                        string = ((PyString)arg).toString();
-                        if (string.length() != 1) {
-                            throw Py.TypeError("%c requires int or char");
-                        }
-                        if (arg instanceof PyUnicode) {
-                            needUnicode = true;
-                        }
-                        break;
-                    }
-
-                    // arg is not a str (or unicode)
-                    int val;
-                    try {
-                        // Explicitly __int__ so we can look for an AttributeError (which is
-                        // less invasive to mask than a TypeError)
-                        val = arg.__int__().asInt();
-                    } catch (PyException e) {
-                        if (e.match(Py.AttributeError)) {
-                            throw Py.TypeError("%c requires int or char");
-                        }
-                        throw e;
-                    }
-                    // Range check, according to ultimate type of result as presentl;y known.
-                    if (!needUnicode) {
-                        if (val < 0) {
-                            throw Py.OverflowError("unsigned byte integer is less than minimum");
-                        } else if (val > 255) {
-                            throw Py.OverflowError("unsigned byte integer is greater than maximum");
-                        }
-                    } else if (val < 0 || val > PySystemState.maxunicode) {
-                        throw Py.OverflowError("%c arg not in range(0x110000) (wide Python build)");
-                    }
-                    string = new String(new int[] {val}, 0, 1);
+                case '%': // Percent symbol, but surprisingly, padded.
+
+                    // We use an integer formatter.
+                    f = fi = new IntegerFormatter.Traditional(spec);
+                    fi.setBytes(!needUnicode);
+                    fi.format('%');
                     break;
 
                 default:
                     throw Py.ValueError("unsupported format character '"
-                            + codecs.encode(Py.newString(c), null, "replace") + "' (0x"
-                            + Integer.toHexString(c) + ") at index " + (index - 1));
+                            + codecs.encode(Py.newString(spec.type), null, "replace") + "' (0x"
+                            + Integer.toHexString(spec.type) + ") at index " + (index - 1));
             }
 
-            /*
-             * We have now dealt with the translation of the (absolute value of the) argument, in
-             * variable string[]. In the next sections we deal with sign, padding and base prefix.
-             */
-            if (jobDone) {
-                // Type-specific formatting has already taken care of all this.
-                buffer.append(string);
-
-            } else {
-                // Legacy code still needed
-                int length = string.length();
-                int skip = 0;
-
-                // Decide how to represent the sign according to format and actual sign of argument.
-                String signString = null;
-                if (negative) {
-                    signString = "-";
-                } else {
-                    if (signFlag) {
-                        signString = "+";
-                    } else if (blankFlag) {
-                        signString = " ";
-                    }
-                }
-
-                // The width (from here on) will be the remaining width on the line.
-                if (width < length) {
-                    width = length;
-                }
-
-                // Insert the sign in the buffer and adjust the width.
-                if (signString != null) {
-                    if (fill != ' ') {
-                        // When the fill is not space, the sign comes before the fill.
-                        buffer.append(signString);
-                    }
-                    // Adjust width for sign.
-                    if (width > length) {
-                        width--;
-                    }
-                }
-
-                // Fill on the left of the item.
-                if (width > length && !ljustFlag) {
-                    do {
-                        buffer.append(fill);
-                    } while (--width > length);
-                }
-
-                // If the fill is spaces, we will have deferred the sign and hex base prefix
-                if (fill == ' ') {
-                    if (signString != null) {
-                        buffer.append(signString);
-                    }
-                }
-
-                // Now append the converted argument.
-                if (skip > 0) {
-                    // The string contains a hex-prefix, but we have already inserted one.
-                    buffer.append(string.substring(skip));
-                } else {
-                    buffer.append(string);
-                }
-
-                // If this hasn't filled the space required, add right-padding.
-                while (--width >= length) {
-                    buffer.append(' ');
-                }
-            }
+            // Pad the result as required in the format and append to the overall result.
+            buffer.append(f.pad().getResult());
         }
 
         /*
diff --git a/src/org/python/core/stringlib/IntegerFormatter.java b/src/org/python/core/stringlib/IntegerFormatter.java
--- a/src/org/python/core/stringlib/IntegerFormatter.java
+++ b/src/org/python/core/stringlib/IntegerFormatter.java
@@ -294,6 +294,7 @@
                     break;
 
                 case 'c':
+                case '%':
                     // Binary.
                     format_c(value);
                     break;
diff --git a/src/org/python/core/stringlib/InternalFormat.java b/src/org/python/core/stringlib/InternalFormat.java
--- a/src/org/python/core/stringlib/InternalFormat.java
+++ b/src/org/python/core/stringlib/InternalFormat.java
@@ -334,63 +334,75 @@
          * modes, the padding is around the whole buffer.) When this would not be appropriate, it is
          * up to the client to disallow this (which <code>complex</code> does).
          *
-         * @return this object
+         * @return this Formatter object
          */
         public Formatter pad() {
-
             // We'll need this many pad characters (if>0). Note Spec.UNDEFINED<0.
             int n = spec.width - result.length();
             if (n > 0) {
+                // Note: use of leftIndex anticipates client-owned result buffer.
+                pad(0, n);
+            }
+            return this;
+        }
 
-                char align = spec.getAlign('>'); // Right for numbers (wrong for strings)
-                char fill = spec.getFill(' ');
+        /**
+         * Pad the last result (defined as the contents of {@link #result} from argument
+         * <code>leftIndex</code> to the end) using the alignment, by <code>n</code> repetitions of
+         * the fill character defined in {@link #spec}, and distributed according to
+         * <code>spec.align</code>. The value of <code>leftIndex</code> is only used if the
+         * alignment is '>' (right) or '^' (centre). The values of the critical lengths (lenWhole,
+         * lenSign, etc.) are not affected, because we assume that <code>leftIndex <= </code>
+         * {@link #start}.
+         *
+         * @param leftIndex the index in result at which to insert left-fill characters.
+         * @param n number of fill characters to insert.
+         */
+        protected void pad(int leftIndex, int n) {
+            char align = spec.getAlign('>'); // Right for numbers (strings will supply '<' align)
+            char fill = spec.getFill(' ');
 
-                // Start by assuming padding is all leading ('>' case or '=')
-                int leading = n;
+            // Start by assuming padding is all leading ('>' case or '=')
+            int leading = n;
 
-                // Split the total padding according to the alignment
-                if (align == '^') {
-                    // Half the padding before
-                    leading = n / 2;
-                } else if (align == '<') {
-                    // All the padding after
-                    leading = 0;
+            // Split the total padding according to the alignment
+            if (align == '^') {
+                // Half the padding before
+                leading = n / 2;
+            } else if (align == '<') {
+                // All the padding after
+                leading = 0;
+            }
+
+            // All padding that is not leading is trailing
+            int trailing = n - leading;
+
+            // Insert the leading space
+            if (leading > 0) {
+                if (align == '=') {
+                    // Incorporate into the (latest) whole part
+                    leftIndex = start + lenSign;
+                    lenWhole += leading;
+                } else {
+                    // Default is to insert at the stated leftIndex <= start.
+                    start += leading;
                 }
-
-                // All padding that is not leading is trailing
-                int trailing = n - leading;
-
-                // Insert the leading space
-                if (leading > 0) {
-                    int pos;
-                    if (align == '=') {
-                        // Incorporate into the (latest) whole part
-                        pos = start + lenSign;
-                        lenWhole += leading;
-                    } else {
-                        // Insert at the very beginning (not start) by default.
-                        pos = 0;
-                        start += leading;
-                    }
-                    makeSpaceAt(pos, leading);
-                    for (int i = 0; i < leading; i++) {
-                        result.setCharAt(pos + i, fill);
-                    }
-                }
-
-                // Append the trailing space
-                for (int i = 0; i < trailing; i++) {
-                    result.append(fill);
-                }
-
-                // Check for special case
-                if (align == '=' && fill == '0' && spec.grouping) {
-                    // We must extend the grouping separator into the padding
-                    zeroPadAfterSignWithGroupingFixup(3, ',');
+                makeSpaceAt(leftIndex, leading);
+                for (int i = 0; i < leading; i++) {
+                    result.setCharAt(leftIndex + i, fill);
                 }
             }
 
-            return this;
+            // Append the trailing space
+            for (int i = 0; i < trailing; i++) {
+                result.append(fill);
+            }
+
+            // Check for special case
+            if (align == '=' && fill == '0' && spec.grouping) {
+                // We must extend the grouping separator into the padding
+                zeroPadAfterSignWithGroupingFixup(3, ',');
+            }
         }
 
         /**
@@ -512,6 +524,18 @@
 
         /**
          * Convenience method returning a {@link Py#ValueError} reporting that specifying a
+         * sign is not allowed in a format specifier for the named type.
+         *
+         * @param forType the type it was found applied to
+         * @param code the formatting code (or '\0' not to mention one)
+         * @return exception to throw
+         */
+        public static PyException signNotAllowed(String forType, char code) {
+            return notAllowed("Sign", forType, code);
+        }
+
+        /**
+         * Convenience method returning a {@link Py#ValueError} reporting that specifying a
          * precision is not allowed in a format specifier for the named type.
          *
          * @param forType the type it was found applied to
@@ -534,6 +558,18 @@
 
         /**
          * Convenience method returning a {@link Py#ValueError} reporting that some format specifier
+         * feature is not allowed for the named data type.
+         *
+         * @param outrage committed in the present case
+         * @param forType the data type (e.g. "integer") for which it is an outrage
+         * @return exception to throw
+         */
+        public static PyException notAllowed(String outrage, String forType) {
+            return notAllowed(outrage, forType, '\0');
+        }
+
+        /**
+         * Convenience method returning a {@link Py#ValueError} reporting that some format specifier
          * feature is not allowed for the named format code and data type. Produces a message like:
          * <p>
          * <code>outrage+" not allowed with "+forType+" format specifier '"+code+"'"</code>
@@ -753,6 +789,12 @@
                 false, Spec.UNSPECIFIED, Spec.NONE);
 
         /**
+         * Defaults applicable to string types. Equivalent to " <"
+         */
+        public static final Spec STRING = new Spec(' ', '<', Spec.NONE, false, Spec.UNSPECIFIED,
+                false, Spec.UNSPECIFIED, Spec.NONE);
+
+        /**
          * Constructor offering just precision and type.
          *
          * <pre>
diff --git a/src/org/python/core/stringlib/TextFormatter.java b/src/org/python/core/stringlib/TextFormatter.java
new file mode 100644
--- /dev/null
+++ b/src/org/python/core/stringlib/TextFormatter.java
@@ -0,0 +1,105 @@
+// Copyright (c) Jython Developers
+package org.python.core.stringlib;
+
+import org.python.core.stringlib.InternalFormat.Spec;
+
+/**
+ * A class that provides the implementation of <code>str</code> and <code>unicode</code> formatting.
+ * In a limited way, it acts like a StringBuilder to which text is appended, formatted according
+ * to the format specifier supplied at construction. These are ephemeral objects that are not,
+ * on their own, thread safe.
+ */
+public class TextFormatter extends InternalFormat.Formatter {
+
+    /**
+     * Construct the formatter from a specification and guess the initial buffer capacity. A
+     * reference is held to this specification.
+     *
+     * @param spec parsed conversion specification
+     */
+    public TextFormatter(Spec spec) {
+        // No right answer here for the buffer size, especially as non-BMP Unicode possible.
+        super(spec, Math.max(spec.width, spec.getPrecision(10)) + 6);
+    }
+
+    /*
+     * Re-implement the text appends so they return the right type.
+     */
+    @Override
+    public TextFormatter append(char c) {
+        super.append(c);
+        return this;
+    }
+
+    @Override
+    public TextFormatter append(CharSequence csq) {
+        super.append(csq);
+        return this;
+    }
+
+    @Override
+    public TextFormatter append(CharSequence csq, int start, int end) //
+            throws IndexOutOfBoundsException {
+        super.append(csq, start, end);
+        return this;
+    }
+
+    /**
+     * Format the given <code>String</code> into the <code>result</code> buffer. Largely, this is a
+     * matter of copying the value of the argument, but a subtlety arises when the string contains
+     * supplementary (non-BMP) Unicode characters, which are represented as surrogate pairs. The
+     * precision specified in the format relates to a count of Unicode characters (code points), not
+     * Java <code>char</code>s. The method deals with this correctly, essentially by not counting
+     * the high-surrogates in the allowance. The final value of {@link #lenWhole} counts the UTF-16
+     * units added.
+     *
+     * @param value to format
+     * @return this <code>TextFormatter</code> object
+     */
+    public TextFormatter format(String value) {
+        this.reset();
+        int p = spec.precision, n = value.length();
+
+        if (Spec.specified(p) && p < n) {
+            /*
+             * A precision p was specified less than the length: we may have to truncate. Note we
+             * compared p with the UTF-16 length, even though it is the code point length that
+             * matters. But the code point length cannot be greater than n.
+             */
+            int count = 0;
+            while (count < p) {
+                // count is the number of UTF-16 chars.
+                char c = value.charAt(count++);
+                result.append(c);
+                // A high-surrogate will always be followed by a low, so doesn't count.
+                if (Character.isHighSurrogate(c) && p < n) {
+                    // Accomplish "not counting" by bumping the limit p, within the array bounds.
+                    p += 1;
+                }
+            }
+            // Record the UTF-16 count as the length in buffer
+            lenWhole = count;
+
+        } else {
+            // We definitely don't need to truncate. Append the whole string.
+            lenWhole = n;
+            result.append(value);
+        }
+
+        return this;
+    }
+
+    /**
+     * Pad the result according to the specification, dealing correctly with Unicode.
+     */
+    @Override
+    public TextFormatter pad() {
+        // We'll need this many pad characters (if>0). Note Spec.UNSPECIFIED<0.
+        int n = spec.width - result.codePointCount(0, result.length());
+        if (n > 0) {
+            pad(0, n);
+        }
+        return this;
+    }
+
+}
diff --git a/tests/java/org/python/core/StringFormatTest.java b/tests/java/org/python/core/StringFormatTest.java
--- a/tests/java/org/python/core/StringFormatTest.java
+++ b/tests/java/org/python/core/StringFormatTest.java
@@ -10,6 +10,7 @@
 import org.python.core.stringlib.InternalFormatSpec;
 import org.python.core.stringlib.InternalFormatSpecParser;
 import org.python.core.stringlib.MarkupIterator;
+import org.python.core.stringlib.TextFormatter;
 import org.python.util.PythonInterpreter;
 
 /**
@@ -219,15 +220,17 @@
     }
 
     public void testFormatString() {
-        InternalFormatSpec spec = new InternalFormatSpec();
-        assertEquals("abc", PyString.formatString("abc", spec));
+        String v = "abc";
+        TextFormatter f;
+        f = PyString.prepareFormatter(InternalFormat.fromText(""));
+        assertEquals("abc", f.format(v).pad().getResult());
 
-        spec.precision = 3;
-        assertEquals("abc", PyString.formatString("abcdef", spec));
+        String v2 = "abcdef";
+        f = PyString.prepareFormatter(InternalFormat.fromText(".3"));
+        assertEquals("abc", f.format(v2).pad().getResult());
 
-        spec.precision = -1;
-        spec.width = 6;
-        assertEquals("abc   ", PyString.formatString("abc", spec));
+        f = PyString.prepareFormatter(InternalFormat.fromText("6"));
+        assertEquals("abc   ", f.format(v).pad().getResult());
     }
 
     public void testMarkupIterator() {

-- 
Repository URL: http://hg.python.org/jython


[Jython-checkins] jython: Improvements to PyString.__format__ and in StringFormatter related to text.

[Jython-checkins] jython: Improvements to PyString.format and in StringFormatter related to text.