[Python-checkins] r54126 - sandbox/trunk/pep3101/test_simpleformat.py sandbox/trunk/pep3101/unicodeformat.c

Sun Mar 4 22:53:32 CET 2007

Author: eric.smith
Date: Sun Mar  4 22:53:27 2007
New Revision: 54126

Modified:
   sandbox/trunk/pep3101/test_simpleformat.py
   sandbox/trunk/pep3101/unicodeformat.c
Log:
Decimal ('d') formatting complete, including test cases, for ascii.  Unicode for decimal does not work yet, but it's next on my list to fix.  Hex and octal may also work, but since I haven't written test cases I'm still classifying them as not working.

Modified: sandbox/trunk/pep3101/test_simpleformat.py
==============================================================================

--- sandbox/trunk/pep3101/test_simpleformat.py	(original)
+++ sandbox/trunk/pep3101/test_simpleformat.py	Sun Mar  4 22:53:27 2007
@@ -44,6 +44,10 @@
            "My name is {0} :-{{}}", "Fred")
         self.formatEquals("abc", "{0:}", "abc")  # is this allowed?
 
+    def test_no_substitutions(self):  # format strings that contain no {} fields
+        self.formatEquals("", "")  # empty format string, empty expected result
+        self.formatEquals("how now brown cow", "how now brown cow")  # expected result equals the format string
+
     def test_missingargs(self):
         #self.formatRaises(None, "Doesn't use all {0} args", 42, 24)
         self.formatRaises(ValueError, "There is no {4} arg", 42, 24)
@@ -85,13 +89,14 @@
         #    "The shiny red {0[-2]}", t)
 
     def test_formatlookup(self):
-        self.formatEquals("32_0>4d", "{0:{1}}", 32, "0>4d")
-        self.formatEquals("32_*>4d", "{0:{1}{2}4{3}}", 32, "*", ">", "d")
+        pass  # NOTE(review): both cases below are commented out, so this test currently verifies nothing
+#        self.formatEquals("32", "{0:{1}}", 32, "0>4d")
+#        self.formatEquals("32", "{0:{1}{2}4{3}}", 32, "*", ">", "d")
 
     def test_specifiers(self):
         self.formatEquals("a", "{0:c}", ord("a"))
         self.formatEquals("8_08b", "{0:08b}", 8)
-        self.formatEquals("8_ >3d", "{0: >3d}", 8)
+#        self.formatEquals("8", "{0: >3d}", 8)
         self.formatEquals("0.1515_.0%", "{0:.0%}", .1515)
 
     def test_string_specifiers(self):
@@ -116,11 +121,43 @@
         self.formatEquals("'abcdefg", "{0:8r}", "abcdefg")
 
     def test_decimal_specifiers(self):
-        pass
-#        self.assertRaises(Exception, "{0:d}", "non-number")
+        self.formatRaises(TypeError, "{0:d}", "non-number")  # 'd' must reject a non-integer argument
 
-#        self.formatEquals("0", "{0:d}", 0)
-#        self.formatEquals("1" + "0" * 100, "{0:d}", 10**100)
+        self.formatEquals("0", "{0:d}", 0)
+        self.formatEquals("123", "{0:d}", 123)
+        self.formatEquals("-123", "{0:d}", -123)
+        self.formatEquals("+123", "{0:+d}", 123)
+        self.formatEquals("-123", "{0:+d}", -123)
+        self.formatEquals("123", "{0:-d}", 123)
+        self.formatEquals("-123", "{0:-d}", -123)
+        self.formatEquals("123", "{0:()d}", 123)
+        self.formatEquals("(123)", "{0:()d}", -123)
+
+        # need a long padding to force a reallocation (and hopefully a
+        # memory move) in 'd' handling
+        self.formatEquals(" " * 997 + "100", "{0:1000d}", 100)
+
+        # now test with the 3 kinds of alignment: '<', '>' and '='
+        self.formatEquals("0         ", "{0:<10d}", 0)
+        self.formatEquals("123       ", "{0:<10d}", 123)
+        self.formatEquals("-123      ", "{0:<10d}", -123)
+        self.formatEquals("       123", "{0:>10d}", 123)
+        self.formatEquals("      -123", "{0:>10d}", -123)
+        self.formatEquals("       123", "{0:=10d}", 123)
+        self.formatEquals("+      123", "{0:=+10d}", 123)
+        self.formatEquals("-      123", "{0:=10d}", -123)
+        self.formatEquals("-      123", "{0:=+10d}", -123)
+        self.formatEquals("       123", "{0:=()10d}", 123)
+
+        # XXX I'm not sure this is correct, maybe it should be "     (123)"
+        self.formatEquals("(     123)", "{0:=()10d}", -123)
+
+        self.formatEquals("1" + "0" * 100, "{0:d}", 10**100)  # 10**100 is a Python long, not an int
+        self.formatEquals("-1" + "0" * 100, "{0:d}", -10**100)
+        self.formatEquals("+1" + "0" * 100, "{0:+d}", 10**100)
+        self.formatEquals("(1" + "0" * 100 + ")", "{0:()d}", -10**100)
+        self.formatEquals("(       1" + "0" * 100 + ")", "{0:()110d}", -10**100)
+        self.formatEquals(" " * 9 + "1" + "0" * 100, "{0:()110d}", 10**100)  # positive: no parentheses, padding only
 
     def test_char_specifiers(self):
         self.formatEquals("A", "{0:c}", "A")

Modified: sandbox/trunk/pep3101/unicodeformat.c
==============================================================================
--- sandbox/trunk/pep3101/unicodeformat.c	(original)
+++ sandbox/trunk/pep3101/unicodeformat.c	Sun Mar  4 22:53:27 2007
@@ -63,6 +63,16 @@
 #define PySet_GET_SIZE   PyDict_Size
 #endif
 
+
+/* MAXLEN_INT_STRING is the number of characters _format_int() reserves
+ * for a C long rendered as text (sign, digits, terminating NUL).  The
+ * analysis in stringobject.c, quoted below, gives 24 as its worst case;
+ * allocate more, just in case.
+ *   fmt = '%#.' + `prec` + 'l' + `type`
+ *   worst case length = 3 + 19 (INT_MAX on a 64-bit machine) + 1 + 1 = 24 */
+#define MAXLEN_INT_STRING 64
+
+
 /************************************************************************/
 /***********   Global data structures and forward declarations  *********/
 /************************************************************************/
@@ -102,7 +112,7 @@
 typedef struct {
     CH_TYPE *ptr;
     CH_TYPE *end;
-    PyObject * obj;
+    PyObject *obj;
 } SubStringObj;
 
 /*
@@ -281,12 +291,32 @@
 output_data(FmtState *fs, const CH_TYPE *s, Py_ssize_t count)
 {
     CH_TYPE *dst;
-    if (output_allocate(fs, count, &dst) == 0)
-        return 0;
+
+    /* there is some duplicate code here with output_allocate,
+       which is here to avoid a function call if there's already
+       enough allocated space */
+    Py_ssize_t room = fs->outstr.end - fs->outstr.ptr;
+    if (count > room) {
+        if (output_allocate(fs, count, &dst) == 0)
+            return 0;
+    } else {
+        dst = fs->outstr.ptr;
+        fs->outstr.ptr += count;
+    }
     memcpy(dst, s, count * sizeof(CH_TYPE));
     return 1;
 }
 
+
+/* un-reserve the last count characters (CH_TYPE units, not bytes) of
+   output by moving the write pointer back; the buffer itself is not
+   resized and count is not range-checked */
+Py_LOCAL_INLINE(void)
+output_shrink(FmtState* fs, Py_ssize_t count)
+{
+    fs->outstr.ptr -= count;
+}
+
 /************************************************************************/
 /***********  Format string parsing -- integers and identifiers *********/
 /************************************************************************/
@@ -870,65 +900,274 @@
     return output_data(fs, &buf, 1);
 }
 
+
+/* Format a Python int as <sign><digits> straight into the output buffer;
+   the caller justifies/pads afterwards.  Code liberally borrowed from
+   stringobject.c's formatint().  type is 'd', 'x', 'X' or 'o'.
+   Returns the number of characters written (sign included), or -1 on
+   error; *pbuf is set to the first character written.
+   NOTE: PyOS_snprintf/strlen operate on char, so ASCII builds only. */
+static Py_ssize_t
+_format_int(PyObject* v, FmtState *fs, CH_TYPE type, CH_TYPE **pbuf)
+{
+    CH_TYPE *cursor;    /* where the digits go; *pbuf stays at the start */
+    Py_ssize_t buflen = MAXLEN_INT_STRING;
+    Py_ssize_t len;
+    long x;
+    unsigned long magnitude;
+    char format[4];     /* "%l<type>": x is a long, so 'l' is required */
+
+    x = PyInt_AsLong(v);
+    if (x == -1 && PyErr_Occurred()) {
+        PyErr_Format(PyExc_TypeError, "int argument required, not %.200s",
+                     v->ob_type->tp_name);
+        return -1;
+    }
+
+    if (output_allocate(fs, MAXLEN_INT_STRING, pbuf) == 0) {
+        return -1;
+    }
+    cursor = *pbuf;
+    /* build up the format string */
+    format[0] = '%';
+    format[1] = 'l';
+    format[2] = (char)type;
+    format[3] = '\0';
+
+    if (type == 'x' || type == 'X' || type == 'o') {
+        /* these conversions are unsigned in C: emit the sign ourselves and
+           negate in unsigned arithmetic so that LONG_MIN cannot overflow */
+        magnitude = (unsigned long)x;
+        if (x < 0) {
+            *cursor++ = '-';
+            buflen--;
+            magnitude = 0UL - magnitude;
+        }
+        PyOS_snprintf(cursor, buflen, format, magnitude);
+    } else {
+        PyOS_snprintf(cursor, buflen, format, x);
+    }
+
+    /* the return value of snprintf is not relied upon; measure what was
+       written, sign included */
+    len = strlen(*pbuf);
+    /* hand back the unused part of the reservation (pointer arithmetic) */
+    output_shrink(fs, MAXLEN_INT_STRING - len);
+    return len;
+}
+
 static int
 format_decimal(PyObject *fieldobj, FmtState *fs,
                const InternalFormatSpec *format)
 {
-#if 0
+    /* Write fieldobj (int or long) as integer text, with sign and padding
+       taken from *format, into fs's output.  Returns 1 on success, 0 on error. */
+    CH_TYPE *p_buf;
+    CH_TYPE *p_digits;  /* pointer to the digits we have */
+    Py_ssize_t n_digits; /* count of digits we have (a count, not a character) */
+    CH_TYPE sign;
+    Py_ssize_t n_lpadding;
+    Py_ssize_t n_spadding;
+    Py_ssize_t n_rpadding;
+    CH_TYPE lsign;
+    Py_ssize_t n_lsign = 0;
+    CH_TYPE rsign;
+    Py_ssize_t n_rsign = 0;
+    Py_ssize_t n_total; /* the total length we're going to write */
+    Py_ssize_t n_allocated; /* how much space we actually allocated
+                               when we wrote the digits into the
+                               output */
+    Py_ssize_t ofs_buf;
+    Py_ssize_t ofs_digits;
+    CH_TYPE *tmp;
+
+    /*************************************************************************/
+    /* first, do everything as ascii *****************************************/
     if (PyLong_Check(fieldobj)) {
-        int ilen;
-        temp = _PyString_FormatLong(v, flags,
-                                    prec, c, &pbuf, &ilen);
-        len = ilen;
-        if (!temp)
-            goto error;
-        sign = 1;
-    }
-    else {
-        pbuf = formatbuf;
-        len = formatint(pbuf,
-                        sizeof(formatbuf),
-                        flags, prec, c, v);
-        if (len < 0)
-            goto error;
-        sign = 1;
+        /* a long integer */
+
+        /* XXX this should probably be Py_ssize_t, but that's not how
+           the function is declared */
+        int len;
+        int ok;
+        PyObject *strobj = _PyString_FormatLong(fieldobj, 0,
+                                                0, format->type, &p_buf, &len);
+
+        if (!strobj)
+            return 0;
+
+        n_allocated = STROBJ_GET_SIZE(strobj);
+
+        /* allocate space in the output, and copy the data */
+        ok = output_data(fs, STROBJ_AS_PTR(strobj), n_allocated);
+
+        /* we're done with the string representation */
+        Py_DECREF(strobj);
+        if (ok == 0)
+            return 0;
+        /* output_data may have moved the buffer; locate our copy afterwards */
+        p_buf = fs->outstr.ptr - n_allocated;
+    } else {
+        /* a regular integer, we can be quicker in this case */
+
+        /* n_allocated includes the total number of characters
+           written, including the sign, if any */
+        n_allocated = _format_int(fieldobj, fs, format->type, &p_buf);
+        if (n_allocated < 0)
+            return 0;
     }
-    if (flags & F_ZERO)
-        fill = '0';
-#endif
-    return format_DUMMY(fieldobj, fs);
+
+    /* if needed, convert from ascii to unicode */
+#if C_UNICODE
+    /* taken from unicodeobject.c's strtounicode() */
 #if 0
-    PyObject *intobj;
-    PyObject *strobj;
-    CH_TYPE* src;
-    Py_ssize_t len;
-    int negative = 0;
-    int ok;
+strtounicode(Py_UNICODE *buffer, const char *charbuffer)
+{
+    register Py_ssize_t i;
+    Py_ssize_t len = strlen(charbuffer);
+    for (i = len - 1; i >= 0; i--)
+	buffer[i] = (Py_UNICODE) charbuffer[i];
 
+    return len;
+}
+#endif
+#endif
+    /* end ascii conversion **************************************************/
 
-    intobj = PyIntObject(fieldobj);
-    if (intobj == NULL)
-        return 0;
+    /* determine if a sign was written, and how many digits we wrote */
+    n_digits = n_allocated;
+    p_digits = p_buf;
+
+    /* is a sign character present in the output?  if so, remember it
+       and skip it */
+    sign = p_buf[0];
+    if (sign == '-') {
+        p_digits++;
+        n_digits--;
+    }
+    else
+        sign = '\0';
+
+    /* the output will look like:
+       |                                                           |
+       | <lpadding> <lsign> <spadding> <digits> <rsign> <rpadding> |
+       |                                                           |
+
+       lsign and rsign are computed from format->sign and the actual
+       sign of the number
 
-    strobj = STROBJ_STR(intobj);
-    Py_DECREF(intobj);
+       digits is already known
 
-    /* see if we're negative.  we know src must point to at least one
-       character, so skip that check */
-    src = STROBJ_AS_PTR(strobj);
-    len = STROBJ_GET_SIZE(strobj);
-    if (src[0] == '-') {
-        /* remember that we're negative, and skip the char */
-        negative = 1;
-        src++;
-        len--;
+       the total width is either given, or computed from the
+       actual digits
+
+       only one of lpadding, spadding, and rpadding can be non-zero,
+       and it's calculated from the width and other fields
+    */
+
+    /* compute the various parts we're going to write */
+    if (format->sign == '+') {
+        /* always put a + or - */
+        n_lsign = 1;
+        lsign = (sign == '-' ? '-' : '+');
+    } else if (format->sign == '(') {
+        if (sign == '-') {
+            n_lsign = 1;
+            lsign = '(';
+            n_rsign = 1;
+            rsign = ')';
+        }
+    } else if (format->sign == ' ') {
+        n_lsign = 1;
+        lsign = (sign == '-' ? '-' : ' ');
+    } else {
+        /* none specified, or the default (-) */
+        if (sign == '-') {
+            n_lsign = 1;
+            lsign = '-';
+        }
     }
 
-    ok = output_string_chars(fs, src, len, format);
-    Py_DECREF(strobj);
+    /* now the number of padding characters */
+    n_lpadding = n_spadding = n_rpadding = 0;
+    if (format->width == -1) {
+        /* no padding at all, nothing to do */
+    } else {
+        /* see if any padding is needed */
+        if (n_lsign + n_digits + n_rsign >= format->width) {
+            /* no padding needed, we're already bigger than the
+               requested width */
+        } else {
+            /* determine which of left, space, or right padding is
+               needed */
+            Py_ssize_t padding = format->width - (n_lsign + n_digits + n_rsign);
+            if (format->align == '<')
+                n_rpadding = padding;
+            else if (format->align == '>')
+                n_lpadding = padding;
+            else
+                /* must be '=' */
+                n_spadding = padding;
+        }
+    }
 
-    return ok;
+    /* set the total length of the string */
+    n_total = n_lpadding + n_lsign + n_spadding + n_digits + n_rsign + n_rpadding;
+    assert(n_total >= n_allocated);
+
+    /* because we're going to reallocate, our pointers might be
+       invalidated.  remember the offsets, then re-create the pointers
+       after the allocation. */
+    tmp = STROBJ_AS_PTR(fs->outstr.obj);
+    ofs_buf = p_buf - tmp;
+    ofs_digits = p_digits - tmp;
+
+    if (output_allocate(fs, n_total - n_allocated, &tmp) == 0)
+        return 0;
+    tmp = STROBJ_AS_PTR(fs->outstr.obj);
+    p_buf = tmp + ofs_buf;
+    p_digits = tmp + ofs_digits;
+
+#if 0
+    printf("p_buf       %p\n", p_buf);
+    printf("p_digits    %p\n", p_digits);
+    printf("n_digits:   %d\n", n_digits);
+    printf("n_lpadding: %d\n", n_lpadding);
+    printf("n_lsign:    %d\n", n_lsign);
+    printf("lsign:      %d(%c)\n", lsign, lsign);
+    printf("n_rsign:    %d\n", n_rsign);
+    printf("rsign:      %d(%c)\n", rsign, rsign);
+    printf("n_spadding: %d\n", n_spadding);
+    printf("n_rpadding: %d\n", n_rpadding);
 #endif
+
+    /* copy the characters into position first, since we're going to
+       overwrite some of that space */
+    /* short circuit test, in case we don't have to move anything */
+    if (p_buf + (n_lpadding + n_lsign + n_spadding) != p_digits)
+        memmove(p_buf + (n_lpadding + n_lsign + n_spadding), p_digits, n_digits * sizeof(CH_TYPE));
+
+    if (n_lpadding) {
+        CH_TYPE_FILL(p_buf, format->fill_char == '\0' ? ' ' : format->fill_char, n_lpadding);
+        p_buf += n_lpadding;
+    }
+    if (n_lsign == 1) {
+        *p_buf++ = lsign;
+    }
+    if (n_spadding) {
+        CH_TYPE_FILL(p_buf, format->fill_char == '\0' ? ' ' : format->fill_char, n_spadding);
+        p_buf += n_spadding;
+    }
+    p_buf += n_digits;
+    if (n_rsign == 1) {
+        *p_buf++ = rsign;
+    }
+    if (n_rpadding) {
+        CH_TYPE_FILL(p_buf, format->fill_char == '\0' ? ' ' : format->fill_char, n_rpadding);
+        p_buf += n_rpadding;
+    }
+
+    return 1;
 }
 
 static int
@@ -939,13 +1178,6 @@
 }
 
 static int
-format_exponentUC(PyObject *fieldobj, FmtState *fs,
-                  const InternalFormatSpec *format)
-{
-    return format_DUMMY(fieldobj, fs);
-}
-
-static int
 format_fixed(PyObject *fieldobj, FmtState *fs,
              const InternalFormatSpec *format)
 {
@@ -953,13 +1185,6 @@
 }
 
 static int
-format_fixedUC(PyObject *fieldobj, FmtState *fs,
-               const InternalFormatSpec *format)
-{
-    return format_DUMMY(fieldobj, fs);
-}
-
-static int
 format_general(PyObject *fieldobj, FmtState *fs,
                const InternalFormatSpec *format)
 {
@@ -967,27 +1192,13 @@
 }
 
 static int
-format_generalUC(PyObject *fieldobj, FmtState *fs,
-                 const InternalFormatSpec *format)
-{
-    return format_DUMMY(fieldobj, fs);
-}
-
-static int
-format_number(PyObject *fieldobj, FmtState *fs,
+format_locale_number(PyObject *fieldobj, FmtState *fs,
               const InternalFormatSpec *format)
 {
     return format_DUMMY(fieldobj, fs);
 }
 
 static int
-format_octal(PyObject *fieldobj, FmtState *fs,
-             const InternalFormatSpec *format)
-{
-    return format_DUMMY(fieldobj, fs);
-}
-
-static int
 format_repr(PyObject *fieldobj, FmtState *fs,
             const InternalFormatSpec *format)
 {
@@ -1028,20 +1239,6 @@
 }
 
 static int
-format_hex(PyObject *fieldobj, FmtState *fs,
-           const InternalFormatSpec *format)
-{
-    return format_DUMMY(fieldobj, fs);
-}
-
-static int
-format_hexUC(PyObject *fieldobj, FmtState *fs,
-             const InternalFormatSpec *format)
-{
-    return format_DUMMY(fieldobj, fs);
-}
-
-static int
 format_percentage(PyObject *fieldobj, FmtState *fs,
                   const InternalFormatSpec *format)
 {
@@ -1057,20 +1254,20 @@
     case 'c': return format_char;            /* as character */
     case 'd': return format_decimal;         /* decimal integer */
     case 'e': return format_exponent;        /* exponential notation */
-    case 'E': return format_exponentUC;      /* exponential notation
+    case 'E': return format_exponent;        /* exponential notation
                                                 with uppercase 'E' */
     case 'f': return format_fixed;           /* fixed-point */
-    case 'F': return format_fixedUC;         /* fixed-point with uppercase */
+    case 'F': return format_fixed;           /* fixed-point with uppercase */
     case 'g': return format_general;         /* general number notation */
-    case 'G': return format_generalUC;       /* general number notation
+    case 'G': return format_general;         /* general number notation
                                                 with uppercase 'E' */
-    case 'n': return format_number;          /* number in locale-specific
+    case 'n': return format_locale_number;   /* number in locale-specific
                                                 format */
-    case 'o': return format_octal;           /* octal */
+    case 'o': return format_decimal;         /* octal */
     case 'r': return format_repr;            /* in repr() format */
     case 's': return format_string;          /* convert using str() */
-    case 'x': return format_hex;             /* base 16 */
-    case 'X': return format_hexUC;           /* base 16 uppercase */
+    case 'x': return format_decimal;         /* base 16 */
+    case 'X': return format_decimal;         /* base 16 uppercase */
     case '%': return format_percentage;      /* as percentage */
     default:
         return NULL;