[Python-checkins] r54056 - sandbox/trunk/pep3101/README.txt sandbox/trunk/pep3101/pep3101.c sandbox/trunk/pep3101/pep3101.h sandbox/trunk/pep3101/setup.py sandbox/trunk/pep3101/stringformat.c sandbox/trunk/pep3101/test_simpleformat.py sandbox/trunk/pep3101/unicodeformat.c

patrick.maupin python-checkins at python.org
Thu Mar 1 09:22:28 CET 2007


Author: patrick.maupin
Date: Thu Mar  1 09:22:25 2007
New Revision: 54056

Added:
   sandbox/trunk/pep3101/README.txt
   sandbox/trunk/pep3101/pep3101.c
   sandbox/trunk/pep3101/pep3101.h
   sandbox/trunk/pep3101/setup.py
   sandbox/trunk/pep3101/stringformat.c
   sandbox/trunk/pep3101/test_simpleformat.py
   sandbox/trunk/pep3101/unicodeformat.c
Log:
Initial version of pep3101 sandbox code committed

Added: sandbox/trunk/pep3101/README.txt
==============================================================================
--- (empty file)
+++ sandbox/trunk/pep3101/README.txt	Thu Mar  1 09:22:25 2007
@@ -0,0 +1,70 @@
+
+PEP 3101
+
+This directory is where sample code to support PEP 3101 and do
+related things is being developed.
+
+Current developers:
+
+    Patrick Maupin (pmaupin at gmail.com)
+    Eric V. Smith
+    Pete Shinner
+
+The code is only half-baked at present
+(development was started at PyCon 2007 and is in progress).
+
+Although the PEP3101 goal is a (unicode) string format method, since
+this is a sandbox, we might do a few more ambitious things as well,
+to see if people like them.
+
+The current plan of record is to make a pep3101 extension module.
+It will have at least the following features:
+
+    - contains a format function which takes a string and parameters
+      (but which is easily portable to a string method for Py3K)
+    - can be compiled against Python 2.4, 2.5, and Py3K
+    - works with the string object as well as unicode
+
+The current code has a module which is progressing nicely, and some
+unittests for the current version.
+
+Files:
+
+    - unicodeformat.c is designed to be easily added to Python
+      as a method of the unicode object.
+    - stringformat.c is a wrapper around unicodeformat.c, which
+      "templatizes" the entire file to make it easy to add to Python
+      as a method of the string object.
+    - pep3101.h contains definitions for the functions in stringformat
+      and unicodeformat
+    - pep3101.c contains a module implementation which can be linked
+      with these method files for testing and/or use with earlier
+      Python versions.
+    - setup.py  -- Use "build" option to make the extension module
+    - test_simpleformat.py  -- initial unittests
+
+Todo:
+
+    - finish up format specifier handling
+    - document differences between PEP and implementation
+    - Add docstrings to module
+    - print string offset information on certain errors
+    - Add _flags options
+    - Play with possible implementations for formatting
+      strings against dictionaries as well as the format
+      (dangerous)
+    - Play with possible implementations for exposing
+      lowest level format specifier handler for use in
+      compatible template systems.
+    - Play with possible options for specifying additional
+      escape syntaxes
+
+_flags options to consider adding:
+
+    - useall=1   means all arguments should be used
+    - allow_leading_under  means leading underbars allowed
+    - syntax=0,1,2,3 -- different syntaxes
+    - hook=object -- callback hook as described in the PEP
+    - informational mode to dump exceptions into the string
+      (as described in the PEP)
+    - max_recursion=xxx (default 4)

Added: sandbox/trunk/pep3101/pep3101.c
==============================================================================
--- (empty file)
+++ sandbox/trunk/pep3101/pep3101.c	Thu Mar  1 09:22:25 2007
@@ -0,0 +1,64 @@
+#include "Python.h"
+
+/* XXX -- remove this include if integrated into Python.h ??? */
+#include "pep3101.h"
+
+
+/* ----------------------------------------------------- */
+
+static char pep3101_format__doc__[] =
+""
+;
+
+/*
+    Module-level format(): the first positional argument is the format
+    string, the remaining positional arguments and the keywords are
+    handed to the string or unicode formatter, chosen by the type of
+    that first argument.
+*/
+static PyObject *
+pep3101_format(PyObject *self, PyObject *args, PyObject *keywords)
+{
+    /* Formatter selected from the type of the format string */
+    PyObject *(*formatter)(PyObject *, PyObject *, PyObject *) = NULL;
+    PyObject *fmtstring;            /* borrowed reference into args */
+    PyObject *fmtargs;              /* new reference: args[1:] */
+    PyObject *formatted = NULL;
+
+    fmtstring = PyTuple_GetItem(args, 0);
+    if (!fmtstring)
+        return NULL;
+
+    fmtargs = PyTuple_GetSlice(args, 1, PyTuple_Size(args) + 1);
+    if (!fmtargs)
+        return NULL;
+
+    if (PyUnicode_Check(fmtstring))
+        formatter = PyUnicode_FormatMethod;
+    else if (PyString_Check(fmtstring))
+        formatter = PyString_FormatMethod;
+
+    if (formatter != NULL)
+        formatted = formatter(fmtstring, fmtargs, keywords);
+    else
+        PyErr_SetString(PyExc_TypeError,
+            "First parameter to format must be string or unicode object");
+
+    Py_DECREF(fmtargs);
+    return formatted;
+}
+
+/* List of methods defined in the module: only format(), which takes
+   both positional and keyword arguments */
+
+static struct PyMethodDef pep3101_methods[] = {
+    {"format",    (PyCFunction)pep3101_format,    METH_VARARGS | METH_KEYWORDS,    pep3101_format__doc__},
+ 
+    {NULL,     (PyCFunction)NULL, 0, NULL}        /* sentinel */
+};
+
+
+/* Initialization function for the module (*must* be called initpep3101) */
+
+static char pep3101_module_documentation[] = 
+""
+;
+
+void
+initpep3101(void)
+{
+    /* Register the module together with its method table and
+       documentation string; the module object itself is not needed
+       afterwards */
+    (void)Py_InitModule4("pep3101",
+                         pep3101_methods,
+                         pep3101_module_documentation,
+                         (PyObject*)NULL,
+                         PYTHON_API_VERSION);
+
+    /* Any pending exception at this point is fatal */
+    if (PyErr_Occurred())
+        Py_FatalError("can't initialize module pep3101");
+}

Added: sandbox/trunk/pep3101/pep3101.h
==============================================================================
--- (empty file)
+++ sandbox/trunk/pep3101/pep3101.h	Thu Mar  1 09:22:25 2007
@@ -0,0 +1,12 @@
+#ifndef Py_PEP3101_H
+#define Py_PEP3101_H
+
+/* XXX -- need #ifdefs to remove Unicode if not using it, and remove String on Py3K */
+
+PyObject *
+PyString_FormatMethod(PyObject *self, PyObject *args, PyObject *keywords);
+
+PyObject *
+PyUnicode_FormatMethod(PyObject *self, PyObject *args, PyObject *keywords);
+
+#endif

Added: sandbox/trunk/pep3101/setup.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/pep3101/setup.py	Thu Mar  1 09:22:25 2007
@@ -0,0 +1,11 @@
+from distutils.core import setup, Extension
+
+module1 = Extension('pep3101',
+                    sources = ['pep3101.c', 'unicodeformat.c', 'stringformat.c'],
+                    depends = ['unicodeformat.c'],  # stringformat.c #includes unicodeformat.c, so force its rebuild when that file changes
+                    )
+
+setup (name = 'pep3101',
+       version = '1.0',
+       description = 'Extension module to implement features of PEP 3101',
+       ext_modules = [module1])

Added: sandbox/trunk/pep3101/stringformat.c
==============================================================================
--- (empty file)
+++ sandbox/trunk/pep3101/stringformat.c	Thu Mar  1 09:22:25 2007
@@ -0,0 +1,4 @@
+#include "Python.h"
+#define COMPILED_FROM_INSIDE_STRINGFORMAT
+#define C_UNICODE 0
+#include "unicodeformat.c"

Added: sandbox/trunk/pep3101/test_simpleformat.py
==============================================================================
--- (empty file)
+++ sandbox/trunk/pep3101/test_simpleformat.py	Thu Mar  1 09:22:25 2007
@@ -0,0 +1,115 @@
+
+import unittest
+from test import test_support
+
+import pep3101
+
+
+# The test implementation does not allow an argument
+# index or keyword name to be used more than once. The
+# PEP doesn't make any specification on this, but it
+# seems too useful to leave as is.
+
+
+class FormatTest(unittest.TestCase):
+   # All tests run through these functions. They can be
+   # overridden to change the class of string being tested
+   # and the function being used.
+   def formatEquals(self, result, text, *args, **kwargs):
+       text = str(text)
+       result = str(result)
+       val = pep3101.format(text, *args, **kwargs)
+       self.assertEquals(val, result)
+
+   def formatRaises(self, exc, text, *args, **kwargs):
+       exc = exc or Exception #StringFormat.FormatError
+       text = str(text)
+       #prevState = StringFormat.strict_format_errors
+       #StringFormat.strict_format_errors = True
+       try:
+           self.assertRaises(exc,
+                             lambda: pep3101.format(
+                                 text, *args, **kwargs))
+       finally:
+           pass
+           #StringFormat.strict_format_errors = prevState
+
+
+   def test_basic(self):
+       # directly from the pep intro
+       self.formatEquals(
+           "My name is Fred",
+           "My name is {0}", "Fred")
+       self.formatEquals(
+           "My name is Fred :-{}",
+           "My name is {0} :-{{}}", "Fred")
+       self.formatEquals("abc", "{0:}", "abc")  # is this allowed?
+
+   def test_missingargs(self):
+       #self.formatRaises(None, "Doesn't use all {0} args", 42, 24)
+       self.formatRaises(IndexError, "There is no {4} arg", 42, 24)
+       self.formatRaises(KeyError, "There question is {when}", who=True)
+
+   def test_attributes(self):
+       class Container(object):
+           one, _two, four4 = 1, 2, 4
+           def __getattr__(self, name):
+               if name == "five": return 5
+               raise TypeError("Never do this")
+       self.formatEquals(
+           "Count with me; 1 2 4",
+           "Count with me; {0.one} {item._two} {1.four4}",
+           Container, Container, item=Container)
+       self.formatEquals(
+           "Five is 5", "Five is {c.five}", c=Container())
+       self.formatRaises(AttributeError,
+           "Missing {0.rabbit} lookup", Container)
+       self.formatRaises(TypeError,
+           "Forbidden {0.secret} lookup", Container())
+
+   def test_items(self):
+       d = dict(val="value", sum=1)
+       t = tuple(("apple", "ball", "cat"))
+       self.formatEquals(
+           "The value of apple",
+           "The {0[val]} of {t[0]}", d, t=t)
+       # Decided against negative indices for now
+       #self.formatEquals(
+       #    "The shiny red ball",
+       #    "The shiny red {0[-2]}", t)
+
+   def test_formatlookup(self):
+       self.formatEquals("32_0>4d", "{0:{1}}", 32, "0>4d")
+       self.formatEquals("32_*>4d", "{0:{1}{2}4{3}}", 32, "*", ">", "d")
+
+   def test_specifiers(self):
+       self.formatEquals("97_c", "{0:c}", ord("a"))
+       self.formatEquals("8_08b", "{0:08b}", 8)
+       self.formatEquals("8_ >3d", "{0: >3d}", 8)
+       self.formatEquals("0.1515_.0%", "{0:.0%}", .1515)
+
+   def test_custom_format(self):
+       class Custom(object):
+           def __format__(self, specifiers):
+               return specifiers
+       custom = Custom()
+       self.formatEquals("magic", "{0:magic}", custom)
+       self.formatEquals("custom", "{0:{1}}", custom, "custom")
+
+   def test_syntaxerror(self):
+       self.assertRaises(Exception, "}{", True)
+       self.assertRaises(Exception, "{0", True)
+       self.assertRaises(Exception, "{0.[]}", True)
+       self.assertRaises(Exception, "{0[0}", True)
+       self.assertRaises(Exception, "{0[0:foo}", True)
+       self.assertRaises(Exception, "{c]}", True)
+       self.assertRaises(Exception, "{{1}}", True, 0)
+       self.assertRaises(Exception, "{{ {{{0}}", True)
+       self.assertRaises(Exception, "{0}}", True)
+
+
+def test_main():
+   """Run the FormatTest cases through test_support.run_unittest."""
+   test_support.run_unittest(FormatTest)
+
+if __name__ == "__main__":
+   test_main()

Added: sandbox/trunk/pep3101/unicodeformat.c
==============================================================================
--- (empty file)
+++ sandbox/trunk/pep3101/unicodeformat.c	Thu Mar  1 09:22:25 2007
@@ -0,0 +1,1091 @@
+#define DUMMY_FORMATTING 1
+
+/*
+    unicodeformat.c -- implementation of PEP 3101
+
+    PEP 3101 and example Python implementation written by Talin
+
+    This module designed and written by Patrick Maupin and Eric V Smith
+
+    This module is designed to be compiled standalone, or from inside stringformat.c,
+    to support both unicode and traditional strings.
+*/
+
+/*
+    XXX -- todo: insert a fragment of the source string into error messages
+*/
+
+#ifndef COMPILED_FROM_INSIDE_STRINGFORMAT
+#include "Python.h"
+#define C_UNICODE 1
+#endif
+
+#if C_UNICODE
+#define CH_TYPE                  Py_UNICODE
+#define CH_TYPE_ISDECIMAL        Py_UNICODE_ISDECIMAL
+#define CH_TYPE_TODECIMAL        Py_UNICODE_TODECIMAL
+#define STROBJ_AS_PTR            PyUnicode_AS_UNICODE
+#define STROBJ_GET_SIZE          PyUnicode_GET_SIZE
+#define STROBJ_NEW               PyUnicode_FromUnicode
+#define STROBJ_RESIZE            PyUnicode_Resize
+#define STROBJ_CHECK             PyUnicode_Check
+#define STROBJ_FORMAT            PyUnicode_FormatMethod
+#define STROBJ_STR               PyObject_Unicode
+#else
+#define CH_TYPE                  char
+#define CH_TYPE_ISDECIMAL(x)     ((x >= '0') && (x <= '9'))
+#define CH_TYPE_TODECIMAL(x)     (CH_TYPE_ISDECIMAL(x) ? (x - '0') : -1)
+#define STROBJ_AS_PTR            PyString_AS_STRING
+#define STROBJ_GET_SIZE          PyString_GET_SIZE
+#define STROBJ_NEW               PyString_FromStringAndSize
+#define STROBJ_RESIZE            _PyString_Resize
+#define STROBJ_CHECK             PyString_Check
+#define STROBJ_FORMAT            PyString_FormatMethod
+#define STROBJ_STR               PyObject_Str
+#endif
+
+/* Try to support older versions of Python*/
+#if PYTHON_API_VERSION < 1013
+typedef int Py_ssize_t;
+#define Py_LOCAL_INLINE(x)        static x
+#endif
+
+/* Defines for more efficiently reallocating the string buffer */
+#define INITIAL_SIZE_INCREMENT 100
+#define SIZE_MULTIPLIER 2
+#define MAX_SIZE_INCREMENT  3200
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* XXX -- remove this include if integrated into Python.h ??? */
+#include "pep3101.h"
+
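+/*
+    A SubString is the run of characters between two CH_TYPE pointers
+    (Py_UNICODE or char, depending on C_UNICODE): ptr is the first
+    character and end is one past the last.
+*/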
+typedef struct {
+    CH_TYPE *ptr;
+    CH_TYPE *end;
+} SubString;
+
+/*
+   A SubStringObj is like a SubString, but also has an associated
+   object (which may be null).
+*/
+typedef struct {
+    CH_TYPE *ptr;
+    CH_TYPE *end;
+    PyObject * obj;
+} SubStringObj;
+
+/*
+    If this were written in C++, FmtState would be the class,
+    and most of the functions inside this file would be members
+    of this class.
+*/
+typedef struct {
+    /* args passed to PyString_FormatMethod or PyUnicode_FormatMethod */
+    PyObject *args;
+    /* keywords passed to PyString_FormatMethod or PyUnicode_FormatMethod */
+    PyObject *keywords;
+    /* current position and end of the 'self' string passed to FormatMethod */
+    SubString fmtstr;
+    /* Output string we are constructing: .obj is the string object,
+       .ptr the next write position and .end the end of its buffer */
+    SubStringObj outstr;
+    /* Field Specifier, after the colon in {1:{2}}
+       This may or may not have a valid object (the field specifier might
+       just be a substring of the fmtstr).  If it does not have its own
+       object, the .obj struct member will be NULL */
+    SubStringObj fieldspec;
+    /* size_increment is the extra room requested whenever the output
+       string has to grow; it is used for optimizing string growth */
+    int size_increment;
+    /* max_recursion is used to limit the ability of a malicious string
+       to damage the stack.  Default value is 4 */
+    int max_recursion;
+    /* By default, leading underscores are not allowed for security reasons */
+    int allow_leading_under;
+    /* positional_arg_set contains one bit for every positional argument
+       that we still expect to be used.  This implementation only checks
+       that the first 32 positional arguments are actually used.  If they
+       want more than that, they probably really need the check, but on
+       the other hand they are probably beyond help, so the check would
+       be necessary but not sufficient :) */
+    int positional_arg_set;
+    /* Keyword arguments can be checked as well; may be NULL, in which
+       case no keyword usage is tracked */
+    PyObject *keyword_arg_set;
+    /* For some interface functions, we could have a list or tuple of
+       dictionaries to search, e.g. locals()/globals(). */
+    int keywords_is_tuple;
+} FmtState;
+
+/*
+    Our internal conversion functions have this signature.
+
+    returns the number of characters written, or -1 if error
+*/
+/* XXX obviously wrong, but just a placeholder currently */
+typedef Py_ssize_t (*ConversionFunction)(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+
+/*
+    Forward declarations for our conversion functions
+*/
+static Py_ssize_t convert_binary(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_char(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_decimal(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_exponent(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_exponentUC(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_fixed(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_fixedUC(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_general(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_generalUC(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_number(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_octal(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_repr(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_string(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_hex(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_hexUC(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+static Py_ssize_t convert_percentage(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign);
+
+
+/* Some forward declarations for recursion */
+static PyObject *
+get_field_object(FmtState *fs);
+
+static PyObject *
+recurse_format(FmtState *fs);
+
+/*
+    Most of our errors are value errors, because to Python, the
+    format string is a "value".  SetError raises ValueError with the
+    message s and always returns NULL, so that a function returning
+    a pointer can simply write "return SetError(...)" when erroring
+    out.
+*/
+static void *
+SetError(const char *s)
+{
+    PyErr_SetString(PyExc_ValueError, s);
+    return NULL;
+}
+
+/*
+    check_fmtstr reports whether the format string still has
+    unread characters.  It returns 1 if so; otherwise it raises
+    ValueError("Invalid format string") and returns 0.
+*/
+Py_LOCAL_INLINE(int)
+check_fmtstr(FmtState *fs)
+{
+    if (fs->fmtstr.ptr < fs->fmtstr.end)
+        return 1;
+    SetError("Invalid format string");
+    return 0;
+}
+
+/*
+    end_identifier tells whether a character terminates an
+    identifier: one of '.', '[', ']', '}' or ':'.
+
+    The PEP restricts identifiers to numbers or valid Python
+    identifiers, but nothing more is validated here; getattr/getitem
+    get to decide, which makes the implementation more flexible
+    than the PEP would indicate.
+*/
+Py_LOCAL_INLINE(int)
+end_identifier(CH_TYPE c)
+{
+    return c == '.' || c == '[' || c == ']' || c == '}' || c == ':';
+}
+
+
+/* returns 1 for the specifier alignment tokens '<', '>', '=' and '^',
+   and 0 for any other character */
+Py_LOCAL_INLINE(int)
+alignment_token(CH_TYPE c)
+{
+    return c == '<' || c == '>' || c == '=' || c == '^';
+}
+
+/* returns 1 for the sign elements ' ', '+', '-' and '(',
+   and 0 for any other character */
+Py_LOCAL_INLINE(int)
+sign_element(CH_TYPE c)
+{
+    return c == ' ' || c == '+' || c == '-' || c == '(';
+}
+
+
+
+/* maps a conversion type character to its conversion function;
+   returns NULL when the character is not a known conversion type */
+Py_LOCAL_INLINE(ConversionFunction)
+conversion_function(CH_TYPE c)
+{
+    static const struct {
+        char code;
+        ConversionFunction convert;
+    } conversions[] = {
+        {'b', convert_binary},        /* base-2 */
+        {'c', convert_char},          /* as character */
+        {'d', convert_decimal},       /* decimal integer */
+        {'e', convert_exponent},      /* exponential notation */
+        {'E', convert_exponentUC},    /* exponential notation with uppercase 'E' */
+        {'f', convert_fixed},         /* fixed-point */
+        {'F', convert_fixedUC},       /* fixed-point with uppercase */
+        {'g', convert_general},       /* general number notation */
+        {'G', convert_generalUC},     /* general number notation with uppercase 'E' */
+        {'n', convert_number},        /* number in locale-specific format */
+        {'o', convert_octal},         /* octal */
+        {'r', convert_repr},          /* in repr() format */
+        {'s', convert_string},        /* convert using str() */
+        {'x', convert_hex},           /* base 16 */
+        {'X', convert_hexUC},         /* base 16 uppercase */
+        {'%', convert_percentage}     /* as percentage */
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(conversions) / sizeof(conversions[0]); i++) {
+        if (c == (CH_TYPE)conversions[i].code)
+            return conversions[i].convert;
+    }
+    return NULL;
+}
+
+/* Build a SubStringObj spanning the whole character buffer of a
+   Python string object; no reference count is changed here */
+Py_LOCAL_INLINE(SubStringObj)
+make_substrobj(PyObject *obj)
+{
+    SubStringObj view;
+    CH_TYPE *start = STROBJ_AS_PTR(obj);
+
+    view.ptr = start;
+    view.end = start + STROBJ_GET_SIZE(obj);
+    view.obj = obj;
+    return view;
+}
+
+#if PYTHON_API_VERSION < 1013
+static void
+PySet_Discard(PyObject *myset, PyObject *mykey)
+{
+    /* Stand-in used only when PYTHON_API_VERSION < 1013.  It currently
+       does nothing, so mykey is never removed from myset.
+       XXX --- Need to add the right code here */
+}
+#endif
+
+/* XXX -- similar function elsewhere ???? */
+/*
+    output_data appends count characters from s to the output
+    string held in fs->outstr.
+
+    When the remaining room is too small the string object is
+    resized to hold the new data plus fs->size_increment spare
+    characters, and the increment is multiplied by SIZE_MULTIPLIER
+    as long as it is still below MAX_SIZE_INCREMENT.
+
+    Returns 1 on success and 0 if the resize failed.
+*/
+static int
+output_data(FmtState *fs, const CH_TYPE *s, Py_ssize_t count)
+{
+    if (fs->outstr.end - fs->outstr.ptr < count) {
+        Py_ssize_t used = fs->outstr.ptr - STROBJ_AS_PTR(fs->outstr.obj);
+        Py_ssize_t capacity = used + count + fs->size_increment;
+        CH_TYPE *base;
+
+        if (STROBJ_RESIZE(&fs->outstr.obj, capacity) < 0)
+            return 0;
+
+        /* The resize may have moved the buffer: re-derive both pointers */
+        base = STROBJ_AS_PTR(fs->outstr.obj);
+        fs->outstr.ptr = base + used;
+        fs->outstr.end = base + capacity;
+
+        if (fs->size_increment < MAX_SIZE_INCREMENT)
+            fs->size_increment *= SIZE_MULTIPLIER;
+    }
+
+    memcpy(fs->outstr.ptr, s, count * sizeof(CH_TYPE));
+    fs->outstr.ptr += count;
+    return 1;
+}
+
+/*
+    get_python_identifier is a bit of a misnomer.  It returns
+    a value for use with getattr or getitem.  This value
+    will usually be a string value, but we allow {} inside the
+    text, so it could really be any arbitrary python object,
+    as retrieved from the method arguments.
+
+    isargument is true when the identifier names a top-level
+    argument; in that case the name is discarded from
+    fs->keyword_arg_set (when that set exists).
+
+    Returns a new reference, or NULL with an exception set.
+*/
+static PyObject *
+get_python_identifier(FmtState *fs, int isargument)
+{
+    CH_TYPE *startptr;
+    PyObject *result;
+    if (!check_fmtstr(fs))
+        return NULL;
+    if (*fs->fmtstr.ptr == '{') {
+        /* This little bit of mutual recursion allows nested dictionary
+           lookups and computed attribute names
+        */
+        if (fs->max_recursion <= 0)
+            return SetError("Max string recursion exceeded");
+        fs->max_recursion--;
+        /* Step over the opening '{'.  get_field_object expects to start
+           on the first character of the field; left on the brace it
+           would call straight back into this branch until the
+           recursion limit was exhausted. */
+        fs->fmtstr.ptr++;
+        result = get_field_object(fs);
+        fs->max_recursion++;
+        if (result == NULL)
+            return NULL;
+        /* On success get_field_object leaves ptr on an in-bounds '}'
+           or ':' */
+        if (*fs->fmtstr.ptr != '}') {
+            /* Drop the reference we were given before reporting */
+            Py_DECREF(result);
+            return SetError("Expected closing }");
+        }
+        fs->fmtstr.ptr++;
+        return result;
+    }
+    if (end_identifier(*fs->fmtstr.ptr))
+        return SetError("Expected attribute or index");
+    if ((*fs->fmtstr.ptr == '_') && !fs->allow_leading_under)
+        return SetError("Index/attribute leading underscores disallowed");
+
+    /* The first character is known to be in bounds and part of the
+       identifier; for every further one check the bounds before
+       looking at the character. */
+    startptr = fs->fmtstr.ptr;
+    do {
+        fs->fmtstr.ptr++;
+        if (!check_fmtstr(fs))
+            return NULL;
+    } while (!end_identifier(*fs->fmtstr.ptr));
+
+    result = STROBJ_NEW(startptr, fs->fmtstr.ptr - startptr);
+    if (result == NULL)
+        return NULL;
+    if (isargument && (fs->keyword_arg_set != NULL))
+        PySet_Discard(fs->keyword_arg_set, result);
+    /*
+        We might want to add code here to check for invalid Python
+        identifiers.  All identifiers are eventually passed to getattr
+        or getitem, so there is a check when used.  However, we might
+        want to remove (or not) the ability to have strings like
+        "a/b" or " ab" or "-1" (which is not parsed as a number).
+        For now, this is left as an exercise for the first disgruntled
+        user...
+
+    if (XXX -- need check function) {
+        Py_DECREF(result);
+        PyErr_SetString(PyExc_ValueError, "Invalid embedded Python identifier");
+        return NULL;
+    }
+    */
+    return result;
+}
+
+/*
+    If keywords are supplied as a sequence of dictionaries
+    (e.g. locals/globals) then name_mapper will do multiple
+    lookups until it finds the right information.  This
+    should not be called (keywords_is_tuple should not be
+    set) unless fs->keywords is a tuple.
+
+    Returns a new reference to the first match.  If no entry
+    yields the key, returns NULL with the exception from the last
+    lookup left set (KeyError for an empty tuple).
+*/
+static PyObject *
+name_mapper(PyObject *keywords, PyObject *key)
+{
+    PyObject *result;
+    Py_ssize_t index;
+    Py_ssize_t count = PyTuple_GET_SIZE(keywords);
+
+    if (count <= 0) {
+        /* Nothing to search: report the miss instead of reading
+           item 0 of an empty tuple */
+        PyErr_SetObject(PyExc_KeyError, key);
+        return NULL;
+    }
+
+    for (index = 0; index < count; index++) {
+        /* Forget the failure of the previous mapping before trying
+           the next one */
+        if (index > 0)
+            PyErr_Clear();
+        result = PyObject_GetItem(PyTuple_GET_ITEM(keywords, index), key);
+        if (result != NULL) {
+            /* PyObject_GetItem already returns a new reference, so no
+               further Py_INCREF is needed (one would leak it) */
+            return result;
+        }
+    }
+    return NULL;
+}
+
+/*
+    get_integer_index consumes 0 or more decimal digit characters
+    from the format string, stores the corresponding non-negative
+    integer in *result, and returns the number of digits consumed.
+
+    If the value does not fit in a Py_ssize_t, a ValueError is set,
+    *result is left untouched and 0 is returned.  A return of 0 can
+    therefore also mean "no digits present"; callers that need to
+    tell the two apart must check for a pending exception.
+*/
+static int
+get_integer_index(FmtState *fs, Py_ssize_t *result)
+{
+    /* Largest Py_ssize_t value, computed locally because
+       PY_SSIZE_T_MAX is not available on the older Python versions
+       this file still tries to support */
+    const Py_ssize_t maxval = (Py_ssize_t)
+        ((((size_t)1) << (sizeof(Py_ssize_t) * CHAR_BIT - 1)) - 1);
+    Py_ssize_t accumulator, digitval;
+    int numdigits;
+    accumulator = numdigits = 0;
+    for (;;fs->fmtstr.ptr++, numdigits++) {
+        if (fs->fmtstr.ptr >= fs->fmtstr.end)
+            break;
+        digitval = CH_TYPE_TODECIMAL(*fs->fmtstr.ptr);
+        if (digitval < 0)
+            break;
+        /*
+           Check for overflow before doing the arithmetic: signed
+           overflow is undefined behaviour in C, so it cannot be
+           detected reliably after the fact.
+        */
+        if (accumulator > (maxval - digitval) / 10) {
+            SetError("field width or index value too large");
+            return 0;
+        }
+        accumulator = accumulator * 10 + digitval;
+    }
+    *result = accumulator;
+    return numdigits;
+}
+
+/*
+    get_specifier collects the part of the format string between
+    the colon and the matching trailing }, leaving it in
+    fs->fieldspec.  If the specifier itself contains a {, it is
+    first expanded through recurse_format and the resulting string
+    object becomes the specifier.
+
+    Returns 1 on success, 0 on failure.
+*/
+static int
+get_specifier(FmtState *fs)
+{
+    CH_TYPE ch;
+    int depth = 1;          /* unmatched '{' seen, counting the field's own */
+    int nested = 0;         /* set once any '{' shows up in the specifier */
+
+    fs->fieldspec.ptr = fs->fmtstr.ptr;
+    while (depth > 0) {
+        if (!check_fmtstr(fs))
+            return 0;
+        ch = *fs->fmtstr.ptr++;
+        if (ch == '{') {
+            nested = 1;
+            depth++;
+        }
+        else if (ch == '}')
+            depth--;
+    }
+    /* ptr is now just past the closing '}', which is not part of the spec */
+    fs->fieldspec.end = fs->fmtstr.ptr - 1;
+
+    if (!nested)
+        return 1;
+
+    {
+        /* Temporarily make the specifier the format string, expand it,
+           then carry on after the field in the real format string */
+        SubString resume = fs->fmtstr;
+        PyObject *expanded;
+
+        fs->fmtstr.ptr = fs->fieldspec.ptr;
+        fs->fmtstr.end = fs->fieldspec.end;
+        expanded = recurse_format(fs);
+        if (expanded == NULL)
+            return 0;
+        fs->fieldspec = make_substrobj(expanded);
+        fs->fmtstr = resume;
+    }
+    return 1;
+}
+
+/*
+    get_field_object returns the object inside {}
+    It handles getindex and getattr lookups and consumes
+    the format string up to but not including the trailing
+    } or the optional : format specifier separator.
+
+    The lookup starts from fs->args when the field begins with a
+    decimal digit and from fs->keywords otherwise; each following
+    ".name" is an attribute lookup and each "[key]" an item lookup
+    on the previous result.
+
+    Returns a new reference, or NULL with an exception set.
+*/
+static PyObject *
+get_field_object(FmtState *fs)
+{
+    PyObject *myobj, *subobj, *newobj;
+    CH_TYPE c;
+    Py_ssize_t index = 0;
+    int isindex, expectclose, isnumeric, isargument;
+
+    if (!check_fmtstr(fs))
+        return NULL;
+    isnumeric = (CH_TYPE_ISDECIMAL(*fs->fmtstr.ptr));
+    myobj = isnumeric ? fs->args : fs->keywords;
+    if (myobj == NULL) {
+        /* NOTE(review): the entry points may already guarantee non-NULL
+           args/keywords; this guard only keeps Py_INCREF away from a
+           NULL pointer -- confirm against the callers. */
+        PyErr_SetString(isnumeric ? PyExc_IndexError : PyExc_KeyError,
+            "Format string refers to an argument that was not supplied");
+        return NULL;
+    }
+    Py_INCREF(myobj);
+
+    for (isindex=1, expectclose=0, isargument=1;;) {
+        if (!check_fmtstr(fs))
+            break;
+        if (!isindex) {
+            if ((subobj = get_python_identifier(fs, isargument)) == NULL)
+                break;
+            newobj = (isargument && fs->keywords_is_tuple)
+                       ? name_mapper(myobj, subobj)
+                       : PyObject_GetAttr(myobj, subobj);
+            Py_DECREF(subobj);
+        }
+        else {
+            isnumeric = (CH_TYPE_ISDECIMAL(*fs->fmtstr.ptr));
+            if (isnumeric) {
+                /* At least one digit is present, so a result of 0 can
+                   only mean that the number overflowed */
+                if (get_integer_index(fs, &index) <= 0)
+                    break;
+                /* Only as many arguments as positional_arg_set has bits
+                   are tracked; shifting by more would be undefined */
+                if (isargument &&
+                    index < (Py_ssize_t)(sizeof(fs->positional_arg_set)
+                                         * CHAR_BIT))
+                    fs->positional_arg_set = (int)
+                        ((unsigned int)fs->positional_arg_set
+                         & ~(1U << index));
+            }
+            if (isnumeric && PySequence_Check(myobj))
+                newobj = PySequence_GetItem(myobj, index);
+            else {
+                /* XXX -- do we need PyLong_FromLongLong?  Using ssizet, not int... */
+                subobj = isnumeric ?
+                          PyInt_FromLong(index) :
+                          get_python_identifier(fs, isargument);
+                if (subobj == NULL)
+                    break;
+                newobj = PyObject_GetItem(myobj, subobj);
+                Py_DECREF(subobj);
+            }
+        }
+        Py_DECREF(myobj);
+        myobj = newobj;
+        /* A failed lookup ends the walk at once.  Carrying a NULL
+           object forward would crash in the next lookup or in the
+           Py_DECREF on the error path below. */
+        if (myobj == NULL)
+            return NULL;
+        if (expectclose)
+            if  ((!check_fmtstr(fs)) || (*fs->fmtstr.ptr++ != ']')) {
+                SetError("Expected ]");
+                break;
+            }
+        if (!check_fmtstr(fs))
+            break;
+        c = *fs->fmtstr.ptr;
+        if ((c == '}') || (c == ':'))
+           return myobj;
+        fs->fmtstr.ptr++;
+        isargument = 0;
+        isindex = expectclose = (c == '[');
+        if (!isindex && (c != '.')) {
+           SetError("Expected ., [, :, or }");
+           break;
+        }
+    }
+    Py_DECREF(myobj);
+    return NULL;
+}
+/*
+    get_field_and_spec resets fs->fieldspec to an empty specifier,
+    fetches the field object, and then consumes either the closing }
+    or a : followed by the format specifier.
+
+    Returns a new reference to the field object, or NULL on failure.
+*/
+static PyObject *
+get_field_and_spec(FmtState *fs)
+{
+    PyObject *field;
+
+    /* Start out with an empty specifier that has no object of its own */
+    fs->fieldspec.obj = NULL;
+    fs->fieldspec.ptr = fs->fmtstr.ptr;
+    fs->fieldspec.end = fs->fmtstr.ptr;
+
+    field = get_field_object(fs);
+    if (field == NULL)
+        return NULL;
+
+    if (check_fmtstr(fs)) {
+        switch (*fs->fmtstr.ptr++) {
+        case '}':
+            return field;
+        case ':':
+            if (get_specifier(fs))
+                return field;
+            break;
+        default:
+            break;
+        }
+    }
+    Py_DECREF(field);
+    return NULL;
+}
+
+/*
+    user_format formats the current field by calling the supplied
+    __format__ callable with the field specifier and copying the
+    string it returns to the output.
+
+    Returns 1 on success and 0 on failure.
+*/
+static int
+user_format(FmtState *fs, PyObject *__format__)
+{
+    PyObject *spec = fs->fieldspec.obj;
+    PyObject *formatted;
+    int ok;
+
+    if (spec == NULL) {
+        /* The specifier is still only a slice of the format string;
+           turn it into a string object of its own */
+        spec = STROBJ_NEW(fs->fieldspec.ptr,
+                          fs->fieldspec.end - fs->fieldspec.ptr);
+        if (spec == NULL)
+            return 0;
+        fs->fieldspec.obj = spec;   /* Owned by our caller now */
+    }
+
+    /* XXX -- possible optimization to CallFunctionWithArgs */
+    formatted = PyObject_CallFunction(__format__, "(O)", spec);
+    if (formatted == NULL)
+        return 0;
+
+    if (STROBJ_CHECK(formatted))
+        ok = output_data(fs, STROBJ_AS_PTR(formatted),
+                             STROBJ_GET_SIZE(formatted));
+    else {
+        SetError("__format__ method did not return correct string type");
+        ok = 0;
+    }
+    Py_DECREF(formatted);
+    return ok;
+}
+
+typedef struct {
+    CH_TYPE fill_char;      /* fill character; '\0' until one is parsed */
+    CH_TYPE align;          /* alignment token; '\0' until one is parsed */
+    CH_TYPE sign;           /* sign element; '\0' until one is parsed */
+    Py_ssize_t width;       /* field width; -1 when not specified */
+    Py_ssize_t precision;   /* precision; initialised to -1 */
+    CH_TYPE type;           /* initialised to '\0' */
+} DefaultFormat;
+
+/*
+    parse_default_format parses the default specification held in
+    fs->fmtstr into *format.  The pieces are accepted in this order:
+
+        [[fill]align] [sign [')']] ['0'] [width] ['.' precision] [type]
+
+    Fields that are absent are reported as '\0' (characters) or -1
+    (width and precision).
+
+    Returns 1 on success.  Returns 0, with an error set via SetError,
+    if more than one character is left over where the type is
+    expected.
+
+    fs->fmtstr.ptr is advanced while parsing; the caller is expected
+    to save and restore fs->fmtstr around this call.
+*/
+
+static int
+parse_default_format(FmtState *fs, DefaultFormat *format)
+{
+    Py_ssize_t index = 0;
+    Py_ssize_t consumed;
+    Py_ssize_t remaining;
+    Py_ssize_t spec_len;
+    SubString *spec = &fs->fmtstr;
+
+    format->fill_char = '\0';
+    format->align = '\0';
+    format->sign = '\0';
+    format->width = -1;
+    format->precision = -1;
+    format->type = '\0';
+
+    /* cache the length, since that's convenient */
+    spec_len = spec->end - spec->ptr;
+
+    /* If the second char is an alignment token,
+       then parse the fill char */
+    if (spec_len >= 2 && alignment_token(spec->ptr[1])) {
+        format->align = spec->ptr[1];
+        format->fill_char = spec->ptr[0];
+        index = 2;
+    } else if (spec_len >= 1 && alignment_token(spec->ptr[0])) {
+        format->align = spec->ptr[0];
+        index = 1;
+    }
+
+    /* Parse the various sign options */
+    if (index < spec_len && sign_element(spec->ptr[index])) {
+        format->sign = spec->ptr[index];
+        index++;
+        if (index < spec_len && spec->ptr[index] == ')') {
+            index++;
+        }
+    }
+
+    /* The special case for 0-padding (backwards compat) */
+    if (format->fill_char == '\0' && index < spec_len &&
+        spec->ptr[index] == '0') {
+        format->fill_char = '0';
+        if (format->align == '\0') {
+            format->align = '=';
+        }
+        index++;
+    }
+
+    /* get_integer_index() works on fs->fmtstr.ptr directly (it reads
+       the digits there and moves the pointer past them), so commit
+       everything consumed so far to the pointer before calling it.
+       From here on the pointer itself is the parse position; keeping
+       a separate index would go stale as soon as the pointer moves. */
+    spec->ptr += index;
+
+    /* if no characters were consumed for the width, reset it to -1,
+       because get_integer_index() will have set it to zero */
+    consumed = get_integer_index(fs, &format->width);
+    if (consumed <= 0) {
+        format->width = -1;
+    }
+
+    /* Parse field precision */
+    if (spec->ptr < spec->end && *spec->ptr == '.') {
+        spec->ptr++;
+
+        consumed = get_integer_index(fs, &format->precision);
+
+        /* again, check if any characters specified */
+        if (consumed <= 0) {
+            format->precision = -1;
+        }
+    }
+
+    /* Finally, parse the type field: at most one character may remain */
+    remaining = spec->end - spec->ptr;
+    if (remaining > 1) {
+        /* invalid conversion spec */
+        SetError("Invalid conversion specification");
+        return 0;
+    }
+
+    if (remaining == 1) {
+        format->type = *spec->ptr;
+    }
+
+    return 1;
+}
+
+
+/* conversion functions
+
+   Each conversion function writes the converted form of fieldobj to
+   the output held in fs and returns the number of characters written,
+   or -1 on failure.  The functions below are placeholders: they
+   always fail, and they set an error via SetError so that the failure
+   does not reach the caller as an error return with no exception
+   set. */
+static Py_ssize_t
+convert_binary(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("binary conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_char(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("character conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_decimal(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("decimal conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_exponent(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("exponent conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_exponentUC(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("upper-case exponent conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_fixed(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("fixed-point conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_fixedUC(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("upper-case fixed-point conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_general(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("general conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_generalUC(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("upper-case general conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_number(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("number conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_octal(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("octal conversion is not implemented");
+    return -1;
+}
+
+/*
+    convert_repr writes the repr() of fieldobj to the output.
+    Returns the number of characters written, or -1 on failure
+    (repr() failed, the conversion to the output string type failed,
+    or the output could not be written).  The sign argument is not
+    used.
+*/
+static Py_ssize_t
+convert_repr(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    int ok;
+    PyObject *s;
+    PyObject *r;
+    Py_ssize_t len;
+
+    r = PyObject_Repr(fieldobj);
+    if (r == NULL)
+        return -1;      /* 0 would read as "wrote nothing, no error" */
+
+    /* Bring the repr into the string type this file is built for */
+    s = STROBJ_STR(r);
+    Py_DECREF(r);
+    if (s == NULL)
+        return -1;
+
+    len = STROBJ_GET_SIZE(s);
+    ok = output_data(fs, STROBJ_AS_PTR(s), len);
+    Py_DECREF(s);
+
+    return ok ? len : -1;
+}
+
+/*
+    convert_string writes the string form of fieldobj to the output.
+    Returns the number of characters written, or -1 on failure.
+    The sign argument is not used.
+*/
+static Py_ssize_t
+convert_string(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    PyObject *text = STROBJ_STR(fieldobj);
+    Py_ssize_t nchars;
+    Py_ssize_t written;
+
+    if (text == NULL)
+        return -1;
+
+    nchars = STROBJ_GET_SIZE(text);
+    written = output_data(fs, STROBJ_AS_PTR(text), nchars);
+    Py_DECREF(text);
+
+    if (!written)
+        return -1;
+    return nchars;
+}
+
+/* Placeholder conversions: these always fail with -1, and set an
+   error via SetError so the failure is never reported to the caller
+   as an error return with no exception set. */
+static Py_ssize_t
+convert_hex(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("hex conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_hexUC(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("upper-case hex conversion is not implemented");
+    return -1;
+}
+
+static Py_ssize_t
+convert_percentage(PyObject *fieldobj, FmtState *fs, CH_TYPE *sign)
+{
+    SetError("percentage conversion is not implemented");
+    return -1;
+}
+
+/*
+    default_format -- "Then a miracle occurs"
+
+    Renders fieldobj using the built-in formatting rules.  Returns 1
+    on success and 0 on failure.
+
+    When DUMMY_FORMATTING == 1 a test implementation is compiled: the
+    string form of the object is written to the output and, if the
+    field specifier is not empty, an underscore followed by the raw
+    specifier text is appended.
+
+    Otherwise the specifier is parsed with parse_default_format.  If
+    it names no type, one is chosen from the object: 'd' for int and
+    long, 'g' for float, 's' for everything else.  The matching
+    conversion function is looked up and called to write the
+    converted text.  The sign prefix/suffix and the padding amount
+    are then computed but are not yet applied to the output: the
+    padding branch only holds pseudo-code inside an "#if 0" region.
+*/
+static int default_format(FmtState *fs, PyObject *fieldobj)
+{
+#if DUMMY_FORMATTING == 1
+    PyObject *myobj;
+    int ok;
+
+    /* Test implementation, only at top level */
+    CH_TYPE under = '_';
+
+    myobj = STROBJ_STR(fieldobj);
+    if (myobj == NULL)
+        return 0;
+    ok = output_data(fs, STROBJ_AS_PTR(myobj),
+                         STROBJ_GET_SIZE(myobj));
+    Py_DECREF(myobj);
+    if (!ok)
+        return 0;
+    if (fs->fieldspec.ptr != fs->fieldspec.end) {
+        if (!output_data(fs, &under, 1))
+            return 0;
+        if (!output_data(fs, fs->fieldspec.ptr,
+                             fs->fieldspec.end - fs->fieldspec.ptr))
+            return 0;
+    }
+#else
+
+    Py_ssize_t len;
+    Py_ssize_t padding;
+    DefaultFormat format;
+    ConversionFunction conversion;
+    CH_TYPE sign = '\0';
+    CH_TYPE prefix;
+    CH_TYPE suffix;
+
+    if (!parse_default_format(fs, &format)) {
+        return 0;
+    }
+
+    /* if no type character was specified, look up the default type character, based on the type of our object */
+    if (format.type == '\0') {
+        if (PyInt_Check(fieldobj) || PyLong_Check(fieldobj)) {
+            format.type = 'd';
+        } else if (PyFloat_Check(fieldobj)) {
+            format.type = 'g';
+        } else {
+            format.type = 's';
+        }
+    }
+
+    /* handle conversion functions that logically map to other conversion functions? */
+
+    conversion = conversion_function(format.type);
+    if (conversion == NULL) {
+        SetError("Invalid conversion character");
+        return 0;
+    }
+
+    /* convert to a string first */
+    /* get the length written so that we can fixup inside the buffer, as needed */
+    len = conversion(fieldobj, fs, &sign);
+    if (len < 0)
+        return 0;
+
+    /* we wrote "len" bytes.  see what fixups need to be done */
+
+    /* Handle the sign logic */
+    prefix = '\0';
+    suffix = '\0';
+    if (sign == '-') {
+        if (format.sign == '(') {
+            prefix = '(';
+            suffix = ')';
+        } else {
+            prefix = '-';
+        }
+    } else if (sign == '+') {
+        if (format.sign == '+') {
+            prefix = '+';
+        } else if (format.sign == ' ') {
+            prefix = ' ';
+        }
+    }
+
+    /* Handle the padding logic */
+    if (format.width != -1) {
+        padding = format.width - len - (prefix == '\0' ? 0 : 1) - (suffix == '\0' ? 0 : 1);
+        if (padding > 0) {
+#if 0
+            if align == '>' or align == '^':
+                return fill_char * padding + prefix + result + suffix
+            elif align == '='
+                return prefix + fill_char * padding + result + suffix
+            else:
+                return prefix + result + suffix + fill_char * padding
+
+#endif
+                    }
+    }
+
+#endif
+    return 1;
+}
+
+/*
+    renderfield determines if the field object has a defined __format__
+    method, and dispatches to the appropriate subfunction.
+
+    If the object has a __format__ attribute, user_format is called
+    with it.  If the lookup fails with AttributeError, the error is
+    cleared and default_format is used, with fs->fmtstr temporarily
+    pointed at the field specifier.  Any other error raised by the
+    lookup is left set and reported as a failure instead of being
+    silently discarded.
+
+    Returns non-zero on success and 0 on failure.
+*/
+static int
+renderfield(FmtState *fs, PyObject *fieldobj)
+{
+    int result;
+    SubString savefmt;
+    PyObject *format_method;
+
+    format_method = PyObject_GetAttrString(fieldobj, "__format__");
+    if (format_method != NULL) {
+        result = user_format(fs, format_method);
+        Py_DECREF(format_method);
+        return result;
+    }
+
+    /* Only a missing attribute means "use the default formatting";
+       anything else is a genuine error and must reach the caller. */
+    if (!PyErr_ExceptionMatches(PyExc_AttributeError))
+        return 0;
+    PyErr_Clear();  /* For GetAttr __format__ */
+
+    /* default_format parses the specifier through fs->fmtstr, so
+       point it at the specifier and put it back afterwards */
+    savefmt = fs->fmtstr;
+    fs->fmtstr.ptr = fs->fieldspec.ptr;
+    fs->fmtstr.end = fs->fieldspec.end;
+    result = default_format(fs, fieldobj);
+    fs->fmtstr = savefmt;
+    return result;
+}
+
+/*
+    do_format is the main program loop.  It rummages through
+    the format string, looking for escapes to markup, and
+    calls other functions to move non-markup text to the output,
+    and to perform the markup to the output.
+
+    A doubled brace ("{{" or "}}") is copied to the output as a
+    single brace.  A single '{' starts a replacement field, which is
+    resolved by get_field_and_spec and written by renderfield.  A
+    single '}' or a brace as the very last character is an error.
+
+    Returns a new string object holding the formatted text, or NULL
+    on failure.  A failure to write literal text to the output stops
+    the formatting instead of being ignored.
+*/
+static PyObject *
+do_format(FmtState *fs)
+{
+    PyObject *myobj;
+    CH_TYPE c, *start;
+    Py_ssize_t count, total;
+    SubString fmtstr;
+    int doubled, ok;
+
+    fmtstr = fs->fmtstr;
+    count = fmtstr.end - fmtstr.ptr;
+    myobj = STROBJ_NEW(NULL, count + INITIAL_SIZE_INCREMENT);
+    if (myobj == NULL)
+        return NULL;
+    fs->outstr = make_substrobj(myobj);
+    fs->size_increment = INITIAL_SIZE_INCREMENT;
+
+    ok = 1;
+    c = '\0';  /* Avoid compiler warning */
+    while (fmtstr.ptr < fmtstr.end) {
+        /* Scan the literal text up to the next brace (or the end) */
+        start = fmtstr.ptr;
+        count = total = fmtstr.end - start;
+        while (count && ((c = *fmtstr.ptr) != '{') && (c != '}')) {
+            fmtstr.ptr++;
+            count--;
+        }
+        /* count: literal characters scanned;
+           total: characters left, starting at the brace (0 at end) */
+        count = total - count;
+        total -= count;
+        doubled = (total > 1) && (fmtstr.ptr[1] == c);
+        if (doubled) {
+            /* Emit the literal text plus one of the two braces */
+            if (!output_data(fs, start, count+1)) {
+                ok = 0;
+                break;
+            }
+            fmtstr.ptr += 2;
+            continue;
+        }
+        if (count && !output_data(fs, start, count)) {
+            ok = 0;
+            break;
+        }
+        if (total < 2) {
+            /* Either the end of the string, or a lone trailing brace */
+            if (total) {
+                SetError("Invalid format string -- { or } at end");
+                ok = 0;
+            }
+            break;
+        }
+        if (c == '}') {
+            SetError("Invalid format string -- single } encountered");
+            ok = 0;
+            break;
+        }
+        /* Replacement field: parsing continues through fs->fmtstr */
+        fs->fmtstr.ptr = fmtstr.ptr + 1;
+        myobj = get_field_and_spec(fs);
+        ok = (myobj != NULL) && renderfield(fs, myobj);
+        Py_XDECREF(fs->fieldspec.obj);
+        Py_XDECREF(myobj);
+        if (!ok)
+             break;
+        fmtstr.ptr = fs->fmtstr.ptr;
+    }
+    myobj = fs->outstr.obj;
+    if (ok) {
+        /* Trim the output object to the part actually written */
+        count = fs->outstr.ptr - STROBJ_AS_PTR(myobj);
+        if (STROBJ_RESIZE(&myobj, count) >= 0)
+            return myobj;
+    }
+    Py_XDECREF(myobj);
+    return NULL;
+}
+
+/*
+    recurse_format is called for nested format specifiers,
+    e.g. {1:{2}}.  It saves off the current output string and size
+    increment, recursively calls do_format, and restores them.
+
+    Returns the result of do_format, or NULL with an error set when
+    the recursion limit is exhausted.  fs->max_recursion is left
+    unchanged on every path, including the error path.
+*/
+static PyObject *
+recurse_format(FmtState *fs)
+{
+    PyObject *result;
+    SubStringObj saveoutstr = fs->outstr;
+    int saveincrement = fs->size_increment;
+
+    /* Test before decrementing so the counter stays balanced */
+    if (fs->max_recursion <= 0)
+        return SetError("Max string recursion exceeded");
+
+    fs->max_recursion--;
+    result = do_format(fs);
+    fs->max_recursion++;
+
+    fs->outstr = saveoutstr;
+    fs->size_increment = saveincrement;
+    return result;
+}
+
+/*
+    STROBJ_FORMAT (actually PyUnicode_FormatMethod or PyString_FormatMethod)
+    is the public interface to the module.  It sets up a FmtState
+    over the characters of self, records the positional and keyword
+    arguments, and runs do_format, whose result it returns.
+
+    XXX -- do we need to check input types here, or are we guaranteed
+       they are right????
+*/
+PyObject *
+STROBJ_FORMAT(PyObject *self, PyObject *args, PyObject *keywords)
+{
+    FmtState state;
+
+    /* The format string is the full contents of self */
+    state.fmtstr.ptr = STROBJ_AS_PTR(self);
+    state.fmtstr.end = state.fmtstr.ptr + STROBJ_GET_SIZE(self);
+
+    /* Arguments that replacement fields may refer to */
+    state.args = args;
+    state.keywords = keywords;
+    state.keywords_is_tuple = 0;
+
+    /* Initial settings and bookkeeping */
+    state.max_recursion = 4;
+    state.allow_leading_under = 1;
+    state.positional_arg_set = 0;
+    state.keyword_arg_set = NULL;
+
+    return do_format(&state);
+}
+
+#ifdef __cplusplus
+}
+#endif


More information about the Python-checkins mailing list