Enumerating formatting strings

Thu Apr 21 16:47:51 EDT 2005

Steve Holden wrote:
> Michael Spencer wrote:
> 
>> Andrew Dalke wrote:
>>
>>> I see you assume that only \w+ can fit inside of a %()
>>> in a format string.  The actual Python code allows anything
>>> up to the balanced closed parens.
>>>
>> Gah! I guess that torpedoes the regexp approach, then.
>>
>> Thanks for looking at this
>>
>> Michael
>>
> While Andrew may have found the "fatal flaw" in your scheme, it's worth 
> pointing out that it works just fine for my original use case.
> 
> regards
>  Steve

Thanks.  Here's a version that overcomes the 'fatal' flaw.

class StringFormatInfo(object):
    """Parse a %-style format template and report the arguments it expects.

    After construction the instance exposes:

    template      -- the template string passed in
    format_type   -- "MAPPING" if any specifier carries a %(name) key,
                     otherwise "POSITIONAL"
    formats       -- list of dicts, one per conversion specifier, with the
                     keys 'name', 'conversion', 'width', 'precision',
                     'lengthmodifier' and 'type'
    format_names  -- for MAPPING templates, a dict mapping each name to its
                     type character; for POSITIONAL templates, a tuple with
                     one entry per expected argument ('width' / 'precision'
                     for every '*', followed by the type character)
    """

    def __init__(self, template):
        """Store *template* and parse it immediately."""
        self.template = template
        self.parse()

    def tokenizer(self):
        """Yield one dict per conversion specifier found in the template.

        Sets ``self.format_type`` as a side effect: it starts out as
        "POSITIONAL" and switches to "MAPPING" as soon as a specifier with a
        parenthesised name is seen.  A literal '%%' is skipped.

        Raises ValueError when a '%' is not followed by a recognised
        conversion type.  Errors raised by the lexer propagate unchanged.
        """
        lexer = TinyLexer(self.template)
        self.format_type = "POSITIONAL"
        while lexer.search(r"%"):
            if lexer.match(r"%"):
                # '%%' is an escaped percent sign, not a specifier.
                continue
            spec = {}
            name = lexer.takeparens()
            if name is not None:
                self.format_type = "MAPPING"
            spec['name'] = name
            # Any number of flag characters may appear, including the
            # space flag, e.g. '%-+5d' or '% d'.  None when absent.
            spec['conversion'] = lexer.match(r"[#0\- +]+")
            spec['width'] = lexer.match(r"\d+|\*")
            precision = None
            if lexer.match(r"\."):
                # A bare '.' with no digits leaves the precision as None.
                precision = lexer.match(r"\d+|\*")
            spec['precision'] = precision
            spec['lengthmodifier'] = lexer.match(r"[hlL]")
            ftype = lexer.match(r"[diouxXeEfFgGcrs]")
            if not ftype:
                raise ValueError(
                    "missing or unsupported conversion type at position "
                    "%d of %r" % (lexer.ptr, self.template))
            spec['type'] = ftype
            yield spec

    def parse(self):
        """Tokenize the template and populate ``formats``/``format_names``."""
        self.formats = formats = list(self.tokenizer())
        if self.format_type == "MAPPING":
            self.format_names = dict((spec['name'], spec['type'])
                                     for spec in formats)
        else:
            expected = []
            for spec in formats:
                # Each '*' consumes one extra positional argument that
                # precedes the value being formatted.
                if spec['width'] == '*':
                    expected.append('width')
                if spec['precision'] == '*':
                    expected.append('precision')
                expected.append(spec['type'])
            self.format_names = tuple(expected)

    def __mod__(self, values):
        """Apply the underlying template to *values* (``template % values``)."""
        return self.template % values

    def __repr__(self):
        """Return the format type, the template and the expected arguments."""
        return "%s Template: %s\nArguments: %s" % \
            (self.format_type, self.template, self.format_names)
    __str__ = __repr__

# Short alias for StringFormatInfo; tests() below uses it.
SFI = StringFormatInfo

def tests():
    """Print the parse results for three sample templates.

    Covers a mapping template (with '%%', a '*' precision, an empty name
    and an unnamed specifier), a positional template using '*' width and
    precision, and a mapping key containing nested parentheses.
    """
    # print(x) with a single argument behaves the same on Python 2 and 3.
    print(SFI('%(arg1)s %% %(arg2).*f %()s %s'))
    print(SFI('%s %*.*d %*s'))
    print(SFI('%(this(is)a.--test!)s'))

import re

class TinyLexer(object):
    """Minimal cursor-based lexer over a string.

    Attributes:
    text      -- the string being scanned
    ptr       -- index of the next unread character
    len       -- len(text)
    re_cache  -- pattern string -> compiled regular expression
    """

    # Type(s) treated as an uncompiled pattern; ``basestring`` exists only
    # on Python 2.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, text):
        """Start scanning *text* from its first character."""
        self.text = text
        self.ptr = 0
        self.len = len(text)
        self.re_cache = {}

    def match(self, regexp, consume=True, anchor=True):
        """Try *regexp* against the text from the current position.

        regexp  -- a pattern string (compiled once and cached) or an
                   already compiled pattern object
        consume -- if true, advance ``ptr`` past the matched text
        anchor  -- if true, the match must start exactly at ``ptr``;
                   otherwise the first match at or after ``ptr`` is used

        Returns the matched text, or None when there is no match.
        """
        if isinstance(regexp, self._string_types):
            compiled = self.re_cache.get(regexp)
            if compiled is None:
                compiled = self.re_cache[regexp] = re.compile(regexp)
            regexp = compiled
        if anchor:
            found = regexp.match(self.text, self.ptr)
        else:
            found = regexp.search(self.text, self.ptr)
        if not found:
            return None
        if consume:
            self.ptr = found.end()
        return found.group()

    def search(self, regexp, consume=True):
        """Find the first match of *regexp* at or after the current position.

        Behaves like ``match(..., anchor=False)``; *consume* is honoured, so
        ``consume=False`` peeks without moving ``ptr``.
        """
        return self.match(regexp, consume=consume, anchor=False)

    def takeparens(self):
        """Consume a balanced '(...)' group starting at the current position.

        Returns the text between the outermost parentheses (nested
        parentheses are kept verbatim) and leaves ``ptr`` just past the
        closing ')'.  Returns None, without moving ``ptr``, if the current
        position is at the end of the text or is not a '('.

        Raises ValueError if the group is never closed; ``ptr`` is then left
        at the end of the text.
        """
        start = self.ptr
        if start >= self.len or self.text[start] != '(':
            return None
        depth = 0
        for pos in range(start, self.len):
            char = self.text[pos]
            if char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth == 0:
                    self.ptr = pos + 1
                    return self.text[start + 1:pos]
        self.ptr = self.len
        raise ValueError("Unmatched parentheses")