data file auto-parsing

Sun Mar 9 21:04:15 EST 2003

On Wed, 5 Mar 2003 23:21:53 -0800, "Hilbert" <hilbert at microsoft.com>
wrote:

>Hello,
>
>Is there a parser that would convert python-like
>indented data files into data structures? Like:
>
>#--------------
>numbers
>    1
>    4
>    5
>colors
>    light
>        red
>        green
>    dark
>        blue
>#--------------
>
>Would be parsed into a list like:
>
>[    ['numbers',['1','4','5']],
>     ['colors',['light',['red','green']],
>                  ['dark',['blue']] ] ]
>
>Am I crazy?
>Thanks,
>Hilbert
>
>

I once did something like this for work.  Here it is (hopefully
cleaned up a bit)...

Manuel

# ###################################### #
# datafile_parsing00.py

import shlex
import StringIO
import re
import sys
import pprint
import types
import traceback

# ###################################### #

class IndentStream:
    """File-like wrapper that prefixes every written line with an indent.

    indent -- either the indent string itself, or the number of spaces
              to indent by.
    stream -- object with a write() method that receives the indented
              text; defaults to sys.stdout (looked up at construction).

    Only write() is provided.
    """

    def __init__(self, indent='    ', stream=None):
        if stream is None:
            self._stream = sys.stdout
        else:
            self._stream = stream
        # An integer becomes that many spaces; a string cannot be
        # multiplied by ' ' (TypeError) and is used unchanged.
        try:
            indent = ' ' * indent
        except TypeError:
            pass
        self._indent = indent
        self._nl_indent = '\n' + indent
        self._indent_size = len(indent)
        # True when the previous write() stopped in the middle of a
        # line, so the next write() must not emit the indent again.
        self._comma = 0

    def write(self, s):
        """Write s to the wrapped stream, indenting each line of it."""
        comma = not s.endswith('\n')
        s = s.replace('\n', self._nl_indent)
        if not self._comma:
            s = self._indent + s
        # The replace() above also put an indent after a trailing
        # newline; drop it so the indent is only emitted once the next
        # line really starts.  A zero-width indent needs no trimming,
        # and s[:-0] would wrongly discard the whole string.
        if self._indent_size and s.endswith(self._nl_indent):
            s = s[:-self._indent_size]
        self._stream.write(s)
        self._comma = comma

def PrintEvalExec(s, stream=None):
    """Print and eval or exec lines in s.

    Each line is considered individually,
    so not suitable for indented code.
    Uses sys._getframe(1) so names are resolved (and assignments made)
    in the caller's globals and locals.
    Tries to keep output lines less than 75 characters long.

    s      -- source text; one expression or statement per line.
              Blank lines are skipped, leading whitespace is stripped.
    stream -- object with a write() method; defaults to sys.stdout.

    For each line the output is one of:
        <line>                                (statement, exec'd)
        <line> -> <repr of value>             (short result)
        <line> ->                             (long result, followed by
                                               an indented pprint)
        <line> -> <multi-line string...>      (followed by the string,
                                               indented)

    WARNING: every line is passed to eval()/exec(); only call this
    with trusted text.
    NOTE(review): when the caller is a function, assignments made
    through its f_locals may not rebind the caller's real local
    variables -- confirm before relying on that.
    """

    if stream is None:
        stream = sys.stdout

    # types.StringTypes only exists on Python 2; fall back to str.
    string_types = getattr(types, 'StringTypes', (str,))

    _indentstream = IndentStream(4, stream=stream)
    _pprint = pprint.PrettyPrinter(width=71,
                                   stream=_indentstream).pprint

    f = sys._getframe(1)
    for e in s.split('\n'):
        e = e.lstrip()
        if not e:
            continue
        try:
            r = eval(e, f.f_globals, f.f_locals)
        except SyntaxError:
            # Not an expression: run it as a statement and echo it.
            # The tuple form of exec is accepted by Python 2 and 3.
            exec(e, f.f_globals, f.f_locals)
            stream.write(e + '\n')
            continue

        if isinstance(r, string_types) and r.count('\n'):
            stream.write('%s -> <multi-line string...>\n' % (e,))
            _indentstream.write(r + '\n')
        else:
            x = '%s -> %r' % (e, r)
            if len(x) > 75:
                # Too wide for one line: pretty-print on following lines.
                stream.write('%s ->\n' % (e,))
                _pprint(r)
            else:
                stream.write(x + '\n')

# ###################################### #

# Width, in spaces, that a tab counts for when measuring indentation.
_TABSPACES = ' ' * 4

# ###################################### #

# group(1): leading whitespace, group(2): everything after it.
_re_indent_rest = re.compile(r'(\s*)(.*)')

class shlexError(Exception):
    """Error in shlex"""
    pass

def _line_to_indent_tokenlist(L):
    """Split one line into its indent width and a tuple of tokens.

    L -- a single line of text (no embedded newline expected).

    Returns (indent, tokens):
      indent -- number of leading whitespace characters, each tab
                counted as len(_TABSPACES) characters.
      tokens -- tuple of strings produced by shlex in its default
                (non-POSIX) mode, so quoted tokens keep their quotes.
                Empty for blank or comment-only lines.

    Makes an attempt to strip # comments.

    Raises shlexError (after printing the traceback) if shlex cannot
    tokenize the line, e.g. because of an unterminated quote.
    """
    re0 = _re_indent_rest.match(L)
    # Expand tabs only in the indentation: expanding them across the
    # whole line would also rewrite tabs inside quoted tokens.
    indent = len(re0.group(1).replace('\t', _TABSPACES))
    # shlex accepts the text directly (strings allowed since Python
    # 2.3), so no StringIO wrapper is needed.
    shlex0 = shlex.shlex(re0.group(2))
    tokens = []
    while 1:
        try:
            t = shlex0.get_token()
        except Exception:
            # Exception, not a bare except: KeyboardInterrupt and
            # SystemExit must not be turned into a shlexError.
            traceback.print_exc()
            err_last = traceback.extract_tb(sys.exc_info()[2])[-1]
            raise shlexError(
                'shlex gagged during get_token()',
                'shlex.token:%r shlex.lineno:%r' % (shlex0.token,
                                                    shlex0.lineno),
                err_last)
        # An empty token signals end of input.
        if not t or t.startswith('#'):
            break
        tokens.append(t)
    return indent, tuple(tokens)

def _simplify_list(list0):
    """Unwrap, in place, every one-element entry of list0.

    An entry of length 1 is replaced by its sole item; entries of any
    other length are left as they are.  list0 itself is modified (its
    identity is kept) and nothing is returned.

    Every entry must support len() and indexing, and none may be
    empty.
    """
    # Build the replacement completely before touching list0.
    unwrapped = [entry[0] if len(entry) == 1 else entry
                 for entry in list0]
    list0[:] = unwrapped

def ProcessInput(s):
    """Indented lines into nested lists.

    Process lines of data, separated by new-lines.

    s -- the text to parse.  Lines with no tokens (blank or
         comment-only) are ignored.

    Returns a list with one entry per top-level line.  An entry is
    either the line's item alone (no children) or [item, children],
    where children is a list built by the same rule.  An item is a
    string for a single-token line and a tuple of strings for a line
    with several tokens.

    A line becomes a child of the nearest preceding line whose indent
    is strictly smaller; if there is none it is a top-level line.
    """

    processed = [] # rough form of nested list we desire
    parent_list = [] # list of possible parents
    all_p_lines = [] # all p_lines put here too, for later cleaning

    for L in s.split('\n'):

        indent, tokens = _line_to_indent_tokenlist(L)
        if not tokens:
            continue

        # if only have one token in 'tokens',
        # just get rid of un-needed tuple
        if len(tokens) < 2:
            tokens = tokens[0]

        # working record: [indent, item, children]
        p_line = [indent, tokens, []]

        # Remove possible parents with an indent greater than
        # (or equal to) the indent we are currently working with.
        while parent_list and parent_list[-1][0] >= indent:
            del parent_list[-1]

        if not parent_list:
            # if no possible parents,
            # should be in 'processed' at top level
            processed.append(p_line)
        else:
            # otherwise, insert it
            # easiest way to insert it is to find parent at
            # the end of 'parent_list', and 'processed' is
            # automagically updated at the same time
            # (because it is the same object in 'parent_list' and in
            # 'processed')
            parent_list[-1][2].append(p_line)

        # and make current p_line a possible parent
        parent_list.append(p_line)

        # add to list of all p_lines (for more sneakiness later)
        all_p_lines.append(p_line)

    # now clean up!
    # when we clean 'all_p_lines', 'processed' automagically gets
    # cleaned too!
    # Last line first, so children are cleaned before their parents.
    all_p_lines.reverse()
    for x in all_p_lines:
        del x[0] # no longer need indent information
        if not x[1]:
            # if no children, get rid of children list
            del x[1]
        else:
            _simplify_list(x[1])

    _simplify_list(processed)

    return processed

# ###################################### #

def _printevalexec(s):
    """Run PrintEvalExec on s, writing through a 4-space IndentStream.

    The IndentStream is created without a stream argument, so output
    goes to sys.stdout.  PrintEvalExec looks one frame up
    (sys._getframe(1)), so the lines of s are evaluated with this
    function's globals and locals; keep this body free of extra local
    names.
    """
    PrintEvalExec(s, stream=IndentStream(4))

# Example 1: the data from the original question.
_test_input = """#--------------
numbers
    1
    4
    5
colors
    light
        red
        green
    dark
        blue
#--------------"""

# print(...) with one parenthesised string prints the same text on
# Python 2 and is also valid Python 3 syntax.
print('\n*** Example 1 ***\n')
_printevalexec('_test_input')
_printevalexec('ProcessInput(_test_input)')

# Example 2: indented first line, a blank line, several tokens on one
# line, quoted tokens and trailing comments.
_test_input = """#--------------
        junk0 # this comment should be ignored

junk1 junk2 junk3 'string #1' # as should this comment
    0
    1
        '1.0'
        '1.1'
        '1.2'
    2 2A 2B
    3
#--------------"""

print('\n*** Example 2 ***\n')
_printevalexec('_test_input')
_printevalexec('ProcessInput(_test_input)')

------------------------------------------------

*** Example 1 ***

    _test_input -> <multi-line string...>
        #--------------
        numbers
            1
            4
            5
        colors
            light
                red
                green
            dark
                blue
        #--------------
    ProcessInput(_test_input) ->
        [['numbers', ['1', '4', '5']],
         ['colors', [['light', ['red', 'green']], ['dark', ['blue']]]]]

*** Example 2 ***

    _test_input -> <multi-line string...>
        #--------------
                junk0 # this comment should be ignored

        junk1 junk2 junk3 'string #1' # as should this comment
            0
            1
                '1.0'
                '1.1'
                '1.2'
            2 2A 2B
            3
        #--------------
    ProcessInput(_test_input) ->
        ['junk0',
         [('junk1', 'junk2', 'junk3', "'string #1'"),
          ['0', ['1', ["'1.0'", "'1.1'", "'1.2'"]], ('2', '2A', '2B'), '3']]]