[Chicago] is there really no built-in file/iter split() thing?

Fri Nov 30 23:30:24 CET 2007

This works, sort of... it doesn't tokenize anything just yet; it only
chunks the input into lists of characters, splitting after each ';'.

def chunker(file):
    """Lazily split the contents of *file* into chunks ending at ';'.

    *file* is any iterable of strings (an open text file, a list of
    lines, ...).  Each yielded chunk is a list of single characters; the
    terminating ';' is included as the last element.  Chunks may span
    several lines, and newlines are kept as ordinary characters.

    Characters that follow the final ';' are yielded as one last chunk
    (without a terminating ';') rather than being silently discarded.
    Empty input yields nothing.
    """
    chunk = []
    for line in file:
        for c in line:  # read each character from the line
            chunk.append(c)
            if c == ';':
                yield chunk
                # Start a fresh list so the chunk already handed to the
                # caller is never mutated afterwards.
                chunk = []
    # Flush whatever is left after the last ';' so no input is lost.
    if chunk:
        yield chunk

I refuse to write a test.

In [64]: for c in chunker(open('./autoused/vendor/rails/activerecord/ 
test/fixtures/db_definitions/sqlite.sql')):
    ....:     print c
    ....:
    ....:
['C', 'R', 'E', 'A', 'T', 'E', ' ', 'T', 'A', 'B', 'L', 'E', ' ',  
"'", 'a', 'c', 'c', 'o', 'u', 'n', 't', 's', "'", ' ', '(', '\n', '  
', ' ', "'", 'i', 'd', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E', 'R', '  
', 'P', 'R', 'I', 'M', 'A', 'R', 'Y', ' ', 'K', 'E', 'Y', ' ', 'N',  
'O', 'T', ' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', ' ', "'", 'f',  
'i', 'r', 'm', '_', 'i', 'd', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E',  
'R', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L',  
',', '\n', ' ', ' ', "'", 'c', 'r', 'e', 'd', 'i', 't', '_', 'l',  
'i', 'm', 'i', 't', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E', 'R', ' ',  
'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L', '\n',  
')', ';']
['\n', '\n', 'C', 'R', 'E', 'A', 'T', 'E', ' ', 'T', 'A', 'B', 'L',  
'E', ' ', "'", 'f', 'u', 'n', 'n', 'y', '_', 'j', 'o', 'k', 'e', 's',  
"'", ' ', '(', '\n', ' ', ' ', "'", 'i', 'd', "'", ' ', 'I', 'N',  
'T', 'E', 'G', 'E', 'R', ' ', 'P', 'R', 'I', 'M', 'A', 'R', 'Y', ' ',  
'K', 'E', 'Y', ' ', 'N', 'O', 'T', ' ', 'N', 'U', 'L', 'L', ',',  
'\n', ' ', ' ', "'", 'n', 'a', 'm', 'e', "'", ' ', 'T', 'E', 'X',  
'T', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L',  
'\n', ')', ';']
['\n', '\n', 'C', 'R', 'E', 'A', 'T', 'E', ' ', 'T', 'A', 'B', 'L',  
'E', ' ', "'", 'c', 'o', 'm', 'p', 'a', 'n', 'i', 'e', 's', "'", ' ',  
'(', '\n', ' ', ' ', "'", 'i', 'd', "'", ' ', 'I', 'N', 'T', 'E',  
'G', 'E', 'R', ' ', 'P', 'R', 'I', 'M', 'A', 'R', 'Y', ' ', 'K', 'E',  
'Y', ' ', 'N', 'O', 'T', ' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', '  
', "'", 't', 'y', 'p', 'e', "'", ' ', 'V', 'A', 'R', 'C', 'H', 'A',  
'R', '(', '2', '5', '5', ')', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T',  
' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', ' ', "'", 'r', 'u', 'b',  
'y', '_', 't', 'y', 'p', 'e', "'", ' ', 'V', 'A', 'R', 'C', 'H', 'A',  
'R', '(', '2', '5', '5', ')', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T',  
' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', ' ', "'", 'f', 'i', 'r',  
'm', '_', 'i', 'd', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E', 'R', ' ',  
'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L', ',',  
'\n', ' ', ' ', "'", 'n', 'a', 'm', 'e', "'", ' ', 'T', 'E', 'X',  
'T', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L',  
',', '\n', ' ', ' ', "'", 'c', 'l', 'i', 'e', 'n', 't', '_', 'o',  
'f', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E', 'R', ' ', 'D', 'E', 'F',  
'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', ' ',  
"'", 'r', 'a', 't', 'i', 'n', 'g', "'", ' ', 'I', 'N', 'T', 'E', 'G',  
'E', 'R', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', '1', '\n',  
')', ';']

* truncated for awesomeness *

On Nov 30, 2007, at 3:49 PM, Kumar McMillan wrote:

> [In the hope that Chris has another awesome response...]
>
> Here is another: I have a big sql file (45M) and need to iter through
> the statements---no fancy sql parsing, I just want the statements.
> Assuming open('big.sql').read().split(';') would be a dumb idea, I
> couldn't find anything in stdlib to do this.  What am I missing?  I
> thought the tokenize module would but I couldn't see how at first
> glance.
>
> def readsplit(filelike, token):
>     """yields each chunk between tokens in contents of filelike  
> object.
>
>     For example::
>
>>>> [c for c in readsplit(StringIO('''bad; ass; elf in
>         ... the forest;'''), ';')]
>         ...
>         ['bad', ' ass', ' elf in \\nthe forest', '']
>>>> [c for c in readsplit(StringIO(''';
>         ... 1,2,3;
>         ...    and 4; and
>         ... even 5'''), ';')]
>         ...
>         ['', '\\n1,2,3', '\\n   and 4', ' and\\neven 5']
>>>>
>
>     """
>     buf = []
>     for line in filelike:
>         buf.append(line)
>         line = ''.join(buf)
>         buf[:] = []
>         chunks = line.split(';')
>         for chunk in chunks[:-1]:
>             yield chunk
>         buf.append(chunks[-1])
>     if len(buf):
>         yield ''.join(buf) 
> <readsplit.py>_______________________________________________
> Chicago mailing list
> Chicago at python.org
> http://mail.python.org/mailman/listinfo/chicago