[Chicago] is there really no built-in file/iter split() thing?
Chris McAvoy
chris.mcavoy at gmail.com
Fri Nov 30 23:30:24 CET 2007
This works, sort of... it doesn't tokenize anything just yet; it only chunks
the file into lists of characters, each running up to and including a ;
def chunker(file):
    """Yield lists of characters from *file*, one list per ';'-ended chunk.

    *file* is iterated line by line (any iterable of strings works), and
    every character is collected until a ';' is seen.  Each yielded list
    ends with that ';' and starts with whatever followed the previous
    one, newlines and other whitespace included.

    Characters after the final ';' are never yielded.
    """
    chunk = []
    for line in file:
        for c in line:  # read each character from the line
            chunk.append(c)
            if c == ';':
                # Hand the finished chunk to the caller and start a new
                # list, so later appends cannot mutate what was yielded.
                yield chunk
                chunk = []
I refuse to write a test.
In [64]: for c in chunker(open('./autoused/vendor/rails/activerecord/
test/fixtures/db_definitions/sqlite.sql')):
....: print c
....:
....:
['C', 'R', 'E', 'A', 'T', 'E', ' ', 'T', 'A', 'B', 'L', 'E', ' ',
"'", 'a', 'c', 'c', 'o', 'u', 'n', 't', 's', "'", ' ', '(', '\n', '
', ' ', "'", 'i', 'd', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E', 'R', '
', 'P', 'R', 'I', 'M', 'A', 'R', 'Y', ' ', 'K', 'E', 'Y', ' ', 'N',
'O', 'T', ' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', ' ', "'", 'f',
'i', 'r', 'm', '_', 'i', 'd', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E',
'R', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L',
',', '\n', ' ', ' ', "'", 'c', 'r', 'e', 'd', 'i', 't', '_', 'l',
'i', 'm', 'i', 't', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E', 'R', ' ',
'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L', '\n',
')', ';']
['\n', '\n', 'C', 'R', 'E', 'A', 'T', 'E', ' ', 'T', 'A', 'B', 'L',
'E', ' ', "'", 'f', 'u', 'n', 'n', 'y', '_', 'j', 'o', 'k', 'e', 's',
"'", ' ', '(', '\n', ' ', ' ', "'", 'i', 'd', "'", ' ', 'I', 'N',
'T', 'E', 'G', 'E', 'R', ' ', 'P', 'R', 'I', 'M', 'A', 'R', 'Y', ' ',
'K', 'E', 'Y', ' ', 'N', 'O', 'T', ' ', 'N', 'U', 'L', 'L', ',',
'\n', ' ', ' ', "'", 'n', 'a', 'm', 'e', "'", ' ', 'T', 'E', 'X',
'T', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L',
'\n', ')', ';']
['\n', '\n', 'C', 'R', 'E', 'A', 'T', 'E', ' ', 'T', 'A', 'B', 'L',
'E', ' ', "'", 'c', 'o', 'm', 'p', 'a', 'n', 'i', 'e', 's', "'", ' ',
'(', '\n', ' ', ' ', "'", 'i', 'd', "'", ' ', 'I', 'N', 'T', 'E',
'G', 'E', 'R', ' ', 'P', 'R', 'I', 'M', 'A', 'R', 'Y', ' ', 'K', 'E',
'Y', ' ', 'N', 'O', 'T', ' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', '
', "'", 't', 'y', 'p', 'e', "'", ' ', 'V', 'A', 'R', 'C', 'H', 'A',
'R', '(', '2', '5', '5', ')', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T',
' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', ' ', "'", 'r', 'u', 'b',
'y', '_', 't', 'y', 'p', 'e', "'", ' ', 'V', 'A', 'R', 'C', 'H', 'A',
'R', '(', '2', '5', '5', ')', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T',
' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', ' ', "'", 'f', 'i', 'r',
'm', '_', 'i', 'd', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E', 'R', ' ',
'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L', ',',
'\n', ' ', ' ', "'", 'n', 'a', 'm', 'e', "'", ' ', 'T', 'E', 'X',
'T', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L',
',', '\n', ' ', ' ', "'", 'c', 'l', 'i', 'e', 'n', 't', '_', 'o',
'f', "'", ' ', 'I', 'N', 'T', 'E', 'G', 'E', 'R', ' ', 'D', 'E', 'F',
'A', 'U', 'L', 'T', ' ', 'N', 'U', 'L', 'L', ',', '\n', ' ', ' ',
"'", 'r', 'a', 't', 'i', 'n', 'g', "'", ' ', 'I', 'N', 'T', 'E', 'G',
'E', 'R', ' ', 'D', 'E', 'F', 'A', 'U', 'L', 'T', ' ', '1', '\n',
')', ';']
* truncated for awesomeness *
On Nov 30, 2007, at 3:49 PM, Kumar McMillan wrote:
> [In the hope that Chris has another awesome response...]
>
> Here is another: I have a big sql file (45M) and need to iter through
> the statements---no fancy sql parsing, I just want the statements.
> Assuming open('big.sql').read().split(';') would be a dumb idea, I
> couldn't find anything in the stdlib to do this. What am I missing? I
> thought the tokenize module would do it, but I couldn't see how at
> first glance.
>
def readsplit(filelike, token):
    """Yield each chunk between occurrences of *token* in *filelike*.

    *filelike* is iterated line by line, so the whole content is never
    held in memory at once; only the text since the last *token* is
    buffered.  The token itself is not part of the yielded chunks.  As
    with ``str.split``, the text after the last token is yielded too,
    even when it is empty.  An input that produces no lines yields
    nothing.

    For example::

        >>> list(readsplit(StringIO('bad; ass; elf in\\nthe forest;'), ';'))
        ['bad', ' ass', ' elf in\\nthe forest', '']
        >>> list(readsplit(StringIO(';\\n1,2,3;\\n and 4; and\\neven 5'), ';'))
        ['', '\\n1,2,3', '\\n and 4', ' and\\neven 5']
        >>> list(readsplit(StringIO('a|b;c|d'), '|'))
        ['a', 'b;c', 'd']

    """
    # Text seen since the last token.  None means "no line read yet",
    # which is what distinguishes an empty input from a trailing ''.
    pending = None
    for line in filelike:
        if pending:
            # Re-join with the unfinished remainder so a chunk (or a
            # multi-character token) that spans lines is handled.
            line = pending + line
        chunks = line.split(token)
        # The last piece is not terminated by a token yet; keep it back.
        pending = chunks.pop()
        for chunk in chunks:
            yield chunk
    if pending is not None:
        yield pending
> <readsplit.py>_______________________________________________
> Chicago mailing list
> Chicago at python.org
> http://mail.python.org/mailman/listinfo/chicago
More information about the Chicago
mailing list