Should writing Unicode files be so slow?

Thu Mar 18 18:29:41 EDT 2010

I have a simple program to read a text (.csv) file and split it into
several smaller files. Tonight I decided to write a Unicode variant and was
surprised at the difference in performance. Is there a better way?

> from __future__ import with_statement
> import codecs
> 
> def _rowreader(filename, separator='\t'):
>     """Generator for iteration over potentially large file."""
>     with codecs.open(filename, 'rU', 'utf-8', 'backslashreplace') as tabfile:  
>         for row in tabfile:
>             yield [v.strip() for v in row.split(separator)]  
> 
> def generator_of_output(source_of_lines):
>     for line in source_of_lines:
>         for result in some_function(line):
>             yield result
> 
> def coroutine(outfile_prefix, outfile_suffix, sep='\t'):
>     outfile = '%s_%s.txt'%  (outfile_prefix, outfile_suffix)
>     with codecs.open(outfile, 'w', 'utf-8') as out_part:
>         while True:
>             line = (yield)
>             out_part.write(sep.join(line) + '\n')
> 
> def _file_to_files(infile, outfile_prefix, column, sep):
>     column_values = dict()
>     for line in _rowreader(infile, sep):
>         outfile_suffix = line[column].strip('\'\"')
>         if  outfile_suffix  in  column_values:
>             column_values[outfile_suffix].send(line)
>         else:
>             file_writer = coroutine(outfile_prefix, outfile_suffix, sep)
>             file_writer.next()
>             file_writer.send(line)
>             column_values[outfile_suffix] = file_writer
>     for file_writer in column_values.itervalues():
>         file_writer.close()    

The plain version is the same except for these two lines:
>     with open(filename, 'rU') as tabfile:
>     with open(outfile, 'wt') as out_part:

The difference:
> "uid","timestamp","taskid","inputid","value"
> "15473178739336026589","2010-02-18T20:50:15+0000","11696870405","73827093507","83523277829"
> "15473178739336026589","2010-02-18T20:50:15+0000","11696870405","11800677379","12192844803"
> "15473178739336026589","2010-02-18T20:50:15+0000","11696870405","31231839235","52725552133"
> 
> sysweb at Bembo:~/UCLC/bbc/wb2$ wc -l wb.csv
> 9293271 wb.csv
> 
> normal version
> sysweb at Bembo:~/UCLC$ time ~/UCL/toolkit/file_splitter.py -o tt --separator comma -k 2 wb.csv
> 
> real	0m43.714s
> user	0m37.370s
> sys	0m2.732s
> 
> unicode version
> sysweb at Bembo:~/UCLC$ time ./file_splitter.py -o t --separator comma -k 2 wb.csv
> 
> real	4m8.695s
> user	3m19.236s
> sys	0m39.262s

-- 
David Clark, MSc, PhD.              UCL Centre for Publishing
                                    Gower Str London WC1E 6BT
What sort of web animal are you?
            <https://www.bbc.co.uk/labuk/experiments/webbehaviour>