Large File Parsing

Robert S Shaffer r.shaffer9 at verizon.net
Sun Jun 15 19:59:10 EDT 2003


I have up to a 3-million-record file to parse, remove duplicates from, and
sort by size then numeric value. Is this the best way to do this in
Python? The key is the first column, and the ',xx' suffix needs to be removed.

1234567,12
123456789012,12

import os
import string

def filesort(input1, input2):
    """cmp-style comparator: order strings by length first, then lexicographically.

    For strings of decimal digits this is numeric order without the cost of
    converting to int.  Returns -1, 0, or 1 like the old ``cmp()``.
    """
    if len(input1) != len(input2):
        return -1 if len(input1) < len(input2) else 1
    # (a > b) - (a < b) replicates cmp(a, b), which was removed in Python 3.
    return (input1 > input2) - (input1 < input2)


def run():
    """Deduplicate and sort the keys of 'input.dat', writing them to 'output.dat'.

    Each input line looks like '1234567,12'; only the first comma-separated
    column (the key) matters.  Duplicate keys are collapsed, then the keys are
    sorted by length first and value second (numeric order for digit strings)
    and written one per line.
    """
    # A set deduplicates directly; the original stored whole lines in a dict
    # but only ever wrote the keys, so the values were dead weight.
    keys = set()
    # 'with' guarantees the handles close even on error (the originals leaked);
    # iterating the file streams it instead of holding 3M lines in memory.
    with open('input.dat') as infile:
        for line in infile:
            # Keep only the first column; strip() guards against a line with
            # no comma carrying its trailing newline into the key.  The '\n'
            # is appended so writelines() emits one key per line.
            keys.add(line.split(',', 1)[0].strip() + '\n')
    with open('output.dat', 'w') as outfile:
        # The (len, value) sort key reproduces the cmp-style filesort()
        # ordering and works on Python 3, where list.sort(cmp) is gone.
        outfile.writelines(sorted(keys, key=lambda k: (len(k), k)))
   
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    run()








More information about the Python-list mailing list