Finding empty columns. Is there a faster way?

nn pruebauno at latinmail.com
Thu Apr 21 12:40:36 EDT 2011


time head -1000000 myfile  >/dev/null

real    0m4.57s
user    0m3.81s
sys     0m0.74s

time ./repnullsalt.py '|' myfile
0 1 Null columns:
11, 20, 21, 22, 23, 24, 25, 26, 27, 30, 31, 33, 45, 50, 68

real    1m28.94s
user    1m28.11s
sys     0m0.72s



import sys
def main():
    """Print the 1-based indexes of columns that contain an empty field.

    Command-line arguments (read from ``sys.argv``):
        argv[1]  field delimiter; encoded as latin-1 so it can be matched
                 against the raw bytes of the file.
        argv[2]  path of the delimited file, opened in binary mode.
        argv[3]  optional limit, in millions of rows after the first line,
                 at which scanning stops (default 1).

    A progress counter is written to stdout every million rows, followed by
    the line ``Null columns:`` and the comma-separated column numbers.

    Raises:
        ValueError: if argv[3] is given but is not an integer.
        IndexError: if a row has more fields than the first line.
    """
    # Convert once up front: comparing the raw argv string with an int
    # raises TypeError on Python 3 whenever a limit is supplied.
    limit = int(sys.argv[3]) if len(sys.argv) > 3 else 1
    dlm = sys.argv[1].encode('latin1')
    with open(sys.argv[2], 'rb') as inf:
        # An empty file simply has no columns to report.
        first = next(inf, b'')
        # Strip only a real line terminator; a final line without a trailing
        # newline must not lose its last data byte.
        if first[-1:] == b'\n':
            first = first[:-1]
        nulls = [x == b'' for x in first.split(dlm)] if first else []
        # Local aliases keep global/attribute lookups out of the hot loop.
        enum = enumerate
        out = sys.stdout
        prn = print
        for j, r in enum(inf):
            if j % 1000000 == 0:
                prn(j // 1000000, end=' ')
                out.flush()
                if j // 1000000 >= limit:
                    break
            if r[-1:] == b'\n':
                r = r[:-1]
            for i, cur in enum(r.split(dlm)):
                nulls[i] |= cur == b''
    print('Null columns:')
    print(', '.join(str(i + 1) for i, val in enumerate(nulls) if val))

# A delimiter and a filename are mandatory; bail out with a usage message
# when fewer than two arguments follow the script name.
if len(sys.argv) <= 2:
    usage = "Usage: " + sys.argv[0] + " <delimiter> <filename> <limit>"
    sys.exit(usage)

main()



More information about the Python-list mailing list