looking for speed-up ideas
Ram Bhamidipaty
ramb at sonic.net
Tue Feb 4 16:06:11 EST 2003
Andrew Dalke <adalke at mindspring.com> writes:
> Ram Bhamidipaty wrote:
> > Ok. Here are two scripts:
>
> Indeed. I'm still surprised my code is that much slower.
>
> Here are my thoughts:
> - you use xreadlines.xreadlines(f) where I use "for line in f"
> I thought they are the same speed, but perhaps I'm wrong.
> Could you change that and find out?
With xreadlines the new speed is:
espring> ~/script2.1.py /tmp/foo /tmp/foo_2
3 function calls in 32.920 CPU seconds
>
> - You do
> try / int() / except ValueError / long()
> when I always do long(). It could be that longs are much more
> expensive than ints. Also, in newer pythons the int converts
> to a long as needed so this is no longer needed.
with both xreadlines and the try/except for long/int:
espring> ~/script2.1.py /tmp/foo /tmp/foo_2
3 function calls in 29.410 CPU seconds
>
> - you do an explicit compare against the max size before insertion
> into the list while I always append. That can be a big slowdown,
> so change
>
> for line in infile:
> if line[:1] == "F":
> ignore, size, name = line.split("/")
> # negate size so 'largest' is sorted first
> fileinfo.append( (-long(size), dirid, name) )
> if len(fileinfo) > 10000:
> # Could use a heapq....
> fileinfo.sort()
> fileinfo = fileinfo[:200]
> into
>
> min_allowed = 0
>
> for line in infile:
> if line[:1] == "F":
> ignore, size, name = line.split("/")
> # negate size so 'largest' is sorted first
> size = long(size)
> if size > min_allowed:
> fileinfo.append( (-long(size), dirid, name) )
> if len(fileinfo) > 10000:
> # Could use a heapq....
> fileinfo.sort()
> del fileinfo[200:]
> min_allowed = -fileinfo[-1][0]
>
And now with all three:
espring> ~/script2.1.py /tmp/foo /tmp/foo_2
3 function calls in 17.610 CPU seconds
Here is the version that generated the above result:
----------------------------------------------------------------
#!/remote/espring/ramb/tools/bin/python
import sys, profile, xreadlines
def process(infile, outfile):
    """Read a disk-usage dump from *infile* and print the 200 largest
    files (size and full path) to *outfile*.

    The first line must be a "T" (top-directory) record.  "F" records
    are files ("F/<size>/<name>"); every other record is a directory
    ("?/<dirname>/<parent_id>/<dir_id>").  Files are only appended when
    bigger than the current 200th-largest size, and the candidate list
    is pruned whenever it passes 10000 entries to keep sort cost low.
    """
    dirid_info = {}                      # dir id -> (parent id, dir name)
    line = infile.readline()
    assert line[:1] == "T"
    ignore, dirname, dirid = line.split()
    dirid_info[dirid] = (None, dirname)  # root has no parent
    fileinfo = []                        # (-size, dir id, name) candidates
    min_allowed = 0                      # smallest size worth keeping
    for line in xreadlines.xreadlines(infile):
        if line[:1] == "F":
            ignore, size, name = line.split("/")
            # int() is cheaper than long(); fall back for huge sizes.
            try:
                size2 = int(size)
            except ValueError:
                size2 = long(size)
            if size2 > min_allowed:
                # Negate size so 'largest' sorts first.
                fileinfo.append((-size2, dirid, name))
                if len(fileinfo) > 10000:
                    fileinfo.sort()
                    del fileinfo[200:]
                    # Anything at or below the 200th size can be skipped.
                    min_allowed = -fileinfo[-1][0]
        else:
            # Directory record: remember its parent for path rebuilding.
            ignore, dirname, parent_id, dirid = line[:-1].split("/")
            dirid_info[dirid] = (parent_id, dirname)
    fileinfo.sort()
    fileinfo = fileinfo[:200]
    for size, dirid, name in fileinfo:
        size = -size
        components = [name[:-1]]         # chop trailing newline
        # Walk the parent chain back to the root to rebuild the path.
        while dirid is not None:
            dirid, dirname = dirid_info[dirid]
            components.append(dirname)
        components.reverse()
        print >>outfile, size, "/".join(components)
# Script entry point: argv[1] is the disk-usage dump, argv[2] the report.
f = file(sys.argv[1], "r")
f2 = file(sys.argv[2], "w")
# Run under the profiler to collect the CPU timings quoted above.
profile.run("process(f, f2)")
----------------------------------------------------------------
And for reference I've included my script1.py with the change suggested
by Tim Peters:
----------------------------------------------------------------
#!/remote/TMhome/ramb/tools/bin/python
import sys, xreadlines, pdb, os.path, imp
import profile
# Load the compiled heap extension ("heapqc") by hand from its build
# directory, since that directory is not on the default sys.path.
m = imp.find_module("heapqc", ["/remote/TMhome/ramb/src/Downloaded/py_heap"])
imp.load_module("heapqc", m[0], m[1], m[2])
import heapqc
class FileSize:
    """Pair a file's identity with its size so heap code can order it.

    *name* is a (dir_number, basename) tuple and *size* the byte count;
    ordering considers size only.
    """
    def __init__(self, name, size):
        self.name = name    # (dir_number, basename) identifying the file
        self.size = size    # file size in bytes
    def __le__(self, other):
        return self.size <= other.size
    def __lt__(self, other):
        # Fix: many heap implementations (e.g. the stdlib heapq) compare
        # with < rather than <=; define both so either works correctly.
        return self.size < other.size
def read_diskinfo_file(f,fout):
global dirs
big_files = []
num_in_list = 0
sys.stdout.flush()
for l in xreadlines.xreadlines(f):
if l[0] == "F":
tup = l.split("/")
try:
fsize = int(tup[1])
except ValueError:
fsize = long(tup[1])
if num_in_list < 200:
big_files.append(FileSize((this_dir_num, tup[2]), fsize))
num_in_list += 1
if num_in_list == 200:
heapqc.heapify(big_files)
else:
if fsize < big_files[0].size:
continue
heapqc.heapreplace(big_files, \
FileSize((this_dir_num, tup[2]), fsize))
elif l[0] == "S":
tup = l.split("/")
name = tup[1]
par_num = int(tup[2])
this_dir_num = int(tup[3])
flist = []
dirs[this_dir_num] = (name, [], flist, this_dir_num, par_num, 0L)
elif l[0] == "T":
tup = l.split()
flist = []
dirs[0] = (tup[1], [], flist, 0, None, 0L)
final_order = []
heapqc.heapify(big_files)
while big_files:
o = heapqc.heappop(big_files)
final_order.append(o)
final_order.reverse()
for o in final_order:
name = get_dir_name(o.name[0]) + o.name[1]
print >>fout, o.size, name,
return
def get_dir_name(num, dir_map=None):
    """Return the full path (with trailing separator) of directory *num*.

    Walks the parent chain in *dir_map* -- a mapping of directory number
    to a tuple whose [0] element is the name and whose [4] element is the
    parent's number, with entry 0 being the root.  *dir_map* defaults to
    the module-global `dirs` table for backward compatibility.
    """
    if dir_map is None:
        dir_map = dirs  # module-level table built by read_diskinfo_file
    cur_dir_num = num
    cur_name = ""
    # Prepend each ancestor's name until we reach the root (number 0).
    while cur_dir_num != 0:
        dir_tup = dir_map[cur_dir_num]
        cur_name = os.path.join(dir_tup[0], cur_name)
        cur_dir_num = dir_tup[4]  # parent directory number
    cur_name = os.path.join(dir_map[0][0], cur_name)
    return cur_name
# Directory table: dir number -> (name, [], flist, dir_num, parent_num, 0L),
# filled in by read_diskinfo_file and walked by get_dir_name.
dirs = {}
in_fname = sys.argv[1]
f_in = file(in_fname, "r")
out_fname = sys.argv[2]
f_out = file(out_fname, "w")
# Run under the profiler to collect the CPU timings quoted above.
profile.run("read_diskinfo_file(f_in, f_out)")
f_in.close()
f_out.close()
----------------------------------------------------------------
The times for the new script:
espring> ~/script1.py /tmp/foo /tmp/foo_1
18760 function calls in 17.850 CPU seconds
Wow - and now your script is a bit faster than mine. The big win came
from not appending anything that was known to be too small.
-----------------------------------------------------------------
The sad part is this:
espring> cat du_sh_script
grep \^F /tmp/foo | sort -t '/' -n -k 2,2 | tail -200 > /tmp/foo_sh
espring> time ~/du_sh_script
8.40u 0.58s 0:09.09 98.7%
To be fair the shell script does not output the full file name, but
that could be handled with some post-processing.
Ok - so now I have a script that is about half the speed of grep + sort.
I guess that's not bad for an interpreted language.
-Ram
More information about the Python-list
mailing list