Walk Directories via Caching (& Dup file detection)
Benjamin Schollnick
junkster at rochester.rr.com
Sun Mar 10 10:23:50 EST 2002
A little while ago, some folks were posting some caching routines for
dirwalk.....
Here's something that I cooked up a while ago....
Now, I've added just recently full & partial MD5 hashing, and I'm slowly
adding duplicate file checking...
It seems to work fine, but it definitely could use some optimizations
& speed ups...
Anyone have any input? Is there some faster method I could use,
that I've missed?
The main idea is that the walk tree is built once, and the class just
walks through the list (in memory), instead of walking through the
hard drive....
(This was written rather quickly when I realized that searching for
hundreds of files through dirwalk was too slow, because for each file,
it would have to go through the hard drive each time.)
There are two major data structures:
self.index_directory_names = []
self.index_directory_files = []
They are "logically" linked. The files in index_directory_files[x] are
in index_directory_names[x], and vice versa.
I've just recently added the MD5 code, for duplicate detection, which
is really where I'm looking for the majority of any optimization....
It's *FAST*, or at least takes a reasonable amount of time with 45,000
files.... (I think it takes about 30-45 minutes for that...? I haven't
timed it)
- Benjamin
----------------------------------
#from bas_common import * # True / false syn's for boolean expressions
import dircache
import hashlib  # supported replacement for the deprecated md5 module
import md5
import os
import shutil
import string
import sys
import time

#mac_mode = macintosh
mac_mode = 0 # code stored in bas_common
if mac_mode:
    # Classic MacPython (Mac OS 9) helper modules.
    import findertools
    import macfs
    import macostools
def full_md5_digest(filename, block=(8*1024) ):
    """Return the 16-byte MD5 digest of the ENTIRE contents of *filename*.

    filename -- path of the file to hash.
    block    -- read-chunk size in bytes (default 8 KiB); affects speed only,
                never the resulting digest.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    # hashlib.md5() replaces the deprecated md5.new() API.
    digest = hashlib.md5()
    # 'with' guarantees the handle is closed even when a read raises,
    # fixing the descriptor leak in the original open()/close() pairing.
    with open(filename, 'rb') as f:
        data = f.read(block)
        while data:
            digest.update(data)
            data = f.read(block)
    return digest.digest()
def short_md5_digest(filename, block=(8*1024) ):
    """Return the 16-byte MD5 digest of the FIRST *block* bytes of *filename*.

    Deliberately partial: hashing only the leading block is a cheap
    pre-filter for duplicate detection; full_md5_digest() can confirm a
    match on the whole file.

    filename -- path of the file to hash.
    block    -- number of leading bytes to hash (default 8 KiB).

    Raises IOError/OSError if the file cannot be opened or read.
    """
    # hashlib.md5() replaces the deprecated md5.new() API.
    digest = hashlib.md5()
    # 'with' closes the handle even if read() raises (original leaked it).
    with open(filename, 'rb') as f:
        digest.update(f.read(block))
    return digest.digest()
class directory_walk:
    """In-memory cache of a directory tree for fast repeated searching.

    build_search_tree() walks the disk ONCE (via os.path.walk); afterwards
    all lookups run against two parallel ("logically linked") lists:

        index_directory_names[x]  -- a directory path
        index_directory_files[x]  -- the filenames found in that directory

    NOTE(review): Python 2 only -- relies on os.path.walk and the string
    module, both removed in Python 3.
    """
    def __init__ ( self ):
        # Parallel lists; entry x of each refers to the same directory.
        self.index_directory_names = []
        self.index_directory_files = []
        # Flag only; digesting is actually driven by filter_via_md5().
        self.use_md5 = 1
        # Common "noise" filenames excluded from duplicate detection
        # (compared against upper-cased, stripped names).
        self.files_to_skip = ["FILE_ID.DIZ", "README.TXT",
                              "READ.ME", "README", "ID.DOC",
                              ".DS_STORE", # Do Not Remove, Mac OS X File
                              "ICON", # Do Not Remove, Mac OS File
                              "ICON_" # Do Not Remove, Mac OS File
                              ]
        # When true, directory and file names are folded to upper case
        # as the tree is built (for case-insensitive matching).
        self.upper_case = None
    def __list_files ( self, arg, dirname, names ):
        # os.path.walk() visit callback: record this directory and its
        # entries into the two parallel lists.
        # NOTE(review): in upper-case mode `names` is mutated in place, and
        # os.path.walk() recurses using that same list -- on a case-sensitive
        # file system the upper-cased names may break descent; confirm.
        if self.upper_case:
            self.index_directory_names.append ( string.upper( string.strip(dirname) ) )
            for j in range(0, len(names)):
                names[j] = names[j].upper()
        else:
            self.index_directory_names.append ( string.strip(dirname) )
        self.index_directory_files.append ( names )
    def build_search_tree ( self, base_directory_name):
        # Discard any previous cache, then walk the tree once; the private
        # callback fills the parallel lists.
        self.index_directory_names = []
        self.index_directory_files = []
        os.path.walk ( base_directory_name, self.__list_files, None)
    def use_upper_case ( self ):
        # Must be called BEFORE build_search_tree() to take effect.
        self.upper_case = 1
    def single_file_exist (self, filename):
        # Return (1, directory) for the first directory containing
        # `filename`, or (None, None) if it is not cached anywhere.
        if self.upper_case:
            filename = string.upper(filename)
        listing_size = len(self.index_directory_files)
        for x in range(0, listing_size):
            try:
                # list.index raises ValueError when absent; the bare
                # except simply means "not in this directory, keep going".
                junk = self.index_directory_files[x].index(filename)
                return (1, string.strip(self.index_directory_names[x] ) )
            except:
                pass
        return (None, None)
    def return_duplicates_filenames ( self ):
        # Return filenames (upper-cased, stripped) that appear in more than
        # one place, each listed once, skipping files_to_skip entries and
        # anything that is not a regular file.  Prints one dot per
        # directory as a progress indicator.
        duplicate_files = []
        master_list = []
        print "# of directories : ", len(self.index_directory_names)
        for x in range(0, len(self.index_directory_names)):
            print ".",
            for filename in self.index_directory_files[x]:
                filename = string.strip(string.upper(filename))
                if not(filename in self.files_to_skip):
                    if os.path.isfile (os.path.join(self.index_directory_names[x], filename)):
                        # master_list.count() makes this quadratic in the
                        # number of files -- known hot spot.
                        master_list.append (filename)
                        if master_list.count(filename) >= 2:
                            if duplicate_files.count(filename) == 0:
                                duplicate_files.append (filename)
        print
        return duplicate_files
    def return_all_directories_from_file ( self, filename):
        # Return every cached directory whose listing contains `filename`
        # (exact match against the cached -- possibly upper-cased -- names).
        file_directories = []
        for x in range(0, len(self.index_directory_names)):
            if self.index_directory_files[x].count(filename) >= 1:
                file_directories.append ( self.index_directory_names[x])
        return file_directories
    def filter_via_md5 ( self, dup_file_list, md5_filter ):
        # For every location of every candidate filename, compute a digest
        # with the supplied callable (e.g. short_md5_digest) and return a
        # list of (filename, directory, digest) tuples.
        duplicate_files = []
        master_list = []
        for filename in dup_file_list:
            file_locations = self.return_all_directories_from_file (filename)
            for location in file_locations:
                duplicate_files.append ( (filename, location, md5_filter (os.path.join(location,
                                                                                       filename)) ) )
        return duplicate_files
fss, ok = macfs.GetDirectory('Duplicate Search Directory:')
if not ok:
sys.exit(0)
else:
dup_directory = fss.as_pathname()
dir_find = directory_walk()
dir_find.use_upper_case()
print "Search Tree"
dir_find.build_search_tree ( dup_directory )
print "End Search Tree"
output_file = open("results.txt", "w")
print "Begin"
dup_files = dir_find.return_duplicates_filenames ()
md5_output = dir_find.filter_via_md5 ( dup_files, short_md5_digest )
length_of_md5_output = len(md5_output)
optimized_md5 = []
for scan in range(0, length_of_md5_output):
optimized_md5.append (md5_output[scan][2])
print "Number of Files to Validate: ", length_of_md5_output
for counter in range(0, length_of_md5_output):
if optimized_md5.count ( md5_output[counter][2]) >= 2:
for scan in range(counter, length_of_md5_output):
if (md5_output[scan][2] == md5_output[counter][2]) and scan<>counter:
output_file.writelines ( "\r", "-"*40)
output_file.writelines ( "\rFile: %s" % md5_output[counter][0] )
output_file.writelines ( "\r\tLocation : %s " % md5_output[scan][1] )
print "\r", "-"*40
print "\rFile: %s" % md5_output[counter][0]
print "\r\tLocation : %s " % md5_output[scan][1]
while optimized_md5.count ( md5_output[counter][2]):
try:
optimized_md5.remove( md5_output[counter][2])
except:
pass
print "\r\tLocation : %s " % md5_output[counter][1]
output_file.writelines ( "\r\tLocation : %s " % md5_output[counter][1] )
output_file.close()
print "End"
#for filename in dup_files:
# print "-"*40,"\n"
# output_file.writelines ("-"*40,"\r")
# print filename
# output_file.writelines (filename, "\r")
# dup_directories = dir_find.return_all_directories_from_file ( filename )
# for dir in dup_directories:
# print "\t%s" % dir
# output_file.writelines ("\t%s\r" % dir)
# print "-"*40,"\n"
# output_file.writelines ("-"*40,"\r")
sys.exit(5)
More information about the Python-list
mailing list