duplicate file finder (was: how can I make this script shorter?)
Christos TZOTZIOY Georgiou
tzot at sil-tec.gr
Thu Feb 24 11:39:39 EST 2005
On Wed, 23 Feb 2005 01:56:02 -0800, rumours say that Lowell Kirsh
<lkirsh at cs.ubc.ca> might have written:
>Good idea about hashing part of the file before comparing entire files.
>It will make the script longer but the speed increase will most likely
>make it worth it.
My dupefind module was one of the first modules I wrote in python (it was a
shell script once...), so it's not appropriate to post. However, rewriting it
was in my RSN list; after your post, I said, "what the heck" :)
I wouldn't describe it as /clean/ Python code, so any comments on style,
methodology (and of course, bugs!) are mostly welcome. If you have any ideas
how to make it more "Pythonic" in style, perhaps we can even send it to ASPN.
On the upside, it's a good demo of Python dynamism and therefore power.
All first spaces are converted to pipe symbols to keep code indents, and sorry
for the assignment operators lacking spacing at the left, but it helped me know
what I meant in C, so I kept the habit in Python.
"""dupefinder.py -- a module to find duplicate files.
find_duplicate_files(*paths):
. A function returning an iterable of lists of duplicate files
. To be used as
. for duplicate_files in dupefinder.find_duplicate_files(dir1, dir2...):
. # process every group of duplicates
"""
import os, md5, errno, sys
IGNORED_ERROR_CODES= frozenset( [errno.EACCES, errno.ENOENT] )
class File(object):
. """A wrapper for files to facilitate their comparison.
.
. Interface:
. - .get_hash(level) returns appropriate key for the current cmp level
. - .filename
. - .size"""
. __slots__= "filename", "size", "_hashes",
. def __init__(self, filename):
. self.filename= filename
. self.size= os.path.getsize(filename)
. if self.size == 0:
. self._hashes= [0, None, None]
. else:
. self._hashes= [self.size]
. def __repr__(self):
. return "%s('%s')" % (self.__class__.__name__, self.filename)
. def get_hash(self, level):
. """Return appropriate key for level of comparison.
. level == 0 returns file size
. level == 1 returns hash of first few kb
. level == 2 returns md5 sum of whole file"""
. if level >= len(self._hashes):
. if 1 <= len(self._hashes):
. self._hashes.append(self._get_partial_hash())
. if 2 <= len(self._hashes):
. self._hashes.append(self._get_full_hash())
. return self._hashes[level]
. def _get_partial_hash(self):
. fp= open(self.filename)
. try:
. return hash(fp.read(8192))
. finally:
. fp.close()
. def _get_full_hash(self):
. fp= open(self.filename, "rb")
. full_hash= md5.new()
. while 1:
. data= fp.read(65536)
. if not data: break
. full_hash.update(data)
. fp.close()
. return full_hash.digest()
class GenericFilesDict(dict):
. """A virtual wrapper for the dictionaries of file comparisons.
. Not to be used on its own, but through subclassing.
.
. Each subclass should define a _level class attribute to be used with the
. File.get_hash method, and a _next_class attribute pointing to the class
. managing the next level comparisons."""
. __slots__= ()
. def add_file(self, fileobj):
. """Add a File object to self keyed by the appropriate key based on
. self._level. If another file object exists in the same spot, replace it
. by a _next_level_class instance containing the pre-existing and new file
. obj."""
. this_hash= fileobj.get_hash(self._level)
. try:
. result = self[this_hash]
. except KeyError:
. self[this_hash]= fileobj
. return
. try: # there was something in slot [this_hash]
. result.add_file(fileobj) # so add fileobj to it
. except AttributeError: # it has no add_file method, so it's a File
. _= self[this_hash]= self._next_class() # make an instance
. _.add_file(result) # add pre-existing
. _.add_file(fileobj) # add new
. def __repr__(self):
. return "%s%s" % (self.__class__.__name__, dict.__repr__(self))
. def __iter__(self):
. "Return all instances of SameFiles (subclass of list)."
. for item, value in self.iteritems():
. try: _next_class= value._next_class
. except AttributeError: continue
. if _next_class is None:
. yield value
. else:
. for item in value:
. yield item
class SameFiles(list):
. "A list of duplicate files."
. __slots__= ()
. _level= 3
. _next_class= None # search stops here
. def add_file(self, fileobj):
. self.append(fileobj)
class FilesByFullHash(GenericFilesDict):
. """A dictionary keyed on md5 hash of whole file.
. The algorithm assures that all File objects in this dict
. have the same size and the same hash of first few kiB."""
. __slots__= ()
. _level= 2
. _next_class= SameFiles
class FilesByPartialHash(GenericFilesDict):
. """A dictionary keyed on hosh of first few kiB of file.
. The algorithm assures that all File objects in this dict
. have the same size."""
. __slots__= ()
. _level= 1
. _next_class= FilesByFullHash
class FilesBySize(GenericFilesDict):
. """A dictionary keyed on file size."""
. __slots__= ()
. _level= 0
. _next_class= FilesByPartialHash
def find_duplicate_files(*directories):
. """Find all duplicate files in the specified directories.
. The returned object can be iterated on, yielding a list of
. duplicate files every time."""
. dupefinder= FilesBySize()
## count= 0
. for path in directories:
. for dirname, dirnames, filenames in os.walk(path):
. for filename in filenames:
. try:
. dupefinder.add_file(File(os.path.join(dirname, filename)))
. except EnvironmentError, exc:
. if exc.errno not in IGNORED_ERROR_CODES: # deleted or b0rken
symlink
. raise
## count += 1
## if count & 15 == 0:
## sys.stderr.write("\r%d" % count)
. return dupefinder
def pprint(obj, level=0):
. """For testing purposes."""
. indent= " "*level
. if isinstance(obj, File):
. print `obj`
. elif isinstance(obj, list):
. print "list"
. for subobj in obj:
. print indent, "-", repr(subobj)
. else:
. print obj.__class__.__name__
. for key, subobj in obj.iteritems():
. print indent, "+(%r)" % key,
. pprint(subobj, level+1)
if __name__=="__main__":
. result= find_duplicate_files(*sys.argv[1:])
. import pprint
. for group in result:
. pprint.pprint(group)
--
TZOTZIOY, I speak England very best.
"Be strict when sending and tolerant when receiving." (from RFC1958)
I really should keep that in mind when talking with people, actually...
More information about the Python-list
mailing list