signature for a file ?
John Hunter
jdhunter at ace.bsd.uchicago.edu
Tue Jul 30 10:20:39 EDT 2002
>>>>> "Shagshag13" == Shagshag13 <shagshag13 at yahoo.fr> writes:
Shagshag13> i had at home many hdds, that could contain many time
Shagshag13> same files, in many places/directories (-> i'm really
Shagshag13> disorganized). i would like to do some sort on theses
Shagshag13> files. to do this i'm planning to write a python
Shagshag13> script that would compute a kind of CRC32, MD5 or SHA
Shagshag13> (i'm really not competent in that - so here i need
Shagshag13> advices and pointer to some implementations - and to
Shagshag13> know which is the best to had a unique unambiguous
Shagshag13> signature for a file) and then use it to find
Shagshag13> "doubles" : same size + same signature = probably same
Shagshag13> file.
I do a daily backup of some of my zope folders using cron, and get
some identical files if the folder hasn't changed since the last
backup. I wrote this script to kill identical files and (optionally)
replace the duplicates with symbolic links to the original. It has a
debug mode, where it just reports what it would have done and how much
file space you save but doesn't actually do anything.
I never intended this for distribution, but it seems close enough to
what you are talking about that I thought it might give you some
ideas. Use with extreme caution. (I have never tried it on a non-Linux
platform. You could replace md5sum with the built-in md5 lib...)
you can call the script like
# check all dat files in current dir
kill_identical_files.py *.dat
# somefile contains a list of files to check
kill_identical_files.py somefile
# files from stdin
find . | kill_identical_files.py
--- begin kill_identical_files.py ---
#!/usr/local/bin/python
import string, os, sys, re
from JdhMixins import Debug, Verbose
# Note stdin approach assumes you have no empty lines in the stdin, to
# allow it to terminate on commands like
# find . | myscript.py
class NextFile:
    """Hand out file names one at a time from stdin, a list file, or argv.

    Typical use::

        n = NextFile(sys.argv[1:])
        while n.more():
            name = n.nextfile()

    When names are read from a file handle (stdin or a list file), the
    first empty line ends the sequence, exactly like end-of-file.
    """

    fh = None    # handle that names are read from, one per line (or None)
    args = None  # explicit list of names, used when several args are given
    val = None   # the name most recently fetched by more()
    count = 0    # index of the next unread entry in args

    def __init__(self, args):
        """Pass sys.argv[1:] and this will get the file list from a
        single file arg, the stdin, or argv list.

        - no args: names are read from sys.stdin
        - one arg: that arg is opened and names are read from it
        - several args: the args themselves are the names
        """
        self.Nargs = len(args)
        if self.Nargs == 0:
            self.stdin = 1
            self.fh = sys.stdin
        elif self.Nargs == 1:
            self.file = 1
            self.fh = open(args[0], 'r')
        else:
            self.args = args

    def more(self):
        """Advance to the next name; return a true value if there is one.

        A false value (None or a false comparison result) means the
        sequence is exhausted or the underlying handle could not be read.
        """
        if self.fh is not None:
            try:
                line = self.fh.readline()
            except (IOError, ValueError):
                # Unreadable or already-closed handle: treat as end of input.
                return None
            # Strip only a real line terminator.  Blindly dropping the last
            # character would truncate the final name when the input does
            # not end with a newline.
            if line.endswith('\n'):
                line = line[:-1]
            self.val = line
            return len(self.val) > 0

        if self.count >= self.Nargs:
            return None
        self.val = self.args[self.count]
        self.count = self.count + 1
        return 1

    def nextfile(self):
        """Return the name fetched by the most recent call to more()."""
        return self.val
# Imported here, next to its only user: the digest is computed in-process
# instead of shelling out to md5sum, so odd file names (spaces, quotes,
# shell metacharacters) are handled safely and no external tool is needed.
import hashlib


def _md5_of_file(path, chunk_size=65536):
    """Return the hexadecimal MD5 digest of the file at path.

    The file is read in chunk_size-byte pieces so large files are never
    held in memory at once.  Raises IOError/OSError if it cannot be read.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


n = NextFile(sys.argv[1:])
v = Verbose('extreme')
d = Debug('on')  # make this 'off' to actually alter the file system
link = 1  # replace the killed files with symlinks
s = {}  # md5 digest -> first file seen with that digest
savings = 0  # bytes that are (or would be) freed
minSize = 5000  # don't remove files smaller than this

while n.more():
    path = n.nextfile()

    # Only operate on plain files
    if not os.path.isfile(path) or os.path.islink(path):
        if v.verbose('extreme'):
            print('Skipping non plainfile %s' % path)
        continue

    fileSize = os.path.getsize(path)
    if fileSize <= minSize:
        continue

    if v.verbose('extreme'):
        print('Computing md5sums for %s' % path)
    try:
        digest = _md5_of_file(path)
    except (IOError, OSError):
        # Unreadable file: skip it, as the md5sum pipeline used to do when
        # it produced no parsable output.
        continue

    if digest not in s:
        if v.verbose('unbearable'):
            print('Adding new file %s' % path)
        s[digest] = path
        continue

    origFile = s[digest]
    if os.path.samefile(origFile, path):
        # Same file reached under two names (listed twice, 'a' vs './a',
        # or a hard link).  Removing it would destroy the only copy and
        # leave a symlink pointing at itself.
        continue

    savings = savings + fileSize
    if v.verbose('moderate'):
        print('Removing file %s identical to %s' % (path, origFile))
        if link:
            print('\t...making symlink from %s to %s' % (path, origFile))
    if not d.debug():
        os.remove(path)
        if link:
            # A relative target is resolved from the link's own directory,
            # so use an absolute target to keep the link valid when the
            # two files live in different directories.
            os.symlink(os.path.abspath(origFile), path)

if v.minimal():
    print('Total savings is %1.2fMB\n' % (savings / 1e6))
--- end kill_identical_files.py ---
--- begin JdhMixins.py ---
import types
class Debug:
    """A mixin class to set and test whether you are in debug mode."""

    __s = {'on': 1, 'off': 0}  # accepted string spellings -> flag value
    __debug = 1  # class-level default: debug mode is on

    def __init__(self, val='on'):
        """val can be a logic value (eg, 0 or 1) or a string ('on' or
        'off')."""
        self.set_debug(val)

    def set_debug(self, val):
        """Set the debug flag from 'on'/'off' or from an integer.

        Raises KeyError for a string other than 'on' or 'off', and
        ValueError for a value that is neither a string nor an integer.
        """
        if isinstance(val, str):
            # Store the mapped 0/1, never the raw string: the string 'off'
            # is itself truthy and would leave debug mode permanently on.
            self.__debug = self.__s[val]
        elif isinstance(val, int):
            self.__debug = val
        else:
            raise ValueError(
                'Expected a string or and integer for debug value level')

    def is_debug(self):
        """Return the current debug flag (same value as debug())."""
        return self.__debug

    def debug(self):
        """Return the current debug flag; a true value means debug mode."""
        return self.__debug
class Verbose:
    """Mixin that stores a verbosity level and answers threshold queries.

    Levels are integers; the named levels, in increasing order, are
    'silent' (alias 'quiet'), 'minimal', 'moderate', 'extreme' and
    'unbearable'.  A query for a level is true when the stored level is
    at least that level.
    """

    __SILENT = 0
    __MINIMAL = 10
    __MODERATE = 100
    __EXTREME = 1000
    __UNBEARABLE = 10000

    __level = __MINIMAL  # class-level default

    # Level name -> numeric threshold.
    __s = {
        'silent': __SILENT,
        'quiet': __SILENT,
        'minimal': __MINIMAL,
        'moderate': __MODERATE,
        'extreme': __EXTREME,
        'unbearable': __UNBEARABLE,
    }

    def __init__(self, level='moderate'):
        """Create the object at the given level (a name or an integer)."""
        self.set_verbose(level)

    def verbose(self, val='minimal'):
        """Return whether the current level is at least the level named val.

        Example, in a class that inherits from Verbose::

            if self.verbose(): print something
            if self.verbose('moderate'): print something else

        Raises KeyError if val is not a known level name.
        """
        threshold = self.__s[val]
        return self.__level >= threshold

    def set_verbose(self, level):
        """Set the level from a level name or a raw integer.

        Raises ValueError for an unknown name, or for a value that is
        neither a string nor an integer.
        """
        if isinstance(level, str):
            if level not in self.__s:
                raise ValueError('Unrecognized level %s' % level)
            self.__level = self.__s[level]
            return
        if not isinstance(level, int):
            raise ValueError('Expected a string or and integer for level')
        self.__level = level

    def is_silent(self):
        """Return whether the level is exactly the silent level."""
        return self.__SILENT == self.__level

    def minimal(self):
        """Return whether the level is at least 'minimal'."""
        return self.__MINIMAL <= self.__level

    def moderate(self):
        """Return whether the level is at least 'moderate'."""
        return self.__MODERATE <= self.__level

    def extreme(self):
        """Return whether the level is at least 'extreme'."""
        return self.__EXTREME <= self.__level

    def unbearable(self):
        """Return whether the level is at least 'unbearable'."""
        return self.__UNBEARABLE <= self.__level
--- end JdhMixins.py ---
More information about the Python-list
mailing list