signature for a file ?

John Hunter jdhunter at ace.bsd.uchicago.edu
Tue Jul 30 10:20:39 EDT 2002


>>>>> "Shagshag13" == Shagshag13  <shagshag13 at yahoo.fr> writes:


    Shagshag13> i had at home many hdds, that could contain many time
    Shagshag13> same files, in many places/directories (-> i'm really
    Shagshag13> disorganized).  i would like to do some sort on theses
    Shagshag13> files. to do this i'm planning to write a python
    Shagshag13> script that would compute a kind of CRC32, MD5 or SHA
    Shagshag13> (i'm really not competent in that - so here i need
    Shagshag13> advices and pointer to some implementations - and to
    Shagshag13> know which is the best to had a unique unambiguous
    Shagshag13> signature for a file) and then use it to find
    Shagshag13> "doubles" : same size + same signature = probably same
    Shagshag13> file.

I do a daily backup of some of my zope folders using cron, and get
some identical files if the folder hasn't changed since the last
backup.  I wrote this script to kill identical files and (optionally)
replace the duplicates with symbolic links to the original.  It has a
debug mode, where it just reports what it would have done and how much
file space you save but doesn't actually do anything.  

I never intended this for distribution, but it seems close enough to
what you are talking about that I thought it might give you some
ideas.  Use with extreme caution.  (I have never tried it on a
non-Linux platform.  You could replace md5sum with the built-in md5
lib...)

You can call the script like this:

# check all dat files in current dir
kill_identical_files.py *.dat

# somefile contains a list of files to check
kill_identical_files.py somefile  

# files from stdin
find . | kill_identical_files.py

--- begin kill_identical_files.py ---

#!/usr/local/bin/python
import string, os, sys, re
from JdhMixins import Debug, Verbose


# Note stdin approach assumes you have no empty lines in the stdin, to
# allow it to terminate on commands like
#   find . | myscript.py
class NextFile:
    """Hand out file names one at a time from stdin, a list file, or argv.

    The source of names is chosen from the number of arguments given:

    * no arguments  -- names are read from stdin, one per line;
    * one argument  -- it names a file holding one file name per line;
    * two or more   -- the arguments themselves are the file names.

    Typical use::

        n = NextFile(sys.argv[1:])
        while n.more():
            name = n.nextfile()

    When names are read from a file handle, an empty line ends the
    iteration exactly like end-of-file does.
    """

    fh = None     # open handle when names come from stdin or a list file
    args = None   # the argument list when names come straight from argv
    val = None    # name fetched by the most recent successful more()
    count = 0     # index of the next entry of args to hand out

    def __init__(self, args):
        """Pass sys.argv[1:] and this will get the file list from a
        single file arg, the stdin, or argv list"""
        self.Nargs = len(args)
        if self.Nargs == 0:
            self.stdin = 1
            self.fh = sys.stdin
        elif self.Nargs == 1:
            self.file = 1
            self.fh = open(args[0], 'r')
        else:
            self.args = args

    def more(self):
        """Fetch the next name; return a true value if one was found.

        Returns a false value (False or None) once the names are
        exhausted, on an empty line, or if reading the handle fails.
        """
        if self.fh:
            try:
                line = self.fh.readline()
            except (IOError, OSError, ValueError):
                # Unreadable or already-closed handle: treat as "no more".
                return None
            # Strip only a real line terminator.  Blindly dropping the
            # last character would truncate a final line that has no
            # trailing newline.
            if line.endswith('\n'):
                line = line[:-1]
            self.val = line
            return len(self.val) > 0

        if self.count >= self.Nargs:
            return None
        self.val = self.args[self.count]
        self.count = self.count + 1
        return 1

    def nextfile(self):
        """Return the name fetched by the last call to more()."""
        return self.val

# One line of md5sum output: "<checksum><whitespace><file name>".
rgx = re.compile(r'([^\s]+)\s+(.*)$')
n = NextFile( sys.argv[1:] )
v = Verbose('extreme')
d = Debug('on')   #make this 'off' to actually alter the file system

link = 1  #replace the killed files with symlinks
s = {}    # md5 checksum -> first file seen with that checksum

savings = 0     # bytes that are (or would be) freed by removing duplicates
minSize = 5000  # don't remove files smaller than this


def _shell_quote(name):
    """Return name single-quoted so the shell passes it through verbatim."""
    return "'" + name.replace("'", "'\\''") + "'"


while n.more():
    fileName = n.nextfile()

    #Only operate on plain files
    if not os.path.isfile(fileName) or os.path.islink(fileName):
        if v.verbose('extreme'):
            print('Skipping non plainfile %s' % fileName)
        continue
    fileSize = os.path.getsize(fileName)
    if fileSize<=minSize:
        continue

    if v.verbose('extreme'):
        print('Computing md5sums for %s' % fileName)
    # Quote the name (spaces, quotes, shell metacharacters) and use '--'
    # so a name starting with '-' is not taken for an option.
    pipe = os.popen('md5sum -- %s' % _shell_quote(fileName))
    line = pipe.readline()
    pipe.close()
    match = rgx.match(line)
    if not match:
        continue
    # Use only the checksum from the output.  The name md5sum echoes back
    # may be escaped or lose leading whitespace, so keep using the name
    # we were given.
    checksum = match.group(1)
    if checksum.startswith('\\'):
        # md5sum prefixes the line with a backslash when it had to escape
        # characters in the file name; that is not part of the checksum.
        checksum = checksum[1:]

    if checksum in s:
        origFile = s[checksum]
        savings = savings + fileSize
        if v.verbose('moderate'):
            print('Removing file %s identical to %s' % (fileName, origFile))
            if link:
                print('\t...making symlink from %s to %s'  % (fileName, origFile))
        if not d.debug():
            os.remove(fileName)
            if link:
                # A relative target is resolved against the directory that
                # holds the link, not the current directory, so use an
                # absolute target to keep the link valid wherever the
                # duplicate lived.
                os.symlink(os.path.abspath(origFile), fileName)
    else:
        if v.verbose('unbearable'):
            print('Adding new file %s' % fileName)
        s[checksum] = fileName

if v.minimal():
    print('Total savings is %1.2fMB\n' % float(savings/1e6))

--- end kill_identical_files.py ---


--- begin JdhMixins.py ---

import types

class Debug:
    """A mixin class to set and test whether you are in debug mode.

    The flag is stored as an integer: 1 (or any non-zero value) means
    debug mode is on, 0 means it is off.
    """

    __s = {'on' : 1, 'off' : 0}   # accepted names -> stored flag value
    __debug = 1                   # class-level default: debug mode on

    def __init__(self, val='on'):
        """val can be a logic value (eg, 0 or 1) or a string ('on' or
        'off')"""

        self.set_debug(val)

    def set_debug(self, val):
        """Set the debug flag from 'on'/'off' or from an integer.

        Raises KeyError for a string other than 'on' or 'off', and
        ValueError for a value that is neither a string nor an integer.
        """
        if isinstance(val, str):
            # Store the mapped flag, not the name itself: the string
            # 'off' is truthy and would leave debug mode switched on.
            self.__debug = self.__s[val]
        elif isinstance(val, int):
            self.__debug = val
        else:
            raise ValueError(
                'Expected a string or and integer for debug value level')

    def is_debug(self):
        """Return the current debug flag (true when debug mode is on)."""
        return self.__debug

    def debug(self):
        """Return the current debug flag (true when debug mode is on)."""
        return self.__debug
    
class Verbose:
    """A mixin class for verbosity.

    A verbosity level is an integer; the named levels, from quietest to
    noisiest, are 'silent' (alias 'quiet'), 'minimal', 'moderate',
    'extreme' and 'unbearable'.  A message tagged with some level should
    be shown when the configured level is at least that high.
    """
    __SILENT = 0
    __MINIMAL = 10
    __MODERATE = 100
    __EXTREME = 1000
    __UNBEARABLE = 10000
    __level = __MINIMAL   # class-level default until set_verbose() runs

    # level name -> numeric level
    __s = {
        'silent' : __SILENT,
        'quiet' : __SILENT,
        'minimal' : __MINIMAL,
        'moderate' : __MODERATE,
        'extreme' : __EXTREME,
        'unbearable' : __UNBEARABLE
        }

    def __init__(self, level='moderate'):
        """level is a level name or an integer; see set_verbose()."""
        self.set_verbose(level)

    def verbose(self, val='minimal'):
        """Return true if the current level is at least val.

        val may be a level name or an integer threshold, so you can do

             class MyClass(Verbose):
                 ...
             if self.verbose(): print something
             if self.verbose('moderate'): print something else

        Raises KeyError if val is an unrecognized level name.
        """
        if isinstance(val, int):
            threshold = val
        else:
            threshold = self.__s[val]
        return self.__level >= threshold

    def set_verbose(self, level):
        """Set the current level from a level name or an integer.

        Raises ValueError for an unrecognized name or for a value that
        is neither a string nor an integer.
        """
        if isinstance(level, str):
            if level not in self.__s:
                raise ValueError('Unrecognized level  %s' % level)
            self.__level = self.__s[level]
        elif isinstance(level, int):
            self.__level = level
        else:
            raise ValueError('Expected a string or and integer for level')

    def is_silent(self):
        """Return true if the current level is exactly 'silent'."""
        return self.__level == self.__SILENT

    def minimal(self):
        """Return true if the current level is at least 'minimal'."""
        return self.__level >= self.__MINIMAL

    def moderate(self):
        """Return true if the current level is at least 'moderate'."""
        return self.__level >= self.__MODERATE

    def extreme(self):
        """Return true if the current level is at least 'extreme'."""
        return self.__level >= self.__EXTREME

    def unbearable(self):
        """Return true if the current level is at least 'unbearable'."""
        return self.__level >= self.__UNBEARABLE

--- end JdhMixins.py ---




More information about the Python-list mailing list