[Spambayes-checkins] spambayes rebal.py,1.1,1.2

Skip Montanaro montanaro@users.sourceforge.net
Thu, 12 Sep 2002 12:33:56 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv1160

Modified Files:
	rebal.py 
Log Message:
Nearly complete rewrite that attempts to achieve the following:

    * allows specification of reservoir directory and prefix of set
      directories

    * will automatically fill any set directories which match the -s prefix

    * will migrate files in either direction - in theory, no files should be
      deleted

    * should be a bit more efficient, so varying the number of trained ham
      and spam files shouldn't be a big problem

With no arguments it should work like the original.


Index: rebal.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/rebal.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** rebal.py	5 Sep 2002 16:16:43 -0000	1.1
--- rebal.py	12 Sep 2002 19:33:54 -0000	1.2
***************
*** 1,58 ****
! import os
! import sys
! import random
  
- '''
- dead = """
- Data/Ham/Set2/22467.txt
- Data/Ham/Set5/31389.txt
- Data/Ham/Set1/19642.txt
  """
  
! for f in dead.split():
!     os.unlink(f)
  
! sys.exit(0)
! '''
  
  NPERDIR = 4000
  RESDIR = 'Data/Ham/reservoir'
! res = os.listdir(RESDIR)
  
! stuff = []
! for i in range(1, 6):
!     dir = 'Data/Ham/Set%d' % i
!     fs = os.listdir(dir)
!     stuff.append((dir, fs))
  
! while stuff:
!     dir, fs = stuff.pop()
!     if len(fs) == NPERDIR:
!         continue
  
!     if len(fs) > NPERDIR:
!         f = random.choice(fs)
!         fs.remove(f)
!         print "deleting", f, "from", dir
!         os.unlink(dir + "/" + f)
  
!     elif len(fs) < NPERDIR:
!         print "need a new one for", dir
!         f = random.choice(res)
!         print "How about", f
!         res.remove(f)
  
!         fp = file(RESDIR + "/" + f, 'rb')
!         guts = fp.read()
!         fp.close()
!         os.unlink(RESDIR + "/" + f)
  
!         print guts
!         ok = raw_input('good enough? ')
!         if ok.startswith('y'):
!             fp = file(dir + "/" + f, 'wb')
!             fp.write(guts)
!             fp.close()
!             fs.append(f)
  
!     stuff.append((dir, fs))
--- 1,166 ----
! #!/usr/bin/env python
  
  """
+ rebal.py - rebalance a ham or spam directory, moving files to or from
+ a reservoir directory as necessary.
  
! usage: rebal.py [ options ]
! options:
!    -r res - specify an alternate reservoir [%(RESDIR)s]
!    -s set - specify an alternate Set pfx [%(SETPFX)s]
!    -n num - specify number of files per dir [%(NPERDIR)s]
!    -v     - tell user what's happening [%(VERBOSE)s]
!    -q     - be quiet about what's happening [not %(VERBOSE)s]
!    -c     - confirm file moves into Set directory [%(CONFIRM)s]
!    -Q     - be quiet and don't confirm moves
  
! The script will work with a variable number of Set directories, but they
! must already exist.
! 
! Example: 
! 
!     rebal.py -r reservoir -s Set -n 300
! 
! This will move random files between the directory 'reservoir' and the
! various subdirectories prefixed with 'Set', making sure no more than 300
! files are left in the 'Set' directories when finished.
! 
! Example:
! 
! Suppose you want to shuffle your Set files around, winding up with 300 files
! in each one, you can execute:
! 
!     rebal.py -n 0
!     rebal.py -n 300
! 
! The first run will move all files from the various Data/Ham/Set directories
! to the Data/Ham/reservoir directory.  The second run will randomly parcel
! out 300 files to each of the Data/Ham/Set directories.
! """
! 
! import os
! import sys
! import random
! import glob
! import getopt
  
+ # defaults
  NPERDIR = 4000
  RESDIR = 'Data/Ham/reservoir'
! SETPFX = 'Data/Ham/Set'
! VERBOSE = True
! CONFIRM = True
  
! def usage():
!     print >> sys.stderr, """\
! usage: rebal.py [ options ]
! options:
!    -r res - specify an alternate reservoir [%(RESDIR)s]
!    -s set - specify an alternate Set pfx [%(SETPFX)s]
!    -n num - specify number of files per dir [%(NPERDIR)s]
!    -v     - tell user what's happening [%(VERBOSE)s]
!    -q     - be quiet about what's happening [not %(VERBOSE)s]
!    -c     - confirm file moves into Set directory [%(CONFIRM)s]
!    -Q     - be quiet and don't confirm moves
! """ % globals()
!     
! def migrate(f, dir, verbose):
!     """rename f into dir, making sure to avoid name clashes."""
!     base = os.path.split(f)[-1]
!     if os.path.exists(os.path.join(dir,base)):
!         # this path can get slow if we have a lot of name collisions
!         # but we should rarely encounter that case (so he says smugly)
!         reslist = [int(n) for n in os.listdir(dir)]
!         reslist.sort()
!         out = os.path.join(dir, "%d"%(reslist[-1]+1))
!     else:
!         out = os.path.join(dir, base)
!     if verbose:
!         print "moving", f, "to", out
!     os.rename(f, out)
!     
! def main(args):
!     nperdir = NPERDIR
!     resdir = RESDIR
!     setpfx = SETPFX
!     verbose = VERBOSE
!     confirm = CONFIRM
!     
!     try:
!         opts, args = getopt.getopt(args, "r:s:n:vqcQh")
!     except getopt.GetoptError:
!         usage()
!         return 1
  
!     for opt, arg in opts:
!         if opt == "-n":
!             nperdir = int(arg)
!         elif opt == "-r":
!             resdir = arg
!         elif opt == "-s":
!             setpfx = arg
!         elif opt == "-v":
!             verbose = True
!         elif opt == "-c":
!             confirm = True
!         elif opt == "-q":
!             verbose = False
!         elif opt == "-Q":
!             verbose = confirm = False
!         elif opt == "-h":
!             usage()
!             return 0
  
!     res = os.listdir(resdir)
  
!     dirs = glob.glob(setpfx+"*")
!     if dirs == []:
!         print >> sys.stderr, "no directories beginning with", setpfx, "exist."
!         return 1
  
!     stuff = []
!     n = len(res)
!     for dir in dirs:
!         fs = os.listdir(dir)
!         n += len(fs)
!         stuff.append((dir, fs))
  
!     if nperdir * len(dirs) > n:
!         print >> sys.stderr, "not enough files to go around - use lower -n."
!         return 1
  
!     # if necessary, migrate random files to the reservoir
!     for (dir, fs) in stuff:
!         if nperdir >= len(fs):
!             continue
! 
!         random.shuffle(fs)
!         movethese = fs[nperdir:]
!         del fs[nperdir:]
!         for f in movethese:
!             migrate(os.path.join(dir,f), resdir, verbose)
!         res.extend(movethese)
! 
!     # randomize reservoir once so we can just bite chunks from the front
!     random.shuffle(res)
! 
!     # grow Set* directories from the reservoir
!     for (dir, fs) in stuff:
!         if nperdir == len(fs):
!             continue
! 
!         movethese = res[:nperdir-len(fs)]
!         res = res[nperdir-len(fs):]
!         for f in movethese:
!             if confirm:
!                 print file(os.path.join(resdir,f)).read()
!                 ok = raw_input('good enough? ').lower()
!                 if not ok.startswith('y'):
!                     continue
!             migrate(os.path.join(resdir,f), dir, verbose)
!         fs.extend(movethese)
! 
!     return 0
! 
! if __name__ == "__main__":
!     sys.exit(main(sys.argv[1:]))