[Spambayes-checkins] spambayes/utilities rebal.py,1.2,1.3

Tim Peters tim_one at users.sourceforge.net
Fri Dec 26 01:57:00 EST 2003


Update of /cvsroot/spambayes/spambayes/utilities
In directory sc8-pr-cvs1:/tmp/cvs-serv2235/utilities

Modified Files:
	rebal.py 
Log Message:
Beef up docs and comments.  Don't use "dir" as a variable name (it's the
name of a builtin function, so is needlessly confusing as a variable name).
Spell prefix as "prefix" instead of "pfx".  Always shrink lists from the
"right end", as that's a constant-time operation.  Eliminate massive
duplication of usage info strings.  Repair logic error:  when moving a
file from a Set directory to the reservoir, the file may need to be
renamed (if a file with the original name already existed in the
reservoir -- this is rare).  In that case, the code still stuck the
original name in its list of reservoir file names, which would lead to
a "file not found" error if the second phase tried to move that file out
of the reservoir again.  migrate() now returns the base name of the moved
file, so its caller can keep track of the actual name.


Index: rebal.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/utilities/rebal.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** rebal.py	17 Jan 2003 06:42:53 -0000	1.2
--- rebal.py	26 Dec 2003 06:56:58 -0000	1.3
***************
*** 2,7 ****
  
  """
! rebal.py - rebalance a ham or spam directory, moving files to or from
! a reservoir directory as necessary.
  
  usage: rebal.py [ options ]
--- 2,6 ----
  
  """
! rebal.py - rebalance a ham or spam test directory
  
  usage: rebal.py [ options ]
***************
*** 9,13 ****
     -d     - dry run; display what would be moved, but don't do it [%(DRYRUN)s]
     -r res - specify an alternate reservoir [%(RESDIR)s]
!    -s set - specify an alternate Set pfx [%(SETPFX)s]
     -n num - specify number of files per Set dir desired [%(NPERDIR)s]
     -v     - tell user what's happening [%(VERBOSE)s]
--- 8,12 ----
     -d     - dry run; display what would be moved, but don't do it [%(DRYRUN)s]
     -r res - specify an alternate reservoir [%(RESDIR)s]
!    -s set - specify an alternate Set prefix [%(SETPREFIX)s]
     -n num - specify number of files per Set dir desired [%(NPERDIR)s]
     -v     - tell user what's happening [%(VERBOSE)s]
***************
*** 15,21 ****
     -c     - confirm file moves into Set directory [%(CONFIRM)s]
     -Q     - don't confirm moves; this is independent of -v/-q
  
! The script will work with a variable number of Set directories, but they
! must already exist.
  
  Example:
--- 14,30 ----
     -c     - confirm file moves into Set directory [%(CONFIRM)s]
     -Q     - don't confirm moves; this is independent of -v/-q
+    -h     - display this message and quit
  
! Moves files among the Set subdirectories and a reservoir directory as
! necessary.  You should execute this script from the directory containing your
! Data directory.  By default, the Set1, Set2, ..., and reservoir subdirectories
! under (relative path) Data/Ham/ are rebalanced; this can be changed with the
! -s argument.  The script will work with a variable number of Set directories,
! but they must already exist, and the reservoir directory must also exist.
! 
! It's recommended that you run with the -d (dry run) option first, to see what
! the script would do without actually moving any files.  If, e.g., you
! accidentally mix up spam Sets with your Ham reservoir, it could be very
! difficult to recover from that mistake.
  
  Example:
***************
*** 33,37 ****
  
      rebal.py -n 0
!     rebal.py -n 300
  
  The first run will move all files from the various Data/Ham/Set directories
--- 42,46 ----
  
      rebal.py -n 0
!     rebal.py -n 300 -Q
  
  The first run will move all files from the various Data/Ham/Set directories
***************
*** 56,97 ****
  NPERDIR = 4000
  RESDIR = 'Data/Ham/reservoir'
! SETPFX = 'Data/Ham/Set'
  VERBOSE = True
  CONFIRM = True
  DRYRUN = False
  
! def usage(msg):
!     msg = str(msg)
      if msg:
!         print >> sys.stderr, msg
!     print >> sys.stderr, """\
! usage: rebal.py [ options ]
! options:
!    -d     - dry run; display what would be moved, but don't do it [%(DRYRUN)s]
!    -r res - specify an alternate reservoir [%(RESDIR)s]
!    -s set - specify an alternate Set pfx [%(SETPFX)s]
!    -n num - specify number of files per dir [%(NPERDIR)s]
!    -v     - tell user what's happening [%(VERBOSE)s]
!    -q     - be quiet about what's happening [not %(VERBOSE)s]
!    -c     - confirm file moves into Set directory [%(CONFIRM)s]
!    -Q     - be quiet and don't confirm moves
! """ % globals()
  
- def migrate(f, dir, verbose):
-     """rename f into dir, making sure to avoid name clashes."""
      base = os.path.split(f)[-1]
!     out = os.path.join(dir, base)
      while os.path.exists(out):
          basename, ext = os.path.splitext(base)
          digits = random.randrange(100000000)
!         out = os.path.join(dir, str(digits) + ext)
      if verbose:
          print "moving", f, "to", out
      os.rename(f, out)
  
  def main(args):
      nperdir = NPERDIR
      resdir = RESDIR
!     setpfx = SETPFX
      verbose = VERBOSE
      confirm = CONFIRM
--- 65,101 ----
  NPERDIR = 4000
  RESDIR = 'Data/Ham/reservoir'
! SETPREFIX = 'Data/Ham/Set'
  VERBOSE = True
  CONFIRM = True
  DRYRUN = False
  
! def usage(msg=None):
      if msg:
!         print >> sys.stderr, str(msg)
!     print >> sys.stderr, __doc__ % globals()
! 
! def migrate(f, targetdir, verbose):
!     """Move f into targetdir, renaming if needed to avoid name clashes.
! 
!        The basename of the moved file is returned; this may not be the
!        same as the basename of f, if the file had to be renamed because
!        a file with f's basename already existed in targetdir.
!     """
  
      base = os.path.split(f)[-1]
!     out = os.path.join(targetdir, base)
      while os.path.exists(out):
          basename, ext = os.path.splitext(base)
          digits = random.randrange(100000000)
!         out = os.path.join(targetdir, str(digits) + ext)
      if verbose:
          print "moving", f, "to", out
      os.rename(f, out)
+     return os.path.split(f)[-1]
  
  def main(args):
      nperdir = NPERDIR
      resdir = RESDIR
!     setprefix = SETPREFIX
      verbose = VERBOSE
      confirm = CONFIRM
***************
*** 110,114 ****
              resdir = arg
          elif opt == "-s":
!             setpfx = arg
          elif opt == "-v":
              verbose = True
--- 114,118 ----
              resdir = arg
          elif opt == "-s":
!             setprefix = arg
          elif opt == "-v":
              verbose = True
***************
*** 122,141 ****
              dryrun = True
          elif opt == "-h":
!             usage('')
              return 0
  
      res = os.listdir(resdir)
  
!     dirs = glob.glob(setpfx+"*")
      if dirs == []:
!         print >> sys.stderr, "no directories beginning with", setpfx, "exist."
          return 1
  
      stuff = []
      n = len(res)
!     for dir in dirs:
!         fs = os.listdir(dir)
          n += len(fs)
!         stuff.append((dir, fs))
  
      if nperdir * len(dirs) > n:
--- 126,149 ----
              dryrun = True
          elif opt == "-h":
!             usage()
              return 0
+         else:
+             raise SystemError("internal error on option '%s'" % opt)
  
      res = os.listdir(resdir)
  
!     dirs = glob.glob(setprefix + "*")
      if dirs == []:
!         print >> sys.stderr, "no directories starting with", setprefix, "exist."
          return 1
  
+     # stuff <- list of (directory, files) pairs, where directory is the
+     # name of a Set subdirectory, and files is a list of files in that dir.
      stuff = []
      n = len(res)
!     for d in dirs:
!         fs = os.listdir(d)
          n += len(fs)
!         stuff.append((d, fs))
  
      if nperdir * len(dirs) > n:
***************
*** 144,149 ****
  
      # weak check against mixing ham and spam
!     if (setpfx.find("Ham") >= 0 and resdir.find("Spam") >= 0 or
!         setpfx.find("Spam") >= 0 and resdir.find("Ham") >= 0):
          yn = raw_input("Reservoir and Set dirs appear not to match. "
                         "Continue? (y/n) ")
--- 152,157 ----
  
      # weak check against mixing ham and spam
!     if ((setprefix.find("Ham") >= 0 and resdir.find("Spam") >= 0) or
!         (setprefix.find("Spam") >= 0 and resdir.find("Ham") >= 0)):
          yn = raw_input("Reservoir and Set dirs appear not to match. "
                         "Continue? (y/n) ")
***************
*** 151,183 ****
              return 1
  
!     # if necessary, migrate random files to the reservoir
!     for (dir, fs) in stuff:
!         if nperdir >= len(fs):
              continue
  
          random.shuffle(fs)
          movethese = fs[nperdir:]
          del fs[nperdir:]
          if dryrun:
!             print "would move", len(movethese), "files from", dir, \
                    "to reservoir", resdir
          else:
              for f in movethese:
!                 migrate(os.path.join(dir, f), resdir, verbose)
!         res.extend(movethese)
  
!     # randomize reservoir once so we can just bite chunks from the front
      random.shuffle(res)
  
!     # grow Set* directories from the reservoir
!     for (dir, fs) in stuff:
          if nperdir == len(fs):
              continue
  
!         movethese = res[:nperdir-len(fs)]
!         res = res[nperdir-len(fs):]
          if dryrun:
              print "would move", len(movethese), "files from reservoir", \
!                   resdir, "to", dir
          else:
              for f in movethese:
--- 159,196 ----
              return 1
  
!     # If necessary, migrate random files to the reservoir.
!     for (d, fs) in stuff:
!         if len(fs) <= nperdir:
              continue
  
+         # Retain only nperdir files, moving the rest to reservoir.
          random.shuffle(fs)
          movethese = fs[nperdir:]
          del fs[nperdir:]
          if dryrun:
!             print "would move", len(movethese), "files from", d, \
                    "to reservoir", resdir
+             res.extend(movethese)
          else:
              for f in movethese:
!                 newname = migrate(os.path.join(d, f), resdir, verbose)
!                 res.append(newname)
  
!     # Randomize reservoir once so we can just bite chunks from the end.
      random.shuffle(res)
  
!     # Grow Set* directories from the reservoir as needed.
!     for (d, fs) in stuff:
!         assert len(fs) <= nperdir
          if nperdir == len(fs):
              continue
  
!         numtomove = nperdir - len(fs)
!         assert 0 < numtomove <= len(res)
!         movethese = res[-numtomove:]
!         del res[-numtomove:]
          if dryrun:
              print "would move", len(movethese), "files from reservoir", \
!                   resdir, "to", d
          else:
              for f in movethese:
***************
*** 187,192 ****
                      if not ok.startswith('y'):
                          continue
!                 migrate(os.path.join(resdir, f), dir, verbose)
!         fs.extend(movethese)
  
      return 0
--- 200,204 ----
                      if not ok.startswith('y'):
                          continue
!                 migrate(os.path.join(resdir, f), d, verbose)
  
      return 0





More information about the Spambayes-checkins mailing list