[Spambayes-checkins] spambayes mboxtrain.py,1.11,1.12

Wed Aug 13 16:59:54 EDT 2003

Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv1580

Modified Files:
	mboxtrain.py 
Log Message:
Implement patches from:
[ 788001 ] mboxtrain.py maildir bugfix and feature

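The main change is the addition of an "-r" switch that removes each mail
message from the Maildir after it has been trained on (Maildir only; the
trained database itself is not removed).  The other, minor improvements:
directory entries found inside the Maildir are now skipped, the value
written to the trained header is taken from the [Headers] options
(header_spam_string / header_ham_string) instead of being hard-coded, and
the "Reading ... as Maildir" progress message now includes the path.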

Index: mboxtrain.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/mboxtrain.py,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** mboxtrain.py	9 Jul 2003 06:35:49 -0000	1.11
--- mboxtrain.py	13 Aug 2003 22:59:52 -0000	1.12
***************
*** 35,39 ****

      -n  train mail residing in "new" directory, in addition to "cur" directory,
!         which is always trained
  """

--- 35,41 ----

      -n  train mail residing in "new" directory, in addition to "cur" directory,
!         which is always trained (Maildir only)
! 
!     -r  remove mail which was trained on (Maildir only)
  """

***************
*** 46,49 ****
--- 48,52 ----
  import sys, os, getopt
  from spambayes import hammie, mboxutils
+ from spambayes.Options import options

  program = sys.argv[0]
***************
*** 63,69 ****

      if is_spam:
!         spamtxt = "spam"
      else:
!         spamtxt = "ham"
      oldtxt = msg.get(TRAINED_HDR)
      if force:
--- 66,72 ----

      if is_spam:
!         spamtxt = options["Headers", "header_spam_string"]
      else:
!         spamtxt = options["Headers", "header_ham_string"]
      oldtxt = msg.get(TRAINED_HDR)
      if force:
***************
*** 83,90 ****
      return True

! def maildir_train(h, path, is_spam, force):
      """Train bayes with all messages from a maildir."""

!     if loud: print "  Reading as Maildir"

      import time
--- 86,93 ----
      return True

! def maildir_train(h, path, is_spam, force, removetrained):
      """Train bayes with all messages from a maildir."""

!     if loud: print "  Reading %s as Maildir" % (path,)

      import time
***************
*** 97,105 ****

      for fn in os.listdir(path):
-         counter += 1
          cfn = os.path.join(path, fn)
          tfn = os.path.normpath(os.path.join(path, "..", "tmp",
                             "%d.%d_%d.%s" % (time.time(), pid,
                                              counter, host)))
          if loud:
              sys.stdout.write("  %s        \r" % fn)
--- 100,110 ----

      for fn in os.listdir(path):
          cfn = os.path.join(path, fn)
          tfn = os.path.normpath(os.path.join(path, "..", "tmp",
                             "%d.%d_%d.%s" % (time.time(), pid,
                                              counter, host)))
+         if (os.path.isdir(cfn)):
+             continue
+         counter += 1
          if loud:
              sys.stdout.write("  %s        \r" % fn)
***************
*** 117,120 ****
--- 122,127 ----
          # people actually use Maildirs?
          os.rename(tfn, cfn)
+         if (removetrained):
+             os.unlink(cfn)

      if loud:
***************
*** 208,212 ****
                 (trained, counter))

! def train(h, path, is_spam, force, trainnew):
      if not os.path.exists(path):
          raise ValueError("Nonexistent path: %s" % path)
--- 215,219 ----
                 (trained, counter))

! def train(h, path, is_spam, force, trainnew, removetrained):
      if not os.path.exists(path):
          raise ValueError("Nonexistent path: %s" % path)
***************
*** 214,220 ****
          mbox_train(h, path, is_spam, force)
      elif os.path.isdir(os.path.join(path, "cur")):
!         maildir_train(h, os.path.join(path, "cur"), is_spam, force)
          if trainnew:
!             maildir_train(h, os.path.join(path, "new"), is_spam, force)
      elif os.path.isdir(path):
          mhdir_train(h, path, is_spam, force)
--- 221,227 ----
          mbox_train(h, path, is_spam, force)
      elif os.path.isdir(os.path.join(path, "cur")):
!         maildir_train(h, os.path.join(path, "cur"), is_spam, force, removetrained)
          if trainnew:
!             maildir_train(h, os.path.join(path, "new"), is_spam, force, removetrained)
      elif os.path.isdir(path):
          mhdir_train(h, path, is_spam, force)
***************
*** 237,241 ****

      try:
!         opts, args = getopt.getopt(sys.argv[1:], 'hfqnd:D:g:s:')
      except getopt.error, msg:
          usage(2, msg)
--- 244,248 ----

      try:
!         opts, args = getopt.getopt(sys.argv[1:], 'hfqnrd:D:g:s:')
      except getopt.error, msg:
          usage(2, msg)
***************
*** 248,251 ****
--- 255,259 ----
      force = False
      trainnew = False
+     removetrained = False
      good = []
      spam = []
***************
*** 263,266 ****
--- 271,276 ----
          elif opt == '-s':
              spam.append(arg)
+         elif opt == "-r":
+             removetrained = True
          elif opt == "-d":
              usedb = True
***************
*** 279,288 ****
      for g in good:
          if loud: print "Training ham (%s):" % g
!         train(h, g, False, force, trainnew)
          save = True

      for s in spam:
          if loud: print "Training spam (%s):" % s
!         train(h, s, True, force, trainnew)
          save = True

--- 289,298 ----
      for g in good:
          if loud: print "Training ham (%s):" % g
!         train(h, g, False, force, trainnew, removetrained)
          save = True

      for s in spam:
          if loud: print "Training spam (%s):" % s
!         train(h, s, True, force, trainnew, removetrained)
          save = True