[Spambayes-checkins] spambayes/contrib tte.py,1.11,1.12

Sun Jul 25 05:00:55 CEST 2004

Update of /cvsroot/spambayes/spambayes/contrib
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3684

Modified Files:
	tte.py 
Log Message:
Add a --ratio=N flag to let the user adjust the number of spam messages
trained for each ham message.  This seems unfortunately necessary in the
spam-heavy world we live in.

Index: tte.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/contrib/tte.py,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** tte.py	10 Jul 2004 02:59:41 -0000	1.11
--- tte.py	25 Jul 2004 03:00:53 -0000	1.12
***************
*** 23,31 ****
            all messages score correctly.

! -c ext  - Cull all messages which aren't used as training input during any run
!           and write to new ham and spam files with ext as an extra file extension.
!           All messages which are never considered (because one training set is
!           longer than the other or the -m flag was used to reduce the amount of
!           input) are retained.

  -o sect:opt:val -
--- 23,31 ----
            all messages score correctly.

! -c ext  - Cull all messages which aren't used as training input during any
!           run and write to new ham and spam files with ext as an extra file
!           extension.  All messages which are never considered (because one
!           training set is longer than the other or the -m flag was used to
!           reduce the amount of input) are retained.

  -o sect:opt:val -
***************
*** 36,39 ****
--- 36,43 ----
  -R        Walk backwards through the mailbox.

+ --ratio=n Define the number of spam messages to be trained for each ham.
+           The default is 1, but given the sorry state of the Net's email
+           infrastructure these days you'll probably want to raise it.  Keep
+           it as close to 1 as you can...

  Note: The -c command line argument isn't quite as benign as it might first
***************
*** 91,95 ****
          return iter(seq)

! def train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose):
      smisses = hmisses = round = 0
      ham_cutoff = Options.options["Categorization", "ham_cutoff"]
--- 95,100 ----
          return iter(seq)

! def train(store, hambox, spambox, maxmsgs, maxrounds, tdict, reverse, verbose,
!           ratio):
      smisses = hmisses = round = 0
      ham_cutoff = Options.options["Categorization", "ham_cutoff"]
***************
*** 97,102 ****

      while round < maxrounds and (hmisses or smisses or round == 0):
!         hambone = mboxutils.getmbox(ham)
!         spamcan = mboxutils.getmbox(spam)
          if reverse:
              hambone = reversed(list(hambone))
--- 102,107 ----

      while round < maxrounds and (hmisses or smisses or round == 0):
!         hambone = mboxutils.getmbox(hambox)
!         spamcan = mboxutils.getmbox(spambox)
          if reverse:
              hambone = reversed(list(hambone))
***************
*** 111,124 ****
          try:
              while not maxmsgs or nmsgs < maxmsgs:
!                 hammsg = hambone.next()
!                 spammsg = spamcan.next()

!                 nmsgs += 2
                  sys.stdout.write("\r%5d" % nmsgs)
                  sys.stdout.flush()

!                 score = store.spamprob(tokenize(hammsg))
!                 selector = (hammsg["message-id"] or
!                             hammsg["subject"])
                  if score > ham_cutoff and selector is not None:
                      if verbose:
--- 116,137 ----
          try:
              while not maxmsgs or nmsgs < maxmsgs:
!                 ham = hambone.next()
!                 spams = []
!                 for i in range(ratio):
!                     try:
!                         spams.append(spamcan.next())
!                     except StopIteration:
!                         # no spams left so exit
!                         if not spams:
!                             raise
!                         # use what we've collected
!                         break

!                 nmsgs += 1 + len(spams)
                  sys.stdout.write("\r%5d" % nmsgs)
                  sys.stdout.flush()

!                 score = store.spamprob(tokenize(ham))
!                 selector = ham["message-id"] or ham["subject"]
                  if score > ham_cutoff and selector is not None:
                      if verbose:
***************
*** 126,142 ****
                              score, selector)
                      hmisses += 1
!                     tdict[hammsg["message-id"]] = True
!                     store.learn(tokenize(hammsg), False)

!                 score = store.spamprob(tokenize(spammsg))
!                 selector = (spammsg["message-id"] or
!                             spammsg["subject"])
!                 if score < spam_cutoff and selector is not None:
!                     if verbose:
!                         print >> sys.stderr, "miss spam: %.6f %s" % (
!                             score, selector)
!                     smisses += 1
!                     tdict[spammsg["message-id"]] = True
!                     store.learn(tokenize(spammsg), True)

          except StopIteration:
--- 139,156 ----
                              score, selector)
                      hmisses += 1
!                     tdict[ham["message-id"]] = True
!                     store.learn(tokenize(ham), False)

!                 for spam in spams:
!                     score = store.spamprob(tokenize(spam))
!                     selector = (spam["message-id"] or
!                                 spam["subject"])
!                     if score < spam_cutoff and selector is not None:
!                         if verbose:
!                             print >> sys.stderr, "miss spam: %.6f %s" % (
!                                 score, selector)
!                         smisses += 1
!                         tdict[spam["message-id"]] = True
!                         store.learn(tokenize(spam), True)

          except StopIteration:
***************
*** 176,180 ****
                                      "database=", "pickle=", "verbose",
                                      "option=", "max=", "maxrounds=",
!                                     "cullext=", "reverse"])
      except getopt.GetoptError, msg:
          usage(msg)
--- 190,194 ----
                                      "database=", "pickle=", "verbose",
                                      "option=", "max=", "maxrounds=",
!                                     "cullext=", "reverse", "ratio="])
      except getopt.GetoptError, msg:
          usage(msg)
***************
*** 186,189 ****
--- 200,204 ----
      verbose = False
      reverse = False
+     sh_ratio = 1
      for opt, arg in opts:
          if opt in ("-h", "--help"):
***************
*** 206,209 ****
--- 221,226 ----
          elif opt in ('-o', '--option'):
              Options.options.set_from_cmdline(arg, sys.stderr)
+         elif opt == '--ratio':
+             sh_ratio = int(arg)

      if ham is None or spam is None:
***************
*** 221,225 ****

      tdict = {}
!     train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose)

      store.store()
--- 238,243 ----

      tdict = {}
!     train(store, ham, spam, maxmsgs, maxrounds, tdict, reverse, verbose,
!           sh_ratio)

      store.store()