[Spambayes-checkins] spambayes hammie.py,1.5,1.6

Guido van Rossum gvanrossum@users.sourceforge.net
Fri, 06 Sep 2002 21:20:45 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv4871

Modified Files:
	hammie.py 
Log Message:
Fixed a bug in the opening of a folder given with "+foo" (wasn't using
_factory).

Added a -u option similar to that of GBayes.py.  For this, factored the
opening of the mbox out of train() into a separate function getmbox(),
and the formatting of the clues out of filter() into formatclues().

(The -u option needs work; it currently doesn't report the message
number in a very useful way.)


Index: hammie.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/hammie.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** hammie.py	6 Sep 2002 20:48:29 -0000	1.5
--- hammie.py	7 Sep 2002 04:20:43 -0000	1.6
***************
*** 10,16 ****
          show usage and exit
      -g PATH
!         mbox or directory of known good messages (non-spam)
      -s PATH
!         mbox or directory of known spam messages
      -p FILE
          use file as the persistent store.  loads data from this file if it
--- 10,18 ----
          show usage and exit
      -g PATH
!         mbox or directory of known good messages (non-spam) to train on.
      -s PATH
!         mbox or directory of known spam messages to train on.
!     -u PATH
!         mbox of unknown messages.  A ham/spam decision is reported for each.
      -p FILE
          use file as the persistent store.  loads data from this file if it
***************
*** 179,184 ****
  
  
! def train(bayes, msgs, is_spam):
!     """Train bayes with all messages from a mailbox."""
      def _factory(fp):
          try:
--- 181,186 ----
  
  
! def getmbox(msgs):
!     """Return an iterable mbox object given a file/directory/folder name."""
      def _factory(fp):
          try:
***************
*** 190,194 ****
          import mhlib
          mh = mhlib.MH()
!         mbox = mailbox.MHMailbox(os.path.join(mh.getpath(), msgs[1:]))
      elif os.path.isdir(msgs):
          # XXX Bogus: use an MHMailbox if the pathname contains /Mail/,
--- 192,197 ----
          import mhlib
          mh = mhlib.MH()
!         mbox = mailbox.MHMailbox(os.path.join(mh.getpath(), msgs[1:]),
!                                  _factory)
      elif os.path.isdir(msgs):
          # XXX Bogus: use an MHMailbox if the pathname contains /Mail/,
***************
*** 201,205 ****
--- 204,212 ----
          fp = open(msgs)
          mbox = mailbox.PortableUnixMailbox(fp, _factory)
+     return mbox
  
+ def train(bayes, msgs, is_spam):
+     """Train bayes with all messages from a mailbox."""
+     mbox = getmbox(msgs)
      i = 0
      for msg in mbox:
***************
*** 212,215 ****
--- 219,227 ----
      print
  
+ def formatclues(clues, sep="; "):
+     """Format the clues into something readable."""
+     # XXX Maybe sort by prob first?
+     return sep.join(["%r: %.2f" % (word, prob) for word, prob in clues])
+ 
  def filter(bayes, input, output):
      """Filter (judge) a message"""
***************
*** 221,228 ****
          disp = "Yes"
      disp += "; %.2f" % prob
!     disp += "; " + "; ".join(map(lambda x: "%s: %.2f" % (`x[0]`, x[1]), clues))
      msg.add_header("X-Spam-Disposition", disp)
      output.write(str(msg))
  
  def usage(code, msg=''):
      if msg:
--- 233,259 ----
          disp = "Yes"
      disp += "; %.2f" % prob
!     disp += "; " + formatclues(clues)
      msg.add_header("X-Spam-Disposition", disp)
      output.write(str(msg))
  
+ def score(bayes, msgs):
+     """Score (judge) all messages from a mailbox."""
+     # XXX The reporting needs work!
+     mbox = getmbox(msgs)
+     i = 0
+     spams = hams = 0
+     for msg in mbox:
+         i += 1
+         prob, clues = bayes.spamprob(tokenize(str(msg)), True)
+         isspam = prob >= 0.9
+         print "%6d %4.2f %1s" % (i, prob, isspam and "S" or "."),
+         if isspam:
+             spams += 1
+             print formatclues(clues)
+         else:
+             hams += 1
+             print
+     print "Total %d spam, %d ham" % (spams, hams)
+ 
  def usage(code, msg=''):
      if msg:
***************
*** 234,238 ****
  def main():
      try:
!         opts, args = getopt.getopt(sys.argv[1:], 'hdfg:s:p:')
      except getopt.error, msg:
          usage(1, msg)
--- 265,269 ----
  def main():
      try:
!         opts, args = getopt.getopt(sys.argv[1:], 'hdfg:s:p:u:')
      except getopt.error, msg:
          usage(1, msg)
***************
*** 242,246 ****
  
      pck = "hammie.db"
!     good = spam = None
      do_filter = usedb = False
      for opt, arg in opts:
--- 273,277 ----
  
      pck = "hammie.db"
!     good = spam = unknown = None
      do_filter = usedb = False
      for opt, arg in opts:
***************
*** 257,260 ****
--- 288,293 ----
          elif opt == "-f":
              do_filter = True
+         elif opt == '-u':
+             unknown = arg
      if args:
          usage(1)
***************
*** 294,297 ****
--- 327,333 ----
      if do_filter:
          filter(bayes, sys.stdin, sys.stdout)
+ 
+     if unknown:
+         score(bayes, unknown)
  
  if __name__ == "__main__":