[Spambayes-checkins] spambayes pop3graph.py,NONE,1.1

Richie Hindle richiehindle@users.sourceforge.net
Wed Nov 20 12:30:18 2002


Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv17757

Added Files:
	pop3graph.py 
Log Message:
Script for producing ASCII graphs of classifier performance, based
on pop3proxy corpuses.


--- NEW FILE: pop3graph.py ---
"""Analyse the pop3proxy's caches and produce a graph of how accurate
the classifier has been over time.  Only really meaningful if you started
with an empty database."""

from __future__ import division

import sys, mboxutils
from FileCorpus import FileCorpus, FileMessageFactory, GzipFileMessageFactory
from Options import options

def main():
    """Print an ASCII graph of the classifier's cumulative success.

    Every message in the pop3proxy spam and ham caches (as configured in
    ``options``) is loaded and tagged with the disposition implied by the
    cache it was found in ('Yes' for spam, 'No' for ham).  The messages
    are then walked in sorted-key order and a message counts as a success
    when the value of its ``options.hammie_header_name`` header equals
    that disposition.

    The graph is written to standard output: '.' plots the number of
    messages seen so far, '*' plots the number of those that were
    classified correctly.  Nothing is returned.
    """
    # Create the corpuses and the factory that reads the messages.
    if options.pop3proxy_cache_use_gzip:
        messageFactory = GzipFileMessageFactory()
    else:
        messageFactory = FileMessageFactory()
    spamCorpus = FileCorpus(messageFactory, options.pop3proxy_spam_cache)
    hamCorpus = FileCorpus(messageFactory, options.pop3proxy_ham_cache)

    # Read in all the trained messages, remembering which cache each one
    # came from so it can be compared with the classifier's header later.
    allTrained = {}
    for corpus, disposition in [(spamCorpus, 'Yes'), (hamCorpus, 'No')]:
        for m in corpus:
            message = mboxutils.get_message(m.getSubstance())
            message._pop3CacheDisposition = disposition
            allTrained[m.key()] = message

    # Sort the messages by key (treated here as arrival order), then work
    # out a scaling factor for the graph - 'limit' is the widest it can be
    # in characters.
    keys = list(allTrained.keys())
    keys.sort()
    limit = 70
    if len(keys) < limit:
        scale = 1
    else:
        scale = len(keys) // (limit // 2)

    # Build the data - an array of cumulative success indexed by count.
    # One sample is taken every 'scale' messages, already divided down to
    # graph rows.
    count = successful = 0
    successByCount = []
    for key in keys:
        message = allTrained[key]
        disposition = message[options.hammie_header_name]
        if message._pop3CacheDisposition == disposition:
            successful += 1
        count += 1
        if count % scale == (scale - 1):
            successByCount.append(successful // scale)

    # Build the graph, as a list of rows of characters.  Row 0 is the
    # bottom of the graph until the list is reversed below.
    size = count // scale
    graph = [[" " for i in range(size + 3)] for j in range(size)]
    for c in range(size):
        graph[c][1] = "|"
        graph[c][c + 3] = "."
        # Successes can never outnumber messages, so the '*' must not sit
        # above the '.' in the same column.  With scale == 1 the sampled
        # value can be c + 1, which used to index past the last row (an
        # IndexError when every message was classified correctly); clamp
        # it to the '.' row instead.
        graph[min(successByCount[c], c)][c + 3] = "*"
    graph.reverse()

    # Print the graph.  Single-argument parenthesised prints produce the
    # same output whether print is a statement or a function.
    print("\n   Success of the classifier over time:\n")
    print("   . - Number of messages over time")
    print("   * - Number of correctly classified messages over time\n\n")
    for row in range(size):
        line = ''.join(graph[row])
        if row == 0:
            # Top row: label with the total number of messages.
            print(line + " %d" % count)
        elif row == (count - successful) // scale:
            # Label the row at the height of the final success count.
            print(line + " %d" % successful)
        else:
            print(line)
    print(" " + "_" * (size + 2))

# Produce the graph only when run as a script, not when imported.
if __name__ == "__main__":
    main()





More information about the Spambayes-checkins mailing list