[Spambayes-checkins] spambayes clpik.py,NONE,1.1 README.txt,1.31,1.32

Tim Peters tim_one@users.sourceforge.net
Fri, 04 Oct 2002 21:22:52 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv25637

Modified Files:
	README.txt 
Added Files:
	clpik.py 
Log Message:
An example analysis program showing how to access the pickles
produced by clgen.py, and how to generate potentially interesting
histograms from them.


--- NEW FILE: clpik.py ---
#! /usr/bin/env python

# Analyze a clim.pik file.

"""Usage: %(program)s  [options] [central_limit_pickle_file]

An example analysis program showing how to access info from a central-limit
pickle file created by clgen.py.  This program produces histograms of
various things.

Scores for all predictions are saved at the end of binary pickle clim.pik.
This contains two lists of tuples, the first list with a tuple for every
ham predicted, the second list with a tuple for every spam predicted.  Each
tuple has these values:

    tag         the msg identifier
    is_spam     True if msg came from a spam Set, False if from a ham Set
    zham        the msg zscore relative to the population ham
    zspam       the msg zscore relative to the population spam
    hmean       the raw mean ham score
    smean       the raw mean spam score
    n           the number of clues used to judge this msg

Note that hmean and smean are the same under use_central_limit; they're
very likely to differ under use_central_limit2.

Where:
    -h
        Show usage and exit.
    -n int
        Number of histogram buckets to display.  Default 100.

If no file is named on the cmdline, clim.pik is used.
"""

import sys
import cPickle as pickle

from Histogram import Hist

# Default name of the central-limit pickle file.  main() assigns the same
# default to its own local variable of the same name.
fname = 'clim.pik'

# Script name as invoked; substituted for %(program)s in the usage text.
program = sys.argv[0]

def usage(code, msg=''):
    """Write the usage text to stderr, then exit with status `code`.

    The usage text is the module docstring with its %(name)s fields filled
    in from the module globals.  When `msg` is non-empty it is printed
    first, followed by a blank line.
    """
    err = sys.stderr
    if msg:
        # Caller-supplied error text goes ahead of the generic usage text.
        print >> err, msg
        print >> err
    text = __doc__ % globals()
    print >> err, text
    sys.exit(code)

def dump(nbuckets, tag, n, hmean, zham, smean, zspam):
    for msg, hist in [('# words', n),
                      ('ham mean', hmean),
                      ('ham zscore', zham),
                      ('spam mean', smean),
                      ('spam zscore', zspam)]:
        print
        print tag, msg + ':',
        hist.display(nbuckets)

def drive(fname, nbuckets):
    print 'Reading', fname, '...'
    f = open(fname, 'rb')
    ham = pickle.load(f)
    spam = pickle.load(f)
    f.close()

    print 'Building histograms for', len(ham), 'ham &', len(spam), 'spam'
    ham_n = Hist(lo=None, hi=None)
    spam_n = Hist(lo=None, hi=None)

    ham_as_ham_mean   = Hist(lo=None, hi=None)
    ham_as_spam_mean  = Hist(lo=None, hi=None)
    spam_as_ham_mean  = Hist(lo=None, hi=None)
    spam_as_spam_mean = Hist(lo=None, hi=None)

    ham_as_ham_zscore   = Hist(lo=None, hi=None)
    ham_as_spam_zscore  = Hist(lo=None, hi=None)
    spam_as_ham_zscore  = Hist(lo=None, hi=None)
    spam_as_spam_zscore = Hist(lo=None, hi=None)

    for msgid, is_spam, zham, zspam, hmean, smean, n in ham:
        ham_n.add(n)
        ham_as_ham_mean.add(hmean)
        ham_as_ham_zscore.add(zham)
        ham_as_spam_mean.add(smean)
        ham_as_spam_zscore.add(zspam)

    dump(nbuckets, 'ham', ham_n, ham_as_ham_mean, ham_as_ham_zscore,
         ham_as_spam_mean, ham_as_spam_zscore)

    for msgid, is_spam, zham, zspam, hmean, smean, n in spam:
        spam_n.add(n)
        spam_as_ham_mean.add(hmean)
        spam_as_ham_zscore.add(zham)
        spam_as_spam_mean.add(smean)
        spam_as_spam_zscore.add(zspam)

    dump(nbuckets, 'spam', spam_n, spam_as_ham_mean, spam_as_ham_zscore,
         spam_as_spam_mean, spam_as_spam_zscore)

def main():
    """Parse the command line and run the analysis.

    Options:
        -h      print the usage text and exit with status 0
        -n int  number of histogram buckets to display (default 100)

    At most one positional argument, the pickle file to read, is accepted;
    without one, 'clim.pik' is used.  Any command-line error prints the
    usage text and exits with status 1.
    """
    import getopt

    try:
        # NOTE(review): --ham-keep/--spam-keep are accepted here but never
        # acted on below and are not in the usage text; they look like
        # leftovers.  Kept so existing invocations keep working -- confirm
        # before removing.
        opts, args = getopt.getopt(sys.argv[1:], 'hn:',
                                   ['ham-keep=', 'spam-keep='])
    except getopt.error, msg:
        usage(1, msg)

    nbuckets = 100
    for opt, arg in opts:
        if opt == '-h':
            usage(0)
        elif opt == '-n':
            # Report a malformed count as a usage error, not a traceback.
            try:
                nbuckets = int(arg)
            except ValueError:
                usage(1, "-n requires an integer argument, not %r" % arg)

    fname = 'clim.pik'
    if args:
        fname = args.pop(0)
    if args:
        usage(1, "No more than one positional argument allowed")

    drive(fname, nbuckets)

# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Index: README.txt
===================================================================
RCS file: /cvsroot/spambayes/spambayes/README.txt,v
retrieving revision 1.31
retrieving revision 1.32
diff -C2 -d -r1.31 -r1.32
*** README.txt	5 Oct 2002 02:53:43 -0000	1.31
--- README.txt	5 Oct 2002 04:22:49 -0000	1.32
***************
*** 180,184 ****
      internal information about every prediction made.  This will go
      away someday.
!     XXX Still need tools to analyze this data.
  
  
--- 180,188 ----
      internal information about every prediction made.  This will go
      away someday.
! 
! clpik.py
!     An example analysis program showing how to access the pickles
!     produced by clgen.py, and how to generate potentially interesting
!     histograms from them.