[Spambayes-checkins] spambayes rmspik.py,NONE,1.1

Tim Peters tim_one@users.sourceforge.net
Sat, 05 Oct 2002 16:46:01 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv13010

Added Files:
	rmspik.py 
Log Message:
This is Rob Hooft's central-limit binary-pickle "RMS ZScore" emulator,
edited for coding style (long lines, whitespace).


--- NEW FILE: rmspik.py ---
#! /usr/bin/env python

# Analyze a clim.pik file.

"""Usage: %(program)s  [options] [central_limit_pickle_file]

An example analysis program showing to access info from a central-limit
pickle file created by clgen.py.  This program produces histograms of
various things.

Scores for all predictions are saved at the end of binary pickle clim.pik.
This contains two lists of tuples, the first list with a tuple for every
ham predicted, the second list with a tuple for every spam predicted.  Each
tuple has these values:

    tag         the msg identifier
    is_spam     True if msg came from a spam Set, False if from a ham Set
    zham        the msg zscore relative to the population ham
    zspam       the msg zscore relative to the population spam
    hmean       the raw mean ham score
    smean       the raw mean spam score
    n           the number of clues used to judge this msg

Note that hmean and smean are the same under use_central_limit; they're
very likely to differ under use_central_limit2.

Where:
    -h
        Show usage and exit.

If no file is named on the cmdline, clim.pik is used.
"""

surefactor = 1000 # This is basically the inverse of the accepted fp/fn rate
punsure = False # Print unsure decisions (otherwise only sure-but-false)

import sys,math,os
import cPickle as pickle

program = sys.argv[0]

def usage(code, msg=''):
    """Print usage message and sys.exit(code)."""
    if msg:
        print >> sys.stderr, msg
        print >> sys.stderr
    print >> sys.stderr, __doc__ % globals()
    sys.exit(code)

def chance(x):
    if x>=0:
        return 1.0
    x=-x/math.sqrt(2)
    if x<1.4:
        return 1.0
    assert x>=1.4
    x=float(x)
    pre=math.exp(-x**2)/math.sqrt(math.pi)/x
    post=1-(1/(2*x**2))
    return pre*post

knownfalse = {}

def readknownfalse():
    global knownfalse
    knownfalse = {}
    try:
        f = open('knownfalse.dat')
    except IOError:
        return
    for line in f:
        key, desc = line.split(None, 1)
        knownfalse[key] = desc[:-1]
    f.close()
    print "%d descriptions from knownfalse.dat" % len(knownfalse)

def prknown(tag):
    bn = os.path.basename(tag)
    if bn in knownfalse:
        print " ==>", knownfalse[bn]

def drive(fname):
    print 'Reading', fname, '...'
    f = open(fname, 'rb')
    ham = pickle.load(f)
    spam = pickle.load(f)
    f.close()

    zhamsum2 = 0.0
    nham = 0
    for msg in ham:
        if msg[1]:
            print "spam in ham",msg
        else:
            zhamsum2 += msg[2]**2
            nham += 1
    rmszham = math.sqrt(zhamsum2 / nham)
    print "Nham=", nham
    print "RmsZham=", rmszham

    zspamsum2 = 0.0
    nspam = 0
    for msg in spam:
        if not msg[1]:
            print "ham in spam",msg
        else:
            zspamsum2 += msg[3]**2
            nspam += 1
    rmszspam = math.sqrt(zspamsum2 / nspam)
    print "Nspam=", nspam
    print "RmsZspam=", rmszspam

    #========= Analyze ham
    print "=" * 70
    print "HAM:"
    nsureok = nunsureok = nunsurenok = nsurenok = 0
    for msg in ham:
        zham = msg[2] / rmszham
        zspam = msg[3] / rmszspam
        cham = chance(zham)
        cspam = chance(zspam)
        if cham > surefactor*cspam and cham > 0.01:
            nsureok += 1 # very certain
        elif cham > cspam:
            nunsureok += 1
            #print "Unsure",msg[0]
            #prknown(msg[0])
        else:
            if cspam > surefactor*cham and cspam > 0.01:
                reason = "SURE!"
                nsurenok += 1
            elif cham < 0.01 and cspam < 0.01:
                reason = "neither?"
                nunsurenok += 1
            elif cham > 0.1 and cspam > 0.1:
                reason = "both?"
                nunsurenok += 1
            else:
                reason = "Unsure"
                nunsurenok += 1
            if reason=="SURE!" or punsure:
                print "FALSE POSITIVE: zham=%.2f zspam=%.2f %s %s" % (
                      zham, zspam, msg[0], reason)
                prknown(msg[0])
    print "Sure/ok      ", nsureok
    print "Unsure/ok    ", nunsureok
    print "Unsure/not ok", nunsurenok
    print "Sure/not ok  ", nsurenok
    print "Unsure rate = %.2f%%" % (100.*(nunsureok + nunsurenok) / len(ham))
    print "Sure fp rate = %.2f%%; Unsure fp rate = %.2f%%" % (
          100.*nsurenok / (nsurenok + nsureok),
          100.*nunsurenok / (nunsurenok + nunsureok))
    #========= Analyze spam
    print "="*70
    print "SPAM:"
    nsureok = nunsureok = nunsurenok = nsurenok = 0
    for msg in spam:
        zham = msg[2] / rmszham
        zspam = msg[3] / rmszspam
        cham = chance(zham)
        cspam = chance(zspam)
        if cspam > surefactor*cham and cspam > 0.01:
            nsureok += 1 # very certain
        elif cspam > cham:
            nunsureok += 1
            #print "Unsure",msg[0]
            #prknown(msg[0])
        else:
            if cham > surefactor*cspam and cham > 0.01:
                reason = "SURE!"
                nsurenok += 1
            elif cham < 0.01 and cspam < 0.01:
                reason = "neither?"
                nunsurenok += 1
            elif cham > 0.1 and cspam > 0.1:
                reason = "both?"
                nunsurenok += 1
            else:
                reason = "Unsure"
                nunsurenok += 1
            if reason=="SURE!" or punsure:
                print "FALSE NEGATIVE: zham=%.2f zspam=%.2f %s %s" % (
                      zham, zspam, msg[0], reason)
                prknown(msg[0])
    print "Sure/ok      ", nsureok
    print "Unsure/ok    ", nunsureok
    print "Unsure/not ok", nunsurenok
    print "Sure/not ok  ", nsurenok
    print "Unsure rate = %.2f%%"% (100.*(nunsureok + nunsurenok) / len(ham))
    print "Sure fn rate = %.2f%%; Unsure fn rate = %.2f%%" % (
          100.*nsurenok / (nsurenok + nsureok),
          100.*nunsurenok / (nunsurenok + nunsureok))

def main():
    import getopt

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'h')
    except getopt.error, msg:
        usage(1, msg)

    nbuckets = 100
    for opt, arg in opts:
        if opt == '-h':
            usage(0)

    fname = 'clim.pik'
    if args:
        fname = args.pop(0)
    if args:
        usage(1, "No more than one positional argument allowed")

    readknownfalse()
    drive(fname)

if __name__ == "__main__":
    main()