[Python-checkins] python/nondist/sandbox/spambayes timtest.py,NONE,1.1

tim_one@users.sourceforge.net tim_one@users.sourceforge.net
Fri, 30 Aug 2002 23:50:27 -0700


Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv15716

Added Files:
	timtest.py 
Log Message:
This is a driver I've been using for test runs.  It's specific to my
corpus directories, but has useful stuff in it all the same.


--- NEW FILE: timtest.py ---
#! /usr/bin/env python

NSETS = 5

# Parallel lists of the NSETS spam and ham corpus directories, plus the
# paired-up form the driver iterates over.
SPAMDIRS = ["Data/Spam/Set%d" % n for n in range(1, NSETS + 1)]
HAMDIRS = ["Data/Ham/Set%d" % n for n in range(1, NSETS + 1)]
SPAMHAMDIRS = zip(SPAMDIRS, HAMDIRS)

import os
import re
from sets import Set
import email
from email import message_from_string

import Tester
import classifier

def textparts(msg):
    """Return a Set of the message's text parts.

    Every part whose maintype is 'text' is kept, except that inside a
    multipart/alternative container a text/html part is treated as a
    redundant duplicate whenever a text/plain sibling is present.
    """
    keep = Set()
    duplicates = Set()
    for part in msg.walk():
        if part.get_content_type() == 'multipart/alternative':
            plain = None
            html = None
            # Remember the last text/plain and text/html alternatives.
            for sub in part.get_payload():
                subtype = sub.get_content_type()
                if subtype == 'text/plain':
                    plain = sub
                elif subtype == 'text/html':
                    html = sub

            if plain is None:
                # No plain alternative: fall back to the HTML, if any.
                if html is not None:
                    keep.add(html)
            else:
                keep.add(plain)
                if html is not None:
                    duplicates.add(html)

        elif part.get_content_maintype() == 'text':
            keep.add(part)

    return keep - duplicates

# Matches an http:// URL, capturing everything after the scheme up to the
# first whitespace, '>', quote, or high-bit (non-ASCII) byte.
url_re = re.compile(r"http://([^\s>'\"\x7f-\xff]+)", re.IGNORECASE)
# Punctuation treated as token separators within one '/'-piece of a URL.
urlsep_re = re.compile(r"[;?:@&=+,$.]")

def tokenize(string):
    """Generate classifier tokens from a raw RFC-2822 message string.

    Yields, in order:
      * positional URL tokens ("urlN:chunk") for every http:// URL found
        in the body (headers are skipped),
      * 'control: ...' markers for parse/decoding failures,
      * whitespace-separated body words of length 3..12 from each text
        part selected by textparts().
    """
    # Skip the headers: the body starts after the first blank line.
    i = string.find('\n\n')
    nohead = None
    if i >= 0:
        nohead = string[i+2:]
        # Give each '/'-separated URL piece a positional "urlN:" prefix,
        # then split the piece on common URL separator characters.
        for url in url_re.findall(nohead):
            for i, piece in enumerate(url.lower().split('/')):
                prefix = "url%d:" % i
                for chunk in urlsep_re.split(piece):
                    yield prefix + chunk

    try:
        msg = message_from_string(string)
    except email.Errors.MessageParseError:
        yield 'control: MessageParseError'
        # Can't parse as a message: fall back to raw body words.
        if nohead is not None:
            for w in nohead.split():
                if 3 <= len(w) <= 12:
                    yield w
        return

    for part in textparts(msg):
        try:
            text = part.get_payload(decode=1)
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate instead of being recorded as a control token.
        except Exception:
            yield 'control: get_payload crapped out'
        else:
            if text is None:
                yield 'control: payload is None'
            else:
                # Very short and very long "words" carry little signal.
                for w in text.split():
                    if 3 <= len(w) <= 12:
                        yield w

class Msg(object):
    """One message, read in full from the file dir/name.

    Identity (hashing and equality) is based solely on the path, so a
    Msg can live in a Set and be compared across separate loads of the
    same file.  Iterating a Msg yields its classifier tokens.
    """
    def __init__(self, dir, name):
        path = dir + "/" + name
        self.path = path
        f = open(path, 'rb')
        # finally-guard so the file handle isn't leaked if read() raises.
        try:
            self.guts = f.read()
        finally:
            f.close()

    def __iter__(self):
        return tokenize(self.guts)

    def __hash__(self):
        return hash(self.path)

    def __eq__(self, other):
        return self.path == other.path

    def __ne__(self, other):
        # Needed in Python 2: without it, != falls back to the default
        # identity-based comparison even though __eq__ is defined.
        return self.path != other.path

class MsgStream(object):
    """Iterable over one corpus directory, yielding a Msg per file."""

    def __init__(self, directory):
        self.directory = directory

    def produce(self):
        # Lazily wrap each directory entry in a Msg as it is requested.
        for name in os.listdir(self.directory):
            yield Msg(self.directory, name)

    def __iter__(self):
        return self.produce()


def drive():
    """Run the full cross-validation sweep over SPAMHAMDIRS.

    For each (spam, ham) directory pair: train a fresh GrahamBayes
    classifier on that pair, then test it against every *other* pair,
    printing error rates, newly seen misclassified messages (with their
    clue lists), and the classifier's current best discriminators.
    """
    # Mistakes accumulated across all runs, so each misclassified
    # message is dumped in full only the first time it shows up.
    falsepos = Set()
    falseneg = Set()
    for spamdir, hamdir in SPAMHAMDIRS:
        # Fresh classifier and test harness per training pair.
        c = classifier.GrahamBayes()
        t = Tester.Test(c)
        print "Training on", hamdir, "&", spamdir, "...",
        t.train(MsgStream(hamdir), MsgStream(spamdir))
        print t.nham, "hams &", t.nspam, "spams"

        for sd2, hd2 in SPAMHAMDIRS:
            # Never test on the data we trained on.
            if (sd2, hd2) == (spamdir, hamdir):
                continue
            t.reset_test_results()
            print "    testing against", hd2, "&", sd2, "...",
            t.predict(MsgStream(sd2), True)
            t.predict(MsgStream(hd2), False)
            print t.nham_tested, "hams &", t.nspam_tested, "spams"

            print "    false positive:", t.false_positive_rate()
            print "    false negative:", t.false_negative_rate()

            # Dump each never-before-seen false positive in full,
            # together with the clue words that drove its score.
            newfpos = Set(t.false_positives()) - falsepos
            falsepos |= newfpos
            print "    new false positives:", [e.path for e in newfpos]
            for e in newfpos:
                print '*' * 78
                print e.path
                prob, clues = c.spamprob(e, True)
                print "prob =", prob
                for clue in clues:
                    print "prob(%r) = %g" % clue
                print e.guts

            # Same for false negatives, but truncate the message text.
            newfneg = Set(t.false_negatives()) - falseneg
            falseneg |= newfneg
            print "    new false negatives:", [e.path for e in newfneg]
            for e in newfneg:
                print '*' * 78
                print e.path
                prob, clues = c.spamprob(e, True)
                print "prob =", prob
                for clue in clues:
                    print "prob(%r) = %g" % clue
                print e.guts[:1000]

            print

            # Show the 30 words with the highest killcount (the words
            # that most often decided a classification).
            print "    best discriminators:"
            stats = [(r.killcount, w) for w, r in c.wordinfo.iteritems()]
            stats.sort()
            del stats[:-30]
            for count, w in stats:
                r = c.wordinfo[w]
                print "        %r %d %g" % (w, r.killcount, r.spamprob)
            print

drive()