[Python-checkins] python/nondist/sandbox/spambayes Tester.py,NONE,1.1

Tue, 27 Aug 2002 15:10:07 -0700

Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv3215

Added Files:
	Tester.py 
Log Message:
A start at a testing class.  There isn't a lot here, but it automates
much of the tedium, and as the doctest shows it can already do
useful things, like remembering which inputs were misclassified.


--- NEW FILE: Tester.py ---
class Test:
    """Drive a classifier through training and test runs, keeping score.

    Pass a classifier instance (an instance of GrahamBayes); the methods
    used here are learn(), update_probabilities() and spamprob().

    Typical use:
        Loop:
            Optional:
                Train it, via train().
            reset_test_results()
            Loop:
                invoke predict() with (probably new) examples
                Optional:
                    pull out the results, via instance variables and
                    false_negative_rate(), false_positive_rate(),
                    false_negatives(), and false_positives()

    Terminology: a false positive is ham mistakenly called spam; a false
    negative is spam mistakenly called ham.
    """

    def __init__(self, classifier, spam_cutoff=0.90):
        """Wrap `classifier`.

        An example is guessed to be spam when the classifier's spamprob()
        for it is strictly greater than `spam_cutoff`.
        """
        self.classifier = classifier
        self.spam_cutoff = spam_cutoff
        # The number of ham and spam instances in the training data.
        # These accumulate across train() calls and are not touched by
        # reset_test_results().
        self.nham = self.nspam = 0
        self.reset_test_results()

    def reset_test_results(self):
        """Forget all prediction results gathered so far."""
        # The number of ham and spam instances tested.
        self.nham_tested = self.nspam_tested = 0

        # The number of test instances correctly and incorrectly classified.
        self.nham_right = 0
        self.nham_wrong = 0
        self.nspam_right = 0
        self.nspam_wrong = 0

        # Lists of bad predictions.
        self.ham_wrong_examples = []    # False positives:  ham called spam.
        self.spam_wrong_examples = []   # False negatives:  spam called ham.

    def train(self, hamstream=None, spamstream=None):
        """Train the classifier on streams of ham and spam.

        Either stream may be omitted.  The classifier's probabilities are
        updated once, after all examples have been learned, before
        returning.
        """
        learn = self.classifier.learn
        if hamstream is not None:
            for example in hamstream:
                # The final False defers the probability update to the
                # single update_probabilities() call below.
                learn(example, False, False)
                self.nham += 1
        if spamstream is not None:
            for example in spamstream:
                learn(example, True, False)
                self.nspam += 1
        self.classifier.update_probabilities()

    def predict(self, stream, is_spam):
        """Run prediction on each sample in stream and tally the results.

        You're swearing that stream is entirely composed of spam (is_spam
        True), or of ham (is_spam False).

        Mispredictions are saved, and can be retrieved later via
        false_positives() (ham mistakenly called spam) and
        false_negatives() (spam mistakenly called ham).  For this reason,
        you may wish to wrap examples in a little class that identifies
        the example in a useful way, and whose __iter__ produces a token
        stream for the classifier.
        """
        guess = self.classifier.spamprob
        cutoff = self.spam_cutoff
        for example in stream:
            is_spam_guessed = guess(example) > cutoff
            correct = is_spam_guessed == is_spam
            if is_spam:
                self.nspam_tested += 1
                if correct:
                    self.nspam_right += 1
                else:
                    self.nspam_wrong += 1
                    self.spam_wrong_examples.append(example)
            else:
                self.nham_tested += 1
                if correct:
                    self.nham_right += 1
                else:
                    self.nham_wrong += 1
                    self.ham_wrong_examples.append(example)

        # Internal bookkeeping sanity checks.
        assert self.nham_right + self.nham_wrong == self.nham_tested
        assert self.nspam_right + self.nspam_wrong == self.nspam_tested

    def false_positive_rate(self):
        """Return the fraction of tested ham that was called spam.

        Returns 0.0 when no ham has been tested, rather than dividing
        by zero.
        """
        if not self.nham_tested:
            return 0.0
        return float(self.nham_wrong) / self.nham_tested

    def false_negative_rate(self):
        """Return the fraction of tested spam that was called ham.

        Returns 0.0 when no spam has been tested, rather than dividing
        by zero.
        """
        if not self.nspam_tested:
            return 0.0
        return float(self.nspam_wrong) / self.nspam_tested

    def false_positives(self):
        """Return the list of ham examples mistakenly called spam."""
        return self.ham_wrong_examples

    def false_negatives(self):
        """Return the list of spam examples mistakenly called ham."""
        return self.spam_wrong_examples


class _Example:
    """A named example for the doctest: iterating it yields its words."""

    def __init__(self, name, words):
        # `name` identifies the example when reporting mispredictions;
        # `words` is the iterable handed to the classifier via __iter__.
        self.name, self.words = name, words

    def __iter__(self):
        word_iterator = iter(self.words)
        return word_iterator

# Doctest text exercising Test end to end with a real GrahamBayes classifier
# (imported from the `classifier` module inside the doctest itself).
# It trains on two ham examples (good1, good2) and one spam example (bad1),
# then runs predict() on a ham stream and a spam stream and checks the
# tallies, the error rates, and the names of the misclassified examples.
# NOTE(review): the expected outputs depend on how GrahamBayes scores these
# inputs -- confirm against classifier.py if they ever change.
_easy_test = """
    >>> from classifier import GrahamBayes

    >>> good1 = _Example('', ['a', 'b', 'c'] * 10)
    >>> good2 = _Example('', ['a', 'b'] * 10)
    >>> bad1 = _Example('', ['d'] * 10)

    >>> t = Test(GrahamBayes())
    >>> t.train([good1, good2], [bad1])
    >>> t.reset_test_results()
    >>> t.predict([_Example('goodham', ['a', 'b']),
    ...            _Example('badham', ['d'])
    ...           ], False)
    >>> t.predict([_Example('goodspam', ['d', 'd']),
    ...            _Example('badspam1', ['c']),
    ...            _Example('badspam2', ['a'] * 15 + ['d'] * 1000),
    ...            _Example('badspam3', ['d', 'a', 'b', 'c'])
    ...           ], True)

    >>> t.nham_tested
    2
    >>> t.nham_right, t.nham_wrong
    (1, 1)
    >>> t.false_positive_rate()
    0.5
    >>> [e.name for e in t.false_positives()]
    ['badham']

    >>> t.nspam_tested
    4
    >>> t.nspam_right, t.nspam_wrong
    (1, 3)
    >>> t.false_negative_rate()
    0.75
    >>> [e.name for e in t.false_negatives()]
    ['badspam1', 'badspam2', 'badspam3']
"""

# doctest searches a module-level __test__ dict for extra tests; listing the
# string here makes doctest.testmod() run it under the name 'easy'.
__test__ = {'easy': _easy_test}

def _test():
    """Run this module's doctests, including those listed in __test__."""
    import doctest
    # Import this module under its real name so doctest examines `Tester`
    # itself rather than whatever module happens to be __main__.
    import Tester
    doctest.testmod(Tester)

# When executed as a script (rather than imported), run the doctests.
if __name__ == '__main__':
    _test()