[Python-checkins] python/nondist/sandbox/spambayes Tester.py,NONE,1.1
tim_one@users.sourceforge.net
tim_one@users.sourceforge.net
Tue, 27 Aug 2002 15:10:07 -0700
Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv3215
Added Files:
Tester.py
Log Message:
A start at a testing class. There isn't a lot here, but it automates
much of the tedium, and as the doctest shows it can already do
useful things, like remembering which inputs were misclassified.
--- NEW FILE: Tester.py ---
class Test:
    """Automate training and scoring of a spam classifier.

    Pass a classifier instance (an instance of GrahamBayes).  The
    classifier is used only through its learn(), update_probabilities()
    and spamprob() methods.

    Typical use:

        Loop:
            Optional:
                Train it, via train().
                reset_test_results()
            Loop:
                invoke predict() with (probably new) examples
            Optional:
                suck out the results, via instance vrbls and
                false_negative_rate(), false_positive_rate(),
                false_negatives(), and false_positives()
    """

    # Default cutoff: an example is guessed to be spam only when its
    # spamprob() is strictly greater than this value.
    DEFAULT_SPAM_CUTOFF = 0.90

    def __init__(self, classifier, spam_cutoff=DEFAULT_SPAM_CUTOFF):
        """Wrap classifier; spam_cutoff is the threshold predict() uses."""
        self.classifier = classifier
        self.spam_cutoff = spam_cutoff
        # The number of ham and spam instances in the training data.
        self.nham = self.nspam = 0
        self.reset_test_results()

    def reset_test_results(self):
        """Forget all prediction tallies; training counts are untouched."""
        # The number of ham and spam instances tested.
        self.nham_tested = self.nspam_tested = 0
        # The number of test instances correctly and incorrectly classified.
        self.nham_right = 0
        self.nham_wrong = 0
        self.nspam_right = 0
        self.nspam_wrong = 0
        # Lists of bad predictions.
        self.ham_wrong_examples = []    # False positives: ham called spam.
        self.spam_wrong_examples = []   # False negatives: spam called ham.

    def train(self, hamstream=None, spamstream=None):
        """Train the classifier on streams of ham and spam.

        Either stream may be None, in which case it is skipped.  The
        classifier's update_probabilities() is called once before
        returning, even if both streams are None.
        """
        learn = self.classifier.learn
        # NOTE(review): the third argument to learn() presumably defers the
        # probability update, since update_probabilities() is called once
        # below -- confirm against the classifier module.
        if hamstream is not None:
            for example in hamstream:
                learn(example, False, False)
                self.nham += 1
        if spamstream is not None:
            for example in spamstream:
                learn(example, True, False)
                self.nspam += 1
        self.classifier.update_probabilities()

    def predict(self, stream, is_spam):
        """Run prediction on each example in stream and tally the outcome.

        You're swearing that stream is entirely composed of spam (is_spam
        True), or of ham (is_spam False).  An example is guessed to be spam
        when the classifier's spamprob() for it exceeds self.spam_cutoff.

        Mispredictions are saved, and can be retrieved later via
        false_positives() (ham mistakenly called spam) and
        false_negatives() (spam mistakenly called ham).  For this reason,
        you may wish to wrap examples in a little class that identifies the
        example in a useful way, and whose __iter__ produces a token stream
        for the classifier.
        """
        guess = self.classifier.spamprob
        cutoff = self.spam_cutoff
        for example in stream:
            is_spam_guessed = guess(example) > cutoff
            correct = is_spam_guessed == is_spam
            if is_spam:
                self.nspam_tested += 1
                if correct:
                    self.nspam_right += 1
                else:
                    self.nspam_wrong += 1
                    self.spam_wrong_examples.append(example)
            else:
                self.nham_tested += 1
                if correct:
                    self.nham_right += 1
                else:
                    self.nham_wrong += 1
                    self.ham_wrong_examples.append(example)

        # Internal sanity check: every tested example was tallied once.
        assert self.nham_right + self.nham_wrong == self.nham_tested
        assert self.nspam_right + self.nspam_wrong == self.nspam_tested

    def false_positive_rate(self):
        """Return the fraction of tested ham that was called spam.

        Returns 0.0 when no ham has been tested since the last reset,
        rather than dividing by zero.
        """
        if not self.nham_tested:
            return 0.0
        return float(self.nham_wrong) / self.nham_tested

    def false_negative_rate(self):
        """Return the fraction of tested spam that was called ham.

        Returns 0.0 when no spam has been tested since the last reset,
        rather than dividing by zero.
        """
        if not self.nspam_tested:
            return 0.0
        return float(self.nspam_wrong) / self.nspam_tested

    def false_positives(self):
        """Return the list of ham examples mistakenly called spam."""
        return self.ham_wrong_examples

    def false_negatives(self):
        """Return the list of spam examples mistakenly called ham."""
        return self.spam_wrong_examples
class _Example:
    """A named bag of words, used as a sample message by the doctest.

    Iterating over an instance yields its words, so it can be handed to
    anything that consumes a token stream, while the name identifies the
    sample when it turns up in a list of mispredictions.
    """

    def __init__(self, name, words):
        self.name, self.words = name, words

    def __iter__(self):
        # Delegate to the underlying word collection.
        return iter(self.words)
# Doctest for the Test class.  It trains on two ham samples and one spam
# sample, then predicts on two ham and four spam samples and checks the
# tallies, the error rates, and which samples were recorded as
# mispredictions.  It needs GrahamBayes from the classifier module.
_easy_test = """
>>> from classifier import GrahamBayes
>>> good1 = _Example('', ['a', 'b', 'c'] * 10)
>>> good2 = _Example('', ['a', 'b'] * 10)
>>> bad1 = _Example('', ['d'] * 10)
>>> t = Test(GrahamBayes())
>>> t.train([good1, good2], [bad1])
>>> t.reset_test_results()
>>> t.predict([_Example('goodham', ['a', 'b']),
... _Example('badham', ['d'])
... ], False)
>>> t.predict([_Example('goodspam', ['d', 'd']),
... _Example('badspam1', ['c']),
... _Example('badspam2', ['a'] * 15 + ['d'] * 1000),
... _Example('badspam3', ['d', 'a', 'b', 'c'])
... ], True)
>>> t.nham_tested
2
>>> t.nham_right, t.nham_wrong
(1, 1)
>>> t.false_positive_rate()
0.5
>>> [e.name for e in t.false_positives()]
['badham']
>>> t.nspam_tested
4
>>> t.nspam_right, t.nspam_wrong
(1, 3)
>>> t.false_negative_rate()
0.75
>>> [e.name for e in t.false_negatives()]
['badspam1', 'badspam2', 'badspam3']
"""
# doctest looks up the module-level __test__ dict to find extra tests that
# are not attached to any docstring.
__test__ = {'easy': _easy_test}
def _test():
    """Run this module's doctests (including those listed in __test__)."""
    import doctest
    import Tester

    doctest.testmod(Tester)
# Running the file as a script runs the doctests.
if __name__ == "__main__":
    _test()