[Spambayes-checkins] spambayes classifier.py,1.19,1.20
Tim Peters
tim_one@users.sourceforge.net
Tue, 24 Sep 2002 15:14:04 -0700
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv15524
Modified Files:
classifier.py
Log Message:
central_limit_compute_population_stats2(): Squashed code duplication.
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** classifier.py 24 Sep 2002 03:29:48 -0000 1.19
--- classifier.py 24 Sep 2002 22:14:01 -0000 1.20
***************
*** 483,486 ****
--- 483,488 ----
# XXX More stuff should be reworked to use this as a helper function.
def _getclues(self, wordstream):
+ mindist = options.robinson_minimum_prob_strength
+
# A priority queue to remember the MAX_DISCRIMINATORS best
# probabilities, where "best" means largest distance from 0.5.
***************
*** 500,504 ****
distance = abs(prob - 0.5)
! if distance > smallest_best:
heapreplace(nbest, (distance, prob, word, record))
smallest_best = nbest[0][0]
--- 502,506 ----
distance = abs(prob - 0.5)
! if distance >= mindist and distance > smallest_best:
heapreplace(nbest, (distance, prob, word, record))
smallest_best = nbest[0][0]
***************
*** 764,782 ****
sum += prob
sumsq += prob * prob
n = len(seen)
if is_spam:
self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq
! spamsum = self.spamsum
! self.spammean = ldexp(spamsum, -64) / self.spamn
! spamvar = self.spamsumsq * self.spamn - spamsum**2
! self.spamvar = ldexp(spamvar, -128) / (self.spamn ** 2)
print 'spammean', self.spammean, 'spamvar', self.spamvar
else:
self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq
! hamsum = self.hamsum
! self.hammean = ldexp(hamsum, -64) / self.hamn
! hamvar = self.hamsumsq * self.hamn - hamsum**2
! self.hamvar = ldexp(hamvar, -128) / (self.hamn ** 2)
print 'hammean', self.hammean, 'hamvar', self.hamvar
--- 766,782 ----
sum += prob
sumsq += prob * prob
+
n = len(seen)
+ mean = ldexp(sum, -64) / n
+ var = sumsq * n - sum**2
+ var = ldexp(var, -128) / n**2
if is_spam:
self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq
! self.spammean, self.spamvar = mean, var
print 'spammean', self.spammean, 'spamvar', self.spamvar
else:
self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq
! self.hammean, self.hamvar = mean, var
print 'hammean', self.hammean, 'hamvar', self.hamvar