[Spambayes-checkins] spambayes classifier.py,1.19,1.20

Tim Peters tim_one@users.sourceforge.net
Tue, 24 Sep 2002 15:14:04 -0700


Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv15524

Modified Files:
	classifier.py 
Log Message:
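central_limit_compute_population_stats2():  Squashed code duplication.
_getclues():  Also require a clue's distance from 0.5 to be at least
options.robinson_minimum_prob_strength before it can enter the nbest heap.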


Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/classifier.py,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** classifier.py	24 Sep 2002 03:29:48 -0000	1.19
--- classifier.py	24 Sep 2002 22:14:01 -0000	1.20
***************
*** 483,486 ****
--- 483,488 ----
      # XXX More stuff should be reworked to use this as a helper function.
      def _getclues(self, wordstream):
+         mindist = options.robinson_minimum_prob_strength
+ 
          # A priority queue to remember the MAX_DISCRIMINATORS best
          # probabilities, where "best" means largest distance from 0.5.
***************
*** 500,504 ****
  
              distance = abs(prob - 0.5)
!             if distance > smallest_best:
                  heapreplace(nbest, (distance, prob, word, record))
                  smallest_best = nbest[0][0]
--- 502,506 ----
  
              distance = abs(prob - 0.5)
!             if distance >= mindist and distance > smallest_best:
                  heapreplace(nbest, (distance, prob, word, record))
                  smallest_best = nbest[0][0]
***************
*** 764,782 ****
                  sum += prob
                  sumsq += prob * prob
          n = len(seen)
  
          if is_spam:
              self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq
!             spamsum = self.spamsum
!             self.spammean = ldexp(spamsum, -64) / self.spamn
!             spamvar = self.spamsumsq * self.spamn - spamsum**2
!             self.spamvar = ldexp(spamvar, -128) / (self.spamn ** 2)
              print 'spammean', self.spammean, 'spamvar', self.spamvar
          else:
              self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq
!             hamsum = self.hamsum
!             self.hammean = ldexp(hamsum, -64) / self.hamn
!             hamvar = self.hamsumsq * self.hamn - hamsum**2
!             self.hamvar = ldexp(hamvar, -128) / (self.hamn ** 2)
              print 'hammean', self.hammean, 'hamvar', self.hamvar
  
--- 766,782 ----
                  sum += prob
                  sumsq += prob * prob
+ 
          n = len(seen)
+         mean = ldexp(sum, -64) / n
+         var = sumsq * n - sum**2
+         var = ldexp(var, -128) / n**2
  
          if is_spam:
              self.spamn, self.spamsum, self.spamsumsq = n, sum, sumsq
!             self.spammean, self.spamvar = mean, var
              print 'spammean', self.spammean, 'spamvar', self.spamvar
          else:
              self.hamn, self.hamsum, self.hamsumsq = n, sum, sumsq
!             self.hammean, self.hamvar = mean, var
              print 'hammean', self.hammean, 'hamvar', self.hamvar