[Spambayes-checkins] spambayes/spambayes Options.py, 1.72,
1.73 classifier.py, 1.9, 1.10
Tony Meyer
anadelonbrin at users.sourceforge.net
Sun Sep 14 21:08:13 EDT 2003
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv10633/spambayes
Modified Files:
Options.py classifier.py
Log Message:
Goodbye Gary-combining! (Use the Last-Gary tag to get it back).
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.72
retrieving revision 1.73
diff -C2 -d -r1.72 -r1.73
*** Options.py 9 Sep 2003 07:03:54 -0000 1.72
--- Options.py 15 Sep 2003 01:08:11 -0000 1.73
***************
*** 371,382 ****
REAL, RESTORE),
- ("use_gary_combining", "Use gary-combining", False,
- """The combining scheme currently detailed on the Robinson web page.
- The middle ground here is touchy, varying across corpus, and within
- a corpus across amounts of training data. It almost never gives
- extreme scores (near 0.0 or 1.0), but the tail ends of the ham and
- spam distributions overlap.""",
- BOOLEAN, RESTORE),
-
("use_chi_squared_combining", "Use chi-squared combining", True,
"""For vectors of random, uniformly distributed probabilities,
--- 371,374 ----
Index: classifier.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/classifier.py,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** classifier.py 5 Sep 2003 01:15:28 -0000 1.9
--- classifier.py 15 Sep 2003 01:08:11 -0000 1.10
***************
*** 25,28 ****
--- 25,31 ----
# advance, and it's touchy.
#
+ # The last version of the Gary-combining scheme can be retrieved from our
+ # CVS repository via tag Last-Gary.
+ #
# The chi-combining scheme used by default here gets closer to the theoretical
# basis of Gary's combining scheme, and does give extreme scores, but also
***************
*** 108,181 ****
# spamprob() implementations. One of the following is aliased to
# spamprob, depending on option settings.
!
! def gary_spamprob(self, wordstream, evidence=False):
! """Return best-guess probability that wordstream is spam.
!
! wordstream is an iterable object producing words.
! The return value is a float in [0.0, 1.0].
!
! If optional arg evidence is True, the return value is a pair
! probability, evidence
! where evidence is a list of (word, probability) pairs.
! """
!
! from math import frexp
!
! # This combination method is due to Gary Robinson; see
! # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
!
! # The real P = this P times 2**Pexp. Likewise for Q. We're
! # simulating unbounded dynamic float range by hand. If this pans
! # out, *maybe* we should store logarithms in the database instead
! # and just add them here. But I like keeping raw counts in the
! # database (they're easy to understand, manipulate and combine),
! # and there's no evidence that this simulation is a significant
! # expense.
! P = Q = 1.0
! Pexp = Qexp = 0
! clues = self._getclues(wordstream)
! for prob, word, record in clues:
! P *= 1.0 - prob
! Q *= prob
! if P < 1e-200: # move back into range
! P, e = frexp(P)
! Pexp += e
! if Q < 1e-200: # move back into range
! Q, e = frexp(Q)
! Qexp += e
!
! P, e = frexp(P)
! Pexp += e
! Q, e = frexp(Q)
! Qexp += e
!
! num_clues = len(clues)
! if num_clues:
! #P = 1.0 - P**(1./num_clues)
! #Q = 1.0 - Q**(1./num_clues)
! #
! # (x*2**e)**n = x**n * 2**(e*n)
! n = 1.0 / num_clues
! P = 1.0 - P**n * 2.0**(Pexp * n)
! Q = 1.0 - Q**n * 2.0**(Qexp * n)
!
! # (P-Q)/(P+Q) is in -1 .. 1; scaling into 0 .. 1 gives
! # ((P-Q)/(P+Q)+1)/2 =
! # ((P-Q+P+Q)/(P+Q))/2 =
! # (2*P/(P+Q))/2 =
! # P/(P+Q)
! prob = P/(P+Q)
! else:
! prob = 0.5
!
! if evidence:
! clues = [(w, p) for p, w, r in clues]
! clues.sort(lambda a, b: cmp(a[1], b[1]))
! return prob, clues
! else:
! return prob
!
! if options["Classifier", "use_gary_combining"]:
! spamprob = gary_spamprob
# Across vectors of length n, containing random uniformly-distributed
--- 111,116 ----
# spamprob() implementations. One of the following is aliased to
# spamprob, depending on option settings.
! # Currently only chi-squared is available, but maybe there will be
! # an alternative again someday.
# Across vectors of length n, containing random uniformly-distributed
More information about the Spambayes-checkins
mailing list