[Spambayes-checkins] spambayes/spambayes tokenizer.py,1.18,1.19

Tony Meyer anadelonbrin@users.sourceforge.net
Mon Dec 15 03:16:28 EST 2003


Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv2167/spambayes

Modified Files:
	tokenizer.py 
Log Message:
I'm not sure how, but somehow I ended up dedenting a function in the last
checkin.  Undo that.

Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -C2 -d -r1.18 -r1.19
*** tokenizer.py	15 Dec 2003 07:52:41 -0000	1.18
--- tokenizer.py	15 Dec 2003 08:16:26 -0000	1.19
***************
*** 659,692 ****
  def tokenize_word(word, _len=len, maxword=options["Tokenizer",
                                                    "skip_max_word_size"]):
!    n = _len(word)
!    # Make sure this range matches in tokenize().
!    if 3 <= n <= maxword:
!        yield word
! 
!    elif n >= 3:
!        # A long word.
! 
!        # Don't want to skip embedded email addresses.
!        # An earlier scheme also split up the y in x@y on '.'.  Not splitting
!        # improved the f-n rate; the f-p rate didn't care either way.
!        if n < 40 and '.' in word and word.count('@') == 1:
!            p1, p2 = word.split('@')
!            yield 'email name:' + p1
!            yield 'email addr:' + p2
  
!        else:
!            # There's value in generating a token indicating roughly how
!            # many chars were skipped.  This has real benefit for the f-n
!            # rate, but is neutral for the f-p rate.  I don't know why!
!            # XXX Figure out why, and/or see if some other way of summarizing
!            # XXX this info has greater benefit.
!            if options["Tokenizer", "generate_long_skips"]:
!                yield "skip:%c %d" % (word[0], n // 10 * 10)
!            if has_highbit_char(word):
!                hicount = 0
!                for i in map(ord, word):
!                    if i >= 128:
!                        hicount += 1
!                yield "8bit%%:%d" % round(hicount * 100.0 / len(word))
  
  # Generate tokens for:
--- 659,692 ----
  def tokenize_word(word, _len=len, maxword=options["Tokenizer",
                                                    "skip_max_word_size"]):
!     n = _len(word)
!     # Make sure this range matches in tokenize().
!     if 3 <= n <= maxword:
!         yield word
!         
!     elif n >= 3:
!         # A long word.
!         
!         # Don't want to skip embedded email addresses.
!         # An earlier scheme also split up the y in x@y on '.'.  Not splitting
!         # improved the f-n rate; the f-p rate didn't care either way.
!         if n < 40 and '.' in word and word.count('@') == 1:
!             p1, p2 = word.split('@')
!             yield 'email name:' + p1
!             yield 'email addr:' + p2
  
!         else:
!             # There's value in generating a token indicating roughly how
!             # many chars were skipped.  This has real benefit for the f-n
!             # rate, but is neutral for the f-p rate.  I don't know why!
!             # XXX Figure out why, and/or see if some other way of summarizing
!             # XXX this info has greater benefit.
!             if options["Tokenizer", "generate_long_skips"]:
!                 yield "skip:%c %d" % (word[0], n // 10 * 10)
!             if has_highbit_char(word):
!                 hicount = 0
!                 for i in map(ord, word):
!                     if i >= 128:
!                         hicount += 1
!                 yield "8bit%%:%d" % round(hicount * 100.0 / len(word))
  
  # Generate tokens for:
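For reference, the re-indented function can be exercised on its own.  The sketch below is a standalone approximation: SKIP_MAX_WORD_SIZE, GENERATE_LONG_SKIPS, and the minimal has_highbit_char are assumed stand-ins for the real options lookup and helper in spambayes/tokenizer.py, not the project's actual configuration machinery.

```python
# Standalone sketch of tokenize_word as restored by this checkin.
# SKIP_MAX_WORD_SIZE stands in for options["Tokenizer", "skip_max_word_size"]
# and GENERATE_LONG_SKIPS for options["Tokenizer", "generate_long_skips"];
# the values here are assumptions for illustration only.
SKIP_MAX_WORD_SIZE = 12
GENERATE_LONG_SKIPS = True

def has_highbit_char(s):
    # Stand-in for the tokenizer's helper: any char with ord >= 128?
    return any(ord(c) >= 128 for c in s)

def tokenize_word(word, _len=len, maxword=SKIP_MAX_WORD_SIZE):
    n = _len(word)
    if 3 <= n <= maxword:
        # Ordinary-length word: emit it as-is.
        yield word
    elif n >= 3:
        # A long word.  Don't skip embedded email addresses:
        # split once on '@' and emit name/addr tokens.
        if n < 40 and '.' in word and word.count('@') == 1:
            p1, p2 = word.split('@')
            yield 'email name:' + p1
            yield 'email addr:' + p2
        else:
            # Summarize the skipped word: first char plus length
            # rounded down to a multiple of 10.
            if GENERATE_LONG_SKIPS:
                yield "skip:%c %d" % (word[0], n // 10 * 10)
            if has_highbit_char(word):
                hicount = sum(1 for c in word if ord(c) >= 128)
                yield "8bit%%:%d" % round(hicount * 100.0 / len(word))
```

With these assumed settings, "hello" is yielded unchanged, "name@example.com" produces email name/addr tokens, and a 20-char word with no '@' produces a "skip:" summary token.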