[Spambayes-checkins] spambayes Options.py,1.56,1.57 tokenizer.py,1.48,1.49

Sat Oct 26 17:15:46 2002

Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv11212

Modified Files:
	Options.py tokenizer.py 
Log Message:
Removed option ignore_redundant_html.  This made some kind of sense in
the early comp.lang.python (c.l.py) tests, before we stripped HTML tags;
it doesn't make sense anymore.

Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.56
retrieving revision 1.57
diff -C2 -d -r1.56 -r1.57
*** Options.py	26 Oct 2002 16:01:13 -0000	1.56
--- Options.py	26 Oct 2002 16:15:38 -0000	1.57
***************
*** 38,53 ****
  # sign of HTML is so despised on tech lists; however, the advantage
  # of setting it true eventually vanishes even there given enough
! # training data.  If you set this true, you should almost certainly set
! # ignore_redundant_html true too.
  retain_pure_html_tags: False

- # If true, when a multipart/alternative has both text/plain and text/html
- # sections, the text/html section is ignored.  That's likely a dubious
- # idea in general, so false is likely a better idea here.  In the c.l.py
- # tests, it helped a lot when retain_pure_html_tags was true (in that case,
- # keeping the HTML tags in the "redundant" HTML was almost certain to score
- # the multipart/alternative as spam, regardless of content).
- ignore_redundant_html: False
- 
  # If true, the first few characters of application/octet-stream sections
  # are used, undecoded.  What 'few' means is decided by octet_prefix_size.
--- 38,44 ----
  # sign of HTML is so despised on tech lists; however, the advantage
  # of setting it true eventually vanishes even there given enough
! # training data.
  retain_pure_html_tags: False

  # If true, the first few characters of application/octet-stream sections
  # are used, undecoded.  What 'few' means is decided by octet_prefix_size.
***************
*** 282,286 ****
  all_options = {
      'Tokenizer': {'retain_pure_html_tags': boolean_cracker,
-                   'ignore_redundant_html': boolean_cracker,
                    'safe_headers': ('get', lambda s: Set(s.split())),
                    'count_all_header_lines': boolean_cracker,
--- 273,276 ----

Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.48
retrieving revision 1.49
diff -C2 -d -r1.48 -r1.49
*** tokenizer.py	25 Oct 2002 16:34:19 -0000	1.48
--- tokenizer.py	26 Oct 2002 16:15:40 -0000	1.49
***************
*** 490,494 ****
  # the text/plain and text/html alternatives may have entirely different
  # content.  options.ignore_redundant_html was introduced to control this,
! # and it defaults to False.

  ##############################################################################
--- 490,494 ----
  # the text/plain and text/html alternatives may have entirely different
  # content.  options.ignore_redundant_html was introduced to control this,
! # and it defaults to False.  Later:  ignore_redundant_html was removed.

  ##############################################################################
***************
*** 514,562 ****

  # textparts(msg) returns a set containing all the text components of msg.
! # There's no point decoding binary blobs (like images).
! 
! if options.ignore_redundant_html:
!     # If a multipart/alternative has both plain text and HTML versions of a
!     # msg, ignore the HTML part:  HTML decorations have monster-high spam
!     # probabilities, and innocent newbies often post using HTML.
!     def textparts(msg):
!         text = Set()
!         redundant_html = Set()
!         for part in msg.walk():
!             if part.get_type() == 'multipart/alternative':
!                 # Descend this part of the tree, adding any redundant HTML text
!                 # part to redundant_html.
!                 htmlpart = textpart = None
!                 stack = part.get_payload()[:]
!                 while stack:
!                     subpart = stack.pop()
!                     ctype = subpart.get_type('text/plain')
!                     if ctype == 'text/plain':
!                         textpart = subpart
!                     elif ctype == 'text/html':
!                         htmlpart = subpart
!                     elif ctype == 'multipart/related':
!                         stack.extend(subpart.get_payload())
! 
!                 if textpart is not None:
!                     text.add(textpart)
!                     if htmlpart is not None:
!                         redundant_html.add(htmlpart)
!                 elif htmlpart is not None:
!                     text.add(htmlpart)
! 
!             elif part.get_content_maintype() == 'text':
!                 text.add(part)
! 
!         return text - redundant_html
! 
! else:
!     # Use all text parts.  If a text/plain and text/html part happen to
!     # have redundant content, so it goes.
!     def textparts(msg):
!         return Set(filter(lambda part: part.get_content_maintype() == 'text',
!                           msg.walk()))

  def octetparts(msg):
      return Set(filter(lambda part:
                        part.get_type() == 'application/octet-stream',
--- 514,528 ----

  # textparts(msg) returns a set containing all the text components of msg.
! # There's no point decoding binary blobs (like images).  If a text/plain
! # and text/html part happen to have redundant content, it doesn't matter
! # to results, since training and scoring are done on the set of all
! # words in the msg, without regard to how many times a given word appears.
! def textparts(msg):
!     """Return a set of all msg parts with content maintype 'text'."""
!     return Set(filter(lambda part: part.get_content_maintype() == 'text',
!                       msg.walk()))

  def octetparts(msg):
+     """Return a set of all msg parts with type 'application/octet-stream'."""
      return Set(filter(lambda part:
                        part.get_type() == 'application/octet-stream',
***************
*** 1056,1064 ****
          it's recommended to leave that at its default of false.

-         If a multipart/alternative section has both text/plain and text/html
-         sections, options.ignore_redundant_html controls whether the HTML
-         part is ignored.  Except in special cases, it's recommended to
-         leave that at its default of false.
- 
          If options.check_octets is True, the first few undecoded characters
          of application/octet-stream parts of the message body become tokens.
--- 1022,1025 ----
***************
*** 1067,1071 ****
          if options.check_octets:
              # Find, decode application/octet-stream parts of the body,
!             # tokenizing the first few characters of each chunk
              for part in octetparts(msg):
                  text = part.get_payload(decode=False)
--- 1028,1032 ----
          if options.check_octets:
              # Find, decode application/octet-stream parts of the body,
!             # tokenizing the first few characters of each chunk.
              for part in octetparts(msg):
                  text = part.get_payload(decode=False)