[Spambayes-checkins] spambayes Options.py,1.56,1.57
tokenizer.py,1.48,1.49
Tim Peters
tim_one@users.sourceforge.net
Sat Oct 26 17:15:46 2002
Update of /cvsroot/spambayes/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv11212
Modified Files:
Options.py tokenizer.py
Log Message:
Removed option ignore_redundant_html. This made some kind of sense in
the early comp.lang.python (c.l.py) tests, before we stripped HTML tags;
it doesn't make sense anymore.
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Options.py,v
retrieving revision 1.56
retrieving revision 1.57
diff -C2 -d -r1.56 -r1.57
*** Options.py 26 Oct 2002 16:01:13 -0000 1.56
--- Options.py 26 Oct 2002 16:15:38 -0000 1.57
***************
*** 38,53 ****
# sign of HTML is so despised on tech lists; however, the advantage
# of setting it true eventually vanishes even there given enough
! # training data. If you set this true, you should almost certainly set
! # ignore_redundant_html true too.
retain_pure_html_tags: False
- # If true, when a multipart/alternative has both text/plain and text/html
- # sections, the text/html section is ignored. That's likely a dubious
- # idea in general, so false is likely a better idea here. In the c.l.py
- # tests, it helped a lot when retain_pure_html_tags was true (in that case,
- # keeping the HTML tags in the "redundant" HTML was almost certain to score
- # the multipart/alternative as spam, regardless of content).
- ignore_redundant_html: False
-
# If true, the first few characters of application/octet-stream sections
# are used, undecoded. What 'few' means is decided by octet_prefix_size.
--- 38,44 ----
# sign of HTML is so despised on tech lists; however, the advantage
# of setting it true eventually vanishes even there given enough
! # training data.
retain_pure_html_tags: False
# If true, the first few characters of application/octet-stream sections
# are used, undecoded. What 'few' means is decided by octet_prefix_size.
***************
*** 282,286 ****
all_options = {
'Tokenizer': {'retain_pure_html_tags': boolean_cracker,
- 'ignore_redundant_html': boolean_cracker,
'safe_headers': ('get', lambda s: Set(s.split())),
'count_all_header_lines': boolean_cracker,
--- 273,276 ----
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.48
retrieving revision 1.49
diff -C2 -d -r1.48 -r1.49
*** tokenizer.py 25 Oct 2002 16:34:19 -0000 1.48
--- tokenizer.py 26 Oct 2002 16:15:40 -0000 1.49
***************
*** 490,494 ****
# the text/plain and text/html alternatives may have entirely different
# content. options.ignore_redundant_html was introduced to control this,
! # and it defaults to False.
##############################################################################
--- 490,494 ----
# the text/plain and text/html alternatives may have entirely different
# content. options.ignore_redundant_html was introduced to control this,
! # and it defaults to False. Later: ignore_redundant_html was removed.
##############################################################################
***************
*** 514,562 ****
# textparts(msg) returns a set containing all the text components of msg.
! # There's no point decoding binary blobs (like images).
!
! if options.ignore_redundant_html:
! # If a multipart/alternative has both plain text and HTML versions of a
! # msg, ignore the HTML part: HTML decorations have monster-high spam
! # probabilities, and innocent newbies often post using HTML.
! def textparts(msg):
! text = Set()
! redundant_html = Set()
! for part in msg.walk():
! if part.get_type() == 'multipart/alternative':
! # Descend this part of the tree, adding any redundant HTML text
! # part to redundant_html.
! htmlpart = textpart = None
! stack = part.get_payload()[:]
! while stack:
! subpart = stack.pop()
! ctype = subpart.get_type('text/plain')
! if ctype == 'text/plain':
! textpart = subpart
! elif ctype == 'text/html':
! htmlpart = subpart
! elif ctype == 'multipart/related':
! stack.extend(subpart.get_payload())
!
! if textpart is not None:
! text.add(textpart)
! if htmlpart is not None:
! redundant_html.add(htmlpart)
! elif htmlpart is not None:
! text.add(htmlpart)
!
! elif part.get_content_maintype() == 'text':
! text.add(part)
!
! return text - redundant_html
!
! else:
! # Use all text parts. If a text/plain and text/html part happen to
! # have redundant content, so it goes.
! def textparts(msg):
! return Set(filter(lambda part: part.get_content_maintype() == 'text',
! msg.walk()))
def octetparts(msg):
return Set(filter(lambda part:
part.get_type() == 'application/octet-stream',
--- 514,528 ----
# textparts(msg) returns a set containing all the text components of msg.
! # There's no point decoding binary blobs (like images). If a text/plain
! # and text/html part happen to have redundant content, it doesn't matter
! # to results, since training and scoring are done on the set of all
! # words in the msg, without regard to how many times a given word appears.
! def textparts(msg):
! """Return a set of all msg parts with content maintype 'text'."""
! return Set(filter(lambda part: part.get_content_maintype() == 'text',
! msg.walk()))
def octetparts(msg):
+ """Return a set of all msg parts with type 'application/octet-stream'."""
return Set(filter(lambda part:
part.get_type() == 'application/octet-stream',
***************
*** 1056,1064 ****
it's recommended to leave that at its default of false.
- If a multipart/alternative section has both text/plain and text/html
- sections, options.ignore_redundant_html controls whether the HTML
- part is ignored. Except in special cases, it's recommended to
- leave that at its default of false.
-
If options.check_octets is True, the first few undecoded characters
of application/octet-stream parts of the message body become tokens.
--- 1022,1025 ----
***************
*** 1067,1071 ****
if options.check_octets:
# Find, decode application/octet-stream parts of the body,
! # tokenizing the first few characters of each chunk
for part in octetparts(msg):
text = part.get_payload(decode=False)
--- 1028,1032 ----
if options.check_octets:
# Find, decode application/octet-stream parts of the body,
! # tokenizing the first few characters of each chunk.
for part in octetparts(msg):
text = part.get_payload(decode=False)