Subject: [Spambayes-checkins] spambayes tokenizer.py,1.69,1.70
From: Tim Peters <tim_one@users.sourceforge.net>
Date: Sun Nov 24 07:41:05 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv16042
Modified Files:
tokenizer.py
Log Message:
Revamped the "look for special things and get rid of them" body
tokenization code, routing most of this work through a common new Stripper
class. Moved the <style and HTML comment stripping into that framework,
so that re stack blowups should never happen again.
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.69
retrieving revision 1.70
diff -C2 -d -r1.69 -r1.70
*** tokenizer.py 19 Nov 2002 02:13:00 -0000 1.69
--- tokenizer.py 24 Nov 2002 07:41:03 -0000 1.70
***************
*** 611,643 ****
msg.walk()))
- url_re = re.compile(r"""
- (https? | ftp) # capture the protocol
- :// # skip the boilerplate
- # Do a reasonable attempt at detecting the end. It may or may not
- # be in HTML, may or may not be in quotes, etc. If it's full of %
- # escapes, cool -- that's a clue too.
- ([^\s<>"'\x7f-\xff]+) # capture the guts
- """, re.VERBOSE) # '
-
- urlsep_re = re.compile(r"[;?:@&=+,$.]")
-
has_highbit_char = re.compile(r"[\x80-\xff]").search
# Cheap-ass gimmick to probabilistically find HTML/XML tags.
html_re = re.compile(r"""
<
(?![\s<>]) # e.g., don't match 'a < b' or '<<<' or 'i<<5' or 'a<>b'
! (?:
! # style sheets can be very long
! style\b # maybe it's <style>, or maybe <style type=...>, etc.
! .{0,2048}?
! </style
! | # so can comments
! !--
! .{0,2048}?
! --
! | # guessing that other tags are usually "short"
! [^>]{0,256} # search for the end '>', but don't run wild
! )
>
""", re.VERBOSE | re.DOTALL)
--- 611,625 ----
msg.walk()))
has_highbit_char = re.compile(r"[\x80-\xff]").search
# Cheap-ass gimmick to probabilistically find HTML/XML tags.
+ # Note that <style and HTML comments are handled by crack_html_style()
+ # and crack_html_comment() instead -- they can be very long, and long
+ # minimal matches have a nasty habit of blowing the C stack.
html_re = re.compile(r"""
<
(?![\s<>]) # e.g., don't match 'a < b' or '<<<' or 'i<<5' or 'a<>b'
! # guessing that other tags are usually "short"
! [^>]{0,256} # search for the end '>', but don't run wild
>
""", re.VERBOSE | re.DOTALL)
***************
*** 882,885 ****
--- 864,919 ----
return log(n)/c
+
+ class Stripper(object):
+ def __init__(self, find_start, find_end):
+ # find_start and find_end have signature
+ # string, int -> match_object
+ # where the search starts at string[int:]. If a match isn't found,
+ # they must return None. The match_object for find_start, if not
+ # None, is passed to self.tokenize, which returns a (possibly empty)
+ # list of tokens to generate. Subclasses may override tokenize().
+ # Text between find_start and find_end is thrown away, except for
+ # whatever tokenize() produces. A match_object must support method
+ # span() -> int, int # the slice bounds of what was matched
+ self.find_start = find_start
+ self.find_end = find_end
+
+ # Efficiency note: This is cheaper than it looks if there aren't any
+ # special sections. Under the covers, string[0:] is optimized to
+ # return string (no new object is built), and likewise ' '.join([string])
+ # is optimized to return string. It would actually slow this code down
+ # to special-case these "do nothing" special cases at the Python level!
+
+ def analyze(self, text):
+ i = 0
+ retained = []
+ pushretained = retained.append
+ tokens = []
+ while True:
+ m = self.find_start(text, i)
+ if not m:
+ pushretained(text[i:])
+ break
+ start, end = m.span()
+ pushretained(text[i : start])
+ tokens.extend(self.tokenize(m))
+ m = self.find_end(text, end)
+ if not m:
+ break
+ dummy, i = m.span()
+ # Replace each skipped portion with a single blank.
+ return ' '.join(retained), tokens
+
+ def tokenize(self, match_object):
+ # Override this if you want to suck info out of the start pattern.
+ return []
+
+ # Strip out uuencoded sections and produce tokens. The return value
+ # is (new_text, sequence_of_tokens), where new_text no longer contains
+ # uuencoded stuff. Note that we're not bothering to decode it! Maybe
+ # we should. One of my persistent false negatives is a spam containing
+ # nothing but a uuencoded money.txt; OTOH, uuencoded seems to be on
+ # its way out (that's an old spam).
+
uuencode_begin_re = re.compile(r"""
^begin \s+
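To make the new framework concrete, here's a hypothetical toy subclass
(invented for illustration, not part of the checkin; it assumes the
Stripper class from the hunk above is in scope) showing how find_start,
find_end and tokenize() fit together:

    import re

    class BracketStripper(Stripper):
        # Strip [[...]] sections, yielding one clue per section found.
        def __init__(self):
            Stripper.__init__(self, re.compile(r"\[\[").search,
                                    re.compile(r"\]\]").search)

        def tokenize(self, m):
            return ["bracket:found"]

    text, tokens = BracketStripper().analyze("keep [[drop this]] this")
    # tokens == ["bracket:found"], and in text the whole [[...]]
    # section has been replaced by a single blank.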
***************
*** 891,949 ****
uuencode_end_re = re.compile(r"^end\s*\n", re.MULTILINE)
! # Strip out uuencoded sections and produce tokens. The return value
! # is (new_text, sequence_of_tokens), where new_text no longer contains
! # uuencoded stuff. Note that we're not bothering to decode it! Maybe
! # we should. One of my persistent false negatives is a spam containing
! # nothing but a uuencoded money.txt; OTOH, uuencoded seems to be on
! # its way out (that's an old spam).
! #
! # Efficiency note: This is cheaper than it looks if there aren't any
! # uuencoded sections. Under the covers, string[0:] is optimized to
! # return string (no new object is built), and likewise ''.join([string])
! # is optimized to return string. It would actually slow this code down
! # to special-case these "do nothing" special cases at the Python level!
! def crack_uuencode(text):
! new_text = []
! tokens = []
! i = 0
! while True:
! # Invariant: Through text[:i], all non-uuencoded text is in
! # new_text, and tokens contains summary clues for all uuencoded
! # portions. text[i:] hasn't been looked at yet.
! m = uuencode_begin_re.search(text, i)
! if not m:
! new_text.append(text[i:])
! break
! start, end = m.span()
! new_text.append(text[i : start])
mode, fname = m.groups()
! tokens.append('uuencode mode:%s' % mode)
! tokens.extend(['uuencode:%s' % x for x in crack_filename(fname)])
! m = uuencode_end_re.search(text, end)
! if not m:
! break
! i = m.end()
! return ''.join(new_text), tokens
! def crack_urls(text):
! new_text = []
! clues = []
! pushclue = clues.append
! i = 0
! while True:
! # Invariant: Through text[:i], all non-URL text is in new_text, and
! # clues contains clues for all URLs. text[i:] hasn't been looked at
! # yet.
! m = url_re.search(text, i)
! if not m:
! new_text.append(text[i:])
! break
proto, guts = m.groups()
! start, end = m.span()
! new_text.append(text[i : start])
! new_text.append(' ')
- pushclue("proto:" + proto)
# Lose the trailing punctuation for casual embedding, like:
# The code is at http://mystuff.org/here? Didn't resolve.
--- 925,964 ----
uuencode_end_re = re.compile(r"^end\s*\n", re.MULTILINE)
! class UUencodeStripper(Stripper):
! def __init__(self):
! Stripper.__init__(self, uuencode_begin_re.search,
! uuencode_end_re.search)
!
! def tokenize(self, m):
mode, fname = m.groups()
! return (['uuencode mode:%s' % mode] +
! ['uuencode:%s' % x for x in crack_filename(fname)])
! crack_uuencode = UUencodeStripper().analyze
!
! # Strip and specially tokenize embedded URLish thingies.
!
! url_re = re.compile(r"""
! (https? | ftp) # capture the protocol
! :// # skip the boilerplate
! # Do a reasonable attempt at detecting the end. It may or may not
! # be in HTML, may or may not be in quotes, etc. If it's full of %
! # escapes, cool -- that's a clue too.
! ([^\s<>"'\x7f-\xff]+) # capture the guts
! """, re.VERBOSE) # '
!
! urlsep_re = re.compile(r"[;?:@&=+,$.]")
!
! class URLStripper(Stripper):
! def __init__(self):
! # The empty regexp matches immediately, so nothing beyond the URL is stripped.
! Stripper.__init__(self, url_re.search, re.compile("").search)
!
! def tokenize(self, m):
proto, guts = m.groups()
! tokens = ["proto:" + proto]
! pushclue = tokens.append
# Lose the trailing punctuation for casual embedding, like:
# The code is at http://mystuff.org/here? Didn't resolve.
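Assuming the rest of tokenize() (elided by the hunk above) splits the
guts on '/' and then on urlsep_re, a sample run would look roughly like
this (URL invented for illustration):

    text, tokens = crack_urls("the code is at http://mystuff.org/here now")
    # tokens is roughly ["proto:http", "url:mystuff", "url:org", "url:here"],
    # since urlsep_re splits on [;?:@&=+,$.]; in text, the URL itself
    # has been replaced by a blank.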
***************
*** 956,963 ****
for chunk in urlsep_re.split(piece):
pushclue("url:" + chunk)
! i = end
- return ''.join(new_text), clues
# Scan HTML for constructs often seen in viruses and worms.
--- 971,999 ----
for chunk in urlsep_re.split(piece):
pushclue("url:" + chunk)
+ return tokens
! crack_urls = URLStripper().analyze
!
! # Nuke HTML <style gimmicks.
! html_style_start_re = re.compile(r"""
! < \s* style\b [^>]* >
! """, re.VERBOSE)
!
! class StyleStripper(Stripper):
! def __init__(self):
! Stripper.__init__(self, html_style_start_re.search,
! re.compile(r"</style>").search)
!
! crack_html_style = StyleStripper().analyze
!
! # Nuke HTML comments.
!
! class CommentStripper(Stripper):
! def __init__(self):
! Stripper.__init__(self, re.compile(r"<!--").search,
! re.compile(r"-->").search)
!
! crack_html_comment = CommentStripper().analyze
# Scan HTML for constructs often seen in viruses and worms.
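Note that these two strippers are pure configuration: no tokenize()
override, so the matched sections simply vanish without generating
clues. A quick sample (HTML invented for illustration):

    text, _ = crack_html_style("a<style type=text/css>p{color:red}</style>b")
    # text == "a b": the whole style section collapses to a blank

    text, _ = crack_html_comment("FR<!-- lots of junk -->EE")
    # text == "FR EE": likewise for comments, with no regexp ever
    # backtracking over the (possibly huge) body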
***************
*** 1232,1251 ****
text = text.lower()
- # Get rid of uuencoded sections.
- text, tokens = crack_uuencode(text)
- for t in tokens:
- yield t
-
if options.replace_nonascii_chars:
# Replace high-bit chars and control chars with '?'.
text = text.translate(non_ascii_translate_tab)
- # Special tagging of embedded URLs.
- text, tokens = crack_urls(text)
- for t in tokens:
- yield t
-
for t in find_html_virus_clues(text):
yield "virus:%s" % t
# Remove HTML/XML tags. Also &nbsp;.
--- 1268,1287 ----
text = text.lower()
if options.replace_nonascii_chars:
# Replace high-bit chars and control chars with '?'.
text = text.translate(non_ascii_translate_tab)
for t in find_html_virus_clues(text):
yield "virus:%s" % t
+
+ # Get rid of uuencoded sections, embedded URLs, <style gimmicks,
+ # and HTML comments.
+ for cracker in (crack_uuencode,
+ crack_urls,
+ crack_html_style,
+ crack_html_comment):
+ text, tokens = cracker(text)
+ for t in tokens:
+ yield t
# Remove HTML/XML tags. Also &nbsp;.
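The net effect in tokenize_body() is that the four crackers now run as a
uniform pipeline, each one shrinking the text and handing back its
clues. Conceptually (sample text invented for illustration):

    sample = 'click http://a.b/c <style>p{}</style> FR<!--x-->EE'
    for cracker in (crack_uuencode, crack_urls,
                    crack_html_style, crack_html_comment):
        sample, tokens = cracker(sample)
        # crack_urls yields proto/url clues here; the other crackers
        # yield nothing, they just blank out their sections
    # what's left is plain text with every special section gone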