Subject: [Spambayes-checkins] spambayes tokenizer.py,1.69,1.70
From: Tim Peters <tim_one@users.sourceforge.net>
Date: Sun Nov 24 07:41:05 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv16042
Modified Files:
tokenizer.py
Log Message:
Revamped the "look for special things and get rid of them" body
tokenization code, routing most of this work through a common new Stripper
class. Moved the <style and HTML comment stripping into that framework,
so that re stack blowups should never happen again.
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.69
retrieving revision 1.70
diff -C2 -d -r1.69 -r1.70
*** tokenizer.py 19 Nov 2002 02:13:00 -0000 1.69
--- tokenizer.py 24 Nov 2002 07:41:03 -0000 1.70
***************
*** 611,643 ****
msg.walk()))
- url_re = re.compile(r"""
- (https? | ftp) # capture the protocol
- :// # skip the boilerplate
- # Do a reasonable attempt at detecting the end. It may or may not
- # be in HTML, may or may not be in quotes, etc. If it's full of %
- # escapes, cool -- that's a clue too.
- ([^\s<>"'\x7f-\xff]+) # capture the guts
- """, re.VERBOSE) # '
-
- urlsep_re = re.compile(r"[;?:@&=+,$.]")
-
has_highbit_char = re.compile(r"[\x80-\xff]").search
# Cheap-ass gimmick to probabilistically find HTML/XML tags.
html_re = re.compile(r"""
<
(?![\s<>]) # e.g., don't match 'a < b' or '<<<' or 'i<<5' or 'a<>b'
! (?:
! # style sheets can be very long
! style\b # maybe it's <style>, or maybe <style type=...>, etc.
! .{0,2048}?
! </style
! | # so can comments
! !--
! .{0,2048}?
! --
! | # guessing that other tags are usually "short"
! [^>]{0,256} # search for the end '>', but don't run wild
! )
>
""", re.VERBOSE | re.DOTALL)
--- 611,625 ----
msg.walk()))
has_highbit_char = re.compile(r"[\x80-\xff]").search
# Cheap-ass gimmick to probabilistically find HTML/XML tags.
+ # Note that <style and HTML comments are handled by crack_html_style()
+ # and crack_html_comment() instead -- they can be very long, and long
+ # minimal matches have a nasty habit of blowing the C stack.
html_re = re.compile(r"""
<
(?![\s<>]) # e.g., don't match 'a < b' or '<<<' or 'i<<5' or 'a<>b'
! # guessing that other tags are usually "short"
! [^>]{0,256} # search for the end '>', but don't run wild
>
""", re.VERBOSE | re.DOTALL)
***************
*** 882,885 ****
--- 864,919 ----
return log(n)/c
+
+ class Stripper(object):
+ def __init__(self, find_start, find_end):
+ # find_start and find_end have signature
+ # string, int -> match_object
+ # where the search starts at string[int:]. If a match isn't found,
+ # they must return None. The match_object for find_start, if not
+ # None, is passed to self.tokenize, which returns a (possibly empty)
+ # list of tokens to generate. Subclasses may override tokenize().
+ # Text between find_start and find_end is thrown away, except for
+ # whatever tokenize() produces. A match_object must support method
+ # span() -> int, int # the slice bounds of what was matched
+ self.find_start = find_start
+ self.find_end = find_end
+
+ # Efficiency note: This is cheaper than it looks if there aren't any
+ # special sections. Under the covers, string[0:] is optimized to
+ # return string (no new object is built), and likewise ' '.join([string])
+ # is optimized to return string. It would actually slow this code down
+ # to special-case these "do nothing" special cases at the Python level!
+
+ def analyze(self, text):
+ i = 0
+ retained = []
+ pushretained = retained.append
+ tokens = []
+ while True:
+ m = self.find_start(text, i)
+ if not m:
+ pushretained(text[i:])
+ break
+ start, end = m.span()
+ pushretained(text[i : start])
+ tokens.extend(self.tokenize(m))
+ m = self.find_end(text, end)
+ if not m:
+ break
+ dummy, i = m.span()
+ # Replace each skipped portion with a single blank.
+ return ' '.join(retained), tokens
+
+ def tokenize(self, match_object):
+ # Override this if you want to suck info out of the start pattern.
+ return []
+
+ # Strip out uuencoded sections and produce tokens. The return value
+ # is (new_text, sequence_of_tokens), where new_text no longer contains
+ # uuencoded stuff. Note that we're not bothering to decode it! Maybe
+ # we should. One of my persistent false negatives is a spam containing
+ # nothing but a uuencoded money.txt; OTOH, uuencoded seems to be on
+ # its way out (that's an old spam).
+
uuencode_begin_re = re.compile(r"""
^begin \s+
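To make the new framework concrete, here's a hypothetical toy subclass
(invented for illustration, not part of the checkin; it assumes the
Stripper class from the hunk above is in scope) showing how find_start,
find_end and tokenize() fit together:

    import re

    class BracketStripper(Stripper):
        # Strip [[...]] sections, yielding one clue per section found.
        def __init__(self):
            Stripper.__init__(self, re.compile(r"\[\[").search,
                                    re.compile(r"\]\]").search)

        def tokenize(self, m):
            return ["bracket:found"]

    text, tokens = BracketStripper().analyze("keep [[drop this]] this")
    # tokens == ["bracket:found"], and in text the whole [[...]]
    # section has been replaced by a single blank.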
***************
*** 891,949 ****
uuencode_end_re = re.compile(r"^end\s*\n", re.MULTILINE)
! # Strip out uuencoded sections and produce tokens. The return value
! # is (new_text, sequence_of_tokens), where new_text no longer contains
! # uuencoded stuff. Note that we're not bothering to decode it! Maybe
! # we should. One of my persistent false negatives is a spam containing
! # nothing but a uuencoded money.txt; OTOH, uuencoded seems to be on
! # its way out (that's an old spam).
! #
! # Efficiency note: This is cheaper than it looks if there aren't any
! # uuencoded sections. Under the covers, string[0:] is optimized to
! # return string (no new object is built), and likewise ''.join([string])
! # is optimized to return string. It would actually slow this code down
! # to special-case these "do nothing" special cases at the Python level!
! def crack_uuencode(text):
! new_text = []
! tokens = []
! i = 0
! while True:
! # Invariant: Through text[:i], all non-uuencoded text is in
! # new_text, and tokens contains summary clues for all uuencoded
! # portions. text[i:] hasn't been looked at yet.
! m = uuencode_begin_re.search(text, i)
! if not m:
! new_text.append(text[i:])
! break
! start, end = m.span()
! new_text.append(text[i : start])
mode, fname = m.groups()
! tokens.append('uuencode mode:%s' % mode)
! tokens.extend(['uuencode:%s' % x for x in crack_filename(fname)])
! m = uuencode_end_re.search(text, end)
! if not m:
! break
! i = m.end()
! return ''.join(new_text), tokens
! def crack_urls(text):
! new_text = []
! clues = []
! pushclue = clues.append
! i = 0
! while True:
! # Invariant: Through text[:i], all non-URL text is in new_text, and
! # clues contains clues for all URLs. text[i:] hasn't been looked at
! # yet.
! m = url_re.search(text, i)
! if not m:
! new_text.append(text[i:])
! break
proto, guts = m.groups()
! start, end = m.span()
! new_text.append(text[i : start])
! new_text.append(' ')
- pushclue("proto:" + proto)
# Lose the trailing punctuation for casual embedding, like:
# The code is at http://mystuff.org/here? Didn't resolve.
--- 925,964 ----
uuencode_end_re = re.compile(r"^end\s*\n", re.MULTILINE)
! class UUencodeStripper(Stripper):
! def __init__(self):
! Stripper.__init__(self, uuencode_begin_re.search,
! uuencode_end_re.search)
!
! def tokenize(self, m):
mode, fname = m.groups()
! return (['uuencode mode:%s' % mode] +
! ['uuencode:%s' % x for x in crack_filename(fname)])
! crack_uuencode = UUencodeStripper().analyze
!
! # Strip and specially tokenize embedded URLish thingies.
!
! url_re = re.compile(r"""
! (https? | ftp) # capture the protocol
! :// # skip the boilerplate
! # Do a reasonable attempt at detecting the end. It may or may not
! # be in HTML, may or may not be in quotes, etc. If it's full of %
! # escapes, cool -- that's a clue too.
! ([^\s<>"'\x7f-\xff]+) # capture the guts
! """, re.VERBOSE) # '
!
! urlsep_re = re.compile(r"[;?:@&=+,$.]")
!
! class URLStripper(Stripper):
! def __init__(self):
! # The empty regexp matches immediately, so nothing beyond the URL is stripped.
! Stripper.__init__(self, url_re.search, re.compile("").search)
!
! def tokenize(self, m):
proto, guts = m.groups()
! tokens = ["proto:" + proto]
! pushclue = tokens.append
# Lose the trailing punctuation for casual embedding, like:
# The code is at http://mystuff.org/here? Didn't resolve.
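Assuming the rest of tokenize() (elided by the hunk above) splits the
guts on '/' and then on urlsep_re, a sample run would look roughly like
this (URL invented for illustration):

    text, tokens = crack_urls("the code is at http://mystuff.org/here now")
    # tokens is roughly ["proto:http", "url:mystuff", "url:org", "url:here"],
    # since urlsep_re splits on [;?:@&=+,$.]; in text, the URL itself
    # has been replaced by a blank.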
***************
*** 956,963 ****
for chunk in urlsep_re.split(piece):
pushclue("url:" + chunk)
! i = end
- return ''.join(new_text), clues
# Scan HTML for constructs often seen in viruses and worms.
--- 971,999 ----
for chunk in urlsep_re.split(piece):
pushclue("url:" + chunk)
+ return tokens
! crack_urls = URLStripper().analyze
!
! # Nuke HTML <style gimmicks.
! html_style_start_re = re.compile(r"""
! < \s* style\b [^>]* >
! """, re.VERBOSE)
!
! class StyleStripper(Stripper):
! def __init__(self):
! Stripper.__init__(self, html_style_start_re.search,
! re.compile(r"</style>").search)
!
! crack_html_style = StyleStripper().analyze
!
! # Nuke HTML comments.
!
! class CommentStripper(Stripper):
! def __init__(self):
! Stripper.__init__(self, re.compile(r"<!--").search,
! re.compile(r"-->").search)
!
! crack_html_comment = CommentStripper().analyze
# Scan HTML for constructs often seen in viruses and worms.
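Note that these two strippers are pure configuration: no tokenize()
override, so the matched sections simply vanish without generating
clues. A quick sample (HTML invented for illustration):

    text, _ = crack_html_style("a<style type=text/css>p{color:red}</style>b")
    # text == "a b": the whole style section collapses to a blank

    text, _ = crack_html_comment("FR<!-- lots of junk -->EE")
    # text == "FR EE": likewise for comments, with no regexp ever
    # backtracking over the (possibly huge) body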
***************
*** 1232,1251 ****
text = text.lower()
- # Get rid of uuencoded sections.
- text, tokens = crack_uuencode(text)
- for t in tokens:
- yield t
-
if options.replace_nonascii_chars:
# Replace high-bit chars and control chars with '?'.
text = text.translate(non_ascii_translate_tab)
- # Special tagging of embedded URLs.
- text, tokens = crack_urls(text)
- for t in tokens:
- yield t
-
for t in find_html_virus_clues(text):
yield "virus:%s" % t
# Remove HTML/XML tags. Also &nbsp;.
--- 1268,1287 ----
text = text.lower()
if options.replace_nonascii_chars:
# Replace high-bit chars and control chars with '?'.
text = text.translate(non_ascii_translate_tab)
for t in find_html_virus_clues(text):
yield "virus:%s" % t
+
+ # Get rid of uuencoded sections, embedded URLs, <style gimmicks,
+ # and HTML comments.
+ for cracker in (crack_uuencode,
+ crack_urls,
+ crack_html_style,
+ crack_html_comment):
+ text, tokens = cracker(text)
+ for t in tokens:
+ yield t
# Remove HTML/XML tags. Also &nbsp;.
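The net effect in tokenize_body() is that the four crackers now run as a
uniform pipeline, each one shrinking the text and handing back its
clues. Conceptually (sample text invented for illustration):

    sample = 'click http://a.b/c <style>p{}</style> FR<!--x-->EE'
    for cracker in (crack_uuencode, crack_urls,
                    crack_html_style, crack_html_comment):
        sample, tokens = cracker(sample)
        # crack_urls yields proto/url clues here; the other crackers
        # yield nothing, they just blank out their sections
    # what's left is plain text with every special section gone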