[Spambayes-checkins] spambayes/spambayes tokenizer.py,1.30,1.31

Skip Montanaro montanaro at users.sourceforge.net
Thu Feb 12 17:07:58 EST 2004


Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21901

Modified Files:
	tokenizer.py 
Log Message:
Collect all potential MTA complaints (parenthesized multi-word remarks in
Received: headers), not just sendmail's "may be forged" (from Tim Peters).


Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** tokenizer.py	29 Jan 2004 15:02:11 -0000	1.30
--- tokenizer.py	12 Feb 2004 22:07:55 -0000	1.31
***************
*** 649,653 ****
  #       by manatee.mojam.com (8.12.1-20030917/8.12.1/Submit) id hBIEQFxF018044
  #       for skip at manatee.mojam.com; Thu, 18 Dec 2003 08:26:15 -0600
! received_host_re = re.compile(r'from ([a-zA-Z0-9._-]+[a-zA-Z])[)\s]')
  # 99% of the time, the receiving host places the sender's ip address in
  # square brackets as it should, but every once in awhile it turns up in
--- 649,653 ----
  #       by manatee.mojam.com (8.12.1-20030917/8.12.1/Submit) id hBIEQFxF018044
  #       for skip at manatee.mojam.com; Thu, 18 Dec 2003 08:26:15 -0600
! received_host_re = re.compile(r'from ([a-z0-9._-]+[a-z])[)\s]')
  # 99% of the time, the receiving host places the sender's ip address in
  # square brackets as it should, but every once in awhile it turns up in
***************
*** 1093,1096 ****
--- 1093,1098 ----
          return tokens
  
+ received_complaints_re = re.compile(r'\([a-z]+(?:\s+[a-z]+)+\)')
+ 
  class SlurpingURLStripper(URLStripper):
      def __init__(self):
***************
*** 1452,1467 ****
          if options["Tokenizer", "mine_received_headers"]:
              for header in msg.get_all("received", ()):
!                 # Sendmail adds a chit to Received: headers if it thinks
!                 # the sender is forging its identity.  That seems to be
!                 # a pretty reliable spam clue.  I'll leave it for others
!                 # to decide if this should be pulled outside the check
!                 # for mine_received_headers.
!                 if header.lower().find('may be forged') != -1:
!                     yield 'received:may be forged'
                  for pat, breakdown in [(received_host_re, breakdown_host),
                                         (received_ip_re, breakdown_ipaddr)]:
                      m = pat.search(header)
                      if m:
!                         for tok in breakdown(m.group(1).lower()):
                              yield 'received:' + tok
  
--- 1454,1470 ----
          if options["Tokenizer", "mine_received_headers"]:
              for header in msg.get_all("received", ()):
!                 # everything here should be case insensitive and not be
!                 # split across continuation lines, so normalize whitespace
!                 # and letter case just once per header
!                 header = ' '.join(header.split()).lower()
! 
!                 for clue in received_complaints_re.findall(header):
!                     yield 'received:' + clue
! 
                  for pat, breakdown in [(received_host_re, breakdown_host),
                                         (received_ip_re, breakdown_ipaddr)]:
                      m = pat.search(header)
                      if m:
!                         for tok in breakdown(m.group(1)):
                              yield 'received:' + tok
  




More information about the Spambayes-checkins mailing list