[Spambayes-checkins] spambayes/spambayes tokenizer.py,1.30,1.31
Skip Montanaro
montanaro at users.sourceforge.net
Thu Feb 12 17:07:58 EST 2004
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21901
Modified Files:
tokenizer.py
Log Message:
Collect all potential MTA complaints, not just sendmail's "may be forged"
(from Tim Peters).
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** tokenizer.py 29 Jan 2004 15:02:11 -0000 1.30
--- tokenizer.py 12 Feb 2004 22:07:55 -0000 1.31
***************
*** 649,653 ****
# by manatee.mojam.com (8.12.1-20030917/8.12.1/Submit) id hBIEQFxF018044
# for skip at manatee.mojam.com; Thu, 18 Dec 2003 08:26:15 -0600
! received_host_re = re.compile(r'from ([a-zA-Z0-9._-]+[a-zA-Z])[)\s]')
# 99% of the time, the receiving host places the sender's ip address in
# square brackets as it should, but every once in awhile it turns up in
--- 649,653 ----
# by manatee.mojam.com (8.12.1-20030917/8.12.1/Submit) id hBIEQFxF018044
# for skip at manatee.mojam.com; Thu, 18 Dec 2003 08:26:15 -0600
! received_host_re = re.compile(r'from ([a-z0-9._-]+[a-z])[)\s]')
# 99% of the time, the receiving host places the sender's ip address in
# square brackets as it should, but every once in awhile it turns up in
***************
*** 1093,1096 ****
--- 1093,1098 ----
return tokens
+ received_complaints_re = re.compile(r'\([a-z]+(?:\s+[a-z]+)+\)')
+
class SlurpingURLStripper(URLStripper):
def __init__(self):
***************
*** 1452,1467 ****
if options["Tokenizer", "mine_received_headers"]:
for header in msg.get_all("received", ()):
! # Sendmail adds a chit to Received: headers if it thinks
! # the sender is forging its identity. That seems to be
! # a pretty reliable spam clue. I'll leave it for others
! # to decide if this should be pulled outside the check
! # for mine_received_headers.
! if header.lower().find('may be forged') != -1:
! yield 'received:may be forged'
for pat, breakdown in [(received_host_re, breakdown_host),
(received_ip_re, breakdown_ipaddr)]:
m = pat.search(header)
if m:
! for tok in breakdown(m.group(1).lower()):
yield 'received:' + tok
--- 1454,1470 ----
if options["Tokenizer", "mine_received_headers"]:
for header in msg.get_all("received", ()):
! # everything here should be case insensitive and not be
! # split across continuation lines, so normalize whitespace
! # and letter case just once per header
! header = ' '.join(header.split()).lower()
!
! for clue in received_complaints_re.findall(header):
! yield 'received:' + clue
!
for pat, breakdown in [(received_host_re, breakdown_host),
(received_ip_re, breakdown_ipaddr)]:
m = pat.search(header)
if m:
! for tok in breakdown(m.group(1)):
yield 'received:' + tok
More information about the Spambayes-checkins mailing list