[Spambayes-checkins] spambayes/spambayes tokenizer.py, 1.29,
1.30 Options.py, 1.102, 1.103
Skip Montanaro
montanaro at projects.sourceforge.net
Thu Jan 29 10:02:14 EST 2004
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv27571/spambayes
Modified Files:
tokenizer.py Options.py
Log Message:
From a suggestion by someone whose name I forgot...
Recognize "abbreviated" URLs of the form www.xyz.com or ftp.xyz.com as
http://www.xyz.com and ftp://ftp.xyz.com, respectively. This gets rid of
some fairly common "skip:w NNN" tokens.
Enabled by the new tokenizer option, x-fancy_url_recognition. I don't see
any particular reason not to make this the default, but guarding it with the
option allows people to more easily test for negative side effects.
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** tokenizer.py 12 Jan 2004 08:38:23 -0000 1.29
--- tokenizer.py 29 Jan 2004 15:02:11 -0000 1.30
***************
*** 986,989 ****
--- 986,1005 ----
# Strip and specially tokenize embedded URLish thingies.
+ url_fancy_re = re.compile(r"""
+ \b # the preceeding character must not be alphanumeric
+ (?:
+ (?:
+ (https? | ftp) # capture the protocol
+ :// # skip the boilerplate
+ )|
+ (?= ftp\.[^\.\s<>"'\x7f-\xff] )| # allow the protocol to be missing, but only if
+ (?= www\.[^\.\s<>"'\x7f-\xff] ) # the rest of the url starts "www.x" or "ftp.x"
+ )
+ # Do a reasonable attempt at detecting the end. It may or may not
+ # be in HTML, may or may not be in quotes, etc. If it's full of %
+ # escapes, cool -- that's a clue too.
+ ([^\s<>"'\x7f-\xff]+) # capture the guts
+ """, re.VERBOSE) # '
+
url_re = re.compile(r"""
(https? | ftp) # capture the protocol
***************
*** 995,998 ****
--- 1011,1015 ----
""", re.VERBOSE) # '
+
urlsep_re = re.compile(r"[;?:@&=+,$.]")
***************
*** 1000,1008 ****
def __init__(self):
# The empty regexp matches anything at once.
! Stripper.__init__(self, url_re.search, re.compile("").search)
def tokenize(self, m):
proto, guts = m.groups()
assert guts
tokens = ["proto:" + proto]
pushclue = tokens.append
--- 1017,1036 ----
def __init__(self):
# The empty regexp matches anything at once.
! if options["Tokenizer", "x-fancy_url_recognition"]:
! search = url_fancy_re.search
! else:
! search = url_re.search
! Stripper.__init__(self, search, re.compile("").search)
def tokenize(self, m):
proto, guts = m.groups()
assert guts
+ if proto is None:
+ if guts.lower().startswith("www"):
+ proto = "http"
+ elif guts.lower().startswith("ftp"):
+ proto = "ftp"
+ else:
+ proto = "unknown"
tokens = ["proto:" + proto]
pushclue = tokens.append
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.102
retrieving revision 1.103
diff -C2 -d -r1.102 -r1.103
*** Options.py 19 Jan 2004 17:58:20 -0000 1.102
--- Options.py 29 Jan 2004 15:02:11 -0000 1.103
***************
*** 151,154 ****
--- 151,159 ----
BOOLEAN, RESTORE),
+ ("x-fancy_url_recognition", "Extract URLs without http:// prefix", False,
+ """(EXPERIMENTAL) Recognize 'www.python.org' or ftp.python.org as URLs
+ instead of just long words.""",
+ BOOLEAN, RESTORE),
+
("replace_nonascii_chars", "Replace non-ascii characters", False,
"""If true, replace high-bit characters (ord(c) >= 128) and control
More information about the Spambayes-checkins
mailing list