[spambayes-dev] message subject filtering
Seth Goodman
sethg at GoodmanAssociates.com
Wed Sep 1 05:27:52 CEST 2004
> From: John Moriarty
> Sent: Tuesday, August 31, 2004 7:42 PM
<...>
> Wonder if this message with all those spam message header quotes gets
> deleted as spam;)
Here's how it scored on my system.
--
Seth Goodman
***************************************************************
Combined Score: 0% (2.10065e-010)
Internal ham score (*H*): 1
Internal spam score (*S*): 5.55112e-016
# ham trained on: 1260
# spam trained on: 1309
150 Significant Tokens
token spamprob #ham #spam
'filtering' 0.0155709 14 0
'folder.' 0.0155709 14 0
'spambayes,' 0.0180723 12 0
'spams' 0.0302013 7 0
'filter.' 0.0348837 6 0
'restaurant' 0.0348837 6 0
'wrote:' 0.0377975 30 1
'message-----' 0.03904 29 1
'spambayes' 0.0395184 169 7
'kenny' 0.0412844 5 0
'pitt' 0.0412844 5 0
'spam?' 0.0412844 5 0
'exceeds' 0.0505618 4 0
'irrelevant' 0.0505618 4 0
'problematic' 0.0505618 4 0
'question,' 0.0505618 4 0
'vs.' 0.0505618 4 0
'outlook' 0.057784 129 8
'ham' 0.0611551 18 1
'besides' 0.0652174 3 0
'headers.' 0.0652174 3 0
'partially' 0.0652174 3 0
'spammer' 0.0652174 3 0
'0.6' 0.0918367 2 0
'clues' 0.0918367 2 0
"hasn't" 0.0918367 2 0
'observations' 0.0918367 2 0
'probability' 0.0918367 2 0
'score.' 0.0918367 2 0
'spammy' 0.0918367 2 0
'subject:filtering' 0.0918367 2 0
'urls' 0.0918367 2 0
'downloaded' 0.0960671 20 2
'spam' 0.0980099 117 13
'spam.' 0.122818 22 3
'cc:no real name:2**0' 0.126144 28 4
'junk' 0.12979 40 6
'likely' 0.139058 25 4
'identifying' 0.141076 7 1
'sent:' 0.143691 47 8
'messages,' 0.153008 17 3
'0.4' 0.155172 1 0
'add-in,' 0.155172 1 0
'got:' 0.155172 1 0
'invariably' 0.155172 1 0
'joke' 0.155172 1 0
'obscure' 0.155172 1 0
'theses' 0.155172 1 0
'wording' 0.155172 1 0
'cc:' 0.1601 6 1
'training' 0.169601 48 10
'case' 0.173236 33 7
'good.' 0.173672 10 2
'user' 0.179492 58 13
'issues' 0.183649 35 8
'random' 0.185056 5 1
'maybe' 0.18721 30 7
'header:In-Reply-To:1' 0.188084 88 21
'seems' 0.189561 46 11
'august' 0.191786 21 5
'proxy' 0.2007 16 4
'header:Importance:1' 0.213684 245 69
'everyone.' 0.219234 4 1
'weird' 0.219234 4 1
'using' 0.222277 206 61
'update' 0.224625 67 20
'messages' 0.226939 53 16
'appears' 0.232045 26 8
'data.' 0.233298 10 3
'seem' 0.234093 32 10
'pop3' 0.2355 13 4
'properly' 0.2355 13 4
'url' 0.239458 28 9
'subject:spambayes' 0.239792 31 10
'download' 0.244353 60 20
'regards,' 0.253755 108 38
'and/or' 0.255968 34 12
'speed' 0.256066 20 7
'refuse' 0.256606 6 2
'newsletters' 0.261476 14 5
'cc:2**0' 0.264469 70 26
'falls' 0.268912 3 1
'vastly' 0.268912 3 1
'x-mailer:microsoft outlook cws, build 9.0.2416 (9.0.2911.0)' 0.268912 3 1
'sometimes' 0.276398 18 7
'text' 0.280013 35 14
'gets' 0.28081 25 10
'processing' 0.285637 22 9
'does' 0.287482 122 51
'meeting' 0.289726 31 13
'true.' 0.291402 5 2
'learning' 0.291771 12 5
'subject:subject' 0.291771 12 5
'enough' 0.291965 40 17
'option' 0.295123 21 9
'domain' 0.296626 14 6
'probably' 0.297196 39 17
'note' 0.299816 52 23
'however,' 0.300611 63 28
"what's" 0.301911 27 12
'so,' 0.304793 42 19
'capacity' 0.306329 9 4
'saying' 0.307076 22 10
'it.' 0.308495 106 49
'possible' 0.310605 43 20
'particular' 0.313772 15 7
'skip:- 10' 0.319851 33 16
'word' 0.320398 37 18
'subject:] ' 0.324429 347 173
'john' 0.326766 28 14
'usually' 0.32918 12 6
'times' 0.329979 53 27
'trained' 0.330009 10 5
"aren't" 0.331238 8 4
'e-mail' 0.333142 164 85
'when' 0.33456 274 143
'cases,' 0.337125 4 2
'equally' 0.337125 4 2
'medicine' 0.337125 4 2
'itself' 0.338762 19 10
'effect' 0.340334 17 9
'header:Errors-To:1' 0.341055 361 194
'message.' 0.343773 59 32
'100' 0.344597 24 13
'0.5' 0.347748 2 1
'cheer' 0.347748 2 1
'detecting' 0.347748 2 1
'regardless' 0.348294 11 6
'absolutely' 0.655798 10 20
'cost' 0.661048 30 61
'men' 0.678659 9 20
'ratio' 0.683663 3 7
'confidence' 0.683917 7 16
'subject:message' 0.692332 11 26
'presence' 0.711008 3 8
'relief' 0.718228 1 3
'pain' 0.724071 5 14
'prices' 0.736657 15 44
'canadian' 0.75361 3 10
'lowest' 0.785289 8 31
'names,' 0.800158 2 9
'arbiter' 0.844828 0 1
'easily.' 0.844828 0 1
'cash' 0.849598 8 48
'regulate' 0.851022 1 7
'generic' 0.875673 2 16
'seduce' 0.908163 0 2
'meds' 0.931618 1 17
'potency' 0.934783 0 3
'unbelievable' 0.958716 0 5
<...>
All Message Tokens
443 unique tokens
'$397'
"'david"
"'john"
'(+353)'
'(0)87'
'(sb_server),'
'(someone'
'*have*'
'*increases*'
'...'
'...so'
'...spambayes'
'0.4'
'0.5'
'0.6'
'100'
'18-00'
'19:58'
'2004'
'2833'
'530'
'90%'
'about'
'above'
'absolutely'
'account'
'across'
'action'
'active'
'add-in,'
'address'
'again'
'all'
'allow'
'alone,'
'already'
'also'
'analyze'
'and'
'and/or'
'any'
'appears'
'arbiter'
'are'
"aren't"
'assigns'
'attract'
'august'
'aware'
'based'
'bear'
'because'
'become'
'been'
'before'
'before.'
'besides'
'best'
'between'
'blink.'
'body'
'body,'
'brain'
'break'
'but'
'buy'
'calculating'
'can'
'canadian'
'capable'
'capacity'
'card'
'case'
'cases,'
'cash'
'cc:'
'cc:2**0'
'cc:addr:baltimore.com'
'cc:addr:david.kirwan'
'cc:no real name:2**0'
'cheer'
'client'
'clue.'
'clues'
'coda'
'confidence'
'confused'
'consoorder'
'contain'
'contains'
'content-type:text/plain'
'cost'
'countryside'
'create'
'cruickshank'
'data.'
'day'
'days'
'deleted'
'deliberately'
'detecting'
'determined'
'developer'
'directory'
'discarding'
'discards'
'disposal,'
'dodgy'
'does'
'dollar'
'domain'
'download'
'downloaded'
'e-mail'
'easily.'
'effect'
'effect,'
'either'
'email addr:hotmail.com]'
'email addr:python.org'
'email name:[mailto:kennypitt'
'email name:spambayes-dev'
'enough'
'entire'
'equally'
'etc.'
'even'
'ever'
'everyone.'
'exceeds'
'exists'
'extremely'
'falls'
'far'
'fast'
'fence.'
'few'
'filter.'
'filtering'
'first'
'folder.'
'folder:'
'for'
'from'
'from:'
'from:addr:heli'
'from:addr:helimodels.com'
'from:name:john moriarty'
'general.'
'generic'
'get'
'gets'
'gibberish'
'give'
'giveaway.'
'good'
'good.'
'got:'
'ham'
'hammy,'
'hanley'
'happens'
'has'
"hasn't"
'have'
'header'
'header.'
'header:Date:1'
'header:Errors-To:1'
'header:From:1'
'header:Importance:1'
'header:In-Reply-To:1'
'header:MIME-Version:1'
'header:Message-ID:1'
'header:Received:8'
'header:Return-Path:1'
'header:Subject:1'
'header:To:1'
'headers'
'headers.'
'hegemony'
'hello'
'here'
'high-volume'
'hit'
'house'
'how'
'however,'
'human'
'identified'
'identify'
'identifying'
'ignores'
'important'
'indicator'
'information'
'interested'
'into'
'invariably'
'irrelevant'
'issues'
'it.'
'its'
'itself'
'john'
'joke'
'junk'
'just'
'kenny'
'kind'
"kirwan'"
'last'
'learning'
'legitimate'
'less'
'likely'
'line'
'list'
'lists,'
'loads'
'lot'
'lots'
'lowest'
'mail'
'mailing'
'make'
'many'
'maybe'
'medicine'
'meds'
'meeting'
'men'
'mere'
'message'
'message-----'
'message-id:@user'
'message.'
'messages'
'messages,'
'mistake'
'molehill'
'more'
'moriarty'
"moriarty';"
'most'
'nadir'
'name,'
'names,'
'necessarily'
'needs'
'negative,'
'new'
'newsletters'
'next'
'not'
'note'
'number'
'obscure'
'observations'
'of.'
'on,'
'one'
'online'
'only'
'open'
'option'
'other'
'out'
'outlook'
'outnumbers'
'outweigh'
'pain'
'partially'
'particular'
'patterns'
'people'
'phamacy'
'pitt'
'plus'
'pop3'
'positive'
'possible'
'possibly)'
'potency'
'potentially'
'ppl'
'predominate?'
'presence'
'prices'
'probability'
'probably'
'problematic'
'process'
'processed'
'processing'
'properly'
'proto:http'
'proxy'
'punctuations'
'question,'
'quotes'
'random'
'rare'
'ratio'
're:'
'receive'
'receives.'
'receives...'
'receiving'
'reckon'
'reduces'
'refuse'
'regardless'
'regards,'
'regulate'
'relief'
'reply-to:none'
'restaurant'
'saying'
'score.'
'seduce'
'seem'
'seems'
'seen'
'seller,'
'sender:addr:python.org'
'sender:addr:spambayes-dev-bounces'
'sender:no real name:2**0'
'sensually'
'sent:'
'shipping!'
'shows:'
'significant'
'since'
'skip:( 10'
'skip:- 10'
'skip:[ 10'
'skip:_ 40'
'skip:c 10'
'skip:s 10'
'skip:u 10'
'skip:w 10'
'so,'
'some'
'sometimes'
'spam'
'spam.'
'spam;)'
'spam?'
'spambayes'
'spambayes,'
'spammer'
'spammy'
'spams'
'speed'
'spot'
'still'
'stuff'
'stumble'
'subject'
'subject:'
'subject: '
'subject:-'
'subject:['
'subject:] '
'subject:dev'
'subject:filtering'
'subject:message'
'subject:spambayes'
'subject:subject'
'successfully'
'such'
'text'
'than'
'that'
'the'
'them'
'then'
'there'
'these'
'theses'
'they'
'think'
'this'
'those'
'through'
'time'
'times'
'to:'
'to:2**0'
'to:addr:python.org'
'to:addr:spambayes-dev'
'to:no real name:2**0'
'today'
'told'
'tommorow'
'toss'
'trained'
'training'
'true'
'true.'
'unbelievable'
'update'
'url'
'url:listinfo'
'url:mail'
'url:mailman'
'url:org'
'url:python'
'url:spambayes-dev'
'urls'
'urls.'
'used'
'user'
'using'
'usually'
'vastly'
'very'
'vs.'
'want'
'web'
'weird'
'well'
'what'
"what's"
'when'
'whether'
'while'
'who'
'will'
'with'
'wonder'
'word'
'wording'
'words'
'words,'
'work'
'would'
'wrote:'
'x-mailer:microsoft outlook cws, build 9.0.2416 (9.0.2911.0)'
'you'
'you,'
'your'
More information about the spambayes-dev mailing list