[Spambayes-checkins] spambayes Corpus.py,1.2,1.2.2.1
Tim Stone
timstone4@users.sourceforge.net
Fri Nov 22 00:28:21 2002
Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv22886
Modified Files:
Tag: hammie-playground
Corpus.py
Log Message:
Added methods to Message class:
getSubject()
getFrom()
getDate()
getHeaders()
getBody()
getHeadersList()
Index: Corpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Corpus.py,v
retrieving revision 1.2
retrieving revision 1.2.2.1
diff -C2 -d -r1.2 -r1.2.2.1
*** Corpus.py 16 Nov 2002 19:03:15 -0000 1.2
--- Corpus.py 22 Nov 2002 00:28:19 -0000 1.2.2.1
***************
*** 230,234 ****
return msg
!
class ExpiryCorpus:
--- 230,234 ----
return msg
!
class ExpiryCorpus:
***************
*** 272,276 ****
def __init__(self):
'''Constructor()'''
! pass
def load(self):
--- 272,278 ----
def __init__(self):
'''Constructor()'''
!
! self.bodytxt = None
! self.hdrtxt = None
def load(self):
***************
*** 297,301 ****
'''Instance as a printable string'''
! return self.substance
def name(self):
--- 299,303 ----
'''Instance as a printable string'''
! return self.getSubstance()
def name(self):
***************
*** 311,322 ****
def setSubstance(self, sub):
'''set this message substance'''
!
self.substance = sub
!
def getSubstance(self):
'''Return this message substance'''
!
return self.substance
!
def setSpamprob(self, prob):
'''Score of the last spamprob calc, may not be persistent'''
--- 313,329 ----
def setSubstance(self, sub):
'''set this message substance'''
!
self.substance = sub
! bodyRE = re.compile(r"\r?\n(\r?\n)(.*)", re.DOTALL+re.MULTILINE)
! bmatch = bodyRE.search(sub)
! if bmatch:
! self.bodytxt = bmatch.group(2)
! self.hdrtxt = sub[:bmatch.start(2)]
!
def getSubstance(self):
'''Return this message substance'''
!
return self.substance
!
def setSpamprob(self, prob):
'''Score of the last spamprob calc, may not be persistent'''
***************
*** 327,331 ****
'''Returns substance as tokens'''
! return tokenizer.tokenize(self.substance)
def createTimeStamp(self):
--- 334,338 ----
'''Returns substance as tokens'''
! return tokenizer.tokenize(self.getSubstance())
def createTimeStamp(self):
***************
*** 335,338 ****
--- 342,399 ----
raise NotImplementedError
+ def getFrom(self):
+ '''Return a message From header content'''
+
+ if self.hdrtxt:
+ match = re.search(r'^From:(.*)$', self.hdrtxt, re.MULTILINE)
+ return match.group(1)
+ else:
+ return None
+
+ def getSubject(self):
+ '''Return a message Subject header contents'''
+
+ if self.hdrtxt:
+ match = re.search(r'^Subject:(.*)$', self.hdrtxt, re.MULTILINE)
+ return match.group(1)
+ else:
+ return None
+
+ def getDate(self):
+ '''Return a message Date header contents'''
+
+ if self.hdrtxt:
+ match = re.search(r'^Date:(.*)$', self.hdrtxt, re.MULTILINE)
+ return match.group(1)
+ else:
+ return None
+
+ def getHeadersList(self):
+ '''Return a list of message header tuples'''
+
+ hdrregex = re.compile(r'^([A-Za-z0-9-_]*): ?(.*)$', re.MULTILINE)
+ data = re.sub(r'\r?\n\r?\s',' ',self.hdrtxt,re.MULTILINE)
+ match = hdrregex.findall(data)
+
+ return match
+
+ def getHeaders(self):
+ '''Return message headers as text'''
+
+ return self.hdrtxt
+
+ def getBody(self):
+ '''Return the message body'''
+
+ return self.bodytxt
+
+ def stripSBDHeader(self):
+ '''Removes the X-Spambayes-Disposition: header from the message'''
+
+ # This is useful for training, where a spammer may be spoofing
+ # our header, to make sure that our header doesn't become an
+ # overweight clue to hamminess
+
+ raise NotImplementedError
More information about the Spambayes-checkins mailing list