[Spambayes-checkins] spambayes Corpus.py,1.2,1.2.2.1

Tim Stone timstone4@users.sourceforge.net
Fri Nov 22 00:28:21 2002


Update of /cvsroot/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv22886

Modified Files:
      Tag: hammie-playground
	Corpus.py 
Log Message:
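Added methods to Message class:

    getSubject()
    getFrom()
    getDate()
    getHeaders()
    getBody()
    getHeadersList()

Also added a stripSBDHeader() stub (currently raises NotImplementedError).
The constructor now initializes bodytxt and hdrtxt to None, and
setSubstance() splits the substance into header text and body text.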

Index: Corpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/Corpus.py,v
retrieving revision 1.2
retrieving revision 1.2.2.1
diff -C2 -d -r1.2 -r1.2.2.1
*** Corpus.py	16 Nov 2002 19:03:15 -0000	1.2
--- Corpus.py	22 Nov 2002 00:28:19 -0000	1.2.2.1
***************
*** 230,234 ****
  
          return msg
!         
  
  class ExpiryCorpus:
--- 230,234 ----
  
          return msg
! 
  
  class ExpiryCorpus:
***************
*** 272,276 ****
      def __init__(self):
          '''Constructor()'''
!         pass
  
      def load(self):
--- 272,278 ----
      def __init__(self):
          '''Constructor()'''
! 
!         self.bodytxt = None
!         self.hdrtxt = None
  
      def load(self):
***************
*** 297,301 ****
          '''Instance as a printable string'''
  
!         return self.substance
  
      def name(self):
--- 299,303 ----
          '''Instance as a printable string'''
  
!         return self.getSubstance()
  
      def name(self):
***************
*** 311,322 ****
      def setSubstance(self, sub):
          '''set this message substance'''
!         
          self.substance = sub
!         
      def getSubstance(self):
          '''Return this message substance'''
!         
          return self.substance
!         
      def setSpamprob(self, prob):
          '''Score of the last spamprob calc, may not be persistent'''
--- 313,329 ----
      def setSubstance(self, sub):
          '''set this message substance'''
! 
          self.substance = sub
!         bodyRE = re.compile(r"\r?\n(\r?\n)(.*)", re.DOTALL+re.MULTILINE)
!         bmatch = bodyRE.search(sub)
!         if bmatch:
!             self.bodytxt = bmatch.group(2)
!             self.hdrtxt = sub[:bmatch.start(2)]
! 
      def getSubstance(self):
          '''Return this message substance'''
! 
          return self.substance
! 
      def setSpamprob(self, prob):
          '''Score of the last spamprob calc, may not be persistent'''
***************
*** 327,331 ****
          '''Returns substance as tokens'''
  
!         return tokenizer.tokenize(self.substance)
  
      def createTimeStamp(self):
--- 334,338 ----
          '''Returns substance as tokens'''
  
!         return tokenizer.tokenize(self.getSubstance())
  
      def createTimeStamp(self):
***************
*** 335,338 ****
--- 342,399 ----
          raise NotImplementedError
  
+     def getFrom(self):
+         '''Return a message From header content'''
+ 
+         if self.hdrtxt:
+             match = re.search(r'^From:(.*)$', self.hdrtxt, re.MULTILINE)
+             return match.group(1)
+         else:
+             return None
+ 
+     def getSubject(self):
+         '''Return a message Subject header contents'''
+ 
+         if self.hdrtxt:
+             match = re.search(r'^Subject:(.*)$', self.hdrtxt, re.MULTILINE)
+             return match.group(1)
+         else:
+             return None
+ 
+     def getDate(self):
+         '''Return a message Date header contents'''
+ 
+         if self.hdrtxt:
+             match = re.search(r'^Date:(.*)$', self.hdrtxt, re.MULTILINE)
+             return match.group(1)
+         else:
+             return None
+ 
+     def getHeadersList(self):
+         '''Return a list of message header tuples'''
+ 
+         hdrregex = re.compile(r'^([A-Za-z0-9-_]*): ?(.*)$', re.MULTILINE)
+         data = re.sub(r'\r?\n\r?\s',' ',self.hdrtxt,re.MULTILINE)
+         match = hdrregex.findall(data)
+ 
+ 	return match
+ 	
+     def getHeaders(self):
+         '''Return message headers as text'''
+         
+         return self.hdrtxt
+ 
+     def getBody(self):
+         '''Return the message body'''
+ 
+         return self.bodytxt
+ 
+     def stripSBDHeader(self):
+         '''Removes the X-Spambayes-Disposition: header from the message'''
+ 
+         # This is useful for training, where a spammer may be spoofing
+         # our header, to make sure that our header doesn't become an
+         # overweight clue to hamminess
+ 
+         raise NotImplementedError
  
  





More information about the Spambayes-checkins mailing list