[Spambayes-checkins] spambayes/spambayes storage.py,1.16,1.17

Skip Montanaro montanaro at users.sourceforge.net
Wed Aug 6 19:39:14 EDT 2003


Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv31616

Modified Files:
	storage.py 
Log Message:
**** Danger, Will Robinson!  Do not use the PGClassifier class yet! ****

This is an initial stab at the SQLClassifier and PGClassifier classes.  They
still need a lot of work, to wit:

    * I've tried to break functionality into the two classes in such a way
      that adding other SQLClassifier subclasses should be reasonably easy,
      but I don't know much about writing portable SQL.  Python's DB API
      helps, to be sure, but isn't perfect.

    * Scoring messages is dreadfully slow.  I don't know whether I'm
      commit()ing too frequently, creating too many cursors, or have some
      other problem.  My past use of SQL has generally been of the "scads
      of SELECTs per INSERT" sort, so I've never paid a lot of attention
      to commit().

    * I've encountered a couple of bad cases.  With the word column defined
      as bytea (PostgreSQL's binary string type), both of these calls fail
      (c is a cursor object):

	c.execute("select * from bayes where word=%s", ('report.\\n";',))
	c.execute("select * from bayes where word=%s", ('reserved\x00',))

      If the word column is defined as the more traditional varchar(128),
      the first call succeeds but the second still fails.


Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/storage.py,v
retrieving revision 1.16
retrieving revision 1.17
diff -C2 -d -r1.16 -r1.17
*** storage.py	25 Jul 2003 05:17:22 -0000	1.16
--- storage.py	7 Aug 2003 01:39:12 -0000	1.17
***************
*** 258,261 ****
--- 258,417 ----
          self.changed_words[word] = WORD_DELETED
  
+ 
+ class SQLClassifier(classifier.Classifier):
+     def __init__(self, db_name):
+         '''Constructor(database name)'''
+ 
+         classifier.Classifier.__init__(self)
+         self.statekey = "saved state"
+         self.db_name = db_name
+         self.load()
+ 
+     def load(self):
+         '''Load state from the database'''
+         raise NotImplementedError, "must be implemented in subclass"
+ 
+     def store(self):
+         '''Save state to the database'''
+         self._set_row(self.statekey, self.nspam, self.nham)
+ 
+     def cursor(self):
+         '''Return a new db cursor'''
+         raise NotImplementedError, "must be implemented in subclass"
+ 
+     def fetchall(self, c):
+         '''Return all rows as a dict'''
+         raise NotImplementedError, "must be implemented in subclass"
+ 
+     def commit(self, c):
+         '''Commit the current transaction - may commit at db or cursor'''
+         raise NotImplementedError, "must be implemented in subclass"
+         
+     def create_bayes(self):
+         '''Create a new bayes table'''
+         c = self.cursor()
+         c.execute(self.table_definition)
+         self.commit(c)
+ 
+     def _get_row(self, word):
+         '''Return row matching word'''
+         try:
+             c = self.cursor()
+             c.execute("select * from bayes"
+                       "  where word=%s",
+                       (word,))
+         except Exception, e:
+             print "error:", (e, word)
+             raise
+         rows = self.fetchall(c)
+ 
+         if rows:
+             return rows[0]
+         else:
+             return {}
+ 
+     def _set_row(self, word, nspam, nham):
+         c = self.cursor()
+         if self._has_key(word):
+             c.execute("update bayes"
+                       "  set nspam=%s,nham=%s"
+                       "  where word=%s",
+                       (nspam, nham, word))
+         else:
+             c.execute("insert into bayes"
+                       "  (nspam, nham, word)"
+                       "  values (%s, %s, %s)",
+                       (nspam, nham, word))
+         self.commit(c)
+ 
+     def _delete_row(self, word):
+         c = self.cursor()
+         c.execute("delete from bayes"
+                   "  where word=%s",
+                   (word,))
+         self.commit(c)
+ 
+     def _has_key(self, key):
+         c = self.cursor()
+         c.execute("select word from bayes"
+                   "  where word=%s",
+                   (key,))
+         return len(self.fetchall(c)) > 0
+ 
+     def _wordinfoget(self, word):
+         if isinstance(word, unicode):
+             word = word.encode("utf-8")
+ 
+         row = self._get_row(word)
+         if row:
+             item = self.WordInfoClass()
+             item.__setstate__((row["nspam"], row["nham"]))
+             return item
+         else:
+             return self.WordInfoClass()
+ 
+     def _wordinfoset(self, word, record):
+         if isinstance(word, unicode):
+             word = word.encode("utf-8")
+         self._set_row(word, record.spamcount, record.hamcount)
+ 
+     def _wordinfodel(self, word):
+         if isinstance(word, unicode):
+             word = word.encode("utf-8")
+         self._delete_row(word)
+ 
+ 
+ class PGClassifier(SQLClassifier):
+     '''Classifier object persisted in a Postgres database'''
+     def __init__(self, db_name):
+         self.table_definition = ("create table bayes ("
+                                  "  word bytea not null default '',"
+                                  "  nspam integer not null default 0,"
+                                  "  nham integer not null default 0,"
+                                  "  primary key(word)"
+                                  ")")
+         SQLClassifier.__init__(self, db_name)
+ 
+     def cursor(self):
+         return self.db.cursor()
+ 
+     def fetchall(self, c):
+         return c.dictfetchall()
+ 
+     def commit(self, c):
+         self.db.commit()
+ 
+     def load(self):
+         '''Load state from database'''
+ 
+         import psycopg
+         
+         if options.verbose:
+             print 'Loading state from',self.db_name,'database'
+ 
+         self.db = psycopg.connect(self.db_name)
+ 
+         c = self.cursor()
+         try:
+             c.execute("select count(*) from bayes")
+         except psycopg.ProgrammingError:
+             self.db.rollback()
+             self.create_bayes()
+         
+         if self._has_key(self.statekey):
+             row = self._get_row(self.statekey)
+             self.nspam = row["nspam"]
+             self.nham = row["nham"]
+             if options.verbose:
+                 print '%s is an existing database, with %d spam and %d ham' \
+                       % (self.db_name, self.nspam, self.nham)
+         else:
+             # new database
+             if options.verbose:
+                 print self.db_name,'is a new database'
+             self.nspam = 0
+             self.nham = 0
+ 
+ 
  class Trainer:
      '''Associates a Classifier object and one or more Corpora, \





More information about the Spambayes-checkins mailing list