[Spambayes-checkins] spambayes/spambayes storage.py,1.16,1.17
Skip Montanaro
montanaro at users.sourceforge.net
Wed Aug 6 19:39:14 EDT 2003
Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1:/tmp/cvs-serv31616
Modified Files:
storage.py
Log Message:
**** Danger, Will Robinson! Do not use the PGClassifier class yet! ****
This is an initial stab at the SQLClassifier and PGClassifier classes. It
still needs a lot of work, to wit:
* I've tried to break functionality into the two classes in such a way
that adding other SQLClassifier subclasses should be reasonably easy,
but I don't know much about writing portable SQL. Python's DB API
helps, to be sure, but isn't perfect.
* Scoring messages is dreadfully slow. I don't know if I'm commit()ing
too frequently, creating too many cursors or if I have some other
problem. My past use of SQL has generally been of the "scads of
SELECTs per INSERT" sort of thing, so I've never paid a lot of
attention to commit().
* I've encountered a couple of bad cases. With the word column defined as
bytea (PostgreSQL's binary string type), both of these calls fail if c
is a cursor object:
c.execute("select * from bayes where word=%s", ('report.\\n";',))
c.execute("select * from bayes where word=%s", ('reserved\x00',))
If the word column is defined as the more traditional varchar(128),
the first call succeeds but the second still fails.
Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/storage.py,v
retrieving revision 1.16
retrieving revision 1.17
diff -C2 -d -r1.16 -r1.17
*** storage.py 25 Jul 2003 05:17:22 -0000 1.16
--- storage.py 7 Aug 2003 01:39:12 -0000 1.17
***************
*** 258,261 ****
--- 258,417 ----
self.changed_words[word] = WORD_DELETED
+
+ class SQLClassifier(classifier.Classifier):
+ def __init__(self, db_name):
+ '''Constructor(database name)'''
+
+ classifier.Classifier.__init__(self)
+ self.statekey = "saved state"
+ self.db_name = db_name
+ self.load()
+
+ def load(self):
+ '''Load state from the database'''
+ raise NotImplementedError, "must be implemented in subclass"
+
+ def store(self):
+ '''Save state to the database'''
+ self._set_row(self.statekey, self.nspam, self.nham)
+
+ def cursor(self):
+ '''Return a new db cursor'''
+ raise NotImplementedError, "must be implemented in subclass"
+
+ def fetchall(self, c):
+ '''Return all rows as a dict'''
+ raise NotImplementedError, "must be implemented in subclass"
+
+ def commit(self, c):
+ '''Commit the current transaction - may commit at db or cursor'''
+ raise NotImplementedError, "must be implemented in subclass"
+
+ def create_bayes(self):
+ '''Create a new bayes table'''
+ c = self.cursor()
+ c.execute(self.table_definition)
+ self.commit(c)
+
+ def _get_row(self, word):
+ '''Return row matching word'''
+ try:
+ c = self.cursor()
+ c.execute("select * from bayes"
+ " where word=%s",
+ (word,))
+ except Exception, e:
+ print "error:", (e, word)
+ raise
+ rows = self.fetchall(c)
+
+ if rows:
+ return rows[0]
+ else:
+ return {}
+
+ def _set_row(self, word, nspam, nham):
+ c = self.cursor()
+ if self._has_key(word):
+ c.execute("update bayes"
+ " set nspam=%s,nham=%s"
+ " where word=%s",
+ (nspam, nham, word))
+ else:
+ c.execute("insert into bayes"
+ " (nspam, nham, word)"
+ " values (%s, %s, %s)",
+ (nspam, nham, word))
+ self.commit(c)
+
+ def _delete_row(self, word):
+ c = self.cursor()
+ c.execute("delete from bayes"
+ " where word=%s",
+ (word,))
+ self.commit(c)
+
+ def _has_key(self, key):
+ c = self.cursor()
+ c.execute("select word from bayes"
+ " where word=%s",
+ (key,))
+ return len(self.fetchall(c)) > 0
+
+ def _wordinfoget(self, word):
+ if isinstance(word, unicode):
+ word = word.encode("utf-8")
+
+ row = self._get_row(word)
+ if row:
+ item = self.WordInfoClass()
+ item.__setstate__((row["nspam"], row["nham"]))
+ return item
+ else:
+ return self.WordInfoClass()
+
+ def _wordinfoset(self, word, record):
+ if isinstance(word, unicode):
+ word = word.encode("utf-8")
+ self._set_row(word, record.spamcount, record.hamcount)
+
+ def _wordinfodel(self, word):
+ if isinstance(word, unicode):
+ word = word.encode("utf-8")
+ self._delete_row(word)
+
+
+ class PGClassifier(SQLClassifier):
+ '''Classifier object persisted in a Postgres database'''
+ def __init__(self, db_name):
+ self.table_definition = ("create table bayes ("
+ " word bytea not null default '',"
+ " nspam integer not null default 0,"
+ " nham integer not null default 0,"
+ " primary key(word)"
+ ")")
+ SQLClassifier.__init__(self, db_name)
+
+ def cursor(self):
+ return self.db.cursor()
+
+ def fetchall(self, c):
+ return c.dictfetchall()
+
+ def commit(self, c):
+ self.db.commit()
+
+ def load(self):
+ '''Load state from database'''
+
+ import psycopg
+
+ if options.verbose:
+ print 'Loading state from',self.db_name,'database'
+
+ self.db = psycopg.connect(self.db_name)
+
+ c = self.cursor()
+ try:
+ c.execute("select count(*) from bayes")
+ except psycopg.ProgrammingError:
+ self.db.rollback()
+ self.create_bayes()
+
+ if self._has_key(self.statekey):
+ row = self._get_row(self.statekey)
+ self.nspam = row["nspam"]
+ self.nham = row["nham"]
+ if options.verbose:
+ print '%s is an existing database, with %d spam and %d ham' \
+ % (self.db_name, self.nspam, self.nham)
+ else:
+ # new database
+ if options.verbose:
+ print self.db_name,'is a new database'
+ self.nspam = 0
+ self.nham = 0
+
+
class Trainer:
'''Associates a Classifier object and one or more Corpora, \
More information about the Spambayes-checkins
mailing list