[Python-checkins] python/dist/src/Lib dumbdbm.py,1.22,1.23

tim_one@users.sourceforge.net tim_one@users.sourceforge.net
Sat, 12 Jul 2003 13:11:27 -0700


Update of /cvsroot/python/python/dist/src/Lib
In directory sc8-pr-cvs1:/tmp/cvs-serv28670/Lib

Modified Files:
	dumbdbm.py 
Log Message:
There's a persistent rumor on the spambayes mailing list that dumbdbm
databases are associated with corruption problems, so I studied this code
carefully and ran some brutal stress tests.  I didn't find any bugs,
although it's unclear whether this code *intends* that __setitem__ can
leave the directory file out of synch with the data file (so
if a dumbdbm isn't properly closed, and the value of an existing key
was ever replaced, corruption is almost certain, where "corruption"
means the directory file is out of synch with the data file).

Added many comments and generally modernized the code.  Examples of the
latter:  we have better ways of reading a whole file line-by-line now;
eval() now tolerates a trailing newline; the %r format code can be used
to avoid explicit repr/backtick calls; and the code often broke tuples
into their components when it was clearer and faster to just leave them
as tuples.


Index: dumbdbm.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/dumbdbm.py,v
retrieving revision 1.22
retrieving revision 1.23
diff -C2 -d -r1.22 -r1.23
*** dumbdbm.py	11 Jul 2003 04:09:55 -0000	1.22
--- dumbdbm.py	12 Jul 2003 20:11:25 -0000	1.23
***************
*** 34,42 ****
  class _Database(UserDict.DictMixin):
  
!     def __init__(self, file, mode):
          self._mode = mode
!         self._dirfile = file + _os.extsep + 'dir'
!         self._datfile = file + _os.extsep + 'dat'
!         self._bakfile = file + _os.extsep + 'bak'
          # Mod by Jack: create data file if needed
          try:
--- 34,57 ----
  class _Database(UserDict.DictMixin):
  
!     def __init__(self, filebasename, mode):
          self._mode = mode
! 
!         # The directory file is a text file.  Each line looks like
!         #    "%r, (%d, %d)\n" % (key, pos, siz)
!         # where key is the string key, pos is the offset into the dat
!         # file of the associated value's first byte, and siz is the number
!         # of bytes in the associated value.
!         self._dirfile = filebasename + _os.extsep + 'dir'
! 
!         # The data file is a binary file pointed into by the directory
!         # file, and holds the values associated with keys.  Each value
!         # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
!         # binary 8-bit string value.
!         self._datfile = filebasename + _os.extsep + 'dat'
!         self._bakfile = filebasename + _os.extsep + 'bak'
! 
!         # The index is an in-memory dict, mirroring the directory file.
!         self._index = None  # maps keys to (pos, siz) pairs
! 
          # Mod by Jack: create data file if needed
          try:
***************
*** 47,50 ****
--- 62,66 ----
          self._update()
  
+     # Read directory file into the in-memory index dict.
      def _update(self):
          self._index = {}
***************
*** 54,72 ****
              pass
          else:
!             while 1:
!                 line = f.readline().rstrip()
!                 if not line: break
!                 key, (pos, siz) = eval(line)
!                 self._index[key] = (pos, siz)
              f.close()
  
      def _commit(self):
!         try: _os.unlink(self._bakfile)
!         except _os.error: pass
!         try: _os.rename(self._dirfile, self._bakfile)
!         except _os.error: pass
          f = _open(self._dirfile, 'w', self._mode)
          for key, (pos, siz) in self._index.items():
!             f.write("%s, (%s, %s)\n" % (`key`, `pos`, `siz`))
          f.close()
  
--- 70,95 ----
              pass
          else:
!             for line in f:
!                 key, pos_and_siz_pair = eval(line)
!                 self._index[key] = pos_and_siz_pair
              f.close()
  
+     # Write the index dict to the directory file.  The original directory
+     # file (if any) is renamed with a .bak extension first.  If a .bak
+     # file currently exists, it's deleted.
      def _commit(self):
!         try:
!             _os.unlink(self._bakfile)
!         except _os.error:
!             pass
! 
!         try:
!             _os.rename(self._dirfile, self._bakfile)
!         except _os.error:
!             pass
! 
          f = _open(self._dirfile, 'w', self._mode)
          for key, (pos, siz) in self._index.items():
!             f.write("%r, (%d, %d)\n" % (key, pos, siz))
          f.close()
  
***************
*** 79,97 ****
          return dat
  
      def _addval(self, val):
          f = _open(self._datfile, 'rb+')
          f.seek(0, 2)
          pos = int(f.tell())
- ## Does not work under MW compiler
- ##              pos = ((pos + _BLOCKSIZE - 1) / _BLOCKSIZE) * _BLOCKSIZE
- ##              f.seek(pos)
          npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
          f.write('\0'*(npos-pos))
          pos = npos
- 
          f.write(val)
          f.close()
          return (pos, len(val))
  
      def _setval(self, pos, val):
          f = _open(self._datfile, 'rb+')
--- 102,124 ----
          return dat
  
+     # Append val to the data file, starting at a _BLOCKSIZE-aligned
+     # offset.  The data file is first padded with NUL bytes (if needed)
+     # to get to an aligned offset.  Return pair
+     #     (starting offset of val, len(val))
      def _addval(self, val):
          f = _open(self._datfile, 'rb+')
          f.seek(0, 2)
          pos = int(f.tell())
          npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
          f.write('\0'*(npos-pos))
          pos = npos
          f.write(val)
          f.close()
          return (pos, len(val))
  
+     # Write val to the data file, starting at offset pos.  The caller
+     # is responsible for ensuring that there's enough room starting at
+     # pos to hold val, without overwriting some other value.  Return
+     # pair (pos, len(val)).
      def _setval(self, pos, val):
          f = _open(self._datfile, 'rb+')
***************
*** 101,108 ****
          return (pos, len(val))
  
!     def _addkey(self, key, (pos, siz)):
!         self._index[key] = (pos, siz)
          f = _open(self._dirfile, 'a', self._mode)
!         f.write("%s, (%s, %s)\n" % (`key`, `pos`, `siz`))
          f.close()
  
--- 128,138 ----
          return (pos, len(val))
  
!     # key is a new key whose associated value starts in the data file
!     # at offset pos and with length siz.  Add an index record to
!     # the in-memory index dict, and append one to the directory file.
!     def _addkey(self, key, pos_and_siz_pair):
!         self._index[key] = pos_and_siz_pair
          f = _open(self._dirfile, 'a', self._mode)
!         f.write("%r, %r\n" % (key, pos_and_siz_pair))
          f.close()
  
***************
*** 110,129 ****
          if not type(key) == type('') == type(val):
              raise TypeError, "keys and values must be strings"
!         if not key in self._index:
!             (pos, siz) = self._addval(val)
!             self._addkey(key, (pos, siz))
          else:
              pos, siz = self._index[key]
              oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
              newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
              if newblocks <= oldblocks:
!                 pos, siz = self._setval(pos, val)
!                 self._index[key] = pos, siz
              else:
!                 pos, siz = self._addval(val)
!                 self._index[key] = pos, siz
  
      def __delitem__(self, key):
          del self._index[key]
          self._commit()
  
--- 140,170 ----
          if not type(key) == type('') == type(val):
              raise TypeError, "keys and values must be strings"
!         if key not in self._index:
!             self._addkey(key, self._addval(val))
          else:
+             # See whether the new value is small enough to fit in the
+             # (padded) space currently occupied by the old value.
              pos, siz = self._index[key]
              oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
              newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
              if newblocks <= oldblocks:
!                 self._index[key] = self._setval(pos, val)
              else:
!                 # The new value doesn't fit in the (padded) space used
!                 # by the old value.  The blocks used by the old value are
!                 # forever lost.
!                 self._index[key] = self._addval(val)
! 
!             # Note that _index may be out of synch with the directory
!             # file now:  _setval() and _addval() don't update the directory
!             # file.
  
      def __delitem__(self, key):
+         # The blocks used by the associated value are lost.
          del self._index[key]
+         # XXX It's unclear why we do a _commit() here (the code always
+         # XXX has, so I'm not changing it).  __setitem__ doesn't try to
+         # XXX keep the directory file in synch.  Why should we?  Or
+         # XXX why shouldn't __setitem__?
          self._commit()