some kind of LFU dict...

Joh joh12005 at yahoo.fr
Fri Jan 28 10:39:42 EST 2005


Hello,

(first i'm sorry for my bad english...)

for a program, i need to have some kind of dictionary which will
contain a huge amount of keys and values. i have tried 'to do it
simple' using a normal dict but when available memory was exhausted,
my computer started to be really slow as it swap-ed.

so i wondered if i can not use some kind of cache, i googled and found
information on LRU, LFU, and LFU interested me : keep only most
frequently used items inside dict (and memory) and writing others on
disk for a possible future reuse.

here is a silly attempt to do it, it still missed some features but
before to continue i would like to have your opinions. i'm interested
in any propositions or refactorisations :) and btw i may have missed
others already/tuned/better solutions...

best.

---

import fileinput, pickle, random, shutil, sys

class sLFU:
    TMP = """%s.bak"""
    """ a simple kind of Least Frequently Used container"""
    def __init__(self, size, ratio, filename = None):
        """ filename = a temporary file which will could be deleted"""
        self.d = {} #   container
        self.size, self.ratio = size, ratio
        self.freqs, self.fmin = {}, 0
        self.filename = filename
        if self.filename: open(self.filename, 'w')
    def __setitem__(self, obj, v):
        self.d[obj] = v
        self.freqs[obj] = self.freqs.get(obj, self.fmin) + 1
        if len(self.d) > self.size:
            """ lfu = { (size / ratio) least frequently used
objects... }"""
            freqs = [(f, obj) for (obj, f) in self.freqs.items()]
            # maybe should i use a generator () like
            # ((f, obj) for (obj, f) in self.freqs.items())
            freqs.sort()
            lfu = {}
            # and here enumerate(sorted(freqs))
            for i, (f, obj) in enumerate(freqs):
                if i > self.size / self.ratio:
                    break
                lfu[obj] = self.d[obj]
            """ next minimal frequency will be max frequency of the
lfu
            (maybe it would be better to substract this from others in
            self.freqs)"""
            self.fmin = f
            """   and now delete theses objects..."""
            for obj in lfu:
                    del self.freqs[obj]
                    del self.d[obj]                
            if self.filename:
                """ now must save lfu to disk...
                as i do not managed to do otherwise, copy file to a
tmp
                file and read it line by line, updating when
necessary...
                there must be plenty rooms for improvement here :("""
                shutil.copy(self.filename, self.TMP % self.filename)
                fhs = open(self.TMP % self.filename)
                fh = open(self.filename, 'w')
                """ flag to ignore 'value' line"""
                ignoreNext = False
                for i, line in enumerate(fhs):
                    """ first copy old lfu which were already set,
updating
                    them if necessary..."""
                    if ignoreNext:
                        ignoreNext = False
                        continue
                    line = line.rstrip()
                    if i % 2 == 0:
                        obj = self.loads(line)
                        if obj not in lfu and obj in self.d:
                            """ obj available from memory, do not to
                            keep it on disk..."""
                            ignoreNext = True
                            continue
                        elif obj in lfu:
                            """ obj was available from memory, but as
a lfu,
                            was removed and must be updated on disk"""
                            fh.write("%s\n" % line)
                            fh.write("%s\n" % self.dumps(lfu[obj]))
                            del lfu[obj]
                            ignoreNext = True
                            continue
                    """ default behaviour : copy line..."""
                    fh.write("%s\n" % line)
                """ from now lfu should contain only unseen lfu
objects,
                add them to file..."""
                fh = open(self.filename, 'a')
                for obj in lfu:
                    fh.write("%s\n" % self.dumps(obj))
                    fh.write("%s\n" % self.dumps(lfu[obj]))
    def __getitem__(self, obj):
        if obj in self.d:
            """ from memory :)"""
            return self.d[obj]
        if self.filename:
            """ if a filename was provided, search inside file if
            such an obj is present, then return it ; else raise
            IndexErrror."""
            fh = open(self.filename)
            found = False
            for i, line in enumerate(fh):
                line = line.rstrip()
                if found:
                    v = self.loads(line)
                    self.d[obj] = v
                    self.freqs[obj] = self.fmin + 1
                    return v
                if obj == self.loads(line):
                    found = True
        raise KeyError(obj)
    """ maybe class methods would have been better here,
    actually i would have liked static + class...
    totally arbitrary format, haved choosed to use pickle,
    odd line contain 'obj'
    (next) even line contain 'value'
    """
    def dumps(self, s):
        return pickle.dumps(s).replace("\n", "\t")
    staticmethod(dumps)            
    def loads(self, s):
        return pickle.loads(s.replace("\t", "\n"))
    staticmethod(loads)            
        


lfu = sLFU(2, 2, 'slfu.txt')
lfu[1] 
8  
= {'1': 
1fd2
'a'}
lfu[2] = []
lfu[3] = '3'
lfu[2] = [1,]
lfu[2] = [1,2]
lfu[4] = 4
lfu[5] = '5'

print "#d", len(lfu.d)
print lfu.d
print "".join(open(lfu.filename).readlines())



More information about the Python-list mailing list