[Tutor] help for implementation..........

Abdirizak abdi a_abdi406@yahoo.com
Wed Apr 23 11:03:03 2003


--0-1948235366-1051110175=:27221
Content-Type: text/plain; charset=us-ascii

Hi,I was trying to implement a program that indexes documents and incorporates Document frequency (df): i.e. The number of documents that the term occurs in,the list of documents containing the term, and term frequency (tf): The number of times that the term occurs in each document. I have started something I would appreciate if someonewith information retrievial can give me hint or idea to proceed..... this is what i have developed: def index(file, key):
        """create an index """         stop_list = [
           'about', 'all', 'also', 'an', 'and', 'any', 'are', 'as', 'at', 'be', 
           ...............]        # A regular expression for extracting all non xml words        
        aword =re.compile (r'<[^<>]*>|\b[\w-]+\b')
        adict = {}
        temp = aword.findall(file.lower())        i = -1
        #print temp # prints a listfor test  #eliminate all stoplist words
        for word in temp:
            if word[0] != '<' and word not in stop_list:
                   
                   
                    i = i + 1      
                    #add the word tokens and their positions
                    #to dictionary
                    adict.setdefault(word,[]).append(i)
                    
                    
        # used for counting total words
        temp = 0
        test = []
        for word in adict.items():
                test.append( word )
                temp += 1
        print test        
        print temp       
        #-----------------------------------------------------
        print adict #for testing 
        print "i = %d" %(i) # for testing
        #print "total = %d" %(total) # for testing
        
        #incoorporate the frequency calculation and remember we started -1       
        nwords = math.log(i + 1)
        
        print "nwords = %d" %(nwords) # for testing
        print "---------------"  #for testing
        #----------------------------------------------------------
        
        #create a dictionary that will be used for updating
        # the initial dictionary created 
        tempDict = {}
        
        for word, positions in adict.items():                
            # coumpute the word frequency
            TermFreq = int(100 *len(positions) / nwords)
            #print len(positions), nwords
            #InvDocFreq = int( temp / len(positions))
            #print len(positions), TermFreq , InvDocFreq,temp ,nwords               
            #Inv_Index = int(10 * (TermFreq * InvDocFreq))
            
            # this dictionary puts the keys with multiple variables
            tempDict.setdefault(word,[]).append(key)
            tempDict.setdefault(word,[]).append(TermFreq)
            tempDict.setdefault(word,[]).append(positions)
            
        #print tempDict # for testing purpose
        
        
        #call for function sort_by_freq which sorts the elements
        # in a dictionary by frequency
        sorted_list = sort_by_freq(tempDict)
        print sorted_list       
        #print sorted_list for test not to be used for the program
        for line in sorted_list[0:45]:
                print "word: %10s  freq: %4d" % (line[0],line[1][1])    thanks in advance
         


---------------------------------
Do you Yahoo!?
The New Yahoo! Search - Faster. Easier. Bingo.
--0-1948235366-1051110175=:27221
Content-Type: text/html; charset=us-ascii

<DIV>Hi,</DIV>
<DIV>I was trying to implement a program that indexes documents and incorporates Document frequency (<I>df</I>): i.e. The number of documents that the term occurs in,the list of documents containing the term, and term frequency (<I>tf</I>): The number of times that the term occurs in each document.&nbsp;I have&nbsp;started something I would appreciate if someone</DIV>
<DIV>with information&nbsp;retrievial can give me hint or idea to proceed.....</DIV>
<DIV>&nbsp;</DIV>
<DIV>this is what i have developed:</DIV>
<DIV>&nbsp;</DIV>
<DIV>def index(file, key):<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; """create an index&nbsp;"""</DIV>
<DIV>&nbsp;</DIV>
<DIV>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; stop_list = [<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 'about', 'all', 'also', 'an', 'and', 'any', 'are', 'as', 'at', 'be',&nbsp;<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; ...............]</DIV>
<DIV>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # A regular expression for extracting all non xml words&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; aword =re.compile (r'&lt;[^&lt;&gt;]*&gt;|\b[\w-]+\b')<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; adict = {}<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; temp = aword.findall(file.lower())</DIV>
<DIV>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; i = -1<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #print temp # prints a listfor test&nbsp; #eliminate all stoplist words<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; for word in temp:<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; if word[0] != '&lt;' and word not in stop_list:<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; i = i + 1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #add the word tokens and their positions<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #to dictionary<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; adict.setdefault(word,[]).append(i)<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # used for counting total words<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; temp = 0<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; test = []<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; for word in adict.items():<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; test.append( word )<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; temp += 1<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; print test&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; print temp&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #-----------------------------------------------------<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; print adict #for testing <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; print "i = %d" %(i) # for testing<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #print "total = %d" %(total) # for testing<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #incoorporate the frequency calculation and remember we started -1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; nwords = math.log(i + 1)<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; print "nwords = %d" %(nwords) # for testing<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; print "---------------"&nbsp; #for testing<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #----------------------------------------------------------<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #create a dictionary that will be used for updating<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # the initial dictionary created <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; tempDict = {}<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; for word, positions in adict.items():&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # coumpute the word frequency<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; TermFreq = int(100 *len(positions) / nwords)<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #print len(positions), nwords<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #InvDocFreq = int( temp / len(positions))<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #print len(positions), TermFreq , InvDocFreq,temp ,nwords&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #Inv_Index = int(10 * (TermFreq * InvDocFreq))<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # this dictionary puts the keys with multiple variables<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; tempDict.setdefault(word,[]).append(key)<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; tempDict.setdefault(word,[]).append(TermFreq)<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; tempDict.setdefault(word,[]).append(positions)<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #print tempDict # for testing purpose<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #call for function sort_by_freq which sorts the elements<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; # in a dictionary by frequency<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; sorted_list = sort_by_freq(tempDict)<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; print sorted_list&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; #print sorted_list for test not to be used for the program<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; for line in sorted_list[0:45]:<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; print "word: %10s&nbsp; freq: %4d" % (line[0],line[1][1])</DIV>
<DIV>&nbsp;</DIV>
<DIV>&nbsp;&nbsp; thanks in advance<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<FONT face="Courier New" size=2></FONT></DIV><p><br><hr size=1>Do you Yahoo!?<br>
<a href="http://us.rd.yahoo.com/search/mailsig/*http://search.yahoo.com">The New Yahoo! Search</a> - Faster. Easier. Bingo.
--0-1948235366-1051110175=:27221--