[Tutor] program that processes tokenized words in xml
Abdirizak abdi
a_abdi406@yahoo.com
Tue May 6 09:31:01 2003
--0-260525435-1052227845=:78244
Content-Type: multipart/alternative; boundary="0-100186042-1052227845=:78244"
--0-100186042-1052227845=:78244
Content-Type: text/plain; charset=us-ascii
Hi everyone, I was working on a program that indexes a file that has tokenized words such as the following: <S ID='S-0'> <W>Similarity-Based</W> <W>Estimation</W> <W>of</W> <W>Word</W> What my program needs to do is to index the words in between <W>...</W>. I have already set up a class that reads the file line by line. Can anyone suggest how I can incorporate a regular expression for eliminating these tags? I have attached the program with this e-mail. Please help and have a look. Thanks in advance
---------------------------------
Do you Yahoo!?
The New Yahoo! Search - Faster. Easier. Bingo.
--0-100186042-1052227845=:78244
Content-Type: text/html; charset=us-ascii
<DIV>
<DIV>Hi everyone,</DIV>
<DIV> </DIV>
<DIV>I was working on a program that indexes a file that has a tokenized</DIV>
<DIV>words such as the following:</DIV>
<DIV><FONT size=2></FONT> </DIV>
<DIV><FONT size=2><S ID='S-0'> <W>Similarity-Based</W> <W>Estimation</W> <W>of</W> <W>Word</W></FONT></DIV>
<DIV> </DIV>
<DIV><FONT size=2> what my program needs to do is to index the words in between <W>...</W> I already set up class that reads the file line by line,</FONT></DIV>
<DIV>Can anyone suggest how I can incorporate a regular expression for </DIV>
<DIV>eliminating these tags? </DIV>
<DIV> </DIV>
<DIV><STRONG>I have attached the program with this e-mail</STRONG> ....Please help and have a look</DIV>
<DIV> </DIV>
<DIV>thanks in advance</DIV>
<DIV> </DIV>
<DIV><FONT size=2> </DIV></FONT></DIV><p><hr SIZE=1>
Do you Yahoo!?<br>
<a href="http://us.rd.yahoo.com/search/mailsig/*http://search.yahoo.com">The New Yahoo! Search</a> - Faster. Easier. Bingo.
--0-100186042-1052227845=:78244--
--0-260525435-1052227845=:78244
Content-Type: text/plain; name="index.py"
Content-Description: index.py
Content-Disposition: inline; filename="index.py"
#!/usr/bin/python
import sys, socket, time, os, errno, re, random, profile,string,math
import win32con,win32file
# backwards-compatibility hack
# Pre-2.2 Pythons have no StopIteration class; fall back to a string
# "exception" so that `raise StopIteration` / `except StopIteration`
# (used by word_iterator/indexer below) still work on old interpreters.
try: StopIteration
except: StopIteration = "StopIteration"
def search_file(filename, searchpath):
    """Look for *filename* in each whitespace-separated directory of
    *searchpath*.

    Returns the absolute path of the first match, or None if the file
    does not exist in any of the directories.
    """
    # Use the str method instead of the deprecated string.split() module
    # function; behavior (split on arbitrary whitespace) is identical.
    for path in searchpath.split():
        candidate = os.path.join(path, filename)
        if os.path.exists(candidate):
            return os.path.abspath(candidate)
    return None
def ContainsAny(filename, word):
    """Return 1 if *word* occurs as a (lower-cased) whitespace-separated
    token in *filename*, else 0.

    Used by indexer.index() when counting token hits per document.

    BUG FIX: the original returned 0 as soon as the FIRST token failed
    to match, so only the first word of the file was ever examined.  It
    also never closed the file handle, and used the Py2-only cmp().
    """
    f = open(filename)
    try:
        text = f.read().lower()
    finally:
        f.close()
    for token in text.split():
        if token == word:
            return 1
    return 0
# Shared state for link_file_into(): a seeded RNG used to jump the serial
# counter on filename collisions, and the monotonically increasing counter
# that generates the link names themselves.
jumper = random.Random()
jumper.seed()
serial = 0
def link_file_into(dir, file):
    """Pick the next serial-number name for *file* under *dir* and return
    the path search_file() resolves for it (or None).

    NOTE(review): search_file() returns a path or None and never raises
    OSError, so the EEXIST-retry branch below looks unreachable --
    presumably a leftover from an os.symlink()-based implementation (see
    the commented-out XXX line).  Confirm intended behavior before reuse.
    """
    while 1:
        global serial
        serial = serial + 1
        filename = str(serial)
        try:
            # XXX assumes symlink() is atomic
            #os.path.walk(file,os.path.join(dir, filename))
            file_name = search_file(file,os.path.join(dir, filename))
            return file_name
        except OSError, err:
            err_errno, msg = err
            if err_errno == errno.EEXIST:
                # name collision: jump the counter by a random power of ten
                serial = serial + 10**jumper.randrange(3)
            else:
                raise
class word_iterator:
    """Yields the whitespace-separated words of a text file, one per
    next() call, raising StopIteration (possibly the string fallback
    defined at module top) at end of file."""
    def __init__(self, filename):
        # NOTE(review): the file handle is never explicitly closed;
        # it relies on garbage collection.
        self.file = open(filename)
        self.readline()
    def readline(self):
        """Read the next line into self.line and split it into the
        self.words list; self.words becomes None at end of file."""
        #aword = re.compile (r'<[^<>]*>|\b[\w-]+\b')#|<W>([^<]+)</W>') #added now
        self.line = self.file.readline() #already there
        print self.line
        #test = aword.findall(self.line)
        #print test
        #self.line = ' '.join(self.line)
        if self.line == '': self.words = None
        else: self.words = filter(None, self.line.split())#mod form \W ->\s+
        #print self.words #testing
        #re.split(r'\W+' changed littletbit
    def next(self):
        """Return the next word, advancing to the next line as needed;
        raise StopIteration once the file is exhausted.

        NOTE(review): pop() takes from the END of self.words, so words
        within a line come back in reverse order -- confirm intended.
        """
        while 1:
            if self.words is None: raise StopIteration
            elif self.words: return self.words.pop()
            else:
                self.readline()
class text_type:
    """Handler for plain-text documents: knows how to produce a word
    stream for a file of this type."""

    def getwords(self, filename):
        """Return an iterator over the words contained in *filename*."""
        iterator = word_iterator(filename)
        return iterator
def makedirs(dirname):
    """Create directory *dirname* (including parents) if it does not
    already exist; a no-op when it is already present.

    Uses EAFP instead of the original exists()-then-create check, which
    had a window where another process could create the directory first
    and make os.makedirs() raise.
    """
    try:
        os.makedirs(dirname)
    except OSError:
        # Already exists (or lost a creation race): that is fine.
        # Re-raise anything else (permissions, a file in the way, ...).
        if not os.path.isdir(dirname):
            raise
class indexer:
def docsdir(self): return os.path.join(self.indexdir, "docs")
def wordsdir(self): return os.path.join(self.indexdir, "words")
def __init__(self, indexdir):
self.indexdir = indexdir
self.count = 0 #extra
self.TestCount = 0 #extra
self.words = {}
texttypeinstance = text_type()
def Filetype(self, origname, linkedname):
""" this routine avoids all unwanted file extension
and returns a texttype instance
"""
for file_extension in ['.zip', '.gz', '.pdf', '.PDF']:
if origname[-len(file_extension):] == file_extension:
return None
return self.texttypeinstance
#this function checks the case of the characters
def check_case(self, word): return word.lower()
def Good_words(self, word):
""" this function eliminates all stopwords with in the stop_list
and returns the required good words
"""
#stoplist words to be excluded the indexing process
stop_list = [
'about', 'all', 'also', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
'because', 'been', 'being', 'but', 'by', 'can', 'cannot', 'did', 'do',
'doing', 'each', 'either', 'else', 'even', 'for', 'from', 'get', 'got',
'had', 'has', 'have', 'he', 'her', 'hers', 'herself', 'him', 'himself',
'his', 'how', 'if', 'in', 'into', 'is', 'it', 'its', 'me', 'my', 'myself',
'no', 'not', 'of', 'on', 'one', 'only', 'onto', 'or', 'our', 'ourselves',
'she', 'since', 'so', 'some', 'take', 'than', 'that', 'the', 'their', 'them',
'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through',
'to', 'too', 'unless', 'until', 'upon', 'us', 'very', 'was', 'we', 'were',
'what', 'when', 'where', 'which', 'while', 'who', 'whoever', 'whom',
'whomever', 'whose', 'why', 'will', 'with', 'without', 'would', 'yes',
'you', 'your', 'yours', 'yourself', 'yourselves','a','such']
return (word not in stop_list)
def indexfile(self, origname, linkedname):
"""Add a file to the full-text index.
indexdir ---> the top-level directory the index lives in
origname ---> the file's original name (used for guessing type)
linkedname ---> the name the file is known as in this system
This routine figures out what kind of file the file is,
extracts words from it, changes the case them, and adds them to the index.
"""
filetype = self.Filetype(origname, linkedname)
if filetype is None: return #report as failure!
filewords = filetype.getwords(os.path.join(self.docsdir(),
linkedname))
words = {}
try:
while 1:
word = self.check_case(filewords.next())
print word[0],
break
if self.Good_words(word):
if not words.has_key(word): words[word] = 0
words[word] = words[word] + 1
#compute the doc weighting
#print"words[word] = %d" %(words[word]) #testing
#print"count_doc = %d" %(self.count)#testing
#print"Test_count = %d" %(self.TestCount)#testing
#compute the doc weighting
#Inverse_doc_freq = math.log(self.count / self.TestCount)
#Doc_weighting = words[word] * Inverse_doc_freq
#print Doc_weighting
except StopIteration:
pass
for word in words.keys():
if not self.words.has_key(word): self.words[word] = []
self.words[word].append((linkedname, words[word]))
#self.words[word].append(Doc_weighting)
def index(self, filename):
""" this function calls the index function,it retrieves
the appropriate file to be indexed and it also counts
the number of docs processed.
"""
self.count = self.count + 1 #works
filename = os.path.abspath(filename)
#----------------------------
file = open(filename) # Ia am here start form tommorow
test = file.read()
x = test.lower()
for i in x.split():
check = ContainsAny(filename, i)
if check == 1:
self.TestCount = self.TestCount + 1
#print self.TestCount
#------------------------------
print "\nTest count = %d" %(self.TestCount)
print filename #extra
print "\ncount_doc = %d" %(self.count)#extra
docsdir = self.docsdir()
makedirs(docsdir)
linkedname = link_file_into(docsdir, filename)
self.indexfile(filename, linkedname)
def commit(self):
wordsdir = self.wordsdir()
makedirs(wordsdir)
for word in self.words.keys():
wordfile = open(os.path.join(wordsdir, word), "a")
for file, data in self.words[word]:
# FIXME: really big concurrent indexing jobs could
# exceed PIPE_MAX bytes and result in interleaved appends
# due to buffering
wordfile.write("\n%s: %s" % (file, data))
#posix.fdatasync(wordfile.fileno()) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!
win32file._get_osfhandle(wordfile.fileno())
wordfile.close()
self.words.clear()
#print " test = %d" %(self.count) # this is the # of DOcuments in the sys.
def main(argv):
if len(argv) < 2:
sys.stderr.write(usage + "\n")
return 1
indexdir = argv[1]
files = argv[2:]
myindexer = indexer(indexdir)
try:
for file in files:
print "indexing....", file,
sys.stdout.flush()
myindexer.index(file)
print "...done."
finally:
print "committing....",
sys.stdout.flush()
myindexer.commit()
print "....done."
return 0
#-------------------------------------------END-----------------------------------
# Script entry point: exit status is whatever main() returns.
if __name__ == "__main__": sys.exit(main(sys.argv))
--0-260525435-1052227845=:78244--