[Tutor] program that processes tokenized words in xml
Abdirizak abdi
a_abdi406@yahoo.com
Tue May 6 09:31:01 2003
--0-260525435-1052227845=:78244
Content-Type: multipart/alternative; boundary="0-100186042-1052227845=:78244"
--0-100186042-1052227845=:78244
Content-Type: text/plain; charset=us-ascii
Hi everyone, I was working on a program that indexes a file that has tokenized words such as the following: <S ID='S-0'> <W>Similarity-Based</W> <W>Estimation</W> <W>of</W> <W>Word</W> What my program needs to do is to index the words in between <W>...</W>. I have already set up a class that reads the file line by line. Can anyone suggest how I can incorporate a regular expression for eliminating these tags? I have attached the program with this e-mail. Please help and have a look. Thanks in advance
---------------------------------
Do you Yahoo!?
The New Yahoo! Search - Faster. Easier. Bingo.
--0-100186042-1052227845=:78244
Content-Type: text/html; charset=us-ascii
<DIV>
<DIV>Hi everyone,</DIV>
<DIV> </DIV>
<DIV>I was working on a program that indexes a file that has a tokenized</DIV>
<DIV>words such as the following:</DIV>
<DIV><FONT size=2></FONT> </DIV>
<DIV><FONT size=2><S ID='S-0'> <W>Similarity-Based</W> <W>Estimation</W> <W>of</W> <W>Word</W></FONT></DIV>
<DIV> </DIV>
<DIV><FONT size=2> what my program needs to do is to index the words in between <W>...</W> I already set up class that reads the file line by line,</FONT></DIV>
<DIV>Can anyone suggest how I can incorporate a regular expression for </DIV>
<DIV>eliminating these tags? </DIV>
<DIV> </DIV>
<DIV><STRONG>I have attached the program with this e-mail</STRONG> ....Please help and have a look</DIV>
<DIV> </DIV>
<DIV>thanks in advance</DIV>
<DIV> </DIV>
<DIV><FONT size=2> </DIV></FONT></DIV><p><hr SIZE=1>
Do you Yahoo!?<br>
<a href="http://us.rd.yahoo.com/search/mailsig/*http://search.yahoo.com">The New Yahoo! Search</a> - Faster. Easier. Bingo.
--0-100186042-1052227845=:78244--
--0-260525435-1052227845=:78244
Content-Type: text/plain; name="index.py"
Content-Description: index.py
Content-Disposition: inline; filename="index.py"
#!/usr/bin/python
import sys, socket, time, os, errno, re, random, profile,string,math
import win32con,win32file
# backwards-compatibility hack
# Pre-2.2 Pythons have no StopIteration class; fall back to a string
# "exception" so that `raise StopIteration` / `except StopIteration`
# (used by word_iterator/indexer below) still work on old interpreters.
try: StopIteration
except: StopIteration = "StopIteration"
def search_file(filename, searchpath):
    """Look for *filename* in each whitespace-separated directory of
    *searchpath*.

    Returns the absolute path of the first match, or None if the file
    does not exist in any of the directories.
    """
    # Use the str method instead of the deprecated string.split() module
    # function; behavior (split on arbitrary whitespace) is identical.
    for path in searchpath.split():
        candidate = os.path.join(path, filename)
        if os.path.exists(candidate):
            return os.path.abspath(candidate)
    return None
def ContainsAny(filename, word):
    """Return 1 if *word* occurs as a (lower-cased) whitespace-separated
    token in *filename*, else 0.

    Used by indexer.index() when counting token hits per document.

    BUG FIX: the original returned 0 as soon as the FIRST token failed
    to match, so only the first word of the file was ever examined.  It
    also never closed the file handle, and used the Py2-only cmp().
    """
    f = open(filename)
    try:
        text = f.read().lower()
    finally:
        f.close()
    for token in text.split():
        if token == word:
            return 1
    return 0
# Shared state for link_file_into(): a seeded RNG used to jump the serial
# counter on filename collisions, and the monotonically increasing counter
# that generates the link names themselves.
jumper = random.Random()
jumper.seed()
serial = 0
def link_file_into(dir, file):
    """Pick the next serial-number name for *file* under *dir* and return
    the path search_file() resolves for it (or None).

    NOTE(review): search_file() returns a path or None and never raises
    OSError, so the EEXIST-retry branch below looks unreachable --
    presumably a leftover from an os.symlink()-based implementation (see
    the commented-out XXX line).  Confirm intended behavior before reuse.
    """
    while 1:
        global serial
        serial = serial + 1
        filename = str(serial)
        try:
            # XXX assumes symlink() is atomic
            #os.path.walk(file,os.path.join(dir, filename))
            file_name = search_file(file,os.path.join(dir, filename))
            return file_name
        except OSError, err:
            err_errno, msg = err
            if err_errno == errno.EEXIST:
                # name collision: jump the counter by a random power of ten
                serial = serial + 10**jumper.randrange(3)
            else:
                raise
class word_iterator:
    """Yields the whitespace-separated words of a text file, one per
    next() call, raising StopIteration (possibly the string fallback
    defined at module top) at end of file."""
    def __init__(self, filename):
        # NOTE(review): the file handle is never explicitly closed;
        # it relies on garbage collection.
        self.file = open(filename)
        self.readline()
    def readline(self):
        """Read the next line into self.line and split it into the
        self.words list; self.words becomes None at end of file."""
        #aword = re.compile (r'<[^<>]*>|\b[\w-]+\b')#|<W>([^<]+)</W>') #added now
        self.line = self.file.readline() #already there
        print self.line
        #test = aword.findall(self.line)
        #print test
        #self.line = ' '.join(self.line)
        if self.line == '': self.words = None
        else: self.words = filter(None, self.line.split())#mod form \W ->\s+
        #print self.words #testing
        #re.split(r'\W+' changed littletbit
    def next(self):
        """Return the next word, advancing to the next line as needed;
        raise StopIteration once the file is exhausted.

        NOTE(review): pop() takes from the END of self.words, so words
        within a line come back in reverse order -- confirm intended.
        """
        while 1:
            if self.words is None: raise StopIteration
            elif self.words: return self.words.pop()
            else:
                self.readline()
class text_type:
    """Handler for plain-text documents: knows how to produce a word
    stream for a file of this type."""

    def getwords(self, filename):
        """Return an iterator over the words contained in *filename*."""
        iterator = word_iterator(filename)
        return iterator
def makedirs(dirname):
    """Create directory *dirname* (including parents) if it does not
    already exist; a no-op when it is already present.

    Uses EAFP instead of the original exists()-then-create check, which
    had a window where another process could create the directory first
    and make os.makedirs() raise.
    """
    try:
        os.makedirs(dirname)
    except OSError:
        # Already exists (or lost a creation race): that is fine.
        # Re-raise anything else (permissions, a file in the way, ...).
        if not os.path.isdir(dirname):
            raise
class indexer:
def docsdir(self): return os.path.join(self.indexdir, "docs")
def wordsdir(self): return os.path.join(self.indexdir, "words")
def __init__(self, indexdir):
self.indexdir = indexdir
self.count = 0 #extra
self.TestCount = 0 #extra
self.words = {}
texttypeinstance = text_type()
def Filetype(self, origname, linkedname):
""" this routine avoids all unwanted file extension
and returns a texttype instance
"""
for file_extension in ['.zip', '.gz', '.pdf', '.PDF']:
if origname[-len(file_extension):] == file_extension:
return None
return self.texttypeinstance
#this function checks the case of the characters
def check_case(self, word): return word.lower()
def Good_words(self, word):
""" this function eliminates all stopwords with in the stop_list
and returns the required good words
"""
#stoplist words to be excluded the indexing process
stop_list = [
'about', 'all', 'also', 'an', 'and', 'any', 'are', 'as', 'at', 'be',
'because', 'been', 'being', 'but', 'by', 'can', 'cannot', 'did', 'do',
'doing', 'each', 'either', 'else', 'even', 'for', 'from', 'get', 'got',
'had', 'has', 'have', 'he', 'her', 'hers', 'herself', 'him', 'himself',
'his', 'how', 'if', 'in', 'into', 'is', 'it', 'its', 'me', 'my', 'myself',
'no', 'not', 'of', 'on', 'one', 'only', 'onto', 'or', 'our', 'ourselves',
'she', 'since', 'so', 'some', 'take', 'than', 'that', 'the', 'their', 'them',
'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through',
'to', 'too', 'unless', 'until', 'upon', 'us', 'very', 'was', 'we', 'were',
'what', 'when', 'where', 'which', 'while', 'who', 'whoever', 'whom',
'whomever', 'whose', 'why', 'will', 'with', 'without', 'would', 'yes',
'you', 'your', 'yours', 'yourself', 'yourselves','a','such']
return (word not in stop_list)
def indexfile(self, origname, linkedname):
"""Add a file to the full-text index.
indexdir ---> the top-level directory the index lives in
origname ---> the file's original name (used for guessing type)
linkedname ---> the name the file is known as in this system
This routine figures out what kind of file the file is,
extracts words from it, changes the case them, and adds them to the index.
"""
filetype = self.Filetype(origname, linkedname)
if filetype is None: return #report as failure!
filewords = filetype.getwords(os.path.join(self.docsdir(),
linkedname))
words = {}
try:
while 1:
word = self.check_case(filewords.next())
print word[0],
break
if self.Good_words(word):
if not words.has_key(word): words[word] = 0
words[word] = words[word] + 1
#compute the doc weighting
#print"words[word] = %d" %(words[word]) #testing
#print"count_doc = %d" %(self.count)#testing
#print"Test_count = %d" %(self.TestCount)#testing
#compute the doc weighting
#Inverse_doc_freq = math.log(self.count / self.TestCount)
#Doc_weighting = words[word] * Inverse_doc_freq
#print Doc_weighting
except StopIteration:
pass
for word in words.keys():
if not self.words.has_key(word): self.words[word] = []
self.words[word].append((linkedname, words[word]))
#self.words[word].append(Doc_weighting)
def index(self, filename):
""" this function calls the index function,it retrieves
the appropriate file to be indexed and it also counts
the number of docs processed.
"""
self.count = self.count + 1 #works
filename = os.path.abspath(filename)
#----------------------------
file = open(filename) # Ia am here start form tommorow
test = file.read()
x = test.lower()
for i in x.split():
check = ContainsAny(filename, i)
if check == 1:
self.TestCount = self.TestCount + 1
#print self.TestCount
#------------------------------
print "\nTest count = %d" %(self.TestCount)
print filename #extra
print "\ncount_doc = %d" %(self.count)#extra
docsdir = self.docsdir()
makedirs(docsdir)
linkedname = link_file_into(docsdir, filename)
self.indexfile(filename, linkedname)
def commit(self):
wordsdir = self.wordsdir()
makedirs(wordsdir)
for word in self.words.keys():
wordfile = open(os.path.join(wordsdir, word), "a")
for file, data in self.words[word]:
# FIXME: really big concurrent indexing jobs could
# exceed PIPE_MAX bytes and result in interleaved appends
# due to buffering
wordfile.write("\n%s: %s" % (file, data))
#posix.fdatasync(wordfile.fileno()) #!!!!!!!!!!!!!!!!!!!!!!!!!!!!
win32file._get_osfhandle(wordfile.fileno())
wordfile.close()
self.words.clear()
#print " test = %d" %(self.count) # this is the # of DOcuments in the sys.
def main(argv):
if len(argv) < 2:
sys.stderr.write(usage + "\n")
return 1
indexdir = argv[1]
files = argv[2:]
myindexer = indexer(indexdir)
try:
for file in files:
print "indexing....", file,
sys.stdout.flush()
myindexer.index(file)
print "...done."
finally:
print "committing....",
sys.stdout.flush()
myindexer.commit()
print "....done."
return 0
#-------------------------------------------END-----------------------------------
# Script entry point: exit status is whatever main() returns.
if __name__ == "__main__": sys.exit(main(sys.argv))
--0-260525435-1052227845=:78244--