python student at university of jordan.

heba abukaff habukaff50 at yahoo.com
Sat Oct 25 03:47:42 CEST 2014


Hi,
my name is heba ibrahim abukaff from jordan ,iam a computer information system student at university of jordan . 
i have a trouble using the tokenizer to find the frequency list for URL using arabic text.and iam using python 2.7.2 on winXP,I tried this code but every time i run the code appears error with first line 
COULD YOU HELP ME.
WITH REGARDS.
::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
python 2.7.2 (default, Jun 12 2011, 15:08:59) [MSC v.1500 32 bit (Intel)] on win32
> import re, codecs
import nltk 

from urllib import request
url = "http://ar.wikipedia.org/wiki/%D9%85%D9%88%D9%82%D8%B9_%D9%88%D9%8A%D8%A8"
response = request.urlopen(url)
raw = response.read().decode('utf8')
print(raw)

import re, codecs import nltk from nltk.probability
import * def construct_frequency_list(words):
    
outfile = codecs.open(r'raw.txt' , 'w')
fd = nltk.probability.FreqDist()
for w in words: fd.inc(w)
print "Total number of words: %d Vocbulary size : %d" % (fd.N(), fd.B())
print "word with highest count: %s" % (fd.max())
tokenlist = fd.iteritems()
for (key, value) in tokenlist:
print>>outfile, "%s\t%d" % (key, value)
outfile.close() def read_textfile(raw ):
lines = codecs.open(raw ,'r','utf_8').readlines()
outfile = codecs.open(r'raw.txt' ,'w','utf_8')
counter = 0 wordlist = [] for line in lines:
tokens = line.rstrip().lstrip().split()
for t in tokens: wordlist.append(t)
print>>outfile, '%s\t%d' % (t, len(t))
counter += len(t) outfile.close()
return wordlist if __name__ == "__main__":
 words = read_textfile(r'raw .txt')
construct_frequency_list(words)
print "Done!"
 



More information about the Python-list mailing list