Need help making my script read Arabic-language text

enas khalil enas_khalil at
Sun Nov 20 14:35:19 CET 2005

Hello,
  I would like to know, please, how I can use Python code to tag an Arabic text file.
  My code is as follows:
  # -*- coding: cp1256 -*-
import codecs
from nltk.tagger import *
from nltk.corpus import brown
from nltk.tokenizer import WhitespaceTokenizer
from nltk import *
from nltk.tokenreader.tagged import TaggedTokenReader
  # Tokenize ten texts from the Brown Corpus
train_tokens = []
  text_str = (open('fataha2.txt').read())
reader = TaggedTokenReader(SUBTOKENS='WORDS')
text_token = reader.read_token(text_str)
print text_token['WORDS']
  for l in text_token['WORDS']:
  #Initialise and train a unigram tagger
mytagger = UnigramTagger(SUBTOKENS='WORDS')
  for xx in train_tokens:
    cc = reader.read_token(xx['TEXT'])
   # print cc.keys()
    cc['SUBTOKENS']= cc['WORDS']
#Once a UnigramTagger has been trained, the tag() method can be used to tag new text: 
text_token = Token(TEXT="الحمد لله رب العالمين")
#print 'The first example : Using Unigram Tagger the reseults are : '
print text_token
  and I got the following error:
  Traceback (most recent call last):
  File "I:/examples/", line 13, in ?
  File "C:\Python24\lib\encodings\", line 18, in encode
    return codecs.charmap_encode(input,errors,encoding_map)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc8 in position 0: ordinal not in range(128)
   Please help.

 Yahoo! FareChase - Search multiple travel sites in one click.  
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <>

More information about the Python-list mailing list