Question about Tashaphyne package in python
yomnasalah91 at gmail.com
Sat Mar 2 22:06:53 EST 2013
I have Python code that takes an Arabic word, removes the diacritics, and extracts the root, but I have a problem with the output. For example: when the input is "العربيه" the output is "عرب", which is the right answer, but when the input is "كاتب" the output is "ب", and when the input is "يخاف" the output is "خف".
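To narrow down where the wrong roots come from, a minimal direct call to the light stemmer would look roughly like this (the method names are the ones used in the commented-out block inside normalize_text below, so they may not match every Tashaphyne version; this is only a sketch):

# -*- coding=utf-8 -*-
# Sketch: call the Tashaphyne light stemmer directly, bypassing my own
# guess_stem(). lightStem / get_stem / get_root are the names used in the
# commented-out block below and may differ between Tashaphyne releases.
from Tashaphyne import *

ArListem = ArabicLightStemmer()
ArListem.lightStem(u"كاتب")
print ArListem.get_stem()
print ArListem.get_root()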
This is my code:
# -*- coding=utf-8 -*-
import re
from arabic_const import *
import Tashaphyne
from Tashaphyne import *
import enum
from enum import Enum
search_type=Enum('unvoc_word','voc_word','root_word')
HARAKAT_pat = re.compile(ur"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA]) + u"]")
HAMZAT_pat = re.compile(ur"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]")
ALEFAT_pat = re.compile(ur"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW]) + u"]")
LAMALEFAT_pat = re.compile(ur"[" + u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE]) + u"]")
#--------------------------------------
def strip_tashkeel(w):
    "strip vowels (harakat and shadda) from a word and return the result"
    return HARAKAT_pat.sub('', w)
#--------------------------------------
def strip_tatweel(w):
    "strip tatweel from a word and return the result"
    return re.sub(ur'[%s]' % TATWEEL, '', w)
#--------------------------------------
def normalize_hamza(w):
    "normalize the different hamza/alef forms to bare alef and hamza, and return the result"
    w = ALEFAT_pat.sub(ALEF, w)
    return HAMZAT_pat.sub(HAMZA, w)
#--------------------------------------
def normalize_lamalef(w):
    "replace lam-alef ligatures with the two letters lam + alef, and return the result"
    return LAMALEFAT_pat.sub(u'%s%s' % (LAM, ALEF), w)
#--------------------------------------
def normalize_spellerrors(w):
    "normalize common spelling variants (teh marbuta -> heh, alef maksura -> yeh), and return the result"
    w = re.sub(ur'[%s]' % TEH_MARBUTA, HEH, w)
    return re.sub(ur'[%s]' % ALEF_MAKSURA, YEH, w)
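# Illustration (my own note, not from arabic_const): these helpers are meant
# to be chained, e.g.
#   strip_tashkeel(u"أَحْمَد")  ->  u"أحمد"   (harakat removed)
#   normalize_hamza(u"أحمد")   ->  u"احمد"   (hamza-on-alef folded to bare alef)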
#--------------------------------------
def guess_stem(word):
    """
    Detect affixed letters based on the phonetic composition of roots.
    In Arabic, some letters cannot be adjacent within a root.
    @param word: the word.
    @type word: unicode.
    @return: word with a '-' inserted at the guessed stemming position.
    @rtype: unicode
    """
    # Certain letter sequences are forbidden inside an Arabic root,
    # but the same sequences can occur through affixation, so when we
    # see one we can guess that the first letter is an affix.
    #
    # Treat a one-letter prefix: first strip harakat and shadda.
    word = ar_strip_marks(word)
    prefixes_letters = (TEH, MEEM, LAM, WAW, BEH, KAF, FEH, HAMZA, YEH, NOON)
    prefixes_forbiden = {
        ALEF_HAMZA_ABOVE: (ALEF_HAMZA_ABOVE, ZAH, AIN, GHAIN),
        BEH: (BEH, FEH, MEEM),
        TEH: (THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH),
        FEH: (BEH, FEH, MEEM),
        KAF: (JEEM, DAD, TAH, ZAH, QAF, KAF),
        LAM: (REH, SHEEN, LAM, NOON),
        MEEM: (BEH, FEH, MEEM),
        NOON: (REH, LAM, NOON),
        WAW: (WAW, YEH),
        YEH: (THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH, GHAIN, KAF, HEH, YEH),
        }
    word_guess = word
    if len(word) >= 2:
        c1 = word[0]
        c2 = word[1]
        if c1 in prefixes_forbiden and c2 in prefixes_forbiden[c1]:
            word_guess = u"%s-%s" % (c1, word[1:])
            if len(word_guess) >= 4:
                c1 = word_guess[2]
                c2 = word_guess[3]
                # .get() avoids a KeyError when c1 (e.g. HAMZA) has no
                # entry in prefixes_forbiden
                if c1 in prefixes_letters and c2 in prefixes_forbiden.get(c1, ()):
                    word_guess = u"%s-%s" % (c1, word_guess[2:])
    # Treat a two-letter suffix.
    bisuffixes_letters = (KAF + MEEM, KAF + NOON, HEH + MEEM, HEH + NOON)
    bisuffixes_forbiden = {
        HEH + MEEM: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH, HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN, HEH, YEH),
        KAF + MEEM: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH, JEEM, KHAH, ZAIN, SEEN, SHEEN, DAD, TAH, ZAH, GHAIN, FEH, QAF, KAF, LAM, NOON, HEH, YEH),
        HEH + NOON: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH, JEEM, HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN, HEH, YEH),
        KAF + NOON: (ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH, JEEM, HAH, KHAH, THAL, SHEEN, DAD, TAH, ZAH, AIN, GHAIN, QAF, KAF, NOON, HEH, YEH),
        }
    word = word_guess
    if len(word) >= 3:
        bc_last = word[-2:]
        bc_blast = word[-3:-2]
        if bc_last in bisuffixes_letters:
            if bc_blast in bisuffixes_forbiden[bc_last]:
                word_guess = u"%s-%s" % (word[:-2], bc_last)
    # Treat a one-letter suffix.
    suffixes_letters = (KAF, TEH, HEH)
    suffixes_forbiden = {
        TEH: (THEH, JEEM, DAL, THAL, ZAIN, SHEEN, TAH, ZAH),
        KAF: (THEH, JEEM, KHAH, THAL, TAH, ZAH, GHAIN, QAF),
        HEH: (TEH, HAH, KHAH, DAL, REH, SEEN, SHEEN, SAD, ZAH, AIN, GHAIN),
        }
    word = word_guess
    c_last = word[-1:]
    c_blast = word[-2:-1]
    if c_last in suffixes_letters:
        if c_blast in suffixes_forbiden[c_last]:
            word_guess = u"%s-%s" % (word[:-1], c_last)
    return word_guess
#--------------------------------------
def normalize_text(word, searchtype):
    word = strip_tashkeel(word)
    print word
    word = strip_tatweel(word)
    print word
    word = normalize_lamalef(word)
    print word
    word = normalize_hamza(word)
    print word
    word = normalize_spellerrors(word)
    print word
    if searchtype == search_type.root_word.index:
        """ArListem=ArabicLightStemmer()
        stem=ArListem.lightStem(word)
        word=ArListem.get_stem()
        print word
        w=ArListem.get_prefix()
        print w
        word=ArListem.get_root()"""
        # guess_stem() only takes the word itself; the old second argument
        # `w` was never defined once the block above was commented out.
        word = guess_stem(word)
        print word
    return word
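For completeness, a minimal driver for the code above would look like this (just a sketch; search_type.root_word.index is the enum value defined near the top of the module):

# Sketch of a driver for the module above.
if __name__ == '__main__':
    for w in [u"العربيه", u"كاتب", u"يخاف"]:
        print normalize_text(w, search_type.root_word.index)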