yomnasalah91
I have Python code that takes an Arabic word, gets its root, and also removes diacritics, but I have a problem with the output. For example: when the input is "العربيه" the output is "عرب", which is the right answer, but when the input is "كاتب" the output is "ب", and when the input is "يخاف" the output is "خف".
This is my code:
# -*- coding=utf-8 -*-
import re
from arabic_const import *
import Tashaphyne
from Tashaphyne import *
import enum
from enum import Enum
search_type=Enum('unvoc_word','voc_word','root_word')
HARAKAT_pat = re.compile(ur"[" + u"".join([FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA]) + u"]")
HAMZAT_pat = re.compile(ur"[" + u"".join([WAW_HAMZA, YEH_HAMZA]) + u"]");
ALEFAT_pat = re.compile(ur"[" + u"".join([ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, HAMZA_ABOVE, HAMZA_BELOW]) + u"]");
LAMALEFAT_pat = re.compile(ur"[" + u"".join([LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE]) + u"]");
#--------------------------------------
def strip_tashkeel(w):
"strip vowel from a word and return a result word"
return HARAKAT_pat.sub('', w)
#strip tatweel from a word and return a result word
#--------------------------------------
def strip_tatweel(w):
"strip tatweel from a word and return a result word"
return re.sub(ur'[%s]' % TATWEEL, '', w)
#--------------------------------------
def normalize_hamza(w):
"strip vowel from a word and return a result word"
w = ALEFAT_pat.sub(ALEF, w)
return HAMZAT_pat.sub(HAMZA, w)
#--------------------------------------
def normalize_lamalef(w):
"strip vowel from a word and return a result word"
return LAMALEFAT_pat.sub(u'%s%s' % (LAM, ALEF), w)
#--------------------------------------
def normalize_spellerrors(w):
"strip vowel from a word and return a result word"
w = re.sub(ur'[%s]' % TEH_MARBUTA, HEH, w)
return re.sub(ur'[%s]' % ALEF_MAKSURA, YEH, w)
def guess_stem(self,word):
"""
Detect affixed letters based on phonetic root composition.
In Arabic, some letters cannot be adjacent within a root, so such sequences are assumed to come from affixation.
This function returns the word with a '-' inserted at the guessed stemming position.
@param word: the word.
@type word: unicode.
@return: word with a '-' to indicate the stemming position.
@rtype: unicode
"""
# certain letter sequences are forbidden inside an Arabic root,
# expressed here as sequences of letters,
# but the same sequence can occur through affixation,
# so we can guess that such letters are affixes
#
# treat one prefix letter
# we strip harakat and shadda
word=ar_strip_marks(word);
prefixes_letters=(TEH, MEEM,LAM,WAW,BEH, KAF,FEH,HAMZA,YEH,NOON)
prefixes_forbiden={
ALEF_HAMZA_ABOVE:(ALEF_HAMZA_ABOVE, ZAH, AIN, GHAIN),
BEH:(BEH, FEH, MEEM),
TEH:(THEH, DAL, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH),
FEH:(BEH, FEH, MEEM),
KAF:(JEEM, DAD, TAH, ZAH, QAF, KAF),
LAM:(REH, SHEEN, LAM, NOON),
MEEM:(BEH, FEH, MEEM),
NOON:(REH, LAM, NOON),
WAW:(WAW, YEH),
YEH:(THEH, JEEM, HAH, KHAH, THAL, ZAIN, SHEEN, SAD, DAD, TAH, ZAH, GHAIN, KAF, HEH, YEH),
}
word_guess=word;
if len(word)>=2:
c1=word[0];
c2=word[1];
# if c1 in prefixes_letters and (c1 in prefixes_forbiden.keys() and c2 in prefixes_forbiden[c1]):
if prefixes_forbiden.has_key(c1) and c2 in prefixes_forbiden[c1]:
word_guess=u"%s-%s"%(c1,word[1:])
if len(word_guess)>=4:
c1=word_guess[2];
c2=word_guess[3];
if c1 in prefixes_letters and ( c2 in prefixes_forbiden[c1]):
word_guess=u"%s-%s"%(c1,word_guess[2:])
# treat two suffix letters
bisuffixes_letters=(KAF+MEEM,KAF+NOON,HEH+MEEM,HEH+NOON)
bisuffixes_forbiden={
HEH+MEEM:(ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH, HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN, HEH, YEH),
KAF+MEEM:(ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH, JEEM, KHAH, ZAIN, SEEN, SHEEN, DAD, TAH, ZAH, GHAIN, FEH, QAF, KAF, LAM, NOON, HEH, YEH),
HEH+NOON:(ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH, JEEM, HAH, KHAH, SAD, DAD, TAH, ZAH, AIN, GHAIN, HEH, YEH),
KAF+NOON:(ALEF_HAMZA_ABOVE, HAMZA, WAW_HAMZA, YEH_HAMZA, BEH, THEH, JEEM, HAH, KHAH, THAL, SHEEN, DAD, TAH, ZAH, AIN, GHAIN, QAF, KAF, NOON, HEH, YEH),
}
## word_guess=word;
word=word_guess;
if len(word)>=3:
bc_last=word[-2:];
bc_blast=word[-3:-2]
if bc_last in bisuffixes_letters:
if bc_blast in bisuffixes_forbiden[bc_last]:
word_guess=u"%s-%s"%(word[:-2],bc_last)
# treat one suffix letter
suffixes_letters=(KAF,TEH,HEH)
suffixes_forbiden={
TEH:(THEH, JEEM, DAL, THAL, ZAIN, SHEEN, TAH, ZAH),
KAF:(THEH, JEEM, KHAH, THAL, TAH, ZAH, GHAIN, QAF),
HEH:(TEH, HAH, KHAH, DAL, REH, SEEN, SHEEN, SAD, ZAH, AIN, GHAIN),
}
word=word_guess;
c_last=word[-1:];
c_blast=word[-2:-1]
if c_last in suffixes_letters:
if c_blast in suffixes_forbiden[c_last]:
word_guess=u"%s-%s"%(word[:-1],c_last)
return word_guess;
def normalize_text(word,searchtype):
word = strip_tashkeel(word)
print word
word = strip_tatweel(word)
print word
word = normalize_lamalef(word)
print word
word = normalize_hamza(word)
print word
word = normalize_spellerrors(word)
print word
if searchtype==search_type.root_word.index:
"""ArListem=ArabicLightStemmer()
stem=ArListem.lightStem(word)
word=ArListem.get_stem()
print word
w=ArListem.get_prefix()
print w
word=ArListem.get_root()"""
word=guess_stem(word,w)
print word
return word
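For reference, the commented-out block in normalize_text corresponds to calling Tashaphyne's own light stemmer. Below is a minimal sketch of that call, assuming the class and method names from the current tashaphyne documentation (tashaphyne.stemming.ArabicLightStemmer with light_stem(), get_stem() and get_root()); older releases imported via "from Tashaphyne import *" may expose different names, so treat this as a sketch rather than a drop-in replacement:

# -*- coding: utf-8 -*-
# Minimal sketch (assumption): call Tashaphyne's light stemmer directly,
# using the class/method names from the current tashaphyne documentation.
from tashaphyne.stemming import ArabicLightStemmer

stemmer = ArabicLightStemmer()
for word in (u"العربيه", u"كاتب", u"يخاف"):
    stemmer.light_stem(word)  # segment the word into prefix / stem / suffix
    print(u"%s -> stem=%s root=%s" % (word, stemmer.get_stem(), stemmer.get_root()))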