How to use the pyarabic.araby.strip_tashkeel function in PyArabic

To help you get started, we’ve selected a few PyArabic examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github linuxscout / arramooz / scripts / nouns / spelldict.py View on Github external
if not noun_tuple or not noun_tuple.get('vocalized',''):
            return ""
        nb = 0
        prefix_table =[]
        suffix_table =[]
        stem_table = []
        flags_table ={}
        for procletic, encletic, suffix in self.affixes_list:
            affix_tags = snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                      +snconst.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \
                      +snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags'] 
            #test if the  given word from dictionary accept those
            # tags given by affixes
            # دراسة توافق الزوائد مع خصائص الاسم،
            # مثلا هل يقبل الاسم التأنيث.
            suffix_nm = araby.strip_tashkeel(suffix)
            encletic_nm = araby.strip_tashkeel(encletic)
            
            if nspell.validate_tags(noun_tuple, affix_tags, procletic, encletic_nm, suffix_nm):

                if nspell.is_compatible_proaffix_affix(noun_tuple, procletic, encletic, suffix):
                    vocalized, semi_vocalized, segmented = nspell.vocalize(noun_tuple['vocalized'], procletic,  suffix, encletic)
                    if VERIFY_INPUT: 
                        print (u"\t".join([  segmented,  vocalized])).encode('utf8')
                        tags = self.get_tags(noun_tuple, affix_tags) 
                        print (u"\t".join([  araby.strip_tashkeel(vocalized),  noun_tuple['unvocalized'], tags])).encode('utf8')
                        print ("*" + u"\t".join([  araby.strip_tashkeel(vocalized),  noun_tuple['unvocalized'], u','.join(affix_tags)])).encode('utf8')
                    nb += 1
                    listfields = segmented.split('-')
                    if len(listfields) == 4:
                        pref = listfields[0]
                        stem = listfields[1]
github hci-lab / PyQuran / tools / searchHelper.py View on Github external
def hellper_get_sequance_positions(verse,sequance):
    """
    Find where the word sequence *sequance* occurs inside *verse*.

    Both strings are stripped of tashkeel (diacritics) before being split
    into words, so matching is done on bare word forms.

    NOTE: to stay compatible with the original implementation, each
    reported position is the index of the LAST word of a full match
    (for a single-word sequence this equals the start index).

    @param verse: verse text to search in.
    @type verse: unicode.
    @param sequance: word sequence to look for.
    @type sequance: unicode.
    @return: word indexes (0-based) where full matches end.
    @rtype: list of int.
    """
    verse_words = strip_tashkeel(verse).split()
    seq_words = strip_tashkeel(sequance).split()
    positions = []
    seq_len = len(seq_words)
    if not seq_len:
        return positions
    # Only consider start offsets where the whole sequence fits inside the
    # verse; the original indexed verse[n] without a bound check and raised
    # IndexError when a candidate match ran past the end of the verse.
    for start in range(len(verse_words) - seq_len + 1):
        if verse_words[start:start + seq_len] == seq_words:
            positions.append(start + seq_len - 1)
    return positions
github linuxscout / arabicstopwords / scripts / csvdict.py View on Github external
"""
                UNVOCALIZED TEXT NOT NULL,
                PROCLETIC TEXT,
                TAGS TEXT,
                VOCALIZED TEXT,
                STEM TEXT,
                TYPE TEXT,
                ORIGINAL TEXT,
                ENCLETIC TEXT
                """       
                #~ print(tuple_table[conj])
                stemmed, tags = conj     
                result_fields['stemmed'] = stemmed
                result_fields['vocalized'] = ar_stopwords.standardize_form(result_fields['stemmed']);
                result_fields['word']      = ar_stopwords.standardize_form(result_fields['stemmed']);
                result_fields['standard']  = araby.strip_tashkeel(result_fields['vocalized']);
                parts = stemmed.split(';')
                if len(parts)>=3:
                    result_fields['procletic'] = parts[0]
                    result_fields['stem'] = parts[1]
                    result_fields['encletic'] = parts[2]                    
                result_fields['tags'] = tags #fields.get("tags", 'tags')
                result_fields['unvocalized'] = result_fields['standard']
                
                fields_table.append(result_fields)
            return fields_table
github linuxscout / mishkal / support / aranasyn / stemmedsynword.py View on Github external
def get_unvoriginal(self, ):
        """
        Get the unvocalized original form of the input word.
        @return: the given unvocalized original.
        @rtype: unicode string
        """
        # Cached value wins: return it as-is.
        if self.unvoriginal:
            return self.unvoriginal
        # Nothing to strip from: keep the empty-string contract.
        if not self.original:
            return u""
        # Compute once and memoize on the instance for later calls.
        self.unvoriginal = araby.strip_tashkeel(self.original)
        return self.unvoriginal
github linuxscout / alyahmor / alyahmor / genelex.py View on Github external
def get_vocalized_affixes_dict(self, forms = None):
        """
        Group vocalized affix forms by their unvocalized key.

        Each form is stripped of tashkeel to obtain its bare key; all forms
        sharing a key are collected, deduplicated and sorted.

        @param forms: vocalized affix forms; None or empty yields {}.
        @type forms: list of unicode.
        @return: mapping unvocalized form -> sorted unique vocalized forms.
        @rtype: dict of list.
        """
        # Default changed from a shared mutable [] to None: same observable
        # behavior, avoids the mutable-default-argument pitfall.
        forms_dict = {}
        if forms:
            for form in forms:
                unvoc = araby.strip_tashkeel(form)
                forms_dict.setdefault(unvoc, []).append(form)
        for key in forms_dict:
            if len(forms_dict[key]) >= 2:
                # BUGFIX: the original sorted first and then did
                # list(set(...)), which discarded the sort and returned
                # arbitrary set order. Deduplicate first, then sort.
                forms_dict[key] = sorted(set(forms_dict[key]))
        return forms_dict
github linuxscout / mishkal / interfaces / gui / gui / customdictionary.py View on Github external
def add(self, word):
        """
        add a new vocalization given by user for unrecognized word
        @param word: the vocalized word to record.
        @type word: unicode.
        @return: nothing; the word is stored in memory and appended to file.
        @rtype: none
        """
        # Index entries by the bare (tashkeel-stripped) form of the word.
        word_nm = araby.strip_tashkeel(word)
        if word_nm not in self.dictio:
            self.dictio[word_nm] = [word, ]
        elif word not in self.dictio[word_nm]:
            # Avoid recording duplicate vocalizations for the same bare form.
            self.dictio[word_nm].append(word)
        # Persistence is best-effort: an I/O failure must not lose the
        # in-memory entry added above.
        try:
            self.cdfile = open(self.filename, "a+")
            text = u"%s\t%s\n" % (word_nm, u':'.join(self.dictio[word_nm]))
            self.cdfile.write(text.encode('utf8'))
            self.cdfile.close()
        except Exception:
            # Was a bare "except:": keep the best-effort behavior but no
            # longer swallow SystemExit / KeyboardInterrupt. print(...) is
            # valid in both Python 2 and 3; message typos fixed.
            print("updating: can't update custom dictionary")
    def __del__(self,):
github linuxscout / mishkal / support / qalsadi / analex.py View on Github external
the given word, and give ذئب.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the founded resulat from dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    #print word_vocalised.encode('utf8')
    filtred_data = []
    inputword = araby.strip_tashkeel(word_vocalised)
    for item in resulted_data:
        if 'vocalized' in item.__dict__:  #.has_key('vocalized') :
            #~ if 'vocalized' in item :
            #~ outputword = araby.strip_tashkeel(item['vocalized'])
            outputword = araby.strip_tashkeel(item.__dict__['vocalized'])
            #~ print u'\t'.join([inputword, outputword]).encode('utf8')
            if inputword == outputword:
                #item['tags'] += ':a'
                filtred_data.append(item)
            #~ filtred_data.append(item)
    return filtred_data

github linuxscout / arramooz / scripts / verbs / spelldict.py View on Github external
flags += svconst.TabSuffixes[pronoun]['full'];
                                    
                                #   add flag yeh for the الأفعال الخمسة 
                                if tense == const.TenseFuture and pronoun in (const.PronounAnti, const.PronounAntuma, const.PronounAntuma_f, 
                                                                              const.PronounAntum, const.PronounHuma, const.PronounHuma_f, const.PronounHum ):
                                    flags+=u"Ha"; 
                                                                      
                            # add double object suffixe, if the verb is double transitive, and the tense is indicative 
                            if v['double_trans'] and tense in const.TableIndicativeTense:
                                
                                # add flags for suffixes (double object)
                                    flags += svconst.TabDisplayTagDouble[pronoun]['full'];
                            
                            #add an entree to the table entrie
                            # this allows to reduce many cases into one entree
                            word_nm = araby.strip_tashkeel(conjugTable[tense][pronoun]);
                            if TableEntries.has_key(word_nm):
                                TableEntries[word_nm] += flags;
                            else:
                                TableEntries[word_nm] = flags;
                            #print (u'%s/%s\t%s%s'%(ar_strip_marks(conjugTable[tense][pronoun]), flags, word,verb_cat)).encode('utf8');
                # print element from the TableEntries
                for key in TableEntries.keys():
                    if key!="":
                        line +=u'%s/%s\n'%(key, vspell.unify_flags(TableEntries[key]))               
            
        return line
github linuxscout / mishkal / support / qalsadi / analex.py View on Github external
to treat some normalized cases,
    the analyzer return the vocalized like words
    ُIf the word is ذئب, the normalized form is ذءب,
    which can give from dictionary ذئبـ ذؤب.
    this function filter normalized resulted word according
    the given word, and give ذئب.
    @param word_vocalised: the input word.
    @type word_vocalised: unicode.
    @param resulted_data: the founded resulat from dictionary.
    @type resulted_data: list of dict.
    @return: list of dictionaries of analyzed words with tags.
    @rtype: list.
    """
    #print word_vocalised.encode('utf8')
    filtred_data = []
    inputword = araby.strip_tashkeel(word_vocalised)
    for item in resulted_data:
        if 'vocalized' in item.__dict__:  #.has_key('vocalized') :
            #~ if 'vocalized' in item :
            #~ outputword = araby.strip_tashkeel(item['vocalized'])
            outputword = araby.strip_tashkeel(item.__dict__['vocalized'])
            #~ print u'\t'.join([inputword, outputword]).encode('utf8')
            if inputword == outputword:
                #item['tags'] += ':a'
                filtred_data.append(item)
            #~ filtred_data.append(item)
    return filtred_data