How to use PyArabic - 10 common examples

To help you get started, we’ve selected a few PyArabic examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github linuxscout / mishkal / tests / tools / testanalex.py View on Github external
#! /usr/bin/python
import sys
sys.path.append('lib');
import re
import string
import datetime
import getopt
import os
import pyarabic.araby as araby
import qalsadi.analex
					
# Script metadata: base name (no extension) of the running script, plus
# version and author strings, available for help/usage output.
scriptname = os.path.splitext(os.path.basename(sys.argv[0]))[0]
scriptversion = '0.1'
AuthorName="Taha Zerrouki"
#Token_pattern=re.compile(u"([\w%s ]+)"%(u"".join(araby.TASHKEEL),),re.UNICODE);
# Matches maximal runs of word characters, whitespace and Arabic diacritics
# (araby.TASHKEEL). Because the run is a capturing group, splitting on this
# pattern keeps the clause text in the result list.
Clause_pattern=re.compile(u"([\w%s\s]+)"%(u"".join(araby.TASHKEEL),),re.UNICODE);

#Token_pattern=re.compile(u"([^\w]+)",re.UNICODE);
# Token_pattern=re.compile(u"([^\w%s\s])+"%(u"".join(araby.TASHKEEL),),re.UNICODE);

def phraseSplit(text):
	"""
	Split Text into clauses
	@param text: input text;
	@type text: unicode;
	@return: list of clauses
	@rtype: list of unicode
	"""
	if text:
		list_phrase = Clause_pattern.split(text);
		if list_phrase:
			j =- 1;
github hci-lab / PyQuran / testing / ali_testing.py View on Github external
def separate_token_with_dicrites(token):
    """Split *token* into per-letter strings that keep their diacritics.

    Gets a token (string) with tashkeel and returns a list of strings,
    each string representing one base character of the token together
    with the tashkeel marks that follow it.

    Args:
        token (str): string representing a word or aya or sura

    Returns:
        [str]: a list containing the token characters with their tashkeel.
    """
    token_without_tatweel = araby.strip_tatweel(token)
    print(token_without_tatweel)
    # NOTE(review): the loop below scans `token`, not the tatweel-stripped
    # copy printed above -- looks intentional, but verify against callers.
    hroof_with_tashkeel = []
    for index, _ in enumerate(token):
        # Bug fix: `x in (alphabet or alefat or hamzat)` only ever tested
        # membership in the first non-empty collection, because `or` between
        # collections returns one operand rather than a union. Test each
        # collection explicitly instead.
        if (token[index] in alphabet
                or token[index] in alefat
                or token[index] in hamzat):
            k = index
            harf_with_taskeel = token[index]
            # Absorb every diacritic that follows the current base letter.
            # Same `or`-between-collections bug fixed here as well.
            while (k + 1) != len(token) and (token[k + 1] in tashkeel
                                             or token[k + 1] in harakat
                                             or token[k + 1] in shortharakat
                                             or token[k + 1] in tanwin):
                harf_with_taskeel = harf_with_taskeel + "" + token[k + 1]
                k = k + 1
            hroof_with_tashkeel.append(harf_with_taskeel)
    return hroof_with_tashkeel
github linuxscout / alyahmor / tests / test_print_affixes.py View on Github external
def main(args):
    """Print the unvocalized noun and verb affix lists generated by alyahmor."""
    gen = alyahmor.genelex.genelex()

    print('NOUN_AFFIX_LIST=')
    nouns = gen.generate_affix_list(word_type="noun", vocalized=False)
    print(arepr(nouns).replace(',', ',\n'))

    print('VERB_AFFIX_LIST=')
    verbs = gen.generate_affix_list(word_type="verb", vocalized=False)
    # Emit the verb list twice: first broken after each closing bracket,
    # then broken after every comma.
    for sep in ('],', ','):
        print(arepr(verbs).replace(sep, sep + '\n'))
    return 0
if __name__ == '__main__':
github linuxscout / alyahmor / tests / test_print_affixes.py View on Github external
def main(args):
    """Print the unvocalized noun and verb affix lists generated by alyahmor."""
    gen = alyahmor.genelex.genelex()

    print('NOUN_AFFIX_LIST=')
    nouns = gen.generate_affix_list(word_type="noun", vocalized=False)
    print(arepr(nouns).replace(',', ',\n'))

    print('VERB_AFFIX_LIST=')
    verbs = gen.generate_affix_list(word_type="verb", vocalized=False)
    # Emit the verb list twice: first broken after each closing bracket,
    # then broken after every comma.
    for sep in ('],', ','):
        print(arepr(verbs).replace(sep, sep + '\n'))
    return 0
if __name__ == '__main__':
github linuxscout / alyahmor / tests / test_print_affixes.py View on Github external
def main(args):
    """Print the unvocalized noun and verb affix lists generated by alyahmor."""
    gen = alyahmor.genelex.genelex()

    print('NOUN_AFFIX_LIST=')
    nouns = gen.generate_affix_list(word_type="noun", vocalized=False)
    print(arepr(nouns).replace(',', ',\n'))

    print('VERB_AFFIX_LIST=')
    verbs = gen.generate_affix_list(word_type="verb", vocalized=False)
    # Emit the verb list twice: first broken after each closing bracket,
    # then broken after every comma.
    for sep in ('],', ','):
        print(arepr(verbs).replace(sep, sep + '\n'))
    return 0
if __name__ == '__main__':
github linuxscout / alyahmor / tests / test_genelex.py View on Github external
#~ print(u"\n".join((unv_forms)).encode('utf8'))
            voc_forms = generator.get_vocalized_forms(list_forms)
            #~ print(u"\n".join((voc_forms)).encode('utf8')) 
            voc_forms_dict = generator.get_vocalized_forms_dict(list_forms)
            print(arepr(voc_forms_dict).replace('],', '],\n'))

        if wtype == "verb":
            print('************verb*****')
            list_forms =generator.generate_verb_forms(word)
            #~ print(arepr(verb_forms).replace('),', '),\n').replace('],', '],\n'))
            unv_forms = generator.get_unvocalized_forms(list_forms)
            #~ print(u"\n".join((unv_forms)).encode('utf8'))
            voc_forms = generator.get_vocalized_forms(list_forms)
            #~ print(u"\n".join((voc_forms)).encode('utf8'))
            voc_forms_dict = generator.get_vocalized_forms_dict(list_forms[:10])
            print(arepr(voc_forms_dict).replace('],', '],\n'))
github linuxscout / arramooz / scripts / nouns / spelldict.py View on Github external
if not noun_tuple or not noun_tuple.get('vocalized',''):
            return ""
        nb = 0
        prefix_table =[]
        suffix_table =[]
        stem_table = []
        flags_table ={}
        for procletic, encletic, suffix in self.affixes_list:
            affix_tags = snconst.COMP_PREFIX_LIST_TAGS[procletic]['tags'] \
                      +snconst.COMP_SUFFIX_LIST_TAGS[encletic]['tags'] \
                      +snconst.CONJ_SUFFIX_LIST_TAGS[suffix]['tags'] 
            #test if the  given word from dictionary accept those
            # tags given by affixes
            # دراسة توافق الزوائد مع خصائص الاسم،
            # مثلا هل يقبل الاسم التأنيث.
            suffix_nm = araby.strip_tashkeel(suffix)
            encletic_nm = araby.strip_tashkeel(encletic)
            
            if nspell.validate_tags(noun_tuple, affix_tags, procletic, encletic_nm, suffix_nm):

                if nspell.is_compatible_proaffix_affix(noun_tuple, procletic, encletic, suffix):
                    vocalized, semi_vocalized, segmented = nspell.vocalize(noun_tuple['vocalized'], procletic,  suffix, encletic)
                    if VERIFY_INPUT: 
                        print (u"\t".join([  segmented,  vocalized])).encode('utf8')
                        tags = self.get_tags(noun_tuple, affix_tags) 
                        print (u"\t".join([  araby.strip_tashkeel(vocalized),  noun_tuple['unvocalized'], tags])).encode('utf8')
                        print ("*" + u"\t".join([  araby.strip_tashkeel(vocalized),  noun_tuple['unvocalized'], u','.join(affix_tags)])).encode('utf8')
                    nb += 1
                    listfields = segmented.split('-')
                    if len(listfields) == 4:
                        pref = listfields[0]
                        stem = listfields[1]
github hci-lab / PyQuran / tools / searchHelper.py View on Github external
def hellper_get_sequance_positions(verse,sequance):
    """Locate a word sequence inside a verse, ignoring tashkeel.

    Both inputs are stripped of tashkeel and split on whitespace, then
    every alignment of the sequence against the verse is tested.

    @param verse: the verse text to search in
    @param sequance: the word sequence to look for
    @return: list of word indices at which a complete match *ends*
             (index of the sequence's last word within the verse),
             matching the value the original loop appended
    """
    verse_words = strip_tashkeel(verse).split()
    seq_words = strip_tashkeel(sequance).split()
    positions = []
    # An empty sequence can never match.
    if not seq_words:
        return positions
    seq_len = len(seq_words)
    # Bug fix: bound the comparison window so a partial match at the very
    # end of the verse no longer indexes past the verse (IndexError in the
    # original inner loop, which advanced its index without a length check).
    for start in range(len(verse_words) - seq_len + 1):
        if verse_words[start:start + seq_len] == seq_words:
            positions.append(start + seq_len - 1)
    return positions
github linuxscout / arabicstopwords / scripts / csvdict.py View on Github external
"""
                UNVOCALIZED TEXT NOT NULL,
                PROCLETIC TEXT,
                TAGS TEXT,
                VOCALIZED TEXT,
                STEM TEXT,
                TYPE TEXT,
                ORIGINAL TEXT,
                ENCLETIC TEXT
                """       
                #~ print(tuple_table[conj])
                stemmed, tags = conj     
                result_fields['stemmed'] = stemmed
                result_fields['vocalized'] = ar_stopwords.standardize_form(result_fields['stemmed']);
                result_fields['word']      = ar_stopwords.standardize_form(result_fields['stemmed']);
                result_fields['standard']  = araby.strip_tashkeel(result_fields['vocalized']);
                parts = stemmed.split(';')
                if len(parts)>=3:
                    result_fields['procletic'] = parts[0]
                    result_fields['stem'] = parts[1]
                    result_fields['encletic'] = parts[2]                    
                result_fields['tags'] = tags #fields.get("tags", 'tags')
                result_fields['unvocalized'] = result_fields['standard']
                
                fields_table.append(result_fields)
            return fields_table
github linuxscout / mishkal / support / collocations / pyarabic / named.py View on Github external
>>> detectNamedPosition(u"قال خالد بن رافع  حدثني أحمد بن عنبر عن خاله");
	((1,3), (6,8))
	"""
	wordlist#=text.split(u' ');
	#print words;
	positions = [];
	startNamed =-1;
	endNamed   =False;
	# print u":".join(wordlist).encode('utf8');
	for i in range(len(wordlist)):
		word=wordlist[i];
		if i+1=0: 
			previous=araby.stripTashkeel(wordlist[i-1]);
			if previous and startNamed<0  and previous[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
				previous=previous[1:];
		else: previous = u''
		#save the original word with possible harakat if exist
		word_nm=araby.stripTashkeel(word);
		key=word_nm;
		# the first word can have prefixes 
		if word_nm and startNamed<0  and word_nm[0] in (u'و', u'ف', u'ل', u'ب', u'ك'):
			key=word_nm[1:];
		if startNamed<0 and key in (u'ابن', ):
			startNamed=i;
			endNamed=i

		elif key in (u'ابن', u'بن',u'أبو',u'أبا', u'أبي', u'عبد' , u'عبيد' , u'بنو', u'بني', u'بنت'):
			if startNamed<0:
				startNamed=i;