How to use the pyarabic.araby.strip_tatweel function in PyArabic

To help you get started, we’ve selected a few PyArabic examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github hci-lab / PyQuran / testing / ali_testing.py View on Github external
def separate_token_with_dicrites(token):
    """Split a token into its letters, each grouped with its trailing tashkeel.

    The token is first passed through ``araby.strip_tatweel``. Every character
    that belongs to ``alphabet``, ``alefat`` or ``hamzat`` then starts a new
    group, and the run of characters immediately following it that belong to
    ``tashkeel``, ``harakat``, ``shortharakat`` or ``tanwin`` is attached to
    that group. Characters that neither start a group nor directly continue
    one (e.g. spaces or marks with no preceding letter) are dropped.

    Args:
        token (str): string represents a word or aya or sura
    Returns:
        [str]: a list contains the token characters with their tashkeel.
    """
    # Work on the tatweel-free text; previously the stripped string was
    # computed but the loop still walked the original token.
    text = araby.strip_tatweel(token)

    # NOTE(review): assumes alefat, hamzat, harakat, shortharakat and tanwin
    # are module-level collections like alphabet/tashkeel -- confirm.
    def is_letter(char):
        # `char in (a or b or c)` only ever tests the first truthy operand,
        # so each collection has to be checked on its own.
        return char in alphabet or char in alefat or char in hamzat

    def is_mark(char):
        return (char in tashkeel or char in harakat
                or char in shortharakat or char in tanwin)

    hroof_with_tashkeel = []
    length = len(text)
    for start, char in enumerate(text):
        if not is_letter(char):
            continue
        # Extend the slice over every mark that directly follows the letter.
        end = start + 1
        while end < length and is_mark(text[end]):
            end += 1
        hroof_with_tashkeel.append(text[start:end])
    return hroof_with_tashkeel
github linuxscout / mishkal / support / yaraspell / spelldict.py View on Github external
        @rtype: Boolean
        """
        if not word: 
            return True
        if word.isdigit():
            return True
        for c in word:
            if c in string.punctuation:
                return True
        # test if the word was previously spelled
        # can get True or False
        if word in self.worddict:
            test = self.worddict.get(word, False)
        else:
            # if the word is not spelled
            word = araby.strip_tatweel(word)
            self.stemmer.segment(word)        
            # extract the affix 
            stem = self.stemmer.get_stem()
            affix = u"-".join([self.stemmer.get_prefix(), self.stemmer.get_suffix()])
            # lookup in the database
            test = self.database.lookup(word, stem, affix)
            self.worddict[word] = test
        return test
github linuxscout / mishkal / support / qalsadi / analex.py View on Github external
def check_word(self, word, guessedtag=""):
        """
        Analyze one word morphologically as verbs
        @param word: the input word.
        @type word: unicode.
        @return: list of dictionaries of analyzed words with tags.
        @rtype: list.
        """

        word = araby.strip_tatweel(word)
        word_vocalised = word
        word_nm = araby.strip_tashkeel(word)
        # get analysed details from cache if used
        if self.allow_cache_use and self.cache.is_already_checked(word_nm):
            #~ print (u"'%s'"%word).encode('utf8'), 'found'
            resulted_data = self.cache.get_checked(word_nm)
        else:
            resulted_data = []
            # if word is a punctuation mark
            resulted_data += self.check_word_as_pounct(word_nm)
            # Done: if the word is a stop word we have  some problems,
            # the stop word can also be another normal word (verb or noun),
            # we must consider it in future works
            # if word is stopword allow stop words analysis
            resulted_data += self.check_word_as_stopword(word_nm)
github linuxscout / fareh / scripts / rule_builder.py View on Github external
def clean(self, strng):
        """
        Clean a string, or each string of a list, from tatweel and
        unnecessary whitespace.

        @param strng: a string, a list of strings, or any other value.
        @return: for a string, the text passed through araby.strip_tatweel
            with every whitespace run collapsed to one space and the ends
            trimmed; for a list, a new list with each item cleaned the same
            way; any other value is returned unchanged.
        """
        def _clean_one(text):
            # Strip tatweel before collapsing whitespace so that spaces left
            # on both sides of a removed tatweel are merged too; the list
            # branch used to do this in the opposite order.
            return re.sub(r'\s+', ' ', araby.strip_tatweel(text)).strip()

        # isinstance also accepts subclasses of str/list.
        if isinstance(strng, str):
            return _clean_one(strng)
        if isinstance(strng, list):
            return [_clean_one(item) for item in strng]
        return strng
github hci-lab / PyQuran / core / pyquran.py View on Github external
"""Grouping each letter with its diacritics.

        Args:
            sentance: str

        Returns:
            [str]: a list of _x_, where _x_ is the letter accompanied with its
            diacritics.

    Example:
    ```python
    q.grouping_letter_diacritics('إِنَّا أَعْطَيْنَكَ الْكَوْثَرَ')\n
    >>> ['إِ', 'نَّ', 'ا', ' ', 'أَ', 'عْ', 'طَ', 'يْ', 'نَ', 'كَ', ' ', 'ا', 'لْ', 'كَ', 'وْ', 'ثَ', 'رَ']
    ```
    """
    sentance_without_tatweel = strip_tatweel(sentance)
    print(sentance_without_tatweel)
    hroof_with_tashkeel = []
    for index,i in enumerate(sentance):
        if((sentance[index] in (alphabet or alefat or hamzat)or sentance[index] is ' ' )):
            k = index
            harf_with_taskeel =sentance[index]
            while((k+1) != len(sentance) and (sentance[k+1] in (tashkeel or harakat or shortharakat or tanwin ))):
                harf_with_taskeel =harf_with_taskeel+""+sentance[k+1]
                k = k + 1
            index = k
            hroof_with_tashkeel.append(harf_with_taskeel)
    return hroof_with_tashkeel
github linuxscout / fareh / scripts / rule_builder.py View on Github external
def clean(self, strng):
        """
        Clean a string, or each string of a list, from tatweel and
        unnecessary whitespace.

        @param strng: a string, a list of strings, or any other value.
        @return: for a string, the text passed through araby.strip_tatweel
            with every whitespace run collapsed to one space and the ends
            trimmed; for a list, a new list with each item cleaned the same
            way; any other value is returned unchanged.
        """
        def _clean_one(text):
            # Strip tatweel before collapsing whitespace so that spaces left
            # on both sides of a removed tatweel are merged too; the list
            # branch used to do this in the opposite order.
            return re.sub(r'\s+', ' ', araby.strip_tatweel(text)).strip()

        # isinstance also accepts subclasses of str/list.
        if isinstance(strng, str):
            return _clean_one(strng)
        if isinstance(strng, list):
            return [_clean_one(item) for item in strng]
        return strng
github hci-lab / PyQuran / core / pyquran.py View on Github external
Returns:
           str : zero and ones for each token
  '''

  marksDictionary = {'ْ': 0, '': 0, 'ُ': 1, 'َ': 1, 'ِ': 1, 'ّ': 1, 'ٌ': 1, 'ً': 1, 'ٍ': 1}
  charWithOutTashkeelOrSukun = ''
  tashkeelPatternList = []  # list of zeros and ones
  marksList = []

  # convert the List o to string without spaces
  ayahModified = ''.join(ayah.strip())
  tashkeelPatternStringWithSpace = ''

  # check is there a tatweel in ayah or not
  if(tatweel in ayahModified):
     ayahModified = strip_tatweel(ayahModified)

  # check whether exist alef_mad in ayah if exist unpack the alef mad
  if (alef_mad in ayahModified):
      ayahModified = unpack_alef_mad(ayahModified)


  # separate tashkeel from the ayah
  ayahOrAyatWithoutTashkeel, marks = separate(ayahModified)

  for mark in marks:
  #the pyarabic returns the char of marks without tashkeel with 'ـ' so if check about this mark if not exist
  #append in list harakat and zero or ones in tashkeel pattern list if yes append the marks and patterns
    if (mark != 'ـ'):
      marksList.append(mark)
      tashkeelPatternList.append(marksDictionary[mark])
    else: