How to use ngram - 10 common examples

To help you get started, we’ve selected a few ngram examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github gpoulter / python-ngram / test_ngram.py View on Github external
def test_set_operations(self):
        """Test advanced set operations"""
        # Build two overlapping indices from plain sets.
        first_items = {"abcde", "cdefg", "fghijk", "ijklm"}
        second_items = {"cdefg", "lmnop"}
        first_index = NGram(first_items)
        second_index = NGram(second_items)

        def matched_strings(hits):
            # Strip the scores, keeping only the matched strings, sorted.
            return sorted(hit[0] for hit in hits)

        # Removing an item drops it from subsequent search results.
        self.assertEqual(matched_strings(first_index.search('cde')), ["abcde", "cdefg"])
        first_index.remove('abcde')
        self.assertEqual(matched_strings(first_index.search('cde')), ["cdefg"])
        # Intersection keeps only items present in both indices.
        first_items.remove('abcde')
        first_index.intersection_update(second_index)
        self.assertEqual(first_index, first_items.intersection(second_items))
        self.assertEqual(matched_strings(first_index.search('lmn')), [])
        self.assertEqual(matched_strings(first_index.search('ijk')), [])
        self.assertEqual(matched_strings(first_index.search('def')), ['cdefg'])
github gpoulter / python-ngram / test_ngram.py View on Github external
def test_ngram_search(self):
        """Tests from the original ngram.py, to check that the
        rewrite still uses the same underlying algorithm"""
        index = NGram(self.items)

        # An exact match scores 1.0; weaker matches follow in rank order.
        expected_ranking = [
            ('askfjwehiuasdfji', 1.0),
            ('asdfawe', 0.17391304347826086),
            ('asfwef', 0.083333333333333329),
            ('adfwe', 0.041666666666666664),
        ]
        self.assertEqual(index.search('askfjwehiuasdfji'), expected_ranking)

        # Only the two best hits matter for this query.
        top_two = index.search('afadfwe')[:2]
        self.assertEqual(top_two,
                         [('adfwe', 0.59999999999999998),
                          ('asdfawe', 0.20000000000000001)])

        # Pairwise comparison: identical strings score 1.0, disjoint 0.0.
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
github gpoulter / python-ngram / test_ngram.py View on Github external
rewrite still uses the same underlying algorithm"""
        
        # Basic searching of the index
        idx = NGram(self.items)
        self.assertEqual(idx.search('askfjwehiuasdfji'),
            [('askfjwehiuasdfji', 1.0),
             ('asdfawe', 0.17391304347826086),
             ('asfwef', 0.083333333333333329),
             ('adfwe', 0.041666666666666664),
            ])
        self.assertEqual(idx.search('afadfwe')[:2],
                         [('adfwe', 0.59999999999999998), 
                          ('asdfawe', 0.20000000000000001)])
        
        # Pairwise comparison of strings
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
github gpoulter / python-ngram / test_ngram.py View on Github external
# Basic searching of the index
        idx = NGram(self.items)
        self.assertEqual(idx.search('askfjwehiuasdfji'),
            [('askfjwehiuasdfji', 1.0),
             ('asdfawe', 0.17391304347826086),
             ('asfwef', 0.083333333333333329),
             ('adfwe', 0.041666666666666664),
            ])
        self.assertEqual(idx.search('afadfwe')[:2],
                         [('adfwe', 0.59999999999999998), 
                          ('asdfawe', 0.20000000000000001)])
        
        # Pairwise comparison of strings
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
github codysmithd / SAIL / _test_ngram.py View on Github external
import ngram

# Load the n-gram model data once before the interactive loop.
ngram.load()

# Repeatedly read a word and print its unigram score conditioned on
# the sentence-start token.
while True:
    word = input("Enter word to test: ")
    print(ngram.get_unigram_score("<s>", word))
github Ezhil-Language-Foundation / open-tamil / tests / unigram_tests.py View on Github external
def test_basic_unigram_counts(self):
        """Exercise Corpus letter iteration and Unigram frequency counts."""
        corpus = Corpus("data/ex.unicode")
        # Drain the letter generator; we only care that iteration succeeds.
        for _letter in corpus.next_tamil_letter():
            pass
        # Build the letter-frequency model over the same corpus file.
        model = Unigram("data/ex.unicode")
        model.frequency_model()
        # Debug printing differed between Python 2 and 3; both paths
        # are deliberately no-ops here.
        if PYTHON3:
            pass
        else:
            pass
        self.assertEqual(model.letter[u"ஷை"] + model.letter[u"சி"], model.letter[u"ந"])
        del corpus, model
github codysmithd / SAIL / _test_ngram.py View on Github external
import ngram

# Load the n-gram model data before scoring any input.
ngram.load()

# Interactive loop: read a word from the user and print its unigram
# score conditioned on the sentence-start token "<s>".
# NOTE(review): no exit condition — the loop runs until EOF/interrupt.
while True:
	user_input = input("Enter word to test: ")
	print(ngram.get_unigram_score("<s>", user_input))
github Ezhil-Language-Foundation / open-tamil / tests / word_distance.py View on Github external
        distances = list( map( lambda w: edit_distance( pizhai, w) , agarathi_sorkal ) )
        print(distances)
github usc-isi-i2 / dig-lsh-clustering / tokenizer / generateTokens.py View on Github external
def getNGrams(text, type, n):
        """Split *text* into n-gram tokens.

        Parameters:
            text: input string; stop words are removed via tokenize_input.
            type: "word" for word n-grams, "character" for character
                  n-grams; any other value returns the plain token list.
            n: the n-gram order.

        Returns a list of n-gram strings, falling back to [text] when the
        input is too short to produce a complete n-gram.
        """
        # NOTE(review): `type` shadows the builtin; the name is kept so
        # callers passing it by keyword keep working.
        tokens = list(tokenize_input(text))
        if type == "word":
            # BUGFIX: was `>`, which made an input with exactly n tokens
            # fall back to the raw text; the fallback comment ("n is
            # greater than number of words") shows len(tokens) == n should
            # produce its single n-gram.
            if len(tokens) >= n:
                return ["".join(j) for j in zip(*[tokens[i:] for i in range(n)])]
            # Too few words for even one n-gram: return the text directly.
            return [text]
        if type == "character":
            ngramObject = ngram.NGram(N=n)
            ngram_char_tokens = list(ngramObject.split(text))
            # Drop the first and last n-1 grams: they are '$'-padded and
            # therefore incomplete.
            if len(text) > n:
                return ngram_char_tokens[n - 1:len(ngram_char_tokens) - (n - 1)]
            # Text no longer than n: return it unchanged.
            return [text]
        # Unknown type: return the plain (stop-word-filtered) tokens.
        return list(tokenize_input(text))
github usc-isi-i2 / dig-lsh-clustering / tokenizer / old / generateTokens.py View on Github external
def getNGrams(text, type, n):
    """Split *text* into n-gram tokens (older variant: character n-grams
    keep the '$'-padded boundary grams produced by ngram.NGram.split).

    Parameters:
        text: input string; stop words are removed via tokenize_input.
        type: "word" for word n-grams, "character" for character n-grams;
              any other value returns the plain token list.
        n: the n-gram order.

    Returns a list of n-gram strings, falling back to [text] when the
    input is too short to produce a complete n-gram.
    """
    # NOTE(review): `type` shadows the builtin; kept for caller compatibility.
    tokens = list(tokenize_input(text))
    if type == "word":
        # BUGFIX: was `>`, which made an input with exactly n tokens fall
        # back to the raw text; the stated fallback ("n is greater than
        # number of words") means len(tokens) == n should yield its
        # single n-gram.
        if len(tokens) >= n:
            return ["".join(j) for j in zip(*[tokens[i:] for i in range(n)])]
        # Too few words for even one n-gram: return the text directly.
        return [text]
    if type == "character":
        ngramObject = ngram.NGram(N=n)
        ngram_char_tokens = list(ngramObject.split(text))
        if len(text) > n:
            return ngram_char_tokens
        # Text no longer than n: return it unchanged.
        return [text]
    # Unknown type: return the plain (stop-word-filtered) tokens.
    return list(tokenize_input(text))