How to use the ngram.NGram class in ngram

To help you get started, we’ve selected a few ngram.NGram examples, based on popular ways the library is used in public projects.

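All of the examples below revolve around the same small API: NGram indexes a collection of strings (or of items mapped to strings via a key function) by their character n-grams, and search() returns fuzzy matches ranked by similarity. As a quick orientation before the real-world snippets, here is a minimal sketch; the strings are illustrative.

import ngram

# Index a few strings by character trigrams (the default N=3).
index = ngram.NGram(["spam", "spang", "eggs"])

# search() returns (item, similarity) pairs, best match first.
for item, score in index.search("spam"):
    print(item, score)

# One-off pairwise similarity without building an index.
print(ngram.NGram.compare("spam", "spang"))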

github gpoulter / python-ngram / test_ngram.py
def test_set_operations(self):
        """Test advanced set operations"""
        items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
        items2 = set(["cdefg", "lmnop"])
        idx1 = NGram(items1)
        idx2 = NGram(items2)
        results = lambda L: sorted(x[0] for x in L)
        # Item removal
self.assertEqual(results(idx1.search('cde')), ["abcde", "cdefg"])
        idx1.remove('abcde')
        self.assertEqual(results(idx1.search('cde')), ["cdefg"])
        # Set intersection operation
        items1.remove('abcde')
        idx1.intersection_update(idx2)
        self.assertEqual(idx1, items1.intersection(items2))
        self.assertEqual(results(idx1.search('lmn')), [])
        self.assertEqual(results(idx1.search('ijk')), [])
        self.assertEqual(results(idx1.search('def')), ['cdefg'])
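NGram subclasses Python's set, which is what makes remove() and intersection_update() above work while keeping the n-gram index in sync. A short hedged sketch of the same idea with illustrative data, assuming the standard set-style add():

import ngram

idx = ngram.NGram(["abcde", "cdefg"])
idx.remove("abcde")    # the index forgets abcde's n-grams too
idx.add("cdxyz")       # newly added items are indexed immediately
print(sorted(x[0] for x in idx.search("cde")))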
github gpoulter / python-ngram / test_ngram.py
def test_ngram_search(self):
        """Tests from the original ngram.py, to check that the
        rewrite still uses the same underlying algorithm"""
        
        # Basic searching of the index
        idx = NGram(self.items)
        self.assertEqual(idx.search('askfjwehiuasdfji'),
            [('askfjwehiuasdfji', 1.0),
             ('asdfawe', 0.17391304347826086),
             ('asfwef', 0.083333333333333329),
             ('adfwe', 0.041666666666666664),
            ])
        self.assertEqual(idx.search('afadfwe')[:2],
                         [('adfwe', 0.59999999999999998), 
                          ('asdfawe', 0.20000000000000001)])
        
        # Pairwise comparison of strings
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
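The similarity scores above are n-gram overlap ratios between the query and each indexed item, so identical strings score 1.0 and disjoint strings 0.0. search() also accepts a threshold to drop weak matches (the entity recognizer example further down uses it); a small sketch with an arbitrary cutoff:

import ngram

idx = ngram.NGram(["asdfawe", "asfwef", "adfwe"])

# 0.3 is an illustrative cutoff, not a recommended value.
print(idx.search("afadfwe", threshold=0.3))   # e.g. [('adfwe', 0.6)]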
github usc-isi-i2 / dig-lsh-clustering / tokenizer / generateTokens.py
import ngram  # imported at module level in the original file


def getNGrams(text, type, n):
    # tokenize_input (also defined in the original module) removes stop words
    tokens = list(tokenize_input(text))
    if type == "word":
        if len(tokens) > n:
            # n-grams of whole words, joined without a separator
            return ["".join(j) for j in zip(*[tokens[i:] for i in range(n)])]
        else:
            # fall back to the raw text when there are not more words than n
            return [text]
    if type == "character":
        ngramObject = ngram.NGram(N=n)
        ngram_char_tokens = list(ngramObject.split(text))
        # drop the first n-1 and last n-1 n-grams: they are incomplete
        # because of the '$' padding that NGram adds at both ends
        if len(text) > n:
            return ngram_char_tokens[n - 1:len(ngram_char_tokens) - (n - 1)]
        else:
            return [text]
    else:
        return list(tokenize_input(text))
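For reference, a character n-gram is just every contiguous length-n substring of the text; the trimming above exists only because NGram.split() in this project's version pads strings with '$' before splitting. A padding-free equivalent in plain Python:

def char_ngrams(text, n):
    # every contiguous length-n substring, no '$' padding
    return [text[i:i + n] for i in range(len(text) - n + 1)]

print(char_ngrams("abcdef", 3))   # ['abc', 'bcd', 'cde', 'def']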
github usc-isi-i2 / dig-lsh-clustering / tokenizer / old / generateTokens.py
import ngram  # imported at module level in the original file


def getNGrams(text, type, n):
    # tokenize_input (also defined in the original module) removes stop words
    tokens = list(tokenize_input(text))
    if type == "word":
        if len(tokens) > n:
            return ["".join(j) for j in zip(*[tokens[i:] for i in range(n)])]
        else:
            # fall back to the raw text when there are not more words than n
            return [text]
    if type == "character":
        ngramObject = ngram.NGram(N=n)
        # unlike the newer version above, the padded edge n-grams are kept
        ngram_char_tokens = list(ngramObject.split(text))
        if len(text) > n:
            return ngram_char_tokens
        else:
            return [text]
    else:
        return list(tokenize_input(text))
github TensorMSA / tensormsa / chatbot / nlp / entity_recognizer.py
def check_all_dict(self, ner_conv, cb_data, cb_data_order, cb_data_th):
    """
    Check the remaining dictionaries when no matching value was found.
    :param ner_conv: input phrase to match against each dictionary
    :param cb_data: maps each dictionary name to its list of entries
    :param cb_data_order: dictionary names in the order to try them
    :param cb_data_th: per-dictionary similarity threshold for search()
    :return: (up to four matching entries, dictionary name), or ([], None)
    """
    result = []
    for key in cb_data_order:
        model = ngram.NGram(key=self.lower)  # lowercase entries before indexing
        model.update(cb_data.get(key))
        # keep at most the four best matches above this dictionary's threshold
        result = list(map(lambda x: x[0], model.search(ner_conv, threshold=cb_data_th[key])))[0:4]
        if len(result) > 0:
            return result, key
    return result, None
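Two features carry this example: key= runs each item through a function before n-gram extraction (here a lowercasing helper on the class), and update() bulk-loads an iterable into the index. A sketch with illustrative data, using str.lower in place of self.lower:

import ngram

model = ngram.NGram(key=str.lower)   # match case-insensitively
model.update(["Seoul", "Busan", "Incheon"])

# best matches above an arbitrary 0.2 threshold, at most four of them
print([item for item, score in model.search("seol", threshold=0.2)][:4])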
github gpoulter / python-ngram / ngram.py
def __reduce__(self):
        """Return state information for pickling, no references to this
        instance.  The key function must be None, a builtin function, or
        a named module-level function.

        >>> from ngram import NGram
        >>> n = NGram([0xDEAD, 0xBEEF], key=hex)
        >>> import pickle
        >>> p = pickle.dumps(n)
        >>> m = pickle.loads(p)
        >>> sorted(list(m))
        [48879, 57005]
        """
        return NGram, (list(self), self.threshold, self.warp, self._key,
                       self.N, self._pad_len, self._pad_char)
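Because __reduce__ hands back the constructor and its arguments, an index survives a pickle round trip as long as the key function is picklable, exactly as the doctest shows. A minimal usage sketch:

import pickle
import ngram

idx = ngram.NGram(["spam", "spang"], N=3)
restored = pickle.loads(pickle.dumps(idx))
print(restored.search("spam")[0])   # ('spam', 1.0)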
github TensorMSA / tensormsa / third_party / ngram / ngram_compare_mro.py
        self.param['conninfo'] = netconf['conninfo']

        if self.param['datatype'] == 'file':
            self.get_file_data(data_node)
        elif self.param['datatype'] == 'db':
            self.get_db_data()

        item = []
        for val in self.param['list']:
            try:
                item_tuple = (val['item_code'].strip(), val['item_leaf'].strip(), val['item_desc'].strip())
                item.append(item_tuple)
            except Exception:
                logging.info('Error Data: ' + val['item_code'])

        dataset = ngram.NGram(item, key=lambda x: x[2])
        dataset = sorted(dataset, key=lambda x: x[0])
        findset = ngram.NGram(item, key=lambda x: x[2])

        logging.info('================================================================================================')
        return_data = {}
        for data in dataset:
            findset.remove(data)
            result = findset.search(data[2], self.param['standard'])

            for r in range(len(result)):
                if return_data.get(data[0]) is None:
                    return_data[data[0]] = {}
                    return_data[data[0]]['desc'] = data[2]
                    # logging.info(str(data[0]) + ':' + str(data[2]))
                return_data[data[0]][result[r][0][0]] = {'item_desc': result[r][0][2], 'item_perc': result[r][1]}
                # logging.info(' - '+str(result[r][0][0])+'('+str(result[r][1])+')' + ':' + str(result[r][0][2]))
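The loop above implements a simple near-duplicate report: index every (code, leaf, desc) record by its description via key=, then remove each record and search for other records whose descriptions are close. A condensed, hedged sketch of the same pattern with made-up records and an arbitrary 0.3 threshold:

import ngram

records = [('A1', 'x', 'red widget large'),
           ('A2', 'x', 'red widget lrg'),
           ('B1', 'y', 'blue gadget')]
findset = ngram.NGram(records, key=lambda r: r[2])
for rec in sorted(records):
    findset.remove(rec)   # avoid matching a record against itself
    for match, score in findset.search(rec[2], 0.3):
        # each pair is reported once, since earlier records stay removed
        print(rec[0], '~', match[0], round(score, 2))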