How to use the underthesea.pos_tag function in underthesea

To help you get started, we’ve selected a few underthesea examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github undertheseanlp / underthesea / tests / pos_tag / test_pos_tag.py View on Github external
def test_accuracy(self):
        """Check that pos_tag returns 13 items for the sample headline."""
        headline = u"Tổng Bí thư: Ai trót để tay nhúng chàm thì hãy sớm tự gột rửa"
        tagged = pos_tag(headline)
        # Only the number of returned items is pinned here, not the tags.
        self.assertEqual(len(tagged), 13)
github undertheseanlp / underthesea / tests / pos_tag / test_pos_tag.py View on Github external
def test_simple_cases(self):
        """Check that pos_tag returns an empty list for an empty sentence."""
        # Empty input is the degenerate case: nothing to tag, nothing returned.
        self.assertEqual(pos_tag(u""), [])
github BLKSerene / Wordless / src / wordless_text / wordless_text_processing.py View on Github external
elif pos_tagger == main.tr('botok - Tibetan POS Tagger'):
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main,
                                                                     word_tokenizer = word_tokenizer)
        tokens = botok_tokenizer.tokenize(' '.join(tokens))

        for token in tokens:
            if token.pos:
                tokens_tagged.append((token.text, token.pos))
            else:
                tokens_tagged.append((token.text, token.chunk_type))

    # Vietnamese
    elif pos_tagger == main.tr('Underthesea - Vietnamese POS Tagger'):
        tokens_tagged = underthesea.pos_tag(' '.join(tokens))

    # Convert to Universal Tagset
    if (tagset == 'custom' and main.settings_custom['pos_tagging']['to_universal_pos_tags'] or
        tagset == 'universal'):

        mappings = {tag: tag_universal
                    for tag, tag_universal, _, _ in main.settings_custom['tagsets']['mappings'][lang][pos_tagger]}
        tokens_tagged = list(tokens_tagged)

        # Issue warnings if any tag is missing from the mapping table
        for _, tag in tokens_tagged:
            if tag not in mappings:
                print(f'Warning: tag "{tag}" is missing from the {wordless_conversion.to_lang_text(main, lang)} mapping table!')

        tokens_tagged = [(token, mappings.get(tag, 'X'))
                         for token, tag in tokens_tagged]
github undertheseanlp / underthesea / underthesea / chunking / __init__.py View on Github external
Examples
    --------

    >>> # -*- coding: utf-8 -*-
    >>> from underthesea import chunk
    >>> sentence = "Nghi vấn 4 thi thể Triều Tiên trôi dạt bờ biển Nhật Bản"
    >>> chunk(sentence)
    [('Nghi vấn', 'N', 'B-NP'),
    ('4', 'M', 'B-NP'),
    ('thi thể', 'N', 'B-NP'),
    ('Triều Tiên', 'Np', 'B-NP'),
    ('trôi dạt', 'V', 'B-VP'),
    ('bờ biển', 'N', 'B-NP'),
    ('Nhật Bản', 'Np', 'B-NP')]
    """
    sentence = pos_tag(sentence)
    crf_model = CRFChunkingPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result