How to use the underthesea.word_tokenize function in underthesea

To help you get started, we’ve selected a few underthesea examples, based on popular ways it is used in public projects.
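Before looking at the selected snippets, here is a minimal sketch of the basic call, using the sample sentence from the underthesea README; the outputs shown are indicative and the exact segmentation may differ slightly between model versions.

>>> from underthesea import word_tokenize
>>> sentence = "Bác sĩ bây giờ có thể thản nhiên báo tin bệnh nhân bị ung thư"
>>> word_tokenize(sentence)
['Bác sĩ', 'bây giờ', 'có thể', 'thản nhiên', 'báo tin', 'bệnh nhân', 'bị', 'ung thư']
>>> word_tokenize(sentence, format="text")
'Bác_sĩ bây_giờ có_thể thản_nhiên báo_tin bệnh_nhân bị ung_thư'

By default the function returns a list of word tokens with multi-syllable words kept together; passing format="text" returns a single string in which the syllables of each word are joined by underscores.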


github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_special_cases_2(self):
        sentence = u"="
        actual = word_tokenize(sentence)
        expected = ["="]
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_special_cases_3(self):
        sentence = u"=))"
        actual = word_tokenize(sentence)
        expected = ["=))"]
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_simple_cases(self):
        sentence = u""
        actual = word_tokenize(sentence)
        expected = []
        self.assertEqual(actual, expected)

        actual = word_tokenize(sentence, format="text")
        expected = u""
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_decomposed_from(self):
        text = u"yếu"
        actual = word_tokenize(text)
        expected = [u'yếu']
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_word_tokenize_2(self):
        """ Case with special character tab
        """
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(1)
        try:
            text = u"""000000000000_753889211466429	"""
            word_tokenize(text)
        except Exception as e:
            raise e
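The test above relies on a timeout_handler defined elsewhere in the test module and does not cancel the alarm afterwards. A self-contained sketch of the same SIGALRM guard pattern (Unix-only; the timeout_handler here is our own illustration, not code from the repository) might look like this:

import signal

from underthesea import word_tokenize

def timeout_handler(signum, frame):
    # Fired if tokenization has not finished within the alarm interval
    raise TimeoutError("word_tokenize took too long")

signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(1)  # allow at most one second
try:
    tokens = word_tokenize(u"000000000000_753889211466429\t")
finally:
    signal.alarm(0)  # always cancel the pending alarm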
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_word_tokenize(self):
        text = u"""Tổng thống Nga coi việc Mỹ không kích căn cứ quân sự của Syria là "sự gây hấn nhằm vào một quốc gia có chủ quyền", gây tổn hại đến quan hệ Moscow-Washington."""
        word_tokenize(text)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_url_1(self):
        text = u"https://www.facebook.com/photo.php?fbid=1627680357512432&set=a.1406713109609159.1073741826.100008114498358&type=1 mình muốn chia sẻ bài viết của một bác nói về thực trạng của bộ giáo dục bây giờ! mọi người vào đọc và chia sẻ để Phạm Vũ Luận BIẾT!"
        actual = word_tokenize(text, format='text')
        expected = u"https://www.facebook.com/photo.php?fbid=1627680357512432&set=a.1406713109609159.1073741826.100008114498358&type=1 mình muốn chia_sẻ bài viết của một bác nói về thực_trạng của bộ giáo_dục bây_giờ ! mọi người vào đọc và chia_sẻ để Phạm_Vũ_Luận BIẾT !"
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/underthesea/pos_tag/__init__.py
Examples
    --------
    >>> # -*- coding: utf-8 -*-
    >>> from underthesea import pos_tag
    >>> sentence = "Chợ thịt chó nổi tiếng ở TPHCM bị truy quét"
    >>> pos_tag(sentence)
    [('Chợ', 'N'),
    ('thịt', 'N'),
    ('chó', 'N'),
    ('nổi tiếng', 'A'),
    ('ở', 'E'),
    ('TPHCM', 'Np'),
    ('bị', 'V'),
    ('truy quét', 'V')]
    """
    sentence = word_tokenize(sentence)
    crf_model = CRFPOSTagPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
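This excerpt shows that pos_tag first segments the raw sentence with word_tokenize and then hands the result to a CRF tagger, so the word boundaries produced by word_tokenize are exactly the units that receive POS labels. A quick way to see the two stages side by side, using the sentence from the docstring above (outputs are indicative and may vary with the model version):

>>> from underthesea import word_tokenize, pos_tag
>>> sentence = "Chợ thịt chó nổi tiếng ở TPHCM bị truy quét"
>>> word_tokenize(sentence)
['Chợ', 'thịt', 'chó', 'nổi tiếng', 'ở', 'TPHCM', 'bị', 'truy quét']
>>> pos_tag(sentence)
[('Chợ', 'N'), ('thịt', 'N'), ('chó', 'N'), ('nổi tiếng', 'A'), ('ở', 'E'), ('TPHCM', 'Np'), ('bị', 'V'), ('truy quét', 'V')]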
github BLKSerene/Wordless/src/wordless_text/wordless_text_processing.py
        sentences = wordless_sentence_tokenize(main, text, lang = 'bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang = 'vie',
                                                   sentence_tokenizer = 'Underthesea - Vietnamese Sentence Tokenizer')

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [token.strip()
                                  for token in sentence
                                  if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary = '', sentence_ending = True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary = ' ', sentence_ending = True)
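
Stripped of the Wordless-specific plumbing, the pattern above is sentence splitting followed by per-sentence word tokenization, with empty tokens filtered out afterwards. A minimal stand-alone sketch of the same idea, using underthesea's own sent_tokenize instead of the Wordless helpers (our simplification, not Wordless code):

from underthesea import sent_tokenize, word_tokenize

text = (u"Tổng thống Nga coi việc Mỹ không kích căn cứ quân sự của Syria là "
        u'"sự gây hấn nhằm vào một quốc gia có chủ quyền". '
        u"Điều này gây tổn hại đến quan hệ Moscow-Washington.")

tokens_hierarchical = []
for sentence in sent_tokenize(text):
    # One token list per sentence, mirroring the nested structure Wordless builds
    tokens_hierarchical.append(word_tokenize(sentence))

# Remove empty tokens and strip whitespace, as the Wordless code does above
tokens_hierarchical = [[token.strip() for token in sentence if token.strip()]
                       for sentence in tokens_hierarchical]
print(tokens_hierarchical)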