How to use the underthesea.word_tokenize function in underthesea

To help you get started, we’ve selected a few underthesea examples, based on popular ways it is used in public projects.
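Before looking at the selected snippets, here is a minimal sketch of the basic call, using the sample sentence from the underthesea README; the outputs shown are indicative and the exact segmentation may differ slightly between model versions.

>>> from underthesea import word_tokenize
>>> sentence = "Bác sĩ bây giờ có thể thản nhiên báo tin bệnh nhân bị ung thư"
>>> word_tokenize(sentence)
['Bác sĩ', 'bây giờ', 'có thể', 'thản nhiên', 'báo tin', 'bệnh nhân', 'bị', 'ung thư']
>>> word_tokenize(sentence, format="text")
'Bác_sĩ bây_giờ có_thể thản_nhiên báo_tin bệnh_nhân bị ung_thư'

By default the function returns a list of word tokens with multi-syllable words kept together; passing format="text" returns a single string in which the syllables of each word are joined by underscores.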


github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_special_cases_2(self):
        sentence = u"="
        actual = word_tokenize(sentence)
        expected = ["="]
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_special_cases_3(self):
        sentence = u"=))"
        actual = word_tokenize(sentence)
        expected = ["=))"]
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_simple_cases(self):
        sentence = u""
        actual = word_tokenize(sentence)
        expected = []
        self.assertEqual(actual, expected)

        actual = word_tokenize(sentence, format="text")
        expected = u""
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_decomposed_from(self):
        text = u"yếu"
        actual = word_tokenize(text)
        expected = [u'yếu']
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_word_tokenize_2(self):
        """ Case with special character tab
        """
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(1)
        try:
            text = u"""000000000000_753889211466429	"""
            word_tokenize(text)
        except Exception as e:
            raise e
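The test above relies on a timeout_handler defined elsewhere in the test module and does not cancel the alarm afterwards. A self-contained sketch of the same SIGALRM guard pattern (Unix-only; the timeout_handler here is our own illustration, not code from the repository) might look like this:

import signal

from underthesea import word_tokenize

def timeout_handler(signum, frame):
    # Fired if tokenization has not finished within the alarm interval
    raise TimeoutError("word_tokenize took too long")

signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(1)  # allow at most one second
try:
    tokens = word_tokenize(u"000000000000_753889211466429\t")
finally:
    signal.alarm(0)  # always cancel the pending alarm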
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_word_tokenize(self):
        text = u"""Tổng thống Nga coi việc Mỹ không kích căn cứ quân sự của Syria là "sự gây hấn nhằm vào một quốc gia có chủ quyền", gây tổn hại đến quan hệ Moscow-Washington."""
        word_tokenize(text)
github undertheseanlp/underthesea/tests/word_tokenize/test_word_tokenize.py
def test_url_1(self):
        text = u"https://www.facebook.com/photo.php?fbid=1627680357512432&set=a.1406713109609159.1073741826.100008114498358&type=1 mình muốn chia sẻ bài viết của một bác nói về thực trạng của bộ giáo dục bây giờ! mọi người vào đọc và chia sẻ để Phạm Vũ Luận BIẾT!"
        actual = word_tokenize(text, format='text')
        expected = u"https://www.facebook.com/photo.php?fbid=1627680357512432&set=a.1406713109609159.1073741826.100008114498358&type=1 mình muốn chia_sẻ bài viết của một bác nói về thực_trạng của bộ giáo_dục bây_giờ ! mọi người vào đọc và chia_sẻ để Phạm_Vũ_Luận BIẾT !"
        self.assertEqual(actual, expected)
github undertheseanlp/underthesea/underthesea/pos_tag/__init__.py
Examples
    --------
    >>> # -*- coding: utf-8 -*-
    >>> from underthesea import pos_tag
    >>> sentence = "Chợ thịt chó nổi tiếng ở TPHCM bị truy quét"
    >>> pos_tag(sentence)
    [('Chợ', 'N'),
    ('thịt', 'N'),
    ('chó', 'N'),
    ('nổi tiếng', 'A'),
    ('ở', 'E'),
    ('TPHCM', 'Np'),
    ('bị', 'V'),
    ('truy quét', 'V')]
    """
    sentence = word_tokenize(sentence)
    crf_model = CRFPOSTagPredictor.Instance()
    result = crf_model.predict(sentence, format)
    return result
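This excerpt shows that pos_tag first segments the raw sentence with word_tokenize and then hands the result to a CRF tagger, so the word boundaries produced by word_tokenize are exactly the units that receive POS labels. A quick way to see the two stages side by side, using the sentence from the docstring above (outputs are indicative and may vary with the model version):

>>> from underthesea import word_tokenize, pos_tag
>>> sentence = "Chợ thịt chó nổi tiếng ở TPHCM bị truy quét"
>>> word_tokenize(sentence)
['Chợ', 'thịt', 'chó', 'nổi tiếng', 'ở', 'TPHCM', 'bị', 'truy quét']
>>> pos_tag(sentence)
[('Chợ', 'N'), ('thịt', 'N'), ('chó', 'N'), ('nổi tiếng', 'A'), ('ở', 'E'), ('TPHCM', 'Np'), ('bị', 'V'), ('truy quét', 'V')]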
github BLKSerene/Wordless/src/wordless_text/wordless_text_processing.py
        sentences = wordless_sentence_tokenize(main, text, lang = 'bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang = 'vie',
                                                   sentence_tokenizer = 'Underthesea - Vietnamese Sentence Tokenizer')

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [token.strip()
                                  for token in sentence
                                  if token.strip()]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary = '', sentence_ending = True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(sentence[-1], boundary = ' ', sentence_ending = True)
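
Stripped of the Wordless-specific plumbing, the pattern above is sentence splitting followed by per-sentence word tokenization, with empty tokens filtered out afterwards. A minimal stand-alone sketch of the same idea, using underthesea's own sent_tokenize instead of the Wordless helpers (our simplification, not Wordless code):

from underthesea import sent_tokenize, word_tokenize

text = (u"Tổng thống Nga coi việc Mỹ không kích căn cứ quân sự của Syria là "
        u'"sự gây hấn nhằm vào một quốc gia có chủ quyền". '
        u"Điều này gây tổn hại đến quan hệ Moscow-Washington.")

tokens_hierarchical = []
for sentence in sent_tokenize(text):
    # One token list per sentence, mirroring the nested structure Wordless builds
    tokens_hierarchical.append(word_tokenize(sentence))

# Remove empty tokens and strip whitespace, as the Wordless code does above
tokens_hierarchical = [[token.strip() for token in sentence if token.strip()]
                       for sentence in tokens_hierarchical]
print(tokens_hierarchical)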