How to use the pythainlp.tokenize.word_tokenize function in pythainlp

To help you get started, we’ve selected a few examples of pythainlp.tokenize.word_tokenize, based on popular ways it is used in public projects.
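
Before the project examples below, here is a minimal sketch of the call itself. The sample sentence and the exact token split are illustrative and may vary with the PyThaiNLP version and dictionary you have installed.

from pythainlp.tokenize import word_tokenize

text = "แมวกินปลา"  # "the cat eats fish"
tokens = word_tokenize(text)  # the default engine is "newmm" (dictionary-based maximum matching with TCC constraints)
print(tokens)  # typically ['แมว', 'กิน', 'ปลา']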


github PyThaiNLP / pythainlp / pythainlp / cli / tokenize.py
def __init__(self, *args, **kwargs):
    self.keep_whitespace = True
    self.algorithm = DEFAULT_WORD_TOKENIZE_ENGINE
    self.separator = DEFAULT_WORD_TOKEN_SEPARATOR
    self.run = word_tokenize
    super().__init__(*args, **kwargs)
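
Here `self.run` is wired to `word_tokenize`, so the CLI subcommand ends up calling it with the parsed text, engine, and separator. A hedged, stand-alone sketch of roughly what that amounts to (the helper name and separator value below are illustrative, not the actual CLI code; `DEFAULT_WORD_TOKENIZE_ENGINE` is "newmm" in recent releases):

from pythainlp.tokenize import word_tokenize

def run_word_tokenize(text, engine="newmm", separator="|", keep_whitespace=True):
    # tokenize, then join the tokens with the chosen separator for display
    tokens = word_tokenize(text, engine=engine, keep_whitespace=keep_whitespace)
    return separator.join(tokens)

print(run_word_tokenize("แมวกินปลา"))  # e.g. แมว|กิน|ปลา
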
github BLKSerene / Wordless / src / wordless_text / wordless_text_processing.py
        sentences = wordless_sentence_tokenize(main, text, lang = 'rus')

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(main, text, lang = 'tha',
                                               sentence_tokenizer = 'PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine = 'newmm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine = 'mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine = 'longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang = 'bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append([token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
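
The Wordless excerpt above switches between three PyThaiNLP engines depending on the user's choice. A quick way to see how they differ is to run the same sentence through each; note that newer PyThaiNLP releases spell the longest-matching engine simply "longest" rather than "longest-matching". A hedged sketch, with engine names as in PyThaiNLP 2.x (adjust to your installed version):

from pythainlp.tokenize import word_tokenize

sentence = "แมวกินปลา"
for engine in ("newmm", "mm", "longest"):
    # "mm" is plain dictionary-based maximum matching; "newmm" adds Thai Character Cluster constraints
    print(engine, word_tokenize(sentence, engine=engine))
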
github PyThaiNLP / pythainlp / pythainlp / ulmfit / utils.py
def tokenize(self, text):
    """
    :meth: tokenize text with selected engine
    :param str text: text to tokenize
    :return: tokenized text
    """
    return word_tokenize(self.sub_br(text), engine=self.engine)
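
The ULMFiT tokenizer above first pre-processes the text with `self.sub_br` (in that module it replaces `<br />`-style tags with newlines, in the fastai style) and then hands it to `word_tokenize` with the configured engine. A hedged, stand-alone sketch of the same idea, with a plain regex standing in for `sub_br`:

import re

from pythainlp.tokenize import word_tokenize

def tokenize_with_engine(text, engine="newmm"):
    # replace <br>-style tags with newlines, roughly what sub_br does
    text = re.sub(r"<\s*br\s*/?>", "\n", text)
    return word_tokenize(text, engine=engine)
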
github Kyubyong / word2word / _make.py
        import Mykytea
        opt="-model jp-0.4.7-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang=="zh_cn":
        import Mykytea
        opt = "-model ctb-0.4.0-1.mod"
        tokenizer = Mykytea.Mykytea(opt)
    elif lang=="zh_tw":
        import jieba
        tokenizer = jieba
    elif lang=="vi":
        from pyvi import ViTokenizer
        tokenizer = ViTokenizer
    elif lang=="th":
        from pythainlp.tokenize import word_tokenize
        tokenizer = word_tokenize
    elif lang=="ar":
        import pyarabic.araby as araby
        tokenizer = araby
    # elif lang=="en":
    #     from nltk import word_tokenize
    #     tokenizer = word_tokenize
    else:
        from nltk.tokenize import ToktokTokenizer
        tokenizer = ToktokTokenizer()

    return tokenizer
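
In the Thai branch above, `word_tokenize` itself is returned as the tokenizer, so it can be called directly on a sentence; the other branches return library objects (Mykytea, jieba, ViTokenizer, ...) that expose their own tokenization methods. A minimal sketch of the Thai case:

from pythainlp.tokenize import word_tokenize

tokenizer = word_tokenize       # the callable returned for lang == "th"
print(tokenizer("แมวกินปลา"))    # e.g. ['แมว', 'กิน', 'ปลา']
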
github PyThaiNLP / pythainlp / pythainlp / translate / th2en_word2word.py
def _translate(text):
    tokenized_sentence = " ".join(th_word_tokenize(text))
    _hypothesis = th2en_word2word_model.translate(tokenized_sentence)
    hypothesis = en_word_detokenize.detokenize([_hypothesis])
    return hypothesis
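
Thai text has no spaces between words, so the translation model above is fed a word-segmented, space-joined sentence and its output is detokenized afterwards. A hedged sketch of just the pre-processing step (`th_word_tokenize` in that module is PyThaiNLP's Thai word tokenizer; the engine shown here is an assumption):

from pythainlp.tokenize import word_tokenize

text = "แมวกินปลา"
tokenized_sentence = " ".join(word_tokenize(text, engine="newmm"))
print(tokenized_sentence)  # e.g. "แมว กิน ปลา"
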
github PyThaiNLP / pythainlp / pythainlp / sentiment / ulmfit_sent.py
def get_sentiment(text, return_score=False):
    words = word_tokenize(text)
    tensor = LongTensor([stoi[word] for word in words]).view(-1, 1).cpu()
    tensor = Variable(tensor, volatile=False)
    MODEL.reset()
    pred, *_ = MODEL(tensor)
    result = pred.data.cpu().numpy().reshape(-1)

    if return_score:
        return softmax(result)
    else:
        return np.argmax(result)
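
Note that `stoi[word]` above raises a KeyError for any token outside the model vocabulary. A hedged variant of the lookup that falls back to an unknown-word index (the `_unk_`-at-index-0 convention matches the `stoi` example in `document_vector` below):

from pythainlp.tokenize import word_tokenize

def words_to_ids(text, stoi, unk_id=0):
    # map each token to its vocabulary id, falling back to the _unk_ id
    return [stoi.get(word, unk_id) for word in word_tokenize(text)]
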
github PyThaiNLP / pythainlp / pythainlp / ulmfit / utils.py
def document_vector(ss, m, stoi, tok_engine="newmm"):
    """
    :meth: `document_vector` get document vector using pretrained ULMFiT model
    :param str ss: sentence to extract embeddings
    :param m: pyTorch model
    :param dict stoi: string-to-integer dict e.g. {'_unk_':0, '_pad_':1,'first_word':2,'second_word':3,...}
    :param str tok_engine: tokenization engine (recommend using `newmm` if you are using pretrained ULMFiT model)
    :return: `numpy.array` of document vector sized 300
    """
    s = word_tokenize(ss, engine=tok_engine)
    t = LongTensor([stoi[i] for i in s]).view(-1, 1).cuda()
    t = Variable(t, volatile=False)
    m.reset()
    pred, *_ = m[0](t)
    # get average of last lstm layer along bptt
    res = to_np(torch.mean(pred[-1], 0).view(-1))
    return res
github PyThaiNLP / pythainlp / pythainlp / Text.py
def Text(text):
    if not isinstance(text, list):
        text = word_tokenize(str(text))
    return nltk.Text(text)
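
A minimal usage sketch of the wrapper above (requires nltk; the sample sentence is illustrative):

# reusing the Text() helper defined above
thai_nltk_text = Text("แมวชอบกินปลา")
print(len(thai_nltk_text))     # number of word tokens
print(thai_nltk_text.vocab())  # an nltk.FreqDist over those tokens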