How to use the pythainlp.tokenize.THAI2FIT_TOKENIZER.word_tokenize function in pythainlp

To help you get started, we’ve selected a few pythainlp examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: PyThaiNLP/pythainlp — tests/ (view on GitHub)
def test_process_thai_dense(self):
        text = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146"

        actual = process_thai(

        # after pre_rules_th
        # >>> "👍👍👍 # Ana มากxxrep4 น้อยน้อย .1146"
        # after tokenize with word_tokenize(engine="newmm")
        # >>> ["👍👍👍", " ", "#", "Ana", " ", "มาก", "xxrep", "4",
        #             " ", "น้อย", "น้อย", " ", ".", "1146"]
        # after post_rules_th
        # -- because it performs `replace_wrep_post` before `ungroup_emoji`,
        #    the 3 repeated emoji are not marked with the special token
        #    "xxwrep num"
        # >>> ["👍", "👍","👍", " ", "#", "ana", " ", "มาก",
        #       "xxrep", "4", " ", "xxwrep", "1", "น้อย", " ",
        #       ".", "1146"]
Source: PyThaiNLP/pythainlp — pythainlp/word_vector/ (view on GitHub)
array([[-0.00421414, -0.08881307,  0.05081136, -0.05632929, -0.06607185,
        0.03059357, -0.113882  , -0.00074836,  0.05035743,  0.02914307,
        0.02893357,  0.11327957,  0.04562086, -0.05015393,  0.11641257,
        0.32304936, -0.05054322,  0.03639471, -0.06531371,  0.05048079]])
    >>> sentence_vectorizer(sentence, use_mean=False)
    array([[-0.05899798, -1.24338295,  0.711359  , -0.78861002, -0.92500597,
        0.42831   , -1.59434797, -0.01047703,  0.705004  ,  0.40800299,
        0.40506999,  1.58591403,  0.63869202, -0.702155  ,  1.62977601,
        4.52269109, -0.70760502,  0.50952601, -0.914392  ,  0.70673105]])
    vec = zeros((1, WV_DIM))

    words = THAI2FIT_TOKENIZER.word_tokenize(text)
    len_words = len(words)

    if not len_words:
        return vec

    for word in words:
        if word == " ":
            word = _TK_SP
        elif word == "\n":
            word = _TK_EOL

        if word in _MODEL.index2word:
            vec += _MODEL.word_vec(word)

    if use_mean:
        vec /= len_words
Source: PyThaiNLP/pythainlp — pythainlp/ulmfit/ (view on GitHub)
def process_thai(
    text: str,
    pre_rules: Collection = pre_rules_th_sparse,
    tok_func: Callable = THAI2FIT_TOKENIZER.word_tokenize,
    post_rules: Collection = post_rules_th_sparse,
) -> Collection[str]:
    Process Thai texts for models (with sparse features as default)

    :param str text: text to be cleaned
    :param list[func] pre_rules: rules to apply before tokenization.
    :param func tok_func: tokenization function (by default, **tok_func** is
                          ``THAI2FIT_TOKENIZER.word_tokenize``)

    :param list[func] post_rules: rules to apply after tokenization

    :return: a list of cleaned tokenized texts
    :rtype: list[str]

Source: PyThaiNLP/pythainlp — pythainlp/ulmfit/ (view on GitHub)
>>> config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1,
             qrnn=False, tie_weights=True, out_bias=True, output_p=0.25,
             hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)
        >>> trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)
        >>> learn = language_model_learner(data, AWD_LSTM, config=config,
                                           pretrained=False, **trn_args)
        >>> document_vector('วันนี้วันดีปีใหม่', learn, data)

    :See Also:
        * A notebook showing how to train `ulmfit` language model and its
          usage, `Jupyter Notebook \


    s = THAI2FIT_TOKENIZER.word_tokenize(text)
    t = torch.tensor(data.vocab.numericalize(s), requires_grad=False).to(
    m = learn.model[0]
    res = m(t).cpu().detach().numpy()
    if agg == "mean":
        res = res.mean(0)
    elif agg == "sum":
        res = res.sum(0)
        raise ValueError("Aggregate by mean or sum")

    return res