def test_process_thai_dense(self):
    text = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146"
    actual = process_thai(
        text,
        pre_rules=pre_rules_th,
        post_rules=post_rules_th,
        tok_func=THAI2FIT_TOKENIZER.word_tokenize,
    )
    # after pre_rules_th
    # >>> "👍👍👍 # Ana มากxxrep4 น้้อยน้อย .1146"
    #
    # after tokenizing with word_tokenize(engine="newmm")
    # >>> ["👍👍👍", " ", "#", "Ana", " ", "มาก", "xxrep", "4",
    #      " ", "น้อย", "น้อย", " ", ".", "1146"]
    #
    # after post_rules_th
    # -- because `replace_wrep_post` runs before `ungroup_emoji`, the three
    #    repeated emoji are not marked with the special token "xxwrep num"
    #
    # >>> ["👍", "👍", "👍", " ", "#", "ana", " ", "มาก",
    #      "xxrep", "4", " ", "xxwrep", "1", "น้อย", " ",
    #      ".", "1146"]

def sentence_vectorizer(text: str, use_mean: bool = True):
    """
    Convert a Thai sentence into a single vector of ``WV_DIM`` dimensions by
    summing the word vectors of its tokens (the mean over all tokens is taken
    by default; pass ``use_mean=False`` to keep the raw sum).

    :Example:

        >>> sentence_vectorizer(sentence)
        array([[-0.00421414, -0.08881307, 0.05081136, -0.05632929, -0.06607185,
                0.03059357, -0.113882 , -0.00074836, 0.05035743, 0.02914307,
                ...
                0.02893357, 0.11327957, 0.04562086, -0.05015393, 0.11641257,
                0.32304936, -0.05054322, 0.03639471, -0.06531371, 0.05048079]])
        >>>
        >>> sentence_vectorizer(sentence, use_mean=False)
        array([[-0.05899798, -1.24338295, 0.711359 , -0.78861002, -0.92500597,
                0.42831 , -1.59434797, -0.01047703, 0.705004 , 0.40800299,
                ...
                0.40506999, 1.58591403, 0.63869202, -0.702155 , 1.62977601,
                4.52269109, -0.70760502, 0.50952601, -0.914392 , 0.70673105]])
    """
    vec = zeros((1, WV_DIM))
    words = THAI2FIT_TOKENIZER.word_tokenize(text)
    len_words = len(words)
    if not len_words:
        return vec
    for word in words:
        # Map whitespace tokens to the model's special tokens.
        if word == " ":
            word = _TK_SP
        elif word == "\n":
            word = _TK_EOL
        # Accumulate vectors only for words known to the model.
        if word in _MODEL.index2word:
            vec += _MODEL.word_vec(word)
    if use_mean:
        # Average over all tokens, including out-of-vocabulary ones.
        vec /= len_words
    return vec
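
# Example (sketch): compare two Thai sentences by the cosine similarity of
# their sentence vectors. It assumes the sentence_vectorizer defined above is
# in scope with its word-vector model loaded; the sample sentences and the
# cosine_similarity helper are illustrative only.
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(u, v):
    # Flatten the (1, WV_DIM) vectors and compute dot(u, v) / (|u| * |v|).
    u, v = u.ravel(), v.ravel()
    return float(dot(u, v) / (norm(u) * norm(v)))

vec_a = sentence_vectorizer("วันนี้วันดีปีใหม่")
vec_b = sentence_vectorizer("สวัสดีปีใหม่")
print(cosine_similarity(vec_a, vec_b))  # closer to 1.0 means more similar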

def process_thai(
    text: str,
    pre_rules: Collection = pre_rules_th_sparse,
    tok_func: Callable = THAI2FIT_TOKENIZER.word_tokenize,
    post_rules: Collection = post_rules_th_sparse,
) -> Collection[str]:
    """
    Process Thai texts for models (with sparse features as default).

    :param str text: text to be cleaned
    :param list[func] pre_rules: rules to apply before tokenization
    :param func tok_func: tokenization function (by default, **tok_func** is
                          :func:`pythainlp.tokenize.word_tokenize`)
    :param list[func] post_rules: rules to apply after tokenization

    :return: a list of cleaned tokenized texts
    :rtype: list[str]
    """
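    # Body sketch, following the behaviour documented above: apply each
    # pre-rule to the raw text, tokenize, then apply each post-rule to the
    # token list. The actual pythainlp implementation may differ in detail.
    res = text
    for rule in pre_rules:
        res = rule(res)
    res = tok_func(res)
    for rule in post_rules:
        res = rule(res)
    return res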

def document_vector(text: str, learn, data, agg: str = "mean"):
    """
    Vectorize a Thai document into a :class:`numpy.ndarray` using the encoder
    of a :class:`fastai` ULMFit language model and its data bunch, aggregating
    the token embeddings by ``agg`` ("mean" or "sum").

    :Example:

        >>> config = dict(emb_sz=400, n_hid=1550, n_layers=4, pad_token=1,
        ...               qrnn=False, tie_weights=True, out_bias=True, output_p=0.25,
        ...               hidden_p=0.1, input_p=0.2, embed_p=0.02, weight_p=0.15)
        >>> trn_args = dict(drop_mult=0.9, clip=0.12, alpha=2, beta=1)
        >>> learn = language_model_learner(data, AWD_LSTM, config=config,
        ...                                pretrained=False, **trn_args)
        >>> document_vector('วันนี้วันดีปีใหม่', learn, data)

    :See Also:
        * A notebook showing how to train the `ulmfit` language model and
          its usage, `Jupyter Notebook`_
    """
    # Tokenize and numericalize the text with the data bunch's vocabulary.
    s = THAI2FIT_TOKENIZER.word_tokenize(text)
    t = torch.tensor(data.vocab.numericalize(s), requires_grad=False).to(
        device
    )
    # Run the token ids through the model's token-embedding layer (encoder).
    m = learn.model[0].encoder.to(device)
    res = m(t).cpu().detach().numpy()
    # Aggregate the per-token embeddings into a single document vector.
    if agg == "mean":
        res = res.mean(0)
    elif agg == "sum":
        res = res.sum(0)
    else:
        raise ValueError("Aggregate by mean or sum")
    return res
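
# Example (sketch): build a document-feature matrix for a downstream
# classifier by stacking document vectors. It assumes `learn` and `data`
# were created as in the docstring example above; `texts` is an
# illustrative placeholder list of Thai documents.
import numpy as np

texts = ["วันนี้วันดีปีใหม่", "สวัสดีปีใหม่"]
features = np.stack([document_vector(t, learn, data, agg="mean") for t in texts])
# `features` has shape (len(texts), emb_sz) and can be fed to a classifier,
# e.g. scikit-learn's LogisticRegression.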