def __init__(self, *args, **kwargs):
    self.keep_whitespace = True
    self.algorithm = DEFAULT_WORD_TOKENIZE_ENGINE
    self.separator = DEFAULT_WORD_TOKEN_SEPARATOR
    self.run = word_tokenize
    super().__init__(*args, **kwargs)
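
# --- Usage sketch (illustrative): the wrapper above keeps word_tokenize as its
# default `run` callable and an engine name in `algorithm`; a direct call to
# PyThaiNLP's word_tokenize shows the same defaults in action. The Thai sample
# string is an assumption, not taken from the source.
from pythainlp.tokenize import word_tokenize

tokens = word_tokenize("ผมรักภาษาไทย", engine="newmm", keep_whitespace=True)
print(tokens)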
    # Russian (razdel)
    sentences = wordless_sentence_tokenize(main, text, lang = 'rus')

    for sentence in sentences:
        tokens_hierarchical.append([token.text for token in razdel.tokenize(sentence)])
# Thai
elif 'PyThaiNLP' in word_tokenizer:
    # Preserve sentence boundaries
    sentences = wordless_sentence_tokenize(main, text, lang = 'tha',
                                           sentence_tokenizer = 'PyThaiNLP - Thai Sentence Tokenizer')

    if word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm + TCC'):
        for sentence in sentences:
            tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine = 'newmm'))
    elif word_tokenizer == main.tr('PyThaiNLP - Maximum Matching Algorithm'):
        for sentence in sentences:
            tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine = 'mm'))
    elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
        for sentence in sentences:
            tokens_hierarchical.append(pythainlp.tokenize.word_tokenize(sentence, engine = 'longest-matching'))
# Tibetan
elif 'botok' in word_tokenizer:
    if flat_tokens:
        sentences = [text]
    else:
        sentences = wordless_sentence_tokenize(main, text, lang = 'bod')

    botok_tokenizer = wordless_text_utils.check_botok_tokenizers(main, word_tokenizer)

    for sentence in sentences:
        tokens_hierarchical.append([token.text for token in botok_tokenizer.tokenize(sentence)])
# Vietnamese
elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
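
# --- Standalone sketch of the same sentence-then-token hierarchy built with
# razdel alone, since the wordless_* helpers above are internal to Wordless.
# The Russian sample text is illustrative.
import razdel

text = "Это первое предложение. Это второе."

tokens_hierarchical = []
for sentence in razdel.sentenize(text):
    # One list of token strings per sentence, mirroring tokens_hierarchical above.
    tokens_hierarchical.append([token.text for token in razdel.tokenize(sentence.text)])

print(tokens_hierarchical)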
def tokenize(self, text):
    """
    :meth: tokenize text with the selected engine
    :param str text: text to tokenize
    :return: list of tokens
    """
    return [t for t in word_tokenize(self.sub_br(text), engine=self.engine)]
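
# --- Direct equivalent (illustrative), assuming the PyThaiNLP engines named in
# the branches above; engine availability varies across PyThaiNLP versions, and
# the Thai sample string is an assumption.
from pythainlp.tokenize import word_tokenize

text = "ตัดคำภาษาไทย"
print(word_tokenize(text, engine="newmm"))    # dictionary-based maximum matching + TCC (the default)
print(word_tokenize(text, engine="longest"))  # longest-matching engine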
    # Japanese: KyTea with its Japanese model
    import Mykytea
    opt = "-model jp-0.4.7-1.mod"
    tokenizer = Mykytea.Mykytea(opt)
# Simplified Chinese: KyTea with a Chinese model
elif lang == "zh_cn":
    import Mykytea
    opt = "-model ctb-0.4.0-1.mod"
    tokenizer = Mykytea.Mykytea(opt)
# Traditional Chinese: jieba (the module itself is returned)
elif lang == "zh_tw":
    import jieba
    tokenizer = jieba
# Vietnamese
elif lang == "vi":
    from pyvi import ViTokenizer
    tokenizer = ViTokenizer
# Thai
elif lang == "th":
    from pythainlp.tokenize import word_tokenize
    tokenizer = word_tokenize
# Arabic
elif lang == "ar":
    import pyarabic.araby as araby
    tokenizer = araby
# elif lang == "en":
#     from nltk import word_tokenize
#     tokenizer = word_tokenize
# Fallback for all other languages
else:
    from nltk.tokenize import ToktokTokenizer
    tokenizer = ToktokTokenizer()

return tokenizer
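
# --- Quick check of two of the branches above (illustrative), assuming jieba
# and NLTK are installed; the sample strings are assumptions.
import jieba
from nltk.tokenize import ToktokTokenizer

# jieba is returned as a module because it exposes cut()/lcut() directly.
print(jieba.lcut("我来到北京清华大学"))

# Toktok is the catch-all tokenizer for languages without a dedicated branch.
print(ToktokTokenizer().tokenize("This is the default branch."))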
def _translate(text):
    tokenized_sentence = " ".join(th_word_tokenize(text))
    _hypothesis = th2en_word2word_model.translate(tokenized_sentence)
    hypothesis = en_word_detokenize.detokenize([_hypothesis])

    return hypothesis
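
# --- th2en_word2word_model and en_word_detokenize are module-level objects not
# shown in this excerpt. A minimal sketch of just the detokenization step, using
# NLTK's Treebank detokenizer as an assumed stand-in.
from nltk.tokenize.treebank import TreebankWordDetokenizer

detok = TreebankWordDetokenizer()
hypothesis_tokens = "I love the Thai language .".split()
print(detok.detokenize(hypothesis_tokens))  # re-joins tokens into plain English text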
def get_sentiment(text, return_score=False):
    words = word_tokenize(text)
    tensor = LongTensor([stoi[word] for word in words]).view(-1, 1).cpu()
    tensor = Variable(tensor, volatile=False)
    MODEL.reset()
    pred, *_ = MODEL(tensor)
    result = pred.data.cpu().numpy().reshape(-1)

    if return_score:
        return softmax(result)
    else:
        return np.argmax(result)
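
# --- softmax is assumed to be defined elsewhere in the original module; a
# minimal NumPy version consistent with how it is called above (the shift by
# the max is only for numerical stability).
import numpy as np

def softmax(x):
    e = np.exp(x - np.max(x))
    return e / e.sum()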
def document_vector(ss, m, stoi, tok_engine="newmm"):
    """
    :meth: `document_vector` gets a document vector using a pretrained ULMFiT model
    :param str ss: sentence to extract embeddings from
    :param m: pyTorch model
    :param dict stoi: string-to-integer dict e.g. {'_unk_':0, '_pad_':1, 'first_word':2, 'second_word':3, ...}
    :param str tok_engine: tokenization engine (`newmm` is recommended with the pretrained ULMFiT model)
    :return: `numpy.array` document vector of size 300
    """
    s = word_tokenize(ss, engine=tok_engine)
    t = LongTensor([stoi[i] for i in s]).view(-1, 1).cuda()
    t = Variable(t, volatile=False)
    m.reset()
    pred, *_ = m[0](t)
    # get the average of the last LSTM layer along bptt
    res = to_np(torch.mean(pred[-1], 0).view(-1))

    return res
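
# --- Typical downstream use (illustrative): compare two documents by cosine
# similarity of their 300-dimensional vectors. m and stoi must come from the
# pretrained ULMFiT model, so the example calls are left commented out.
import numpy as np

def cosine_similarity(a, b):
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# v1 = document_vector("ประโยคแรก", m, stoi)
# v2 = document_vector("ประโยคที่สอง", m, stoi)
# print(cosine_similarity(v1, v2))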
def Text(text):
    if not isinstance(text, list):
        text = word_tokenize(str(text))

    return nltk.Text(text)
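
# --- Usage sketch (illustrative): nltk.Text wraps a token list with corpus
# exploration helpers such as count() and concordance(); the English token
# list is an assumption.
import nltk

t = nltk.Text(["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"])
print(t.count("the"))  # frequency of a token
t.concordance("fox")   # keyword-in-context listing printed to stdout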