import markovify
from sacremoses import MosesDetokenizer, MosesTokenizer

tokenizer = MosesTokenizer()
detokenizer = MosesDetokenizer()


class MarkovText(markovify.Text):
    def word_split(self, sentence):
        return tokenizer.tokenize(sentence)

    def word_join(self, words):
        return detokenizer.detokenize(words, return_str=True)


class MarkovUserName(markovify.Text):
    def word_split(self, word):
        return list(word)

    def word_join(self, characters):
        return "".join(characters)
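A minimal usage sketch for the two model classes above. The corpus strings are made up for illustration; with inputs this small, markovify's make_sentence may well return None.

corpus = (
    "The quick brown fox jumps over the lazy dog. "
    "The lazy dog sleeps while the quick fox runs."
)

# Word-level model: splits with MosesTokenizer, joins with MosesDetokenizer.
text_model = MarkovText(corpus, state_size=2)
print(text_model.make_sentence(tries=100))  # may be None for a very small corpus

# Character-level model for short strings such as user names (corpus made up).
name_model = MarkovUserName("alice. bob. carol. dave.", state_size=2)
print(name_model.make_sentence(tries=100))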
from functools import partial

import click
from tqdm import tqdm

from sacremoses import MosesDetokenizer
from sacremoses.util import parallelize_preprocess  # sacremoses' joblib-based helper


def detokenize_file(language, processes, xml_unescape, encoding, quiet):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(moses.detokenize, return_str=True, unescape=xml_unescape)
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # With a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detokenize(str.split(line)), end="\n", file=fout)
            else:
                document_iterator = map(str.split, fin.readlines())
                for outline in parallelize_preprocess(
                    moses_detokenize, document_iterator, processes, progress_bar=(not quiet)
                ):
                    print(outline, end="\n", file=fout)
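detokenize_file looks like the click-based CLI handler that sacremoses ships for its detokenize command. For one-off use, the library call it wraps can be invoked directly; a minimal sketch:

from sacremoses import MosesDetokenizer

md = MosesDetokenizer(lang="en")
tokens = ["Hello", ",", "how", "are", "you", "?"]
print(md.detokenize(tokens, return_str=True))  # -> Hello, how are you?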
def _load_model(self, filename):
    """
    Loads a pre-trained truecasing file.

    :returns: A dictionary of the best, known objects as values from `_casing_to_model()`
    :rtype: {'best': dict, 'known': Counter}
    """
    casing = defaultdict(Counter)
    with open(filename, encoding=self.encoding) as fin:
        for line in fin:
            line = line.strip().split()
            # `grouper` (a sacremoses utility) walks the line in (token, count) pairs.
            for token, count in grouper(line, 2):
                count = count.split("/")[0].strip("()")
                casing[token.lower()][token] = int(count)
    # Returns the best and known object from `_casing_to_model()`
    return self._casing_to_model(casing)
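_load_model belongs to sacremoses' truecasing support (MosesTruecaser). A minimal, hedged sketch of loading and applying a pre-trained model; the file name below is a placeholder:

from sacremoses import MosesTruecaser

# "big.truecasemodel" is a placeholder path to a trained truecasing model.
mtr = MosesTruecaser("big.truecasemodel")
print(mtr.truecase("the adventures of sherlock holmes", return_str=True))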
# Excerpt from sacremoses' MosesDetokenizer: language-aware rules that decide
# whether each token attaches to the previous one or gets a separating space.

# Keep track of no. of quotation marks.
quote_counts = {u"'": 0, u'"': 0, u"``": 0, u"`": 0, u"''": 0}

# The *prepend_space* variable is used to control the "effects" of
# detokenization as the function loops through the list of tokens and
# changes the *prepend_space* accordingly as it sequentially checks
# through the language specific and language independent conditions.
prepend_space = " "
detokenized_text = ""
tokens = text.split()
# Iterate through every token and apply language specific detokenization rule(s).
for i, token in enumerate(iter(tokens)):
    # Check if the first char is CJK.
    if is_cjk(token[0]) and self.lang != "ko":
        # Perform left shift if this is a second consecutive CJK word.
        if i > 0 and is_cjk(tokens[i - 1][-1]):
            detokenized_text += token
        # But do nothing special if this is a CJK word that doesn't follow a CJK word
        else:
            detokenized_text += prepend_space + token
        prepend_space = " "
    # If it's a currency symbol.
    elif re.search(u"^[" + self.IsSc + u"\(\[\{\¿\¡]+$", token):
        # Perform right shift on currency and other random punctuation items
        detokenized_text += prepend_space + token
        prepend_space = ""
    elif re.search(r"^[\,\.\?\!\:\;\\\%\}\]\)]+$", token):
        # In French, these punctuations are prefixed with a non-breakable space.
        if self.lang == "fr" and re.search(r"^[\?\!\:\;\\\%]$", token):
            detokenized_text += " "
        # Perform left shift on punctuation items.
        detokenized_text += token
        prepend_space = " "
def __init__(self, args):
    self.args = args

    if getattr(args, 'moses_source_lang', None) is None:
        args.moses_source_lang = getattr(args, 'source_lang', 'en')
    if getattr(args, 'moses_target_lang', None) is None:
        args.moses_target_lang = getattr(args, 'target_lang', 'en')

    try:
        # Import lazily so sacremoses stays an optional dependency.
        from sacremoses import MosesTokenizer, MosesDetokenizer
        self.tok = MosesTokenizer(args.moses_source_lang)
        self.detok = MosesDetokenizer(args.moses_target_lang)
    except ImportError:
        raise ImportError('Please install Moses tokenizer with: pip install sacremoses')
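A minimal round-trip sketch with the sacremoses objects the wrapper above creates:

from sacremoses import MosesTokenizer, MosesDetokenizer

tok = MosesTokenizer(lang="en")
detok = MosesDetokenizer(lang="en")

tokens = tok.tokenize("Hello World, this is sacremoses!")
print(tokens)                    # ['Hello', 'World', ',', 'this', 'is', 'sacremoses', '!']
print(detok.detokenize(tokens))  # Hello World, this is sacremoses!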
from sacremoses import MosesTokenizer


def tokenize_captions(captions, lang='en'):
    """Tokenizes a list of captions with the Moses tokenizer."""
    tokenizer = MosesTokenizer(lang=lang)
    return [tokenizer.tokenize(caption, return_str=True) for caption in captions]
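A quick usage sketch (the captions are made up):

captions = ["A man rides a horse.", "Two dogs play in the park."]
print(tokenize_captions(captions))
# ['A man rides a horse .', 'Two dogs play in the park .']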
# Excerpt from a word-tokenizer dispatcher (Wordless project): the Sacremoses
# branches sit alongside the neighbouring NLTK and spaCy branches.
            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang = wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.tokenize(sentence, escape = False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(lang = wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens: