special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
special_tokens.append(constants.END_OF_SENTENCE_TOKEN)
vocab = data.Vocab(special_tokens=special_tokens)
num_oov_buckets = 1
if args.sentencepiece is not None:
    import pyonmttok  # pylint: disable=import-outside-toplevel
    if args.size_multiple == 1:
        vocab_size = args.size
    else:
        # Round the vocabulary size to the next multiple of args.size_multiple,
        # reserving room for the OOV buckets.
        vocab_size = (
            args.size
            - (args.size + num_oov_buckets) % args.size_multiple
            + args.size_multiple)
    sp_params = dict(map(lambda arg: tuple(arg.split("=")), args.sentencepiece))
    sp_trainer = pyonmttok.SentencePieceLearner(
        keep_vocab=True, vocab_size=vocab_size, **sp_params)
    for data_file in args.data:
        sp_trainer.ingest_file(data_file)
    sp_trainer.learn(args.save_vocab, verbose=True)
    args.save_vocab = args.save_vocab + ".vocab"
    vocab.load(args.save_vocab, file_format="sentencepiece")
else:
    if args.from_vocab is not None:
        vocab.load(args.from_vocab, file_format=args.from_format)
    tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
    for data_file in args.data:
        vocab.add_from_text(data_file, tokenizer=tokenizer)
    vocab = vocab.prune(max_size=args.size, min_frequency=args.min_frequency)
    vocab.pad_to_multiple(args.size_multiple, num_oov_buckets=num_oov_buckets)
vocab.serialize(args.save_vocab)
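# A minimal standalone sketch of the SentencePieceLearner flow used above,
# assuming a hypothetical plain-text corpus at data/train.txt and the model
# prefix "sp_32k":
import pyonmttok

learner = pyonmttok.SentencePieceLearner(
    keep_vocab=True,            # also writes sp_32k.vocab next to sp_32k.model
    vocab_size=32000,
    character_coverage=0.9995)  # extra kwargs are forwarded to SentencePiece training
learner.ingest_file("data/train.txt")
tokenizer = learner.learn("sp_32k", verbose=True)  # returns a ready-to-use Tokenizer
tokens, _ = tokenizer.tokenize("Hello world!")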
elif self.tokenizer_opt['type'] == 'pyonmttok':
    if "params" not in self.tokenizer_opt:
        raise ValueError(
            "Missing mandatory tokenizer option 'params'")
    import pyonmttok
    # mode may be explicitly set to None in the configuration.
    mode = self.tokenizer_opt["mode"]
    # load can be called multiple times: modify a copy of the params.
    tokenizer_params = dict(self.tokenizer_opt["params"])
    for key, value in self.tokenizer_opt["params"].items():
        if key.endswith("path"):
            # Resolve model resources relative to the model directory.
            tokenizer_params[key] = os.path.join(
                self.model_root, value)
    tokenizer = pyonmttok.Tokenizer(mode, **tokenizer_params)
    self.tokenizer = tokenizer
else:
    raise ValueError("Invalid value for tokenizer type")
if self.postprocess_opt is not None:
    self.logger.info("Loading postprocessor")
    self.postprocessor = []
    for function_path in self.postprocess_opt:
        function = get_function_by_path(function_path)
        self.postprocessor.append(function)
self.load_time = timer.tick()
self.reset_unload_timer()
self.loading_lock.set()
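# A hedged sketch of the tokenizer_opt dictionary the loader above expects.
# The keys under "params" are ordinary pyonmttok.Tokenizer arguments; the
# bpe_model_path value is hypothetical and, because the key ends with "path",
# gets joined with model_root by the loop above:
tokenizer_opt = {
    "type": "pyonmttok",
    "mode": "aggressive",               # any pyonmttok tokenization mode
    "params": {
        "joiner_annotate": True,        # mark subword/punctuation joints
        "bpe_model_path": "bpe.model",  # hypothetical file name
    },
}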
def __init__(self, **kwargs):
    # Requires `import copy` and `import pyonmttok` at module level.
    # Keep the full configuration for later serialization, then pop the
    # tokenization mode (default: "conservative") before forwarding the
    # remaining options to pyonmttok.Tokenizer.
    self._config = copy.deepcopy(kwargs)
    mode = "conservative"
    if "mode" in kwargs:
        mode = kwargs["mode"]
        del kwargs["mode"]
    self._tokenizer = pyonmttok.Tokenizer(mode, **kwargs)
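# Assuming this __init__ belongs to a tokenizer wrapper class (named
# OpenNMTTokenizer here purely for illustration), construction could look like:
tokenizer = OpenNMTTokenizer(mode="aggressive", joiner_annotate=True)
tokens, _ = tokenizer._tokenizer.tokenize("Hello, World!")
# tokens == ['Hello', '￭,', 'World', '￭!'] with the default joiner '￭'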
def __init__(self, export_dir):
    # Requires `import os`, `import pyonmttok`, and `import tensorflow as tf`.
    # Load the exported SavedModel and keep its default serving signature.
    imported = tf.saved_model.load(export_dir)
    self._translate_fn = imported.signatures["serving_default"]
    # The SentencePiece model ships with the export under assets.extra/.
    sp_model_path = os.path.join(export_dir, "assets.extra", "wmtende.model")
    self._tokenizer = pyonmttok.Tokenizer("none", sp_model_path=sp_model_path)
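# A minimal sketch of how this wrapper might run a translation. The signature's
# tensor names ("tokens" and "length" for both inputs and outputs) are
# assumptions that depend on how the model was exported:
def translate(self, text):
    tokens, _ = self._tokenizer.tokenize(text)
    inputs = {
        "tokens": tf.constant([tokens], dtype=tf.string),
        "length": tf.constant([len(tokens)], dtype=tf.int32),
    }
    outputs = self._translate_fn(**inputs)
    # Keep the best hypothesis of the first (and only) batch entry.
    length = outputs["length"][0][0].numpy()
    out_tokens = outputs["tokens"][0][0][:length].numpy().tolist()
    return self._tokenizer.detokenize([t.decode("utf-8") for t in out_tokens])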
import six


def build_tokenizer(args):
    """Builds a tokenizer based on user arguments."""
    import pyonmttok
    local_args = {}
    for k, v in six.iteritems(args):
        if isinstance(v, six.string_types):
            # The bindings expect UTF-8 encoded strings under Python 2.
            local_args[k] = v.encode('utf-8')
        else:
            local_args[k] = v
    mode = local_args['mode']
    del local_args['mode']
    # The vocabulary is handled separately; it is not a Tokenizer option.
    del local_args['vocabulary']
    return pyonmttok.Tokenizer(mode, **local_args)
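# Hedged usage sketch (Python 2 era, given the UTF-8 encoding above); the
# bpe_model_path and vocabulary values are hypothetical:
args = {
    "mode": "aggressive",
    "joiner_annotate": True,
    "bpe_model_path": "/models/bpe32k.model",
    "vocabulary": "/models/vocab.txt",  # dropped before Tokenizer construction
}
tokenizer = build_tokenizer(args)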