How to use pyonmttok - 7 common examples

To help you get started, we’ve selected a few pyonmttok examples that show popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github OpenNMT / OpenNMT-tf / opennmt / bin / build_vocab.py View on Github external
special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
    special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

  vocab = data.Vocab(special_tokens=special_tokens)
  num_oov_buckets = 1

  if args.sentencepiece is not None:
    import pyonmttok  # pylint: disable=import-outside-toplevel
    if args.size_multiple == 1:
      vocab_size = args.size
    else:
      # Round vocabulary size to the next multiple of args.size_multiple
      vocab_size = (
          args.size - (args.size + num_oov_buckets) % args.size_multiple + args.size_multiple)
    sp_params = dict(map(lambda arg: tuple(arg.split("=")), args.sentencepiece))
    sp_trainer = pyonmttok.SentencePieceLearner(
        keep_vocab=True, vocab_size=vocab_size, **sp_params)
    for data_file in args.data:
      sp_trainer.ingest_file(data_file)
    sp_trainer.learn(args.save_vocab, verbose=True)
    args.save_vocab = args.save_vocab + ".vocab"
    vocab.load(args.save_vocab, file_format="sentencepiece")
  else:
    if args.from_vocab is not None:
      vocab.load(args.from_vocab, file_format=args.from_format)
    tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
    for data_file in args.data:
      vocab.add_from_text(data_file, tokenizer=tokenizer)
    vocab = vocab.prune(max_size=args.size, min_frequency=args.min_frequency)
    vocab.pad_to_multiple(args.size_multiple, num_oov_buckets=num_oov_buckets)

  vocab.serialize(args.save_vocab)
github omerktz / TraFix / open_nmt / onmt / translate / translation_server.py View on Github external
elif self.tokenizer_opt['type'] == 'pyonmttok':
                if "params" not in self.tokenizer_opt:
                    raise ValueError(
                        "Missing mandatory tokenizer option 'params'")
                import pyonmttok
                if self.tokenizer_opt["mode"] is not None:
                    mode = self.tokenizer_opt["mode"]
                else:
                    mode = None
                # load can be called multiple times: modify copy
                tokenizer_params = dict(self.tokenizer_opt["params"])
                for key, value in self.tokenizer_opt["params"].items():
                    if key.endswith("path"):
                        tokenizer_params[key] = os.path.join(
                            self.model_root, value)
                tokenizer = pyonmttok.Tokenizer(mode,
                                                **tokenizer_params)
                self.tokenizer = tokenizer
            else:
                raise ValueError("Invalid value for tokenizer type")

        self.load_time = timer.tick()
        self.reset_unload_timer()
        self.loading_lock.set()
github OpenNMT / OpenNMT-py / onmt / translate / translation_server.py View on Github external
elif self.tokenizer_opt['type'] == 'pyonmttok':
                if "params" not in self.tokenizer_opt:
                    raise ValueError(
                        "Missing mandatory tokenizer option 'params'")
                import pyonmttok
                if self.tokenizer_opt["mode"] is not None:
                    mode = self.tokenizer_opt["mode"]
                else:
                    mode = None
                # load can be called multiple times: modify copy
                tokenizer_params = dict(self.tokenizer_opt["params"])
                for key, value in self.tokenizer_opt["params"].items():
                    if key.endswith("path"):
                        tokenizer_params[key] = os.path.join(
                            self.model_root, value)
                tokenizer = pyonmttok.Tokenizer(mode,
                                                **tokenizer_params)
                self.tokenizer = tokenizer
            else:
                raise ValueError("Invalid value for tokenizer type")

        if self.postprocess_opt is not None:
            self.logger.info("Loading postprocessor")
            self.postprocessor = []

            for function_path in self.postprocess_opt:
                function = get_function_by_path(function_path)
                self.postprocessor.append(function)

        self.load_time = timer.tick()
        self.reset_unload_timer()
        self.loading_lock.set()
github OpenNMT / OpenNMT-tf / opennmt / tokenizers / opennmt_tokenizer.py View on Github external
def __init__(self, **kwargs):
    """Create the underlying ``pyonmttok.Tokenizer`` from keyword options.

    A deep copy of the options exactly as received (including any "mode"
    entry) is kept in ``self._config``. The "mode" option, if given, is
    passed to the tokenizer as its positional argument; when absent it
    falls back to "conservative". All remaining options are forwarded as
    keyword arguments.
    """
    self._config = copy.deepcopy(kwargs)
    # "mode" is positional for pyonmttok.Tokenizer, so it must not also be
    # present among the forwarded keyword options.
    tokenization_mode = kwargs.pop("mode", "conservative")
    self._tokenizer = pyonmttok.Tokenizer(tokenization_mode, **kwargs)
github Niger-Volta-LTI / iranlowo / src / onmt / translate / translation_server.py View on Github external
elif self.tokenizer_opt['type'] == 'pyonmttok':
                if "params" not in self.tokenizer_opt:
                    raise ValueError(
                        "Missing mandatory tokenizer option 'params'")
                import pyonmttok
                if self.tokenizer_opt["mode"] is not None:
                    mode = self.tokenizer_opt["mode"]
                else:
                    mode = None
                # load can be called multiple times: modify copy
                tokenizer_params = dict(self.tokenizer_opt["params"])
                for key, value in self.tokenizer_opt["params"].items():
                    if key.endswith("path"):
                        tokenizer_params[key] = os.path.join(
                            self.model_root, value)
                tokenizer = pyonmttok.Tokenizer(mode,
                                                **tokenizer_params)
                self.tokenizer = tokenizer
            else:
                raise ValueError("Invalid value for tokenizer type")

        self.load_time = timer.tick()
        self.reset_unload_timer()
        self.loading_lock.set()
github OpenNMT / OpenNMT-tf / examples / serving / python / ende_client.py View on Github external
def __init__(self, export_dir):
    """Load the exported model and the tokenizer stored alongside it.

    Args:
      export_dir: Directory passed to ``tf.saved_model.load``; the tokenizer
        model is read from ``assets.extra/wmtende.model`` inside it.
    """
    saved_model = tf.saved_model.load(export_dir)
    # Keep only the "serving_default" signature as the translation entry point.
    self._translate_fn = saved_model.signatures["serving_default"]
    # Mode "none" with sp_model_path set: tokenization is driven by the model
    # file shipped in the export directory.
    self._tokenizer = pyonmttok.Tokenizer(
        "none",
        sp_model_path=os.path.join(export_dir, "assets.extra", "wmtende.model"))
github SYSTRAN / similarity / src / tokenizer.py View on Github external
def build_tokenizer(args):
    """Create a ``pyonmttok.Tokenizer`` from a mapping of user options.

    Args:
        args: Mapping of option names to values. It must contain a "mode"
            entry (used as the tokenizer's positional mode argument) and a
            "vocabulary" entry (dropped before the tokenizer is built).

    Returns:
        The tokenizer constructed from the mode and the remaining options.

    Raises:
        KeyError: If "mode" or "vocabulary" is missing from ``args``.
    """
    import pyonmttok

    # Work on a fresh dict so the caller's mapping is left untouched; string
    # values are handed over as UTF-8 encoded bytes, everything else as-is.
    options = {
        name: value.encode('utf-8') if isinstance(value, six.string_types) else value
        for name, value in six.iteritems(args)
    }
    mode = options.pop('mode')
    # "vocabulary" is not forwarded to the tokenizer.
    options.pop('vocabulary')
    return pyonmttok.Tokenizer(mode, **options)

pyonmttok

Fast and customizable text tokenization library with BPE and SentencePiece support

MIT
Latest version published 2 years ago

Package Health Score

54 / 100
Full package analysis