How to use the pyonmttok.Tokenizer function in pyonmttok

To help you get started, we’ve selected a few pyonmttok.Tokenizer examples based on popular ways it is used in public projects.

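Before looking at project code, here is the basic pattern: construct a Tokenizer with a tokenization mode and options, then call tokenize and detokenize. A minimal sketch, with illustrative option values that are not taken from any of the projects below:

import pyonmttok

# Illustrative options; "conservative" is the library's default mode.
tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=True)

tokens, features = tokenizer.tokenize("Hello World!")
# tokens should look roughly like ['Hello', 'World', '￭!'] (joiner-annotated)
text = tokenizer.detokenize(tokens)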

github OpenNMT / OpenNMT-py / onmt / translate / translation_server.py
            elif self.tokenizer_opt['type'] == 'pyonmttok':
                if "params" not in self.tokenizer_opt:
                    raise ValueError(
                        "Missing mandatory tokenizer option 'params'")
                import pyonmttok
                if self.tokenizer_opt["mode"] is not None:
                    mode = self.tokenizer_opt["mode"]
                else:
                    mode = None
                # load can be called multiple times: modify copy
                tokenizer_params = dict(self.tokenizer_opt["params"])
                for key, value in self.tokenizer_opt["params"].items():
                    if key.endswith("path"):
                        tokenizer_params[key] = os.path.join(
                            self.model_root, value)
                tokenizer = pyonmttok.Tokenizer(mode,
                                                **tokenizer_params)
                self.tokenizer = tokenizer
            else:
                raise ValueError("Invalid value for tokenizer type")

        if self.postprocess_opt is not None:
            self.logger.info("Loading postprocessor")
            self.postprocessor = []

            for function_path in self.postprocess_opt:
                function = get_function_by_path(function_path)
                self.postprocessor.append(function)

        self.load_time = timer.tick()
        self.reset_unload_timer()
        self.loading_lock.set()
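For context, the block above reads a tokenizer_opt dictionary from the server configuration: a "type", a "mode", and a "params" mapping whose path-like entries are resolved against model_root. A hedged sketch of such a configuration; the option names and values below are assumptions, not copied from a real config file:

tokenizer_opt = {
    "type": "pyonmttok",
    "mode": "aggressive",              # passed as the first Tokenizer argument
    "params": {
        "joiner_annotate": True,
        # keys ending in "path" are joined with self.model_root before loading
        "bpe_model_path": "model.bpe",
    },
}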
github OpenNMT / OpenNMT-tf / opennmt / tokenizers / opennmt_tokenizer.py
def __init__(self, **kwargs):
    self._config = copy.deepcopy(kwargs)
    mode = "conservative"
    if "mode" in kwargs:
      mode = kwargs["mode"]
      del kwargs["mode"]
    self._tokenizer = pyonmttok.Tokenizer(mode, **kwargs)
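The wrapper above simply separates the "mode" entry from the remaining keyword arguments before forwarding them to pyonmttok. The same pattern written standalone, with example option names:

import pyonmttok

kwargs = {"mode": "aggressive", "joiner_annotate": True, "segment_numbers": True}
mode = kwargs.pop("mode", "conservative")  # fall back to the default mode
tokenizer = pyonmttok.Tokenizer(mode, **kwargs)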
github OpenNMT / OpenNMT-tf / examples / serving / python / ende_client.py
def __init__(self, export_dir):
    imported = tf.saved_model.load(export_dir)
    self._translate_fn = imported.signatures["serving_default"]
    sp_model_path = os.path.join(export_dir, "assets.extra", "wmtende.model")
    self._tokenizer = pyonmttok.Tokenizer("none", sp_model_path=sp_model_path)
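With mode "none", pyonmttok performs no tokenization of its own and delegates segmentation entirely to the SentencePiece model, so the object above is only the pre- and post-processing step around the SavedModel call. A standalone sketch of that round trip, with an illustrative model path:

import pyonmttok

# The model path is illustrative; any trained SentencePiece model works here.
tokenizer = pyonmttok.Tokenizer("none", sp_model_path="wmtende.model")
tokens, _ = tokenizer.tokenize("Hello world!")
# The token list is what gets fed to the serving signature;
# detokenize is the inverse, applied to the model's output tokens.
text = tokenizer.detokenize(tokens)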
github SYSTRAN / similarity / src / tokenizer.py
def build_tokenizer(args):
    """Builds a tokenizer based on user arguments."""
    import pyonmttok
    local_args = {}
    for k, v in six.iteritems(args):
        if isinstance(v, six.string_types):
            local_args[k] = v.encode('utf-8')
        else:
            local_args[k] = v
    mode = local_args['mode']
    del local_args['mode']
    del local_args['vocabulary']
    return pyonmttok.Tokenizer(mode, **local_args)
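A hedged usage sketch for build_tokenizer: the args dictionary must contain both "mode" and "vocabulary", since the function pops these before forwarding the remaining options to pyonmttok. The values below are illustrative, the module needs six importable, and the byte-encoding step is a Python 2 holdover, so treat this as a sketch rather than a recommended call:

# Illustrative arguments; build_tokenizer requires "mode" and "vocabulary" keys.
args = {
    "mode": "aggressive",
    "vocabulary": None,        # required key; deleted before construction
    "joiner_annotate": True,
}
tokenizer = build_tokenizer(args)
tokens, _ = tokenizer.tokenize("some text to tokenize")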

pyonmttok

Fast and customizable text tokenization library with BPE and SentencePiece support. MIT license.