How to use the bpemb.util.sentencepiece_load function in bpemb

To help you get started, we’ve selected a few bpemb examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github bheinzerling / bpemb / bpemb / bpemb.py View on Github external
def __setstate__(self, state):
        """Restore a pickled BPEmb object and re-open its SentencePiece model.

        SentencePiece processors are not picklable, so only the path metadata
        survives pickling; here the model is re-loaded from the cache
        directory, downloading the file again if it is missing on disk.
        """
        cached = state["cache_dir"] / state["lang"] / state['model_file'].name
        if not cached.exists():
            # Model file is not in the local cache: fetch it through the
            # normal download path, which returns the materialized file.
            rel_path = Path(state["lang"]) / cached.name
            cached = self._load_file(str(rel_path), cache_dir=state["cache_dir"])
        state['spm'] = sentencepiece_load(cached)
        self.__dict__ = state
github flairNLP / flair / flair / embeddings.py View on Github external
# Write the binary SentencePiece model out to the expected cache directory.
# NOTE(review): this fragment references `state` and `model_file` from an
# enclosing scope not visible here — presumably a __setstate__-style method;
# confirm against the full source.
        self.cache_dir: Path = Path(flair.cache_root) / "embeddings"
        if "spm_model_binary" in self.__dict__:
            # if the model was saved as binary and it is not found on disk, write to appropriate path
            if not os.path.exists(self.cache_dir / state["lang"]):
                os.makedirs(self.cache_dir / state["lang"])
            self.model_file = self.cache_dir / model_file
            # Dump the pickled binary model bytes to the cache location.
            with open(self.model_file, "wb") as out:
                out.write(self.__dict__["spm_model_binary"])
        else:
            # otherwise, use normal process and potentially trigger another download
            self.model_file = self._load_file(model_file)

        # once the model file is there, load it with SentencePiece
        state["spm"] = sentencepiece_load(self.model_file)
github bheinzerling / bpemb / bpemb / bpemb.py View on Github external
if not available:
                raise ValueError("No BPEmb models for language " + lang)
            if vs not in available:
                # Requested vocab size is not offered for this language:
                # fall back to the nearest published size — the smallest if
                # the request was below the range, the largest otherwise.
                available = sorted(available)
                _vs = vs
                if vs < available[0]:
                    vs = available[0]
                else:
                    vs = available[-1]
                print("BPEmb fallback: {} from vocab size {} to {}".format(lang, _vs, vs))
        # NOTE(review): fragment of BPEmb.__init__ — `lang`, `dim`, `vs`,
        # `cache_dir`, etc. come from the enclosing signature not shown here.
        self.vocab_size = self.vs = vs
        self.dim = dim
        self.cache_dir = Path(cache_dir)
        # Download (or locate in cache) the SentencePiece model file.
        model_file = self.model_tpl.format(lang=lang, vs=vs)
        self.model_file = self._load_file(model_file)
        self.spm = sentencepiece_load(self.model_file)
        if encode_extra_options:
            self.spm.SetEncodeExtraOptions(encode_extra_options)
        # Download the embedding archive and load vectors via gensim-style loader.
        emb_file = self.emb_tpl.format(lang=lang, vs=vs, dim=dim)
        self.emb_file = self._load_file(emb_file, archive=True)
        self.emb = load_word2vec_file(self.emb_file, add_pad=add_pad_emb)
        self.most_similar = self.emb.most_similar
        # Sanity check: requested dim must match the loaded embedding matrix.
        assert self.dim == self.emb.vectors.shape[1]
        self.do_preproc = preprocess
        self.BOS_str = "<s>"
        self.EOS_str = "</s>"
        # Resolve sentence-boundary marker pieces to their integer ids.
        self.BOS = self.spm.PieceToId(self.BOS_str)
        self.EOS = self.spm.PieceToId(self.EOS_str)