How to use bpemb - 10 common examples

To help you get started, we’ve selected a few bpemb examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github jiangxinyang227 / NLP-Project / text_generator_raw / predictors / predict.py View on Github external
def __init__(self, config):
        """Set up the predictor: tokenization resources, model graph, and session state."""
        super(Predictor, self).__init__(config)
        self.config = config
        self.model = None
        self.sess = None
        # self.builder = tf.saved_model.builder.SavedModelBuilder("savedModel")

        if self.config["use_bpe"]:
            # Subword tokenization via pretrained Chinese BPE embeddings.
            self.bpe_zh = BPEmb(lang="zh", vs=config["vocab_size"])
        else:
            # Load the vocabulary and build the reverse (index -> token) lookup.
            self.word_to_idx = self.load_vocab()
            self.idx_to_label = {idx: word for word, idx in self.word_to_idx.items()}

        # Build the model graph, then restore its trained weights.
        self.create_model()
        print("load model finished")
        self.load_graph()
        print("load graph finished")
github flairNLP / flair / flair / embeddings.py View on Github external
for sentence in sentences:
            for token in sentence:
                embedding = embedded[index]
                token.set_embedding(self.name, embedding)
                index += 1

        return sentences

    def __str__(self):
        # The embedding is identified by its name everywhere; reuse it as the
        # human-readable string form.
        display = self.name
        return display

    def extra_repr(self):
        # Extra detail for repr(): only the minimum-frequency cutoff matters here.
        return f"min_freq={self.min_freq}"


class BPEmbSerializable(BPEmb):
    def __getstate__(self):
        """Return a picklable state dict with the SentencePiece model embedded as bytes.

        The model is stored as binary content rather than a file path, because the
        path may not exist on the machine that later unpickles this object.
        """
        state = self.__dict__.copy()
        # Fix: use a context manager so the file handle is closed deterministically;
        # the original `open(...).read()` leaked the handle until GC.
        with open(self.model_file, mode="rb") as model_fh:
            state["spm_model_binary"] = model_fh.read()
        # The live SentencePiece processor is not picklable; drop it. It is
        # rebuilt from the binary blob in __setstate__.
        state["spm"] = None
        return state

    def __setstate__(self, state):
        from bpemb.util import sentencepiece_load

        model_file = self.model_tpl.format(lang=state["lang"], vs=state["vs"])
        self.__dict__ = state

        # write out the binary sentence piece model into the expected directory
        self.cache_dir: Path = Path(flair.cache_root) / "embeddings"
        if "spm_model_binary" in self.__dict__:
github gentaiscool / meta-emb / utils / data.py View on Github external
if word not in word2id:
                word2id[word] = len(word2id)
                id2word[len(id2word)] = word

    for i in range(len(test_inputs)):
        for word in test_inputs[i]:
            if word not in word2id:
                word2id[word] = len(word2id)
                id2word[len(id2word)] = word

    # BPE-LEVEL
    bpe_embs = []
    if bpe_lang_list is not None:
        print("Loading BPE:", bpe_lang_list)
        for i in range(len(bpe_lang_list)):
            bpemb = BPEmb(lang=bpe_lang_list[i], dim=bpe_emb_size, vs=bpe_vocab, cache_dir=bpe_cache)
            bpe_embs.append(bpemb)

    # CHAR-LEVEL
    for i in range(len(word_list)):
        for word in word_list[i]:
            for char in word:
                if char not in char2id:
                    char2id[char] = len(char2id)
                    id2char[len(id2char)] = char

    for i in range(len(train_inputs)):
        for word in train_inputs[i]:
            for char in word:
                if char not in char2id:
                    char2id[char] = len(char2id)
                    id2char[len(id2char)] = char
github bheinzerling / bpemb / bpemb / bpemb.py View on Github external
def _load_file(self, file, archive=False, cache_dir=None):
        """Resolve *file* to a local cached path, downloading it on a cache miss.

        When *cache_dir* is not given, the instance's cache directory is used if
        present, otherwise a fresh temporary directory.
        """
        if not cache_dir:
            if hasattr(self, "cache_dir"):
                cache_dir = self.cache_dir
            else:
                from tempfile import mkdtemp
                cache_dir = mkdtemp()
        target = Path(cache_dir) / file
        # Cache hit: nothing to download.
        if target.exists():
            return target
        # Cache miss: fetch from the remote store (archives get their suffix).
        file_url = self.base_url + file + (self.archive_suffix if archive else "")
        print("downloading", file_url)
        return http_get(file_url, target, ignore_tardir=True)
github bheinzerling / bpemb / bpemb / bpemb.py View on Github external
*,
            lang: str,
            vs: int = 10000,
            dim: int = 100,
            cache_dir: Path = Path.home() / Path(".cache/bpemb"),
            preprocess: bool = True,
            encode_extra_options: str = None,
            add_pad_emb: bool = False,
            vs_fallback: bool = True):
        self.lang = lang = BPEmb._get_lang(lang)
        if self.lang == 'multi':
            if dim != 300:
                print('Setting dim=300 for multilingual BPEmb')
                dim = 300
        if vs_fallback:
            available = BPEmb.available_vocab_sizes(lang)
            if not available:
                raise ValueError("No BPEmb models for language " + lang)
            if vs not in available:
                available = sorted(available)
                _vs = vs
                if vs < available[0]:
                    vs = available[0]
                else:
                    vs = available[-1]
                print("BPEmb fallback: {} from vocab size {} to {}".format(lang, _vs, vs))
        self.vocab_size = self.vs = vs
        self.dim = dim
        self.cache_dir = Path(cache_dir)
        model_file = self.model_tpl.format(lang=lang, vs=vs)
        self.model_file = self._load_file(model_file)
        self.spm = sentencepiece_load(self.model_file)
github bheinzerling / bpemb / bpemb / bpemb.py View on Github external
state['spm'] = None
        return state

    def __setstate__(self, state):
        """Restore state and rebuild the SentencePiece processor dropped by pickling."""
        cached = state["cache_dir"] / state["lang"] / state["model_file"].name
        if not cached.exists():
            # Cache miss: download into the expected per-language directory.
            rel = Path(state["lang"]) / cached.name
            cached = self._load_file(str(rel), cache_dir=state["cache_dir"])
        state["spm"] = sentencepiece_load(cached)
        self.__dict__ = state


__all__ = [BPEmb]
github bheinzerling / bpemb / bpemb / bpemb.py View on Github external
def available_vocab_sizes(lang: str) -> Set[int]:
        """
        Look up which BPE vocabulary sizes exist for the given language.

        Parameters
        ----------
        lang: ``str``, required
            The language identifier.

        Returns
        -------
            The set of available vocabulary sizes.
        """
        # The table of sizes is generated data; import it lazily.
        from .available_vocab_sizes import vocab_sizes
        return vocab_sizes[BPEmb._get_lang(lang)]
github bheinzerling / bpemb / bpemb / bpemb.py View on Github external
def __setstate__(self, state):
        # load SentencePiece after the BPEmb object has been unpickled
        model_file = (
            state["cache_dir"] / state["lang"] / state['model_file'].name)
        if not model_file.exists():
            model_rel_path = Path(state["lang"]) / model_file.name
            model_file = self._load_file(
                str(model_rel_path), cache_dir=state["cache_dir"])
        state['spm'] = sentencepiece_load(model_file)
        self.__dict__ = state
github flairNLP / flair / flair / embeddings.py View on Github external
# write out the binary sentence piece model into the expected directory
        self.cache_dir: Path = Path(flair.cache_root) / "embeddings"
        if "spm_model_binary" in self.__dict__:
            # if the model was saved as binary and it is not found on disk, write to appropriate path
            if not os.path.exists(self.cache_dir / state["lang"]):
                os.makedirs(self.cache_dir / state["lang"])
            self.model_file = self.cache_dir / model_file
            with open(self.model_file, "wb") as out:
                out.write(self.__dict__["spm_model_binary"])
        else:
            # otherwise, use normal process and potentially trigger another download
            self.model_file = self._load_file(model_file)

        # once the model is there, load it with sentence piece
        state["spm"] = sentencepiece_load(self.model_file)
github bheinzerling / bpemb / bpemb / bpemb.py View on Github external
if not available:
                raise ValueError("No BPEmb models for language " + lang)
            if vs not in available:
                available = sorted(available)
                _vs = vs
                if vs < available[0]:
                    vs = available[0]
                else:
                    vs = available[-1]
                print("BPEmb fallback: {} from vocab size {} to {}".format(lang, _vs, vs))
        self.vocab_size = self.vs = vs
        self.dim = dim
        self.cache_dir = Path(cache_dir)
        model_file = self.model_tpl.format(lang=lang, vs=vs)
        self.model_file = self._load_file(model_file)
        self.spm = sentencepiece_load(self.model_file)
        if encode_extra_options:
            self.spm.SetEncodeExtraOptions(encode_extra_options)
        emb_file = self.emb_tpl.format(lang=lang, vs=vs, dim=dim)
        self.emb_file = self._load_file(emb_file, archive=True)
        self.emb = load_word2vec_file(self.emb_file, add_pad=add_pad_emb)
        self.most_similar = self.emb.most_similar
        assert self.dim == self.emb.vectors.shape[1]
        self.do_preproc = preprocess
        self.BOS_str = "<s>"
        self.EOS_str = "</s>"
        self.BOS = self.spm.PieceToId(self.BOS_str)
        self.EOS = self.spm.PieceToId(self.EOS_str)