How to use the sentencepiece.SentencePieceTrainer.Train function in sentencepiece

To help you get started, we’ve selected a few sentencepiece examples based on popular ways it is used in public projects.

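For orientation before the project examples below: the trainer takes a single string of command-line style flags and writes a .model and a .vocab file next to the given model prefix. A minimal sketch, assuming a plain-text corpus with one sentence per line ('corpus.txt' and 'my_model' are placeholder names):

import sentencepiece as spm

# Train a small unigram model from a one-sentence-per-line text file.
# Produces my_model.model and my_model.vocab in the current directory.
spm.SentencePieceTrainer.Train(
    "--input=corpus.txt --model_prefix=my_model "
    "--vocab_size=8000 --model_type=unigram"
)

# Load the trained model and segment some text.
sp = spm.SentencePieceProcessor()
sp.Load("my_model.model")
print(sp.EncodeAsPieces("hello world"))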

github facebookresearch / wav2letter / recipes / models / seq2seq_tds / librispeech / prepare.py
                        transcription = line.strip().split(" ")[3:]
                        if key == "train":
                            ftext.write(" ".join(transcription) + "\n")
                        word_dict[key].update(transcription)
    lexicon_words = sorted(word_dict["train"] | word_dict["dev"])

    # train
    print("Computing word pieces...\n", flush=True)
    train_cmd = (
        "--input={input} --model_prefix={prefix} --vocab_size={sz}"
        " --character_coverage=1.0 --model_type=unigram"
        " --split_by_unicode_script=false".format(
            input=train_all_text, prefix=prefix, sz=num_wordpieces
        )
    )
    spm.SentencePieceTrainer.Train(train_cmd)

    # word piece dictionary
    print("Creating word piece list...\n", flush=True)
    exclude_list = {"", "<s>", "</s>"}
    with open(vocab_name.replace(".vocab", ".tokens"), "w") as fvocab_filt:
        with open(vocab_name, "r", encoding="utf-8") as fvocab:
            for line in fvocab:
                val, _ = line.strip().split("\t", 1)
                if val not in exclude_list:
                    fvocab_filt.write(val.replace("\u2581", "_") + "\n")

    # word -> word piece lexicon for loading targets
    print("Creating word -> word pieces lexicon...\n", flush=True)
    sp = spm.SentencePieceProcessor()
    sp.Load(model_name)
    lexicon_name = "librispeech-train+dev-unigram-{sz}-nbest{n}.lexicon".format(
github Koziev / chatbot / ruchatbot / experiments / train_synonymy_detector_xgb_pairwise_ranking.py
def fit(self, phrases):
        sp_corpus_path = os.path.join(tmp_dir, 'new_synonymy_detector.sentence_piece_corpus.txt')
        if not os.path.exists(sp_corpus_path):
            with io.open(sp_corpus_path, 'w', encoding='utf-8') as wrt:
                for phrase in phrases:
                    wrt.write(u'{}\n'.format(phrase))

        sp_model_name = 'new_synonymy_detector_{}'.format(self.vocab_size)
        if not os.path.exists(os.path.join(tmp_dir, sp_model_name + '.vocab')):
            logging.info('Start training SentencePiece for vocab_size={}'.format(self.vocab_size))
            spm.SentencePieceTrainer.Train(
                '--input={} --model_prefix={} --vocab_size={} --model_type=bpe'.format(sp_corpus_path, sp_model_name, self.vocab_size))
            os.rename(sp_model_name + '.vocab', os.path.join(tmp_dir, sp_model_name + '.vocab'))
            os.rename(sp_model_name + '.model', os.path.join(tmp_dir, sp_model_name + '.model'))

        self.splitter = spm.SentencePieceProcessor()
        self.splitter.Load(os.path.join(tmp_dir, sp_model_name + '.model'))

        pieces = set()
        for phrase in phrases:
            px = self.splitter.EncodeAsPieces(phrase)
            pieces.update(px)

        self.piece2index = dict((piece, i) for i, piece in enumerate(pieces))
        self.nb_shingles = len(self.piece2index)
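
The fitted piece2index mapping can then turn arbitrary phrases into fixed-size bag-of-pieces features. The helper below is a hypothetical sketch (not part of the original repository) showing one way it might be used:

import numpy as np

# Hypothetical helper: binary bag-of-word-pieces vector for a phrase,
# built from the splitter and piece2index produced by fit() above.
def bag_of_pieces(phrase, splitter, piece2index, nb_shingles):
    vec = np.zeros(nb_shingles, dtype=np.float32)
    for piece in splitter.EncodeAsPieces(phrase):
        idx = piece2index.get(piece)
        if idx is not None:  # pieces unseen during fit() are skipped
            vec[idx] = 1.0
    return vec
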
github eladhoffer / seq2seq.pytorch / seq2seq / tools / tokenizer.py
--vocab_size: vocabulary size, e.g., 8000, 16000, or 32000
        --character_coverage: amount of characters covered by the model
        --model_type: model type. Choose from unigram (default), bpe, char, or word. The input sentence must be pretokenized when using word type.
        """
        kwargs.update({'unk_piece': UNK_TOKEN, 'bos_piece': BOS_TOKEN,
                       'eos_piece': EOS_TOKEN, 'pad_piece': PAD_TOKEN,
                       'unk_id': UNK, 'bos_id': BOS,
                       'eos_id': EOS, 'pad_id': PAD,
                       'unk_surface': UNK_TOKEN,
                       })
        for arg, val in kwargs.items():
            if isinstance(val, bool):
                kwargs[arg] = 'true' if val else 'false'
        config = ' '.join(['--{}={}'.format(name, value)
                           for name, value in kwargs.items() if value is not None])
        spm.SentencePieceTrainer.Train(config)
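
Newer releases of the Python wrapper also accept keyword arguments directly, which avoids assembling the flag string by hand. A hedged equivalent of the call above, with placeholder file names:

import sentencepiece as spm

# Keyword-argument form (supported by recent sentencepiece releases);
# 'corpus.txt' and 'spm_model' are placeholders.
spm.SentencePieceTrainer.Train(
    input='corpus.txt',
    model_prefix='spm_model',
    vocab_size=8000,
    model_type='unigram',
    character_coverage=1.0,
)
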
github NVIDIA / OpenSeq2Seq / tokenizer_wrapper.py
def train_tokenizer_model(args):
  print("========> Training tokenizer model")
  vocab_size = args.vocab_size
  model_prefix = args.model_prefix
  input_file = args.text_input

  spm.SentencePieceTrainer.Train(
    "--input={0} --model_type=bpe --model_prefix={1} --vocab_size={2} --pad_id={3} --eos_id={4} --bos_id={5} --unk_id={6}"
      .format(input_file,
              model_prefix, vocab_size, 0,  # pad_id (TODO: these IDs should not be hardcoded)
              1, 2,  # eos_id, bos_id
              3)  # unk_id
  )
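
After training, the special-token IDs passed above can be verified on the resulting model. A small hedged check, where model_prefix stands for the same args.model_prefix used above:

import sentencepiece as spm

# Sanity-check the hardcoded special-token IDs on the trained model.
sp = spm.SentencePieceProcessor()
sp.Load(model_prefix + ".model")
print(sp.pad_id(), sp.eos_id(), sp.bos_id(), sp.unk_id())  # expected: 0 1 2 3
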
github Separius / BERT-keras / data / vocab.py
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
                 vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
        super().__init__(vocab_size)
        if not os.path.exists('{}.model'.format(model_name)):
            if spm_model_type.lower() not in ('unigram', 'bpe', 'char', 'word'):
                raise ValueError(
                    '{} is not a valid model_type for sentence piece, '
                    'valid options are: unigram, bpe, char, word'.format(spm_model_type))
            spm.SentencePieceTrainer.Train(
                '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
                '--character_coverage={coverage} --model_type={model_type} '
                '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '.format(
                    input=text_corpus_address, model_name=model_name, vocab_size=vocab_size, coverage=1,
                    model_type=spm_model_type.lower()))
        self.sp = spm.SentencePieceProcessor()
        self.sp.load('{}.model'.format(model_name))
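
Once loaded, the processor round-trips text into ids or pieces. A brief usage sketch, assuming the default model_name ('spm') and some illustrative sample text:

import sentencepiece as spm

# Usage sketch once the model above exists; 'spm.model' corresponds to the
# default model_name='spm'.
sp = spm.SentencePieceProcessor()
sp.Load('spm.model')
ids = sp.EncodeAsIds("an example sentence")
pieces = sp.EncodeAsPieces("an example sentence")
print(pieces, sp.DecodeIds(ids))
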
github asyml / texar-pytorch / texar / torch / data / tokenizers / sentencepiece_tokenizer.py
Path to the cache directory.

        .. _`sentencepiece.SentencePieceTrainer.Train`:
            https://github.com/google/sentencepiece/blob/master/python/sentencepiece.py
        """
        if cache_dir is None:
            cache_path = str(default_download_dir('SentencePiece'))
        else:
            if not os.path.isdir(cache_dir):
                raise ValueError(f"Cache directory ({cache_dir}) should be a "
                                 f"directory.")
            cache_path = os.path.abspath(cache_dir)

        maybe_create_dir(cache_path)

        spm.SentencePieceTrainer.Train(cmd)
        cwd = os.getcwd()

        vocab_file = os.path.join(cwd, cls._VOCAB_FILE_NAMES['vocab_file'])
        out_vocab_file = os.path.join(
            cache_path, cls._VOCAB_FILE_NAMES['vocab_file'])

        if os.path.abspath(vocab_file) != os.path.abspath(out_vocab_file):
            move(vocab_file, out_vocab_file)

        # Delete spiece.vocab (We might want to keep it as well)
        extra_file = vocab_file.rstrip('model') + 'vocab'
        os.remove(extra_file)

        return cache_path
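
Note that SentencePieceTrainer.Train writes the {model_prefix}.model and {model_prefix}.vocab files relative to the current working directory (unless the prefix is an absolute path), which is why the snippet above moves the output into the cache path after training.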
github freewym / espresso / scripts / spm_train.py
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import, division, print_function, unicode_literals

import sys

import sentencepiece as spm


if __name__ == "__main__":
    spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
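
This thin wrapper simply forwards its command-line arguments to the trainer, so it can be invoked with the standard training flags, e.g. python scripts/spm_train.py --input=corpus.txt --model_prefix=spm --vocab_size=8000 --model_type=unigram (file names here are placeholders).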
github yyht / BERT / t2t_bert / data_generator / tokenization.py
'''
		https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb
		see from this tutorial for sentence piece training
		'''
		config = train_config if train_config else self.config
		param = ""
		param += "--input={} ".format(config["corpus"])
		param += "--model_prefix={} ".format(config["model_prefix"])
		param += "--vocab_size={} ".format(config["vocab_size"])
		param += "--model_type={} ".format(config.get("model_type", "unigram"))
		param += "--character_coverage={} ".format(config.get("character_coverage", 0.995))
		param += "--mining_sentence_size={} ".format(config.get("mining_sentence_size", 5000000))
		param += "--input_sentence_size={} ".format(config.get("input_sentence_size", 5000000))
		param += "--max_sentencepiece_length={} ".format(config.get("max_sentencepiece_length", 5))
		try:
			SentencePieceTrainer.Train(param)
			self.sp.Load(config["model_prefix"]+".model")
		except:
			raise ValueError(" training word piece model failed ")
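
For reference, a hypothetical config feeding the snippet above could look like the following; the keys mirror the config lookups shown:

# Hypothetical training config; keys match the config.get(...) calls above.
train_config = {
    "corpus": "corpus.txt",
    "model_prefix": "spm_bert",
    "vocab_size": 32000,
    "model_type": "unigram",
    "character_coverage": 0.995,
}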

sentencepiece: SentencePiece Python wrapper (Apache-2.0).