How to use the sentencepiece.SentencePieceTrainer function in sentencepiece

To help you get started, we’ve selected a few sentencepiece examples based on popular ways the library is used in public projects.

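Every example below follows the same basic pattern: assemble a flag string, pass it to SentencePieceTrainer.Train, then load the resulting .model file with SentencePieceProcessor. Here is a minimal, self-contained sketch of that pattern; the corpus path, model prefix, and vocabulary size are placeholders rather than values taken from the projects below.

import sentencepiece as spm

# Train a small model from a plain-text corpus with one sentence per line.
# "corpus.txt", the "toy" prefix, and vocab_size=1000 are illustrative values.
spm.SentencePieceTrainer.Train(
    "--input=corpus.txt "
    "--model_prefix=toy "
    "--vocab_size=1000 "
    "--model_type=unigram"
)

# Training writes toy.model and toy.vocab; load the model to tokenize text.
sp = spm.SentencePieceProcessor()
sp.Load("toy.model")
print(sp.EncodeAsPieces("Hello world"))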

github espnet / espnet / test / espnet2 / utils / test_text_converter.py
def spm_srcs(tmp_path: Path):
    input_text = tmp_path / "text"
    vocabsize = len(string.ascii_letters) + 4
    model_prefix = tmp_path / "model"
    model = str(model_prefix) + ".model"
    input_sentence_size = 100000

    with input_text.open("w") as f:
        f.write(string.ascii_letters + "\n")

    spm.SentencePieceTrainer.Train(
        f"--input={input_text} "
        f"--vocab_size={vocabsize} "
        f"--model_prefix={model_prefix} "
        f"--input_sentence_size={input_sentence_size}"
    )
    sp = spm.SentencePieceProcessor()
    sp.load(model)

    with input_text.open("r") as f:
        vocabs = {"", "▁"}
        for line in f:
            tokens = sp.DecodePieces(list(line.strip()))
        vocabs |= set(tokens)
    return model, vocabs
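
Recent sentencepiece releases also accept the trainer options as keyword arguments instead of a single flag string. The sketch below shows the same call in that style, under the assumption of a recent sentencepiece version; the paths mirror the placeholders used in the test above.

import sentencepiece as spm

# Keyword-argument form of the training call above; recent sentencepiece
# releases accept these options directly in addition to a single flag string.
spm.SentencePieceTrainer.Train(
    input="text",                 # placeholder corpus path
    model_prefix="model",         # placeholder model prefix
    vocab_size=56,                # e.g. len(string.ascii_letters) + 4
    input_sentence_size=100000,
)
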
github ratsgo / embedding / preprocess / unsupervised_nlputils.py
def make_bert_vocab(input_fname, output_fname):
    train = '--input=' + input_fname + ' --model_prefix=sentpiece --vocab_size=32000 --model_type=bpe --character_coverage=0.9995'
    spm.SentencePieceTrainer.Train(train)
    with open('sentpiece.vocab', 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        f2.writelines("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for line in f1:
            word = line.replace('\n', '').split('\t')[0].replace('▁', '##')
            if not word or word in ["##", "", "<s>", "</s>"]: continue
            f2.writelines(word + "\n")
github NVIDIA / sentiment-discovery / data_utils / tokenization.py
        if use_model_path is None:
            use_model_path = random_hash
        if use_model_path.endswith('.model'):
            use_model_path = use_model_path[:use_model_path.rfind('.model')]
        input_path = use_model_path+'.txt.'+random_hash
        print('Writing temporary dataset for tokenization to '+input_path)
        line_count, maxlenline = write_corpus_as_lines(corpus, input_path)
        line_count = min(line_count, MAX_SENTENCEPIECE_SENTENCES)
        print('Training sentencepiece model')
        train_string = '--input={file_path} --model_prefix={model_prefix} --vocab_size={vocab_size}' \
            + ' --model_type={model_type} --input_sentence_size={input_sentence_size} --character_coverage={character_coverage} ' \
            + '--max_sentence_length=17608'
        train_string = train_string.format(file_path=input_path, model_prefix=use_model_path, vocab_size=num_text_tokens,
                            model_type=self.model_type, input_sentence_size=int(line_count), character_coverage=self.character_coverage,#)#,
                            max_len=str(maxlenline))
        spm.SentencePieceTrainer.Train(train_string)
        os.remove(input_path)
        self.spm_model = use_model_path+'.model'
        print('Sentencepiece model written to '+self.spm_model)
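
However the flag string is assembled, the trained model can be checked directly once the .model file exists. A short sketch, with a placeholder model path:

import sentencepiece as spm

# Load a freshly trained model file and run a quick sanity check.
sp = spm.SentencePieceProcessor()
sp.Load("my_tokenizer.model")               # placeholder path
print(sp.GetPieceSize())                    # should equal the requested vocab_size
print(sp.EncodeAsPieces("a quick check"))   # subword pieces
print(sp.EncodeAsIds("a quick check"))      # the corresponding ids
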
github NVIDIA / NeMo / examples / nlp / scripts / create_vocab.py
        filepaths = glob.glob(os.path.join(args.dataset_dir, "*.txt"))
        print("Found {} files, concatenating dataset into one file..."
              .format(len(filepaths)))

        with open(MERGED_FILE, "w") as f:
            for filepath in tqdm(filepaths):
                f.write(open(filepath, "r", errors="ignore").read())

        train_path = MERGED_FILE
    elif args.train_path is not None:
        train_path = args.train_path
    else:
        print("One of 'dataset_dir' and 'train_path' must be specified")
        return

    SPT.Train("--input={} ".format(train_path) +
              "--model_prefix={} ".format(args.model_prefix) +
              "--vocab_size={} ".format(args.vocab_size
                                        - args.num_placeholders) +
              "--input_sentence_size={} ".format(args.sample_size) +
              "--shuffle_input_sentence=true " +
              "--hard_vocab_limit=false " +
              "--bos_id=-1 " +
              "--eos_id=-1")

    # Add BERT control symbols
    vocab = ["[PAD]"]
    tokens = []

    with open("{}.vocab".format(args.model_prefix), "r") as f:
        # Skip first <unk> token
        f.seek(8)
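
The .vocab file written next to the model lists one piece and its score per line, separated by a tab; the f.seek(8) above simply skips the first entry. A sketch of reading the whole file without relying on a byte offset, using a placeholder file name:

# Parse a <model_prefix>.vocab file: one "piece<TAB>score" pair per line.
pieces = []
with open("my_model.vocab", "r", encoding="utf-8") as f:    # placeholder name
    for line in f:
        piece, _score = line.rstrip("\n").split("\t")
        pieces.append(piece)
print(len(pieces), pieces[:5])
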
github fastai / fastai_dev / dev / fastai2 / text / core.py
def train(self, raw_text_path):
        "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
        from sentencepiece import SentencePieceTrainer
        vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz
        spec_tokens = ['\u2581'+s for s in self.special_toks]
        SentencePieceTrainer.Train(" ".join([
            f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
            f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
            f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
            f"--user_defined_symbols={','.join(spec_tokens)}"]))
        raw_text_path.unlink()
        return self.cache_dir/'spm.model'
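
Because the special tokens are passed through --user_defined_symbols, each one should survive as a single piece with its own id in the trained model. A hedged sketch of verifying that, with a placeholder model path and token name:

import sentencepiece as spm

# Check that a user-defined special token survives as a single piece.
sp = spm.SentencePieceProcessor()
sp.Load("spm.model")                    # placeholder path to the trained model
print(sp.PieceToId("\u2581xxbos"))      # hypothetical special token name
print(sp.unk_id())                      # set explicitly by --unk_id above
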
github adobe / NLP-Cube / style / create_bpe_model.py
print("Creating BPE model...")
all_lines = []
for speaker in speakers:
    for f in speakers[speaker]:
        with open(f, "r", encoding="utf8") as tf:
            all_lines.append(tf.read())
with open("temp.txt","w",encoding="utf8") as f:
    for line in all_lines:
        f.write(line)

if not os.path.exists(output_lookup_folder):
    os.makedirs(output_lookup_folder)

# TRAIN SENTENCEPIECE MODELS & CREATE LOOKUPS

spm.SentencePieceTrainer.Train('--input=temp.txt --model_prefix=' + os.path.join(output_lookup_folder, "tok")+ ' --character_coverage=1.0 --model_type=bpe --num_threads=8 --split_by_whitespace=true --shuffle_input_sentence=true --max_sentence_length=8000 --vocab_size=' + str(
    vocab_size))
print("Done.")
lookup = Lookup(type="bpe")
lookup.save_special_tokens(file_prefix=os.path.join(output_lookup_folder, "tok"))

# check everything is ok
lookup = Lookup(type="bpe")
lookup.load(file_prefix=os.path.join(output_lookup_folder,"tok"))
text = "This is a simple test."

token_ids = lookup.encode(text)
print("Encode: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode: [{}]".format(recreated_string))
print("Map w2i:")
tokens = lookup.tokenize(text)
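
The same round-trip check can be run against the raw BPE model without the project-specific Lookup wrapper. A sketch using the model prefix trained above:

import os
import sentencepiece as spm

# Sanity-check the trained BPE model directly with SentencePieceProcessor.
sp = spm.SentencePieceProcessor()
sp.Load(os.path.join(output_lookup_folder, "tok.model"))
ids = sp.EncodeAsIds("This is a simple test.")
print("Encode: {}".format(ids))
print("Decode: [{}]".format(sp.DecodeIds(ids)))
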
github ratsgo / embedding / preprocess / unsupervised_nlputils.py
def make_xlnet_vocab(input_fname, output_fname):
    SPM_COMMAND = ('--input={} '
                   '--model_prefix={} '
                   '--vocab_size={} '
                   '--character_coverage={} '
                   '--shuffle_input_sentence=true '
                   '--model_type=unigram '
                   '--control_symbols=,,,
github Kyubyong / transformer / prepro.py
    _write(prepro_train1, "iwslt2016/prepro/train.de")
    _write(prepro_train2, "iwslt2016/prepro/train.en")
    _write(prepro_train1+prepro_train2, "iwslt2016/prepro/train")
    _write(prepro_eval1, "iwslt2016/prepro/eval.de")
    _write(prepro_eval2, "iwslt2016/prepro/eval.en")
    _write(prepro_test1, "iwslt2016/prepro/test.de")
    _write(prepro_test2, "iwslt2016/prepro/test.en")

    logging.info("# Train a joint BPE model with sentencepiece")
    os.makedirs("iwslt2016/segmented", exist_ok=True)
    train = '--input=iwslt2016/prepro/train --pad_id=0 --unk_id=1 \
             --bos_id=2 --eos_id=3\
             --model_prefix=iwslt2016/segmented/bpe --vocab_size={} \
             --model_type=bpe'.format(hp.vocab_size)
    spm.SentencePieceTrainer.Train(train)

    logging.info("# Load trained bpe model")
    sp = spm.SentencePieceProcessor()
    sp.Load("iwslt2016/segmented/bpe.model")

    logging.info("# Segment")
    def _segment_and_write(sents, fname):
        with open(fname, "w") as fout:
            for sent in sents:
                pieces = sp.EncodeAsPieces(sent)
                fout.write(" ".join(pieces) + "\n")

    _segment_and_write(prepro_train1, "iwslt2016/segmented/train.de.bpe")
    _segment_and_write(prepro_train2, "iwslt2016/segmented/train.en.bpe")
    _segment_and_write(prepro_eval1, "iwslt2016/segmented/eval.de.bpe")
    _segment_and_write(prepro_eval2, "iwslt2016/segmented/eval.en.bpe")
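
At inference time the same bpe.model maps pieces back to the original sentence. A short sketch with a placeholder sample sentence:

import sentencepiece as spm

# Round-trip a sentence through the joint BPE model trained above.
sp = spm.SentencePieceProcessor()
sp.Load("iwslt2016/segmented/bpe.model")
pieces = sp.EncodeAsPieces("Ein kleiner Test.")   # placeholder sample sentence
print(pieces)
print(sp.DecodePieces(pieces))                    # reconstructs the input text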
