How to use sentencepiece - 10 common examples

To help you get started, we’ve selected a few sentencepiece examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github castorini / d-bert / dbert / generate / compute_bpe_ratio.py View on Github external
def main():
    parser = argparse.ArgumentParser()
    add_dict_options(parser, ARGS)
    args = parser.parse_args()

    tot_n = 0
    comp_n = 0
    sp = spm.SentencePieceProcessor()
    sp.Load(args.spm_model)

    lines = list(tqdm(sys.stdin))
    pbar = tqdm(lines)
    for idx, line in enumerate(pbar):
        line.strip()
        tot_n += len(line)
        comp_n += len(sp.EncodeAsIds(line))
        if (idx + 1) % (len(lines) // 20) == 0:
            pbar.set_postfix(cp_ratio=f'{tot_n / comp_n:.4f}')
    print(f'{tot_n / comp_n} compression ratio')
github espnet / espnet / test / espnet2 / utils / test_text_converter.py View on Github external
def spm_srcs(tmp_path: Path):
    input_text = tmp_path / "text"
    vocabsize = len(string.ascii_letters) + 4
    model_prefix = tmp_path / "model"
    model = str(model_prefix) + ".model"
    input_sentence_size = 100000

    with input_text.open("w") as f:
        f.write(string.ascii_letters + "\n")

    spm.SentencePieceTrainer.Train(
        f"--input={input_text} "
        f"--vocab_size={vocabsize} "
        f"--model_prefix={model_prefix} "
        f"--input_sentence_size={input_sentence_size}"
    )
    sp = spm.SentencePieceProcessor()
    sp.load(model)

    with input_text.open("r") as f:
        vocabs = {"", "▁"}
        for line in f:
            tokens = sp.DecodePieces(list(line.strip()))
        vocabs |= set(tokens)
    return model, vocabs
github facebookresearch / wav2letter / recipes / models / seq2seq_tds / librispeech / prepare.py View on Github external
transcription = line.strip().split(" ")[3:]
                        if key == "train":
                            ftext.write(" ".join(transcription) + "\n")
                        word_dict[key].update(transcription)
    lexicon_words = sorted(word_dict["train"] | word_dict["dev"])

    # train
    print("Computing word pieces...\n", flush=True)
    train_cmd = (
        "--input={input} --model_prefix={prefix} --vocab_size={sz}"
        " --character_coverage=1.0 --model_type=unigram"
        " --split_by_unicode_script=false".format(
            input=train_all_text, prefix=prefix, sz=num_wordpieces
        )
    )
    spm.SentencePieceTrainer.Train(train_cmd)

    # word piece dictionary
    print("Creating word piece list...\n", flush=True)
    exclude_list = {"", "<s>", "</s>"}
    with open(vocab_name.replace(".vocab", ".tokens"), "w") as fvocab_filt:
        with open(vocab_name, "r", encoding="utf-8") as fvocab:
            for line in fvocab:
                val, _ = line.strip().split("\t", 1)
                if val not in exclude_list:
                    fvocab_filt.write(val.replace("\u2581", "_") + "\n")

    # word -&gt; word piece lexicon for loading targets
    print("Creating word -&gt; word pieces lexicon...\n", flush=True)
    sp = spm.SentencePieceProcessor()
    sp.Load(model_name)
    lexicon_name = "librispeech-train+dev-unigram-{sz}-nbest{n}.lexicon".format(
github Koziev / chatbot / ruchatbot / experiments / train_synonymy_detector_xgb_pairwise_ranking.py View on Github external
def fit(self, phrases):
        sp_corpus_path = os.path.join(tmp_dir, 'new_synonymy_detector.sentence_piece_corpus.txt')
        if not os.path.exists(sp_corpus_path):
            with io.open(sp_corpus_path, 'w', encoding='utf-8') as wrt:
                for phrase in phrases:
                    wrt.write(u'{}\n'.format(phrase))

        sp_model_name = 'new_synonymy_detector_{}'.format(self.vocab_size)
        if not os.path.exists(os.path.join(tmp_dir, sp_model_name + '.vocab')):
            logging.info('Start training SentencePiece for vocab_size={}'.format(self.vocab_size))
            spm.SentencePieceTrainer.Train(
                '--input={} --model_prefix={} --vocab_size={} --model_type=bpe'.format(sp_corpus_path, sp_model_name, self.vocab_size))
            os.rename(sp_model_name + '.vocab', os.path.join(tmp_dir, sp_model_name + '.vocab'))
            os.rename(sp_model_name + '.model', os.path.join(tmp_dir, sp_model_name + '.model'))

        self.splitter = spm.SentencePieceProcessor()
        self.splitter.Load(os.path.join(tmp_dir, sp_model_name + '.model'))

        pieces = set()
        for phrase in phrases:
            px = self.splitter.EncodeAsPieces(phrase)
            pieces.update(px)

        self.piece2index = dict((piece, i) for i, piece in enumerate(pieces))
        self.nb_shingles = len(self.piece2index)
github ratsgo / embedding / preprocess / unsupervised_nlputils.py View on Github external
def make_bert_vocab(input_fname, output_fname):
    train = '--input=' + input_fname + ' --model_prefix=sentpiece --vocab_size=32000 --model_type=bpe --character_coverage=0.9995'
    spm.SentencePieceTrainer.Train(train)
    with open('sentpiece.vocab', 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        f2.writelines("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for line in f1:
            word = line.replace('\n', '').split('\t')[0].replace('▁', '##')
            if not word or word in ["##", "", "<s>", "</s>"]: continue
            f2.writelines(word + "\n")
github akanyaani / gpt-2-tensorflow2.0 / pre_process.py View on Github external
def train_byte_pair_encoding(vocab_size):
	print("Training BytePair encoding......")
	token_dict = Counter()
	with open(PROCESS_DATA_PATH, 'r') as fr:
		for line in tqdm.tqdm(fr):
			token_dict.update(line.lower().split())

	with open(BPE_TSV_PATH, 'w', newline='') as f_output:
		tsv_output = csv.writer(f_output, delimiter='\t')
		for word in token_dict:
			tsv_output.writerow([word, token_dict[word]])

	spmcmd = '--input={spm_input} --model_prefix={spm_model} --input_format=tsv --vocab_size={vocab_size} --user_defined_symbols=[SEP],[BOS],[EOS] --hard_vocab_limit=false --model_type=bpe --pad_id=0 --unk_id=1 --bos_id=-1 --eos_id=-1 --pad_piece=[PAD] --unk_piece=[UNK]'.format(
		spm_input=BPE_TSV_PATH, spm_model=BPE_MODEL_PATH, vocab_size=vocab_size)
	spm.SentencePieceTrainer.train(spmcmd)
github monologg / DistilKoBERT / tokenization_kobert.py View on Github external
def __setstate__(self, d):
        self.__dict__ = d
        try:
            import sentencepiece as spm
        except ImportError:
            logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece"
                           "pip install sentencepiece")
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)
github zihangdai / xlnet / run_classifier.py View on Github external
raise ValueError(
        "At least one of `do_train`, `do_eval, `do_predict` or "
        "`do_submit` must be True.")

  if not tf.gfile.Exists(FLAGS.output_dir):
    tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()
  label_list = processor.get_labels() if not FLAGS.is_regression else None

  sp = spm.SentencePieceProcessor()
  sp.Load(FLAGS.spiece_model_file)
  def tokenize_fn(text):
    text = preprocess_text(text, lower=FLAGS.uncased)
    return encode_ids(sp, text)

  run_config = model_utils.configure_tpu(FLAGS)

  model_fn = get_model_fn(len(label_list) if label_list is not None else None)

  spm_basename = os.path.basename(FLAGS.spiece_model_file)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  if FLAGS.use_tpu:
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
github kamalkraj / ALBERT-TF2.0 / tokenization.py View on Github external
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
      self.sp_model = spm.SentencePieceProcessor()
      logging.info("loading sentence piece model")
      self.sp_model.Load(spm_model_file)
      # Note(mingdachen): For the purpose of consisent API, we are
      # generating a vocabulary for the sentence piece tokenizer.
      self.vocab = {self.sp_model.IdToPiece(i): i for i
                    in range(self.sp_model.GetPieceSize())}
    else:
      self.vocab = load_vocab(vocab_file)
      self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
      self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
github NVIDIA / Megatron-LM / data_utils / tokenization.py View on Github external
def load_spm_model(self):
        """load sentencepiece model and parse vocab"""
        if not os.path.exists(self.spm_model) and not self.spm_model.endswith('.model'):
            self.spm_model = self.spm_model+'.model'
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(self.spm_model)
        self.vocab_size = self.num_text_tokens = len(self.sp)
        self._tokens = [self.IdToToken(t) for t in range(self.vocab_size)]
        self._vocab = {t: i for i,t in enumerate(self._tokens)}

sentencepiece

SentencePiece python wrapper

Apache-2.0
Latest version published 10 months ago

Package Health Score

90 / 100
Full package analysis