How to use the sentencepiece.SentencePieceProcessor class in sentencepiece

To help you get started, we’ve selected a few sentencepiece examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github castorini / d-bert / dbert / generate / compute_bpe_ratio.py View on Github external
def main():
    """Print the character-to-token compression ratio of the text on stdin.

    Every stdin line is stripped of surrounding whitespace and encoded with
    the SentencePiece model given by ``args.spm_model``. The ratio is the
    total number of characters divided by the total number of token ids.
    A running ratio is shown on the progress bar roughly 20 times.
    """
    parser = argparse.ArgumentParser()
    add_dict_options(parser, ARGS)
    args = parser.parse_args()

    tot_n = 0
    comp_n = 0
    sp = spm.SentencePieceProcessor()
    sp.Load(args.spm_model)

    lines = list(tqdm(sys.stdin))
    # Clamp to 1: with fewer than 20 lines the integer division yields 0 and
    # the modulo below would raise ZeroDivisionError.
    report_every = max(1, len(lines) // 20)
    pbar = tqdm(lines)
    for idx, line in enumerate(pbar):
        # str.strip() returns a new string; the result must be kept.
        line = line.strip()
        tot_n += len(line)
        comp_n += len(sp.EncodeAsIds(line))
        # Skip the update while no tokens have been seen (ratio undefined).
        if comp_n and (idx + 1) % report_every == 0:
            pbar.set_postfix(cp_ratio=f'{tot_n / comp_n:.4f}')
    if comp_n:
        print(f'{tot_n / comp_n} compression ratio')
    else:
        # Empty or all-blank input: report it instead of dividing by zero.
        print('no tokens produced; compression ratio is undefined',
              file=sys.stderr)
github monologg / DistilKoBERT / tokenization_kobert.py View on Github external
def __setstate__(self, d):
    """Restore instance state and reload the SentencePiece model.

    The instance ``__dict__`` is replaced by ``d``, then a fresh
    ``SentencePieceProcessor`` is created and loaded from
    ``self.vocab_file``.

    Raises:
        ImportError: if the ``sentencepiece`` package is not installed.
    """
    self.__dict__ = d
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece "
                       "pip install sentencepiece")
        # Re-raise: continuing would fail on the unbound name `spm` with a
        # confusing error that hides the real cause.
        raise
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(self.vocab_file)
github zihangdai / xlnet / run_classifier.py View on Github external
raise ValueError(
        "At least one of `do_train`, `do_eval, `do_predict` or "
        "`do_submit` must be True.")

  if not tf.gfile.Exists(FLAGS.output_dir):
    tf.gfile.MakeDirs(FLAGS.output_dir)

  task_name = FLAGS.task_name.lower()

  if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

  processor = processors[task_name]()
  label_list = processor.get_labels() if not FLAGS.is_regression else None

  sp = spm.SentencePieceProcessor()
  sp.Load(FLAGS.spiece_model_file)
  def tokenize_fn(text):
    """Preprocess `text` and encode it with the loaded SentencePiece model.

    `FLAGS.uncased` is forwarded as the `lower` option of `preprocess_text`;
    the result is passed to `encode_ids` together with the processor `sp`.
    """
    normalized = preprocess_text(text, lower=FLAGS.uncased)
    return encode_ids(sp, normalized)

  run_config = model_utils.configure_tpu(FLAGS)

  model_fn = get_model_fn(len(label_list) if label_list is not None else None)

  spm_basename = os.path.basename(FLAGS.spiece_model_file)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  if FLAGS.use_tpu:
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
github kamalkraj / ALBERT-TF2.0 / tokenization.py View on Github external
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    """Build the tokenizer from a SentencePiece model or a vocab file.

    When `spm_model_file` is truthy, the SentencePiece model is loaded and
    a piece-to-id vocabulary is derived from it. Otherwise the vocabulary
    is read with `load_vocab(vocab_file)` and the basic and wordpiece
    tokenizers are created. In both cases `inv_vocab` maps id back to token.
    """
    self.vocab = None
    self.sp_model = None
    if not spm_model_file:
      self.vocab = load_vocab(vocab_file)
      self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
      self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    else:
      self.sp_model = spm.SentencePieceProcessor()
      logging.info("loading sentence piece model")
      self.sp_model.Load(spm_model_file)
      # For the purpose of a consistent API, generate a vocabulary for the
      # sentence piece tokenizer as well.
      piece_to_id = {}
      for piece_id in range(self.sp_model.GetPieceSize()):
        piece_to_id[self.sp_model.IdToPiece(piece_id)] = piece_id
      self.vocab = piece_to_id
    id_to_token = {}
    for token, token_id in self.vocab.items():
      id_to_token[token_id] = token
    self.inv_vocab = id_to_token
github NVIDIA / Megatron-LM / data_utils / tokenization.py View on Github external
def load_spm_model(self):
    """Load the SentencePiece model and build the token/vocab tables.

    If `self.spm_model` neither exists on disk nor already ends in
    '.model', the suffix '.model' is appended before loading.
    """
    path_is_usable = (os.path.exists(self.spm_model)
                      or self.spm_model.endswith('.model'))
    if not path_is_usable:
        self.spm_model = self.spm_model + '.model'

    processor = spm.SentencePieceProcessor()
    self.sp = processor
    processor.Load(self.spm_model)

    piece_count = len(processor)
    self.vocab_size = piece_count
    self.num_text_tokens = piece_count

    self._tokens = [self.IdToToken(token_id)
                    for token_id in range(self.vocab_size)]
    # Reverse lookup: token -> index in `_tokens`.
    token_to_index = {}
    for index, token in enumerate(self._tokens):
        token_to_index[token] = index
    self._vocab = token_to_index
github stevezheng23 / xlnet_extension_tf / run_ner.py View on Github external
def __init__(self, sp_model_file, lower_case=False):
    """Construct XLNet tokenizer.

    Creates a SentencePiece processor, loads `sp_model_file` into it and
    records the `lower_case` setting on the instance.
    """
    processor = sp.SentencePieceProcessor()
    self.sp_processor = processor
    processor.Load(sp_model_file)
    self.lower_case = lower_case
github kmadathil / sanskrit_parser / sanskrit_parser / util / lexical_scorer.py View on Github external
def __init__(self):
    """Set up the logger, the SentencePiece processor and the Word2Vec model.

    `_get_file` is called for the sentencepiece and word2vec files with
    their URLs before either is loaded (presumably it fetches the file
    when needed; confirm against `_get_file`).
    """
    self.logger = logging.getLogger(__name__)

    self._get_file(self.sentencepiece_file, self.sentencepiece_file_url)
    self._get_file(self.word2vec_file, self.word2vec_file_url)

    tokenizer = spm.SentencePieceProcessor()
    self.sp = tokenizer
    tokenizer.Load(self.sentencepiece_file)

    self.model = gensim.models.Word2Vec.load(self.word2vec_file)
github lernapparat / lotranslate / classes / lotranslate_backend.py View on Github external
def __init__(self, path):
    """Create a SentencePiece processor and load the model at `path`."""
    processor = sentencepiece.SentencePieceProcessor()
    self.sp = processor
    processor.Load(path)
github monologg / DistilKoBERT / distilkobert / distilkobert / tokenization_kobert.py View on Github external
def __setstate__(self, d):
    """Restore instance state and reload the SentencePiece model.

    The instance ``__dict__`` is replaced by ``d``, then a fresh
    ``SentencePieceProcessor`` is created and loaded from
    ``self.vocab_file``.

    Raises:
        ImportError: if the ``sentencepiece`` package is not installed.
    """
    self.__dict__ = d
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece "
                       "pip install sentencepiece")
        # Re-raise: continuing would fail on the unbound name `spm` with a
        # confusing error that hides the real cause.
        raise
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(self.vocab_file)
github akanyaani / gpt-2-tensorflow2.0 / pre_process.py View on Github external
def create_tf_records(min_seq_len, max_seq_len, per_file_limit=50000):
	"""Encode the processed corpus line by line and write examples to a TFRecord file.

	Each line of PROCESS_DATA_PATH is encoded to ids with the SentencePiece
	model at BPE_MODEL_PATH + ".model". Lines whose encoded length is
	strictly between min_seq_len and max_seq_len are serialized and written.

	NOTE(review): in the visible code `per_file_limit` is never read,
	`doc_counts` is only incremented, and the writer is never closed. This
	excerpt looks truncated; confirm against the full source before changing it.
	"""
	print("Creating TF Records...............")
	s = spm.SentencePieceProcessor()
	s.Load(BPE_MODEL_PATH + ".model")
	# Make sure the output directory exists before creating the writer.
	if not os.path.exists(TF_RECORDS):
		os.makedirs(TF_RECORDS)
	# The output file name is the current timestamp appended to TF_RECORDS.
	filename = TF_RECORDS + str(datetime.datetime.now().timestamp()) + ".tfrecord"
	tf_writer = tf.io.TFRecordWriter(filename)
	doc_counts = 0
	with open(PROCESS_DATA_PATH, 'r') as f:
		for line in tqdm.tqdm(f):
			encoded_id = s.encode_as_ids(line)
			# Keep only sequences with min_seq_len < length < max_seq_len.
			if max_seq_len > len(encoded_id) > min_seq_len:
				# Inputs are prefixed with BOS_ID; targets are suffixed with EOS_ID.
				inputs = np.array([BOS_ID] + encoded_id)
				targets = np.array(encoded_id + [EOS_ID])

				example = serialize_example(inputs, targets)
				tf_writer.write(example)
				doc_counts += 1

sentencepiece

SentencePiece python wrapper

Apache-2.0
Latest version published 9 months ago

Package Health Score

91 / 100
Full package analysis