Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def main():
    """Print the character-to-token compression ratio of stdin under a SentencePiece model.

    Every line read from standard input is stripped of surrounding
    whitespace and encoded with the model at ``args.spm_model``. The ratio
    is (total characters) / (total token ids); a running value is shown in
    the progress bar and the final value is printed.

    Empty input (or input that yields no tokens) prints ``nan`` as the
    ratio instead of raising ``ZeroDivisionError``.
    """
    parser = argparse.ArgumentParser()
    add_dict_options(parser, ARGS)
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.spm_model)

    # stdin is materialised so the second progress bar knows its total.
    lines = list(tqdm(sys.stdin))

    # Refresh the postfix about 20 times over the run. Clamp to 1 so inputs
    # with fewer than 20 lines do not cause a modulo-by-zero.
    report_every = max(1, len(lines) // 20)

    tot_n = 0
    comp_n = 0
    pbar = tqdm(lines)
    for idx, line in enumerate(pbar):
        # str.strip() returns a new string; the result must be kept,
        # otherwise the trailing newline inflates the character count.
        line = line.strip()
        tot_n += len(line)
        comp_n += len(sp.EncodeAsIds(line))
        if (idx + 1) % report_every == 0 and comp_n:
            pbar.set_postfix(cp_ratio=f'{tot_n / comp_n:.4f}')

    # No tokens means the ratio is undefined; report nan rather than crash.
    ratio = tot_n / comp_n if comp_n else float('nan')
    print(f'{ratio} compression ratio')
def __setstate__(self, d):
    """Restore instance state and reload the SentencePiece model.

    Args:
        d: Attribute dictionary installed as ``self.__dict__``. It must
            provide ``vocab_file``, which is read to reload the model.

    Raises:
        ImportError: If ``sentencepiece`` is not installed. A warning with
            install instructions is logged first.
    """
    self.__dict__ = d
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece "
                       "pip install sentencepiece")
        # Propagate the failure: continuing would hit an unbound ``spm``
        # below and mask the real cause with an UnboundLocalError.
        raise
    # NOTE(review): presumably ``sp_model`` is dropped before pickling and
    # therefore rebuilt here - confirm against the matching __getstate__.
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(self.vocab_file)
raise ValueError(
"At least one of `do_train`, `do_eval, `do_predict` or "
"`do_submit` must be True.")
if not tf.gfile.Exists(FLAGS.output_dir):
tf.gfile.MakeDirs(FLAGS.output_dir)
task_name = FLAGS.task_name.lower()
if task_name not in processors:
raise ValueError("Task not found: %s" % (task_name))
processor = processors[task_name]()
label_list = processor.get_labels() if not FLAGS.is_regression else None
sp = spm.SentencePieceProcessor()
sp.Load(FLAGS.spiece_model_file)
def tokenize_fn(text):
    """Preprocess ``text`` and encode it to ids with the enclosing ``sp`` model.

    Lower-casing during preprocessing is controlled by ``FLAGS.uncased``.
    """
    cleaned = preprocess_text(text, lower=FLAGS.uncased)
    return encode_ids(sp, cleaned)
run_config = model_utils.configure_tpu(FLAGS)
model_fn = get_model_fn(len(label_list) if label_list is not None else None)
spm_basename = os.path.basename(FLAGS.spiece_model_file)
# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
if FLAGS.use_tpu:
estimator = tf.contrib.tpu.TPUEstimator(
use_tpu=FLAGS.use_tpu,
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    """Set up the tokenizer from a SentencePiece model or a vocab file.

    Args:
        vocab_file: Vocabulary file passed to ``load_vocab``; only used
            when ``spm_model_file`` is falsy.
        do_lower_case: Forwarded to ``BasicTokenizer`` on the vocab-file
            path.
        spm_model_file: Optional SentencePiece model path. When given, the
            vocabulary is derived from the model's pieces.
    """
    # NOTE(review): the source's indentation was lost; the basic/wordpiece
    # tokenizers are assumed to belong to the vocab-file branch only.
    self.vocab = None
    self.sp_model = None
    if not spm_model_file:
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    else:
        self.sp_model = spm.SentencePieceProcessor()
        logging.info("loading sentence piece model")
        self.sp_model.Load(spm_model_file)
        # Note(mingdachen): For the purpose of a consistent API, we are
        # generating a vocabulary for the sentence piece tokenizer.
        piece_count = self.sp_model.GetPieceSize()
        self.vocab = {
            self.sp_model.IdToPiece(piece_id): piece_id
            for piece_id in range(piece_count)
        }
    # Reverse lookup (id -> token), built for either vocabulary source.
    self.inv_vocab = {
        token_id: token for token, token_id in self.vocab.items()
    }
def load_spm_model(self):
    """Load the SentencePiece model and build the token lookup tables.

    Reads ``self.spm_model`` as the model path and populates ``self.sp``,
    ``self.vocab_size``, ``self.num_text_tokens``, ``self._tokens`` and
    ``self._vocab``.
    """
    model_path = self.spm_model
    # Append '.model' only when the path neither exists as given nor
    # already carries that extension.
    if not (os.path.exists(model_path) or model_path.endswith('.model')):
        model_path = model_path + '.model'
        self.spm_model = model_path

    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(model_path)

    size = len(self.sp)
    self.vocab_size = size
    self.num_text_tokens = size

    # _tokens maps id -> token; _vocab is the inverse token -> id table.
    self._tokens = [self.IdToToken(token_id) for token_id in range(size)]
    self._vocab = dict(zip(self._tokens, range(size)))
def __init__(self, sp_model_file, lower_case=False):
    """Construct the XLNet tokenizer.

    Args:
        sp_model_file: Path of the SentencePiece model to load.
        lower_case: Stored as ``self.lower_case``; not otherwise used here.
    """
    self.lower_case = lower_case
    processor = sp.SentencePieceProcessor()
    processor.Load(sp_model_file)
    self.sp_processor = processor
def __init__(self):
    """Prepare the SentencePiece tokenizer and the Word2Vec model.

    ``_get_file`` is called for each (file, url) pair before anything is
    loaded - presumably it downloads the file when missing; confirm
    against its definition.
    """
    self.logger = logging.getLogger(__name__)

    # Handle both artifacts before loading either of them.
    self._get_file(self.sentencepiece_file, self.sentencepiece_file_url)
    self._get_file(self.word2vec_file, self.word2vec_file_url)

    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(self.sentencepiece_file)
    self.sp = tokenizer

    self.model = gensim.models.Word2Vec.load(self.word2vec_file)
def __init__(self, path):
    """Create a SentencePiece processor and load the model at ``path``."""
    processor = sentencepiece.SentencePieceProcessor()
    processor.Load(path)
    self.sp = processor
def __setstate__(self, d):
    """Restore instance state and reload the SentencePiece model.

    Args:
        d: Attribute dictionary installed as ``self.__dict__``. It must
            provide ``vocab_file``, which is read to reload the model.

    Raises:
        ImportError: If ``sentencepiece`` is not installed. A warning with
            install instructions is logged first.
    """
    self.__dict__ = d
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning("You need to install SentencePiece to use KoBertTokenizer: https://github.com/google/sentencepiece "
                       "pip install sentencepiece")
        # Propagate the failure: continuing would hit an unbound ``spm``
        # below and mask the real cause with an UnboundLocalError.
        raise
    # NOTE(review): presumably ``sp_model`` is dropped before pickling and
    # therefore rebuilt here - confirm against the matching __getstate__.
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(self.vocab_file)
def create_tf_records(min_seq_len, max_seq_len, per_file_limit=50000):
print("Creating TF Records...............")
s = spm.SentencePieceProcessor()
s.Load(BPE_MODEL_PATH + ".model")
if not os.path.exists(TF_RECORDS):
os.makedirs(TF_RECORDS)
filename = TF_RECORDS + str(datetime.datetime.now().timestamp()) + ".tfrecord"
tf_writer = tf.io.TFRecordWriter(filename)
doc_counts = 0
with open(PROCESS_DATA_PATH, 'r') as f:
for line in tqdm.tqdm(f):
encoded_id = s.encode_as_ids(line)
if max_seq_len > len(encoded_id) > min_seq_len:
inputs = np.array([BOS_ID] + encoded_id)
targets = np.array(encoded_id + [EOS_ID])
example = serialize_example(inputs, targets)
tf_writer.write(example)
doc_counts += 1