import argparse
import sys

import sentencepiece as spm
from tqdm import tqdm


def main():
    parser = argparse.ArgumentParser()
    # add_dict_options / ARGS are defined elsewhere in the original project
    add_dict_options(parser, ARGS)
    args = parser.parse_args()

    tot_n = 0
    comp_n = 0
    sp = spm.SentencePieceProcessor()
    sp.Load(args.spm_model)

    # Read all of stdin up front so the total line count is known.
    lines = list(tqdm(sys.stdin))
    pbar = tqdm(lines)
    for idx, line in enumerate(pbar):
        line = line.strip()
        tot_n += len(line)                   # characters in the raw line
        comp_n += len(sp.EncodeAsIds(line))  # SentencePiece ids for the line
        # Refresh the running compression ratio roughly every 5% of the input.
        if (idx + 1) % max(len(lines) // 20, 1) == 0:
            pbar.set_postfix(cp_ratio=f'{tot_n / comp_n:.4f}')
    print(f'{tot_n / comp_n} compression ratio')
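For quick experiments, the same character-to-id compression ratio can be computed without the CLI wrapper. A minimal sketch, assuming a trained model at a hypothetical path model.model:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("model.model")  # hypothetical path to a trained SentencePiece model

text = "this is a short sample sentence"
ids = sp.EncodeAsIds(text)
print(f"{len(text) / len(ids):.4f} compression ratio")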
import string
from pathlib import Path

import sentencepiece as spm


def spm_srcs(tmp_path: Path):
    input_text = tmp_path / "text"
    # One piece per ASCII letter, plus headroom for the special symbols.
    vocabsize = len(string.ascii_letters) + 4
    model_prefix = tmp_path / "model"
    model = str(model_prefix) + ".model"
    input_sentence_size = 100000

    with input_text.open("w") as f:
        f.write(string.ascii_letters + "\n")

    spm.SentencePieceTrainer.Train(
        f"--input={input_text} "
        f"--vocab_size={vocabsize} "
        f"--model_prefix={model_prefix} "
        f"--input_sentence_size={input_sentence_size}"
    )
    sp = spm.SentencePieceProcessor()
    sp.Load(model)

    # Collect every surface form the model produces for the training text.
    with input_text.open("r") as f:
        vocabs = {"", "▁"}
        for line in f:
            tokens = sp.DecodePieces(list(line.strip()))
            vocabs |= set(tokens)
    return model, vocabs
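A rough usage sketch for the helper above, assuming a writable scratch directory (the Path argument plays the role of pytest's tmp_path fixture here):

from pathlib import Path

scratch = Path("/tmp/spm_srcs_demo")  # hypothetical scratch directory
scratch.mkdir(parents=True, exist_ok=True)
model, vocabs = spm_srcs(scratch)
print(model, len(vocabs))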
    transcription = line.strip().split(" ")[3:]
    if key == "train":
        ftext.write(" ".join(transcription) + "\n")
    word_dict[key].update(transcription)

lexicon_words = sorted(word_dict["train"] | word_dict["dev"])

# train
print("Computing word pieces...\n", flush=True)
train_cmd = (
    "--input={input} --model_prefix={prefix} --vocab_size={sz}"
    " --character_coverage=1.0 --model_type=unigram"
    " --split_by_unicode_script=false".format(
        input=train_all_text, prefix=prefix, sz=num_wordpieces
    )
)
spm.SentencePieceTrainer.Train(train_cmd)

# word piece dictionary
print("Creating word piece list...\n", flush=True)
exclude_list = {"", "<s>", "</s>"}
with open(vocab_name.replace(".vocab", ".tokens"), "w") as fvocab_filt:
    with open(vocab_name, "r", encoding="utf-8") as fvocab:
        for line in fvocab:
            val, _ = line.strip().split("\t", 1)
            if val not in exclude_list:
                fvocab_filt.write(val.replace("\u2581", "_") + "\n")

# word -> word piece lexicon for loading targets
print("Creating word -> word pieces lexicon...\n", flush=True)
sp = spm.SentencePieceProcessor()
sp.Load(model_name)
lexicon_name = "librispeech-train+dev-unigram-{sz}-nbest{n}.lexicon".format(
def fit(self, phrases):
    # Dump the training phrases to a plain-text corpus for the SentencePiece trainer.
    sp_corpus_path = os.path.join(tmp_dir, 'new_synonymy_detector.sentence_piece_corpus.txt')
    if not os.path.exists(sp_corpus_path):
        with io.open(sp_corpus_path, 'w', encoding='utf-8') as wrt:
            for phrase in phrases:
                wrt.write(u'{}\n'.format(phrase))

    sp_model_name = 'new_synonymy_detector_{}'.format(self.vocab_size)
    if not os.path.exists(os.path.join(tmp_dir, sp_model_name + '.vocab')):
        logging.info('Start training SentencePiece for vocab_size={}'.format(self.vocab_size))
        spm.SentencePieceTrainer.Train(
            '--input={} --model_prefix={} --vocab_size={} --model_type=bpe'.format(
                sp_corpus_path, sp_model_name, self.vocab_size))
        # The trainer writes its outputs to the current directory; move them into tmp_dir.
        os.rename(sp_model_name + '.vocab', os.path.join(tmp_dir, sp_model_name + '.vocab'))
        os.rename(sp_model_name + '.model', os.path.join(tmp_dir, sp_model_name + '.model'))

    self.splitter = spm.SentencePieceProcessor()
    self.splitter.Load(os.path.join(tmp_dir, sp_model_name + '.model'))

    # Index every piece observed over the training phrases.
    pieces = set()
    for phrase in phrases:
        px = self.splitter.EncodeAsPieces(phrase)
        pieces.update(px)
    self.piece2index = dict((piece, i) for i, piece in enumerate(pieces))
    self.nb_shingles = len(self.piece2index)
import sentencepiece as spm


def make_bert_vocab(input_fname, output_fname):
    train = ('--input=' + input_fname + ' --model_prefix=sentpiece --vocab_size=32000'
             ' --model_type=bpe --character_coverage=0.9995')
    spm.SentencePieceTrainer.Train(train)

    # Convert the SentencePiece vocab into a BERT-style vocab file.
    with open('sentpiece.vocab', 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        f2.writelines("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for line in f1:
            word = line.replace('\n', '').split('\t')[0].replace('▁', '##')
            if not word or word in ["##", "", "<s>", "</s>"]:
                continue
            f2.writelines(word + "\n")
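Calling it might look like the line below, with hypothetical file names; note the trainer writes sentpiece.model and sentpiece.vocab into the current working directory:

make_bert_vocab("corpus.txt", "bert_vocab.txt")  # hypothetical input corpus and output vocab paths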
import csv
from collections import Counter

import sentencepiece as spm
import tqdm


def train_byte_pair_encoding(vocab_size):
    print("Training BytePair encoding......")
    token_dict = Counter()
    with open(PROCESS_DATA_PATH, 'r') as fr:
        for line in tqdm.tqdm(fr):
            token_dict.update(line.lower().split())

    # Write (word, frequency) pairs as TSV so SentencePiece can train from pre-counted text.
    with open(BPE_TSV_PATH, 'w', newline='') as f_output:
        tsv_output = csv.writer(f_output, delimiter='\t')
        for word in token_dict:
            tsv_output.writerow([word, token_dict[word]])

    spmcmd = ('--input={spm_input} --model_prefix={spm_model} --input_format=tsv'
              ' --vocab_size={vocab_size} --user_defined_symbols=[SEP],[BOS],[EOS]'
              ' --hard_vocab_limit=false --model_type=bpe --pad_id=0 --unk_id=1'
              ' --bos_id=-1 --eos_id=-1 --pad_piece=[PAD] --unk_piece=[UNK]').format(
        spm_input=BPE_TSV_PATH, spm_model=BPE_MODEL_PATH, vocab_size=vocab_size)
    spm.SentencePieceTrainer.train(spmcmd)
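To sanity-check the result, the trained model can be loaded and applied. A minimal sketch, assuming BPE_MODEL_PATH was set to the hypothetical prefix 'bpe_model', so the trainer produced bpe_model.model:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("bpe_model.model")  # hypothetical BPE_MODEL_PATH + ".model"

print(sp.EncodeAsPieces("hello world [SEP] goodbye"))  # [SEP] stays a single user-defined piece
print(sp.EncodeAsIds("hello world"))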
def __setstate__(self, d):
    self.__dict__ = d
    # Re-create the SentencePiece processor, which cannot be pickled directly.
    try:
        import sentencepiece as spm
    except ImportError:
        logger.warning("You need to install SentencePiece to use KoBertTokenizer: "
                       "https://github.com/google/sentencepiece "
                       "pip install sentencepiece")
    self.sp_model = spm.SentencePieceProcessor()
    self.sp_model.Load(self.vocab_file)
    raise ValueError(
        "At least one of `do_train`, `do_eval`, `do_predict` or "
        "`do_submit` must be True.")

if not tf.gfile.Exists(FLAGS.output_dir):
    tf.gfile.MakeDirs(FLAGS.output_dir)

task_name = FLAGS.task_name.lower()
if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

processor = processors[task_name]()
label_list = processor.get_labels() if not FLAGS.is_regression else None

sp = spm.SentencePieceProcessor()
sp.Load(FLAGS.spiece_model_file)

def tokenize_fn(text):
    text = preprocess_text(text, lower=FLAGS.uncased)
    return encode_ids(sp, text)

run_config = model_utils.configure_tpu(FLAGS)

model_fn = get_model_fn(len(label_list) if label_list is not None else None)

spm_basename = os.path.basename(FLAGS.spiece_model_file)

# If TPU is not available, this will fall back to normal Estimator on CPU
# or GPU.
if FLAGS.use_tpu:
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
def __init__(self, vocab_file, do_lower_case=True, spm_model_file=None):
    self.vocab = None
    self.sp_model = None
    if spm_model_file:
        self.sp_model = spm.SentencePieceProcessor()
        logging.info("loading sentence piece model")
        self.sp_model.Load(spm_model_file)
        # Note(mingdachen): For the purpose of consistent API, we are
        # generating a vocabulary for the sentence piece tokenizer.
        self.vocab = {self.sp_model.IdToPiece(i): i
                      for i in range(self.sp_model.GetPieceSize())}
    else:
        self.vocab = load_vocab(vocab_file)
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
def load_spm_model(self):
    """Load the sentencepiece model and parse its vocab."""
    if not os.path.exists(self.spm_model) and not self.spm_model.endswith('.model'):
        self.spm_model = self.spm_model + '.model'
    self.sp = spm.SentencePieceProcessor()
    self.sp.Load(self.spm_model)
    self.vocab_size = self.num_text_tokens = len(self.sp)
    # Materialize the full piece list and a piece -> id lookup table.
    self._tokens = [self.IdToToken(t) for t in range(self.vocab_size)]
    self._vocab = {t: i for i, t in enumerate(self._tokens)}