# Train a tiny sentencepiece model on the ASCII letters and collect its expected vocabulary.
import string
from pathlib import Path

import sentencepiece as spm


def spm_srcs(tmp_path: Path):
    input_text = tmp_path / "text"
    vocabsize = len(string.ascii_letters) + 4
    model_prefix = tmp_path / "model"
    model = str(model_prefix) + ".model"
    input_sentence_size = 100000

    # Write a minimal corpus: one line containing all ASCII letters
    with input_text.open("w") as f:
        f.write(string.ascii_letters + "\n")

    spm.SentencePieceTrainer.Train(
        f"--input={input_text} "
        f"--vocab_size={vocabsize} "
        f"--model_prefix={model_prefix} "
        f"--input_sentence_size={input_sentence_size}"
    )
    sp = spm.SentencePieceProcessor()
    sp.load(model)

    # Build the expected vocabulary: special pieces plus the corpus characters
    with input_text.open("r") as f:
        vocabs = {"<unk>", "▁"}
        for line in f:
            tokens = sp.DecodePieces(list(line.strip()))
            vocabs |= set(tokens)
    return model, vocabs
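A minimal usage sketch for the helper above; the temporary directory here is a stand-in for pytest's tmp_path fixture and is not part of the original snippet:

import tempfile
from pathlib import Path

import sentencepiece as spm

# Hypothetical setup: any empty directory works.
tmp_dir = Path(tempfile.mkdtemp())
model, vocabs = spm_srcs(tmp_dir)

sp = spm.SentencePieceProcessor()
sp.Load(model)
pieces = sp.EncodeAsPieces("abc")  # split into sentencepiece pieces
print(pieces)
print(sorted(vocabs))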
# Train a BPE sentencepiece model and convert its vocabulary into a BERT-style vocab file.
import sentencepiece as spm


def make_bert_vocab(input_fname, output_fname):
    train = ('--input=' + input_fname + ' --model_prefix=sentpiece'
             ' --vocab_size=32000 --model_type=bpe --character_coverage=0.9995')
    spm.SentencePieceTrainer.Train(train)
    with open('sentpiece.vocab', 'r', encoding='utf-8') as f1, \
            open(output_fname, 'w', encoding='utf-8') as f2:
        f2.writelines("[PAD]\n[UNK]\n[CLS]\n[SEP]\n[MASK]\n")
        for line in f1:
            # Each .vocab line is "piece<TAB>score"; swap the sentencepiece
            # word-boundary marker for BERT's "##" prefix.
            word = line.replace('\n', '').split('\t')[0].replace('▁', '##')
            if not word or word in ["##", "", "<s>", "</s>"]:
                continue
            f2.writelines(word + "\n")
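A hedged usage sketch for make_bert_vocab; the file names are placeholders, not part of the original:

# Hypothetical filenames; any plain-text corpus works.
make_bert_vocab('corpus.txt', 'bert_vocab.txt')

with open('bert_vocab.txt', encoding='utf-8') as f:
    vocab = [w.strip() for w in f]
print(len(vocab), vocab[:5])  # the first entries are the control symbols [PAD] ... [MASK]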
# Fragment of a tokenizer-training method: write the corpus to a temporary file,
# train a sentencepiece model on it, and record the model path.
# `corpus`, `random_hash`, `write_corpus_as_lines`, `MAX_SENTENCEPIECE_SENTENCES`
# and `num_text_tokens` come from the surrounding class/module.
if use_model_path is None:
    use_model_path = random_hash
if use_model_path.endswith('.model'):
    use_model_path = use_model_path[:use_model_path.rfind('.model')]
input_path = use_model_path + '.txt.' + random_hash
print('Writing temporary dataset for tokenization to ' + input_path)
line_count, maxlenline = write_corpus_as_lines(corpus, input_path)
line_count = min(line_count, MAX_SENTENCEPIECE_SENTENCES)
print('Training sentencepiece model')
train_string = ('--input={file_path} --model_prefix={model_prefix} --vocab_size={vocab_size}'
                ' --model_type={model_type} --input_sentence_size={input_sentence_size}'
                ' --character_coverage={character_coverage} --max_sentence_length=13288')
train_string = train_string.format(file_path=input_path,
                                   model_prefix=use_model_path,
                                   vocab_size=num_text_tokens,
                                   model_type=self.model_type,
                                   input_sentence_size=int(line_count),
                                   character_coverage=self.character_coverage,
                                   max_len=str(maxlenline))  # extra kwarg; not referenced by the template
spm.SentencePieceTrainer.Train(train_string)
os.remove(input_path)
self.spm_model = use_model_path + '.model'
print('Sentencepiece model written to ' + self.spm_model)
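Once this fragment has run, the trained model can be loaded with sentencepiece's standard processor; a sketch, assuming `tokenizer` is an instance of the surrounding class:

import sentencepiece as spm

# `tokenizer` is assumed to be an instance of the class this fragment belongs to.
sp = spm.SentencePieceProcessor()
sp.Load(tokenizer.spm_model)
ids = sp.EncodeAsIds("an example sentence")
print(ids)
print(sp.DecodeIds(ids))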
# Script fragment: merge all *.txt files (or use an explicit train file),
# train a sentencepiece model, and start building a BERT-style vocab.
# `args` and `MERGED_FILE` come from the surrounding script.
import glob
import os

from tqdm import tqdm
from sentencepiece import SentencePieceTrainer as SPT  # alias assumed from the surrounding script

if args.dataset_dir is not None:
    filepaths = glob.glob(os.path.join(args.dataset_dir, "*.txt"))
    print("Found {} files, concatenating dataset into one file..."
          .format(len(filepaths)))
    with open(MERGED_FILE, "w") as f:
        for filepath in tqdm(filepaths):
            f.write(open(filepath, "r", errors="ignore").read())
    train_path = MERGED_FILE
elif args.train_path is not None:
    train_path = args.train_path
else:
    print("One of 'dataset_dir' and 'train_path' must be specified")
    return

SPT.Train("--input={} ".format(train_path) +
          "--model_prefix={} ".format(args.model_prefix) +
          "--vocab_size={} ".format(args.vocab_size - args.num_placeholders) +
          "--input_sentence_size={} ".format(args.sample_size) +
          "--shuffle_input_sentence=true " +
          "--hard_vocab_limit=false " +
          "--bos_id=-1 " +
          "--eos_id=-1")

# Add BERT control symbols
vocab = ["[PAD]"]
tokens = []
with open("{}.vocab".format(args.model_prefix), "r") as f:
    # Skip the first token: the "<unk>\t0" line is exactly 8 bytes
    f.seek(8)
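    # The snippet is cut off here. A plausible continuation (an assumption,
    # not necessarily how the original script finishes): read the remaining
    # pieces from the .vocab file, then rewrite them in WordPiece style.
    for line in f:
        tokens.append(line.split("\t")[0])

vocab.extend(["[UNK]", "[CLS]", "[SEP]", "[MASK]"])
for piece in tokens:
    # "▁" marks a word start in sentencepiece; WordPiece instead marks
    # continuations with a leading "##".
    if piece.startswith("▁") and len(piece) > 1:
        vocab.append(piece[1:])
    else:
        vocab.append("##" + piece)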
def train(self, raw_text_path):
    "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
    from sentencepiece import SentencePieceTrainer
    vocab_sz = self._get_vocab_sz(raw_text_path) if self.vocab_sz is None else self.vocab_sz
    spec_tokens = ['\u2581'+s for s in self.special_toks]
    SentencePieceTrainer.Train(" ".join([
        f"--input={raw_text_path} --vocab_size={vocab_sz} --model_prefix={self.cache_dir/'spm'}",
        f"--character_coverage={self.char_coverage} --model_type={self.model_type}",
        f"--unk_id={len(spec_tokens)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
        f"--user_defined_symbols={','.join(spec_tokens)}"]))
    raw_text_path.unlink()
    return self.cache_dir/'spm.model'
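A usage sketch for the method above; `tok` stands for an already-configured instance of the surrounding tokenizer class and `raw.txt` is a placeholder corpus file, neither from the original:

from pathlib import Path

import sentencepiece as spm

# Hypothetical: `tok` has cache_dir, special_toks, char_coverage, model_type set.
model_path = tok.train(Path("raw.txt"))

sp = spm.SentencePieceProcessor()
sp.Load(str(model_path))
print(sp.EncodeAsPieces("hello world"))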
print("Creating BPE model...")
all_lines = []
for speaker in speakers:
for f in speakers[speaker]:
with open(f, "r", encoding="utf8") as tf:
all_lines.append(tf.read())
with open("temp.txt","w",encoding="utf8") as f:
for line in all_lines:
f.write(line)
if not os.path.exists(output_lookup_folder):
os.makedirs(output_lookup_folder)
# TRAIN SENTENCEPIECE MODELS & CREATE LOOKUPS
spm.SentencePieceTrainer.Train('--input=temp.txt --model_prefix=' + os.path.join(output_lookup_folder, "tok")+ ' --character_coverage=1.0 --model_type=bpe --num_threads=8 --split_by_whitespace=true --shuffle_input_sentence=true --max_sentence_length=8000 --vocab_size=' + str(
vocab_size))
print("Done.")
lookup = Lookup(type="bpe")
lookup.save_special_tokens(file_prefix=os.path.join(output_lookup_folder, "tok"))
# check everything is ok
lookup = Lookup(type="bpe")
lookup.load(file_prefix=os.path.join(output_lookup_folder,"tok"))
text = "This is a simple test."
token_ids = lookup.encode(text)
print("Encode: {}".format(token_ids))
recreated_string = lookup.decode(token_ids)
print("Decode: [{}]".format(recreated_string))
print("Map w2i:")
tokens = lookup.tokenize(text)
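The check stops here; since Lookup is project-specific, the same round trip can be cross-checked with plain sentencepiece against the model trained above (the tok.model path follows the model_prefix used in the training call):

import os

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load(os.path.join(output_lookup_folder, "tok.model"))

text = "This is a simple test."
pieces = sp.EncodeAsPieces(text)
ids = sp.EncodeAsIds(text)
for piece, idx in zip(pieces, ids):
    print(piece, "->", idx)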
def make_xlnet_vocab(input_fname, output_fname):
    SPM_COMMAND = ('--input={} '
                   '--model_prefix={} '
                   '--vocab_size={} '
                   '--character_coverage={} '
                   '--shuffle_input_sentence=true '
                   '--model_type=unigram '
                   '--control_symbols=,,,
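The command above is cut off before the control-symbol values; purely as a generic illustration of the flag (not the original's values), control symbols are passed to sentencepiece like this:

import sentencepiece as spm

# Illustrative values only; the original snippet's symbols and paths are unknown.
spm.SentencePieceTrainer.Train(
    "--input=corpus.txt --model_prefix=xlnet_sp --vocab_size=32000 "
    "--character_coverage=0.99995 --model_type=unigram "
    "--shuffle_input_sentence=true "
    "--control_symbols=<cls>,<sep>,<pad>,<mask>"
)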
# Script fragment: write the preprocessed IWSLT 2016 splits, train a joint
# German/English BPE model with sentencepiece, and segment the data with it.
_write(prepro_train1, "iwslt2016/prepro/train.de")
_write(prepro_train2, "iwslt2016/prepro/train.en")
_write(prepro_train1+prepro_train2, "iwslt2016/prepro/train")
_write(prepro_eval1, "iwslt2016/prepro/eval.de")
_write(prepro_eval2, "iwslt2016/prepro/eval.en")
_write(prepro_test1, "iwslt2016/prepro/test.de")
_write(prepro_test2, "iwslt2016/prepro/test.en")
logging.info("# Train a joint BPE model with sentencepiece")
os.makedirs("iwslt2016/segmented", exist_ok=True)
train = ('--input=iwslt2016/prepro/train --pad_id=0 --unk_id=1 '
         '--bos_id=2 --eos_id=3 '
         '--model_prefix=iwslt2016/segmented/bpe --vocab_size={} '
         '--model_type=bpe'.format(hp.vocab_size))
spm.SentencePieceTrainer.Train(train)
logging.info("# Load trained bpe model")
sp = spm.SentencePieceProcessor()
sp.Load("iwslt2016/segmented/bpe.model")
logging.info("# Segment")
def _segment_and_write(sents, fname):
    with open(fname, "w") as fout:
        for sent in sents:
            pieces = sp.EncodeAsPieces(sent)
            fout.write(" ".join(pieces) + "\n")
_segment_and_write(prepro_train1, "iwslt2016/segmented/train.de.bpe")
_segment_and_write(prepro_train2, "iwslt2016/segmented/train.en.bpe")
_segment_and_write(prepro_eval1, "iwslt2016/segmented/eval.de.bpe")
_segment_and_write(prepro_eval2, "iwslt2016/segmented/eval.en.bpe")
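The segmented files hold space-joined BPE pieces; a small sketch of reversing the segmentation with the same model (paths follow those used above):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load("iwslt2016/segmented/bpe.model")

# Read one segmented line and restore the detokenized sentence.
with open("iwslt2016/segmented/train.en.bpe") as f:
    first = f.readline().strip().split(" ")
print(first)
print(sp.DecodePieces(first))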