transcription = line.strip().split(" ")[3:]
if key == "train":
    ftext.write(" ".join(transcription) + "\n")
word_dict[key].update(transcription)
lexicon_words = sorted(word_dict["train"] | word_dict["dev"])
# train
print("Computing word pieces...\n", flush=True)
train_cmd = (
    "--input={input} --model_prefix={prefix} --vocab_size={sz}"
    " --character_coverage=1.0 --model_type=unigram"
    " --split_by_unicode_script=false".format(
        input=train_all_text, prefix=prefix, sz=num_wordpieces
    )
)
spm.SentencePieceTrainer.Train(train_cmd)
# word piece dictionary
print("Creating word piece list...\n", flush=True)
exclude_list = {"", "<s>", "</s>"}
with open(vocab_name.replace(".vocab", ".tokens"), "w") as fvocab_filt:
    with open(vocab_name, "r", encoding="utf-8") as fvocab:
        for line in fvocab:
            val, _ = line.strip().split("\t", 1)
            if val not in exclude_list:
                fvocab_filt.write(val.replace("\u2581", "_") + "\n")
# word -> word piece lexicon for loading targets
print("Creating word -> word pieces lexicon...\n", flush=True)
sp = spm.SentencePieceProcessor()
sp.Load(model_name)
lexicon_name = "librispeech-train+dev-unigram-{sz}-nbest{n}.lexicon".format(
def fit(self, phrases):
    sp_corpus_path = os.path.join(tmp_dir, 'new_synonymy_detector.sentence_piece_corpus.txt')
    if not os.path.exists(sp_corpus_path):
        with io.open(sp_corpus_path, 'w', encoding='utf-8') as wrt:
            for phrase in phrases:
                wrt.write(u'{}\n'.format(phrase))
    sp_model_name = 'new_synonymy_detector_{}'.format(self.vocab_size)
    if not os.path.exists(os.path.join(tmp_dir, sp_model_name + '.vocab')):
        logging.info('Start training SentencePiece for vocab_size={}'.format(self.vocab_size))
        spm.SentencePieceTrainer.Train(
            '--input={} --model_prefix={} --vocab_size={} --model_type=bpe'.format(sp_corpus_path, sp_model_name, self.vocab_size))
        os.rename(sp_model_name + '.vocab', os.path.join(tmp_dir, sp_model_name + '.vocab'))
        os.rename(sp_model_name + '.model', os.path.join(tmp_dir, sp_model_name + '.model'))
    self.splitter = spm.SentencePieceProcessor()
    self.splitter.Load(os.path.join(tmp_dir, sp_model_name + '.model'))
    pieces = set()
    for phrase in phrases:
        px = self.splitter.EncodeAsPieces(phrase)
        pieces.update(px)
    self.piece2index = dict((piece, i) for i, piece in enumerate(pieces))
    self.nb_shingles = len(self.piece2index)
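
For context, a companion transform step (not part of the snippet above; the method name and body are a hypothetical sketch) would map a new phrase onto the shingle indices built by fit():

def transform(self, phrase):
    # Hypothetical sketch: encode a phrase with the fitted splitter and keep
    # only pieces seen during fit(), mapped to their indices.
    px = self.splitter.EncodeAsPieces(phrase)
    return [self.piece2index[p] for p in px if p in self.piece2index]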
--vocab_size: vocabulary size, e.g., 8000, 16000, or 32000
--character_coverage: amount of characters covered by the model
--model_type: model type. Choose from unigram (default), bpe, char, or word. The input sentence must be pretokenized when using word type.
"""
kwargs.update({'unk_piece': UNK_TOKEN, 'bos_piece': BOS_TOKEN,
               'eos_piece': EOS_TOKEN, 'pad_piece': PAD_TOKEN,
               'unk_id': UNK, 'bos_id': BOS,
               'eos_id': EOS, 'pad_id': PAD,
               'unk_surface': UNK_TOKEN,
               })
for arg, val in kwargs.items():
    if isinstance(val, bool):
        kwargs[arg] = 'true' if val else 'false'
config = ' '.join(['--{}={}'.format(name, value)
                   for name, value in kwargs.items() if value is not None])
spm.SentencePieceTrainer.Train(config)
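
The options documented in the docstring above end up verbatim in the flag string handed to the trainer. A minimal self-contained sketch of the same call, with an assumed corpus file corpus.txt and output prefix spm_demo:

import sentencepiece as spm

# Assumed example paths, purely for illustration.
spm.SentencePieceTrainer.Train(
    "--input=corpus.txt --model_prefix=spm_demo "
    "--vocab_size=8000 --character_coverage=1.0 --model_type=unigram"
)
# Writes spm_demo.model and spm_demo.vocab to the working directory.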
def train_tokenizer_model(args):
    print("========> Training tokenizer model")
    vocab_size = args.vocab_size
    model_prefix = args.model_prefix
    input_file = args.text_input
    spm.SentencePieceTrainer.Train(
        "--input={0} --model_type=bpe --model_prefix={1} --vocab_size={2} --pad_id={3} --eos_id={4} --bos_id={5} --unk_id={6}"
        .format(input_file,
                model_prefix, vocab_size, 0,  # PAD. TODO: these should not be hardcoded
                1, 2,  # EOS, BOS
                3)  # UNK
    )
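
A quick way to confirm the hardcoded ids took effect (a sketch; model_prefix is whatever args.model_prefix held above):

sp = spm.SentencePieceProcessor()
sp.Load(model_prefix + ".model")
# With the flags above, this should print: 0 1 2 3
print(sp.pad_id(), sp.eos_id(), sp.bos_id(), sp.unk_id())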
def __init__(self, text_corpus_address: Optional[str], model_name: str = 'spm',
             vocab_size: int = 30000, spm_model_type: str = 'unigram') -> None:
    super().__init__(vocab_size)
    if not os.path.exists('{}.model'.format(model_name)):
        if spm_model_type.lower() not in ('unigram', 'bpe', 'char', 'word'):
            raise ValueError(
                '{} is not a valid model_type for sentence piece, '
                'valid options are: unigram, bpe, char, word'.format(spm_model_type))
        spm.SentencePieceTrainer.Train(
            '--input={input} --model_prefix={model_name} --vocab_size={vocab_size} '
            '--character_coverage={coverage} --model_type={model_type} '
            '--pad_id=-1 --unk_id=0 --bos_id=-1 --eos_id=-1 --input_sentence_size=100000000 '.format(
                input=text_corpus_address, model_name=model_name, vocab_size=vocab_size, coverage=1,
                model_type=spm_model_type.lower()))
    self.sp = spm.SentencePieceProcessor()
    self.sp.load('{}.model'.format(model_name))
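
A companion encode method for the same class might look like the following sketch (the method name encode is hypothetical, not part of the original):

def encode(self, text: str):
    # Hypothetical helper reusing the processor loaded in __init__ above:
    # returns the subword pieces and their integer ids for a string.
    pieces = self.sp.EncodeAsPieces(text)
    ids = self.sp.EncodeAsIds(text)
    return pieces, ids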
Path to the cache directory.
.. _`sentencepiece.SentencePieceTrainer.Train`:
https://github.com/google/sentencepiece/blob/master/python/sentencepiece.py
"""
if cache_dir is None:
    cache_path = str(default_download_dir('SentencePiece'))
else:
    if not os.path.isdir(cache_dir):
        raise ValueError(f"Cache directory ({cache_dir}) should be a "
                         f"directory.")
    cache_path = os.path.abspath(cache_dir)
maybe_create_dir(cache_path)
spm.SentencePieceTrainer.Train(cmd)
cwd = os.getcwd()
vocab_file = os.path.join(cwd, cls._VOCAB_FILE_NAMES['vocab_file'])
out_vocab_file = os.path.join(
    cache_path, cls._VOCAB_FILE_NAMES['vocab_file'])
if os.path.abspath(vocab_file) != os.path.abspath(out_vocab_file):
    move(vocab_file, out_vocab_file)
# Delete spiece.vocab (we might want to keep it as well). Swap the ".model"
# suffix for ".vocab" explicitly; str.rstrip strips a character set, not a suffix.
extra_file = os.path.splitext(vocab_file)[0] + '.vocab'
os.remove(extra_file)
return cache_path
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import sentencepiece as spm
if __name__ == "__main__":
    spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
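
Because the script joins sys.argv[1:] into a single flag string, any trainer option can be forwarded from the command line, e.g. python train_spm.py --input=data.txt --model_prefix=spm --vocab_size=16000 (the script and file names here are illustrative).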
'''
See this tutorial on SentencePiece training:
https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb
'''
config = train_config if train_config else self.config
param = ""
param += "--input={} ".format(config["corpus"])
param += "--model_prefix={} ".format(config["model_prefix"])
param += "--vocab_size={} ".format(config["vocab_size"])
param += "--model_type={} ".format(config.get("model_type", "unigram"))
param += "--character_coverage={} ".format(config.get("character_coverage", 0.995))
param += "--mining_sentence_size={} ".format(config.get("mining_sentence_size", 5000000))
param += "--input_sentence_size={} ".format(config.get("input_sentence_size", 5000000))
param += "--max_sentencepiece_length={} ".format(config.get("max_sentencepiece_length", 5))
try:
    SentencePieceTrainer.Train(param)
    self.sp.Load(config["model_prefix"] + ".model")
except Exception as exc:
    raise ValueError("Training word piece model failed") from exc
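
For reference, a config dictionary covering the keys read above could look like this (values and paths are illustrative; keys left out fall back to the defaults shown in the snippet):

config = {
    "corpus": "data/corpus.txt",
    "model_prefix": "spm_unigram",
    "vocab_size": 16000,
    "model_type": "unigram",
    "character_coverage": 0.995,
    "input_sentence_size": 5000000,
    "max_sentencepiece_length": 5,
}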