Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def train_model(input_filename):
    """Train an unsupervised fastText skip-gram model and return it.

    Args:
        input_filename: Value forwarded as the first positional argument
            of ``fasttext.train_unsupervised`` (the training input).

    Returns:
        The model object produced by ``fasttext.train_unsupervised``.
    """
    # Fixed hyper-parameters: skip-gram, maxn=0, 100 dimensions, window of 5.
    # NOTE(review): maxn=0 presumably disables character n-grams (subword
    # information) -- confirm against the fastText documentation.
    return fasttext.train_unsupervised(
        input_filename,
        model='skipgram',
        maxn=0,
        dim=100,
        ws=5,
    )
input_path = Path(in_dir)
# Check that the input path exists and is a directory
if not input_path.exists() or not input_path.is_dir():
msg.fail("Not a valid input directory", in_dir, exits=1)
tmp_path = input_path / "s2v_input.tmp"
input_files = [p for p in input_path.iterdir() if p.suffix == ".s2v"]
if not input_files:
msg.fail("Input directory contains no .s2v files", in_dir, exits=1)
# fastText expects only one input file and only reads from disk and not
# stdin, so we need to create a temporary file that concatenates the inputs
with tmp_path.open("a", encoding="utf8") as tmp_file:
for input_file in input_files:
with input_file.open("r", encoding="utf8") as f:
tmp_file.write(f.read())
msg.info("Created temporary merged input file", tmp_path)
fasttext_model = fasttext.train_unsupervised(str(tmp_path), thread=n_threads, epoch=epoch, dim=vector_size,
minn=0, maxn=0, minCount=min_count, verbose=verbose)
msg.good("Successfully trained fastText model vectors")
tmp_path.unlink()
msg.good("Deleted temporary input file", tmp_path)
output_file = output_path / f"vectors_w2v_{vector_size}dim.bin"
if save_fasttext_model:
fasttext_model.save_model(str(output_file))
if not output_file.exists() or not output_file.is_file():
msg.fail("Failed to save fastText model to disk", output_file, exits=1)
msg.good("Successfully saved fastText model to disk", output_file)
else:
fasttext_model = None
msg.fail("Must provide an input directory or fastText binary filepath", exits=1)
msg.info("Creating vocabulary file")
def train_fasttext(self, data, model_name, epoch):
    """Train an unsupervised fastText skip-gram model and save it.

    Args:
        data: Value forwarded as the first positional argument of
            ``fasttext.train_unsupervised`` (the training input).
        model_name: Destination handed to the model's ``save_model``.
        epoch: Forwarded as the ``epoch`` keyword to the trainer.

    Returns:
        None. The trained model is only written out via ``save_model``.
    """
    # minCount=1 is passed explicitly alongside the skip-gram setting.
    fasttext.train_unsupervised(
        data,
        model='skipgram',
        minCount=1,
        epoch=epoch,
    ).save_model(model_name)