Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_make_split_key(word, sense, expected):
    """Check that make_key and split_key agree for one (word, sense) case.

    Building a key from ``word`` and ``sense`` must yield ``expected``, and
    splitting ``expected`` must give back the same ``(word, sense)`` pair.
    """
    built_key = make_key(word, sense)
    assert built_key == expected
    recovered = split_key(expected)
    assert recovered == (word, sense)
# Deserialize the DocBin stored at input_path.
with input_path.open("rb") as src:
    doc_bin = DocBin().from_bytes(src.read())
msg.good(f"Loaded {len(doc_bin)} parsed docs")
docs = doc_bin.get_docs(nlp.vocab)

# The output file reuses the input file's stem with an ".s2v" suffix.
output_file = output_path / f"{input_path.stem}.s2v"
lines_count = words_count = 0
with output_file.open("w", encoding="utf8") as sink:
    for parsed in tqdm.tqdm(docs, desc="Docs", unit=""):
        merged = merge_phrases(parsed)
        # Lazily pair every non-whitespace token with its (word, sense)
        # tuple, then turn each pair into a key string.
        pairs = (
            make_spacy_key(token, prefer_ents=True)
            for token in merged
            if not token.is_space
        )
        keys = [make_key(word, sense) for word, sense in pairs]
        # One line per doc: its keys separated by single spaces.
        sink.write(" ".join(keys) + "\n")
        lines_count += 1
        words_count += len(keys)
msg.good(
    f"Successfully preprocessed {lines_count} docs ({words_count} words)",
    output_file.resolve(),
)