Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
while True:
try:
yield pickle.load(p.stdout)
doc_count += 1
except EOFError:
break
assert doc_count >= 400, "Number of documents (%d) was less than expected (400) from %s. File is likely incomplete" % (
doc_count, labeled_and_featurized_tokens_path
)
else:
logging.warning(
"Could not find %s, recreating it", labeled_and_featurized_tokens_path
)
nonlocal token_stats
if token_stats is None:
token_stats = TokenStatistics(os.path.join(dirname, "all.tokenstats2.gz"))
temp_labeled_and_featurized_tokens_path = \
labeled_and_featurized_tokens_path + ".%d.temp" % os.getpid()
with multiprocessing_generator.ParallelGenerator(
labeled_tokens(), max_lookahead=64
) as docs:
docs = docs_with_normalized_features(
model_settings.max_page_number,
model_settings.token_hash_size,
model_settings.font_hash_size,
token_stats,
docs)
with gzip.open(temp_labeled_and_featurized_tokens_path, "wb") as f:
for doc in docs:
yield doc
pickle.dump(doc, f)