def create_train_batches(nlp, corpus, cfg):
    while True:
        train_examples = corpus.train_dataset(
            nlp,
            noise_level=0.0,
            orth_variant_level=cfg["orth_variant_level"],
            gold_preproc=cfg["gold_preproc"],
            max_length=cfg["max_length"],
            ignore_misaligned=True,
        )
        for batch in util.minibatch_by_words(train_examples, size=cfg["batch_size"]):
            yield batch
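# A minimal usage sketch (not from the original source), assuming a spaCy v2
# environment where GoldCorpus provides train_dataset() and util is spacy.util;
# the file paths and config values are placeholders.
import spacy
from spacy import util
from spacy.gold import GoldCorpus

nlp = spacy.blank("en")
corpus = GoldCorpus("train.json", "dev.json")
cfg = {
    "orth_variant_level": 0.0,
    "gold_preproc": False,
    "max_length": 0,
    "batch_size": util.compounding(1.0, 32.0, 1.001),
}
batches = create_train_batches(nlp, corpus, cfg)
first_batch = next(batches)  # the while-True loop makes this an endless stream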
def annotate(model_dir_path: str, files_dir_path: List[str], out_dir_path: str) -> None:
    """
    Annotate a sample of the given XML files and save them into the given directory.
    :param model_dir_path: the directory of the spaCy model
    :param files_dir_path: the paths of the XML/text files to annotate
    :param out_dir_path: the directory where to write the annotations
    """
    logging.info("Loading NER model…")
    nlp = get_empty_model(load_labels_for_training=False)
    nlp = nlp.from_disk(model_dir_path)
    # TODO remove when we have retrained
    infixes = nlp.Defaults.infixes + [r':', r"(?<=[\W\d_])-|-(?=[\W\d_])"]
    infixes_regex = spacy.util.compile_infix_regex(infixes)
    nlp.tokenizer.infix_finditer = infixes_regex.finditer
    # end of deletion above
    entity_typename_builder = EntityTypename()
    logging.info("Loading cases…")
    cases: List[Case] = list()
    for path in files_dir_path:
        if path.endswith(".xml"):
            case: Case = get_paragraph_from_file(path=path,
                                                 keep_paragraph_without_annotation=True)
            cases.append(case)
        elif path.endswith(".txt"):
            with open(path) as f:
                lines = f.readlines()
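# Hypothetical invocation of the (truncated) annotate helper above; the model
# directory and input/output paths are placeholders.
annotate("models/ner", ["data/case_1.xml", "data/case_2.txt"], "out/annotations")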
def list_models():
    def exclude_dir(dir_name):
        # exclude common cache directories and hidden directories
        exclude = ("cache", "pycache", "__pycache__")
        return dir_name in exclude or dir_name.startswith(".")
    data_path = util.get_data_path()
    if data_path:
        models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()]
        return ", ".join([m for m in models if not exclude_dir(m)])
    return "-"
# create the built-in 'ner' pipeline component if it is missing
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
# otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe('ner')
# add labels
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                              util.env_opt('dropout_to', 0.2),
                              util.env_opt('dropout_decay', 0.005))
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                               util.env_opt('batch_to', 32),
                               util.env_opt('batch_compound', 1.001))
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):
        random.shuffle(train_data)
        count = 0
        losses = {}
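# A small illustration of the schedules built above (spaCy v2 utilities): both
# are infinite generators, typically advanced once per minibatch.
rates = util.decaying(0.2, 0.2, 0.005)   # constant here, since from == to
sizes = util.compounding(1, 32, 1.001)   # grows geometrically from 1 toward 32
print(next(rates), next(sizes))          # e.g. 0.2 1.0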
def evaluate(
    model,
    data_path,
    gpu_id=-1,
    gold_preproc=False,
    displacy_path=None,
    displacy_limit=25,
    return_scores=False,
):
    """
    Evaluate a model. To render a sample of parses in an HTML file, set an
    output directory as the displacy_path argument.
    """
    util.fix_random_seed()
    if gpu_id >= 0:
        util.use_gpu(gpu_id)
    util.set_env_log(False)
    data_path = util.ensure_path(data_path)
    displacy_path = util.ensure_path(displacy_path)
    if not data_path.exists():
        msg.fail("Evaluation data not found", data_path, exits=1)
    if displacy_path and not displacy_path.exists():
        msg.fail("Visualization output directory not found", displacy_path, exits=1)
    corpus = GoldCorpus(data_path, data_path)
    nlp = util.load_model(model)
    dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
    begin = timer()
    scorer = nlp.evaluate(dev_docs, verbose=False)
    end = timer()
    nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
"""
Evaluate a model. To render a sample of parses in a HTML file, set an
output directory as the displacy_path argument.
"""
util.fix_random_seed()
if gpu_id >= 0:
util.use_gpu(gpu_id)
util.set_env_log(False)
data_path = util.ensure_path(data_path)
displacy_path = util.ensure_path(displacy_path)
if not data_path.exists():
msg.fail("Evaluation data not found", data_path, exits=1)
if displacy_path and not displacy_path.exists():
msg.fail("Visualization output directory not found", displacy_path, exits=1)
corpus = GoldCorpus(data_path, data_path)
nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
begin = timer()
scorer = nlp.evaluate(dev_docs, verbose=False)
end = timer()
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
    results = {
        "Time": "%.2f s" % (end - begin),
        "Words": nwords,
        "Words/s": "%.0f" % (nwords / (end - begin)),
        "TOK": "%.2f" % scorer.token_acc,
        "POS": "%.2f" % scorer.tags_acc,
        "UAS": "%.2f" % scorer.uas,
        "LAS": "%.2f" % scorer.las,
        "NER P": "%.2f" % scorer.ents_p,
        "NER R": "%.2f" % scorer.ents_r,
        "NER F": "%.2f" % scorer.ents_f,
    }
            if count % 100 == 0 and count > 0:
                print('sum loss: %s' % losses['ner'])
            count += 1
        # save model to output directory
        output_dir_path = Path(output_dir + "/" + str(i))
        if not output_dir_path.exists():
            output_dir_path.mkdir()
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir_path)
        print("Saved model to", output_dir_path)
        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-untyped"] > best_f1:
            best_f1 = metrics["f1-measure-untyped"]
            best_epoch = i
    # copy the best-scoring epoch's model to a "best" directory
    best_model_path = Path(output_dir + "/" + "best")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)),
                    best_model_path)
    # test the best saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
def get_tokenizer(model: French) -> Tokenizer:
    split_char = r"[ ,\\.()-/\\|:;'\"+=!?_+#“”'’]"
    extended_infix = [r'[:\\(\\)-\./#"“”\'’]'] + model.Defaults.infixes
    infix_re = spacy.util.compile_infix_regex(extended_infix)
    prefix_re = spacy.util.compile_prefix_regex(tuple(list(model.Defaults.prefixes) + [split_char]))
    suffix_re = spacy.util.compile_suffix_regex(tuple(list(model.Defaults.suffixes) + [split_char]))
    tok = Tokenizer(
        model.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=None,
    )
    return tok
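# A minimal usage sketch, assuming the imports this function relies on
# (spacy, French from spacy.lang.fr, Tokenizer from spacy.tokenizer);
# the sample sentence is a placeholder.
from spacy.lang.fr import French

nlp = French()
nlp.tokenizer = get_tokenizer(nlp)
print([t.text for t in nlp("L'article 12-3 (al. 2) s'applique.")])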
        ('spacy_version', 'Required spaCy version',
         '>=%s,<3.0.0' % about.__version__),
        ('description', 'Model description',
         meta.get('description', False)),
        ('author', 'Author', meta.get('author', False)),
        ('email', 'Author email', meta.get('email', False)),
        ('url', 'Author website', meta.get('url', False)),
        ('license', 'License', meta.get('license', 'CC BY-SA 3.0'))]
    nlp = util.load_model_from_path(Path(model_path))
    meta['pipeline'] = nlp.pipe_names
    meta['vectors'] = {'width': nlp.vocab.vectors_length,
                       'vectors': len(nlp.vocab.vectors),
                       'keys': nlp.vocab.vectors.n_keys}
    prints(Messages.M047, title=Messages.M046)
    for setting, desc, default in settings:
        response = util.get_raw_input(desc, default)
        meta[setting] = default if response == '' and default else response
    if about.__title__ != 'spacy':
        meta['parent_package'] = about.__title__
    return meta
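# A minimal sketch (not in the original) of persisting a meta dict like the one
# built above; writing meta.json at the package root follows spaCy v2's convention.
import json
with open("meta.json", "w", encoding="utf8") as f:
    json.dump(meta, f, indent=2)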