Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
for t in split_tokens:
if t in skill_name:
patterns.append(
{
"label": label,
"pattern": self._skill_pattern(
skill_name, t
),
}
)
srsly.write_jsonl(patterns_path, patterns)
return patterns
else:
patterns = srsly.read_jsonl(patterns_path)
return patterns
def train_model(
model, train_path, eval_path, n_iter=10, output=None, tok2vec=None,
):
"""
Train a model from Prodigy annotations and optionally save out the best
model to disk.

NOTE(review): this listing has lost its indentation and stops right after
the training state is initialised, so the training loop, the use of
`n_iter` / `output` and any return value are not visible here.

model: name or path handed to spacy.load, or "blank:<lang>" to start from
    spacy.blank with the text that remains once "blank:" is removed.
train_path: JSONL file read with srsly.read_jsonl and passed to format_data.
eval_path: JSONL file read the same way as train_path.
n_iter: not used in the visible part (presumably the number of epochs).
output: not used in the visible part (presumably where the model is saved).
tok2vec: optional path; when truthy the NER component is configured with
    t2v_cfg and _load_pretrained_tok2vec is called with Path(tok2vec).
"""
# Fix the random seed to 0 (presumably so runs are repeatable).
spacy.util.fix_random_seed(0)
with msg.loading(f"Loading '{model}'..."):
if model.startswith("blank:"):
# "blank:<lang>" requests an empty pipeline instead of a packaged one.
nlp = spacy.blank(model.replace("blank:", ""))
else:
nlp = spacy.load(model)
msg.good(f"Loaded model '{model}'")
# format_data returns a pair; only the training call keeps the second item.
train_data, labels = format_data(srsly.read_jsonl(train_path))
eval_data, _ = format_data(srsly.read_jsonl(eval_path))
# Create an "ner" component, register each label, then add it to the pipeline.
ner = nlp.create_pipe("ner")
for label in labels:
ner.add_label(label)
nlp.add_pipe(ner)
# Settings passed to the "ner" component only when tok2vec is given;
# presumably they have to match the pretrained weights (TODO confirm).
t2v_cfg = {
"embed_rows": 10000,
"token_vector_width": 128,
"conv_depth": 8,
"nr_feature_tokens": 3,
}
optimizer = nlp.begin_training(component_cfg={"ner": t2v_cfg} if tok2vec else {})
if tok2vec:
# The pretrained weights are loaded after begin_training has been called.
_load_pretrained_tok2vec(nlp, Path(tok2vec))
# Batch-size schedule built by spacy.util.compounding(1.0, 16.0, 1.001).
batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
# Presumably the best evaluation score seen so far; its consumer is not visible.
best_acc = 0
def load_training_examples(self):
    """Load stored training examples from the configured model directory.

    The file looked up is ``INPUT_EXAMPLES_FILE_NAME`` inside
    ``self.runner.config.model_dir``; a falsy ``model_dir`` is replaced by
    an empty path, i.e. the lookup becomes relative to the working directory.

    On success the parsed JSONL records are stored on ``self.all_examples``
    and ``self.skip_first_self_play`` is set to ``True``.

    Returns:
        The path of the loaded file as a string, or ``False`` when no such
        file exists.
    """
    model_dir = self.runner.config.model_dir or ""
    examples_file = Path(model_dir) / INPUT_EXAMPLES_FILE_NAME
    if examples_file.is_file():
        # read_jsonl is wrapped in list() so the records are materialised.
        self.all_examples = list(srsly.read_jsonl(str(examples_file)))
        self.skip_first_self_play = True
        return str(examples_file)
    return False
if freqs_loc is not None or clusters_loc is not None:
settings = ["-j"]
if freqs_loc:
settings.append("-f")
if clusters_loc:
settings.append("-c")
msg.warn(
"Incompatible arguments",
"The -f and -c arguments are deprecated, and not compatible "
"with the -j argument, which should specify the same "
"information. Either merge the frequencies and clusters data "
"into the JSONL-formatted file (recommended), or use only the "
"-f and -c files, without the other lexical attributes.",
)
jsonl_loc = ensure_path(jsonl_loc)
lex_attrs = srsly.read_jsonl(jsonl_loc)
else:
clusters_loc = ensure_path(clusters_loc)
freqs_loc = ensure_path(freqs_loc)
if freqs_loc is not None and not freqs_loc.exists():
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs, name=model_name)
msg.good("Successfully created model")
if vectors_loc is not None:
add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
msg.good(
"Sucessfully compiled vocab",
):
"""
Train or update a spaCy model. Requires data to be formatted in spaCy's
JSON format. To convert data from other formats, use the `spacy convert`
command.
"""
util.fix_random_seed()
util.set_env_log(verbose)
# Make sure all files and paths exists if they are needed
train_path = util.ensure_path(train_path)
dev_path = util.ensure_path(dev_path)
meta_path = util.ensure_path(meta_path)
output_path = util.ensure_path(output_path)
if raw_text is not None:
raw_text = list(srsly.read_jsonl(raw_text))
if not train_path or not train_path.exists():
msg.fail("Training data not found", train_path, exits=1)
if not dev_path or not dev_path.exists():
msg.fail("Development data not found", dev_path, exits=1)
if meta_path is not None and not meta_path.exists():
msg.fail("Can't find model meta.json", meta_path, exits=1)
meta = srsly.read_json(meta_path) if meta_path else {}
if output_path.exists() and [p for p in output_path.iterdir() if p.is_dir()]:
msg.warn(
"Output directory is not empty",
"This can lead to unintended side effects when saving the model. "
"Please use an empty directory or a different path instead. If "
"the specified output path doesn't exist, the directory will be "
"created for you.",
)
if not output_path.exists():
msg.info("Using GPU" if has_gpu else "Not using GPU")
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
msg.good("Created output directory")
srsly.write_json(output_dir / "config.json", config)
msg.good("Saved settings to config.json")
# Load texts from file or stdin
if texts_loc != "-": # reading from a file
texts_loc = Path(texts_loc)
if not texts_loc.exists():
msg.fail("Input text file doesn't exist", texts_loc, exits=1)
with msg.loading("Loading input texts..."):
texts = list(srsly.read_jsonl(texts_loc))
if not texts:
msg.fail("Input file is empty", texts_loc, exits=1)
msg.good("Loaded input texts")
random.shuffle(texts)
else: # reading from stdin
msg.text("Reading input text from stdin...")
texts = srsly.read_jsonl("-")
with msg.loading("Loading model '{}'...".format(vectors_model)):
nlp = util.load_model(vectors_model)
msg.good("Loaded model '{}'".format(vectors_model))
pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
model = create_pretraining_model(
nlp,
Tok2Vec(
width,
def evaluate_model(model, eval_path):
    """Score a trained model on Prodigy annotations and print its F-score.

    model: name or path handed to ``spacy.load``.
    eval_path: JSONL file read with ``srsly.read_jsonl`` and converted by
        ``format_data``; only the first item of the returned pair is used.

    Nothing is returned; a one-row table with the ``textcat_score`` of the
    evaluation result, formatted to three decimals, is printed via ``msg``.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    examples, _ = format_data(srsly.read_jsonl(eval_path))
    scorer = nlp.evaluate(examples)
    f_score = f"{scorer.textcat_score:.3f}"
    msg.table([("F-Score", f_score)])
def evaluate_model(model, eval_path):
    """Score a trained model on Prodigy annotations and print entity metrics.

    model: name or path handed to ``spacy.load``.
    eval_path: JSONL file read with ``srsly.read_jsonl`` and converted by
        ``format_data``; only the first item of the returned pair is used.

    Nothing is returned; a table with the ``ents_p``, ``ents_r`` and
    ``ents_f`` values of the evaluation result, each formatted to three
    decimals, is printed via ``msg``.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    examples, _ = format_data(srsly.read_jsonl(eval_path))
    scorer = nlp.evaluate(examples)
    # Kept as ordered pairs so the rows print as precision, recall, F-score.
    metrics = (
        ("Precision", scorer.ents_p),
        ("Recall", scorer.ents_r),
        ("F-Score", scorer.ents_f),
    )
    rows = [(name, f"{value:.3f}") for name, value in metrics]
    msg.table(rows)
def evaluate_model(model, eval_path):
    """Score a trained model on Prodigy annotations and print entity metrics.

    model: name or path handed to ``spacy.load``.
    eval_path: JSONL file read with ``srsly.read_jsonl`` and converted by
        ``format_data``; only the first item of the returned pair is used.

    Nothing is returned; a table with the ``ents_p``, ``ents_r`` and
    ``ents_f`` values of the evaluation result, each formatted to three
    decimals, is printed via ``msg``.
    """
    with msg.loading(f"Loading model '{model}'..."):
        nlp = spacy.load(model)
    examples, _ = format_data(srsly.read_jsonl(eval_path))
    scorer = nlp.evaluate(examples)
    # Kept as ordered pairs so the rows print as precision, recall, F-score.
    metrics = (
        ("Precision", scorer.ents_p),
        ("Recall", scorer.ents_r),
        ("F-Score", scorer.ents_f),
    )
    rows = [(name, f"{value:.3f}") for name, value in metrics]
    msg.table(rows)