How to use the spacy.util.minibatch function in spaCy

To help you get started, we've selected a few spaCy examples based on popular ways spacy.util.minibatch is used in public projects.
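
Before the longer excerpts below, here is a minimal, self-contained sketch of the usual pattern: shuffle the training data once per epoch, let minibatch slice it into lists, and feed each batch to nlp.update. The toy texts, labels and hyperparameters are invented for illustration, and the code follows the spaCy v2 API used throughout these examples.

import random
import spacy
from spacy.util import minibatch

nlp = spacy.blank("en")
textcat = nlp.create_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.add_pipe(textcat)

# Toy (text, annotations) pairs in the format expected by nlp.update in spaCy v2.
train_data = [
    ("This is great", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("This is awful", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
] * 8

optimizer = nlp.begin_training()
for epoch in range(3):
    random.shuffle(train_data)
    losses = {}
    # minibatch yields lists of up to 4 items each; the last batch may be smaller.
    for batch in minibatch(train_data, size=4):
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
    print(epoch, losses)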

Example: JulianGerhard21 / bert_spacy_rasa / bert_finetuner_splitset.py (View on GitHub)
    optimizer.L2 = 0.0
    learn_rates = cyclic_triangular_rate(
        learn_rate / 3, learn_rate * 3, 2 * len(train_data) // batch_size
    )

    pbar = tqdm.tqdm(total=100, leave=False)
    results = []
    epoch = 0
    step = 0
    eval_every = 100
    patience = 3
    while True:
        # Train and evaluate
        losses = Counter()
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_size)
        for batch in batches:
            optimizer.trf_lr = next(learn_rates)
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.1, losses=losses)
            pbar.update(1)
            if step and (step % eval_every) == 0:
                pbar.close()
                with nlp.use_params(optimizer.averages):
                    scores = evaluate_multiclass(nlp, eval_texts, eval_cats)
                results.append((scores["textcat_acc"], step, epoch))
                print(
                    "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                        losses["trf_textcat"],
                        scores["textcat_acc"],
                        scores["textcat_cor"],
                        scores["textcat_wrg"],

Example: explosion / spaCy / spacy / cli / converters / jsonl2json.py (View on GitHub)
def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
    if lang is None:
        raise ValueError("No --lang specified, but tokenization required")
    json_docs = []
    input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
    nlp = get_lang_class(lang)()
    sentencizer = nlp.create_pipe("sentencizer")
    for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
        docs = []
        for record in batch:
            raw_text = record["text"]
            if "entities" in record:
                ents = record["entities"]
            else:
                ents = record["spans"]
            ents = [(e["start"], e["end"], e["label"]) for e in ents]
            doc = nlp.make_doc(raw_text)
            sentencizer(doc)
            spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
            doc.ents = _cleanup_spans(spans)
            docs.append(doc)
        json_docs.append(docs_to_json(docs, id=i))
    return json_docs

Example: explosion / spaCy / spacy / cli / converters / iob2json.py (View on GitHub)
def merge_sentences(docs, n_sents):
    merged = []
    for group in minibatch(docs, size=n_sents):
        group = list(group)
        first = group.pop(0)
        to_extend = first["paragraphs"][0]["sentences"]
        for sent in group:
            to_extend.extend(sent["paragraphs"][0]["sentences"])
        merged.append(first)
    return merged

Example: explosion / spaCy / bin / wiki_entity_linking / wikidata_train_entity_linker.py (View on GitHub)
    with nlp.disable_pipes(*other_pipes):  # only train Entity Linking
        optimizer = nlp.begin_training()
        optimizer.learn_rate = lr
        optimizer.L2 = l2

    logger.info("Training on {} articles".format(len(train_data)))
    logger.info("Dev testing on {} articles".format(len(dev_data)))

    # baseline performance on dev data
    logger.info("Dev Baseline Accuracies:")
    measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)

    for itn in range(epochs):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
        batchnr = 0

        with nlp.disable_pipes(*other_pipes):
            for batch in batches:
                try:
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs=docs,
                        golds=golds,
                        sgd=optimizer,
                        drop=dropout,
                        losses=losses,
                    )
                    batchnr += 1
                except Exception as e:
                    logger.error("Error updating batch:" + str(e))

Example: honnibal / spacy-pretrain-polyaxon / lmao-imdb-1k / pretrain_textcat.py (View on GitHub)
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        # Params aren't passed in properly in spaCy :(. Work around the bug.
        optimizer = nlp.begin_training()
        configure_optimizer(optimizer, opt_params)
        if init_tok2vec is not None:
            with Path(init_tok2vec).open('rb') as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for i in range(n_iter):
            losses = {"textcat": 0.0}
            if USE_TQDM:
                # If we're using the CLI, a progress bar is nice.
                train_data = tqdm.tqdm(train_data, leave=False)
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=batch_size)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts, annotations, sgd=optimizer, drop=dropout, losses=losses
                )
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            best_acc = max(best_acc, scores["acc"])
            report_progress(i, best_acc, losses, scores)
            should_stop = early_stopping.update(scores)
            if should_stop:
                break

Example: explosion / spaCy / examples / training / pretrain_textcat.py (View on GitHub)
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        textcat.model.tok2vec.from_bytes(tok2vec_weights)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for i in range(n_iter):
            losses = {"textcat": 0.0}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(tqdm.tqdm(train_data), size=2)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )

Example: explosion / spacy-transformers / examples / tasks / run_glue.py (View on GitHub)
def evaluate(nlp, task, docs_golds):
    tok2vec = nlp.get_pipe(PIPES.tok2vec)
    textcat = nlp.get_pipe(PIPES.textcat)
    right = 0
    total = 0
    guesses = []
    truths = []
    labels = textcat.labels
    for batch in minibatch(docs_golds, size=HP.eval_batch_size):
        docs, golds = zip(*batch)
        docs = list(textcat.pipe(tok2vec.pipe(docs)))
        for doc, gold in zip(docs, golds):
            guess, _ = max(doc.cats.items(), key=lambda it: it[1])
            truth, _ = max(gold.cats.items(), key=lambda it: it[1])
            if guess not in labels:
                msg = (
                    f"Unexpected label {guess} predicted. "
                    f"Expectded labels: {', '.join(labels)}"
                )
                raise ValueError(msg)
            if truth not in labels:
                msg = (
                    f"Unexpected label {truth} predicted. "
                    f"Expectded labels: {', '.join(labels)}"
                )

Example: nabeel-oz / qlik-py-tools / core / _spacy.py (View on GitHub)
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        with nlp.disable_pipes(*other_pipes):  # only train NER
           
            # Setup lists to store the loss for each epoch
            self.losses_train = []
            self.losses_test = []
            
            # reset and initialize the weights randomly – but only if we're
            # training a new model
            if self.blank:
                nlp.begin_training()
            for epoch in range(self.epochs): 
                random.shuffle(self.train)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(self.train, size=self.batch_size)
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(
                        texts,  # batch of texts
                        annotations,  # batch of annotations
                        drop=self.drop,  # dropout - make it harder to memorise data
                        losses=losses,
                    )
                # Store loss for the epoch to a list
                self.losses_train.append(('Epoch {}'.format(epoch+1), losses['ner']))

                # Debug information is printed to the terminal and logs if the parameter debug = true
                if self.debug:
                    self._print_log(8)
                
                # If a test dataset is available, calculate losses for it as well

Example: explosion / spaCy / examples / training / train_parser.py (View on GitHub)
parser = nlp.get_pipe("parser")

    # add labels to the parser
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

Example: explosion / spaCy / spacy / cli / train.py (View on GitHub)
msg.row(["-" * width for width in row_settings["widths"]], **row_settings)
    try:
        iter_since_best = 0
        best_score = 0.0
        for i in range(n_iter):
            train_docs = corpus.train_docs(
                nlp,
                noise_level=noise_level,
                orth_variant_level=orth_variant_level,
                gold_preproc=gold_preproc,
                max_length=0,
                ignore_misaligned=True,
            )
            if raw_text:
                random.shuffle(raw_text)
                raw_batches = util.minibatch(
                    (nlp.make_doc(rt["text"]) for rt in raw_text), size=8
                )
            words_seen = 0
            with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
                losses = {}
                for batch in util.minibatch_by_words(train_docs, size=batch_sizes):
                    if not batch:
                        continue
                    docs, golds = zip(*batch)
                    nlp.update(
                        docs,
                        golds,
                        sgd=optimizer,
                        drop=next(dropout_rates),
                        losses=losses,
                    )