How to use the spacy.gold.GoldParse function in spaCy

To help you get started, we've selected a few spaCy examples, based on popular ways GoldParse is used in public projects.

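Before the project examples, a quick orientation: GoldParse belongs to the spaCy v2.x API (spaCy v3 removed it in favour of spacy.training.Example). As a minimal sketch, with invented text and character offsets:

import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
doc = nlp.make_doc("Apple is based in Cupertino")
# Entity annotations as (start_char, end_char, label) offsets.
gold = GoldParse(doc, entities=[(0, 5, "ORG"), (18, 27, "GPE")])
print(gold.ner)  # per-token BILUO tags: ['U-ORG', 'O', 'O', 'O', 'U-GPE']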

github explosion / spaCy / tests / parser / test_add_label.py
# Imports added for context (module paths as in spaCy v2 / thinc 7, which this
# test targets):
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps

from spacy.gold import GoldParse
from spacy.tokens import Doc
from spacy.util import fix_random_seed


def _train_parser(parser):
    fix_random_seed(1)
    parser.add_label("left")
    parser.begin_training([], **parser.cfg)
    sgd = Adam(NumpyOps(), 0.001)

    # Run a few updates on a fixed four-token toy parse.
    for i in range(5):
        losses = {}
        doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
        gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"])
        parser.update([doc], [gold], sgd=sgd, losses=losses)
    return parser
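The same Doc-plus-GoldParse pairing works at the pipeline level: in spaCy v2, nlp.update takes parallel lists of docs and golds. A hedged sketch for NER training on a single invented example:

import random

import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("ORG")

# Invented training example: (text, entity offsets).
TRAIN_DATA = [("Apple is hiring engineers", [(0, 5, "ORG")])]

optimizer = nlp.begin_training()
for _ in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, entities in TRAIN_DATA:
        doc = nlp.make_doc(text)
        gold = GoldParse(doc, entities=entities)
        nlp.update([doc], [gold], sgd=optimizer, losses=losses)
    print(losses)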
github icoxfog417 / yans-2019-annotation-hackathon / yans / data / train.py
# Method excerpt: build (and cache) one GoldParse per labelled example.
def get_golds(self, model, force=False):
    if len(self.golds) > 0 and not force:
        return self.golds

    self.golds = []
    for text, annotation in self.label_data:
        doc = model.tokenizer(text)
        gold = GoldParse(doc, entities=annotation["entities"])
        self.golds.append(gold)

    return self.golds
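Here self.label_data is assumed to hold (text, annotation) pairs in spaCy's usual training format, where the annotation dict maps "entities" to character-offset tuples. A hypothetical value:

label_data = [
    ("Acme Corp opened an office in Berlin",
     {"entities": [(0, 9, "ORG"), (30, 36, "GPE")]}),
]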
github ELS-RD / anonymisation / misc / convert_to_bilou.py
def convert_unknown_bilou(doc: Doc, offsets: List[Offset]) -> GoldParse:
    """
    Convert entity offsets to a list of BILOU annotations,
    mapping the UNKNOWN label to spaCy missing values
    https://spacy.io/api/goldparse#biluo_tags_from_offsets
    :param doc: spaCy tokenized text
    :param offsets: discovered offsets
    :return: a GoldParse built from the BILOU annotations
    """
    tuple_offsets = [offset.to_tuple() for offset in offsets]
    bilou_annotations = convert_bilou_with_missing_action(doc=doc, offsets=tuple_offsets)
    return GoldParse(doc, entities=bilou_annotations)
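For reference, the helper the docstring links to, spacy.gold.biluo_tags_from_offsets, performs the offsets-to-BILOU conversion that convert_bilou_with_missing_action presumably wraps. A minimal sketch with invented text:

import spacy
from spacy.gold import GoldParse, biluo_tags_from_offsets

nlp = spacy.blank("en")
doc = nlp.make_doc("Jean Dupont lives in Paris")
offsets = [(0, 11, "PERSON"), (21, 26, "LOC")]
tags = biluo_tags_from_offsets(doc, offsets)
# ['B-PERSON', 'L-PERSON', 'O', 'O', 'U-LOC']
gold = GoldParse(doc, entities=tags)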
github explosion / spaCy / bin / ud / ud_train.py
if "-" in id_:
                    continue
                id_ = int(id_) - 1
                head = int(head) - 1 if head != "0" else id_
                sent["words"].append(word)
                sent["tags"].append(tag)
                sent["morphology"].append(_parse_morph_string(morph))
                sent["morphology"][-1].add("POS_%s" % pos)
                sent["heads"].append(head)
                sent["deps"].append("ROOT" if dep == "root" else dep)
                sent["spaces"].append(space_after == "_")
            sent["entities"] = ["-"] * len(sent["words"])
            sent["heads"], sent["deps"] = projectivize(sent["heads"], sent["deps"])
            if oracle_segments:
                docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
                golds.append(GoldParse(docs[-1], **sent))
                assert golds[-1].morphology is not None

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, sent_annots)
                assert gold.morphology is not None
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
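Each sent dict collects parallel per-token lists whose keys match GoldParse's keyword arguments, which is what makes the GoldParse(docs[-1], **sent) splat above work. Reduced to its core, with an invented two-token sentence:

from spacy.gold import GoldParse
from spacy.tokens import Doc
from spacy.vocab import Vocab

sent = {
    "words": ["She", "sleeps"],
    "tags": ["PRP", "VBZ"],
    "heads": [1, 1],
    "deps": ["nsubj", "ROOT"],
    "entities": ["-", "-"],
}
doc = Doc(Vocab(), words=sent["words"])
gold = GoldParse(doc, **sent)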
github aatimofeev / spacy_russian_tokenizer / evaluate_on_opencorpora.py
    # Excerpt: collect spaCy parses and gold tokenizations from OpenCorpora XML.
    parsed_sentences = []
    gold_sentences = []

    with open(opencorpora_file, "r") as f:
        opencorpora = f.read().encode('utf-8')

    page_tree = html.fromstring(opencorpora)

    for text in page_tree.xpath('//text'):
        for paragraphs in text.xpath('./paragraphs'):
            for paragraph in paragraphs.xpath('./paragraph'):
                for sentence in paragraph.xpath('./sentence'):
                    text = sentence.xpath('./source')[0].text
                    parsed_sentences.append(nlp(text))
                    sent_words = [token.attrib['text'] for token in sentence.xpath('./tokens/token')]
                    gold = GoldParse(Doc(nlp.vocab, words=sent_words), words=sent_words,  # heads=sent_heads,
                                     # tags=sent_tags, deps=sent_deps,
                                     entities=['-'] * len(sent_words))
                    gold_sentences.append(gold)
    return parsed_sentences, gold_sentences
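Since each gold carries only words and dummy entities, the returned pairs presumably feed spaCy's Scorer to measure tokenization rather than parsing. A hedged sketch of that final step, assuming parsed_sentences and gold_sentences come from the function above:

from spacy.scorer import Scorer

scorer = Scorer()
for parsed, gold in zip(parsed_sentences, gold_sentences):
    scorer.score(parsed, gold)
print(scorer.token_acc)  # tokenization accuracy (spaCy v2 Scorer)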
github explosion / spaCy / spacy / cli / ud_train.py
                    continue  # excerpt starts mid-loop; the opening condition is elided
                if '-' in id_:
                    continue
                id_ = int(id_)-1
                head = int(head)-1 if head != '0' else id_
                sent['words'].append(word)
                sent['tags'].append(tag)
                sent['heads'].append(head)
                sent['deps'].append('ROOT' if dep == 'root' else dep)
                sent['spaces'].append(space_after == '_')
            sent['entities'] = ['-'] * len(sent['words'])
            sent['heads'], sent['deps'] = projectivize(sent['heads'],
                                                       sent['deps'])
            if oracle_segments:
                docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
                golds.append(GoldParse(docs[-1], **sent))

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, sent_annots)
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
github explosion / spaCy / bin / parser / nn_train.py
def evaluate(nlp, gold_tuples, gold_preproc=True):
    # Legacy (pre-v2) API: GoldParse is built from a single tuple of aligned
    # annotation fields rather than keyword arguments.
    scorer = Scorer()
    for raw_text, sents in gold_tuples:
        for annot_tuples, brackets in sents:
            tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1])
            nlp.tagger(tokens)
            nlp.parser(tokens)
            gold = GoldParse(tokens, annot_tuples)
            scorer.score(tokens, gold)
    return scorer
github explosion / spaCy / bin / ud / ud_train.py
    # Excerpt from _make_gold(): flatten per-sentence annotations into one doc.
    flat = defaultdict(list)  # (restored from the parallel cli/ud_train.py excerpt below)
    sent_starts = []
    for sent in sent_annots:
        flat["heads"].extend(len(flat["words"])+head for head in sent["heads"])
        for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
            flat[field].extend(sent[field])
        sent_starts.append(True)
        sent_starts.extend([False] * (len(sent["words"]) - 1))
    # Construct text if necessary
    assert len(flat["words"]) == len(flat["spaces"])
    if text is None:
        text = "".join(
            word + " " * space for word, space in zip(flat["words"], flat["spaces"])
        )
    doc = nlp.make_doc(text)
    flat.pop("spaces")
    gold = GoldParse(doc, **flat)
    gold.sent_starts = sent_starts
    for i in range(len(gold.heads)):
        if random.random() < drop_deps:
            gold.heads[i] = None
            gold.labels[i] = None

    return doc, gold
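Setting a head or label to None marks it as a missing value, so the parser gets no gradient for that token; the drop_deps rate above therefore acts as annotation dropout. Illustrated on an invented three-token doc:

from spacy.gold import GoldParse
from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["She", "probably", "sleeps"])
# None marks missing annotation: no supervision for token 1.
gold = GoldParse(doc, heads=[2, None, 2], deps=["nsubj", None, "ROOT"])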
github explosion / spaCy / spacy / cli / ud_train.py
    # Flatten the CoNLL annotations and adjust the head indices.
    flat = defaultdict(list)
    sent_starts = []
    for sent in sent_annots:
        flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])
        for field in ['words', 'tags', 'deps', 'entities', 'spaces']:
            flat[field].extend(sent[field])
        sent_starts.append(True)
        sent_starts.extend([False] * (len(sent['words'])-1))
    # Construct text if necessary
    assert len(flat['words']) == len(flat['spaces'])
    if text is None:
        text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces'])) 
    doc = nlp.make_doc(text)
    flat.pop('spaces')
    gold = GoldParse(doc, **flat)
    gold.sent_starts = sent_starts
    for i in range(len(gold.heads)):
        if random.random() < drop_deps:
            gold.heads[i] = None
            gold.labels[i] = None

    return doc, gold
github aatimofeev / spacy_russian_tokenizer / evaluate_on_syntagrus.py
                    continue  # excerpt starts mid-loop; the opening condition is elided
                id_ = int(id_) - 1
                try:
                    head = int(head) - 1 if head != '0' else id_
                except ValueError:
                    head = id_
                sent_words.append(word)
                sent_tags.append(tag)
                sent_heads.append(head)
                sent_deps.append('ROOT' if dep == 'root' else dep)
            sent_heads, sent_deps = projectivize(sent_heads, sent_deps)
            # Normalize whitespace: collapsing repeated spaces is text cleaning,
            # not tokenization, so it should not count against spaCy here.
            text = re.sub(r'\s+', ' ', text).strip()
            parsed_sentences.append(nlp(text))
            gold = GoldParse(Doc(nlp.vocab, words=sent_words), words=sent_words, heads=sent_heads,
                             tags=sent_tags, deps=sent_deps,
                             entities=['-'] * len(sent_words))
            gold_sentences.append(gold)
            documents[docid].append(text)
            documents_gold_sentences[docid].append(gold)
            gold_segmentation[docid].append([1] + [0] * (len(sent_words) - 1))
    return parsed_sentences, gold_sentences, gold_segmentation, documents, documents_gold_sentences
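projectivize here is presumably spaCy's spacy.syntax.nonproj.projectivize, which rewrites crossing (non-projective) arcs so a transition-based parser can still learn them, returning adjusted (heads, deps). A hedged sketch:

from spacy.syntax.nonproj import projectivize

heads = [1, 1, 1]                      # "She sleeps well": both modifiers attach to "sleeps"
deps = ["nsubj", "ROOT", "advmod"]
proj_heads, proj_deps = projectivize(heads, deps)
# Projective input comes back unchanged; crossing arcs would be re-attached
# and their labels augmented with the lifted head's label (joined by "||").
print(proj_heads, proj_deps)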