How to use the conllu.parse function in conllu

To help you get started, we’ve selected a few conllu.parse examples based on popular ways the library is used in public projects.
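As a quick orientation before the project examples, here is a minimal sketch of the basic call. The two-token sentence is invented for illustration, but parse, TokenList indexing, and dict-style field access are the library's documented API.

from conllu import parse

# Two tab-separated token lines in the standard 10-column CoNLL-U layout;
# the sentence itself is invented for illustration.
data = (
    "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\t_\n"
    "2\tworld\tworld\tNOUN\t_\t_\t1\tvocative\t_\t_\n"
    "\n"
)

sentences = parse(data)       # a list of TokenList objects, one per sentence
token = sentences[0][0]       # tokens behave like dicts keyed by field name
print(token["form"], token["lemma"])  # -> Hello hello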


From GitHub: EmilStenstrom/conllu, tests/test_integration.py
def test_parse_CoNLL2009_1(self):
        data = dedent("""\
            #\tid\tform\tlemma\tplemma\tpos\tppos\tfeats\tpfeats\thead\tphead\tdeprel\tpdeprel\tfillpred\tpred\tapreds
            1\tZ\tz\tz\tR\tR\tSubPOS=R|Cas=2\tSubPOS=R|Cas=2\t10\t10\tAuxP\tAuxP\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_
            2\ttéto\ttento\ttento\tP\tP\tSubPOS=D|Gen=F|Num=S|Cas=2\tSubPOS=D|Gen=F|Num=S|Cas=2\t3\t3\tAtr\tAtr\tY\ttento\t_\tRSTR\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_\t_
            3\tknihy\tkniha\tkniha\tN\tN\tSubPOS=N|Gen=F|Num=S|Cas=2|Neg=A\tSubPOS=N|Gen=F|Num=S|Cas=2|Neg=A\t1\t1\tAdv\tAdv\tY\tkniha\t_\t_\t_\t_\t_\t_\t_\tDIR1\t_\t_\t_\t_\t_\t_\t_\t_

        """)

        sentences = parse(
            data,
            fields=(
                'id', 'form', 'lemma', 'plemma', 'pos', 'ppos', 'feats', 'pfeats',
                'head', 'phead', 'deprel', 'pdeprel', 'fillpred', 'pred', 'apreds'
            ),
            field_parsers={
                "pfeats": lambda line, i: parse_dict_value(line[i]),
                "phead": lambda line, i: parse_int_value(line[i]),
                "apreds": lambda line, i: [
                    apred_field if apred_field != "_" else None
                    for apred_field in line[i:len(line)]
                ],
            },
        )
        self.assertEqual(
            sentences[0][2],
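Each entry in field_parsers is called with the full list of column values for a token line plus the index of the field being parsed, which is why the apreds parser above can slice line[i:len(line)] to gather every remaining column into a single list. Overriding fields and field_parsers like this is how conllu.parse accommodates non-standard layouts such as CoNLL-2009.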
From GitHub: oroszgy/spacy-hungarian-models, src/models/__main__.py
def read_lemmatization_data(path):
    with open(path) as f:
        df = pd.DataFrame(tok for sent in tqdm(conllu.parse(f.read())) for tok in sent)
        X = [(word_class, full_form) for _, (word_class, full_form) in df[["upostag", "form"]].iterrows()]
        y = [lemma for _, (lemma,) in df[["lemma"]].iterrows()]
        return X, y
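Because each token in a parsed TokenList is dict-like, the whole treebank can be flattened into a pandas DataFrame and sliced by column name ("upostag", "form", "lemma") to build feature/target pairs for training a lemmatizer.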
From GitHub: oroszgy/spacy-hungarian-models, src/model_builder/__main__.py
def benchmark_model(model_name, test_data_path, ner_test_data):
    with open(test_data_path) as f:
        data = conllu.parse(f.read())
        text = " ".join(d.metadata["text"] for d in data)

    load_model = getattr(importlib.import_module(model_name), "load")
    nlp = load_model()

    _parsed = StringIO(format_as_conllu(nlp(text), 1))
    parsed = conll17_ud_eval.load_conllu(_parsed)
    gold = conll17_ud_eval.load_conllu_file(test_data_path)

    results = pd.DataFrame(
        {k: v.__dict__ for k, v in conll17_ud_eval.evaluate(gold, parsed).items()}
    ).T

    print(results)

    diterator = DataIterator()
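Note that conllu.parse is used here only to recover the raw sentence text that conllu stores in each TokenList's metadata (from the "# text = ..." comment line); the actual scoring is delegated to the CoNLL 2017 shared task evaluation script.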
From GitHub: oroszgy/spacy-hungarian-models, src/models/__main__.py
def convert_szk_to_conllu(from_glob, to_path, dev_path, test_path):
    ignored = []
    for fpath in [dev_path, test_path]:
        with open(fpath) as f:
            ignored.extend(map(sentence_repr, conllu.parse(f.read())))

    ignored = set(ignored)
    parsed = []
    for fpath in glob.glob(from_glob):
        for sent in conllu.parse("\n\n".join(parse_szk(fpath))):
            if sentence_repr(sent) not in ignored:
                parsed.append(sent)

    print(len(parsed))
    with open(to_path, "w") as outf:
        out = "".join(sent.serialize() for sent in parsed)
        outf.write(out)
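The write-out works because TokenList.serialize() renders a sentence back to CoNLL-U text, separator included, so the plain "".join(...) over the serialized sentences reconstitutes a parseable CoNLL-U file.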
From GitHub: oroszgy/spacy-hungarian-models, src/model_builder/__main__.py
def convert_szk_to_conllu(from_glob, to_path, dev_path, test_path, morph):
    ignored = []
    for fpath in [dev_path, test_path]:
        with open(fpath) as f:
            ignored.extend(map(sentence_repr, conllu.parse(f.read())))

    parser = parse_szk_morph if morph else parse_szk_dep

    ignored = set(ignored)
    parsed = []
    for fpath in glob.glob(from_glob):
        for sent in conllu.parse("\n\n".join(parser(fpath))):
            if sentence_repr(sent) not in ignored:
                parsed.append(sent)

    logging.info("Read {} sentences".format(len(parsed)))
    with open(to_path, "w") as outf:
        out = "".join(sent.serialize() for sent in parsed)
        outf.write(out)
From GitHub: proycon/folia, foliatools/conllu2folia.py
    parser.add_argument('--pos-set',dest="posset",type=str,help="URL of the set definition for *language-specific* part-of-speech and features (xpos and not the universal pos!)", action='store',default="undefined",required=False)
    parser.add_argument('--dependency-set',dest="depset", type=str,help="Dependency set", action='store',default=UDEP_SET, required=False)
    parser.add_argument('-o', '--outputdir',type=str,help="Output directory", action='store',default=".", required=False)
    parser.add_argument('files', nargs='+', help='CONLL-U input files')
    args = parser.parse_args()


    for file in args.files:
        if args.id:
            doc_id = args.id
        else:
            doc_id = os.path.basename(file)
        doc = None
        hascontent = False
        with open(file,'r',encoding='utf-8') as f:
            sentences = conllu.parse(f.read())
            for i, tokenlist in enumerate(sentences):
                if 'newdoc id' in tokenlist.metadata or i == 0:
                    if doc is not None and hascontent:
                        doc.save(os.path.join(args.outputdir, doc_id + ".folia.xml"))
                        print("Wrote " + doc_id + ".folia.xml",file=sys.stderr)
                    if 'newdoc id' in tokenlist.metadata:
                        doc_id = tokenlist.metadata['newdoc id']
                    hascontent = False
                    doc = folia.Document(id=doc_id)
                    doc.declare(folia.PosAnnotation, set=UPOS_SET, annotator="conll2folia")
                    doc.declare(folia.PosAnnotation, set=args.posset, annotator="conll2folia")
                    doc.declare(folia.Dependency, set=args.depset, annotator="conll2folia")
                    doc.declare(folia.LemmaAnnotation, set=args.lemmaset, annotator="conll2folia")
                    textbody = folia.Text(doc, id=doc_id+'.text')
                    doc.append(textbody)
                    anchor = textbody
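The converter relies on conllu exposing comment lines such as "# newdoc id = ..." through TokenList.metadata, which is what lets it detect document boundaries and start a new FoLiA document at the right sentence.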