How to use conllu - 10 common examples

To help you get started, we’ve selected a few conllu examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.

github EmilStenstrom / conllu / tests / test_integration.py View on Github external
5\tregnante\tregno\tt\tt\tt|-|s|p|p|a|m|b|-\t0\tADV\t_\t_

        """)

        sentences = parse(
            data,
            fields=(
                'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc'
            ),
            field_parsers={
                "feats": lambda line, i: [feat for feat in line[i].split("|")]
            }
        )
        self.assertEqual(
            sentences[0][4],
            Token([
                ('id', 5),
                ('form', 'regnante'),
                ('lemma', 'regno'),
                ('upostag', 't'),
                ('xpostag', 't'),
                ('feats', ['t', '-', 's', 'p', 'p', 'a', 'm', 'b', '-']),
                ('head', 0),
                ('deprel', 'ADV'),
                ('deps', None),
                ('misc', None),
            ])
        )
        self.assertEqual(
            sentences[0].metadata,
            Token([
                ('id', "'1'-document_id='36:1047'-span='1'")
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_custom_metadata_parsers(self):
        data = dedent("""\
            # global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
            # newdoc id = mf920901-001
            # newpar id = mf920901-001-p1
            # sent_id = mf920901-001-p1s1A
            # text = Slovenská ústava: pro i proti
            # text_en = Slovak constitution: pros and cons
        """)
        _, metadata = parse_token_and_metadata(data)
        self.assertEqual(metadata, Token([
            ("global.columns", "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"),
            ("newdoc id", "mf920901-001"),
            ("newpar id", "mf920901-001-p1"),
            ("sent_id", "mf920901-001-p1s1A"),
            ("text", "Slovenská ústava: pro i proti"),
            ("text_en", "Slovak constitution: pros and cons"),
        ]))

        _, metadata = parse_token_and_metadata(
            data,
            metadata_parsers={"global.columns": lambda key, value: (key, value.split())}
        )
        self.assertEqual(metadata, Token([
            ("global.columns", ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]),
            ("newdoc id", "mf920901-001"),
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
# newpar id = mf920901-001-p1
            # sent_id = mf920901-001-p1s1A
            # text = Slovenská ústava: pro i proti
            # text_en = Slovak constitution: pros and cons
        """)
        _, metadata = parse_token_and_metadata(data)
        self.assertEqual(metadata, Token([
            ("global.columns", "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"),
            ("newdoc id", "mf920901-001"),
            ("newpar id", "mf920901-001-p1"),
            ("sent_id", "mf920901-001-p1s1A"),
            ("text", "Slovenská ústava: pro i proti"),
            ("text_en", "Slovak constitution: pros and cons"),
        ]))

        _, metadata = parse_token_and_metadata(
            data,
            metadata_parsers={"global.columns": lambda key, value: (key, value.split())}
        )
        self.assertEqual(metadata, Token([
            ("global.columns", ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]),
            ("newdoc id", "mf920901-001"),
            ("newpar id", "mf920901-001-p1"),
            ("sent_id", "mf920901-001-p1s1A"),
            ("text", "Slovenská ústava: pro i proti"),
            ("text_en", "Slovak constitution: pros and cons"),
        ]))
github EmilStenstrom / conllu / tests / test_models.py View on Github external
def test_deep_filtering(self):
        tokenlist = TokenList([
            {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
            {"form": "quick", "feats": Token([('Degree', 'Pos')])},
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            {"form": "fox", "feats": Token([('Number', 'Sing')])},
        ])
        self.assertEqual(
            tokenlist.filter(feats__Degree="Pos"),
            TokenList([
                {"form": "quick", "feats": Token([('Degree', 'Pos')])},
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown", feats__Degree="Pos"),
            TokenList([
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown", feats__Degree="Pos", id=1),
            TokenList([])
        )
        self.assertEqual(
            tokenlist.filter(unknown__property__value="undefined"),
            TokenList([])
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
""")
        fields = ("id", "backwards")

        # A field parser that takes all remaining field, reverses their letters and joins them
        def parse_backwards(value):
            return " ".join([part[::-1] for part in value])

        # This overrides the default parsers, so the id is parsed as a string
        field_parsers = {
            "id": lambda line, i: line[i],
            "backwards": lambda line, i: parse_backwards(line[i:len(line)])
        }

        tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
        self.assertEqual(tokens, [
            Token([("id", '1'), ("backwards", "sdrawkcab enil")]),
            Token([("id", '2'), ("backwards", "sirap paris")]),
        ])
github EmilStenstrom / conllu / tests / test_models.py View on Github external
def test_simple_tree(self):
    """Converting a two-token sentence to a tree nests the dependent under its head."""
    # Sentence "a dog": "dog" (id 2) attaches to the artificial root (head 0),
    # "a" (id 1) attaches to "dog" (head 2). Order in the list is not id order.
    sentence = TokenList([
        Token([("id", 2), ("form", "dog"), ("head", 0)]),
        Token([("id", 1), ("form", "a"), ("head", 2)]),
    ])

    # Expected shape: "dog" at the top with "a" as its only child.
    leaf = TokenTree(
        token=Token([("id", 1), ("form", "a"), ("head", 2)]),
        children=[],
    )
    expected = TokenTree(
        token=Token([("id", 2), ("form", "dog"), ("head", 0)]),
        children=[leaf],
    )

    self.assertTreeEqual(sentence.to_tree(), expected)
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_ordered_dict(self):
    """serialize_field renders a Token mapping in CoNLL-U key=value form."""
    # An empty Token serializes to the empty string.
    self.assertEqual(serialize_field(Token()), "")

    # A single pair becomes "key=value".
    single = Token([('SpaceAfter', 'No')])
    self.assertEqual(serialize_field(single), "SpaceAfter=No")

    # A None value is rendered with the CoNLL-U placeholder "_".
    missing = Token([('Translit', None)])
    self.assertEqual(serialize_field(missing), "Translit=_")
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_invalid_metadata(self):
    """Malformed or duplicate metadata comment lines are handled gracefully."""
    # Mix of duplicate keys, value-less markers, and a key-less line.
    data = dedent("""\
        # meta = data2
        # meta = data
        # newdoc
        # newpar
        # meta
        # = data
    """)

    _, metadata = parse_token_and_metadata(data)

    # Per the expected output: a duplicated key keeps a single entry,
    # "# newdoc"/"# newpar" survive with no value, and the key-less
    # "# = data" line is dropped entirely.
    expected = Token([
        ("meta", "data"),
        ("newdoc", None),
        ("newpar", None),
    ])
    self.assertEqual(metadata, expected)
github EmilStenstrom / conllu / tests / test_models.py View on Github external
def test_xupos_to_xupostag(self):
    """Token exposes "xpos"/"xpostag" and "upos"/"upostag" as interchangeable keys."""
    token = Token({"id": 1, "xpos": "DT", "upos": "DET"})
    # Reading either spelling of each alias pair yields the same stored value.
    for keys, value in ((("xpos", "xpostag"), "DT"), (("upos", "upostag"), "DET")):
        for key in keys:
            self.assertEqual(token[key], value)
github EmilStenstrom / conllu / tests / test_integration.py View on Github external
def test_parse_tree(self):
        sentences = parse_tree(data)
        self.assertEqual(len(sentences), 1)

        root = sentences[0]
        self.assertEqual(text(root), "TokenTree")

        self.assertEqual(
            root.token,
            Token([
                ('id', 5),
                ('form', 'jumps'),
                ('lemma', 'jump'),
                ('upos', 'VERB'),
                ('xpos', 'VBZ'),
                ('feats', Token([
                    ("Mood", "Ind"),
                    ("Number", "Sing"),
                    ("Person", "3"),
                    ("Tense", "Pres"),
                    ("VerbForm", "Fin"),
                ])),
                ('head', 0),
                ('deprel', 'root'),
                ('deps', None),
                ('misc', None)