How to use the conllu.models.Token class in conllu

To help you get started, we’ve selected a few conllu examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github EmilStenstrom / conllu / tests / test_integration.py View on Github external
5\tregnante\tregno\tt\tt\tt|-|s|p|p|a|m|b|-\t0\tADV\t_\t_

        """)

        sentences = parse(
            data,
            fields=(
                'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc'
            ),
            field_parsers={
                "feats": lambda line, i: [feat for feat in line[i].split("|")]
            }
        )
        self.assertEqual(
            sentences[0][4],
            Token([
                ('id', 5),
                ('form', 'regnante'),
                ('lemma', 'regno'),
                ('upostag', 't'),
                ('xpostag', 't'),
                ('feats', ['t', '-', 's', 'p', 'p', 'a', 'm', 'b', '-']),
                ('head', 0),
                ('deprel', 'ADV'),
                ('deps', None),
                ('misc', None),
            ])
        )
        self.assertEqual(
            sentences[0].metadata,
            Token([
                ('id', "'1'-document_id='36:1047'-span='1'")
github EmilStenstrom / conllu / tests / test_models.py View on Github external
def test_deep_filtering(self):
        tokenlist = TokenList([
            {"form": "The", "feats": Token([('Definite', 'Def'), ('PronType', 'Art')])},
            {"form": "quick", "feats": Token([('Degree', 'Pos')])},
            {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            {"form": "fox", "feats": Token([('Number', 'Sing')])},
        ])
        self.assertEqual(
            tokenlist.filter(feats__Degree="Pos"),
            TokenList([
                {"form": "quick", "feats": Token([('Degree', 'Pos')])},
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown", feats__Degree="Pos"),
            TokenList([
                {"form": "brown", "feats": Token([('Degree', 'Pos')])},
            ])
        )
        self.assertEqual(
            tokenlist.filter(form="brown", feats__Degree="Pos", id=1),
            TokenList([])
        )
        self.assertEqual(
            tokenlist.filter(unknown__property__value="undefined"),
            TokenList([])
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
""")
        fields = ("id", "backwards")

        # A field parser that takes all remaining field, reverses their letters and joins them
        def parse_backwards(value):
            return " ".join([part[::-1] for part in value])

        # This overrides the default parsers, so the id is parsed as a string
        field_parsers = {
            "id": lambda line, i: line[i],
            "backwards": lambda line, i: parse_backwards(line[i:len(line)])
        }

        tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
        self.assertEqual(tokens, [
            Token([("id", '1'), ("backwards", "sdrawkcab enil")]),
            Token([("id", '2'), ("backwards", "sirap paris")]),
        ])
github EmilStenstrom / conllu / tests / test_models.py View on Github external
def test_simple_tree(self):
        """A two-token list (head=0 root, one dependent) converts to a one-child tree."""
        # Note the tokens are listed out of id order; to_tree must still link id 1 under id 2.
        tokenlist = TokenList([
            Token([("id", 2), ("form", "dog"), ("head", 0)]),
            Token([("id", 1), ("form", "a"), ("head", 2)]),
        ])

        child = TokenTree(
            token=Token([("id", 1), ("form", "a"), ("head", 2)]),
            children=[],
        )
        expected = TokenTree(
            token=Token([("id", 2), ("form", "dog"), ("head", 0)]),
            children=[child],
        )

        self.assertTreeEqual(tokenlist.to_tree(), expected)
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_ordered_dict(self):
        """serialize_field renders Token mappings: empty, a set pair, and a None value."""
        # An empty Token serializes to the empty string.
        empty = Token()
        self.assertEqual(serialize_field(empty), "")

        # A single key/value pair serializes as key=value.
        single_pair = Token([('SpaceAfter', 'No')])
        self.assertEqual(serialize_field(single_pair), "SpaceAfter=No")

        # A None value serializes with the CoNLL-U placeholder underscore.
        none_value = Token([('Translit', None)])
        self.assertEqual(serialize_field(none_value), "Translit=_")
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_invalid_metadata(self):
        """Malformed metadata comment lines are dropped; a repeated key keeps the later value."""
        data = dedent("""\
            # meta = data2
            # meta = data
            # newdoc
            # newpar
            # meta
            # = data
        """)

        # "# meta" (no value) and "# = data" (no key) are discarded entirely;
        # the second "meta" line overwrites the first; bare markers keep None.
        expected = Token([
            ("meta", "data"),
            ("newdoc", None),
            ("newpar", None),
        ])

        _, metadata = parse_token_and_metadata(data)
        self.assertEqual(metadata, expected)
github EmilStenstrom / conllu / tests / test_models.py View on Github external
def test_xupos_to_xupostag(self):
        """xpos/upos keys are readable under both short and *tag alias names."""
        token = Token({"id": 1, "xpos": "DT", "upos": "DET"})
        # Each value must be reachable via the key it was stored under and its alias.
        for key, expected in (
            ("xpos", "DT"),
            ("xpostag", "DT"),
            ("upos", "DET"),
            ("upostag", "DET"),
        ):
            self.assertEqual(token[key], expected)
github EmilStenstrom / conllu / tests / test_integration.py View on Github external
def test_parse_tree(self):
        sentences = parse_tree(data)
        self.assertEqual(len(sentences), 1)

        root = sentences[0]
        self.assertEqual(text(root), "TokenTree")

        self.assertEqual(
            root.token,
            Token([
                ('id', 5),
                ('form', 'jumps'),
                ('lemma', 'jump'),
                ('upos', 'VERB'),
                ('xpos', 'VBZ'),
                ('feats', Token([
                    ("Mood", "Ind"),
                    ("Number", "Sing"),
                    ("Person", "3"),
                    ("Tense", "Pres"),
                    ("VerbForm", "Fin"),
                ])),
                ('head', 0),
                ('deprel', 'root'),
                ('deps', None),
                ('misc', None)
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_parse_line_two_spaces(self):
        """A line whose columns are separated by double spaces (not tabs) still parses."""
        line = "1  The  the  DET  DT  Definite=Def|PronType=Art  4  det  _  _"
        # Only the two requested fields should come back; id is coerced to int.
        expected = Token([
            ('id', 1),
            ('form', 'The'),
        ])
        self.assertEqual(parse_line(line, fields=["id", "form"]), expected)
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
data = dedent("""\
            # meta = data
            1\thej
            2\tdå

            3\thej
            4\tdå
        """)
        tokens, metadata = parse_token_and_metadata(data)
        self.assertListEqual(tokens, [
            Token([("id", 1), ("form", "hej")]),
            Token([("id", 2), ("form", "då")]),
            Token([("id", 3), ("form", "hej")]),
            Token([("id", 4), ("form", "då")]),
        ])
        self.assertEqual(metadata, Token([("meta", "data")]))