How to use the conllu.parser.parse_token_and_metadata function in conllu

To help you get started, we’ve selected a few conllu examples based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_custom_metadata_parsers(self):
        data = dedent("""\
            # global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
            # newdoc id = mf920901-001
            # newpar id = mf920901-001-p1
            # sent_id = mf920901-001-p1s1A
            # text = Slovenská ústava: pro i proti
            # text_en = Slovak constitution: pros and cons
        """)
        _, metadata = parse_token_and_metadata(data)
        self.assertEqual(metadata, Token([
            ("global.columns", "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"),
            ("newdoc id", "mf920901-001"),
            ("newpar id", "mf920901-001-p1"),
            ("sent_id", "mf920901-001-p1s1A"),
            ("text", "Slovenská ústava: pro i proti"),
            ("text_en", "Slovak constitution: pros and cons"),
        ]))

        _, metadata = parse_token_and_metadata(
            data,
            metadata_parsers={"global.columns": lambda key, value: (key, value.split())}
        )
        self.assertEqual(metadata, Token([
            ("global.columns", ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]),
            ("newdoc id", "mf920901-001"),
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
# newpar id = mf920901-001-p1
            # sent_id = mf920901-001-p1s1A
            # text = Slovenská ústava: pro i proti
            # text_en = Slovak constitution: pros and cons
        """)
        _, metadata = parse_token_and_metadata(data)
        self.assertEqual(metadata, Token([
            ("global.columns", "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"),
            ("newdoc id", "mf920901-001"),
            ("newpar id", "mf920901-001-p1"),
            ("sent_id", "mf920901-001-p1s1A"),
            ("text", "Slovenská ústava: pro i proti"),
            ("text_en", "Slovak constitution: pros and cons"),
        ]))

        _, metadata = parse_token_and_metadata(
            data,
            metadata_parsers={"global.columns": lambda key, value: (key, value.split())}
        )
        self.assertEqual(metadata, Token([
            ("global.columns", ["ID", "FORM", "LEMMA", "UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"]),
            ("newdoc id", "mf920901-001"),
            ("newpar id", "mf920901-001-p1"),
            ("sent_id", "mf920901-001-p1s1A"),
            ("text", "Slovenská ústava: pro i proti"),
            ("text_en", "Slovak constitution: pros and cons"),
        ]))
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_metadata(self):
        """Round-trip check: metadata comments and a token survive parse + serialize."""
        source = dedent("""\
            # newdoc
            # data = meta
            # meta = data
            1\tdog

        """)
        parsed = TokenList(*parse_token_and_metadata(source))
        # Serializing the parsed result must reproduce the input exactly.
        self.assertEqual(serialize(parsed), source)
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_empty(self):
        """Parsing None (no data at all) must raise ParseException."""
        self.assertRaises(ParseException, parse_token_and_metadata, None)
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_invalid_metadata(self):
        """Malformed comment lines are dropped; a repeated key keeps its last value."""
        source = dedent("""\
            # meta = data2
            # meta = data
            # newdoc
            # newpar
            # meta
            # = data
        """)
        _, parsed_metadata = parse_token_and_metadata(source)
        # "meta" appears twice (data2, then data): the later value wins.
        # Bare "# meta" and "# = data" lines are discarded entirely.
        expected = Token([
            ("meta", "data"),
            ("newdoc", None),
            ("newpar", None),
        ])
        self.assertEqual(parsed_metadata, expected)
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_serialize_tricky_fields(self):
        """A token line with pipe-joined FEATS and a MISC value round-trips."""
        line = dedent("""\
            5\tjumps\tjump\tVERB\tVBZ\tMood=Ind|Number=Sing\t0\troot\t_\tSpaceAfter=No
        """)
        result = serialize(TokenList(*parse_token_and_metadata(line)))
        self.assertEqual(result.strip(), line.strip())
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_newlines_in_sentence(self):
        """A blank line inside the data does not split the resulting token list."""
        source = dedent("""\
            # meta = data
            1\thej
            2\tdå

            3\thej
            4\tdå
        """)
        tokens, metadata = parse_token_and_metadata(source)
        # All four tokens come back as one flat list, ids 1..4 in order.
        expected_tokens = [
            Token([("id", index), ("form", word)])
            for index, word in enumerate(["hej", "då", "hej", "då"], start=1)
        ]
        self.assertListEqual(tokens, expected_tokens)
        self.assertEqual(metadata, Token([("meta", "data")]))
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_custom_fields(self):
        """The fields tuple controls how many columns are read and what they're named."""
        source = dedent("""\
            1\t1\t1
            2\t2\t2
        """)
        tokens, _ = parse_token_and_metadata(source, fields=("id", "id", "id"))
        # Each row yields three identically-named "id" entries with the row's value.
        expected = [Token([("id", value)] * 3) for value in (1, 2)]
        self.assertEqual(tokens, expected)
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
1\tbackwards\tline
            2\tparis\tsirap
        """)
        fields = ("id", "backwards")

        # A field parser that takes all remaining field, reverses their letters and joins them
        def parse_backwards(value):
            return " ".join([part[::-1] for part in value])

        # This overrides the default parsers, so the id is parsed as a string
        field_parsers = {
            "id": lambda line, i: line[i],
            "backwards": lambda line, i: parse_backwards(line[i:len(line)])
        }

        tokens, _ = parse_token_and_metadata(data, fields=fields, field_parsers=field_parsers)
        self.assertEqual(tokens, [
            Token([("id", '1'), ("backwards", "sdrawkcab enil")]),
            Token([("id", '2'), ("backwards", "sirap paris")]),
        ])
github EmilStenstrom / conllu / conllu / __init__.py View on Github external
def parse_incr(in_file, fields=None, field_parsers=None, metadata_parsers=None):
    """Incrementally parse CoNLL-U data from an opened file, one sentence at a time.

    Yields a TokenList per sentence so arbitrarily large files can be
    streamed without loading everything into memory.

    Raises FileNotFoundError when in_file is not file-like (has no
    ``read`` attribute). When no ``fields`` are supplied, they are
    obtained from parse_conllu_plus_fields (presumably detected from the
    file's header — confirm against that helper).
    """
    # Guard clause: reject anything that is not an opened, readable file.
    if not hasattr(in_file, "read"):
        raise FileNotFoundError("Invalid file, 'parse_incr' needs an opened file as input")

    active_fields = fields or parse_conllu_plus_fields(in_file, metadata_parsers=metadata_parsers)

    for raw_sentence in parse_sentences(in_file):
        parsed = parse_token_and_metadata(
            raw_sentence,
            fields=active_fields,
            field_parsers=field_parsers,
            metadata_parsers=metadata_parsers,
        )
        yield TokenList(*parsed)