How to use the conllu.parser.DEFAULT_FIELDS constant in conllu

To help you get started, we’ve selected a few conllu examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_parse_line_fewer_columns(self):
        """A line with only the first five columns should parse into exactly those Token fields."""
        truncated = "\t".join(["1", "The", "the", "DET", "DT"])
        expected = Token([
            ("id", 1),
            ("form", "The"),
            ("lemma", "the"),
            ("upos", "DET"),
            ("xpos", "DT"),
        ])
        self.assertEqual(parse_line(truncated, fields=DEFAULT_FIELDS), expected)
github EmilStenstrom / conllu / tests / test_parser.py View on Github external
def test_empty(self):
        """A non-numeric id column must make parse_line raise ParseException."""
        bad_line = "invalid_id\t_\t_\t_\t_\t_\t_\t_\t_\t"
        with self.assertRaises(ParseException) as ctx:
            parse_line(bad_line, fields=DEFAULT_FIELDS)

        # Only the message prefix is pinned; the tail may vary.
        prefix = "Failed parsing field 'id'"
        self.assertEqual(str(ctx.exception)[:len(prefix)], prefix)
github DreamerDeo / HIT-SCIR-CoNLL2019 / utils / transition_sdp_reader.py View on Github external
def extract_token_info_from_companion_data(self):
        """Parse the companion CoNLL-U rows and collect per-token attributes.

        Returns a dict with parallel lists of forms, lemmas, UPOS tags, and
        (start, end) character spans taken from the first `misc` value.
        """
        annotation = [
            parse_line("\t".join(columns), DEFAULT_FIELDS)
            for columns in self.companion
        ]

        tokens = [entry["form"] for entry in annotation if entry["form"] is not None]
        lemmas = [entry["lemma"] for entry in annotation if entry["lemma"] is not None]
        pos_tags = [entry["upostag"] for entry in annotation if entry["upostag"] is not None]
        # First misc value encodes the character span as "start:end".
        token_range = [
            tuple(int(part) for part in list(entry["misc"].values())[0].split(":"))
            for entry in annotation
        ]

        return {
            "tokens": tokens,
            "lemmas": lemmas,
            "pos_tags": pos_tags,
            "token_range": token_range,
        }
github DreamerDeo / HIT-SCIR-CoNLL2019 / utils / transition_eds_reader.py View on Github external
def extract_token_info_from_companion_data(self):
        """Read the companion CoNLL-U lines and gather token-level info.

        Produces forms, lemmas, UPOS tags (skipping None entries) and the
        (start, end) character offsets parsed from each token's misc column.
        """
        annotation = []
        for columns in self.companion:
            row = "\t".join(columns)
            annotation.append(parse_line(row, DEFAULT_FIELDS))

        def _present(key):
            # Keep only entries where the requested field is set.
            return [entry[key] for entry in annotation if entry[key] is not None]

        tokens = _present("form")
        lemmas = _present("lemma")
        pos_tags = _present("upostag")
        # The leading misc value holds the span formatted "start:end".
        token_range = [
            tuple(map(int, list(entry["misc"].values())[0].split(":")))
            for entry in annotation
        ]

        return {"tokens": tokens,
                "lemmas": lemmas,
                "pos_tags": pos_tags,
                "token_range": token_range}
github plasticityai / magnitude / pymagnitude / third_party / allennlp / data / dataset_readers / universal_dependencies.py View on Github external
def lazy_parse(text, fields=DEFAULT_FIELDS):
    """Lazily yield one list of parsed token dicts per sentence.

    Sentences are blank-line separated; comment lines (leading '#') and
    empty lines are skipped.
    """
    for sentence in text.split(u"\n\n"):
        if not sentence:
            continue
        parsed_rows = []
        for row in sentence.split(u"\n"):
            if row and not row.strip().startswith(u"#"):
                parsed_rows.append(parse_line(row, fields))
        yield parsed_rows
github DreamerDeo / HIT-SCIR-CoNLL2019 / utils / transition_ucca_reader.py View on Github external
def extract_token_info_from_companion_data(self):
        """Collect token attributes from the companion CoNLL-U annotation.

        Returns forms, lemmas, UPOS tags (None values dropped) and each
        token's (start, end) character range from its misc column.
        """
        annotation = []
        for raw_columns in self.companion:
            annotation.append(parse_line("\t".join(raw_columns), DEFAULT_FIELDS))

        tokens, lemmas, pos_tags, token_range = [], [], [], []
        for entry in annotation:
            if entry["form"] is not None:
                tokens.append(entry["form"])
            if entry["lemma"] is not None:
                lemmas.append(entry["lemma"])
            if entry["upostag"] is not None:
                pos_tags.append(entry["upostag"])
            # First misc value is the "start:end" character span.
            span_text = list(entry["misc"].values())[0]
            token_range.append(tuple(int(piece) for piece in span_text.split(":")))

        return {"tokens": tokens,
                "lemmas": lemmas,
                "pos_tags": pos_tags,
                "token_range": token_range}
github allenai / scispacy / scispacy / spacy_convert.py View on Github external
def _lazy_parse(text: str, fields=DEFAULT_FIELDS):
    """
    Reads conllu annotations, yielding unwieldy OrderedDict-like
    objects per sentence.
    """
    sentences = (chunk for chunk in text.split("\n\n") if chunk)
    for sentence in sentences:
        rows = sentence.split("\n")
        yield [
            parse_line(row, fields)
            for row in rows
            if row and not row.strip().startswith("#")
        ]