How to use the conllu.parse_incr function in conllu

To help you get started, we’ve selected a few conllu.parse_incr examples, based on popular ways the function is used in public projects.

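Before the project examples below, here is a minimal sketch of the basic pattern (the file name "example.conllu" is a placeholder for your own CoNLL-U file). parse_incr takes an open file object and lazily yields one TokenList per sentence, so large treebanks can be processed without loading the whole file into memory.

from conllu import parse_incr

with open("example.conllu", "r", encoding="utf-8") as data_file:
    for tokenlist in parse_incr(data_file):
        # each sentence is a TokenList; tokens are dict-like with CoNLL-U fields
        forms = [token["form"] for token in tokenlist]
        print(tokenlist.metadata.get("sent_id"), forms)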

github EmilStenstrom / conllu / tests / test_integration.py
def test_parse_incr_invalid_file(self):
        with self.assertRaises(FileNotFoundError):
            list(parse_incr("SOME STRING DATA"))
github RasaHQ / rasa / ner-evaluation / scripts / conll-to-rasa.py
def convert(input, output):
    print ("Parsing file '{}'.".format(input))

    if os.path.exists(output):
        os.remove(output)

    data_file = open(input, "r", encoding="utf-8")

    f = open(output, "a")
    f.write("## intent:ner_examples")
    f.write("\n")
    f.close()

    for tokenlist in parse_incr(data_file, fields=CONLL_FILEDS):

        tokens = []
        entity = None
        found_entity = False

        for token in tokenlist:
            if "entity" not in token:
                token["entity"] = "O"

            # new entity found
            if token["entity"].startswith("B-") and not found_entity:
                tokens.append("[{}".format(token["form"]))
                found_entity = True
                entity = token["entity"][2:]

            # new entity directly after another entity
github allenai / allennlp / allennlp / data / dataset_readers / universal_dependencies.py
def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)

        with open(file_path, "r") as conllu_file:
            logger.info("Reading UD instances from conllu dataset at: %s", file_path)

            for annotation in parse_incr(conllu_file):
                # CoNLLU annotations sometimes add back in words that have been elided
                # in the original sentence; we remove these, as we're just predicting
                # dependencies for the original sentence.
                # We keep only tokens with integer ids here, since elided words and
                # multiword tokens are given non-integer (tuple) ids by the conllu library.
                annotation = [x for x in annotation if isinstance(x["id"], int)]

                heads = [x["head"] for x in annotation]
                tags = [x["deprel"] for x in annotation]
                words = [x["form"] for x in annotation]
                if self.use_language_specific_pos:
                    pos_tags = [x["xpostag"] for x in annotation]
                else:
                    pos_tags = [x["upostag"] for x in annotation]
                yield self.text_to_instance(words, pos_tags, list(zip(tags, heads)))
github doccano / doccano / app / api / utils.py
def parse(self, file):
        data = []
        file = io.TextIOWrapper(file, encoding='utf-8')

        # TODO: add exception handling for malformed input

        field_parsers = {
            "ne": lambda line, i: conllu.parser.parse_nullable_value(line[i]),
        }

        gen_parser = conllu.parse_incr(
            file,
            fields=("form", "ne"),
            field_parsers=field_parsers
        )

        try:
            for sentence in gen_parser:
                if not sentence:
                    continue
                if len(data) >= settings.IMPORT_BATCH_SIZE:
                    yield data
                    data = []
                words, labels = [], []
                for item in sentence:
                    word = item.get("form")
                    tag = item.get("ne")
github estnltk / estnltk / estnltk / converters / conll_importer.py
ambiguous=False
                      )

    syntax = Layer(name=syntax_layer,
                   text_object=text,
                   attributes=['id', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc'],
                   ambiguous=False
                   )
    cur = 0
    t = []

    sentence_start = 0

    with open(file, "r", encoding="utf-8") as data_file:

        for sentence in parse_incr(data_file):
            for w in sentence:
                token = w['form']
                t.append(token)
                len_w = len(token)
                base_span = ElementaryBaseSpan(cur, cur+len_w)
                words.add_annotation(base_span)

                syntax.add_annotation(base_span, **w)
                cur += len_w + 1

            sentences.add_annotation(words[sentence_start:])
            sentence_start += len(sentence)

    text.text = ' '.join(t)
    text.add_layer(words)
    text.add_layer(sentences)
github estnltk / estnltk / estnltk / converters / conll_importer.py
Returns results in a tuple.
        '''
        j = -1
        for i in range( len(sent_id_str)-1, -1, -1 ):
            if sent_id_str[i] == '_':
                j = i
                break
        return (sent_id_str[:j], sent_id_str[j:]) if j != -1 else (sent_id_str, '')
    
    cur = 0
    t = []
    sentence_start = 0
    last_fname = None
    last_sent_id = '##start##'
    with open(file, "r", encoding="utf-8") as data_file:
        for sentence in parse_incr(data_file):
            cur_sent_id = sentence.metadata.get('sent_id', None)
            if not last_sent_id == '##start##':
                # Determine if we need to create a new document
                if isinstance(last_sent_id, str) and isinstance(cur_sent_id, str):
                    # Separate fname from the sentence counter 
                    last_fname, _ = _split_into_fname_and_counter( last_sent_id )
                    cur_fname, _  = _split_into_fname_and_counter( cur_sent_id )
                    if postcorrect_sent_ids:
                        # Manually correct some broken file names
                        # (remove redundant letter 'n' from the start)
                        if last_fname in broken_fnames:
                            last_fname = last_fname[1:]
                        if cur_fname in broken_fnames:
                            cur_fname = cur_fname[1:]
                    if last_fname != cur_fname:
                        # New document needs to be created
github allenai / allennlp / allennlp / data / dataset_readers / universal_dependencies_multilang.py
def _read_one_file(self, lang: str, file_path: str):
        with open(file_path, "r") as conllu_file:
            logger.info(
                "Reading UD instances for %s language from conllu dataset at: %s", lang, file_path
            )

            for annotation in parse_incr(conllu_file):
                # CoNLLU annotations sometimes add back in words that have been elided
                # in the original sentence; we remove these, as we're just predicting
                # dependencies for the original sentence.
                # We filter by None here as elided words have a non-integer word id,
                # and are replaced with None by the conllu python library.
                annotation = [x for x in annotation if x["id"] is not None]

                heads = [x["head"] for x in annotation]
                tags = [x["deprel"] for x in annotation]
                words = [x["form"] for x in annotation]
                if self._use_language_specific_pos:
                    pos_tags = [x["xpostag"] for x in annotation]
                else:
                    pos_tags = [x["upostag"] for x in annotation]
                yield self.text_to_instance(lang, words, pos_tags, list(zip(tags, heads)))
github jxhe / cross-lingual-struct-flow / modules / conllu_data.py
if word_to_id_dict is None:
            word_to_id = defaultdict(lambda: len(word_to_id))
        else:
            word_to_id = word_to_id_dict

        text = []
        tags = []
        trees = []
        heads = []
        right_num_deps = []
        left_num_deps = []
        deps = []
        fin = open(fname, "r", encoding="utf-8")
        fin_tree = open(fname, "r", encoding="utf-8")
        data_file_tree = parse_tree_incr(fin_tree)
        data_file = parse_incr(fin)
        for sent, tree in zip(data_file, data_file_tree):
            sent_list = []
            tag_list = []
            head_list = []
            right_num_deps_ = []
            left_num_deps_ = []
            sent_n = []
            deps_list = []

            # delete multi-word token
            for token in sent:
                if isinstance(token["id"], int):
                    sent_n += [token]

            for token in sent_n:
                sent_list.append(word_to_id[token["form"]])