How to use the allennlp.data.instance.Instance class in allennlp

To help you get started, we’ve selected a few allennlp examples based on popular ways Instance is used in public projects.
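At its core, an Instance is just a dictionary of named fields: you build a Dict[str, Field] and pass it to the constructor. Here is a minimal, self-contained sketch of building one by hand; the field names and example text are illustrative, not taken from the projects below.

from allennlp.data import Instance
from allennlp.data.fields import LabelField, TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

# Wrap the tokens in a TextField and the gold label in a LabelField,
# then group both into a single Instance.
tokens = [Token(t) for t in "the movie was great".split()]
fields = {
    "tokens": TextField(tokens, {"tokens": SingleIdTokenIndexer()}),
    "label": LabelField("positive"),
}
instance = Instance(fields)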


github rloganiv/kglm-model/kglm/data/dataset_readers/conll2012.py (view on GitHub)
        if cluster_dict:
            for cluster, entity_id in cluster_dict.items():
                # Fill in "1" for positions corresponding to words in entities
                # Need offset by 1 to account for @@START@@ token.
                entity_types[cluster[0] + 1:cluster[1] + 1 + 1] = 1
                # Fill in entity ID
                entity_ids[cluster[0] + 1:cluster[1] + 1 + 1] = entity_id
                entity_length = (cluster[1] + 1) - cluster[0]
                # Fill in mention length
                mention_lengths[cluster[0] + 1:cluster[1] + 1 + 1] = np.arange(entity_length, 0, step=-1)

        fields['entity_ids'] = SequentialArrayField(entity_ids, dtype=np.int64)
        fields['mention_lengths'] = SequentialArrayField(mention_lengths, dtype=np.int64)
        fields['entity_types'] = SequentialArrayField(entity_types, dtype=np.uint8)
        return Instance(fields)
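The pattern above is typical for span annotations: the reader pre-allocates one array per annotation type over the token sequence, writes values into the slice covered by each entity cluster (shifted by one to account for the prepended @@START@@ token), and wraps each finished array in a field before constructing the Instance.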
github allenai/allennlp/allennlp/data/dataset_readers/semantic_parsing/nlvr.py (view on GitHub)
            # TODO(pradeep): Assuming every world gives the same agenda for a sentence. This is true
            # now, but may change later too.
            agenda = worlds[0].get_agenda_for_sentence(sentence)
            assert agenda, "No agenda found for sentence: %s" % sentence
            # agenda_field contains indices into actions.
            agenda_field = ListField(
                [IndexField(instance_action_ids[action], action_field) for action in agenda]
            )
            fields["agenda"] = agenda_field
        if labels:
            labels_field = ListField(
                [LabelField(label, label_namespace="denotations") for label in labels]
            )
            fields["labels"] = labels_field

        return Instance(fields)
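Both the agenda and the labels are variable-length, so each is wrapped in a ListField; the agenda entries are IndexFields that point back into action_field, so they remain meaningful after the action sequences are indexed and padded.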
github plasticityai/magnitude/pymagnitude/third_party/allennlp/data/dataset_readers/language_modeling.py (view on GitHub)
        if self._tokens_per_instance is not None:
            all_text = u" ".join([x.replace(u"\n", u" ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info(u"Creating dataset from all text in file: %s", file_path)
            for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
        else:
            tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1], self._token_indexers)
            output_field = TextField(tokenized_string[1:], self._output_indexer)
            yield Instance({u'input_tokens': input_field,
                            u'output_tokens': output_field})
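The input and output fields are the same token sequence shifted by one position, the standard setup for next-token language modeling: the model reads tokenized_string[:-1] and is trained to predict tokenized_string[1:]. Stepping the window by num_tokens - 1 makes consecutive chunks overlap by exactly one token, so no prediction target is lost at chunk boundaries.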
github allenai/ontoemma/emma/allennlp_classes/ontoemma_dataset_reader.py (view on GitHub)
        s_contexts = sample_n(s_ent['other_contexts'], 20)
        t_contexts = sample_n(t_ent['other_contexts'], 20)

        fields['s_ent_context'] = ListField(
            [TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
             for c in s_contexts]
        ) if s_contexts else self._empty_list_token_text_field
        fields['t_ent_context'] = ListField(
            [TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
             for c in t_contexts]
        ) if t_contexts else self._empty_list_token_text_field

        # add boolean label (0 = no match, 1 = match)
        fields['label'] = BooleanField(label)

        return Instance(fields)
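Note the fallback to a pre-built placeholder field when an entity has no contexts: a ListField needs at least one element to know what kind of field it holds for padding, so the reader cannot simply construct an empty ListField here.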
github recognai/get_started_with_deep_learning_for_text_with_allennlp/recognai/readers.py (view on GitHub)
    def text_to_instance(self,  # type: ignore
                         input_text: str,
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        if self._field_preparator:
            input_text = self._field_preparator.transform(self._input, input_text)
        input_tokens = self._tokenizer.tokenize(input_text)
        fields['tokens'] = TextField(input_tokens, self._token_indexers)
        if label:
            if self._field_preparator:
                label = self._field_preparator.transform(self._gold_label, label)
            fields['label'] = LabelField(label)
        return Instance(fields)
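Because label is optional, the same reader serves both training and prediction. A hypothetical call, where reader is an instantiated dataset reader:

train_instance = reader.text_to_instance("great film", label="positive")
test_instance = reader.text_to_instance("great film")  # no label at inference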
github mhagiwara/realworldnlp/examples/generation/seqgan.py (view on GitHub)
def tokens_to_lm_instance(tokens: List[Token],
                          token_indexers: Dict[str, TokenIndexer]) -> Instance:
    tokens = list(tokens)   # shallow copy
    tokens.insert(0, Token(START_SYMBOL))
    tokens.append(Token(END_SYMBOL))

    input_field = TextField(tokens[:-1], token_indexers)
    output_field = TextField(tokens[1:], token_indexers)
    return Instance({'input_tokens': input_field,
                     'output_tokens': output_field})
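A sketch of calling this helper, assuming START_SYMBOL and END_SYMBOL carry AllenNLP's usual values (@start@ and @end@):

from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

tokens = [Token(t) for t in "the cat sat".split()]
instance = tokens_to_lm_instance(tokens, {"tokens": SingleIdTokenIndexer()})
# input_tokens:  @start@ the cat sat
# output_tokens: the cat sat @end@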
github dwadden/dygiepp/dygie/data/iterators/multitask_iterator.py (view on GitHub)
    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        # Shuffle the documents if requested.
        ace_instances = self._shuffle_documents(get_dataset_instances(instances, "ace"))
        ontonotes_instances = self._shuffle_documents(get_dataset_instances(instances, "ontonotes"))
        n_ontonotes = math.floor(len(ace_instances) / 2)
        ontonotes_instances = ontonotes_instances[:n_ontonotes]

        all_instances = self._shuffle_documents(ace_instances + ontonotes_instances)
        hoppers: Dict[Any, List[Instance]] = defaultdict(list)

        for instance in all_instances:
            # Which hopper do we put this instance in?
            instance_type = (instance["metadata"]["dataset"]
                             if "dataset" in instance["metadata"]
                             else None)

            hoppers[instance_type].append(instance)

            # If the hopper is full, yield up the batch and clear it.
            if len(hoppers[instance_type]) >= self._batch_size:
                yield Batch(hoppers[instance_type])
                hoppers[instance_type].clear()

        # Deal with leftovers: yield any batches that are still partially filled.
        for remaining in hoppers.values():
            if remaining:
                yield Batch(remaining)
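Keying the hoppers on the dataset name guarantees that every yielded Batch is homogeneous: ace and ontonotes instances never mix within a batch, so downstream code can dispatch on a single dataset per batch.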
github plasticityai/magnitude/pymagnitude/third_party/allennlp/data/dataset_readers/stanford_sentiment_tree_bank.py (view on GitHub)
            if self._granularity == u"3-class":
                if int(sentiment) < 2:
                    sentiment = u"0"
                elif int(sentiment) == 2:
                    sentiment = u"1"
                else:
                    sentiment = u"2"
            elif self._granularity == u"2-class":
                if int(sentiment) < 2:
                    sentiment = u"0"
                elif int(sentiment) == 2:
                    return None
                else:
                    sentiment = u"1"
            fields[u'label'] = LabelField(sentiment)
        return Instance(fields)
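This collapses the original five SST sentiment labels into a 3-class (negative/neutral/positive) or 2-class (negative/positive) scheme. In the 2-class setting, neutral sentences (sentiment == 2) are skipped by returning None, which the reader's _read loop is expected to filter out.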
github plkmo/NLP_Toolkit/nlptoolkit/gec/models/gector/datareader.py (view on GitHub)
                return None
            rnd = random()
            # skip TN
            if self._skip_correct and all(x == "CORRECT" for x in detect_tags):
                if rnd > self._tn_prob:
                    return None
            # skip TP
            else:
                if rnd > self._tp_prob:
                    return None

            fields["labels"] = SequenceLabelField(labels, sequence,
                                                  label_namespace="labels")
            fields["d_tags"] = SequenceLabelField(detect_tags, sequence,
                                                  label_namespace="d_tags")
        return Instance(fields)
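The single rnd draw implements downsampling: fully correct sentences (true negatives) survive with probability _tn_prob, while sentences containing edits survive with probability _tp_prob, letting the reader rebalance the training mix without a separate preprocessing pass.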
github allenai/ontoemma/emma/allennlp_classes/ontoemma_dataset_reader.py (view on GitHub)
        s_contexts = sample_n(s_ent['other_contexts'], 16, 256)
        t_contexts = sample_n(t_ent['other_contexts'], 16, 256)

        fields['s_ent_context'] = ListField(
            [TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
             for c in s_contexts]
        )
        fields['t_ent_context'] = ListField(
            [TextField(self._tokenizer.tokenize(c), self._token_only_indexer)
             for c in t_contexts]
        )

        # add boolean label (0 = no match, 1 = match)
        fields['label'] = BooleanField(label)

        return Instance(fields)
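Whatever the reader, the endpoint is the same: a dict of fields wrapped in an Instance. Before training, instances are indexed against a Vocabulary and collated into tensors. A minimal sketch, reusing the instance built in the first example above and assuming a recent AllenNLP where Batch is exported from allennlp.data:

from allennlp.data import Batch, Vocabulary

# Build a vocabulary from the instances, then convert a batch to tensors.
vocab = Vocabulary.from_instances([instance])
batch = Batch([instance])
batch.index_instances(vocab)
tensors = batch.as_tensor_dict(batch.get_padding_lengths())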