How to use the allennlp.data.fields.TextField class in allennlp

To help you get started, we’ve selected a few TextField examples based on popular ways it is used in public projects.

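Each of the project snippets below follows the same basic pattern: tokenize some text, wrap the resulting tokens in a TextField together with a dict of token indexers, and collect the fields into an Instance. Here is a minimal, self-contained sketch of that pattern; the sentence, field name, and choice of SingleIdTokenIndexer are illustrative rather than taken from any of the projects, and the exact nesting of the resulting tensor dict varies across allennlp versions.

from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary

# Illustrative indexer: map each token to a single id in the "tokens" namespace.
token_indexers = {"tokens": SingleIdTokenIndexer()}

# A TextField takes a list of Token objects plus the indexer dict.
tokens = [Token(t) for t in "the quick brown fox".split()]
text_field = TextField(tokens, token_indexers)

# Fields are grouped into an Instance under the names a model will look up.
instance = Instance({"tokens": text_field})

# The instance must be indexed against a vocabulary before it can become tensors.
vocab = Vocabulary.from_instances([instance])
instance.index_fields(vocab)
print(instance.as_tensor_dict())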

github vered1986 / NC_embeddings / source / evaluation / compute_any_vector.py
def text_to_instance(self, nc: str) -> Instance:
        tokenized_nc = self._tokenizer.tokenize(nc)
        nc_field = TextField(tokenized_nc, self._token_indexers)
        w1_field, w2_field, nc_seq_field = nc_field, nc_field, nc_field

        constituents = nc.split('_')
        if len(constituents) == 2:
            w1, w2 = constituents
            tokenized_w1 = self._tokenizer.tokenize(w1)
            w1_field = TextField(tokenized_w1, self._token_indexers)
            tokenized_w2 = self._tokenizer.tokenize(w2)
            w2_field = TextField(tokenized_w2, self._token_indexers)
            tokenized_nc_seq = self._tokenizer.tokenize(' '.join((w1, w2)))
            nc_seq_field = TextField(tokenized_nc_seq, self._token_indexers)

        fields = {'nc': nc_field, 'w1': w1_field, 'w2': w2_field, 'nc_seq': nc_seq_field}
        return Instance(fields)
github nafitzgerald / nrl-qasrl / nrl / data / dataset_readers / qasrl_reader.py
def _make_instance_from_text(self, sent_tokens, pred_index, annotations = None, sent_id = None):
        instance_dict = {}

        if isinstance(sent_tokens, str):
            sent_tokens = sent_tokens.split()
        sent_tokens = cleanse_sentence_text(sent_tokens)
        text_field = TextField([Token(t) for t in sent_tokens], self._token_indexers)
        instance_dict['text'] = text_field
        instance_dict['predicate_indicator'] = SequenceLabelField([1 if i == pred_index else 0 for i in range(len(sent_tokens))], text_field)

        if annotations is not None:
            for i, slot_name in enumerate(self._slot_labels):
                span_slot = ListField([LabelField(ann.slots[i], label_namespace="slot_%s"%slot_name) for ann in annotations for span in ann.all_spans])
                instance_dict['span_slot_%s'%slot_name] = span_slot

            labeled_span_field = ListField([SpanField(span.start(), span.end(), text_field) for ann in annotations for span in ann.all_spans])
            instance_dict['labeled_spans'] = labeled_span_field

            if self._bio_labels:
                bio_labels = ["O"] * len(sent_tokens)

                bio_labels[pred_index] = "B-V"
github allenai / allennlp / allennlp / data / dataset_readers / ontonotes_ner.py
def text_to_instance(
        self,  # type: ignore
        tokens: List[Token],
        ner_tags: List[str] = None,
    ) -> Instance:
        """
        We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
        """

        sequence = TextField(tokens, self._token_indexers)
        instance_fields: Dict[str, Field] = {"tokens": sequence}
        instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens]})
        # Add "tag label" to instance
        if ner_tags is not None:
            if self._coding_scheme == "BIOUL":
                ner_tags = to_bioul(ner_tags, encoding="BIO")
            instance_fields["tags"] = SequenceLabelField(ner_tags, sequence)
        return Instance(instance_fields)
github vered1986 / NC_embeddings / source / training / paraphrase_based / nc_paraphrases_dataset_reader.py
def text_to_instance(self, nc: str, paraphrase: str = None, neg_paraphrase: str = None) -> Instance:
        tokenized_nc = self._tokenizer.tokenize(nc)
        nc_field = TextField(tokenized_nc, self._token_indexers)

        # Remove non-binary NCs to make it comparable to the other composition functions
        if nc_field.sequence_length() != 2:
            return None

        fields = {'nc': nc_field}

        # During training, we minimize the distance to the paraphrase
        if paraphrase is not None:
            tokenized_paraphrase = self._tokenizer.tokenize(paraphrase)
            paraphrase_field = TextField(tokenized_paraphrase, self._token_indexers)
            fields['paraphrase'] = paraphrase_field

            # Negative sampled paraphrase to move away from
            tokenized_neg_paraphrase = self._tokenizer.tokenize(neg_paraphrase)
            neg_paraphrase_field = TextField(tokenized_neg_paraphrase, self._token_indexers)
            fields['neg_paraphrase'] = neg_paraphrase_field

        return Instance(fields)
github DreamerDeo / HIT-SCIR-CoNLL2019 / utils / transition_amr_reader.py
def text_to_instance(self,  # type: ignore
                         tokens: List[str],
                         lemmas: List[str] = None,
                         pos_tags: List[str] = None,
                         gold_actions: List[List[str]] = None,
                         id: str = None,
                         amr: str = None,
                         input: str = None,
                         mrp: str = None,
                         companion: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        token_field = TextField([Token(t) for t in tokens], self._token_indexers)
        fields["tokens"] = token_field
        meta_dict = {"tokens": tokens}
        if id:
            meta_dict["id"] = id
        if amr:
            meta_dict["amr"] = amr
        if input:
            meta_dict["input"] = input
        if mrp:
            meta_dict["mrp"] = json.loads(mrp)
        if companion:
            meta_dict["companion"] = json.loads(companion)

        if lemmas is not None and self._lemma_indexers is not None:
            fields["lemmas"] = TextField([Token(l) for l in lemmas], self._lemma_indexers)
        if pos_tags is not None:
github allenai / allennlp-reading-comprehension / allennlp_rc / dataset_readers / qangaroo.py
def text_to_instance(
        self,  # type: ignore
        candidates: List[str],
        query: str,
        supports: List[str],
        _id: str = None,
        answer: str = None,
        annotations: List[List[str]] = None,
    ) -> Instance:

        fields: Dict[str, Field] = {}

        candidates_field = ListField(
            [
                TextField(candidate, self._token_indexers)
                for candidate in self._tokenizer.batch_tokenize(candidates)
            ]
        )

        fields["query"] = TextField(self._tokenizer.tokenize(query), self._token_indexers)

        fields["supports"] = ListField(
            [
                TextField(support, self._token_indexers)
                for support in self._tokenizer.batch_tokenize(supports)
            ]
        )

        fields["answer"] = TextField(self._tokenizer.tokenize(answer), self._token_indexers)

        fields["answer_index"] = IndexField(candidates.index(answer), candidates_field)
github allenai / allennlp / scripts / write_elmo_representations_to_file.py
def batch_to_ids(batch):
    """
    Given a batch (as list of tokenized sentences), return a batch
    of padded character ids.
    """
    instances = []
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
github vered1986 / NC_embeddings / source / evaluation / compute_any_vector.py
def text_to_instance(self, nc: str) -> Instance:
        nc = nc.replace('_', ' ')
        tokenized_nc = self._tokenizer.tokenize(nc)
        nc_field = TextField(tokenized_nc, self._token_indexers)
        fields = {'nc': nc_field}
        return Instance(fields)
github mhagiwara / realworldnlp / examples / generation / seqgan.py
def tokens_to_lm_instance(tokens: List[Token],
                          token_indexers: Dict[str, TokenIndexer]):
    tokens = list(tokens)   # shallow copy
    tokens.insert(0, Token(START_SYMBOL))
    tokens.append(Token(END_SYMBOL))

    input_field = TextField(tokens[:-1], token_indexers)
    output_field = TextField(tokens[1:], token_indexers)
    return Instance({'input_tokens': input_field,
                     'output_tokens': output_field})
github plasticityai / magnitude / pymagnitude / third_party / allennlp / data / dataset_readers / language_modeling.py
            instance_strings = text_file.readlines()

        if self._tokens_per_instance is not None:
            all_text = u" ".join([x.replace(u"\n", u" ").strip() for x in instance_strings])
            tokenized_text = self._tokenizer.tokenize(all_text)
            num_tokens = self._tokens_per_instance + 1
            tokenized_strings = []
            logger.info(u"Creating dataset from all text in file: %s", file_path)
            for index in Tqdm.tqdm(range(0, len(tokenized_text) - num_tokens, num_tokens - 1)):
                tokenized_strings.append(tokenized_text[index:(index + num_tokens)])
        else:
            tokenized_strings = [self._tokenizer.tokenize(s) for s in instance_strings]

        for tokenized_string in tokenized_strings:
            input_field = TextField(tokenized_string[:-1], self._token_indexers)
            output_field = TextField(tokenized_string[1:], self._output_indexer)
            yield Instance({u'input_tokens': input_field,
                            u'output_tokens': output_field})