How to use the allennlp.data.dataset_readers.dataset_reader.DatasetReader.register function in allennlp

To help you get started, we’ve selected a few allennlp examples, based on popular ways DatasetReader.register is used in public projects.
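All of the snippets below rely on the same mechanism: DatasetReader.register("name") returns a class decorator that records the decorated subclass in AllenNLP's Registrable registry, so the reader can later be looked up, and instantiated from configuration, by name alone. A minimal sketch (the reader name "whitespace-lines" and its single field are invented for illustration):

from typing import Iterable

from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token


@DatasetReader.register("whitespace-lines")  # invented name for this sketch
class WhitespaceLinesReader(DatasetReader):
    """One whitespace-tokenized instance per line of the input file."""

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as data_file:
            for line in data_file:
                tokens = [Token(word) for word in line.strip().split()]
                text_field = TextField(tokens, {"tokens": SingleIdTokenIndexer()})
                yield Instance({"tokens": text_field})


# Registration makes the subclass retrievable by its name:
assert DatasetReader.by_name("whitespace-lines") is WhitespaceLinesReader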


github serrano-s / attn-tests / textcat / textcat_reader.py
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.common.checks import ConfigurationError
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from .sentence_tokenizer import SentenceTokenizer
from allennlp.data.tokenizers.word_filter import StopwordFilter, PassThroughWordFilter

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("textcat")
class TextCatReader(DatasetReader):
    """
    Reads tokens and their topic labels.

    Assumes that data in file_path provided to _read is tab-separated, containing (at least) the two
    fields 'tokens' and 'category', in no particular order, with each document/label on one line.
    (So this means that documents must not contain either newlines or tabs.)

    Example:

    category    tokens
    sample_label_1  This is a document. It contains a couple of sentences.
    sample_label_1  This is another document. It also contains two sentences.
    sample_label_2  This document has a different label.

    and so on.
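Registration is what lets a configuration file refer to this class as simply "textcat". A hedged from_params sketch (it assumes every constructor argument not shown above has a default; in practice the Params dict would also carry the reader's tokenizer and indexer options):

from allennlp.common import Params
from allennlp.data.dataset_readers.dataset_reader import DatasetReader

# "type" selects the registered subclass; any remaining keys are passed to
# TextCatReader's constructor.
reader = DatasetReader.from_params(Params({"type": "textcat"}))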

github mandarjoshi90 / pair2vec / relemb / data / dataset_readers / squad.py
# (the "em" and "f1" metrics use the official script).
        candidate_answers: Counter = Counter()
        token_spans = set(token_spans)
        span_fields = ListField([SpanField(start, end, passage_field)
                                 for start, end in token_spans])
    else:
        span_fields = ListField([SpanField(-1, -1, passage_field)])

    fields['spans'] = span_fields
    metadata.update(additional_metadata)
    fields['metadata'] = MetadataField(metadata)
    return Instance(fields)


@DatasetReader.register("squad2")
class Squad2Reader(DatasetReader):
    """
    Reads a JSON-formatted SQuAD file and returns a ``Dataset`` where the ``Instances`` have four
    fields: ``question``, a ``TextField``, ``passage``, another ``TextField``, and ``span_start``
    and ``span_end``, both ``IndexFields`` into the ``passage`` ``TextField``.  We also add a
    ``MetadataField`` that stores the instance's ID, the original passage text, gold answer strings,
    and token offsets into the original passage, accessible as ``metadata['id']``,
    ``metadata['original_passage']``, ``metadata['answer_texts']`` and
    ``metadata['token_offsets']``.  This is so that we can more easily use the official SQuAD
    evaluation script to get metrics.

    Parameters
    ----------
    multiparagraph : ``bool``, optional (default=``False``)
        If ``True``, uses ``util.make_multi_paragraph_reading_comprehension_instance`` to create
        a "multi-paragraph" instance (but with only one paragraph) with ``"paragraphs"`` being a

github allenai / allennlp-reading-comprehension / allennlp_rc / dataset_readers / qangaroo.py
import logging

from typing import Dict, List
from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.fields import Field, TextField, ListField, MetadataField, IndexField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Tokenizer, WordTokenizer

logger = logging.getLogger(__name__)


@DatasetReader.register("qangaroo")
class QangarooReader(DatasetReader):
    """
    Reads a JSON-formatted Qangaroo file and returns a ``Dataset`` where the ``Instances`` have six
    fields: ``candidates``, a ``ListField[TextField]``, ``query``, a ``TextField``, ``supports``, a
    ``ListField[TextField]``, ``answer``, a ``TextField``, and ``answer_index``, an ``IndexField``.
    We also add a ``MetadataField`` that stores the instance's ID and annotations if they are present.

    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
        We use this ``Tokenizer`` for both the question and the passage.  See :class:`Tokenizer`.
        Default is ``WordTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        We similarly use this for both the question and the passage.  See :class:`TokenIndexer`.
        Default is ``{"tokens": SingleIdTokenIndexer()}``.
    """

github recognai / get_started_with_deep_learning_for_text_with_allennlp / recognai / readers.py
        self._mapping = mapping

    def transform(self, field, value) -> str:
        if field == self._field:
            return self._mapping.get(value, value)
        else:
            return value

    @classmethod
    def from_params(cls, params: Params) -> 'FieldPreparator':
        field = params.pop('field', None)
        mapping = params.pop('mapping', {}).as_dict()
        return FieldPreparator(field=field, mapping=mapping)
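A hedged usage sketch for the from_params hook above (the field name and mapping values are invented, and it assumes the elided __init__ stores both field and mapping):

from allennlp.common import Params

preparator = FieldPreparator.from_params(Params({
    'field': 'gold_label',
    'mapping': {'0': 'negative', '1': 'positive'},  # invented label mapping
}))
assert preparator.transform('gold_label', '0') == 'negative'
assert preparator.transform('other_field', '0') == '0'  # non-matching field passes through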


@DatasetReader.register("jsonl_classification_reader")
class JsonlClassificationReader(DatasetReader):
    """
    Reads a file from a classification dataset.  The data is
    formatted as jsonl, one json-formatted instance per line.  The keys in the data are
    "gold_label" and "input", both configurable in the JSON definition.

    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
        See :class:`Tokenizer`.
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        See :class:`TokenIndexer`.
    """

    def __init__(self,
                 input: str,

github allenai / allennlp / allennlp / data / dataset_readers / reading_comprehension / qangaroo.py
import logging

from typing import Dict, List
from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.fields import Field, TextField, ListField, MetadataField, IndexField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer

logger = logging.getLogger(__name__)


@DatasetReader.register("qangaroo")
class QangarooReader(DatasetReader):
    """
    Reads a JSON-formatted Qangaroo file and returns a ``Dataset`` where the ``Instances`` have six
    fields: ``candidates``, a ``ListField[TextField]``, ``query``, a ``TextField``, ``supports``, a
    ``ListField[TextField]``, ``answer``, a ``TextField``, and ``answer_index``, an ``IndexField``.
    We also add a ``MetadataField`` that stores the instance's ID and annotations if they are present.

    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``SpacyTokenizer()``)
        We use this ``Tokenizer`` for both the question and the passage.  See :class:`Tokenizer`.
        Default is ``SpacyTokenizer()``.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        We similarly use this for both the question and the passage.  See :class:`TokenIndexer`.
        Default is ``{"tokens": SingleIdTokenIndexer()}``.
    """

github plasticityai / magnitude / pymagnitude / third_party / allennlp / data / dataset_readers / universal_dependencies.py
fields[u"words"] = tokens
        fields[u"pos_tags"] = SequenceLabelField(upos_tags, tokens, label_namespace=u"pos")
        if dependencies is not None:
            # We don't want to expand the label namespace with an additional dummy token, so we'll
            # always give the 'ROOT_HEAD' token a label of 'root'.
            fields[u"head_tags"] = SequenceLabelField([x[0] for x in dependencies],
                                                     tokens,
                                                     label_namespace=u"head_tags")
            fields[u"head_indices"] = SequenceLabelField([int(x[1]) for x in dependencies],
                                                        tokens,
                                                        label_namespace=u"head_index_tags")

        fields[u"metadata"] = MetadataField({u"words": words, u"pos": upos_tags})
        return Instance(fields)

UniversalDependenciesDatasetReader = DatasetReader.register(u"universal_dependencies")(UniversalDependenciesDatasetReader)
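This vendored file is a Python-2-compatible backport (note the u"..." literals), which presumably explains why register is applied as a plain call rather than with decorator syntax. The two spellings are interchangeable, since register(name) returns a decorator: a callable that records the class in the registry and returns it unchanged. A sketch with invented names:

from allennlp.data.dataset_readers.dataset_reader import DatasetReader


@DatasetReader.register("reader-a")  # decorator form
class ReaderA(DatasetReader):
    pass


class ReaderB(DatasetReader):
    pass

ReaderB = DatasetReader.register("reader-b")(ReaderB)  # equivalent plain call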

github plasticityai / magnitude / pymagnitude / third_party / allennlp / data / dataset_readers / ccgbank.py
Predicate-argument categories (only if supplied)
        """
        # pylint: disable=arguments-differ
        text_field = TextField([Token(x) for x in tokens], token_indexers=self._token_indexers)
        fields = {u"tokens": text_field}

        for field_name, labels in ((u'ccg_categories', ccg_categories),
                                   (u'original_pos_tags', original_pos_tags),
                                   (u'modified_pos_tags', modified_pos_tags),
                                   (u'predicate_arg_categories', predicate_arg_categories)):
            if labels is not None:
                fields[field_name] = SequenceLabelField(labels, text_field)

        return Instance(fields)

CcgBankDatasetReader = DatasetReader.register(u"ccgbank")(CcgBankDatasetReader)
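As with the universal_dependencies reader above, this Python-2 backport registers the class with the plain-call form rather than decorator syntax.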

github dwadden / dygiepp / dygie / data / dataset_readers / dygie.py
from allennlp.data.instance import Instance
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.dataset_readers.dataset_utils import enumerate_spans

from dygie.data.fields.adjacency_field_assym import AdjacencyFieldAssym
from dygie.data.dataset_readers.document import Document, Sentence

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


class DyGIEDataException(Exception):
    pass


@DatasetReader.register("dygie")
class DyGIEReader(DatasetReader):
    """
    Reads a single JSON-formatted file. This is the same file format as used in
    SciERC, but is preprocessed.
    """
    def __init__(self,
                 max_span_width: int,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._max_span_width = max_span_width
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
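The trailing comment marks where this excerpt cuts off; judging from the cached_path imports in the other readers above, the elided next line is presumably the standard AllenNLP idiom for resolving remote files. A sketch of that idiom:

from allennlp.common.file_utils import cached_path

def resolve_path(file_path: str) -> str:
    # Downloads and caches the file when file_path is a URL; a local path
    # is returned unchanged.
    return cached_path(file_path)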

github deepmipt / Allen_HCN / allen_hcn / babi_reader.py
from allennlp.data.dataset import Dataset
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.fields import TextField, IndexField, ListField
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.common.checks import ConfigurationError
from allennlp.common import Params

from allen_hcn.actions import HCNActionTracker
from allen_hcn.entities import HCNEntityTracker
import allen_hcn.util as util

logger = logging.getLogger(__name__)


@DatasetReader.register("babi")
class BabiDatasetReader(DatasetReader):
    """
    Read a tsv file containing paired sequences, and create a dataset suitable for a
    ``HybridCodeNetwork`` model.

    Expected format for each input line is
    ``<source_sequence_string>\t<target_sequence_string>``.

    The output of ``read`` is a list of ``Instance``s with the fields:
        source_tokens: ``TextField`` and
        target_tokens: ``TextField``
    """

    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None):
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

github allenai / allennlp / allennlp / data / dataset_readers / semantic_parsing / quarel.py
from allennlp.data.tokenizers.word_stemmer import PorterStemmer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.fields import ArrayField, Field, TextField, KnowledgeGraphField, LabelField
from allennlp.data.fields import IndexField, ListField, MetadataField, ProductionRuleField
from allennlp.data.fields import SequenceLabelField
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.semparse.contexts.knowledge_graph import KnowledgeGraph
from allennlp.semparse.contexts.quarel_utils import WorldTaggerExtractor, words_from_entity_string
from allennlp.semparse.contexts.quarel_utils import LEXICAL_CUES, align_entities
from allennlp.semparse.worlds.quarel_world import QuarelWorld


logger = logging.getLogger(__name__)


@DatasetReader.register("quarel")
class QuarelDatasetReader(DatasetReader):
    """
    Parameters
    ----------
    lazy : ``bool`` (optional, default=False)
        Passed to ``DatasetReader``.  If this is ``True``, training will start sooner, but will
        take longer per batch.
    replace_world_entities : ``bool`` (optional, default=False)
        Replace world entities (with stemming) with "worldone" and "worldtwo" directly in the question.
    world_extraction_model : ``str`` (optional, default=None)
        Reference (file or URL) to the world tagger model used to extract worlds.
    align_world_extractions : ``bool`` (optional, default=False)
        Use alignment of extracted worlds with gold worlds, to pick the appropriate gold LF.
    gold_world_extractions : ``bool`` (optional, default=False)
        Use gold worlds rather than the world extractor.
    tagger_only : ``bool`` (optional, default=False)