How to use allennlp - 10 common examples

To help you get started, we’ve selected a few allennlp examples, based on popular ways it is used in public projects.

github serrano-s / attn-tests / train_model.py View on Github
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))
    vocab = Vocabulary.from_params(
            params.pop("vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             for instance in dataset
             if key in datasets_for_vocab_creation)
    )

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    model = transfer_prev_model_weights_to_new_model(prev_best_model, model)

    # Initializing the model can have side effect of expanding the vocabulary
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))

    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(validation_iterator_params)
        validation_iterator.index_with(vocab)
    else:
        validation_iterator = None

    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
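
The excerpt stops just before training starts. A minimal sketch of the usual continuation, assuming the AllenNLP 0.x training API and the variables built above (the "trainer" config key and the argument order are assumptions, not the repository's code):

from allennlp.training.trainer import Trainer

# Hand the assembled model, iterator and datasets to a Trainer built from the
# remaining config params, then run training.
trainer = Trainer.from_params(model,
                              serialization_dir,
                              iterator,
                              train_data,
                              validation_data,
                              params.pop("trainer"),
                              validation_iterator=validation_iterator)
metrics = trainer.train()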
github serrano-s / attn-tests / textcat / textcat_reader.py View on Github
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ListField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.common.checks import ConfigurationError
from allennlp.data.tokenizers import Tokenizer, WordTokenizer
from .sentence_tokenizer import SentenceTokenizer
from allennlp.data.tokenizers.word_filter import StopwordFilter, PassThroughWordFilter

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("textcat")
class TextCatReader(DatasetReader):
    """
    Reads tokens and their topic labels.

    Assumes that data in file_path provided to _read is tab-separated, containing (at least) the two
    fields 'tokens' and 'category', in no particular order, with each document/label on one line.
    (So this means that documents must not contain either newlines or tabs.)

    Example:

    category    tokens
    sample_label_1  This is a document. It contains a couple of sentences.
    sample_label_1  This is another document. It also contains two sentences.
    sample_label_2  This document has a different label.

    and so on.
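
Because the class is registered under "textcat", it can be built straight from a config block. A minimal usage sketch, assuming the reader's remaining constructor arguments have defaults and using a placeholder path:

from allennlp.common.params import Params
from allennlp.data.dataset_readers import DatasetReader

reader = DatasetReader.from_params(Params({"type": "textcat"}))
instances = reader.read("/path/to/train.tsv")   # yields one Instance per document line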
github nyu-mll / GLUE-baselines / src / util.py View on Github
def _get_combination(combination: str, tensors: List[torch.Tensor]) -> torch.Tensor:
    if combination.isdigit():
        index = int(combination) - 1
        return tensors[index]
    else:
        if len(combination) != 3:
            raise ConfigurationError("Invalid combination: " + combination)
        first_tensor = _get_combination(combination[0], tensors)
        second_tensor = _get_combination(combination[2], tensors)
        operation = combination[1]
        if operation == '*':
            return first_tensor * second_tensor
        elif operation == '/':
            return first_tensor / second_tensor
        elif operation == '+':
            return first_tensor + second_tensor
        elif operation == '-':
            return first_tensor - second_tensor
        else:
            raise ConfigurationError("Invalid operation: " + operation)
github vered1986 / NC_embeddings / source / evaluation / compute_any_vector.py View on Github
    def text_to_instance(self, nc: str) -> Instance:
        tokenized_nc = self._tokenizer.tokenize(nc)
        nc_field = TextField(tokenized_nc, self._token_indexers)
        w1_field, w2_field, nc_seq_field = nc_field, nc_field, nc_field

        constituents = nc.split('_')
        if len(constituents) == 2:
            w1, w2 = constituents
            tokenized_w1 = self._tokenizer.tokenize(w1)
            w1_field = TextField(tokenized_w1, self._token_indexers)
            tokenized_w2 = self._tokenizer.tokenize(w2)
            w2_field = TextField(tokenized_w2, self._token_indexers)
            tokenized_nc_seq = self._tokenizer.tokenize(' '.join((w1, w2)))
            nc_seq_field = TextField(tokenized_nc_seq, self._token_indexers)

        fields = {'nc': nc_field, 'w1': w1_field, 'w2': w2_field, 'nc_seq': nc_seq_field}
        return Instance(fields)
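
A hypothetical call to the method above: a two-word compound joined by an underscore gets separate fields for each constituent and for the space-separated sequence, while anything else falls back to the full compound in all four fields.

instance = reader.text_to_instance("olive_oil")   # assumes `reader` is an instance of the class above
print(list(instance.fields))                      # ['nc', 'w1', 'w2', 'nc_seq']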
github nafitzgerald / nrl-qasrl / nrl / data / dataset_readers / qasrl_reader.py View on Github
    def _make_instance_from_text(self, sent_tokens, pred_index, annotations = None, sent_id = None):
        instance_dict = {}

        if isinstance(sent_tokens, str):
            sent_tokens = sent_tokens.split()
        sent_tokens = cleanse_sentence_text(sent_tokens)
        text_field = TextField([Token(t) for t in sent_tokens], self._token_indexers)
        instance_dict['text'] = text_field
        instance_dict['predicate_indicator'] = SequenceLabelField([1 if i == pred_index else 0 for i in range(len(sent_tokens))], text_field)

        if annotations is not None:
            for i, slot_name in enumerate(self._slot_labels):
                span_slot = ListField([LabelField(ann.slots[i], label_namespace="slot_%s"%slot_name) for ann in annotations for span in ann.all_spans])
                instance_dict['span_slot_%s'%slot_name] = span_slot

            labeled_span_field = ListField([SpanField(span.start(), span.end(), text_field) for ann in annotations for span in ann.all_spans])
            instance_dict['labeled_spans'] = labeled_span_field

            if self._bio_labels:
                bio_labels = ["O"] * len(sent_tokens)

                bio_labels[pred_index] = "B-V"
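
The excerpt is cut off while the BIO tags are being filled in. A plausible continuation, sketched here rather than taken from the repository (the 'bio_labels' key is a made-up name), would wrap the finished tag list in a SequenceLabelField aligned with the text field:

from allennlp.data.fields import SequenceLabelField

instance_dict['bio_labels'] = SequenceLabelField(bio_labels, text_field)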
github rloganiv / kglm-model / kglm / models / kglm_disc.py View on Github
    def _forward_loop(self,
                      source: Dict[str, torch.Tensor],
                      alias_database: AliasDatabase,
                      mention_type: torch.Tensor,
                      raw_entity_ids: Dict[str, torch.Tensor],
                      entity_ids: Dict[str, torch.Tensor],
                      parent_ids: Dict[str, torch.Tensor],
                      relations: Dict[str, torch.Tensor],
                      shortlist: Dict[str, torch.Tensor],
                      shortlist_inds: torch.Tensor) -> Dict[str, torch.Tensor]:
        # Get the token mask and extract indexed text fields.
        # shape: (batch_size, sequence_length)
        target_mask = get_text_field_mask(source)
        source = source['tokens']
        raw_entity_ids = raw_entity_ids['raw_entity_ids']
        entity_ids = entity_ids['entity_ids']
        parent_ids = parent_ids['entity_ids']
        relations = relations['relations']

        logger.debug('Source & Target shape: %s', source.shape)
        logger.debug('Entity ids shape: %s', entity_ids.shape)
        logger.debug('Relations & Parent ids shape: %s', relations.shape)
        logger.debug('Shortlist shape: %s', shortlist['entity_ids'].shape)
        # Embed source tokens.
        # shape: (batch_size, sequence_length, embedding_dim)
        encoded, alpha_loss, beta_loss = self._encode_source(source)
        splits = [self.token_embedding_dim] + [self.entity_embedding_dim] * 2
        encoded_token, encoded_head, encoded_relation = encoded.split(splits, dim=-1)
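
A toy illustration of the split on the last line, with made-up sizes: a tensor whose final dimension is token_embedding_dim + 2 * entity_embedding_dim is cut into token, head and relation encodings along dim=-1.

import torch

token_dim, entity_dim = 4, 3
encoded = torch.randn(2, 5, token_dim + 2 * entity_dim)
tok, head, rel = encoded.split([token_dim, entity_dim, entity_dim], dim=-1)
print(tok.shape, head.shape, rel.shape)   # torch.Size([2, 5, 4]) torch.Size([2, 5, 3]) torch.Size([2, 5, 3])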
github allenai / allennlp-semparse / allennlp_semparse / dataset_readers / grammar_based_text2sql.py View on Github
                # (the excerpt starts inside a conditional that sets action_sequence)
                action_sequence = []
            elif action_sequence is None:
                return None

        index_fields: List[Field] = []
        production_rule_fields: List[Field] = []

        for production_rule in all_actions:
            nonterminal, _ = production_rule.split(" ->")
            production_rule = " ".join(production_rule.split(" "))
            field = ProductionRuleField(
                production_rule, self._world.is_global_rule(nonterminal), nonterminal=nonterminal
            )
            production_rule_fields.append(field)

        valid_actions_field = ListField(production_rule_fields)
        fields["valid_actions"] = valid_actions_field

        action_map = {
            action.rule: i  # type: ignore
            for i, action in enumerate(valid_actions_field.field_list)
        }

        for production_rule in action_sequence:
            index_fields.append(IndexField(action_map[production_rule], valid_actions_field))
        if not action_sequence:
            index_fields = [IndexField(-1, valid_actions_field)]

        action_sequence_field = ListField(index_fields)
        fields["action_sequence"] = action_sequence_field
        return Instance(fields)
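
The indexing pattern above, reduced to plain Python with made-up rule strings: every grammar rule becomes one entry in a list-backed field, and the gold action sequence is stored as indices into that list rather than as strings.

all_rules = ["statement -> [query]", "query -> [select_core]"]
rule_to_index = {rule: i for i, rule in enumerate(all_rules)}
gold_sequence = ["statement -> [query]", "query -> [select_core]"]
gold_indices = [rule_to_index[rule] for rule in gold_sequence]   # [0, 1]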
github titipata / detecting-scientific-claim / scripts / predict_claim_feature_concat.py View on Github
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.decomposition import TruncatedSVD

EMBEDDING_DIM = 200
MEDLINE_WORD_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/medline_word_prob.json'
DISCOURSE_MODEL_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/model.tar.gz'
PUBMED_PRETRAINED_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/wikipedia-pubmed-and-PMC-w2v.txt.gz'
TRAIN_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/train_labels.json'
VALIDATION_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/validation_labels.json'
TEST_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/test_labels.json'

archive = load_archive(DISCOURSE_MODEL_PATH) # discourse model
predictor = Predictor.from_archive(archive, 'discourse_predictor')
assert os.path.exists('wiki.en.bin') == True
ft_model = load_model('wiki.en.bin') # fastText word vector
p_dict = json.load(open(cached_path(MEDLINE_WORD_PATH), 'r'))


def read_embedding(pretrained_path=PUBMED_PRETRAINED_PATH):
    """
    Read Pubmed Pretrained embedding from Amazon S3 and 
    return dictionary of embeddings
    """
    embeddings = {}
    with EmbeddingsTextFile(pretrained_path) as embeddings_file:
        for line in embeddings_file:
            token = line.split(' ', 1)[0]
            if token in p_dict.keys():
                fields = line.rstrip().split(' ')
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    return embeddings
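
A hypothetical follow-up that stacks the filtered vectors into a matrix (note that read_embedding downloads a large pretrained file and relies on the module-level p_dict defined above):

import numpy as np

embeddings = read_embedding()
words = sorted(embeddings)
embedding_matrix = np.stack([embeddings[w] for w in words])
print(embedding_matrix.shape)   # (len(words), EMBEDDING_DIM)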
github allenai / vampire / dataset_readers / vocab_generator.py View on Github
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            columns = data_file.readline().strip('\n').split('\t')
            for line in data_file.readlines():
                if not line:
                    continue
                items = line.strip("\n").split("\t")
                tokens = items[columns.index("tokens")]
                category = items[columns.index("category")]
                instance = self.text_to_instance(tokens=tokens,
                                                 category=category)
                if instance is not None:
                    yield instance
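
Since _read is a generator, reading is lazy. A hypothetical usage sketch (placeholder path; the field names come from the keyword arguments passed to text_to_instance above and may differ in the actual reader):

for instance in reader.read("/path/to/data.tsv"):
    print(instance)   # an Instance holding the line's tokens and its category label
    break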