How to use the allennlp.common.file_utils.cached_path function in allennlp

To help you get started, we’ve selected a few allennlp examples that show popular ways cached_path is used in public open source projects.

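Before diving into the project examples, here is a minimal sketch of the pattern they all share: hand cached_path a URL (or a local path) and it returns the path to a locally cached copy that you can open normally. The URL below is a placeholder, not a real dataset.

from allennlp.common.file_utils import cached_path

# Hypothetical remote file, for illustration only.
DATA_URL = "https://example.com/some_dataset.json"

# Downloads the file on first use and reuses the cached copy afterwards;
# a plain local path is returned unchanged.
local_path = cached_path(DATA_URL)

with open(local_path, "r") as data_file:
    raw = data_file.read()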

github titipata / detecting-scientific-claim / scripts / predict_claim_feature_concat.py
import os
import json

import numpy as np
from fasttext import load_model  # fastText Python bindings; the import name varies by version (fasttext vs fastText)
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.decomposition import TruncatedSVD

from allennlp.common.file_utils import cached_path
from allennlp.models.archival import load_archive
from allennlp.predictors import Predictor
from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

EMBEDDING_DIM = 200
MEDLINE_WORD_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/medline_word_prob.json'
DISCOURSE_MODEL_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/model.tar.gz'
PUBMED_PRETRAINED_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/wikipedia-pubmed-and-PMC-w2v.txt.gz'
TRAIN_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/train_labels.json'
VALIDATION_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/validation_labels.json'
TEST_PATH = 'https://s3-us-west-2.amazonaws.com/pubmed-rct/test_labels.json'

archive = load_archive(DISCOURSE_MODEL_PATH) # discourse model
predictor = Predictor.from_archive(archive, 'discourse_predictor')
assert os.path.exists('wiki.en.bin')
ft_model = load_model('wiki.en.bin') # fastText word vector
p_dict = json.load(open(cached_path(MEDLINE_WORD_PATH), 'r'))


def read_embedding(pretrained_path=PUBMED_PRETRAINED_PATH):
    """
    Read the PubMed pretrained embeddings from Amazon S3 and
    return a dictionary of embeddings
    """
    embeddings = {}
    with EmbeddingsTextFile(pretrained_path) as embeddings_file:
        for line in embeddings_file:
            token = line.split(' ', 1)[0]
            if token in p_dict.keys():
                fields = line.rstrip().split(' ')
                vector = np.asarray(fields[1:], dtype='float32')
                embeddings[token] = vector
    return embeddings

github allenai / vampire / dataset_readers / vocab_generator.py
    def _read(self, file_path):
        with open(cached_path(file_path), "r") as data_file:
            logger.info("Reading instances from lines in file at: %s", file_path)
            columns = data_file.readline().strip('\n').split('\t')
            for line in data_file.readlines():
                if not line:
                    continue
                items = line.strip("\n").split("\t")
                tokens = items[columns.index("tokens")]
                category = items[columns.index("category")]
                instance = self.text_to_instance(tokens=tokens,
                                                 category=category)
                if instance is not None:
                    yield instance

github alontalmor / MultiQA / models / multiqa_reader.py
    def _read(self, file_path: str):
        # supporting multi-dataset training:
        datasets = []
        for ind, single_file_path in enumerate(file_path.split(',')):
            single_file_path_cached = cached_path(single_file_path)
            zip_handle = gzip.open(single_file_path_cached, 'rb')
            datasets.append({'single_file_path':single_file_path, \
                             'file_handle': zip_handle, \
                             'num_of_questions':0, 'inst_remainder':[], \
                             'dataset_weight':1 if self._dataset_weight is None else self._dataset_weight[ind] })
            datasets[ind]['header'] = json.loads(datasets[ind]['file_handle'].readline())['header']

        is_done = [False for _ in datasets]
        while not all(is_done):
            for ind, dataset in enumerate(datasets):
                if is_done[ind]:
                    continue

                for example in dataset['file_handle']:
                    example = self.combine_context(json.loads(example))

github alontalmor / MultiQA / convert_multiqa_to_squad_format.py
def multiqa_to_squad(dataset_paths, dataset_weights=None, sample_size = -1):
    # take one or more multiqa files and convert it to a squad format file.
    # supporting multi-dataset training:
    datasets = []
    for ind, single_file_path in enumerate(dataset_paths):
        single_file_path_cached = cached_path(single_file_path)
        zip_handle = gzip.open(single_file_path_cached, 'rb')
        datasets.append({'single_file_path': single_file_path, \
                         'file_handle': zip_handle, \
                         'num_of_questions': 0, 'inst_remainder': [], \
                         'dataset_weight': 1 if dataset_weights is None else dataset_weights[ind]})
        datasets[ind]['header'] = json.loads(datasets[ind]['file_handle'].readline())['header']

    # We will have only one topic here..
    squad_data = {'data':[{'title':'','paragraphs':[]}]}
    is_done = [False for _ in datasets]
    while not all(is_done):
        for ind, dataset in enumerate(datasets):
            if is_done[ind]:
                continue

            for example in dataset['file_handle']:

github allenai / allennlp / allennlp / data / dataset_readers / semantic_parsing / grammar_based_text2sql.py
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._use_all_sql = use_all_sql
        self._remove_unneeded_aliases = remove_unneeded_aliases
        self._use_prelinked_entities = use_prelinked_entities
        self._keep_if_unparsable = keep_if_unparseable

        if not self._use_prelinked_entities:
            raise ConfigurationError(
                "The grammar based text2sql dataset reader "
                "currently requires the use of entity pre-linking."
            )

        self._cross_validation_split_to_exclude = str(cross_validation_split_to_exclude)

        if database_file is not None:
            database_file = cached_path(database_file)
            connection = sqlite3.connect(database_file)
            self._cursor = connection.cursor()
        else:
            self._cursor = None

        self._schema_path = schema_path
        self._world = Text2SqlWorld(
            schema_path,
            self._cursor,
            use_prelinked_entities=use_prelinked_entities,
            use_untyped_entities=use_untyped_entities,
        )

github allenai / allennlp / allennlp / modules / elmo.py
    def _load_highway(self):

        # the highway layers have same dimensionality as the number of cnn filters
        cnn_options = self._options["char_cnn"]
        filters = cnn_options["filters"]
        n_filters = sum(f[1] for f in filters)
        n_highway = cnn_options["n_highway"]

        # create the layers, and load the weights
        self._highways = Highway(n_filters, n_highway, activation=torch.nn.functional.relu)
        for k in range(n_highway):
            # The AllenNLP highway is one matrix multiplication with concatenation of
            # transform and carry weights.
            with h5py.File(cached_path(self._weight_file), "r") as fin:
                # The weights are transposed due to multiplication order assumptions in tf
                # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
                w_transform = numpy.transpose(fin["CNN_high_{}".format(k)]["W_transform"][...])
                # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
                w_carry = -1.0 * numpy.transpose(fin["CNN_high_{}".format(k)]["W_carry"][...])
                weight = numpy.concatenate([w_transform, w_carry], axis=0)
                self._highways._layers[k].weight.data.copy_(torch.FloatTensor(weight))
                self._highways._layers[k].weight.requires_grad = self.requires_grad

                b_transform = fin["CNN_high_{}".format(k)]["b_transform"][...]
                b_carry = -1.0 * fin["CNN_high_{}".format(k)]["b_carry"][...]
                bias = numpy.concatenate([b_transform, b_carry], axis=0)
                self._highways._layers[k].bias.data.copy_(torch.FloatTensor(bias))
                self._highways._layers[k].bias.requires_grad = self.requires_grad

github huggingface / hmtl / hmtl / dataset_readers / mention_ace.py
    def _read(self, file_path: str):
        file_path = cached_path(file_path)  # if `file_path` is a URL, redirect to the cache
        ace_reader = ACE()
        logger.info("Reading ACE Mention instances from dataset files at: %s", file_path)

        for sentence in self._sentence_iterate(ace_reader, file_path):
            tokens = [Token(t) for t in sentence.words]
            if not sentence.mention_tags:
                tags = ["O" for _ in tokens]
            else:
                tags = sentence.mention_tags

            yield self.text_to_instance(tokens, tags)

github allenai / allennlp / allennlp / modules / token_embedders / embedding.py
    def _open_inside_zip(self, archive_path: str, member_path: Optional[str] = None) -> None:
        cached_archive_path = cached_path(archive_path, cache_dir=self._cache_dir)
        archive = zipfile.ZipFile(cached_archive_path, "r")
        if member_path is None:
            members_list = archive.namelist()
            member_path = self._get_the_only_file_in_the_archive(members_list, archive_path)
        member_path = cast(str, member_path)
        member_file = archive.open(member_path, "r")
        self._handle = io.TextIOWrapper(member_file, encoding=self._encoding)
        self._archive_handle = archive
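
The embedding snippet above also shows the optional cache_dir argument, which points cached_path at a custom cache directory instead of the default AllenNLP cache. A minimal sketch of the same idea, with a hypothetical URL and directory:

from allennlp.common.file_utils import cached_path

# Both values are placeholders for illustration.
EMBEDDINGS_URL = "https://example.com/embeddings.zip"

# The downloaded archive is cached under ./my_cache rather than the
# default cache location in the user's home directory.
local_archive = cached_path(EMBEDDINGS_URL, cache_dir="./my_cache")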

github allenai / vampire / vampire / common / allennlp_bridge.py
    def from_params(cls, params: Params, instances: Iterable['adi.Instance'] = None):
        vampire_vocab_file = params.pop('vampire_vocab_file')
        vocab = cls()
        vocab = vocab.from_instances(instances=instances,
                                     tokens_to_add={"classifier": ["@@UNKNOWN@@"]})
        vampire_vocab_file = cached_path(vampire_vocab_file)
        vocab.set_from_file(filename=vampire_vocab_file,
                            namespace="vampire",
                            oov_token="@@UNKNOWN@@",
                            is_padded=False)
        return vocab