How to use the emmental.data.EmmentalDataset class in emmental

To help you get started, we’ve selected a few emmental examples, based on popular ways it is used in public projects.

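All of the snippets below follow the same basic pattern: build an EmmentalDataset from a dictionary of input fields (X_dict) and a dictionary of label tensors (Y_dict), then wrap it in an EmmentalDataLoader keyed by task name. Here is a minimal sketch of that pattern, assembled from the calls that appear in the examples; the data shapes and the names "example_task", "feature", and "label" are illustrative placeholders.

import emmental
import numpy as np
import torch
from emmental.data import EmmentalDataLoader, EmmentalDataset

# Initialize Emmental's global config, as the test snippets below do.
emmental.Meta.init()

# Illustrative data: 20 examples with 2 features and one label each.
x = np.random.rand(20, 2)
y = torch.from_numpy(np.random.rand(20))

# X_dict holds named input fields; Y_dict holds named label tensors.
dataset = EmmentalDataset(
    name="example_task", X_dict={"feature": x}, Y_dict={"label": y}
)

# The dataloader maps each task name to the label field it should use.
dataloader = EmmentalDataLoader(
    task_to_label_dict={"example_task": "label"},
    dataset=dataset,
    split="train",
    batch_size=10,
    shuffle=True,
)

# Draw one batch of features and labels.
x_batch, y_batch = next(iter(dataloader))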

SenWu / emmental / tests / schedulers / test_mixed_scheduler.py (View on GitHub)
    caplog.set_level(logging.INFO)

    emmental.Meta.init()

    task1 = "task1"
    x1 = np.random.rand(20, 2)
    y1 = torch.from_numpy(np.random.rand(20))

    task2 = "task2"
    x2 = np.random.rand(30, 3)
    y2 = torch.from_numpy(np.random.rand(30))

    dataloaders = [
        EmmentalDataLoader(
            task_to_label_dict={task_name: "label"},
            dataset=EmmentalDataset(
                name=task_name, X_dict={"feature": x}, Y_dict={"label": y}
            ),
            split="train",
            batch_size=10,
            shuffle=True,
        )
        for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]
    ]

    scheduler = MixedScheduler()
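    # With batch size 10, task1 yields 2 batches and task2 yields 3; the mixed
    # scheduler draws one batch from each task per step, so it runs min(2, 3) = 2 steps.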

    assert scheduler.get_num_batches(dataloaders) == 2

    batch_task_names_1 = [
        batch_data[0][-2] for batch_data in scheduler.get_batches(dataloaders)
    ]
SenWu / emmental / tests / schedulers / test_sequential_scheduler.py (View on GitHub)
    caplog.set_level(logging.INFO)

    emmental.Meta.init()

    task1 = "task1"
    x1 = np.random.rand(20, 2)
    y1 = torch.from_numpy(np.random.rand(20))

    task2 = "task2"
    x2 = np.random.rand(30, 3)
    y2 = torch.from_numpy(np.random.rand(30))

    dataloaders = [
        EmmentalDataLoader(
            task_to_label_dict={task_name: "label"},
            dataset=EmmentalDataset(
                name=task_name, X_dict={"feature": x}, Y_dict={"label": y}
            ),
            split="train",
            batch_size=10,
            shuffle=True,
        )
        for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]
    ]

    scheduler = SequentialScheduler()
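    # Sequential scheduling walks each dataloader in turn, visiting every batch: 2 + 3 = 5.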

    assert scheduler.get_num_batches(dataloaders) == 5

    batch_task_names = [
        batch_data[-2] for batch_data in scheduler.get_batches(dataloaders)
    ]
SenWu / emmental / tests / test_e2e.py (View on GitHub)
        torch.tensor(Y2[int(0.9 * N) :]),
    )

    train_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label1": Y1_train}
    )

    train_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label2": Y2_train}
    )

    dev_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label1": Y1_dev}
    )

    dev_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label2": Y2_dev}
    )

    test_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_test}, Y_dict={"label1": Y1_test}
    )

    test_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_test}, Y_dict={"label2": Y2_test}
    )

    task_to_label_dict = {"task1": "label1"}

    train_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset1,
SenWu / emmental / tests / data / test_data.py (View on GitHub)
    with pytest.raises(ValueError):
        dataset.add_labels(Y_dict={"label2": x2})

    # Check adding one more label to the dataset
    assert torch.equal(dataset[0][1]["label2"], y2[0])

    dataset.remove_label(label_name="label1")

    # Check removing a label from the dataset
    assert "label1" not in dataset.Y_dict

    with pytest.raises(ValueError):
        dataset = EmmentalDataset(
            X_dict={"data1": x1}, Y_dict={"label1": y1}, name="new_data", uid="ids"
        )

    dataset = EmmentalDataset(
        X_dict={"_uids_": x1}, Y_dict={"label1": y1}, name="new_data"
    )

    with pytest.raises(ValueError):
        dataset = EmmentalDataset(
            X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
        )

SenWu / emmental / tests / test_e2e.py (View on GitHub)
    train_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label1": Y1_train}
    )

    train_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label2": Y2_train}
    )

    dev_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label1": Y1_dev}
    )

    dev_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label2": Y2_dev}
    )

    test_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_test}, Y_dict={"label1": Y1_test}
    )

    test_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_test}, Y_dict={"label2": Y2_test}
    )

    task_to_label_dict = {"task1": "label1"}

    train_dataloader1 = EmmentalDataLoader(
        task_to_label_dict=task_to_label_dict,
        dataset=train_dataset1,
        split="train",
        batch_size=10,
    )
    dev_dataloader1 = EmmentalDataLoader(
SenWu / emmental / tests / data / test_data.py (View on GitHub)
    dataset.add_labels(Y_dict={"label2": y2})

    with pytest.raises(ValueError):
        dataset.add_labels(Y_dict={"label2": x2})

    # Check adding one more label to the dataset
    assert torch.equal(dataset[0][1]["label2"], y2[0])

    dataset.remove_label(label_name="label1")

    # Check removing a label from the dataset
    assert "label1" not in dataset.Y_dict

    with pytest.raises(ValueError):
        dataset = EmmentalDataset(
            X_dict={"data1": x1}, Y_dict={"label1": y1}, name="new_data", uid="ids"
        )

    dataset = EmmentalDataset(
        X_dict={"_uids_": x1}, Y_dict={"label1": y1}, name="new_data"
    )

    with pytest.raises(ValueError):
        dataset = EmmentalDataset(
            X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
        )

SenWu / emmental / tests / test_e2e.py (View on GitHub)
    Y1_train, Y1_dev, Y1_test = (
        torch.tensor(Y1[: int(0.8 * N)]),
        torch.tensor(Y1[int(0.8 * N) : int(0.9 * N)]),
        torch.tensor(Y1[int(0.9 * N) :]),
    )
    Y2_train, Y2_dev, Y2_test = (
        torch.tensor(Y2[: int(0.8 * N)]),
        torch.tensor(Y2[int(0.8 * N) : int(0.9 * N)]),
        torch.tensor(Y2[int(0.9 * N) :]),
    )
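    # 80% / 10% / 10% train / dev / test split of the synthetic labels.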

    train_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label1": Y1_train}
    )

    train_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_train}, Y_dict={"label2": Y2_train}
    )

    dev_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label1": Y1_dev}
    )

    dev_dataset2 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_dev}, Y_dict={"label2": Y2_dev}
    )

    test_dataset1 = EmmentalDataset(
        name="synthetic", X_dict={"data": X_test}, Y_dict={"label1": Y1_test}
    )

    test_dataset2 = EmmentalDataset(
SenWu / emmental / tests / data / test_data.py (View on GitHub)
        torch.Tensor([1, 2, 3, 4, 5]),
    ]

    y1 = torch.Tensor([0, 0, 0, 0, 0])

    x2 = [
        torch.Tensor([1, 2, 3, 4, 5]),
        torch.Tensor([1, 2, 3, 4]),
        torch.Tensor([1, 2, 3]),
        torch.Tensor([1, 2]),
        torch.Tensor([1]),
    ]

    y2 = torch.Tensor([1, 1, 1, 1, 1])

    dataset = EmmentalDataset(
        X_dict={"data1": x1, "data2": x2},
        Y_dict={"label1": y1, "label2": y2},
        name="new_data",
    )

    dataloader1 = EmmentalDataLoader(
        task_to_label_dict={"task1": "label1"},
        dataset=dataset,
        split="train",
        batch_size=2,
    )

    x_batch, y_batch = next(iter(dataloader1))
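    # x_batch and y_batch are dicts of batched values, keyed by the X_dict and Y_dict field names.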

    # Check if the dataloader is correctly constructed
    assert dataloader1.task_to_label_dict == {"task1": "label1"}
SenWu / emmental / tests / schedulers / test_round_robin_scheduler.py (View on GitHub)
    # Set random seed
    set_random_seed(2)

    task1 = "task1"
    x1 = np.random.rand(20, 2)
    y1 = torch.from_numpy(np.random.rand(20))

    task2 = "task2"
    x2 = np.random.rand(30, 3)
    y2 = torch.from_numpy(np.random.rand(30))

    dataloaders = [
        EmmentalDataLoader(
            task_to_label_dict={task_name: "label"},
            dataset=EmmentalDataset(
                name=task_name, X_dict={"feature": x}, Y_dict={"label": y}
            ),
            split="train",
            batch_size=10,
            shuffle=True,
        )
        for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]
    ]

    scheduler = RoundRobinScheduler()
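    # Round-robin scheduling interleaves the two dataloaders but still yields
    # every batch: 2 + 3 = 5.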

    assert scheduler.get_num_batches(dataloaders) == 5

    batch_task_names = [
        batch_data[-2] for batch_data in scheduler.get_batches(dataloaders)
    ]
HazyResearch / fonduer / src / fonduer / learning / dataset.py (View on GitHub)
import logging
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from emmental.data import EmmentalDataset
from scipy.sparse import csr_matrix
from torch import Tensor

from fonduer.candidates.models import Candidate
from fonduer.learning.utils import mark_sentence, mention_to_tokens

logger = logging.getLogger(__name__)


class FonduerDataset(EmmentalDataset):
    """A FonduerDataset class which is inherited from EmmentalDataset, which takes
    list of candidates and corresponding feature matrix as input and wraps them.

    :param name: The name of the dataset.
    :type name: str
    :param candidates: The list of candidates.
    :type candidates: List[Candidate]
    :param features: The corresponding feature matrix.
    :type features: csr_matrix
    :param word2id: The word-to-index dictionary.
    :type word2id: dict
    :param labels: If np.array, it's the label for all candidates; If int, it's
        the number of classes of label and we will create placeholder labels
        (mainly used for inference).
    :type labels: np.array or int
    :param index: Which candidates to use. If None, use all candidates.
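The snippet above is cut off mid-docstring, but the listed parameters are enough to sketch how such a dataset is typically wired into an EmmentalDataLoader. This is a hypothetical sketch, not code from the Fonduer repository: the keyword-argument call, the import path (inferred from the file path shown above), the task name "relation_task", the Y_dict key "labels", and the variables candidates, features, and word2id (assumed to come from an earlier Fonduer pipeline stage) are all assumptions for illustration.

from emmental.data import EmmentalDataLoader
from fonduer.learning.dataset import FonduerDataset

# Assumed to be produced earlier in a Fonduer pipeline:
#   candidates: List[Candidate] from candidate extraction
#   features:   csr_matrix of featurized candidates
#   word2id:    word-to-index dictionary
dataset = FonduerDataset(
    name="relation_task",
    candidates=candidates,
    features=features,
    word2id=word2id,
    labels=2,  # an int is read as the number of classes; placeholder labels are created for inference
)

# Assuming the dataset stores its labels under the Y_dict key "labels".
dataloader = EmmentalDataLoader(
    task_to_label_dict={"relation_task": "labels"},
    dataset=dataset,
    split="test",
    batch_size=16,
)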