How to use the t5.data.utils.MixtureRegistry class in t5

To help you get started, we’ve selected a few t5 examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github google-research / text-to-text-transfer-transformer / t5 / data / test_utils.py View on Github external
import collections
import os
import shutil

from absl import logging
from absl.testing import absltest
import numpy as np
import six
from t5.data import sentencepiece_vocabulary
from t5.data import utils as dataset_utils
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

# Short module-level aliases for the two registries in t5.data.utils.
TaskRegistry = dataset_utils.TaskRegistry
MixtureRegistry = dataset_utils.MixtureRegistry

# Alias for the mock module exposed through absltest.
mock = absltest.mock

# Absolute path of the "test_data" directory that sits next to this file.
TEST_DATA_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "test_data")


# _ProxyTest is required because py2 does not allow instantiating
# absltest.TestCase directly.
class _ProxyTest(absltest.TestCase):
  """Instance of TestCase to reuse methods for testing."""
  # Per the unittest documentation, None removes the cap on the length of
  # diffs printed by failing assertions.
  maxDiff = None

  def runTest(self):
    """Do nothing; exists so that the class can be instantiated."""
    pass
github google-research / text-to-text-transfer-transformer / t5 / data / mixtures.py View on Github external
# Tasks is not the best description. For example, glue_v002_equal refers to a
# mixture. Calling it "finetune tasks" because we consider all glue tasks as
# a single dataset to train on.
_finetune_tasks = [
    "glue_v002_proportional",  # mixture
    "super_glue_v102_proportional",  # mixture
    "cnn_dailymail_v002",
    "squad_v010_allanswers",
    "wmt_t2t_ende_v003",
    "wmt15_enfr_v003",
    "wmt16_enro_v003"
]

# ========================== GLUE and SuperGLUE ================================

# Mixture over the names in _glue_tasks. rate_num_examples is defined outside
# this excerpt; presumably it weights each task by its number of examples and
# is applied to entries that carry no explicit rate -- TODO confirm.
MixtureRegistry.add(
    "glue_v002_proportional",
    _glue_tasks, default_rate=rate_num_examples)


# Same construction, over the names in _super_glue_tasks.
MixtureRegistry.add(
    "super_glue_v102_proportional",
    _super_glue_tasks,
    default_rate=rate_num_examples)


# mnli and its associated dev sets: mnli_matched and mnli_mismatched
# Members are picked by substring match ("mnli") on the names in _glue_tasks;
# the mixture is registered with a constant default rate of 1.0.
MixtureRegistry.add(
    "glue_mnli_and_dev_v002",
    [t for t in _glue_tasks if "mnli" in t],
    default_rate=1.0)
github google-research / text-to-text-transfer-transformer / t5 / data / mixtures.py View on Github external
# ============================== Co-training ===================================


# C4, glue, squad, superglue
#  The supervised tasks here are all small datasets
#  Mix them proportionally to their dataset sizes.
# TODO(noam): This should be called "small_mix" or something, but we will
#   keep it as en_mix to avoid restarting experiments.
# TODO(noam): some rates should be reduced - but not now to avoid restarting
#     experiments.   They are:
#  - Tasks duplicated between glue and superglue (see _dedupe)
#  - squad and glue_qnli are duplicates
#  - glue_sst2 may contain overlapping phrases (related examples with itself)
#  - we seem to overtrain on super_glue_record - don't know why
# The C4 entry is a (name, rate) tuple while the other entries are bare names;
# presumably the tuple overrides default_rate for that entry -- TODO confirm
# against the Mixture implementation.
MixtureRegistry.add(
    "en_mix",
    [("c4_v020_unsupervised", rate_unsupervised)] +
    _glue_tasks + _super_glue_tasks +
    ["squad_v010_allanswers"],
    default_rate=rate_num_examples)

# Every name in _supervised_tasks plus unsupervised C4, registered with a
# constant default rate of 1.
MixtureRegistry.add(
    "all_equal",
    _supervised_tasks + ["c4_v020_unsupervised"],
    default_rate=1.,
)


def _dedupe(name):
  if "glue" in name and "rte" in name:
    return functools.partial(rate_num_examples, scale=0.5)
github google-research / text-to-text-transfer-transformer / t5 / data / utils.py View on Github external
def add(cls, name, tasks, default_rate=None):
    """Register a mixture by delegating to the parent registry's `add`.

    Args:
      cls: the registry class this method is bound to.
      name: forwarded to the parent `add` as its first argument.
      tasks: forwarded to the parent `add` after `Mixture`.
      default_rate: forwarded to the parent `add` as its last argument;
        defaults to None.
    """
    parent_registry = super(MixtureRegistry, cls)
    # `Mixture` is passed as the second positional argument of the parent call.
    parent_registry.add(name, Mixture, tasks, default_rate)
github google-research / text-to-text-transfer-transformer / t5 / data / utils.py View on Github external
def get_mixture_or_task(task_or_mixture_name):
  """Look a name up in the Mixture registry first, then the Task registry.

  Args:
    task_or_mixture_name: name to search for in both registries.

  Returns:
    The result of `MixtureRegistry.get` when the name is a registered
    mixture (a warning is logged if it is also a registered task); otherwise
    the result of `TaskRegistry.get`.

  Raises:
    ValueError: if the name appears in neither registry.
  """
  mixture_names = MixtureRegistry.names()
  task_names = TaskRegistry.names()
  is_mixture = task_or_mixture_name in mixture_names
  is_task = task_or_mixture_name in task_names

  if not is_mixture and not is_task:
    raise ValueError("No Task or Mixture found with name: %s" %
                     task_or_mixture_name)
  if not is_mixture:
    return TaskRegistry.get(task_or_mixture_name)
  # The name is a mixture; mixtures take precedence over same-named tasks.
  if is_task:
    logging.warning("%s is both a Task and a Mixture, returning Mixture",
                    task_or_mixture_name)
  return MixtureRegistry.get(task_or_mixture_name)
github google-research / text-to-text-transfer-transformer / t5 / data / utils.py View on Github external
def get_mixture_or_task(task_or_mixture_name):
  """Resolve a name to a Mixture if one is registered, else to a Task.

  Args:
    task_or_mixture_name: name to resolve against both registries.

  Returns:
    `MixtureRegistry.get(name)` if the name is among the mixture names
    (logging a warning when it is also among the task names); otherwise
    `TaskRegistry.get(name)`.

  Raises:
    ValueError: if neither registry lists the name.
  """
  known_mixtures = MixtureRegistry.names()
  known_tasks = TaskRegistry.names()
  registered_as_mixture = task_or_mixture_name in known_mixtures
  registered_as_task = task_or_mixture_name in known_tasks

  if registered_as_mixture:
    # Mixtures win when the same name is registered in both places.
    if registered_as_task:
      logging.warning("%s is both a Task and a Mixture, returning Mixture",
                      task_or_mixture_name)
    return MixtureRegistry.get(task_or_mixture_name)

  if not registered_as_task:
    raise ValueError("No Task or Mixture found with name: %s" %
                     task_or_mixture_name)
  return TaskRegistry.get(task_or_mixture_name)