from __future__ import print_function

import collections
import os
import shutil

from absl import logging
from absl.testing import absltest
import numpy as np
import six
from t5.data import sentencepiece_vocabulary
from t5.data import utils as dataset_utils
import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

TaskRegistry = dataset_utils.TaskRegistry
MixtureRegistry = dataset_utils.MixtureRegistry

mock = absltest.mock

TEST_DATA_DIR = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "test_data")


def sentencepiece_vocab(extra_ids=0):
  return sentencepiece_vocabulary.SentencePieceVocabulary(
      os.path.join(TEST_DATA_DIR, "sentencepiece", "sentencepiece.model"),
      extra_ids=extra_ids)


# _ProxyTest is required because py2 does not allow instantiating
# absltest.TestCase directly.
class _ProxyTest(absltest.TestCase):
  """Instance of TestCase to reuse methods for testing."""
  maxDiff = None

  def runTest(self):
    pass
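# Example (hypothetical usage): a module-level _ProxyTest instance lets helper
# functions borrow absltest assertion methods outside of a normal test run:
#   _pyunit_proxy = _ProxyTest()
#   _pyunit_proxy.assertEqual(1, 1)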
_dump_fake_dataset(
    os.path.join(cached_task_dir, "train.tfrecord"),
    _FAKE_CACHED_DATASET["train"], [2, 1], _dump_examples_to_tfrecord)
_dump_fake_dataset(
    os.path.join(cached_task_dir, "validation.tfrecord"),
    _FAKE_CACHED_DATASET["validation"], [2], _dump_examples_to_tfrecord)
# Prepare uncached TfdsTask.
add_tfds_task("uncached_task")
self.uncached_task = TaskRegistry.get("uncached_task")
# Prepare uncached TextLineTask.
_dump_fake_dataset(
    os.path.join(self.test_data_dir, "train.tsv"),
    _FAKE_DATASET["train"], [2, 1], _dump_examples_to_tsv)
TaskRegistry.add(
    "text_line_task",
    dataset_utils.TextLineTask,
    split_to_filepattern={
        "train": os.path.join(self.test_data_dir, "train.tsv*"),
    },
    skip_header_lines=1,
    text_preprocessor=[_split_tsv_preprocessor, test_text_preprocessor],
    sentencepiece_model_path=os.path.join(
        TEST_DATA_DIR, "sentencepiece", "sentencepiece.model"),
    metric_fns=[])
self.text_line_task = TaskRegistry.get("text_line_task")
# Auto-verify any split by just returning the split name.
dataset_utils.verify_tfds_split = absltest.mock.Mock(
    side_effect=lambda x, y: y)
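# The mock simply echoes its second argument, so a call like
# dataset_utils.verify_tfds_split(anything, "train") returns "train"
# without consulting real TFDS split metadata.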
def add_tfds_task(
    name,
    tfds_name="fake:0.0.0",
    text_preprocessor=test_text_preprocessor,
    token_preprocessor=None,
    splits=None):
  TaskRegistry.add(
      name,
      dataset_utils.TfdsTask,
      tfds_name=tfds_name,
      text_preprocessor=text_preprocessor,
      token_preprocessor=token_preprocessor,
      sentencepiece_model_path=os.path.join(TEST_DATA_DIR, "sentencepiece",
                                            "sentencepiece.model"),
      metric_fns=[],
      splits=splits)
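# Example (assumed task name): register a fake TfdsTask restricted to its
# train split only.
#   add_tfds_task("fake_train_only_task", splits=["train"])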
# Inject a fake dataset into LazyTfdsLoader's memoization cache so that
# lookups of (name, data_dir=None) return the fake instead of loading
# real TFDS data.
def add_fake_tfds(fake_tfds):
  dataset_utils.LazyTfdsLoader._MEMOIZED_INSTANCES[  # pylint:disable=protected-access
      (fake_tfds.name, None)] = fake_tfds
class Metric(object):

  def __init__(self, name, group=None):
    self.name = name
    self.group = group or name


# This OrderedDict maps TensorBoard tags to nice-looking metric names.
# The order of the keys in the dict determines the order in which they get
# logged.
METRIC_NAMES = collections.OrderedDict([
    ("glue_average", Metric("Average GLUE Score")),
    ("glue_cola_v002/matthews_corrcoef", Metric("CoLA")),
    ("glue_sst2_v002/accuracy", Metric("SST-2")),
    ("glue_mrpc_v002/f1", Metric("MRPC (F1)", "MRPC")),
    ("glue_mrpc_v002/accuracy", Metric("MRPC (accuracy)", "MRPC")),
    ("glue_stsb_v002/pearson_corrcoef", Metric("STSB (Pearson)", "STSB")),
    ("glue_stsb_v002/spearman_corrcoef", Metric("STSB (Spearman)", "STSB")),
    ("glue_qqp_v002/f1", Metric("QQP (F1)", "QQP")),
    ("glue_qqp_v002/accuracy", Metric("QQP (accuracy)", "QQP")),
    ("glue_mnli_matched_v002/accuracy", Metric("MNLIm", "MNLI")),
    ("glue_mnli_mismatched_v002/accuracy", Metric("MNLImm", "MNLI")),
    ("glue_qnli_v002/accuracy", Metric("QNLI")),
    ("glue_rte_v002/accuracy", Metric("GLUE RTE")),
    ("cnn_dailymail_v002/rouge1", Metric("CNN/DM (ROUGE-1)", "CNN/DM")),
    ("cnn_dailymail_v002/rouge2", Metric("CNN/DM (ROUGE-2)", "CNN/DM")),
    ("cnn_dailymail_v002/rougeL", Metric("CNN/DM (ROUGE-L)", "CNN/DM")),
    ("cnn_dailymail_v002/rougeLsum", Metric("CNN/DM (ROUGE-L)", "CNN/DM")),
    ("squad_v010_allanswers/em", Metric("SQuAD (EM)", "SQuAD")),
    ("squad_v010_allanswers/f1", Metric("SQuAD (F1)", "SQuAD")),
    ("squad_v010_allanswers_span/em", Metric("SQuAD (EM)", "SQuAD")),
    ("squad_v010_allanswers_span/f1", Metric("SQuAD (F1)", "SQuAD")),
    ("squad_v010/em", Metric("SQuAD (EM)", "SQuAD")),
    ("squad_v010/f1", Metric("SQuAD (F1)", "SQuAD")),
    ("super_glue_average", Metric("Average SuperGLUE Score")),
    ("super_glue_boolq_v102/accuracy", Metric("BoolQ (accuracy)")),
    ("super_glue_cb_v102/mean_3class_f1", Metric("CB (F1)", "CB")),
    ("super_glue_cb_v102/accuracy", Metric("CB (accuracy)", "CB")),
    ("super_glue_copa_v102/accuracy", Metric("CoPA")),
    ("super_glue_multirc_v102/f1", Metric("MultiRC (F1)", "MultiRC")),
    ("super_glue_multirc_v102/exact_match", Metric("MultiRC (EM)", "MultiRC")),
    ("super_glue_record_v102/f1", Metric("ReCoRD (F1)", "ReCoRD")),
    ("super_glue_record_v102/em", Metric("ReCoRD (EM)", "ReCoRD")),
    ("super_glue_rte_v102/accuracy", Metric("SuperGLUE RTE")),
    ("super_glue_wic_v102/accuracy", Metric("WiC")),
    ("super_glue_wsc_v102_simple_eval/accuracy", Metric("WSC")),
    ("dpr_v001_simple/accuracy", Metric("DPR")),
    ("wmt_t2t_ende_v003/bleu", Metric("WMT T2T En-De")),
    ("wmt14_ende_v003/bleu", Metric("WMT14 En-De")),
    ("wmt15_enfr_v003/bleu", Metric("WMT15 En-Fr")),
    ("wmt16_enro_v003/bleu", Metric("WMT16 En-Ro")),
])
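# Example (illustrative): looking up a raw TensorBoard tag gives its display
# name and grouping.
#   METRIC_NAMES["glue_mrpc_v002/f1"].name   -> "MRPC (F1)"
#   METRIC_NAMES["glue_mrpc_v002/f1"].group  -> "MRPC"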
# A single scalar summary point: the global step and the value logged at it.
Event = collections.namedtuple("event", ["step", "value"])
def parse_events_files(tb_summary_dir):
  """Parse all TensorBoard events files in tb_summary_dir.

  Args:
    tb_summary_dir: str, path to look for events files in.