How to use snorkel - 10 common examples

To help you get started, we've selected ten snorkel examples based on popular ways the library is used in public projects.

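Most of the examples below exercise snorkel's LabelModel, which combines the votes of several labeling functions (LFs) into probabilistic training labels. For orientation, here is a minimal sketch of that core workflow; the DataFrame, its text column, and the two toy LFs are hypothetical placeholders, not taken from any example below.

import pandas as pd
from snorkel.labeling import labeling_function, PandasLFApplier
from snorkel.labeling.model import LabelModel

ABSTAIN, HAM, SPAM = -1, 0, 1

@labeling_function()
def lf_contains_link(x):
    # Vote SPAM when the (hypothetical) text column mentions a URL.
    return SPAM if "http" in x.text else ABSTAIN

@labeling_function()
def lf_short(x):
    # Vote HAM for very short messages; otherwise abstain.
    return HAM if len(x.text) < 20 else ABSTAIN

df = pd.DataFrame({"text": ["check out http://spam.example", "hi there"]})

# Apply the LFs to build an (n_examples x n_lfs) label matrix L.
applier = PandasLFApplier([lf_contains_link, lf_short])
L = applier.apply(df)

# Fit the generative LabelModel and produce labels for downstream training.
label_model = LabelModel(cardinality=2, verbose=False)
label_model.fit(L, n_epochs=100, seed=123)
preds = label_model.predict(L)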

github HazyResearch/fonduer/tests/e2e/test_e2e.py (view on GitHub)
    stg_temp_lfs_2 = [  # name inferred from the labeler.update() call below; earlier entries truncated in the source snippet
        LF_voltage_row_temp,
        LF_voltage_row_part,
        LF_typ_row,
        LF_complement_left_row,
        LF_too_many_numbers_row,
        LF_temp_on_high_page_num,
        LF_temp_outside_table,
        LF_not_temp_relevant,
    ]
    labeler.update(split=0, lfs=[stg_temp_lfs_2, ce_v_max_lfs], parallelism=PARALLEL)
    assert session.query(Label).count() == 6478
    assert session.query(LabelKey).count() == 16
    L_train = labeler.get_label_matrices(train_cands)
    assert L_train[0].shape == (3493, 16)

    gen_model = LabelModel()
    gen_model.fit(L_train=L_train[0], n_epochs=500, log_freq=100)

    train_marginals = gen_model.predict_proba(L_train[0])

    diffs = train_marginals.max(axis=1) - train_marginals.min(axis=1)
    train_idxs = np.where(diffs > 1e-6)[0]

    train_dataloader = EmmentalDataLoader(
        task_to_label_dict={ATTRIBUTE: "labels"},
        dataset=FonduerDataset(
            ATTRIBUTE,
            train_cands[0],
            F_train[0],
            emb_layer.word2id,
            train_marginals,
            train_idxs,
        ),
        # snippet truncated in source: remaining EmmentalDataLoader kwargs omitted
    )
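The diffs-based filter above keeps only candidates whose predicted marginals are non-uniform, i.e. roughly those that at least one labeling function voted on. When the training data lives in a pandas DataFrame, snorkel's filter_unlabeled_dataframe utility performs the same step directly from the label matrix; a minimal sketch with placeholder inputs:

import numpy as np
import pandas as pd
from snorkel.labeling import filter_unlabeled_dataframe

# Placeholder inputs: training frame, LabelModel marginals, label matrix.
df_train = pd.DataFrame({"text": ["a", "b", "c"]})
probs_train = np.array([[0.9, 0.1], [0.5, 0.5], [0.2, 0.8]])
L_train = np.array([[1, -1], [-1, -1], [0, 1]])

# Drop rows where every LF abstained (the middle row here).
df_filtered, probs_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)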
github snorkel-team/snorkel/test/labeling/model/test_label_model.py (view on GitHub)
def test_model_loss(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel(cardinality=2, verbose=False)

        label_model.fit(L, n_epochs=1)
        init_loss = label_model._loss_mu().item()

        label_model.fit(L, n_epochs=10)
        next_loss = label_model._loss_mu().item()

        self.assertLessEqual(next_loss, init_loss)

        with self.assertRaisesRegex(Exception, "Loss is NaN."):
            label_model.fit(L, n_epochs=10, lr=1e8)
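This test reads the training loss through the internal _loss_mu helper to check two properties of fit: training for more epochs should not increase the loss, and an absurdly large learning rate (lr=1e8) should drive the loss to NaN, which fit surfaces as an exception.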
github snorkel-team/snorkel/test/labeling/model/test_label_model.py (view on GitHub)
def test_score(self):
        L = np.array([[1, 1, 0], [-1, -1, -1], [1, 0, 1]])
        Y = np.array([1, 0, 1])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=100)
        results = label_model.score(L, Y, metrics=["accuracy", "coverage"])
        np.testing.assert_array_almost_equal(
            label_model.predict(L), np.array([1, -1, 1])
        )

        results_expected = dict(accuracy=1.0, coverage=2 / 3)
        self.assertEqual(results, results_expected)

        L = np.array([[1, 0, 1], [1, 0, 1]])
        label_model = self._set_up_model(L)
        label_model.mu = nn.Parameter(label_model.mu_init.clone().clamp(0.01, 0.99))

        results = label_model.score(L, Y=np.array([0, 1]))
        results_expected = dict(accuracy=0.5)
        self.assertEqual(results, results_expected)
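Note how the two metrics interact in the first block: every LF abstains (-1) on the second data point, so predict returns -1 there; accuracy is computed only over the non-abstained points, which is how the model scores accuracy=1.0 with coverage=2/3.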
github snorkel-team/snorkel/test/labeling/model/test_label_model.py (view on GitHub)
def test_scheduler_init(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel()

        label_model.fit(L, lr_scheduler="constant", n_epochs=1)
        self.assertIsNone(label_model.lr_scheduler)

        label_model.fit(L, lr_scheduler="linear", n_epochs=1)
        self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.LambdaLR)

        label_model.fit(L, lr_scheduler="exponential", n_epochs=1)
        self.assertIsInstance(
            label_model.lr_scheduler, optim.lr_scheduler.ExponentialLR
        )

        label_model.fit(L, lr_scheduler="step", n_epochs=1)
        self.assertIsInstance(label_model.lr_scheduler, optim.lr_scheduler.StepLR)
github snorkel-team/snorkel/test/labeling/model/test_label_model.py (view on GitHub)
def test_label_model_sparse(self) -> None:
        """Test the LabelModel's estimate of P and Y on a sparse synthetic dataset.

        This tests the common setting where LFs abstain most of the time, which
        can cause issues, for example, if the parameter clamping is set too high
        (see Issue #1422).
        """
        np.random.seed(123)
        P, Y, L = generate_simple_label_matrix(
            self.n, self.m, self.cardinality, abstain_multiplier=1000.0
        )

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L, n_epochs=1000, lr=0.01, seed=123)

        # Test estimated LF conditional probabilities
        P_lm = label_model.get_conditional_probs()
        np.testing.assert_array_almost_equal(P, P_lm, decimal=2)

        # Test predicted labels *only on non-abstained data points*
        Y_pred = label_model.predict(L, tie_break_policy="abstain")
        (idx,) = np.where(Y_pred != -1)
        acc = np.where(Y_pred[idx] == Y[idx], 1, 0).sum() / len(idx)
        self.assertGreaterEqual(acc, 0.65)

        # Make sure that we don't output abstain when an LF votes, per issue #1422
        self.assertEqual(len(idx), np.where((L + 1).sum(axis=1) != 0, 1, 0).sum())
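With tie_break_policy="abstain", predict emits -1 whenever the posterior is tied, so accuracy is measured only over points where the model commits to a label. The final assertion pins down the Issue #1422 behavior: a data point on which at least one LF votes is never returned as an abstain.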
github snorkel-team/snorkel/test/labeling/model/test_label_model.py (view on GitHub)
def test_optimizer(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=1, optimizer="sgd")
        label_model.fit(L, n_epochs=1, optimizer="adam")
        label_model.fit(L, n_epochs=1, optimizer="adamax")
        with self.assertRaisesRegex(ValueError, "Unrecognized optimizer option"):
            label_model.fit(L, n_epochs=1, optimizer="bad_opt")
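As the test shows, the accepted optimizer strings are "sgd", "adam", and "adamax"; any other value raises a ValueError.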
github snorkel-team/snorkel/test/labeling/model/test_label_model.py (view on GitHub)
def test_lr_scheduler(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=1)
        label_model.fit(L, n_epochs=1, lr_scheduler="constant")
        label_model.fit(L, n_epochs=1, lr_scheduler="linear")
        label_model.fit(L, n_epochs=1, lr_scheduler="exponential")
        label_model.fit(L, n_epochs=1, lr_scheduler="step")
        with self.assertRaisesRegex(ValueError, "Unrecognized lr scheduler option"):
            label_model.fit(L, n_epochs=1, lr_scheduler="bad_scheduler")
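Likewise, the valid lr_scheduler strings are "constant", "linear", "exponential", and "step" (test_scheduler_init above shows which torch.optim.lr_scheduler class each maps onto); an unrecognized name raises a ValueError.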
github snorkel-team/snorkel/test/labeling/model/test_label_model.py (view on GitHub)
def test_save_and_load(self):
        L = np.array([[0, -1, 0], [0, 1, 1]])
        label_model = LabelModel(cardinality=2, verbose=False)
        label_model.fit(L, n_epochs=1)
        original_preds = label_model.predict(L)

        dir_path = tempfile.mkdtemp()
        # Join with the directory so the file lands inside it and rmtree can remove it.
        save_path = os.path.join(dir_path, "label_model.pkl")
        label_model.save(save_path)

        label_model_new = LabelModel(cardinality=2, verbose=False)
        label_model_new.load(save_path)
        loaded_preds = label_model_new.predict(L)
        shutil.rmtree(dir_path)

        np.testing.assert_array_equal(loaded_preds, original_preds)
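The round trip is exact: a freshly constructed LabelModel that loads the saved file reproduces the original model's predictions, as the final array-equality assertion verifies.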
github snorkel-team/snorkel/test/labeling/model/test_label_model.py (view on GitHub)
def test_optimizer_init(self):
        L = np.array([[0, -1, 0], [0, 1, 0]])
        label_model = LabelModel()

        label_model.fit(L, optimizer="sgd", n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optim.SGD)

        label_model.fit(L, optimizer="adam", n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optim.Adam)

        label_model.fit(L, optimizer="adamax", n_epochs=1)
        self.assertIsInstance(label_model.optimizer, optim.Adamax)

        with self.assertRaisesRegex(ValueError, "Unrecognized optimizer"):
            label_model.fit(L, optimizer="bad_optimizer", n_epochs=1)
github snorkel-team/snorkel/test/slicing/test_convergence.py (view on GitHub)
        # snippet truncated in source: enclosing test method definition omitted
        base_task = create_task("task", module_suffixes=["A", "B"])

        # Apply SFs
        slicing_functions = [f, g]  # low-coverage slices
        slice_names = [sf.name for sf in slicing_functions]
        applier = PandasSFApplier(slicing_functions)
        S_train = applier.apply(self.df_train, progress_bar=False)
        S_valid = applier.apply(self.df_valid, progress_bar=False)

        # Add slice labels
        add_slice_labels(dataloaders[0], base_task, S_train)
        add_slice_labels(dataloaders[1], base_task, S_valid)

        # Convert to slice tasks
        tasks = convert_to_slice_tasks(base_task, slice_names)
        model = MultitaskClassifier(tasks=tasks)

        # Train
        # NOTE: Needs more epochs to converge with more heads
        trainer = Trainer(lr=0.001, n_epochs=60, progress_bar=False)
        trainer.fit(model, dataloaders)
        scores = model.score(dataloaders)

        # Confirm reasonably high slice scores
        # Check train scores
        self.assertGreater(scores["task/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:f_pred/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:f_ind/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:g_pred/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:g_ind/TestData/train/f1"], 0.9)
        self.assertGreater(scores["task_slice:base_pred/TestData/train/f1"], 0.9)
        self.assertEqual(scores["task_slice:base_ind/TestData/train/f1"], 1.0)
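The f and g applied above are slicing functions (SFs), which flag subsets of the data whose performance you want to monitor and boost. Defining and applying one takes a decorator and an applier; a minimal sketch with a hypothetical DataFrame and text column:

import pandas as pd
from snorkel.slicing import slicing_function, PandasSFApplier

@slicing_function()
def short_text(x):
    # Flag data points whose (hypothetical) text field is short.
    return len(x.text) < 20

df = pd.DataFrame({"text": ["hi", "a much longer example sentence"]})

applier = PandasSFApplier([short_text])
S = applier.apply(df)  # structured array with one indicator field per SF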