How to use the snorkel.labeling.PandasLFApplier class in snorkel

To help you get started, we've selected a few snorkel examples based on popular ways PandasLFApplier is used in public projects.
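Before diving into the project examples, here is a minimal sketch of the basic pattern: define labeling functions with the @labeling_function decorator, wrap them in a PandasLFApplier, and call apply on a DataFrame to produce a label matrix. The labeling function and data below are invented for illustration; they do not come from any of the projects quoted on this page.

import pandas as pd
from snorkel.labeling import labeling_function, PandasLFApplier

ABSTAIN = -1
POSITIVE = 1

@labeling_function()
def lf_contains_great(x):
    # Vote POSITIVE when the text mentions "great"; otherwise abstain.
    return POSITIVE if "great" in x.text.lower() else ABSTAIN

df = pd.DataFrame({"text": ["A great product!", "Terrible service."]})
applier = PandasLFApplier([lf_contains_great])
L = applier.apply(df)  # label matrix: one row per data point, one column per LF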


From snorkel-team/snorkel: test/labeling/test_convergence.py
def test_labeling_convergence(self) -> None:
        """Test convergence of end to end labeling pipeline."""
        # Apply LFs
        labeling_functions = (
            [f]
            + [get_positive_labeling_function(divisor) for divisor in range(2, 9)]
            + [get_negative_labeling_function(divisor) for divisor in range(2, 9)]
        )
        applier = PandasLFApplier(labeling_functions)
        L_train = applier.apply(self.df_train, progress_bar=False)

        self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))

        # Train LabelModel
        label_model = LabelModel(cardinality=self.cardinality, verbose=False)
        label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
        Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
        Y = self.df_train.y
        err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
        self.assertLess(err, 0.05)
From snorkel-team/snorkel: test/labeling/apply/test_lf_applier.py
def test_lf_applier_pandas(self) -> None:
        df = pd.DataFrame(dict(num=DATA))
        applier = PandasLFApplier([f, g])
        L = applier.apply(df, progress_bar=False)
        np.testing.assert_equal(L, L_EXPECTED)
        L = applier.apply(df, progress_bar=True)
        np.testing.assert_equal(L, L_EXPECTED)
        L, meta = applier.apply(df, return_meta=True)
        np.testing.assert_equal(L, L_EXPECTED)
        self.assertEqual(meta, ApplierMetadata(dict()))
From snorkel-team/snorkel: test/labeling/apply/test_lf_applier.py
def test_lf_applier_pandas_spacy_preprocessor(self) -> None:
        spacy = SpacyPreprocessor(text_field="text", doc_field="doc")

        @labeling_function(pre=[spacy])
        def first_is_name(x: DataPoint) -> int:
            return 0 if x.doc[0].pos_ == "PROPN" else -1

        @labeling_function(pre=[spacy])
        def has_verb(x: DataPoint) -> int:
            return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1

        df = pd.DataFrame(dict(text=TEXT_DATA))
        applier = PandasLFApplier([first_is_name, has_verb])
        L = applier.apply(df, progress_bar=False)
        np.testing.assert_equal(L, L_TEXT_EXPECTED)
From snorkel-team/snorkel: test/labeling/apply/test_lf_applier.py
def test_lf_applier_pandas_preprocessor(self) -> None:
        df = pd.DataFrame(dict(num=DATA))
        applier = PandasLFApplier([f, fp])
        L = applier.apply(df, progress_bar=False)
        np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
From snorkel-team/snorkel: test/labeling/apply/test_lf_applier.py
def test_lf_applier_pandas_fault(self) -> None:
        df = pd.DataFrame(dict(num=DATA))
        applier = PandasLFApplier([f, f_bad])
        with self.assertRaises(AttributeError):
            applier.apply(df, progress_bar=False)
        L = applier.apply(df, progress_bar=False, fault_tolerant=True)
        np.testing.assert_equal(L, L_EXPECTED_BAD)
        L, meta = applier.apply(
            df, progress_bar=False, fault_tolerant=True, return_meta=True
        )
        np.testing.assert_equal(L, L_EXPECTED_BAD)
        self.assertEqual(meta, ApplierMetadata(dict(f_bad=5)))
From snorkel-team/snorkel: test/labeling/apply/test_lf_applier.py
def test_lf_applier_pandas_spacy_preprocessor_memoized(self) -> None:
        spacy = SpacyPreprocessor(text_field="text", doc_field="doc")
        spacy.memoize = True

        @labeling_function(pre=[spacy])
        def first_is_name(x: DataPoint) -> int:
            return 0 if x.doc[0].pos_ == "PROPN" else -1

        @labeling_function(pre=[spacy])
        def has_verb(x: DataPoint) -> int:
            return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1

        df = pd.DataFrame(dict(text=TEXT_DATA))
        applier = PandasLFApplier([first_is_name, has_verb])
        L = applier.apply(df, progress_bar=False)
        np.testing.assert_equal(L, L_TEXT_EXPECTED)
        self.assertEqual(len(spacy._cache), 2)
From snorkel-team/snorkel-tutorials: getting_started/getting_started.py
# ## 2) Combining & Cleaning the Labels
#
# Our next step is to apply the labeling functions we wrote to the unlabeled training data.
# The result is a *label matrix*, `L_train`, where each row corresponds to a data point and each column corresponds to a labeling function.
# Since the labeling functions have unknown accuracies and correlations, their output labels may overlap and conflict.
# We use the `LabelModel` to automatically estimate their accuracies and correlations, reweight and combine their labels, and produce our final set of clean, integrated training labels:

# %%
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier

# Define the set of labeling functions (LFs)
lfs = [lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_textblob_polarity]

# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)

# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")

# %% [markdown]
# Note that we used the `LabelModel` to label data; however, on many data points, all the labeling functions abstain, and so the `LabelModel` abstains as well.
# We'll filter these data points out of our training set now:

# %%
df_train = df_train[df_train.label != ABSTAIN]

# %% [markdown]
# Our ultimate goal is to use the resulting labeled training data points to train a machine learning model that can **generalize beyond the coverage of the labeling functions and the `LabelModel`**.
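The tutorial then hands df_train off to a standard classifier. As a hedged sketch of that final step, assuming a simple bag-of-words featurization over the text column (the tutorial's actual feature pipeline may differ):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Assumed featurization: bag-of-words over the comment text.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(df_train.text.tolist())

# Train a discriminative model on the Snorkel-produced labels; unlike the
# LabelModel, it can generalize to examples no labeling function covers.
clf = LogisticRegression(solver="lbfgs")
clf.fit(X_train, df_train.label.values)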
From snorkel-team/snorkel-tutorials: crowdsourcing/crowdsourcing_tutorial.py
def make_worker_lf(worker_id):
    worker_dict = worker_dicts[worker_id]
    name = f"worker_{worker_id}"
    return LabelingFunction(name, f=worker_lf, resources={"worker_dict": worker_dict})


worker_lfs = [make_worker_lf(worker_id) for worker_id in worker_dicts]

# %% [markdown]
# Let's take a quick look at how well they do on the development set.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import PandasLFApplier

applier = PandasLFApplier(worker_lfs)
L_train = applier.apply(df_train)
L_dev = applier.apply(df_dev)

# %% [markdown]
# Note that because our dev set is so small and our LFs are relatively sparse, many LFs will appear to have zero coverage.
# Fortunately, our label model learns weights for LFs based on their outputs on the training set, which is generally much larger.

# %%
from snorkel.labeling import LFAnalysis

LFAnalysis(L_dev, worker_lfs).lf_summary(Y_dev).sample(5)

# %% [markdown]
# So the crowd labels in general are quite good! But how much of our dev and training
# sets do they cover?
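The excerpt stops at that question, but it can be answered directly from the label matrices computed above. A small sketch, assuming Snorkel's convention that -1 denotes an abstain:

# A data point is "covered" if at least one LF emitted a non-abstain label.
dev_coverage = (L_dev != -1).any(axis=1).mean()
train_coverage = (L_train != -1).any(axis=1).mean()
print(f"dev coverage: {dev_coverage:.1%}, train coverage: {train_coverage:.1%}")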
From snorkel-team/snorkel-tutorials: spouse/spouse_demo.py
# %%
from snorkel.labeling import PandasLFApplier

lfs = [
    lf_husband_wife,
    lf_husband_wife_left_window,
    lf_same_last_name,
    lf_married,
    lf_familial_relationship,
    lf_family_left_window,
    lf_other_relationship,
    lf_distant_supervision,
    lf_distant_supervision_last_names,
]
applier = PandasLFApplier(lfs)

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import LFAnalysis

L_dev = applier.apply(df_dev)
L_train = applier.apply(df_train)

# %%
LFAnalysis(L_dev, lfs).lf_summary(Y_dev)

# %% [markdown]
# ### Training the Label Model
#
# Now, we'll train a model of the LFs to estimate their weights and combine their outputs. Once the model is trained, we can combine the outputs of the LFs into a single, noise-aware training label set for our extractor.

# %% {"tags": ["md-exclude-output"]}
From snorkel-team/snorkel-tutorials: visual_relation/visual_relation_tutorial.py
# Note that the labeling functions have varying empirical accuracies and coverages. Due to class imbalance in our chosen relationships, labeling functions that label the `OTHER` class have higher coverage than labeling functions for `RIDE` or `CARRY`. This reflects the distribution of classes in the dataset as well.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import PandasLFApplier

lfs = [
    lf_ride_object,
    lf_carry_object,
    lf_carry_subject,
    lf_not_person,
    lf_ydist,
    lf_dist,
    lf_area,
]

applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
L_valid = applier.apply(df_valid)

# %%
from snorkel.labeling import LFAnalysis

Y_valid = df_valid.label.values
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)

# %% [markdown]
# ## 3. Train Label Model
# We now train a multi-class `LabelModel` to assign training labels to the unlabeled training set.

# %%
from snorkel.labeling.model import LabelModel
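The excerpt ends at the import. A sketch of the multi-class fit the tutorial describes, assuming a cardinality of 3 for the RIDE, CARRY, and OTHER classes mentioned above (the training settings are assumptions):

# Assumed continuation: a three-class LabelModel over RIDE, CARRY, and OTHER.
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train, seed=123)
preds_train = label_model.predict(L_train)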