def test_labeling_convergence(self) -> None:
"""Test convergence of end to end labeling pipeline."""
# Apply LFs
labeling_functions = (
[f]
+ [get_positive_labeling_function(divisor) for divisor in range(2, 9)]
+ [get_negative_labeling_function(divisor) for divisor in range(2, 9)]
)
applier = PandasLFApplier(labeling_functions)
L_train = applier.apply(self.df_train, progress_bar=False)
self.assertEqual(L_train.shape, (self.N_TRAIN, len(labeling_functions)))
# Train LabelModel
label_model = LabelModel(cardinality=self.cardinality, verbose=False)
label_model.fit(L_train, n_epochs=100, lr=0.01, l2=0.0)
Y_lm = label_model.predict_proba(L_train).argmax(axis=1)
Y = self.df_train.y
err = np.where(Y != Y_lm, 1, 0).sum() / self.N_TRAIN
self.assertLess(err, 0.05)
def test_lf_applier_pandas(self) -> None:
df = pd.DataFrame(dict(num=DATA))
applier = PandasLFApplier([f, g])
L = applier.apply(df, progress_bar=False)
np.testing.assert_equal(L, L_EXPECTED)
L = applier.apply(df, progress_bar=True)
np.testing.assert_equal(L, L_EXPECTED)
L, meta = applier.apply(df, return_meta=True)
np.testing.assert_equal(L, L_EXPECTED)
self.assertEqual(meta, ApplierMetadata(dict()))
def test_lf_applier_pandas_spacy_preprocessor(self) -> None:
spacy = SpacyPreprocessor(text_field="text", doc_field="doc")
@labeling_function(pre=[spacy])
def first_is_name(x: DataPoint) -> int:
return 0 if x.doc[0].pos_ == "PROPN" else -1
@labeling_function(pre=[spacy])
def has_verb(x: DataPoint) -> int:
return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1
df = pd.DataFrame(dict(text=TEXT_DATA))
applier = PandasLFApplier([first_is_name, has_verb])
L = applier.apply(df, progress_bar=False)
np.testing.assert_equal(L, L_TEXT_EXPECTED)
def test_lf_applier_pandas_preprocessor(self) -> None:
df = pd.DataFrame(dict(num=DATA))
applier = PandasLFApplier([f, fp])
L = applier.apply(df, progress_bar=False)
np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
def test_lf_applier_pandas_fault(self) -> None:
df = pd.DataFrame(dict(num=DATA))
applier = PandasLFApplier([f, f_bad])
with self.assertRaises(AttributeError):
applier.apply(df, progress_bar=False)
L = applier.apply(df, progress_bar=False, fault_tolerant=True)
np.testing.assert_equal(L, L_EXPECTED_BAD)
L, meta = applier.apply(
df, progress_bar=False, fault_tolerant=True, return_meta=True
)
np.testing.assert_equal(L, L_EXPECTED_BAD)
self.assertEqual(meta, ApplierMetadata(dict(f_bad=5)))
def test_lf_applier_pandas_spacy_preprocessor_memoized(self) -> None:
spacy = SpacyPreprocessor(text_field="text", doc_field="doc")
spacy.memoize = True
@labeling_function(pre=[spacy])
def first_is_name(x: DataPoint) -> int:
return 0 if x.doc[0].pos_ == "PROPN" else -1
@labeling_function(pre=[spacy])
def has_verb(x: DataPoint) -> int:
return 0 if sum(t.pos_ == "VERB" for t in x.doc) > 0 else -1
df = pd.DataFrame(dict(text=TEXT_DATA))
applier = PandasLFApplier([first_is_name, has_verb])
L = applier.apply(df, progress_bar=False)
np.testing.assert_equal(L, L_TEXT_EXPECTED)
self.assertEqual(len(spacy._cache), 2)
# ## 2) Combining & Cleaning the Labels
#
# Our next step is to apply the labeling functions we wrote to the unlabeled training data.
# The result is a *label matrix*, `L_train`, where each row corresponds to a data point and each column corresponds to a labeling function.
# Since the labeling functions have unknown accuracies and correlations, their output labels may overlap and conflict.
# We use the `LabelModel` to automatically estimate their accuracies and correlations, reweight and combine their labels, and produce our final set of clean, integrated training labels:
# %%
from snorkel.labeling.model import LabelModel
from snorkel.labeling import PandasLFApplier
# Define the set of labeling functions (LFs)
lfs = [lf_keyword_my, lf_regex_check_out, lf_short_comment, lf_textblob_polarity]
# Apply the LFs to the unlabeled training data
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
# Train the label model and compute the training labels
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=123)
df_train["label"] = label_model.predict(L=L_train, tie_break_policy="abstain")
# %% [markdown]
# Note that we used the `LabelModel` to label data; however, on many data points, all the labeling functions abstain, and so the `LabelModel` abstains as well.
# We'll filter these data points out of our training set now:
# %%
df_train = df_train[df_train.label != ABSTAIN]
# %% [markdown]
# Our ultimate goal is to use the resulting labeled training data points to train a machine learning model that can **generalize beyond the coverage of the labeling functions and the `LabelModel`**.
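# %% [markdown]
# As a minimal sketch of that final step (assuming, as in the rest of this tutorial, that the raw
# comment text lives in a `text` column of `df_train`), we can featurize the comments and fit an
# off-the-shelf scikit-learn classifier on the `LabelModel`-produced labels; the hyperparameters
# below are illustrative only.

# %%
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Simple bag-of-words featurization of the filtered training comments
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train = vectorizer.fit_transform(df_train.text.tolist())

# Train a discriminative model on the labels produced by the LabelModel;
# this model can label any comment, not just those covered by an LF
sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X=X_train, y=df_train.label.values)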
def make_worker_lf(worker_id):
worker_dict = worker_dicts[worker_id]
name = f"worker_{worker_id}"
return LabelingFunction(name, f=worker_lf, resources={"worker_dict": worker_dict})
worker_lfs = [make_worker_lf(worker_id) for worker_id in worker_dicts]
# %% [markdown]
# Let's take a quick look at how well they do on the development set.
# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import PandasLFApplier
applier = PandasLFApplier(worker_lfs)
L_train = applier.apply(df_train)
L_dev = applier.apply(df_dev)
# %% [markdown]
# Note that because our dev set is so small and our LFs are relatively sparse, many LFs will appear to have zero coverage.
# Fortunately, our label model learns weights for LFs based on their outputs on the training set, which is generally much larger.
# %%
from snorkel.labeling import LFAnalysis
LFAnalysis(L_dev, worker_lfs).lf_summary(Y_dev).sample(5)
# %% [markdown]
# So the crowd labels in general are quite good! But how much of our dev and training
# sets do they cover?
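# %% [markdown]
# A quick way to check is a sketch using `LFAnalysis.label_coverage`, which reports the fraction of
# data points that received at least one non-abstain label from the worker LFs:

# %%
from snorkel.labeling import LFAnalysis

print(f"Training set coverage: {100 * LFAnalysis(L_train).label_coverage():.1f}%")
print(f"Dev set coverage: {100 * LFAnalysis(L_dev).label_coverage():.1f}%")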
# %%
from snorkel.labeling import PandasLFApplier
lfs = [
lf_husband_wife,
lf_husband_wife_left_window,
lf_same_last_name,
lf_married,
lf_familial_relationship,
lf_family_left_window,
lf_other_relationship,
lf_distant_supervision,
lf_distant_supervision_last_names,
]
applier = PandasLFApplier(lfs)
# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import LFAnalysis
L_dev = applier.apply(df_dev)
L_train = applier.apply(df_train)
# %%
LFAnalysis(L_dev, lfs).lf_summary(Y_dev)
# %% [markdown]
# ### Training the Label Model
#
# Now, we'll train a model of the LFs to estimate their weights and combine their outputs into a single, noise-aware set of training labels for our extractor.
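# %% [markdown]
# A minimal sketch of that training step, using the `L_train` matrix computed above and assuming a
# binary task (`cardinality=2`); the hyperparameters are illustrative only.

# %%
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=5000, log_freq=500, seed=12345)

# Probabilistic (noise-aware) training labels for the extractor
probs_train = label_model.predict_proba(L_train)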
# %% {"tags": ["md-exclude-output"]}
# Note that the labeling functions have varying empirical accuracies and coverages. Due to class imbalance in our chosen relationships, labeling functions that label the `OTHER` class have higher coverage than labeling functions for `RIDE` or `CARRY`. This reflects the distribution of classes in the dataset as well.
# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import PandasLFApplier
lfs = [
lf_ride_object,
lf_carry_object,
lf_carry_subject,
lf_not_person,
lf_ydist,
lf_dist,
lf_area,
]
applier = PandasLFApplier(lfs)
L_train = applier.apply(df_train)
L_valid = applier.apply(df_valid)
# %%
from snorkel.labeling import LFAnalysis
Y_valid = df_valid.label.values
LFAnalysis(L_valid, lfs).lf_summary(Y_valid)
# %% [markdown]
# ## 3. Train Label Model
# We now train a multi-class `LabelModel` to assign training labels to the unlabeled training set.
# %%
from snorkel.labeling.model import LabelModel
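# A minimal sketch of the fit, assuming the three classes noted above (RIDE, CARRY, OTHER,
# i.e. cardinality=3) and illustrative hyperparameters:
label_model = LabelModel(cardinality=3, verbose=True)
label_model.fit(L_train, seed=123, lr=0.01, log_freq=10, n_epochs=100)

# Hard training labels for the downstream classifier (the "labels" column name is illustrative)
df_train["labels"] = label_model.predict(L_train, tie_break_policy="random")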