# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    # Run spaCy on the sentence and remember each token's vector so we can
    # compare it against what the featurizer produces.
    doc = spacy_nlp(sentence)
    token_vectors = [token.vector for token in doc]

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    message = Message(sentence, {"intent": "greet", "text_features": [0.5]})
    message.set("text_spacy_doc", doc)
    featurizer._set_spacy_features(message)

    # The first row of the dense features should match the first token's
    # vector (only the first 5 dimensions are checked).
    first_row = message.get("text_dense_features")[0][:5]
    assert np.allclose(token_vectors[0][:5], first_row, atol=1e-4)
    assert np.allclose(first_row, expected, atol=1e-4)
Message(
"test b",
data={
SPARSE_FEATURE_NAMES[TEXT]: np.zeros(1),
DENSE_FEATURE_NAMES[TEXT]: np.zeros(1),
},
),
],
True,
),
(
[
Message(
"test a",
data={
SPARSE_FEATURE_NAMES[INTENT]: np.zeros(1),
DENSE_FEATURE_NAMES[INTENT]: np.zeros(1),
],
)
def test_count_vector_featurizer_oov_words(sentence, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    # Configure an explicit OOV token plus a list of words that should be
    # mapped onto it during training.
    featurizer = CountVectorsFeaturizer(
        {
            "token_pattern": r"(?u)\b\w+\b",
            "OOV_token": "__oov__",
            "OOV_words": ["oov_word0", "OOV_word1"],
            "return_sequence": True,
        }
    )

    # A training example must carry an intent to count as valid.
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    featurizer.process(test_message)

    actual = test_message.get("text_sparse_features").toarray()[0]
    assert np.all(actual == expected)
@pytest.mark.parametrize(
    "sentence, expected",
    [
        ("hello hello hello hello hello ", [5]),
        ("hello goodbye hello", [1, 2]),
        ("a b c d e f", [1, 1, 1, 1, 1, 1]),
        ("a 1 2", [2, 1]),
    ],
)
def test_count_vector_featurizer(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # Training examples are only valid when an intent is set.
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    featurizer.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
("__OOV__ a 1 2 __oov__ __OOV__", [[0, 1, 0]]),
],
)
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    # Use a dedicated OOV token so unseen words collapse to a single index.
    featurizer = CountVectorsFeaturizer(
        {
            "token_pattern": r"(?u)\b\w+\b",
            "OOV_token": "__oov__",
            "return_sequence": True,
        }
    )

    # An intent label is required for the example to be a valid one.
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    featurizer.process(test_message)

    actual = test_message.get("text_sparse_features").toarray()[0]
    assert np.all(actual == expected)
def test_count_vector_featurizer(sentence, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "return_sequence": True}
    )

    # Attach an intent so the message counts as a valid training example.
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    featurizer.process(test_message)

    # Sparse features come back as a SciPy COO matrix; check type and values.
    features = test_message.get("text_sparse_features")
    assert isinstance(features, scipy.sparse.coo_matrix)
    assert np.all(features.toarray()[0] == expected)
("__OOV__ a 1 2 __oov__ __OOV__", [2, 3, 1]),
],
)
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    # Map unseen words onto a single dedicated OOV token.
    featurizer = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"}
    )

    # The example needs an intent to be accepted as training data.
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    featurizer.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
},
),
]
# uses BILOU and the default features
ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig())
sentence = "anywhere in the west"
doc = {"spacy_doc": spacy_nlp(sentence)}
crf_format = ext._from_text_to_crf(Message(sentence, doc))
assert [word[0] for word in crf_format] == ["anywhere", "in", "the", "west"]
feats = ext._sentence_to_features(crf_format)
assert "BOS" in feats[0]
assert "EOS" in feats[-1]
assert feats[1]["0:low"] == "in"
sentence = "anywhere in the west"
ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
filtered = ext.filter_trainable_entities(examples)
assert filtered[0].get("entities") == [
{"start": 16, "end": 20, "value": "west", "entity": "location"}
], "Entity without extractor remains"
assert filtered[1].get("entities") == [
{
"start": 8,
"end": 14,
"value": "indian",
"entity": "cuisine",
"extractor": "CRFEntityExtractor",
}
], "Only CRFEntityExtractor entity annotation remains"
assert examples[1].get("entities")[0] == {
"start": 0,
"end": 7,
only_output_properties: bool = True,
) -> Dict[Text, Any]:
"""Parse the input text, classify it and return pipeline result.
The pipeline result usually contains intent and entities."""
if not text:
# Not all components are able to handle empty strings. So we need
# to prevent that... This default return will not contain all
# output attributes of all components, but in the end, no one
# should pass an empty string in the first place.
output = self.default_output_attributes()
output["text"] = ""
return output
message = Message(text, self.default_output_attributes(), time=time)
for component in self.pipeline:
component.process(message, **self.context)
output = self.default_output_attributes()
output.update(message.as_dict(only_output_properties=only_output_properties))
return output
Creates a copy of entity_examples in which entities that have
`extractor` set to something other than
self.name (e.g. 'CRFEntityExtractor') are removed.
"""
filtered = []
for message in entity_examples:
entities = []
for ent in message.get(ENTITIES, []):
extractor = ent.get(EXTRACTOR)
if not extractor or extractor == self.name:
entities.append(ent)
data = message.data.copy()
data[ENTITIES] = entities
filtered.append(
Message(
text=message.text,
data=data,
output_properties=message.output_properties,
time=message.time,
)
)
return filtered