# Fragment of a dialog-parsing routine; the wrapping function and the loops
# over `dialogs` and their `utterances` are assumed from context.
def parse_dialogs(dialogs):
    last_utterances, last_annotations = [], []
    utterances_histories, annotations_histories = [], []
    dialog_ids, user_ids = [], []
    for dialog in dialogs:
        utterances_history, annotations_history = [], []
        for utterance in dialog['utterances']:
            utterances_history.append(utterance['text'])
            annotations_history.append(utterance['annotations'])
        last_utterances.append(utterances_history[-1])
        utterances_histories.append(utterances_history)
        last_annotations.append(annotations_history[-1])
        annotations_histories.append(annotations_history)
        dialog_ids.append(dialog['id'])
        user_ids.append(dialog['user']['id'])
    return last_utterances, last_annotations, utterances_histories, annotations_histories, dialog_ids, user_ids
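# Hypothetical usage sketch (not part of the source); the `dialogs` structure
# is inferred from the keys accessed above.
if __name__ == '__main__':
    dialogs = [{
        'id': 'd1',
        'user': {'id': 'u1'},
        'utterances': [
            {'text': 'hi', 'annotations': {}},
            {'text': 'bye', 'annotations': {}},
        ],
    }]
    outputs = parse_dialogs(dialogs)
    assert outputs[0] == ['bye'] and outputs[4] == ['d1']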
from typing import List

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register('annotations_parser')
class AnnotationsParser(Component):
    """Takes utterance annotations and extracts nested values by dotted keys.

    Example:
        > parser = AnnotationsParser(keys=['ner.tokens', 'ner.tags'])
        > parser([{'ner': {'tokens': ['I'], 'tags': ['O']}}])
        [['I']], [['O']]
    """

    def __init__(self, keys: List[str], **kwargs) -> None:
        self.keys = [k.split('.') for k in keys]

    def __call__(self, annotations: List[dict]) -> List[List]:
        # note: `[[]] * n` would alias one list n times; build independent lists
        ann_values = [[] for _ in self.keys]
        for ann in annotations:
            for i, key_rec in enumerate(self.keys):
                val = ann
                # walk the nested annotation dict key by key (the tail of this
                # method is reconstructed; the original fragment is cut off here)
                for key in key_rec:
                    val = val.get(key, {})
                ann_values[i].append(val)
        return ann_values
import logging
import sys
from typing import Optional

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.tf_backend import TfModelMeta
from deeppavlov.models.bidirectional_lms.elmo.utils import load_model, load_options_latest_checkpoint
from deeppavlov.models.bidirectional_lms.elmo.data import InferBatcher

log = logging.getLogger(__name__)
@register('elmo_bilm')
class ELMoEmbedder(Component, metaclass=TfModelMeta):
    """Bidirectional ELMo language model embedder."""

    def __init__(self, model_dir: str, forward_direction_sequence: bool = True,
                 backward_direction_sequence: bool = True, pad_zero: bool = False,
                 max_token: Optional[int] = None, mini_batch_size: int = 32, **kwargs) -> None:
        self.model_dir = model_dir if '://' in model_dir else str(expand_path(model_dir))
        self.forward_direction_sequence = forward_direction_sequence
        self.backward_direction_sequence = backward_direction_sequence
        if not (self.forward_direction_sequence or self.backward_direction_sequence):
            log.error('At least one of forward_direction_sequence or '
                      'backward_direction_sequence must be True.')
            sys.exit(1)
        self.pad_zero = pad_zero
        self.max_token = max_token
        self.mini_batch_size = mini_batch_size
        # the rest of __init__ (checkpoint loading via load_options_latest_checkpoint
        # and load_model) is not shown in this fragment
from typing import List
import nltk
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
@register("nltk_tokenizer")
class NLTKTokenizer(Component):
"""Class for splitting texts on tokens using NLTK
Args:
tokenizer: tokenization mode for `nltk.tokenize`
download: whether to download nltk data
Attributes:
tokenizer: tokenizer instance from nltk.tokenizers
"""
    def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False,
                 *args, **kwargs):
        if download:
            nltk.download()
        self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
        if not callable(self.tokenizer):
            raise AttributeError("Tokenizer {} is not defined in nltk.tokenize".format(tokenizer))

    def __call__(self, batch: List[str]) -> List[List[str]]:
        # reconstructed from the class contract; the original fragment is cut off here
        return [self.tokenizer(utterance) for utterance in batch]
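# Usage sketch (not part of the source); wordpunct_tokenize needs no extra
# nltk data, so download can stay False.
if __name__ == '__main__':
    tokenizer = NLTKTokenizer(tokenizer="wordpunct_tokenize")
    print(tokenizer(["Hello, world!"]))  # -> [['Hello', ',', 'world', '!']]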
from abc import ABCMeta, abstractmethod
from typing import Optional, Tuple

from deeppavlov.core.models.component import Component


class Skill(Component, metaclass=ABCMeta):
    """Abstract base class for skills.

    A skill is a DeepPavlov component that handles dialog state,
    dialog history and rich content.
    """

    @abstractmethod
    def __call__(self, utterances_batch: list, history_batch: list,
                 states_batch: Optional[list] = None) -> Tuple[list, list, list]:
        """Returns the skill inference result.

        The (responses, confidences, states) contract is inferred from the
        return annotation; the original docstring is cut off here.
        """
from typing import Iterable

import spacy


def _try_load_spacy_model(model_name: str, disable: Iterable[str] = ()):
    disable = set(disable)
    try:
        # first try the standard lookup by model name or path
        model = spacy.load(model_name, disable=disable)
    except OSError as e:
        try:
            # fall back to importing the model as an installed package
            model = __import__(model_name).load(disable=disable)
            if not isinstance(model, spacy.language.Language):
                raise RuntimeError(f'{model_name} is not a spacy model module')
        except Exception:
            raise e
    return model
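# Hypothetical usage sketch (not part of the source); assumes en_core_web_sm
# is installed, e.g. via `python -m spacy download en_core_web_sm`.
if __name__ == '__main__':
    nlp = _try_load_spacy_model('en_core_web_sm', disable=('parser', 'ner'))
    print([token.text for token in nlp('Hello, world!')])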
@register('stream_spacy_tokenizer')
class StreamSpacyTokenizer(Component):
"""Tokenize or lemmatize a list of documents. Default spacy model is **en_core_web_sm**.
Return a list of tokens or lemmas for a whole document.
If is called onto ``List[str]``, performs detokenizing procedure.
Args:
disable: spacy pipeline elements to disable, serves a purpose of performing; if nothing
stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing
and ngrams creation
batch_size: a batch size for inner spacy multi-threading
ngram_range: size of ngrams to create; only unigrams are returned by default
lemmas: whether to perform lemmatizing or not
n_threads: a number of threads for inner spacy multi-threading
lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize`
and :meth:`_lemmatize` methods
alphas_only: whether to filter out non-alpha tokens; is performed by default by
:meth:`_filter` method
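# Hypothetical usage sketch (not part of the source); the __call__ body is not
# shown in this fragment, so the exact output below is an assumption:
#     tokenizer = StreamSpacyTokenizer(lemmas=True, lowercase=True)
#     tokenizer(['The quick brown foxes'])  # -> [['the', 'quick', 'brown', 'fox']]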
from logging import getLogger

from deeppavlov.models.morpho_tagger.common_tagger import make_pos_and_tag
from deeppavlov.models.augmentation.utils.inflection import RuInflector, EnInflector
from deeppavlov.models.augmentation.utils.lettercaser import Lettercaser
from deeppavlov.models.augmentation.utils.thesaurus_wrapper import RuThesaurus, EnThesaurus
from deeppavlov.models.augmentation.utils.word_filter import RuWordFilter, EnWordFilter
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

logger = getLogger(__name__)
@register("thesaurus_augmentation")
class ThesaurusAug(Component):
"""Component for augmentation, based on replacing words with synonyms from thesaurus
Args:
lang: language of text, that will be augmented, 'eng' for english, 'rus' for russian
penalty_for_source_token: [0, 1] penalty for using source token
replace_freq: [0,1] frequence of replacing tokens,
it calculates respecting to tokens that passed other filters
isalpha_only: flag that activate filter based on method str.isalpha()
not_replaced_tokens: list of tokens that should not be replaced
with_source_token: flag that decides source tokens is synonyms for itself or not
cases: dictionary that describes map:
name of lettercase -> func that takes str and convert it in certain lettercase
default_case: func: str->str that define transformation of string,
when lettercase was not be detected in 'put_in_case' func
replaced_pos_tags: List of pos_tags that can be replaced,
e.g. 'NOUN' for Noun, 'VERB' for Verb, 'ADJ' for Adjective, 'ADV' for Adverb
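# Hypothetical usage sketch (not part of the source); parameter names follow
# the docstring above, the class body is not shown in this fragment:
#     aug = ThesaurusAug(lang='eng', replace_freq=0.5, with_source_token=True,
#                        replaced_pos_tags=['NOUN', 'VERB', 'ADJ', 'ADV'])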
from typing import List
from logging import getLogger
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component
log = getLogger(__name__)
@register('char_connector')
class CharConnector(Component):
    """Component that transforms a batch of character sequences into a batch of
    strings by concatenating the characters with no separator."""
def __init__(self, **kwargs) -> None:
pass
def __call__(self, batch: List[List[str]], **kwargs) -> List[str]:
return ["".join(sample) for sample in batch]
from logging import getLogger
from typing import List, Union, Optional, Tuple

import numpy as np
from overrides import overrides

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad
from deeppavlov.core.models.component import Component

log = getLogger(__name__)
@register('tfidf_weighted')
class TfidfWeightedEmbedder(Component):
"""
The class implements the functionality of embedding the sentence \
as a weighted average by special coefficients of tokens embeddings. \
Coefficients can be taken from the given TFIDF-vectorizer in ``vectorizer`` or \
calculated as TFIDF from counter vocabulary given in ``counter_vocab_path``.
Also one can give ``tags_vocab_path`` to the vocabulary with weights of tags. \
In this case, batch with tags should be given as a second input in ``__call__`` method.
Args:
embedder: embedder instance
tokenizer: tokenizer instance, should be able to detokenize sentence
pad_zero: whether to pad samples or not
mean: whether to return mean token embedding
tags_vocab_path: optional path to vocabulary with tags weights
vectorizer: vectorizer instance should be trained with ``analyzer="word"``
counter_vocab_path: path to counter vocabulary
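# Minimal numpy sketch (not part of the source) of the weighting scheme
# described above: each token vector is scaled by its TF-IDF coefficient and
# the sentence embedding is their weighted mean.
def _tfidf_weighted_mean(token_vectors: np.ndarray, weights: np.ndarray) -> np.ndarray:
    # token_vectors: (n_tokens, dim); weights: (n_tokens,)
    weighted = token_vectors * weights[:, np.newaxis]
    return weighted.sum(axis=0) / (weights.sum() + 1e-9)  # epsilon guards zero weights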
# Imports assumed for this and the following fragments from the same file;
# the original file header is not shown.
from collections import defaultdict
from typing import Any, Dict, List, Optional, Union

import numpy as np


class SlotsValuesDecoder(Component):  # hypothetical name; the real class header
    # and the head of this signature are missing from the fragment
    def __call__(self, slots_values: List[int],
                 candidates: List[Dict[str, List[str]]]) -> List[Dict[str, str]]:
        # dirty hack: inputs are always batched, but only batch size 1 is supported
        if len(candidates) != 1:
            raise NotImplementedError("not implemented for candidates with length > 1")
        candidates = candidates[0]
        slot_dict = {}
        # for slot_idx, value_idx in zip(*np.where(slots)):
        for slot_idx, value_idx in enumerate(slots_values):
            slot = self.slot_vocab([[slot_idx]])[0][0]
            value = self._idx2value(slot, value_idx, candidates)
            if value not in self.exclude_values:
                slot_dict[slot] = value
        return [slot_dict]
class ActionSlotsMatcher(Component):
    """
    Matches slots with actions.
    Takes a list of slots and a list of string actions from the NLU model and
    outputs the list of actions matched with slots.
    """

    def __init__(self, matched_actions: Optional[List[str]] = None, **kwargs) -> None:
        # avoid a mutable default argument
        self.matched_actions = matched_actions or []

    def _get_priority(self, action):
        # earlier entries in matched_actions get higher priority; unknown actions get 0
        if action in self.matched_actions:
            return len(self.matched_actions) - self.matched_actions.index(action)
        return 0
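    # Example (not part of the source): with matched_actions=['inform', 'request'],
    # _get_priority('inform') == 2 and _get_priority('request') == 1, while any
    # unlisted action gets 0, so earlier entries take precedence.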
    def __call__(self, actions: Union[List[List[str]], List[List[Dict[str, Any]]]],
                 slots: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
        # the body of this method is not shown in the fragment
        ...
class SlotsCandidatesIndexer(Component):  # hypothetical name; the real class
    # header is missing from the fragment
    def __init__(self, special_values: List[str], **kwargs) -> None:
        self.special_vals = special_values

    def __call__(self, candidates: List[List[Dict[str, Any]]])\
            -> List[Dict[str, List[str]]]:
        result = []
        for cands in candidates:
            # every slot starts from the special values
            slot_cands = defaultdict(lambda: self.special_vals)
            for cand in cands:
                slot, value = cand['slot'], cand['value']
                # concatenate instead of append so the shared default list
                # (self.special_vals) is never mutated in place
                slot_cands[slot] = slot_cands[slot] + [value]
            result.append(slot_cands)
        return result
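# Usage sketch (not part of the source; the class name above is assumed):
if __name__ == '__main__':
    indexer = SlotsCandidatesIndexer(special_values=['dontcare'])
    batch = [[{'slot': 'food', 'value': 'thai'}]]
    print(dict(indexer(batch)[0]))  # -> {'food': ['dontcare', 'thai']}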
class SlotsValuesMatrixBuilder(Component):
    """Builds a zero-initialized slots-by-values matrix from slot-value candidates."""

    def __init__(self, slot_vocab: callable, max_num_values: int, **kwargs) -> None:
        self.slot_vocab = slot_vocab
        self.max_num_values = max_num_values

    def _slot2idx(self, slot):
        if slot not in self.slot_vocab:
            raise RuntimeError(f"Utterance slot {slot} doesn't match any slot"
                               " from slot_vocab")
        return self.slot_vocab([[slot]])[0][0]

    def __call__(self, cand_indexers: List[Dict[str, List[str]]]) -> List[np.ndarray]:
        utt_matrices = []
        for cand_indexer in cand_indexers:
            # one row per slot, one column per candidate value plus 2 extra
            # columns (presumably for special values)
            mat = np.zeros((len(self.slot_vocab), self.max_num_values + 2),
                           dtype=np.float32)
            # the rest of the loop body is not shown in this fragment