How to use the deeppavlov.core.models.component.Component class in deeppavlov

To help you get started, we’ve selected a few deeppavlov examples, based on popular ways it is used in public projects.
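Component is the base class for the stateless elements of a DeepPavlov pipeline: a subclass implements `__call__` on a batch of inputs and is registered under a string name. Below is a minimal sketch of such a subclass; the registry name and class are hypothetical and exist only for illustration.

from typing import List

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register('uppercase_preprocessor')  # hypothetical name, not part of DeepPavlov
class UppercasePreprocessor(Component):
    """Toy component that uppercases every string in a batch."""

    def __init__(self, **kwargs) -> None:
        pass

    def __call__(self, batch: List[str], **kwargs) -> List[str]:
        return [utterance.upper() for utterance in batch]

Once registered, a component can be referenced by its name from a pipeline config; the snippets below follow the same pattern.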


github deepmipt / DeepPavlov / deeppavlov / models / dp_assistant / states_parser.py View on Github external
                utterances_history.append(utterance['text'])
                annotations_history.append(utterance['annotations'])

            last_utterances.append(utterances_history[-1])
            utterances_histories.append(utterances_history)
            last_annotations.append(annotations_history[-1])
            annotations_histories.append(annotations_history)

            dialog_ids.append(dialog['id'])
            user_ids.append(dialog['user']['id'])

        return last_utterances, last_annotations, utterances_histories, annotations_histories, dialog_ids, user_ids


@register('annotations_parser')
class AnnotationsParser(Component):
    """ Inputs utterance annotations and gets recursive values.

    Example:
        > parser = AnnotationsParser(keys=['ner.tokens', 'ner.tags'])
        > parser([{'ner': {'tokens': ['I'], 'tags': ['O']}}])
        [['I']], [['O']]
    """

    def __init__(self, keys, **kwargs):
        self.keys = [k.split('.') for k in keys]

    def __call__(self, annotations: List[dict]) -> List[List]:
        ann_values = [[]] * len(self.keys)
        for ann in annotations:
            for i, key_rec in enumerate(self.keys):
                val = ann
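The excerpt above is cut off before the lookup finishes; the behaviour the docstring describes boils down to walking each dotted key through the nested annotation dicts. A standalone sketch of that idea (not the library's actual implementation) could look like this:

from typing import List


def extract_by_dotted_keys(annotations: List[dict], keys: List[str]) -> List[list]:
    """Walk each dotted key (e.g. 'ner.tokens') through every annotation dict."""
    key_paths = [key.split('.') for key in keys]
    values = [[] for _ in key_paths]  # one independent output list per key
    for annotation in annotations:
        for i, path in enumerate(key_paths):
            value = annotation
            for part in path:
                value = value[part]  # descend one level per path segment
            values[i].append(value)
    return values


# extract_by_dotted_keys([{'ner': {'tokens': ['I'], 'tags': ['O']}}], ['ner.tokens', 'ner.tags'])
# returns [[['I']], [['O']]], matching the docstring example above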
github deepmipt / DeepPavlov / deeppavlov / models / bidirectional_lms / elmo_bilm.py View on Github external
import logging

from deeppavlov.core.commands.utils import expand_path
# from deeppavlov.core.common.log import get_logger
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad
from deeppavlov.core.models.component import Component
from deeppavlov.core.models.tf_backend import TfModelMeta

from deeppavlov.models.bidirectional_lms.elmo.utils import load_model, load_options_latest_checkpoint
from deeppavlov.models.bidirectional_lms.elmo.data import InferBatcher

log = logging.getLogger(__name__)

@register('elmo_bilm')
class ELMoEmbedder(Component, metaclass=TfModelMeta):
    """

    """
    def __init__(self, model_dir: str, forward_direction_sequence: bool = True, backward_direction_sequence: bool = True,
                 pad_zero: bool = False, max_token: Optional[int] = None, mini_batch_size: int = 32, **kwargs) -> None:

        self.model_dir = model_dir if '://' in model_dir else str(expand_path(model_dir))

        self.forward_direction_sequence = forward_direction_sequence
        self.backward_direction_sequence = backward_direction_sequence
        if not (self.forward_direction_sequence or self.backward_direction_sequence):
            log.error('At least one of forward_direction_sequence or backward_direction_sequence'
                      ' must be True.')
            sys.exit(1)

        self.pad_zero = pad_zero
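The excerpt ends here. A sketch of instantiating the embedder shown above follows; the checkpoint path is a placeholder, and a trained bidirectional LM must already exist there for the call to succeed.

from deeppavlov.models.bidirectional_lms.elmo_bilm import ELMoEmbedder

# 'path/to/elmo_checkpoint' is a placeholder for a directory with a trained model
elmo = ELMoEmbedder(model_dir='path/to/elmo_checkpoint',
                    pad_zero=True,
                    mini_batch_size=16)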
github deepmipt / DeepPavlov / deeppavlov / models / tokenizers / nltk_tokenizer.py View on Github external
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

import nltk

from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component


@register("nltk_tokenizer")
class NLTKTokenizer(Component):
    """Class for splitting texts on tokens using NLTK

    Args:
        tokenizer: tokenization mode for `nltk.tokenize`
        download: whether to download nltk data

    Attributes:
        tokenizer: tokenizer instance from nltk.tokenize
    """
    def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False,
                 *args, **kwargs):
        if download:
            nltk.download()
        self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
        if not callable(self.tokenizer):
            raise AttributeError("Tokenizer {} is not defined in nltk.tokenize".format(tokenizer))
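A usage sketch for the tokenizer above, assuming its ``__call__`` follows the usual batch-in, batch-out Component convention (the method itself is not shown in this excerpt):

from deeppavlov.models.tokenizers.nltk_tokenizer import NLTKTokenizer

tokenizer = NLTKTokenizer(tokenizer="wordpunct_tokenize")
tokenizer(["Hello, DeepPavlov!"])
# expected to yield one token list per input string, e.g. [['Hello', ',', 'DeepPavlov', '!']]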
github deepmipt / DeepPavlov / deeppavlov / core / skill / skill.py View on Github external
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABCMeta, abstractmethod
from typing import Tuple

from deeppavlov.core.models.component import Component


class Skill(Component, metaclass=ABCMeta):
    """Abstract class for skills.

    Skill is a DeepPavlov component that handles dialog state,
    dialog history and rich content.
    """
    @abstractmethod
    def __call__(self, utterances_batch: list, history_batch: list,
                 states_batch: list = None) -> Tuple[list, list, list]:
        """Returns skill inference result.
github deepmipt / DeepPavlov / deeppavlov / models / tokenizers / spacy_tokenizer.py View on Github external
def _try_load_spacy_model(model_name: str, disable: Iterable[str] = ()):
    disable = set(disable)
    try:
        model = spacy.load(model_name, disable=disable)
    except OSError as e:
        try:
            model = __import__(model_name).load(disable=disable)
            if not isinstance(model, spacy.language.Language):
                raise RuntimeError(f'{model_name} is not a spacy model module')
        except Exception:
            raise e
    return model


@register('stream_spacy_tokenizer')
class StreamSpacyTokenizer(Component):
    """Tokenize or lemmatize a list of documents. Default spacy model is **en_core_web_sm**.
    Return a list of tokens or lemmas for a whole document.
    If called on a ``List[str]``, performs a detokenizing procedure.

    Args:
        disable: spaCy pipeline elements to disable, to speed up processing
        stopwords: a list of stopwords that should be ignored during tokenizing/lemmatizing
         and ngrams creation
        batch_size: a batch size for inner spacy multi-threading
        ngram_range: size of ngrams to create; only unigrams are returned by default
        lemmas: whether to perform lemmatizing or not
        n_threads: a number of threads for inner spacy multi-threading
        lowercase: whether to perform lowercasing or not; is performed by default by :meth:`_tokenize`
         and :meth:`_lemmatize` methods
        alphas_only: whether to filter out non-alpha tokens; is performed by default by
         :meth:`_filter` method
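An instantiation sketch based on the arguments listed in the docstring; the exact constructor signature and defaults are not shown in this excerpt and may differ, and the default spaCy model (en_core_web_sm) has to be installed.

from deeppavlov.models.tokenizers.spacy_tokenizer import StreamSpacyTokenizer

# keyword arguments mirror the docstring above
tokenizer = StreamSpacyTokenizer(lemmas=True,
                                 lowercase=True,
                                 alphas_only=True,
                                 ngram_range=[1, 1])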
github deepmipt / DeepPavlov / deeppavlov / models / augmentation / thesaurus_aug.py View on Github external
from deeppavlov.models.morpho_tagger.common_tagger import make_pos_and_tag
from deeppavlov.models.augmentation.utils.inflection import RuInflector
from deeppavlov.models.augmentation.utils.inflection import EnInflector
from deeppavlov.models.augmentation.utils.lettercaser import Lettercaser
from deeppavlov.models.augmentation.utils.thesaurus_wrapper import RuThesaurus
from deeppavlov.models.augmentation.utils.thesaurus_wrapper import EnThesaurus
from deeppavlov.models.augmentation.utils.word_filter import RuWordFilter
from deeppavlov.models.augmentation.utils.word_filter import EnWordFilter
from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

logger = getLogger(__name__)


@register("thesaurus_augmentation")
class ThesaurusAug(Component):
    """Component for augmentation, based on replacing words with synonyms from thesaurus

    Args:
        lang: language of the text to augment, 'eng' for English, 'rus' for Russian
        penalty_for_source_token: [0, 1] penalty for using source token
        replace_freq: [0, 1] frequency of replacing tokens,
                      calculated with respect to tokens that passed the other filters
        isalpha_only: flag that activates a filter based on the str.isalpha() method
        not_replaced_tokens: list of tokens that should not be replaced
        with_source_token: flag that decides whether a source token is treated as a synonym of itself
        cases: dictionary that maps a lettercase name to a function that converts a string to that lettercase
        default_case: func str -> str that defines the transformation applied
                      when the lettercase could not be detected in the 'put_in_case' func
        replaced_pos_tags: List of pos_tags that can be replaced,
                           e.g. 'NOUN' for Noun, 'VERB' for Verb, 'ADJ' for Adjective, 'ADV' for Adverb
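A constructor sketch using the parameters documented above; this assumes the keyword arguments match the docstring (the ``__init__`` itself is not shown), and the required thesaurus data must be available locally.

from deeppavlov.models.augmentation.thesaurus_aug import ThesaurusAug

augmenter = ThesaurusAug(lang='eng',
                         penalty_for_source_token=0.5,
                         replace_freq=0.7,
                         isalpha_only=True,
                         with_source_token=True,
                         replaced_pos_tags=['NOUN', 'VERB', 'ADJ', 'ADV'])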
github deepmipt / DeepPavlov / deeppavlov / models / preprocessors / char_connector.py View on Github external
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List
from logging import getLogger


from deeppavlov.core.common.registry import register
from deeppavlov.core.models.component import Component

log = getLogger(__name__)


@register('char_connector')
class CharConnector(Component):
    """ Component tranforms batch of sequences of characters to batch of strings \
            connecting characters without other symbols"""
    def __init__(self, **kwargs) -> None:
        pass

    def __call__(self, batch: List[List[str]], **kwargs) -> List[str]:
        return ["".join(sample) for sample in batch]
github deepmipt / DeepPavlov / deeppavlov / models / embedders / tfidf_weighted_embedder.py View on Github external
from typing import List, Union, Optional, Tuple

import numpy as np
from overrides import overrides

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.errors import ConfigError
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import zero_pad
from deeppavlov.core.models.component import Component

log = getLogger(__name__)


@register('tfidf_weighted')
class TfidfWeightedEmbedder(Component):
    """
    The class embeds a sentence as a weighted average of its token embeddings, \
        where the weights are special per-token coefficients. \
        Coefficients can be taken from the TFIDF-vectorizer given in ``vectorizer`` or \
        calculated as TFIDF from the counter vocabulary given in ``counter_vocab_path``.
        One can also provide ``tags_vocab_path``, a path to a vocabulary with tag weights. \
        In this case, a batch of tags should be given as the second input to the ``__call__`` method.

    Args:
        embedder: embedder instance
        tokenizer: tokenizer instance, should be able to detokenize sentence
        pad_zero: whether to pad samples or not
        mean: whether to return mean token embedding
        tags_vocab_path: optional path to vocabulary with tags weights
        vectorizer: vectorizer instance should be trained with ``analyzer="word"``
        counter_vocab_path: path to counter vocabulary
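The weighted-average idea described in the docstring can be illustrated with plain NumPy; this shows only the arithmetic, not the class's API, and the numbers are hypothetical.

import numpy as np

# hypothetical per-token embeddings (3 tokens, 4 dimensions) and their TF-IDF weights
token_embeddings = np.array([[0.1, 0.2, 0.3, 0.4],
                             [0.5, 0.5, 0.5, 0.5],
                             [0.9, 0.1, 0.0, 0.2]])
tfidf_weights = np.array([0.2, 0.7, 0.1])

# sentence embedding = TF-IDF-weighted mean of the token embeddings
sentence_embedding = (tfidf_weights[:, None] * token_embeddings).sum(axis=0) / tfidf_weights.sum()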
github deepmipt / DeepPavlov / deeppavlov / models / state_tracker / preprocessors.py View on Github external
candidates: List[Dict[str, List[str]]]) -> List[Dict[str, str]]:
        # dirty hack: everything must be passed as a batch
        if len(candidates) != 1:
            raise NotImplementedError("not implemented for candidates with length > 1")
        candidates = candidates[0]
        slot_dict = {}
        # for slot_idx, value_idx in zip(*np.where(slots)):
        for slot_idx, value_idx in enumerate(slots_values):
            slot = self.slot_vocab([[slot_idx]])[0][0]
            value = self._idx2value(slot, value_idx, candidates)
            if value not in self.exclude_values:
                slot_dict[slot] = value
        return [slot_dict]


class ActionSlotsMatcher(Component):
    """
    Matches slots with actions.
    Takes a list of slots and a list of string actions from the NLU model
    and outputs a list of actions matched with those slots.
    """

    def __init__(self, matched_actions: List[str] = [], **kwargs) -> None:
        self.matched_actions = matched_actions

    def _get_priority(self, action):
        if action in self.matched_actions:
            return len(self.matched_actions) - self.matched_actions.index(action)
        return 0

    def __call__(self, actions: Union[List[List[str]], List[List[Dict[str, Any]]]],
                 slots: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
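The ``_get_priority`` helper above ranks actions by their position in ``matched_actions``, with earlier entries scoring higher; for example:

from deeppavlov.models.state_tracker.preprocessors import ActionSlotsMatcher

matcher = ActionSlotsMatcher(matched_actions=['inform', 'request', 'confirm'])
matcher._get_priority('inform')   # -> 3 (first in the list, highest priority)
matcher._get_priority('confirm')  # -> 1
matcher._get_priority('bye')      # -> 0 (not in matched_actions)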
github deepmipt / DeepPavlov / deeppavlov / models / state_tracker / candidates.py View on Github external
    def __init__(self, special_values: List[str], **kwargs):
        self.special_vals = special_values

    def __call__(self, candidates: List[List[Dict[str, Any]]])\
            -> List[Dict[str, List[str]]]:
        result = []
        for cands in candidates:
            slot_cands = defaultdict(lambda: self.special_vals)
            for cand in cands:
                slot, value = cand['slot'], cand['value']
                slot_cands[slot] = slot_cands[slot] + [value]
            result.append(slot_cands)
        return result


class SlotsValuesMatrixBuilder(Component):
    """"""
    def __init__(self, slot_vocab: callable, max_num_values: int, **kwargs):
        self.slot_vocab = slot_vocab
        self.max_num_values = max_num_values

    def _slot2idx(self, slot):
        if slot not in self.slot_vocab:
            raise RuntimeError(f"Utterance slot {slot} doesn't match any slot"
                               " from slot_vocab")
        return self.slot_vocab([[slot]])[0][0]

    def __call__(self, cand_indexers: List[Dict[str, List[str]]]) -> List[np.ndarray]:
        utt_matrices = []
        for cand_indexer in cand_indexers:
            mat = np.zeros((len(self.slot_vocab), self.max_num_values + 2),
                           dtype=np.float32)