How to use the forte.processors.base.PackProcessor class in forte

To help you get started, we’ve selected a few forte examples based on popular ways PackProcessor is used in public projects.

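PackProcessor is the base class for processors that operate on one DataPack at a time: a subclass implements _process(self, input_pack) and is added to a Pipeline. Below is a minimal sketch of that workflow; the WhitespaceCounter processor is hypothetical, and the StringReader wiring is an assumption based on common forte usage rather than one of the examples on this page.

from forte.data.data_pack import DataPack
from forte.data.readers import StringReader  # assumed reader for raw strings
from forte.pipeline import Pipeline
from forte.processors.base import PackProcessor


class WhitespaceCounter(PackProcessor):
    """Hypothetical processor: appends a whitespace token count to the text."""

    def _process(self, input_pack: DataPack):
        count = len(input_pack.text.split())
        input_pack.set_text(input_pack.text + '\n[' + str(count) + ' tokens]')


pipeline = Pipeline[DataPack]()
pipeline.set_reader(StringReader())
pipeline.add(WhitespaceCounter())
pipeline.initialize()
pack = pipeline.process('Forte processors operate on data packs.')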

asyml/forte: forte/processors/sentence_predictor.py (view on GitHub)
from forte.data import DataPack
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import Sentence

__all__ = [
    "PeriodSentenceSegmenter"
]


class PeriodSentenceSegmenter(PackProcessor):
    """
    A dummy sentence segmenter which segments sentences based only on periods.
    Used for unit tests.
    """
    def _process(self, input_pack: DataPack):
        # pylint: disable=no-self-use
        text = input_pack.text

        begin_pos = 0
        while begin_pos < len(text):
            end_pos = text.find('.', begin_pos)
            if end_pos == -1:
                end_pos = len(text) - 1
            sentence_entry = Sentence(input_pack, begin_pos, end_pos + 1)
            input_pack.add_or_get_entry(sentence_entry)
            # Advance past this period so the loop terminates.
            begin_pos = end_pos + 1
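
After the segmenter runs, the Sentence annotations it created can be read back with DataPack.get, the same retrieval API the NLTK snippets below use for Token. A brief hypothetical sketch:

# Hypothetical: iterate the sentences the segmenter just created.
for sentence in input_pack.get(Sentence):
    print(sentence.text)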

asyml/forte: forte/processors/nltk_processors.py (view on GitHub)
for i in range(len(token_texts))]
        for token, lemma in zip(token_entries, lemmas):
            token.lemma = lemma


def penn2morphy(penntag: str) -> str:
    r"""Converts tags from Penn format to Morphy.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    if penntag[:2] in morphy_tag:
        return morphy_tag[penntag[:2]]
    else:
        return 'n'
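
For instance, any verb tag maps to 'v', and tags outside the mapping fall back to the noun class:

penn2morphy('VBZ')  # 'v' (verb)
penn2morphy('JJR')  # 'a' (adjective)
penn2morphy('CC')   # 'n' (fallback for unmapped tags)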


class NLTKChunker(PackProcessor):
    r"""A wrapper of NLTK chunker.
    """

    def __init__(self):
        super().__init__()
        self.chunker = None

    # pylint: disable=unused-argument
    def initialize(self, resources: Resources, configs: Config):
        super().initialize(resources, configs)
        self.chunker = RegexpParser(configs.pattern)

    @classmethod
    def default_configs(cls):
        r"""This defines a basic config structure for NLTKChunker.
        """

asyml/forte: forte/processors/attribute_masking_processor.py (view on GitHub)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor

__all__ = [
    "AttributeMasker"
]


class AttributeMasker(PackProcessor):

    # pylint: disable=attribute-defined-outside-init
    def initialize(self, _: Resources, config: Config):
        self.fields = config.kwargs

    @classmethod
    def default_configs(cls) -> Dict[str, Any]:
        r"""Default config for this processor.

        Example usage is shown below

        .. code-block:: python

            {
                "kwargs": {
                    Token: ["ner"]
                }
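
The config therefore maps each entry type to the attribute names to mask. Below is a hypothetical sketch of configuring the processor in a pipeline; passing the config dict through Pipeline.add is an assumption, not something this snippet shows:

# Hypothetical: mask the "ner" attribute of every Token in each pack.
from ft.onto.base_ontology import Token

pipeline.add(AttributeMasker(), config={"kwargs": {Token: ["ner"]}})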

asyml/forte: examples/content_rewriter/prepare_pipeline.py (view on GitHub)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The main running pipeline for the rewriter.
"""
from examples.content_rewriter.reader import TableReader
from forte.data.data_pack import DataPack
from forte.pipeline import Pipeline

from forte.processors.base import PackProcessor
from forte.processors.writers import PackNameJsonPackWriter
from ft.onto.base_ontology import Utterance


class Instructor(PackProcessor):
    def __init__(self, instruction: str):
        super().__init__()
        self.instruction = instruction

    def _process(self, input_pack: DataPack):
        input_pack.set_text(input_pack.text + '\n' + self.instruction)
        u = Utterance(input_pack,
                      len(input_pack.text) - len(self.instruction),
                      len(input_pack.text))
        u.speaker = 'ai'


instruct_text = 'This is an example to use the chatbot interface with the ' \
                'content rewriter model. To run this example, follow the ' \
                'instructions here "https://github.com/asyml/forte' \
                '/tree/master/examples/content_rewriter" to obtain ' \

asyml/forte: forte/processors/nltk_processors.py (view on GitHub)
"""

    def __init__(self):
        super().__init__()
        self.token_component = None

    def _process(self, input_pack: DataPack):
        token_entries = list(input_pack.get(entry_type=Token,
                                            components=self.token_component))
        token_texts = [token.text for token in token_entries]
        taggings = pos_tag(token_texts)
        for token, tag in zip(token_entries, taggings):
            token.pos = tag[1]


class NLTKLemmatizer(PackProcessor):
    r"""A wrapper of NLTK lemmatizer.
    """

    def __init__(self):
        super().__init__()
        self.token_component = None
        self.lemmatizer = WordNetLemmatizer()

    def _process(self, input_pack: DataPack):
        token_entries: List[Token] = list(input_pack.get(
            entry_type=Token, components=self.token_component))

        token_texts: List[str] = []
        token_poses: List[str] = []
        for token in token_entries:
            token_texts.append(token.text)

asyml/forte: forte/processors/nltk_processors.py (view on GitHub)
                if hasattr(chunk, 'label'):
                    # For example:
                    # chunk: Tree('NP', [('This', 'DT'), ('tool', 'NN')])
                    begin_pos = token_entries[index].span.begin
                    end_pos = token_entries[index + len(chunk) - 1].span.end
                    phrase = Phrase(input_pack, begin_pos, end_pos)
                    phrase.phrase_type = chunk.label()

                    index += len(chunk)
                else:
                    # For example:
                    # chunk: ('is', 'VBZ')
                    index += 1


class NLTKSentenceSegmenter(PackProcessor):
    r"""A wrapper of NLTK sentence tokenizer.
    """

    def __init__(self):
        super().__init__()
        self.sent_splitter = PunktSentenceTokenizer()

    def _process(self, input_pack: DataPack):
        for begin, end in self.sent_splitter.span_tokenize(input_pack.text):
            Sentence(input_pack, begin, end)
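
span_tokenize yields (begin, end) character offsets, which map directly onto Forte annotation spans. For example:

# Punkt returns character spans rather than substrings:
list(PunktSentenceTokenizer().span_tokenize('First one. Second one.'))
# [(0, 10), (11, 22)]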


class NLTKNER(PackProcessor):
    r"""A wrapper of NLTK NER.
    """

asyml/forte: forte/processors/tokenization_predictor.py (view on GitHub)
from nltk.tokenize import word_tokenize

from forte.data import DataPack
from forte.data.ontology import base_ontology
from forte.processors.base import PackProcessor, ProcessInfo

__all__ = [
    "NLTKWordTokenizer",
]


class NLTKWordTokenizer(PackProcessor):
    """
    A wrapper of NLTK word tokenizer.
    """
    def __init__(self):
        super().__init__()
        self.sentence_component = None
        self._ontology = base_ontology

    def _define_input_info(self) -> ProcessInfo:
        input_info: ProcessInfo = {
            self._ontology.Sentence: ["span"]
        }
        return input_info

    def _define_output_info(self) -> ProcessInfo:
        output_info: ProcessInfo = {

asyml/forte: forte/processors/lowercaser_processor.py (view on GitHub)
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor

__all__ = [
    "LowerCaserProcessor",
]


class LowerCaserProcessor(PackProcessor):

    def _process(self, input_pack: DataPack):
        input_pack.set_text(input_pack.text.lower())
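
With _process defined, these few lines are already a complete processor. A hypothetical sketch, reusing the Pipeline and StringReader wiring assumed in the first example:

pipeline = Pipeline[DataPack]()
pipeline.set_reader(StringReader())
pipeline.add(LowerCaserProcessor())
pipeline.initialize()
pack = pipeline.process('MIXED Case TEXT.')
assert pack.text == 'mixed case text.'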

asyml/forte: forte/processors/spacy_processors.py (view on GitHub)
from typing import Optional

import spacy
from spacy.language import Language
from spacy.cli.download import download

from forte.common import ProcessExecutionException
from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.processors.base import PackProcessor
from ft.onto.base_ontology import EntityMention, Sentence, Token

__all__ = [
    "SpacyProcessor",
]


class SpacyProcessor(PackProcessor):
    """
    A wrapper for spaCy processors.
    """

    def __init__(self):
        super().__init__()
        self.processors: str = ""
        self.nlp: Optional[Language] = None
        self.lang_model: str = ''

    def set_up(self):
        try:
            self.nlp = spacy.load(self.lang_model)
        except OSError:
            download(self.lang_model)
            self.nlp = spacy.load(self.lang_model)

asyml/forte: forte/processors/pretrained_encoder_processors.py (view on GitHub)
import texar.torch as tx
import torch

from forte.common.configuration import Config
from forte.common.resources import Resources
from forte.data.data_pack import DataPack
from forte.data.ontology.top import Annotation
from forte.processors.base import PackProcessor
from forte.utils.utils import get_class

__all__ = [
    "PretrainedEncoder",
]


class PretrainedEncoder(PackProcessor):
    r"""A wrapper of Texar pre-trained encoders.

    This processor will compute the embedding vectors for entries of type
    ``Annotation`` using pre-trained models. The user can specify the
    pre-trained model type and the annotation class name via configuration.
    For the full list of pre-trained models supported, see
    :meth:`default_configs` for more details. The processor will add an
    embedding vector to all entries matching the specified entry type. The
    resulting vector can be accessed via the embedding field of the annotations.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = None
        self.encoder = None
        self.entry_type = None
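
Per the docstring, each matched annotation gains an embedding value once the processor runs. A hypothetical sketch of reading the vectors back, assuming the processor was configured with Sentence as the entry type:

# Hypothetical: read vectors computed by PretrainedEncoder for sentences.
from ft.onto.base_ontology import Sentence

for sentence in pack.get(Sentence):
    vector = sentence.embedding  # set by the processor during processing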