How to use the forte.data.readers.base_reader.PackReader class in forte

To help you get started, we’ve selected a few forte examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github asyml / forte / forte / data / readers / file_reader.py View on Github external
from forte.data.io_utils import dataset_path_iterator
from forte.data.readers.base_reader import PackReader, MultiPackReader
from forte.data.data_pack import DataPack
from forte.data.multi_pack import MultiPack
from forte.data.base_pack import PackType

logger = logging.getLogger(__name__)

# Public API of this module. ``PackReader`` is imported from
# ``base_reader`` above and deliberately re-exported here for convenience.
__all__ = [
    "MonoFileReader",
    "PackReader"
]


class MonoFileReader(PackReader, ABC):
    """Base class for readers that build one data pack per text file.

    Concrete mono-file data readers should inherit from this class.
    """

    # pylint: disable=no-self-use
    def _cache_key_function(self, file_directory: str):
        # The cache key is the last '/'-separated path component.
        *_, tail = file_directory.split('/')
        return tail

    # pylint: disable=no-self-use
    def _collect(self, file_directory: str) -> Iterator[str]:  # type: ignore
        """Iterate over every file found under ``file_directory``.

        :param file_directory: the path to a single directory containing the
            files.
        :return: Iterator[Any] collections to iterate over
        """
        return dataset_path_iterator(file_directory, "")
github asyml / forte / forte / data / datasets / wikipedia / dump_reader.py View on Github external
import mwxml
from mwlinks.libs.common import Span
from mwlinks.libs.wikilink import Wikilink

from forte.data import DataPack
from forte.data.ontology import wiki_ontology
from forte.data.readers.base_reader import PackReader
from forte.data.datasets.wikipedia import page_parser

__all__ = [
    "WikiDumpReader",
]
logger = logging.getLogger(__name__)


class WikiDumpReader(PackReader):
    """Reader that parses Wikipedia dump data into ``DataPack`` objects.

    Only part of the class is visible in this snippet; the parsing
    methods themselves are defined elsewhere.
    """

    def __init__(self, links_to_ignore: Optional[Set[str]] = None):
        """
        Args:
            links_to_ignore: Wikilink prefixes whose links should be
                skipped. Defaults to ``{"File", "Category", "wikt"}``
                when ``None`` is given.
        """
        super().__init__()
        self._ontology = wiki_ontology

        if links_to_ignore is None:
            # Default ignoring link types.
            self.links_to_ignore = {"File", "Category", "wikt"}
        else:
            self.links_to_ignore = links_to_ignore

    @property
    def pack_type(self):
        # This reader produces mono (single) data packs.
        return DataPack

    def _cache_key_function(self, collection: Any) -> str:
        # NOTE(review): declared ``-> str`` but falls through returning
        # None — a stub here, or truncated in this excerpt; confirm
        # against the full source.
        pass
github asyml / forte / forte / data / readers / race_multi_choice_qa_reader.py View on Github external
import json
import os
from typing import Any, Iterator

from forte.data.data_pack import DataPack
from forte.data.data_utils_io import dataset_path_iterator
from forte.data.readers.base_reader import PackReader
from ft.onto.race_multi_choice_qa_ontology import (
    RaceDocument, Passage, Question, Option)

__all__ = [
    "RACEMultiChoiceQAReader",
]


class RACEMultiChoiceQAReader(PackReader):
    r""":class:`RACEMultiChoiceQAReader` is designed to read in RACE multi
    choice qa dataset.
    """

    def _collect(self, json_directory) -> Iterator[Any]:  # type: ignore
        r"""Locate the json files that live under ``json_directory``.

        Args:
            json_directory: directory containing the json files.

        Returns: Iterator over paths to .json files
        """
        file_iter = dataset_path_iterator(json_directory, "")
        return file_iter

    def _cache_key_function(self, json_file: str) -> str:
github asyml / forte / forte / data / readers / html_reader.py View on Github external
else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # pylint: disable=attribute-defined-outside-init
        self.rawdata = rawdata[i:]


class HTMLReader(PackReader):
    r""":class:`HTMLReader` is designed to read in list of html strings.

    It takes in list of html strings, cleans the HTML tags and stores the
    cleaned text in pack.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the reader with both input-mode flags cleared."""
        super().__init__(*args, **kwargs)
        # These flags record which input form (raw HTML string vs. a
        # file location) was supplied to the reader.
        self.init_with_html = False
        self.init_with_fileloc = False

    def _collect(self, content) -> Iterator[str]:  # type: ignore
        r"""Could be called with a directory, a particular file location or a
        list of strings. If the string is an HTML string, it will be cleaned.

        Args:
github asyml / forte / forte / data / readers / conllu_ud_reader.py View on Github external
"""
from typing import Iterator, Dict, Tuple, Any

from ft.onto.base_ontology import (
    Document, Sentence, Token, Dependency, EnhancedDependency)

from forte.data.data_utils_io import dataset_path_iterator
from forte.data.data_pack import DataPack
from forte.data.readers.base_reader import PackReader

__all__ = [
    "ConllUDReader"
]


class ConllUDReader(PackReader):
    r""":class:`conllUReader` is designed to read in the Universal Dependencies
    2.4 dataset.
    """

    def _cache_key_function(self, data_pack: Any) -> str:
        """Use the pack's document name as its cache key.

        Raises:
            ValueError: if the pack carries no document id.
        """
        pack_name = data_pack.meta.pack_name
        if pack_name is None:
            raise ValueError("data_pack does not have a document id")
        return pack_name

    def _collect(self, *args, **kwargs) -> Iterator[Any]:
        # pylint: disable = unused-argument
        r"""Iterator over conll files in the data_source.

        Args:
            args: args[0] is the directory to the conllu files.
            kwargs:
github asyml / forte / forte / data / readers / corpus_reader.py View on Github external
from typing import Iterator, Tuple

from texar.torch import HParams

from forte.data.data_pack import DataPack
from forte.data.readers.base_reader import PackReader
from forte.common.resources import Resources

from ft.onto.base_ontology import Document

__all__ = [
    "CorpusReader"
]


class CorpusReader(PackReader):

    def __init__(self):
        """Create the reader; configuration is supplied later via
        ``initialize``."""
        super().__init__()
        self.configs = None

    def initialize(self, resources: Resources, configs: HParams):
        """Record ``configs`` on the reader; ``resources`` is accepted
        but not used."""
        # pylint: disable = unused-argument
        self.configs = configs

    def _collect(self, *args, **kwargs) -> Iterator[Tuple[str, str]]:
        # pylint: disable = unused-argument, undefined-variable
        dir_path: str = args[0]

        corpus_file_path = os.path.join(dir_path, 'collection.tsv')

        with open(corpus_file_path, 'r') as file:
github asyml / forte / forte / data / datasets / wikipedia / dbpedia_infobox_reader.py View on Github external
info_box.key = v.toPython()
        info_box.value = get_resource_name(o)


def read_index(pack_index_path: str) -> Dict[str, str]:
    """Load a tab-separated (page name, pack path) index file.

    Args:
        pack_index_path: path to a TSV file mapping page names to the
            file path of the corresponding pack.

    Returns:
        A dict from page name to pack path. Later rows win on
        duplicate page names.
    """
    logging.info("Reading pack index from %s", pack_index_path)

    with open(pack_index_path) as idx:
        index: Dict[str, str] = {
            name: path for name, path in csv.reader(idx, delimiter='\t')
        }
    return index


class DBpediaInfoBoxReader(PackReader):
    """Reader that loads DBpedia info-box data for wiki packs.

    ``__init__`` only declares attributes; their values are bound in
    :meth:`initialize`. Only part of the class is visible here.
    """

    def __init__(self):
        super().__init__()
        # Annotation-only declarations; populated by ``initialize``.
        self.pack_index: Dict[str, str]
        self.pack_dir: str
        self.redirects: Dict[str, str]
        self.logger = logging.getLogger(__name__)

    def initialize(self, resources: Resources, configs: Config):
        """Load the pack index, pack directory, redirect table and the
        literal info-box reader from the supplied resources and config.
        """
        # pylint: disable=attribute-defined-outside-init
        self.pack_index = read_index(configs.pack_index)
        self.pack_dir = configs.pack_dir

        # NOTE(review): assumes the redirect table was pre-loaded into
        # ``resources`` under the key 'redirects' — confirm with caller.
        self.redirects = resources.get('redirects')

        self.literal_info_reader = NIFBufferedContextReader(
            configs.mapping_literals)
github asyml / forte / forte / data / readers / conll03_reader.py View on Github external
import codecs
import logging
import os
from typing import Iterator, Any

from forte.data.data_pack import DataPack
from forte.data.data_utils_io import dataset_path_iterator
from forte.data.readers.base_reader import PackReader
from ft.onto.base_ontology import Token, Sentence, Document

__all__ = [
    "CoNLL03Reader"
]


class CoNLL03Reader(PackReader):
    r"""Reader for the CoNLL03-ner dataset."""

    def _collect(self, conll_directory) -> Iterator[Any]:  # type: ignore
        r"""Walk ``conll_directory`` and yield the conll files inside.

        Args:
            conll_directory: directory to the conll files.

        Returns: Iterator over files in the path with conll extensions.
        """
        logging.info("Reading .conll from %s", conll_directory)
        file_iter = dataset_path_iterator(conll_directory, "conll")
        return file_iter

    def _cache_key_function(self, conll_file: str) -> str:
        """Key each cached pack by the conll file's base name."""
        _, tail = os.path.split(conll_file)
        return tail
github asyml / forte / forte / data / readers / wiki_passage_qa_reader.py View on Github external
from typing import Iterator, List, Tuple, Optional, Union

import pandas as pd
from texar.torch import HParams

from ft.onto.base_ontology import Query, Document, Passage
from forte.data import DataPack
from forte.data.readers.base_reader import PackReader
from forte.common.resources import Resources

__all__ = [
    "WikiPassageQAReader"
]


class WikiPassageQAReader(PackReader):

    DocInfoType = Tuple[bool, str, List[str], Optional[List[str]]]

    def __init__(self):
        """Create the reader; configuration arrives via ``initialize``."""
        super().__init__()
        self.configs = None

    def initialize(self, resources: Resources, configs: HParams):
        """Store ``configs`` on the reader; ``resources`` is unused."""
        # pylint: disable = unused-argument
        self.configs = configs

    def _collect(self, *args, **kwargs) -> Iterator[DocInfoType]:
        # pylint: disable = unused-argument, undefined-variable
        """
        Reads the contents of the input `dir_path` and returns a info to
        populate query or document data packs. It reads the documents from the
github asyml / forte / forte / data / readers / ontonotes_reader.py View on Github external
from typing import (Any, DefaultDict, Iterator, List, NamedTuple, Optional,
                    Set, Tuple)

from forte.data.data_pack import DataPack
from forte.data.data_utils_io import dataset_path_iterator
from forte.data.readers.base_reader import PackReader
from ft.onto.base_ontology import (
    CoreferenceGroup, Document, EntityMention, PredicateArgument, PredicateLink,
    PredicateMention, Sentence, Token)

__all__ = [
    "OntonotesReader",
]


class OntonotesReader(PackReader):
    r""":class:`OntonotesReader` is designed to read in the English OntoNotes
    v5.0 data in the datasets used by the CoNLL 2011/2012 shared tasks. To use
    this Reader, you must follow the instructions provided `here (v12 release):
    `_:, which will allow you to
    download the CoNLL style annotations for the OntoNotes v5.0 release
    – LDC2013T19.tgz obtained from LDC.

    Args:
        column_format: A list of strings indicating which field each column in a
            line corresponds to. The length of the list should be equal to the
            number of columns in the files to be read. Available field types
            include:

            - ``"document_id"``
            - ``"part_number"``
            - ``"word"``