How to use the deeppavlov.core.data.dataset_reader.DatasetReader class in deeppavlov

To help you get started, we’ve selected a few deeppavlov examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github deepmipt / DeepPavlov / deeppavlov / dataset_readers / morphotagging_dataset_reader.py View on Github external
if not from_words and not index.isdigit():
                continue
            curr_word_sent.append(splitted[word_column])
            if not read_only_words:
                pos, tag = splitted[pos_column], splitted[tag_column]
                tag = pos if tag == "_" else "{},{}".format(pos, tag)
                curr_tag_sent.append(tag)
        if len(curr_word_sent) > 0:
            if read_only_words:
                curr_tag_sent = None
            answer.append((curr_word_sent, curr_tag_sent))
    return answer


@register('morphotagger_dataset_reader')
class MorphotaggerDatasetReader(DatasetReader):
    """Class to read training datasets in UD format"""

    URL = 'http://files.deeppavlov.ai/datasets/UD2.0_source/'

    def read(self, data_path: Union[List, str],
             language: Optional[None] = None,
             data_types: Optional[List[str]] = None,
             **kwargs) -> Dict[str, List]:
        """Reads UD dataset from data_path.

        Args:
            data_path: can be either
                1. a directory containing files. The file for data_type 'mode'
                is then data_path / {language}-ud-{mode}.conllu
                2. a list of files, containing the same number of items as data_types
            language: a language to detect filename when it is not given
github deepmipt / DeepPavlov / deeppavlov / dataset_readers / siamese_reader.py View on Github external
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
from pathlib import Path
from typing import Dict, List, Tuple

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('siamese_reader')
class SiameseReader(DatasetReader):
    """The class to read dataset for ranking or paraphrase identification with Siamese networks."""

    def read(self, data_path: str, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the dataset for ranking or paraphrase identification with Siamese networks.

        Args:
            data_path: A path to a folder with dataset files.
        """

        dataset = {'train': None, 'valid': None, 'test': None}
        data_path = expand_path(data_path)
        train_fname = data_path / 'train.csv'
        valid_fname = data_path / 'valid.csv'
        test_fname = data_path / 'test.csv'
        dataset["train"] = self._preprocess_data_train(train_fname)
        dataset["valid"] = self._preprocess_data_valid_test(valid_fname)
github deepmipt / DeepPavlov / deeppavlov / dataset_readers / kvret_reader.py View on Github external
import json
from logging import getLogger
from pathlib import Path
from typing import Dict, List

from overrides import overrides

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download_decompress, mark_done

log = getLogger(__name__)


@register('kvret_reader')
class KvretDatasetReader(DatasetReader):
    """
    A New Multi-Turn, Multi-Domain, Task-Oriented Dialogue Dataset.

    Stanford NLP released a corpus of 3,031 multi-turn dialogues in three distinct domains appropriate for an in-car assistant: calendar scheduling, weather information retrieval, and point-of-interest navigation. The dialogues are grounded through knowledge bases ensuring that they are versatile in their natural language without being completely free form.

    For details see https://nlp.stanford.edu/blog/a-new-multi-turn-multi-domain-task-oriented-dialogue-dataset/.
    """

    url = 'http://files.deeppavlov.ai/datasets/kvret_public.tar.gz'

    @staticmethod
    def _data_fname(datatype):
        assert datatype in ('train', 'dev', 'test'), "wrong datatype name"
        return 'kvret_{}_public.json'.format(datatype)

    @classmethod
github deepmipt / DeepPavlov / deeppavlov / dataset_readers / line_reader.py View on Github external
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('line_reader')
class LineReader(DatasetReader):
    """Read txt file by lines"""

    def read(self, data_path: str = None, *args, **kwargs) -> Dict:
        """Read lines from txt file

        Args:
            data_path: path to txt file

        Returns:
            A dictionary containing training, validation and test parts of the dataset obtainable via ``train``, ``valid`` and ``test`` keys.
        """

        with open(data_path) as f:
            content = f.readlines()

        dataset = dict()
github deepmipt / DeepPavlov / deeppavlov / dataset_readers / sq_reader.py View on Github external
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pickle

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('sq_reader')
class OntonotesReader(DatasetReader):
    """Reader that loads an already-prepared dataset from a pickle file.

    The class name is kept for compatibility, but ``read`` performs no
    format-specific parsing: it unpickles the file at ``data_path`` and
    returns the resulting object unchanged.
    """

    def read(self, data_path: str):
        """Load and return the pickled dataset stored at ``data_path``.

        Args:
            data_path: path to a pickle file holding the dataset object.

        Returns:
            The object that was pickled into the file; its structure is not
            validated here.

        Raises:
            OSError: if the file cannot be opened.
            pickle.UnpicklingError: if the content is not a valid pickle
                (other exceptions are possible for corrupted data).
        """
        # SECURITY: unpickling can execute arbitrary code embedded in the file,
        # so this reader must only be pointed at trusted dataset files.
        with open(data_path, 'rb') as f:
            return pickle.load(f)
github deepmipt / DeepPavlov / deeppavlov / dataset_readers / typos_kartaslov.py View on Github external
import csv
from pathlib import Path

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.utils import is_done, download, mark_done
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('typos_kartaslov_reader')
class TyposKartaslov(DatasetReader):
    def __init__(self):
        pass

    @staticmethod
    def build(data_path: str):
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)
github deepmipt / DeepPavlov / deeppavlov / dataset_readers / basic_dataset_reader.py View on Github external
from pathlib import Path
from logging import getLogger
import pandas as pd
from overrides import overrides

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader
from deeppavlov.core.data.utils import download, mark_done


log = getLogger(__name__)


@register('basic_dataset_reader')
class BasicDatasetReader(DatasetReader):
    """
    Class provides reading dataset in .csv format and \
    assigns columns with given names to `x` and `y` without any changes of data
    """

    @overrides
    def read(self, data_path: str, url: str = None,
             format: str = "csv",
             *args, **kwargs) -> dict:
        """
        Read dataset from data_path directory.
        Reading files are all data_types + extension
        (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form
        data_path will be read)

        Args:
github deepmipt / DeepPavlov / deeppavlov / dataset_readers / faq_reader.py View on Github external
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict

from pandas import read_csv

from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('faq_reader')
class FaqDatasetReader(DatasetReader):
    """Reader for FAQ dataset"""

    def read(self, data_path: str = None, data_url: str = None, x_col_name: str = 'x', y_col_name: str = 'y') -> Dict:
        """
        Read FAQ dataset from specified csv file or remote url

        Parameters:
            data_path: path to csv file of FAQ
            data_url: url to csv file of FAQ
            x_col_name: name of Question column in csv file
            y_col_name: name of Answer column in csv file

        Returns:
            A dictionary containing training, validation and test parts of the dataset obtainable via
            ``train``, ``valid`` and ``test`` keys.
        """
github deepmipt / DeepPavlov / deeppavlov / dataset_readers / squad_ranking_reader.py View on Github external
# limitations under the License.

import csv
import itertools
import random
import json
from pathlib import Path
from typing import Dict, List, Tuple

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('squad_ranking_reader')
class SquadRankingReader(DatasetReader):
    """The class to read dataset for ranking or paraphrase identification with Siamese networks."""

    def read(self, data_path: str,
             num_candidates=10,
             positive_samples=False, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the dataset for ranking or paraphrase identification with Siamese networks.

        Args:
            data_path: A path to a folder with dataset files.
        """
        self.num_candidates = num_candidates
        self.positive_samples = positive_samples
        dataset = {'train': None, 'valid': None, 'test': None}
        data_path = expand_path(data_path)
        train_fname = data_path / 'train.jsonl'
        valid_fname = data_path / 'dev.jsonl'
github deepmipt / DeepPavlov / deeppavlov / dataset_readers / msmarco_reader.py View on Github external
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
from pathlib import Path
from typing import List, Dict, Tuple
import random
from collections import defaultdict

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.common.registry import register
from deeppavlov.core.data.dataset_reader import DatasetReader


@register('msmarco_reader')
class MSMARCOReader(DatasetReader):
    """The class to read the Ubuntu V2 dataset from csv files.

    Please, see https://github.com/rkadlec/ubuntu-ranking-dataset-creator.
    """

    def read(self, data_path: str,
             positive_samples=False,
             random_seed=243,
             *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the Ubuntu V2 dataset from csv files.

        Args:
            data_path: A path to a folder with dataset csv files.
            positive_samples: if `True`, only positive context-response pairs will be taken for train
        """