How to use the flair.data.Corpus class in flair

To help you get started, we’ve selected a few flair examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github flairNLP / flair / tests / test_data.py View on Github external
def test_tagged_corpus_downsample():
    """Set up a corpus whose train split repeats one labeled sentence ten times.

    Only the corpus construction is visible in this excerpt; the dev and
    test splits are passed as empty lists.
    """
    labeled = Sentence(
        "I love Berlin.", labels=[Label("class_1")], use_tokenizer=segtok_tokenizer
    )

    # Ten references to the very same Sentence object, not ten copies.
    train_split = [labeled for _ in range(10)]
    no_dev = []
    no_test = []

    corpus: Corpus = Corpus(train_split, no_dev, no_test)
github flairNLP / flair / flair / datasets.py View on Github external
skip_header=skip_header,
                **fmtparams,
            )
        else:
            train_length = len(train)
            dev_size: int = round(train_length / 10)
            splits = random_split(train, [train_length - dev_size, dev_size])
            train = splits[0]
            dev = splits[1]

        super(CSVClassificationCorpus, self).__init__(
            train, dev, test, name=data_folder.name
        )


class ParallelTextCorpus(Corpus):
    def __init__(
        self,
        source_file: Union[str, Path],
        target_file: Union[str, Path],
        name: str = None,
        use_tokenizer: bool = True,
        max_tokens_per_doc=-1,
        max_chars_per_doc=-1,
        in_memory: bool = True,
    ):
        """
        Instantiates a Corpus for text classification from CSV column formatted data

        :param data_folder: base folder with the task data
        :param train_file: the name of the train file
        :param test_file: the name of the test file
github flairNLP / flair / flair / data_fetcher.py View on Github external
dev_file = file
                if "testa" in file_name:
                    dev_file = file
                if "testb" in file_name:
                    test_file = file

        log.info("Reading data from {}".format(data_folder))
        log.info("Train: {}".format(train_file))
        log.info("Dev: {}".format(dev_file))
        log.info("Test: {}".format(test_file))

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(train_file)
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(test_file)
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(dev_file)

        return Corpus(
            sentences_train, sentences_dev, sentences_test, name=data_folder.name
        )
github RasaHQ / rasa / ner-evaluation / evaluation / evaluate_flair.py View on Github external
def training_data_to_corpus(data_train: TrainingData):
    """Convert training examples into a Corpus carrying per-token "ner" tags.

    Each example's text becomes a ``Sentence``. A token is tagged with an
    entity's ``"entity"`` value when the token's character span lies fully
    inside that entity's ``"start"``/``"end"`` span. If several entities
    cover the same token, ``add_tag`` is called once per match, in order.

    :param data_train: object exposing ``training_examples``; every example
        must provide ``.text`` and ``.get("entities")``.
    :return: a Corpus whose train and dev splits are built from the same
        sentence list and whose test split is empty.
    """
    sentences = []

    for ex in data_train.training_examples:
        sentence = Sentence(ex.text)

        # An example without entity annotations may return None here;
        # treat that as "no entities" instead of failing on iteration.
        # Looking it up once also avoids repeating the call for every token.
        entities = ex.get("entities") or []

        for token in sentence.tokens:
            for entity in entities:
                if (
                    token.start_pos >= entity["start"]
                    and token.end_pos <= entity["end"]
                ):
                    token.add_tag("ner", entity["entity"])

        sentences.append(sentence)

    # NOTE(review): train and dev intentionally(?) share the same sentences,
    # as in the original code — confirm this is desired for evaluation.
    return Corpus(
        train=CustomDataset(sentences),
        dev=CustomDataset(sentences),
        test=CustomDataset([]),
    )
github flairNLP / flair / flair / data_fetcher.py View on Github external
)
        else:
            sentences_dev: List[Sentence] = [
                sentences_train[i]
                for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
            ]
            sentences_train = [x for x in sentences_train if x not in sentences_dev]

        if tag_to_biloes is not None:
            # convert tag scheme to iobes
            for sentence in sentences_train + sentences_test + sentences_dev:
                sentence.convert_tag_scheme(
                    tag_type=tag_to_biloes, target_scheme="iobes"
                )

        return Corpus(
            sentences_train, sentences_dev, sentences_test, name=data_folder.name
        )
github flairNLP / flair / flair / data_fetcher.py View on Github external
if dev_file is not None:
            sentences_dev: List[
                Sentence
            ] = NLPTaskDataFetcher.read_text_classification_file(
                dev_file,
                use_tokenizer=use_tokenizer,
                max_tokens_per_doc=max_tokens_per_doc,
            )
        else:
            sentences_dev: List[Sentence] = [
                sentences_train[i]
                for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
            ]
            sentences_train = [x for x in sentences_train if x not in sentences_dev]

        return Corpus(sentences_train, sentences_dev, sentences_test)
github flairNLP / flair / flair / datasets.py View on Github external
from flair.data import (
    Sentence,
    Corpus,
    Token,
    FlairDataset,
    DataPair,
    Image,
    space_tokenizer,
    segtok_tokenizer,
)
from flair.file_utils import cached_path, unzip_file

log = logging.getLogger("flair")


class ColumnCorpus(Corpus):
    def __init__(
        self,
        data_folder: Union[str, Path],
        column_format: Dict[int, str],
        train_file=None,
        test_file=None,
        dev_file=None,
        tag_to_bioes=None,
        comment_symbol: str = None,
        in_memory: bool = True,
        encoding: str = "utf-8",
        document_separator_token: str = None,
    ):
        """
        Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
github flairNLP / flair / flair / data.py View on Github external
return ConcatDataset([self.train, self.dev, self.test])

    def make_tag_dictionary(self, tag_type: str) -> Dictionary:

        # Make the tag dictionary
        tag_dictionary: Dictionary = Dictionary()
        tag_dictionary.add_item("O")
        for sentence in self.get_all_sentences():
            for token in sentence.tokens:
                tag_dictionary.add_item(token.get_tag(tag_type).value)
        tag_dictionary.add_item("")
        tag_dictionary.add_item("