How to use the torchtext.data.Example.fromlist function in torchtext

To help you get started, we’ve selected a few examples of torchtext.data.Example.fromlist, based on popular ways it is used in public projects.

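Before the snippets, here is a minimal, self-contained sketch of the pattern they all share; the field names, sentences, and labels below are invented for illustration. Example.fromlist pairs each value in the given list, by position, with the corresponding (name, Field) tuple in fields, preprocesses the value with that Field, and returns a single Example. Note that Example and Field are part of the legacy torchtext API (moved to torchtext.legacy.data in torchtext 0.9 and removed in later releases).

from torchtext import data  # on torchtext 0.9-0.11 use: from torchtext.legacy import data

# Two fields: tokenized, lowercased text and a raw integer label (names are illustrative only)
TEXT = data.Field(sequential=True, tokenize=str.split, lower=True)
LABEL = data.Field(sequential=False, use_vocab=False)
fields = [('text', TEXT), ('label', LABEL)]

# Toy records; in the snippets below these come from TSV files, folders of .txt files, stdin, etc.
rows = [
    ("This movie was great", 1),
    ("Not worth watching", 0),
]

# One Example per record: each value is preprocessed by the Field in the same position
examples = [data.Example.fromlist([text, label], fields) for text, label in rows]

dataset = data.Dataset(examples, fields)
TEXT.build_vocab(dataset)

print(dataset[0].text, dataset[0].label)  # ['this', 'movie', 'was', 'great'] 1

Every example below follows this shape: build a list of (name, Field) pairs, call data.Example.fromlist once per record, and pass the resulting list to a data.Dataset (usually through a Dataset subclass's __init__).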

github MultiPath / Squirrel / data_loader.py
                    if max_len is not None:
                        flag = 0
                        for line in lines:
                            if len(line.split()) > max_len:
                                flag = 1
                                break
                        if flag == 1:
                            continue   

                    examples.append(lines)
                    out_step += 1

                if (out_step % buffer == 0) and (out_step > 0):    # yield the cached examples once the buffer is full
                    # examples = sorted(examples, key=lambda x: sum([len(xi.split()) for xi in x]) )
                    for it, example in enumerate(examples):
                        yield data.Example.fromlist(example, fields)

                    examples = []
github microsoft / samples-for-ai / examples / NLPTutorials / paraphrase_identification / paraphrase_identification.py
    def __init__(self, path, fields, separator="\t", **kwargs):
        examples = []
        with codecs.open(path, 'r', encoding='utf-8') as input_file:
            for idx, line in enumerate(input_file):
                line = line.strip()
                if idx != 0 and len(line) != 0:
                    label, _, _, sentence1, sentence2 = line.split(separator)
                    columns = []
                    columns.append(tokenize_line_en(sentence1))
                    columns.append(tokenize_line_en(sentence2))
                    columns.append([int(label)])
                    examples.append(data.Example.fromlist(columns, fields))
        super(MSRPDataset, self).__init__(examples, fields, **kwargs)
github bamtercelboo / pytorch_SRU / loaddata / loadingdata_Twitter.py
        if examples is None:
            path = os.path.join(path, file)
            examples = []
            with open(path) as f:
                a, b = 0, 0
                for line in f.readlines():
                    sentence, flag = line.strip().split(' ||| ')
                    # clean the string in every sentence
                    sentence = clean_str(sentence)
                    # labels 0/1 count as negative, 3/4 as positive; neutral tweets (2) are skipped
                    if flag in ('0', '1'):
                        a += 1
                        examples += [data.Example.fromlist([sentence, 'negative'], fields=fields)]
                    elif flag in ('3', '4'):
                        b += 1
                        examples += [data.Example.fromlist([sentence, 'positive'], fields=fields)]
                print("a {} b {} ".format(a, b))
        super(Twitter, self).__init__(examples, fields, **kwargs)
github tacchinotacchi / distil-bilstm / generate_dataset.py
        # build lists of word indexes by POS tag
        pos_dict = build_pos_dict(sentences)
        # Generate augmented samples
        sentences = augmentation(sentences, pos_dict)
    else:
        sentences = [text for text, _ in input_tsv]

    # Load teacher model
    model = BertForSequenceClassification.from_pretrained(args.model).to(device)
    tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=True)

    # Assign labels with teacher
    teacher_field = data.Field(sequential=True, tokenize=tokenizer.tokenize, lower=True, include_lengths=True, batch_first=True)
    fields = [("text", teacher_field)]
    if not args.no_augment:
        examples = [data.Example.fromlist([" ".join(words)], fields) for words in sentences]
    else:
        examples = [data.Example.fromlist([text], fields) for text in sentences]
    augmented_dataset = data.Dataset(examples, fields)
    teacher_field.vocab = BertVocab(tokenizer.vocab)
    new_labels = BertTrainer(model, device, batch_size=args.batch_size).infer(augmented_dataset)

    # Write to file
    with open(args.output, "w") as f:
        f.write("sentence\tscores\n")
        for sentence, rating in zip(sentences, new_labels):
            if not args.no_augment:
                text = " ".join(sentence)
            else:
                text = sentence
            f.write("%s\t%.6f %.6f\n" % (text, *rating))
github tatsuokun / context2vec / src / util / batch.py
    def _get_examples(self, items: list, fields: list):
        return [data.Example.fromlist(item, fields) for item in items]
github joeynmt / joeynmt / joeynmt / data.py
        :param kwargs: Passed to the constructor of data.Dataset.
        """

        fields = [('src', field)]

        if hasattr(path, "readline"):  # special usage: stdin
            src_file = path
        else:
            src_path = os.path.expanduser(path + ext)
            src_file = open(src_path)

        examples = []
        for src_line in src_file:
            src_line = src_line.strip()
            if src_line != '':
                examples.append(data.Example.fromlist(
                    [src_line], fields))

        src_file.close()

        super(MonoDataset, self).__init__(examples, fields, **kwargs)
github outcastofmusic / quick-nlp / src / quicknlp / data / datasets.py
    def get_examples_from_file(self, path: str, fields: List[NamedField], format: str, encoding: str = 'utf-8',
                               skip_header: bool = True) -> Tuple[List[Example], List[NamedField]]:
        if format.lower() in ["csv", "tsv"]:
            sep = "," if format.lower() == "csv" else "\t"
            data = pd.read_csv(os.path.expanduser(path), encoding=encoding, header=0 if skip_header else None,
                               sep=sep)
        elif format.lower() == "json":
            data = pd.read_json(os.path.expanduser(path), encoding=encoding)
        examples = []
        for _, row in data.iterrows():
            examples.append(Example.fromlist(row.values.tolist(), fields))
        return examples, fields
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / loaddata / mydatasets.py
string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with open(os.path.join(path, 'rt-polarity.neg')) as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields) for line in f]
            with open(os.path.join(path, 'rt-polarity.pos')) as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields) for line in f]
        super(MR, self).__init__(examples, fields, **kwargs)
github MillionIntegrals / vel / vel / sources / nlp / imdb.py
"""
        cache_file = os.path.join(path, 'examples_cache.pk')

        fields = [('text', text_field), ('label', label_field)]

        if os.path.exists(cache_file):
            with open(cache_file, 'rb') as fp:
                examples = pickle.load(fp)
        else:
            examples = []

            for label in ['pos', 'neg']:
                for fname in glob.iglob(os.path.join(path, label, '*.txt')):
                    with io.open(fname, 'r', encoding="utf-8") as f:
                        text = f.readline()
                    examples.append(data.Example.fromlist([text, label], fields))

            with open(cache_file, 'wb') as fp:
                pickle.dump(examples, file=fp)

        data.Dataset.__init__(self, examples, fields, **kwargs)
github mingu600 / Unsupervised-Style-Transfer / load_data.py
        punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e', 'ltd', 'jr', 'sr', 'co', 'st', 'ms', 'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sept', 'nov', 'dec'])
        sentence_splitter = PunktSentenceTokenizer(punkt_param)
        fields = [('text', text_field)]
        if len(examples) == 0:
            examples = []
            sentences = []
            fp = open(path)
            txt = fp.read()
            txt = txt.replace('?"', '? "').replace('!"', '! "').replace('."', '. "').replace("?'", "? '").replace("!'", "! '").replace(".'", ". '").replace('\n', ' ')
            sentences += sentence_splitter.tokenize(txt.lower())
            for sent in sentences[2:]:
                text = []
                text += text_field.preprocess(sent)
                text += ['']
                if 3 <= len(text) <= 19:
                    examples.append(data.Example.fromlist([text + [''] *(19- len(text))], fields))

        else:
            examples = examples
        super(EncoderDataset, self).__init__(
            examples, fields, **kwargs)