How to use the torchtext.data.Pipeline function in torchtext

To help you get started, we’ve selected a few torchtext.data.Pipeline examples, based on popular ways it is used in public projects.

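Before working through the examples, here is a minimal sketch of what data.Pipeline does, assuming the legacy torchtext.data API (torchtext < 0.9; the same classes later moved to torchtext.legacy.data). A Pipeline wraps a conversion function and applies it to a single string or, element-wise, to a list of strings; add_before and add_after chain further pipelines around it.

from torchtext import data  # torchtext.legacy.data on torchtext >= 0.9

# A Pipeline wraps a callable and applies it to a string,
# or element-wise to a list of strings.
lower = data.Pipeline(str.lower)
assert lower("HeLLo") == "hello"
assert lower(["Foo", "BAR"]) == ["foo", "bar"]

# add_before runs the given pipeline first; add_after would run it last.
pipeline = data.Pipeline(str.lower)
pipeline.add_before(data.Pipeline(str.strip))
assert pipeline("  HeLLo  ") == "hello"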

github pytorch / text / test / data / test_pipeline.py
def test_composition(self):
        id_pipeline = data.Pipeline()
        pipeline = data.Pipeline(TestPipeline.repeat_n)
        pipeline.add_before(id_pipeline)
        pipeline.add_after(id_pipeline)
        pipeline.add_before(six.text_type.lower)
        pipeline.add_after(six.text_type.capitalize)

        other_pipeline = data.Pipeline(six.text_type.swapcase)
        other_pipeline.add_before(pipeline)

        # Assert pipeline gives proper results after composition
        # (test that we aren't modifying the pipes member)
        assert pipeline("teST") == "Testtesttest"
        assert pipeline(["ElE1", "eLe2"]) == ["Ele1ele1ele1", "Ele2ele2ele2"]

        # Assert pipeline that we added to gives proper results
        assert other_pipeline("teST") == "tESTTESTTEST"
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / loaddata / mydatasets_self_five.py
string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)

            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = os.path.join(path, file)
            examples = []
            with open(path) as f:
                a, b, c, d, e = 0, 0, 0, 0, 0
                for line in f:
                    sentence, flag = line.strip().split(' ||| ')
                    if char_data is True:
                        sentence = sentence.split(" ")
                        sentence = MR.char_data(self, sentence)
                    # print(sentence)
                    # clear string in every sentence
                    sentence = clean_str(sentence)
                    if line[-2] == '0':
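The pattern here is the most common one for data.Pipeline: wrap an ordinary function (clean_str) and assign it to text_field.preprocessing, so the regex cleanup runs automatically on every example the field processes. Note that this loader also calls clean_str directly while reading lines, so the normalization is applied eagerly as well.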
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / loaddata / mydatasets_self_two.py
string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)

            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = os.path.join(path, file)
            examples = []
            with open(path) as f:
                a, b = 0, 0
                for line in f.readlines():
                    sentence, flag = line.strip().split(' ||| ')
                    if char_data is True:
                        sentence = sentence.split(" ")
                        sentence = MR.char_data(self, sentence)
                    # print(sentence)
                    # clear string in every sentence
                    sentence = clean_str(sentence)
                    if line[-2] == '0':
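This loader from mydatasets_self_two.py is the two-class variant of the five-class loader above; the data.Pipeline usage is identical.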
github kolloldas / torchnlp / torchnlp / data / nyt.py
        train_file: Train filename
        validation_file: Validation filename
        test_file: Test filename
        convert_digits: If True will convert numbers to single 0's

    Returns:
        A dict containing:
            task: 'nyt_ingredients.ner'
            iters: (train iter, validation iter, test iter)
            vocabs: (Inputs word vocabulary, Inputs character vocabulary,
                    Tag vocabulary)
    """
    
    # Setup fields with batch dimension first
    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
                             preprocessing=data.Pipeline(
                                 lambda w: '0' if convert_digits and w.isdigit() else w))

    inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>",
                                     batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>", eos_token="<eos>")

    labels = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)

    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)), 
                ('labels', labels)])

    # Load the data
    if use_local:
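Here the preprocessing Pipeline wraps a lambda instead of a named function; any callable works. A small hedged check of the digit-normalization behavior (the token values are illustrative):

convert_digits = True
digits = data.Pipeline(lambda w: '0' if convert_digits and w.isdigit() else w)
assert digits(["bake", "350", "degrees"]) == ["bake", "0", "degrees"]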
github pytorch / text / torchtext / datasets / trec.py
        Arguments:
            path: Path to the data file.
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            fine_grained: Whether to use the fine-grained (50-class) version of TREC
                or the coarse-grained (6-class) version.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        fields = [('text', text_field), ('label', label_field)]
        examples = []

        def get_label_str(label):
            return label.split(':')[0] if not fine_grained else label
        label_field.preprocessing = data.Pipeline(get_label_str)

        for line in open(os.path.expanduser(path), 'rb'):
            # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
            label, _, text = line.replace(b'\xf0', b' ').decode().partition(' ')
            examples.append(data.Example.fromlist([text, label], fields))

        super(TREC, self).__init__(examples, fields, **kwargs)
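For TREC, the Pipeline is attached to label_field.preprocessing rather than to the text field: get_label_str keeps only the coarse category, so a fine-grained label such as 'LOC:city' becomes 'LOC' unless fine_grained is set. Restated as a standalone Pipeline (with fine_grained fixed to False for illustration):

coarse = data.Pipeline(lambda label: label.split(':')[0])
assert coarse('LOC:city') == 'LOC'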
github castorini / hedwig / sm_cnn / train.py
args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("You have Cuda but you're using CPU for training.")
np.random.seed(args.seed)
random.seed(args.seed)

QID = data.Field(sequential=False)
QUESTION = data.Field(batch_first=True)
ANSWER = data.Field(batch_first=True)
LABEL = data.Field(sequential=False)
EXTERNAL = data.Field(sequential=True, tensor_type=torch.FloatTensor, batch_first=True, use_vocab=False,
            postprocessing=data.Pipeline(lambda arr, _, train: [float(y) for y in arr]))

if config.dataset == 'TREC':
    train, dev, test = TrecDataset.splits(QID, QUESTION, ANSWER, EXTERNAL, LABEL)
elif config.dataset == 'wiki':
    train, dev, test = WikiDataset.splits(QID, QUESTION, ANSWER, EXTERNAL, LABEL)
else:
    print("Unsupported dataset")
    exit()

QID.build_vocab(train, dev, test)
QUESTION.build_vocab(train, dev, test)
ANSWER.build_vocab(train, dev, test)
LABEL.build_vocab(train, dev, test)


QUESTION = set_vectors(QUESTION, args.vector_cache)
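In this hedwig training script, the Pipeline appears as a postprocessing step instead: because EXTERNAL uses use_vocab=False, its raw batch values reach the lambda as strings, and the lambda casts each one to float. The three-argument signature (arr, _, train) matches older torchtext releases, which passed a train flag to postprocessing; later legacy releases pass only the batch values and the vocabulary.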
github ari-holtzman / l2w / trainers / train_classifier.py
parser.add_argument('--fix_embeddings',
                    action='store_true',
                    help='fix word embeddings')
# Output Parameters
parser.add_argument('--valid_every',
                    type=int,
                    default=128,
                    help='batch interval for running validation')
parser.add_argument('-p',
                    action='store_true',
                    help='use this flag to print samples of the data')
args = parser.parse_args()

TEXT = data.Field(sequential=True, lower=True, include_lengths=True)

LABEL = data.Field(sequential=False, use_vocab=False, tensor_type=torch.FloatTensor, postprocessing=data.Pipeline(lambda x, y: float(x)))

if args.valid_only:
    train_name = 'valid.tsv'
else:
    train_name = 'disc_train.tsv'

print('Reading the data')
train, valid = data.TabularDataset.splits(
    path=args.data_dir,
    train=train_name, validation='valid.tsv',
    format='tsv',
    fields=[
        ('context', TEXT),
        ('generated', TEXT),
        ('gold', TEXT),
        ])
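The LABEL field here shows the two-argument postprocessing form (x is one batch value, y the vocabulary), again with use_vocab=False, used to cast label strings to floats. tensor_type=torch.FloatTensor is the older spelling of the Field argument that was later renamed dtype.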
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / loaddata / mydatasets.py
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with open(os.path.join(path, 'rt-polarity.neg')) as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields) for line in f]
            with open(os.path.join(path, 'rt-polarity.pos')) as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields) for line in f]
        super(MR, self).__init__(examples, fields, **kwargs)
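This MR (movie review) loader rounds out the pattern: a regex-based clean_str wrapped in data.Pipeline as text_field.preprocessing, with one Example built per line and the label determined by which polarity file the line came from.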
github bamtercelboo / cnn-lstm-bilstm-deepcnn-clstm-in-pytorch / loaddata / mydatasets_self_twitter.py
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = os.path.join(path, file)
            print("loading {}... ".format(path))
            examples = []
            with open(path) as f:
                for line in f.readlines():
                    if line[-2] == '0':
                        examples += [data.Example.fromlist([line[:line.find('|')], 'negative'], fields=fields)]
                    elif line[-2] == '1':
                        examples += [data.Example.fromlist([line[:line.find('|')], 'positive'], fields=fields)]
        super(MR, self).__init__(examples, fields, **kwargs)
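Across all of these projects the usage reduces to a single idea: data.Pipeline turns any callable into a reusable preprocessing or postprocessing hook for a Field. A condensed, hedged sketch of that pattern, assuming the legacy torchtext.data API (field names are illustrative):

from torchtext import data  # torchtext.legacy.data on torchtext >= 0.9

TEXT = data.Field(sequential=True, lower=True,
                  preprocessing=data.Pipeline(str.strip))
LABEL = data.Field(sequential=False,
                   preprocessing=data.Pipeline(lambda s: s.split(':')[0]))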