How to use the fairseq.data.LanguagePairDataset class in fairseq

To help you get started, we’ve selected a few fairseq examples based on popular ways LanguagePairDataset is used in public projects.

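Every example below follows the same basic pattern: pre-tokenized source (and optionally target) sentences, their lengths, and a fairseq Dictionary are wrapped in a LanguagePairDataset, and its collater turns a list of samples into a padded batch. The minimal sketch below illustrates that pattern with placeholder data; names and defaults assume a recent fairseq release, so treat it as an outline rather than a drop-in snippet.

# Minimal sketch (placeholder data, assumed recent fairseq API): wrap
# pre-tokenized source sentences in a LanguagePairDataset and collate a batch.
import torch
from fairseq.data import Dictionary, LanguagePairDataset

src_dict = Dictionary()                                   # toy dictionary
hello, world = src_dict.add_symbol("hello"), src_dict.add_symbol("world")

# One 1-D LongTensor of token indices per sentence, terminated by EOS.
src_tokens = [
    torch.LongTensor([hello, world, src_dict.eos()]),
    torch.LongTensor([world, src_dict.eos()]),
]
src_sizes = [t.numel() for t in src_tokens]

dataset = LanguagePairDataset(src_tokens, src_sizes, src_dict)
batch = dataset.collater([dataset[i] for i in range(len(dataset))])
print(batch["net_input"]["src_tokens"])                   # padded (batch, time) tensor
print(batch["net_input"]["src_lengths"])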

github StillKeepTry / Transformer-PyTorch / fairseq / models / fconv.py
max_positions=1024, convolutions=((512, 3),) * 20, dropout=0.1):
        super().__init__(dictionary)
        self.dropout = dropout
        self.num_attention_layers = None

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if embed_dict:
            self.embed_tokens = utils.load_embedding(embed_dict, self.dictionary, self.embed_tokens)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            padding_idx,
            left_pad=LanguagePairDataset.LEFT_PAD_SOURCE,
        )

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            self.projections.append(Linear(in_channels, out_channels)
                                    if in_channels != out_channels else None)
            if kernel_size % 2 == 1:
                padding = kernel_size // 2
            else:
                padding = 0
            self.convolutions.append(
                ConvTBC(in_channels, out_channels * 2, kernel_size,
                        dropout=dropout, padding=padding)
github freewym / espresso / tests / test_train.py
def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch):
    tokens = torch.LongTensor(list(range(epoch_size))).view(1, -1)
    tokens_ds = data.TokenBlockDataset(
        tokens, sizes=[tokens.size(-1)], block_size=1, pad=0, eos=1, include_targets=False,
    )
    trainer = mock_trainer(epoch, num_updates, iterations_in_epoch)
    dataset = data.LanguagePairDataset(tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False)
    epoch_itr = data.EpochBatchIterator(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=[[i] for i in range(epoch_size)],
    )
    return trainer, epoch_itr
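
In the test above, the dataset is wrapped in an EpochBatchIterator, which is how fairseq walks a LanguagePairDataset epoch by epoch. A rough, self-contained sketch of how such an iterator is consumed (placeholder data; the iterators module path assumes a recent fairseq release, whereas the test uses the older data.EpochBatchIterator alias):

# Sketch (assumed recent fairseq API, placeholder data): iterate a
# LanguagePairDataset one sentence per batch via EpochBatchIterator.
import torch
from fairseq.data import Dictionary, LanguagePairDataset, iterators

d = Dictionary()
sents = [
    torch.LongTensor([d.add_symbol(w) for w in line.split()] + [d.eos()])
    for line in ["a b c", "d e"]
]
dataset = LanguagePairDataset(sents, [t.numel() for t in sents], d)

epoch_itr = iterators.EpochBatchIterator(
    dataset=dataset,
    collate_fn=dataset.collater,
    batch_sampler=[[i] for i in range(len(dataset))],   # one sentence per batch
)
for batch in epoch_itr.next_epoch_itr(shuffle=False):
    print(batch["net_input"]["src_tokens"].shape)
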
github freewym / espresso / fairseq / tasks / semisupervised_translation.py
        def language_pair_dataset(lang_pair):
            src, tgt = lang_pair.split('-')
            src_dataset, tgt_dataset = src_datasets[lang_pair], tgt_datasets[lang_pair]
            return self.alter_dataset_langtok(
                LanguagePairDataset(
                    src_dataset, src_dataset.sizes, self.dicts[src],
                    tgt_dataset, tgt_dataset.sizes, self.dicts[tgt],
                    left_pad_source=self.args.left_pad_source,
                    left_pad_target=self.args.left_pad_target,
                    max_source_positions=self.args.max_source_positions,
                    max_target_positions=self.args.max_target_positions,
                ),
                self.dicts[src].eos(),
                src,
                self.dicts[tgt].eos(),
                tgt,
            )
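
This example supplies both sides of the pair along with explicit padding options read from args. Outside a task, the same bilingual call looks roughly like the sketch below (placeholder data; max_source_positions and max_target_positions are accepted by the fairseq version used above but are omitted here, since newer releases handle length limits outside the dataset constructor):

# Sketch (placeholder data, assumed recent fairseq API): a bilingual
# LanguagePairDataset with explicit padding sides, mirroring the call above.
import torch
from fairseq.data import Dictionary, LanguagePairDataset

src_dict, tgt_dict = Dictionary(), Dictionary()
src = [torch.LongTensor([src_dict.add_symbol("hund"), src_dict.eos()])]
tgt = [torch.LongTensor([tgt_dict.add_symbol("dog"), tgt_dict.eos()])]

pair = LanguagePairDataset(
    src, [t.numel() for t in src], src_dict,
    tgt, [t.numel() for t in tgt], tgt_dict,
    left_pad_source=True,    # source is padded on the left by convention
    left_pad_target=False,   # target is padded on the right
)
batch = pair.collater([pair[0]])
print(batch["target"], batch["net_input"]["prev_output_tokens"])
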
github StillKeepTry / Transformer-PyTorch / fairseq / models / dpn.py
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(max_positions, embed_dim, padding_idx,
                                                       left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(embed_dim, padding_idx,
                                                                 left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers
        self.attnpath = AttnPathEncoder(self.layers, num_heads=num_heads,
                                        filter_size=filter_size, hidden_size=hidden_size,
                                        dropout=dropout, attention_dropout=attention_dropout,
                                        relu_dropout=relu_dropout)
        self.cnnpath = CNNPathEncoder(self.layers, hidden_size=hidden_size, dropout=dropout,
                                      in_embed=hidden_size, out_embed=hidden_size)
github freewym / espresso / fairseq / data.py
        def merge(key, left_pad, move_eos_to_beginning=False):
            return LanguagePairDataset.collate_tokens(
                [s[key] for s in samples],
                pad_idx, eos_idx, left_pad, move_eos_to_beginning,
            )
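
The merge helper above relies on LanguagePairDataset.collate_tokens, a static method found only in older fairseq releases. In current releases the equivalent utility lives in fairseq.data.data_utils; a small sketch of that call with placeholder token ids:

# Sketch (placeholder ids, assumed recent fairseq API): the modern home of
# collate_tokens is fairseq.data.data_utils rather than LanguagePairDataset.
import torch
from fairseq.data import data_utils

padded = data_utils.collate_tokens(
    [torch.LongTensor([4, 5, 2]), torch.LongTensor([6, 2])],
    pad_idx=1,
    eos_idx=2,
    left_pad=False,
    move_eos_to_beginning=False,
)
print(padded)   # 2 x 3 LongTensor; the shorter row is right-padded with 1
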
github pytorch / translate / pytorch_translate / tasks / pytorch_translate_task.py
            if self.char_source_dict is not None:
                char_data_class = (
                    char_data.LanguagePairCharDataset
                    if self.char_target_dict is not None
                    else char_data.LanguagePairSourceCharDataset
                )
                datasets[key] = char_data_class(
                    src=src_dataset,
                    src_sizes=src_sizes,
                    src_dict=self.source_dictionary,
                    tgt=tgt_dataset,
                    tgt_sizes=tgt_dataset.sizes,
                    tgt_dict=self.target_dictionary,
                )
            else:
                datasets[key] = LanguagePairDataset(
                    src=src_dataset,
                    src_sizes=src_sizes,
                    src_dict=self.source_dictionary,
                    tgt=tgt_dataset,
                    tgt_sizes=tgt_dataset.sizes,
                    tgt_dict=self.target_dictionary,
                    left_pad_source=False,
                )
        total_line_count = sum(len(datasets[key]) for key in datasets)
        if dataset_relative_ratio:
            ds, ratio = dataset_relative_ratio
            line_count = len(datasets[ds])
            # By definition ratio = u * line_count / sum(#lines of other datasets)
            u = (total_line_count - line_count) / line_count * ratio
            dataset_upsampling = {key: u}
        elif not dataset_upsampling:
github NVIDIA / DeepLearningExamples / PyTorch / Translation / Transformer / interactive.py
def make_batches(lines, args, src_dict, max_positions, bpe=None):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, src_dict, tokenize=tokenizer.tokenize_en, add_if_not_exist=False, bpe=bpe).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens, lengths, src_dict),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        yield Batch(
            srcs=[lines[i] for i in batch['id']],
            tokens=batch['net_input']['src_tokens'],
            lengths=batch['net_input']['src_lengths'],
        ), batch['id']
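
Here the repository's Tokenizer helper does the string-to-tensor conversion. In current fairseq the same source-only pattern is usually written with Dictionary.encode_line instead; a sketch with a toy dictionary:

# Sketch (toy dictionary, assumed recent fairseq API): build the same kind of
# source-only LanguagePairDataset with Dictionary.encode_line.
import numpy as np
from fairseq.data import Dictionary, LanguagePairDataset

src_dict = Dictionary()
for w in "hello world".split():
    src_dict.add_symbol(w)

lines = ["hello world", "world world hello"]
tokens = [src_dict.encode_line(line, add_if_not_exist=False).long() for line in lines]
lengths = np.array([t.numel() for t in tokens])
dataset = LanguagePairDataset(tokens, lengths, src_dict)
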
github hhexiy / pungen / pungen / interactive.py
def make_batches(lines, args, src_dict, max_positions, tgt_str=None, tgt_dict=None):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
        for src_str in lines
    ]
    if tgt_str is not None:
        tgt_tokens = [
            tokenizer.Tokenizer.tokenize(tgt_str, tgt_dict, add_if_not_exist=False).long()
        ]
    else:
        tgt_tokens = None
    lengths = np.array([t.numel() for t in tokens])
    itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens, lengths, src_dict, tgt=tgt_tokens, tgt_sizes=None, tgt_dict=tgt_dict),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        if tgt_str is not None:
            yield Batch(
                srcs=[lines[i] for i in batch['id']],
                tokens=batch['net_input']['src_tokens'],
                lengths=batch['net_input']['src_lengths'],
                prefix=batch['target'][:, :3],
            ), batch['id']
        else:
            yield Batch(
                srcs=[lines[i] for i in batch['id']],
                tokens=batch['net_input']['src_tokens'],
github StillKeepTry / Transformer-PyTorch / fairseq / models / dpn.py
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(max_positions, embed_dim, padding_idx,
                                                       left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(embed_dim, padding_idx,
                                                                 left_pad=LanguagePairDataset.LEFT_PAD_TARGET)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.encdec_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        self.norm3_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(MultiheadAttentionDecoder(hidden_size,
                                                                        hidden_size,
                                                                        hidden_size,
                                                                        num_heads))
            self.ffn_blocks.append(FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
github StillKeepTry / Transformer-PyTorch / fairseq / models / transformer.py
dropout=0.1, attention_dropout=0.1, relu_dropout=0.1, share_embed=False):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(max_positions, embed_dim, padding_idx,
                                                       left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(embed_dim, padding_idx,
                                                                 left_pad=LanguagePairDataset.LEFT_PAD_TARGET)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.encdec_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        self.norm3_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(MultiheadAttentionDecoder(hidden_size,
                                                                        hidden_size,
                                                                        hidden_size,