How to use the fairseq.tokenizer.Tokenizer.tokenize function in fairseq

To help you get started, we've selected a few fairseq examples based on popular ways fairseq.tokenizer.Tokenizer.tokenize is used in public projects.

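Every project snippet below follows the same basic call pattern, so here is a minimal sketch of it first. It assumes an older fairseq release that still ships the Tokenizer class (newer releases dropped it in favor of methods such as Dictionary.encode_line) and reuses the top-level dictionary import seen in the preprocess.py example below; the dictionary path and input sentence are placeholders.

from fairseq import dictionary, tokenizer

# Load a vocabulary produced by fairseq preprocessing (the path is a placeholder).
src_dict = dictionary.Dictionary.load('data-bin/dict.en.txt')

# Turn a whitespace-tokenized sentence into a tensor of token indices.
# add_if_not_exist=False keeps the vocabulary frozen, so unseen words map to <unk>.
ids = tokenizer.Tokenizer.tokenize('hello world', src_dict, add_if_not_exist=False).long()

print(ids)                   # token indices (an end-of-sentence index is appended by default)
print(src_dict.string(ids))  # map the indices back to a readable string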

github StillKeepTry / Transformer-PyTorch / preprocess.py
    make_all(args, make_dataset, args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
        tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
        src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1
github rgcottrell / pytorch-human-performance-gec / fairseq-scripts / interactive.py
def make_batches(lines, args, task, max_positions):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, task.source_dictionary, add_if_not_exist=False).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    itr = task.get_batch_iterator(
        dataset=data.LanguagePairDataset(tokens, lengths, task.source_dictionary),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        yield Batch(
            srcs=[lines[i] for i in batch['id']],
            tokens=batch['net_input']['src_tokens'],
            lengths=batch['net_input']['src_lengths'],
        ), batch['id']
github hhexiy / pungen / src / retriever.py
def make_batches(lines, src_dict, max_positions):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
        for src_str in lines
    ]
    lengths = np.array([t.numel() for t in tokens])
    itr = data.EpochBatchIterator(
        dataset=data.MonolingualDataset([(s[:-1], s[1:]) for s in tokens], lengths, src_dict, False),
        max_tokens=100,
        max_sentences=5,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    return itr
github hhexiy / pungen / pungen / generator.py
    def make_batches(self, templates, deleted_words, src_dict, max_positions):
        temps = [
            tokenizer.Tokenizer.tokenize(temp, src_dict, add_if_not_exist=False, tokenize=lambda x: x).long()
            for temp in templates
        ]
        deleted = [
            tokenizer.Tokenizer.tokenize(word, src_dict, add_if_not_exist=False, tokenize=lambda x: x).long()
            for word in deleted_words
        ]
        inputs = [
            {'template': temp, 'deleted': dw}
            for temp, dw in zip(temps, deleted)
        ]
        lengths = np.array([t['template'].numel() for t in inputs])
        dataset = EditDataset(inputs, lengths, src_dict, insert=self.model_args.insert, combine=self.model_args.combine)
        itr = self.task.get_batch_iterator(
                dataset=dataset,
                max_tokens=100,
                max_sentences=5,
                max_positions=max_positions,
            ).next_epoch_itr(shuffle=False)
        return itr
github hhexiy / pungen / pungen / interactive.py
def make_batches(lines, args, src_dict, max_positions, tgt_str=None, tgt_dict=None):
    tokens = [
        tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False).long()
        for src_str in lines
    ]
    if tgt_str is not None:
        tgt_tokens = [
            tokenizer.Tokenizer.tokenize(tgt_str, tgt_dict, add_if_not_exist=False).long()
        ]
    else:
        tgt_tokens = None
    lengths = np.array([t.numel() for t in tokens])
    itr = data.EpochBatchIterator(
        dataset=data.LanguagePairDataset(tokens, lengths, src_dict, tgt=tgt_tokens, tgt_sizes=None, tgt_dict=tgt_dict),
        max_tokens=args.max_tokens,
        max_sentences=args.max_sentences,
        max_positions=max_positions,
    ).next_epoch_itr(shuffle=False)
    for batch in itr:
        if tgt_str is not None:
            yield Batch(
                srcs=[lines[i] for i in batch['id']],
                tokens=batch['net_input']['src_tokens'],
                lengths=batch['net_input']['src_lengths'],
github rgcottrell / pytorch-human-performance-gec / fairseq-scripts / generate.py
                        if args.print_alignment:
                            print('A-{}\t{}'.format(
                                sample_id,
                                ' '.join(map(lambda x: str(utils.item(x)), alignment))
                            ))

                # Compare best scores
                max_fluency_score = max(hypo_fluency_score_list)
                max_idx = hypo_fluency_score_list.index(max_fluency_score)
                max_hypo_str = hypo_str_list[max_idx]
                if max_fluency_score <= best_fluency_score:
                    # Score only the top hypothesis
                    if align_dict is not None or args.remove_bpe is not None:
                        # Convert back to tokens for evaluation with unk replacement and/or without BPE
                        target_tokens = tokenizer.Tokenizer.tokenize(target_str, tgt_dict, add_if_not_exist=True)
                    max_tokens = hypo_tokens_list[max_idx]
                    scorer.add(target_tokens, max_tokens)
                    hypoths.append(max_hypo_str)
                    hypoths.append(max_hypo_str)
                    break
                else:
                    # Keep boosting
                    iteration = iteration + 1
                    curr_src_str = max_hypo_str
                    best_fluency_score = max_fluency_score
                    best_hypo_str = max_hypo_str

            wps_meter.update(src_tokens.size(0))
            t.log({'wps': round(wps_meter.avg)})
            num_sentences += 1
github hhexiy / pungen / pungen / scorer.py
    def make_batches(self, lines, src_dict, max_positions, tokenize=str.split):
        tokens = [
            tokenizer.Tokenizer.tokenize(src_str, src_dict, add_if_not_exist=False, tokenize=tokenize).long()
            for src_str in lines
        ]
        lengths = np.array([t.numel() for t in tokens])

        # Load dataset
        # MonolingualDataset[i] = source, future_target, past_target
        # all targets are effectively ignored during inference
        dataset = data.MonolingualDataset(
                dataset=[(s[:-1], s[1:], None) for s in tokens],
                sizes=lengths, src_vocab=src_dict, tgt_vocab=src_dict,
                add_eos_for_other_targets=False, shuffle=False)
        itr = self.task.get_batch_iterator(
            dataset=dataset,
            max_tokens=100,
            max_sentences=5,
            max_positions=max_positions,
github KelleyYin / Cross-lingual-Summarization / Teacher-Student / fairseq / utils.py
def post_process_prediction(hypo_tokens, src_str, alignment, align_dict, tgt_dict, remove_bpe):
    from fairseq import tokenizer
    hypo_str = tgt_dict.string(hypo_tokens, remove_bpe)
    if align_dict is not None:
        hypo_str = replace_unk(hypo_str, src_str, alignment, align_dict, tgt_dict.unk_string())
    if align_dict is not None or remove_bpe is not None:
        # Convert back to tokens for evaluating with unk replacement or without BPE
        # Note that the dictionary can be modified inside the method.
        hypo_tokens = tokenizer.Tokenizer.tokenize(hypo_str, tgt_dict, add_if_not_exist=True)
    return hypo_tokens, hypo_str, alignment