How to use the datasets.get_dataset function in datasets

To help you get started, we’ve selected a few `datasets.get_dataset` examples, based on popular ways the function is used in public open-source projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github rinkstiekema / PDF-Table-Structure-Recognition-using-deep-learning / pipeline / pdffigures2 / evaluation / datasets / build_dataset_images.py View on Github external
"-aa", "no", "-aaVector", "no", "-cropbox",
                  join(pdf_dir, pdfname), join(output_dir, doc_id + "-page")]
        else:
            args = ["pdftoppm", "-jpeg", "-r", str(dpi), "-cropbox",
                  join(pdf_dir, pdfname), join(output_dir, doc_id + "-page")]
        retcode = call(args)
        if retcode != 0:
            raise ValueError("Bad return code for <%s> (%d)", " ".join(args), retcode)

if __name__ == "__main__":
    # CLI entry point: rasterize every page of every PDF in a dataset and
    # cache the images, either as grayscale or as color renders.
    arg_parser = argparse.ArgumentParser(description='Cache rasterized page images for a dataset')
    arg_parser.add_argument("dataset", choices=datasets.DATASETS.keys(), help="target dataset")
    arg_parser.add_argument("color", choices=["gray", "color"], help="kind of images to render")
    cli = arg_parser.parse_args()

    target = datasets.get_dataset(cli.dataset)
    print("Running on dataset: " + target.name)

    # argparse `choices` already restricts cli.color to these two values;
    # the trailing exit(1) is a defensive fallback only.
    if cli.color == "gray":
        get_images(target.pdf_dir, target.page_images_gray_dir, target.IMAGE_DPI, True)
    elif cli.color == "color":
        get_images(target.pdf_dir, target.page_images_color_dir, target.COLOR_IMAGE_DPI, False)
    else:
        exit(1)
github allenai / science-parse / figure-extraction-evaluation / datasets / gather_unannotated_documents.py View on Github external
raise ValueError("Output dir must be empty")

    ignore_docs = set()
    if args.ignore_docs_in is not None:
        if isdir(args.ignore_docs_in):
            for filename in listdir(args.ignore_docs_in):
                if isdir(join(args.ignore_docs_in, filename)):
                    for sub_filename in listdir(join(args.ignore_docs_in, filename)):
                        ignore_docs.add(sub_filename.split("-page-")[0])
                else:
                    ignore_docs.add(filename.split("-page-")[0])
        else:
            raise ValueError()
        print("Found %d documents in %s, ignoring" % (len(ignore_docs), args.ignore_docs_in))

    dataset = datasets.get_dataset(args.dataset)
    pages_to_annotated = dataset.get_annotated_pages_map()
    if dataset.has_annotations():
        annotations = dataset.get_annotations("all")
    else:
        annotations = {}

    annotated_docs = annotations.keys()
    all_docs = dataset.get_doc_ids("all")
    missing_docs = list(set(all_docs) - set(annotated_docs) - ignore_docs)
    image_file_map = dataset.get_color_image_file_map()
    print("%d missing documents" % len(missing_docs))

    if args.groups:
        size = len(missing_docs) /args.groups
        groups = [missing_docs[round(i*size):round(i*size + size)] for i in range(args.groups)]
        for i,group in enumerate(groups):
github SsnL / dataset-distillation / base_options.py View on Github external
other_str = bytearray(other_t.cpu().storage().tolist()).decode(encoding="utf-8")
                    this_str = bytes.decode(encoding="utf-8")
                    raise ValueError(
                        "Rank {} opt is different from rank {}:\n".format(state.world_rank, other) +
                        utils.diff_str(this_str, other_str))

        # in case of downloading, to avoid race, let rank 0 download.
        if state.world_rank == 0:
            train_dataset = datasets.get_dataset(state, 'train')
            test_dataset = datasets.get_dataset(state, 'test')

        if not dummy and state.distributed:
            utils.distributed.barrier()

        if state.world_rank != 0:
            train_dataset = datasets.get_dataset(state, 'train')
            test_dataset = datasets.get_dataset(state, 'test')

        state.opt.train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=state.batch_size,
            num_workers=state.num_workers, pin_memory=True, shuffle=True)

        state.opt.test_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=state.test_batch_size,
            num_workers=state.num_workers, pin_memory=True, shuffle=True)

        if not dummy:
            logging.info('train dataset size:\t{}'.format(len(train_dataset)))
            logging.info('test dataset size: \t{}'.format(len(test_dataset)))
            logging.info('datasets built!')

            state.vis_queue = utils.multiprocessing.FixSizeProcessQueue(2)
github microsoft / nni / examples / nas / darts / search.py View on Github external
from nni.nas.pytorch.callbacks import ArchitectureCheckpoint, LRSchedulerCallback
from nni.nas.pytorch.darts import DartsTrainer
from utils import accuracy

logger = logging.getLogger('nni')

if __name__ == "__main__":
    parser = ArgumentParser("darts")
    parser.add_argument("--layers", default=8, type=int)
    parser.add_argument("--batch-size", default=64, type=int)
    parser.add_argument("--log-frequency", default=10, type=int)
    parser.add_argument("--epochs", default=50, type=int)
    parser.add_argument("--unrolled", default=False, action="store_true")
    args = parser.parse_args()

    dataset_train, dataset_valid = datasets.get_dataset("cifar10")

    model = CNN(32, 3, 16, 10, args.layers)
    criterion = nn.CrossEntropyLoss()

    optim = torch.optim.SGD(model.parameters(), 0.025, momentum=0.9, weight_decay=3.0E-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, args.epochs, eta_min=0.001)

    trainer = DartsTrainer(model,
                           loss=criterion,
                           metrics=lambda output, target: accuracy(output, target, topk=(1,)),
                           optimizer=optim,
                           num_epochs=args.epochs,
                           dataset_train=dataset_train,
                           dataset_valid=dataset_valid,
                           batch_size=args.batch_size,
                           log_frequency=args.log_frequency,
github lzx551402 / contextdesc / evaluations.py View on Github external
def format_data(config):
    """Run the 'post_format' stage: post-process every sample of the
    dataset's test set and emit the custom output files for it.

    Args:
        config: dict of dataset options; 'data_name' selects the dataset
            class and 'stage' is overwritten to 'post_format' here.
    """
    config['stage'] = 'post_format'
    bar = progressbar.ProgressBar()
    # get_dataset returns a class; instantiate it with the full config.
    dataset = get_dataset(config['data_name'])(**config)
    bar.max_value = dataset.data_length
    test_set = dataset.get_test_set()

    processed = 0
    while True:
        # dataset.end_set signals exhaustion; it may surface from any of
        # the calls below, so the whole body stays inside the try.
        try:
            sample = next(test_set)
            dataset.format_data(sample)
            bar.update(processed)
            processed += 1
        except dataset.end_set:
            break
github lzx551402 / contextdesc / evaluations.py View on Github external
def extract_aug_feat(config):
    """Extract augmented features."""
    prog_bar = progressbar.ProgressBar()
    config['stage'] = 'aug'
    dataset = get_dataset(config['data_name'])(**config)
    prog_bar.max_value = dataset.data_length
    test_set = dataset.get_test_set()

    model = get_model('aug_model')(config['pretrained']['loc_model'], **(config['aug_feat']))
    idx = 0
    while True:
        try:
            data = next(test_set)
            dump_path = data['dump_path'].decode('utf-8')
            aug_f = h5py.File(dump_path, 'a')
            if 'aug_feat' not in aug_f or config['aug_feat']['overwrite']:
                aug_feat, _ = model.run_test_data(data['dump_data'])
                if 'aug_feat' in aug_f:
                    del aug_f['aug_feat']
                if aug_feat.dtype == np.uint8:
                    _ = aug_f.create_dataset('aug_feat', data=aug_feat, dtype='uint8')
github Philip-Bachman / amdim-public / train.py View on Github external
def main():
    # create target output dir if it doesn't exist yet
    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    # enable mixed-precision computation if desired
    if args.amp:
        mixed_precision.enable_mixed_precision()

    # set the RNG seeds (probably more hidden elsewhere...)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    # get the dataset
    dataset = get_dataset(args.dataset)
    encoder_size = get_encoder_size(dataset)

    # get a helper object for tensorboard logging
    log_dir = os.path.join(args.output_dir, args.run_name)
    stat_tracker = StatTracker(log_dir=log_dir)

    # get dataloaders for training and testing
    train_loader, test_loader, num_classes = \
        build_dataset(dataset=dataset,
                      batch_size=args.batch_size,
                      input_dir=args.input_dir,
                      labeled_only=args.classifiers)

    torch_device = torch.device('cuda')
    checkpointer = Checkpointer(args.output_dir)
    if args.cpt_load_path:
github microsoft / nni / examples / nas / enas / search.py View on Github external
from nni.nas.pytorch import enas
from nni.nas.pytorch.callbacks import (ArchitectureCheckpoint,
                                       LRSchedulerCallback)
from utils import accuracy, reward_accuracy

logger = logging.getLogger('nni')


if __name__ == "__main__":
    parser = ArgumentParser("enas")
    parser.add_argument("--batch-size", default=128, type=int)
    parser.add_argument("--log-frequency", default=10, type=int)
    parser.add_argument("--search-for", choices=["macro", "micro"], default="macro")
    args = parser.parse_args()

    dataset_train, dataset_valid = datasets.get_dataset("cifar10")
    if args.search_for == "macro":
        model = GeneralNetwork()
        num_epochs = 310
        mutator = None
    elif args.search_for == "micro":
        model = MicroNetwork(num_layers=6, out_channels=20, num_nodes=5, dropout_rate=0.1, use_aux_heads=True)
        num_epochs = 150
        mutator = enas.EnasMutator(model, tanh_constant=1.1, cell_exit_extra_step=True)
    else:
        raise AssertionError

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), 0.05, momentum=0.9, weight_decay=1.0E-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0.001)

    trainer = enas.EnasTrainer(model,