How to use the reader.Dataset function in reader

To help you get started, we’ve selected a few reader.Dataset examples, based on popular ways it is used in public projects.

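The examples below share a common shape: build a project-local reader.Dataset, wrap one of its sample-level readers with paddle.batch, and iterate over the resulting batch reader. The following sketch condenses that pattern from the deep_fm and ctr examples; it assumes the legacy PaddlePaddle v2 API, a reader module whose Dataset takes no constructor arguments and exposes an infer(data_path) reader creator, and a placeholder data path.

import paddle.v2 as paddle  # legacy v2 API used by the examples below (assumption)
import reader               # project-local reader module providing Dataset

paddle.init(use_gpu=False, trainer_count=1)

# The Dataset is constructed without arguments; the data path goes to the reader creator.
dataset = reader.Dataset()

# paddle.batch wraps the sample-level reader into a batch-level reader.
infer_reader = paddle.batch(dataset.infer("./data/infer.txt"),  # placeholder path
                            batch_size=1000)

for batch_id, batch in enumerate(infer_reader()):
    # each batch is a list of samples, ready to feed to a trainer or inferer
    pass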

github PaddlePaddle / models / legacy / deep_fm / infer.py
def infer():
    args = parse_args()

    paddle.init(use_gpu=False, trainer_count=1)

    model = DeepFM(args.factor_size, infer=True)

    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(args.model_gz_path, 'r'))

    inferer = paddle.inference.Inference(
        output_layer=model, parameters=parameters)

    dataset = reader.Dataset()

    infer_reader = paddle.batch(dataset.infer(args.data_path), batch_size=1000)

    with open(args.prediction_output_path, 'w') as out:
        for id, batch in enumerate(infer_reader()):
            res = inferer.infer(input=batch)
            predictions = [x for x in itertools.chain.from_iterable(res)]
            out.write('\n'.join(map(str, predictions)) + '\n')
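In this example, reader.Dataset() is constructed with no arguments and the data path is passed to dataset.infer(args.data_path), which returns a sample-level reader that paddle.batch groups into batches of 1000. The DSSM and lexical_analysis examples below instead pass data paths or the parsed command-line args to the Dataset constructor itself.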
github Kejie-Wang / End-to-End-Learning-for-Self-Driving-Cars / reader.py
        validation_filename = os.path.join(data_dir, "validation.npy")
        test_filename = os.path.join(data_dir, "test.npy")

        if os.path.exists(train_filename) and os.path.exists(
                validation_filename) and os.path.exists(test_filename):
            train_data = np.load(train_filename)
            validation_data = np.load(validation_filename)
            test_data = np.load(test_filename)
        else:
            print(
                "Data does NOT exist. Please check that the data directory exists and run split_dataset.py before training."
            )
            exit(0)

        train_images, train_labels = zip(*train_data)
        self._train = Dataset(train_images, train_labels)

        validation_images, validation_labels = zip(*validation_data)
        self._validation = Dataset(validation_images, validation_labels)

        test_images, test_labels = zip(*test_data)
        self._test = Dataset(test_images, test_labels)
github shibing624 / python-tutorial / 16paddle / dssm / train.py
    train DSSM
    """
    default_train_paths = ["./data/classification/train/right.txt",
                           "./data/classification/train/wrong.txt"]
    default_test_paths = ["./data/classification/test/right.txt",
                          "./data/classification/test/wrong.txt"]
    default_dic_path = "./data/vocab.txt"
    layer_dims = [int(i) for i in config.config['dnn_dims'].split(',')]
    use_default_data = not train_data_paths
    if use_default_data:
        train_data_paths = default_train_paths
        test_data_paths = default_test_paths
        source_dic_path = default_dic_path
        target_dic_path = default_dic_path

    dataset = reader.Dataset(
        train_paths=train_data_paths,
        test_paths=test_data_paths,
        source_dic_path=source_dic_path,
        target_dic_path=target_dic_path
    )

    train_reader = paddle.batch(paddle.reader.shuffle(dataset.train, buf_size=1000),
                                batch_size=batch_size)
    test_reader = paddle.batch(paddle.reader.shuffle(dataset.test, buf_size=1000),
                               batch_size=batch_size)
    paddle.init(use_gpu=use_gpu, trainer_count=num_workers)

    # DSSM
    cost, prediction, label = DSSM(
        dnn_dims=layer_dims,
        vocab_sizes=[len(load_dic(path)) for path in [source_dic_path, target_dic_path]],
github PaddlePaddle / models / legacy / deep_fm / train.py
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    paddle.init(use_gpu=False, trainer_count=1)

    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)

    model = DeepFM(args.factor_size)

    params = paddle.parameters.create(model)

    trainer = paddle.trainer.SGD(cost=model,
                                 parameters=params,
                                 update_equation=optimizer)

    dataset = reader.Dataset()

    def __event_handler__(event):
        if isinstance(event, paddle.event.EndIteration):
            num_samples = event.batch_id * args.batch_size
            if event.batch_id % 100 == 0:
                logger.warning("Pass %d, Batch %d, Samples %d, Cost %f, %s" %
                               (event.pass_id, event.batch_id, num_samples,
                                event.cost, event.metrics))

            if event.batch_id % 10000 == 0:
                if args.test_data_path:
                    result = trainer.test(
                        reader=paddle.batch(
                            dataset.test(args.test_data_path),
                            batch_size=args.batch_size),
                        feeding=reader.feeding)
github PaddlePaddle / models / PaddleNLP / lexical_analysis / eval.py
def do_eval(args):
    dataset = reader.Dataset(args)

    test_program = fluid.Program()
    with fluid.program_guard(test_program, fluid.default_startup_program()):
        with fluid.unique_name.guard():
            test_ret = creator.create_model(
                args, dataset.vocab_size, dataset.num_labels, mode='test')
    test_program = test_program.clone(for_test=True)

    # init executor
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    else:
        place = fluid.CPUPlace()

    pyreader = creator.create_pyreader(args, file_name=args.test_data,
                                       feed_list=test_ret['feed_list'],
github PaddlePaddle / models / legacy / dssm / train.py
    default_train_path = "./data/rank/train.txt"
    default_test_path = "./data/rank/test.txt"
    default_dic_path = "./data/vocab.txt"
    if not model_type.is_rank():
        default_train_path = "./data/classification/train.txt"
        default_test_path = "./data/classification/test.txt"

    use_default_data = not train_data_path

    if use_default_data:
        train_data_path = default_train_path
        test_data_path = default_test_path
        source_dic_path = default_dic_path
        target_dic_path = default_dic_path

    dataset = reader.Dataset(
        train_path=train_data_path,
        test_path=test_data_path,
        source_dic_path=source_dic_path,
        target_dic_path=target_dic_path,
        model_type=model_type, )

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            dataset.train, buf_size=1000),
        batch_size=batch_size)

    test_reader = paddle.batch(
        paddle.reader.shuffle(
            dataset.test, buf_size=1000),
        batch_size=batch_size)
github PaddlePaddle / models / PaddleNLP / lexical_analysis / inference_model.py
def save_inference_model(args):

    # model definition
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    else:
        place = fluid.CPUPlace()
    dataset = reader.Dataset(args)
    infer_program = fluid.Program()
    with fluid.program_guard(infer_program, fluid.default_startup_program()):
        with fluid.unique_name.guard():

            infer_ret = creator.create_model(
                args, dataset.vocab_size, dataset.num_labels, mode='infer')
            infer_program = infer_program.clone(for_test=True)


    # load pretrain check point
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    utils.init_checkpoint(exe, args.init_checkpoint, infer_program)

    fluid.io.save_inference_model(args.inference_save_dir,
                                  ['words'],
github PaddlePaddle / models / legacy / dssm / infer.py
    def infer(self, data_path):
        dataset = reader.Dataset(
            train_path=data_path,
            test_path=None,
            source_dic_path=args.source_dic_path,
            target_dic_path=args.target_dic_path,
            model_type=args.model_type, )
        infer_reader = paddle.batch(dataset.infer, batch_size=1000)
        logger.warning("Write predictions to %s." % args.prediction_output_path)

        output_f = open(args.prediction_output_path, "w")

        for id, batch in enumerate(infer_reader()):
            res = self.inferer.infer(input=batch)
            predictions = [" ".join(map(str, x)) for x in res]
            assert len(batch) == len(predictions), (
                "Error! %d inputs are given, "
                "but only %d predictions are returned.") % (len(batch),
github PaddlePaddle / models / legacy / ctr / infer.py
    def infer(self, data_path):
        logger.info("infer data...")
        dataset = reader.Dataset()
        infer_reader = paddle.batch(
            dataset.infer(args.data_path), batch_size=1000)
        logger.warning('write predictions to %s' % args.prediction_output_path)
        output_f = open(args.prediction_output_path, 'w')
        for id, batch in enumerate(infer_reader()):
            res = self.inferer.infer(input=batch)
            predictions = [x for x in itertools.chain.from_iterable(res)]
            assert len(batch) == len(
                predictions), "predict error, %d inputs, but %d predictions" % (
                    len(batch), len(predictions))
            output_f.write('\n'.join(map(str, predictions)) + '\n')
github lca4 / collaborative-rnn / crnn.py
def main(args):
    # Read (and optionally, truncate) the training and validation data.
    train_data = Dataset.from_path(args.train_path)
    if args.max_train_chunks is not None:
        size = args.max_train_chunks * args.chunk_size
        train_data.truncate_seqs(size)
    valid_data = Dataset.from_path(args.valid_path)
    if args.max_valid_chunks is not None:
        size = args.max_valid_chunks * args.chunk_size
        valid_data.truncate_seqs(size, keep_first=True)

    num_users = train_data.num_users
    num_items = train_data.num_items
    tot_size = train_data.num_triplets + valid_data.num_triplets

    train_data.prepare_batches(args.chunk_size, args.batch_size)
    valid_data.prepare_batches(args.chunk_size, args.batch_size,
            batches_like=train_data)

    settings = {
        "chunk_size": args.chunk_size,
        "batch_size": args.batch_size,
        "hidden_size": args.hidden_size,