How to use the ludwig.utils.data_utils.replace_file_extension function in ludwig

To help you get started, we've selected a few ludwig examples based on popular ways replace_file_extension is used in public projects.
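
replace_file_extension takes a file path and a new extension and returns the path with its extension swapped. Ludwig uses it to derive the names of cache and metadata files (hdf5, json) that sit next to a raw csv. A minimal sketch of the behavior, assuming the function simply swaps the extension (the real implementation lives in ludwig/utils/data_utils.py):

import os

def replace_file_extension(file_path, extension):
    # Accept both 'json' and '.json' (an assumption about the real helper).
    extension = extension.strip()
    if extension.startswith('.'):
        extension = extension[1:]
    return os.path.splitext(file_path)[0] + '.' + extension

print(replace_file_extension('data/train.csv', 'hdf5'))   # data/train.hdf5
print(replace_file_extension('data/train.csv', '.json'))  # data/train.json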


github uber / ludwig / tests / fixtures / filenames.py
import os

from ludwig.utils.data_utils import replace_file_extension


def delete_temporary_data(csv_path):
    """
    Helper method to delete temporary data created for running tests.
    Deletes the csv file and the derived hdf5/json files (if any).
    :param csv_path: path to the csv data file
    :return: None
    """
    if os.path.isfile(csv_path):
        os.remove(csv_path)

    json_path = replace_file_extension(csv_path, 'json')
    if os.path.isfile(json_path):
        os.remove(json_path)

    hdf5_path = replace_file_extension(csv_path, 'hdf5')
    if os.path.isfile(hdf5_path):
        os.remove(hdf5_path)
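
In a test you would pair this with whatever helper creates the temporary csv, calling delete_temporary_data in the teardown. A hypothetical usage (the path is made up for illustration):

csv_path = '/tmp/ludwig_test_data.csv'  # hypothetical temporary file
# ... run a test that may also produce /tmp/ludwig_test_data.hdf5
# and /tmp/ludwig_test_data.json as preprocessing by-products ...
delete_temporary_data(csv_path)  # removes the csv and any derived files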
github uber / ludwig / ludwig / data / preprocessing.py
        concatenated_df.csv = data_train_csv
        data, train_set_metadata = build_dataset_df(
            concatenated_df,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_train_hdf5_fp = replace_file_extension(data_train_csv, 'hdf5')
            data_utils.save_hdf5(
                data_train_hdf5_fp,
                training_set,
                train_set_metadata
            )
            train_set_metadata[DATA_TRAIN_HDF5_FP] = data_train_hdf5_fp
            if validation_set is not None:
                data_validation_hdf5_fp = replace_file_extension(
                    data_validation_csv,
                    'hdf5'
                )
                data_utils.save_hdf5(
                    data_validation_hdf5_fp,
                    validation_set,
                    train_set_metadata
                )
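
The naming pattern above, in isolation: each split's cache file takes the name of the corresponding csv, so the hdf5 lands next to its source file. For example (paths hypothetical):

from ludwig.utils.data_utils import replace_file_extension

data_train_csv = 'data/train.csv'            # hypothetical input
data_validation_csv = 'data/validation.csv'  # hypothetical input

print(replace_file_extension(data_train_csv, 'hdf5'))       # data/train.hdf5
print(replace_file_extension(data_validation_csv, 'hdf5'))  # data/validation.hdf5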
github uber / ludwig / ludwig / data / preprocessing.py
        # no cached hdf5/json found: the raw data and train set metadata
        # need preprocessing
        logger.info(
            'Using full raw csv: no hdf5 and json files '
            'with the same name were found'
        )
        logger.info('Building dataset (it may take a while)')
        data, train_set_metadata = build_dataset(
            data_csv,
            features,
            preprocessing_params,
            train_set_metadata=train_set_metadata,
            random_seed=random_seed
        )
        if not skip_save_processed_input:
            logger.info('Writing dataset')
            data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')
            data_utils.save_hdf5(data_hdf5_fp, data, train_set_metadata)
            train_set_metadata[DATA_TRAIN_HDF5_FP] = data_hdf5_fp
            logger.info('Writing train set metadata with vocabulary')

            train_set_metadata_json_fp = replace_file_extension(
                data_csv,
                'json'
            )
            data_utils.save_json(
                train_set_metadata_json_fp, train_set_metadata)

        training_set, test_set, validation_set = split_dataset_tvt(
            data,
            data['split']
        )
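
After this branch runs, a dataset.csv leaves two by-products behind, both named with replace_file_extension: dataset.hdf5 (the preprocessed data) and dataset.json (the train set metadata with the vocabulary). For example (path hypothetical):

from ludwig.utils.data_utils import replace_file_extension

data_csv = 'dataset.csv'  # hypothetical path
data_hdf5_fp = replace_file_extension(data_csv, 'hdf5')                # dataset.hdf5
train_set_metadata_json_fp = replace_file_extension(data_csv, 'json')  # dataset.json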
github uber / ludwig / ludwig / data / preprocessing.py
            if (file_exists_with_diff_extension(train_fp, 'hdf5') and
                    file_exists_with_diff_extension(train_fp, 'json') and
                    file_exists_with_diff_extension(validation_fp, 'hdf5') and
                    file_exists_with_diff_extension(test_fp, 'hdf5')):
                logger.info(
                    'Found hdf5 and json with the same filenames '
                    'as the csvs, using them instead.'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    train_fp=replace_file_extension(train_fp, 'hdf5'),
                    validation_fp=replace_file_extension(
                        validation_fp,
                        'hdf5'
                    ),
                    test_fp=replace_file_extension(test_fp, 'hdf5'),
                    train_set_metadata_json=replace_file_extension(
                        train_fp,
                        'json'
                    ),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            else:
                (
                    training_set,
                    test_set,
                    validation_set,
                    train_set_metadata
                ) = _preprocess_csv_for_training(
                    features=features,
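
The file_exists_with_diff_extension helper used in the condition is not shown in these excerpts. A plausible sketch built on replace_file_extension (an assumption, not necessarily ludwig's actual code):

import os

from ludwig.utils.data_utils import replace_file_extension

def file_exists_with_diff_extension(file_fp, extension):
    # Treat a missing optional split (file_fp is None) as satisfied, so it
    # does not block cache reuse (an assumption about the intended logic).
    return file_fp is None or \
        os.path.isfile(replace_file_extension(file_fp, extension))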
github uber / ludwig / ludwig / data / preprocessing.py
        data_hdf5_fp = replace_file_extension(
            all_data_fp, 'hdf5'
        )
        model_definition['data_hdf5_fp'] = data_hdf5_fp

        if all_data_fp is not None:
            if (file_exists_with_diff_extension(all_data_fp, 'hdf5') and
                    file_exists_with_diff_extension(all_data_fp, 'json')):
                # use hdf5 data instead
                logger.info(
                    'Found hdf5 and json with the same filename '
                    'as the csv, using them instead'
                )
                return preprocess_for_training_by_type(
                    model_definition,
                    'hdf5',
                    all_data_fp=replace_file_extension(all_data_fp, 'hdf5'),
                    train_set_metadata_json=replace_file_extension(all_data_fp,
                                                                   'json'),
                    skip_save_processed_input=skip_save_processed_input,
                    preprocessing_params=preprocessing_params,
                    random_seed=random_seed
                )
            else:
                (
                    training_set,
                    test_set,
                    validation_set,
                    train_set_metadata
                ) = _preprocess_csv_for_training(
                    features=features,
                    data_csv=all_data_fp,
                    data_train_csv=None,
                    data_validation_csv=None,
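
Stripped down, the cache-reuse check in this branch amounts to the following (the result variable names are hypothetical; the condition and path derivation come from the excerpt above):

if (file_exists_with_diff_extension(all_data_fp, 'hdf5') and
        file_exists_with_diff_extension(all_data_fp, 'json')):
    # Reuse the cached preprocessed data instead of the raw csv.
    cached_hdf5_fp = replace_file_extension(all_data_fp, 'hdf5')
    cached_metadata_fp = replace_file_extension(all_data_fp, 'json')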
github uber / ludwig / ludwig / data / preprocessing.py
model_definition['input_features'],
                    model_definition['output_features'],
                    split_data=False
                )

            test_set = None
            if test_fp is not None:
                test_set = load_data(
                    test_fp,
                    model_definition['input_features'],
                    model_definition['output_features'],
                    split_data=False
                )

    elif data_type == 'csv':
        data_hdf5_fp = replace_file_extension(
            all_data_fp, 'hdf5'
        )
        model_definition['data_hdf5_fp'] = data_hdf5_fp

github uber / ludwig / ludwig / data / preprocessing.py
        (
            training_set,
            test_set,
            validation_set,
            train_set_metadata
        ) = _preprocess_df_for_training(
            features,
            all_data_df,
            train_df,
            validation_df,
            test_df,
            train_set_metadata_json=train_set_metadata_json,
            preprocessing_params=preprocessing_params,
            random_seed=random_seed
        )
    elif data_type == 'hdf5' and train_set_metadata_json is None:
        raise ValueError('train set metadata file is required when '
                         'using hdf5 data')
    elif data_type == 'hdf5':
        if all_data_fp is not None:
            data_hdf5_fp = replace_file_extension(all_data_fp, 'hdf5')
            logger.info('Using full hdf5 and json')
            training_set, test_set, validation_set = load_data(
                all_data_fp,
                model_definition['input_features'],
                model_definition['output_features'],
                shuffle_training=True
            )
            train_set_metadata = load_metadata(train_set_metadata_json)
        elif train_fp is not None:
            logger.info('Using hdf5 and json')
            training_set = load_data(
                train_fp,
                model_definition['input_features'],
                model_definition['output_features'],
                split_data=False
            )
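
The hdf5 branch shows the convention from the other direction: given the hdf5 train file, the metadata json that must accompany it shares the same stem, so its path can be derived rather than passed around. For example (path hypothetical):

from ludwig.utils.data_utils import replace_file_extension

train_fp = 'data/train.hdf5'  # hypothetical path
train_set_metadata_json = replace_file_extension(train_fp, 'json')  # data/train.json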