How to use the rsmtool.preprocessor.FeaturePreprocessor class in rsmtool

To help you get started, we’ve selected a few rsmtool examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github EducationalTestingService / rsmtool / rsmtool / rsmpredict.py View on Github external
# NOTE(review): excerpt from rsmpredict.py — the enclosing function
# definition is not visible here, so `configuration`, `file_paths`,
# `file_names`, `experiment_output_dir`, `experiment_id`, `feats_file`,
# and `logger` come from the surrounding (unseen) scope.
converters = {'input_features': configuration.get_default_converter()}

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    # treat the first column of the 'feature_info' file as its index
    data_container = reader.read(kwargs_dict={'feature_info': {'index_col': 0}})

    # load the Modeler to generate the predictions
    model = Modeler.load_from_file(join(experiment_output_dir,
                                        '{}.model'.format(experiment_id)))

    # Add the model to the configuration object
    configuration['model'] = model

    # Initialize the processor
    processor = FeaturePreprocessor()

    # pre-process the read-in data in the 'rsmpredict' context
    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        feats_dir = dirname(feats_file)

        # create any directories needed for the output file
        os.makedirs(feats_dir, exist_ok=True)

        _, feats_filename = split(feats_file)
github EducationalTestingService / rsmtool / rsmtool / preprocessor.py View on Github external
# NOTE(review): excerpt from preprocessor.py — the enclosing method
# definition is not visible, so `df_filtered`, `df_excluded`,
# `length_column`, and `min_candidate_items` come from the surrounding
# (unseen) scope.
if (length_column and
            (len(df_filtered[df_filtered['length'].isnull()]) != 0 or
                df_filtered['length'].std() <= 0)):
            # a length column with missing values or zero variance is
            # unusable for length-based analyses; rename it so it is
            # preserved only among the "other" columns
            logging.warning("The {} column either has missing values or a standard "
                            "deviation <= 0. No length-based analysis will be "
                            "provided. The column will be renamed as ##{}## and "
                            "saved in *train_other_columns.csv.".format(length_column,
                                                                        length_column))
            df_filtered.rename(columns={'length': '##{}##'.format(length_column)},
                               inplace=True)

        # if requested, exclude the candidates with less than X responses
        # left after filtering
        if min_candidate_items:
            (df_filtered_candidates,
             df_excluded_candidates) = FeaturePreprocessor.select_candidates(df_filtered,
                                                                             min_candidate_items)
            # check that there are still responses left for analysis
            if len(df_filtered_candidates) == 0:
                raise ValueError("After filtering non-numeric scores and "
                                 "non-numeric feature values there were "
                                 "no candidates with {} or more responses "
                                 "left for analysis".format(min_candidate_items))

            # redefine df_filtered
            df_filtered = df_filtered_candidates.copy()

            # update df_excluded
            df_excluded = pd.concat([df_excluded, df_excluded_candidates], sort=True)

        # create separate data frames for features and sc1, all other
        # information, and responses excluded during filtering
github EducationalTestingService / rsmtool / rsmtool / modeler.py View on Github external
# NOTE(review): excerpt from modeler.py — the first line below is the
# truncated tail of a `self.predict(...)` call on the training set;
# `df_train`, `df_test`, `trim_min`, `trim_max`, `trim_tolerance`, and
# `predict_expected_scores` come from the surrounding (unseen) scope.
predict_expected=predict_expected_scores)
        df_test_predictions = self.predict(df_test,
                                           int(trim_min),
                                           int(trim_max),
                                           predict_expected=predict_expected_scores)

        # get the mean and SD of the training set predictions
        train_predictions_mean = df_train_predictions['raw'].mean()
        train_predictions_sd = df_train_predictions['raw'].std()

        # get the mean and SD of the human labels
        human_labels_mean = df_train['sc1'].mean()
        human_labels_sd = df_train['sc1'].std()

        logging.info('Processing train set predictions.')
        # rescale, trim, and round the train predictions using the
        # train-set statistics computed above
        df_train_predictions = FeaturePreprocessor.process_predictions(df_train_predictions,
                                                                       train_predictions_mean,
                                                                       train_predictions_sd,
                                                                       human_labels_mean,
                                                                       human_labels_sd,
                                                                       trim_min,
                                                                       trim_max,
                                                                       trim_tolerance)

        logging.info('Processing test set predictions.')
        # test predictions are rescaled with the SAME train-set
        # statistics so both sets share one scaling
        df_test_predictions = FeaturePreprocessor.process_predictions(df_test_predictions,
                                                                      train_predictions_mean,
                                                                      train_predictions_sd,
                                                                      human_labels_mean,
                                                                      human_labels_sd,
                                                                      trim_min,
                                                                      trim_max,
github EducationalTestingService / rsmtool / rsmtool / rsmtool.py View on Github external
# NOTE(review): excerpt from rsmtool.py — the first line below is the
# truncated tail of an error/log message about missing files;
# `configuration`, `file_paths`, `file_names`, and `logger` come from
# the surrounding (unseen) scope.
'{}'.format(repr(missing_file_paths)))

    # Use the default converter for both train and test
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing all features.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    # pre-process the read-in data (default context)
    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container)

    # Rename certain frames with more descriptive names
    # for writing out experiment files
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    logger.info('Saving training and test set data to disk.')

    # Write out files
github EducationalTestingService / rsmtool / rsmtool / rsmeval.py View on Github external
# NOTE(review): excerpt from rsmeval.py — the first line below is the
# truncated tail of a tuple unpacking; `configuration`, `paths`,
# `names`, `logger`, `writer`, `csvdir`, and `file_format` come from
# the surrounding (unseen) scope.
file_paths) = configuration.get_names_and_paths(paths, names)

    file_paths = DataReader.locate_files(file_paths, configuration.configdir)

    converters = {'predictions': configuration.get_default_converter()}

    logger.info('Reading predictions: {}.'.format(configuration['predictions_file']))

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing predictions.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    # pre-process the read-in data in the 'rsmeval' context
    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container,
                                                   context='rsmeval')

    logger.info('Saving pre-processed predictions and metadata to disk.')
    # write the processed frames out under more descriptive names
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   new_names_dict={'pred_test':
                                                   'pred_processed',
                                                   'test_excluded':
                                                   'test_excluded_responses'},
                                   file_format=file_format)

    # Initialize the analyzer
github EducationalTestingService / rsmtool / rsmtool / preprocessor.py View on Github external
df_pred_processed : pd.DataFrame
            Data frame containing the various trimmed
            and rounded predictions.
        """
        # NOTE(review): excerpt — the `def` line and the head of the
        # docstring sit above this fragment; `df_test_predictions`,
        # `train_predictions_mean`, `train_predictions_sd`,
        # `human_labels_mean`, `human_labels_sd`, `trim_min`,
        # `trim_max`, and `trim_tolerance` are presumably the method's
        # parameters (matches the call sites elsewhere in this page).

        # rescale the test set predictions by boosting
        # them to match the human mean and SD
        # (z-score against the train predictions, then map onto the
        # human-label distribution)
        scaled_test_predictions = (df_test_predictions['raw'] -
                                   train_predictions_mean) / train_predictions_sd
        scaled_test_predictions = scaled_test_predictions * human_labels_sd + human_labels_mean

        # work on a copy so the caller's frame is not mutated
        df_pred_process = df_test_predictions.copy()
        df_pred_process['scale'] = scaled_test_predictions

        # trim and round the predictions before running the analyses
        df_pred_process['raw_trim'] = FeaturePreprocessor.trim(df_pred_process['raw'],
                                                               trim_min,
                                                               trim_max,
                                                               trim_tolerance)

        # round to the nearest integer and store as int64
        df_pred_process['raw_trim_round'] = np.rint(df_pred_process['raw_trim'])
        df_pred_process['raw_trim_round'] = df_pred_process['raw_trim_round'].astype('int64')

        # same trim/round treatment for the rescaled predictions
        df_pred_process['scale_trim'] = FeaturePreprocessor.trim(df_pred_process['scale'],
                                                                 trim_min,
                                                                 trim_max,
                                                                 trim_tolerance)

        df_pred_process['scale_trim_round'] = np.rint(df_pred_process['scale_trim'])
        df_pred_process['scale_trim_round'] = df_pred_process['scale_trim_round'].astype('int64')

        return df_pred_process