How to use the rsmtool.reader.DataReader.read_from_file function in rsmtool

To help you get started, we’ve selected a few examples of DataReader.read_from_file based on popular ways rsmtool is used in public projects.

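In the excerpts below, DataReader.read_from_file is always called on the class itself and returns a pandas DataFrame; keyword arguments such as index_col and converters are forwarded to the underlying pandas reader, and the file format is inferred from the file extension. A minimal sketch, assuming a local CSV file named scores.csv with an ID column named spkitemid (both placeholders):

from rsmtool.reader import DataReader

# read a tabular file into a DataFrame; the first column becomes the index
df = DataReader.read_from_file('scores.csv', index_col=0)

# force an ID column to str at parse time so that values like '001'
# keep their leading zeros
df_ids = DataReader.read_from_file('scores.csv', converters={'spkitemid': str})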

GitHub: EducationalTestingService/rsmtool (rsmtool/test_utils.py)
    Parameters
    ----------
    output_dir : str
        Path to the `output` experiment output directory for a test.
    experiment_id : str
        The experiment ID.
    subgroups : list of str
        List of column names that contain grouping
        information.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.
    """
    train_preprocessed_file = join(output_dir,
                                   '{}_train_metadata.{}'.format(experiment_id,
                                                                 file_format))
    train_preprocessed = DataReader.read_from_file(train_preprocessed_file, index_col=0)

    test_preprocessed_file = join(output_dir,
                                  '{}_test_metadata.{}'.format(experiment_id,
                                                               file_format))
    test_preprocessed = DataReader.read_from_file(test_preprocessed_file,
                                                  index_col=0)
    for group in subgroups:
        ok_(group in train_preprocessed.columns)
        ok_(group in test_preprocessed.columns)

    # check that the Ns per category sum to the total N in the overall
    # data composition, and that the number of categories matches the
    # overall data composition
    file_data_composition_all = join(output_dir,
                                     '{}_data_composition.{}'.format(experiment_id,
                                                                     file_format))
    df_data_composition_all = DataReader.read_from_file(file_data_composition_all)
    for group in subgroups:
        file_composition_by_group = join(output_dir,
                                         '{}_data_composition_by_{}.{}'.format(experiment_id,
                                                                               group,
                                                                               file_format))
        composition_by_group = DataReader.read_from_file(file_composition_by_group)
        for partition in ['Training', 'Evaluation']:
            partition_info = df_data_composition_all.loc[df_data_composition_all['partition'] ==
                                                         partition]

            summation = sum(composition_by_group['{} set'.format(partition)])
            ok_(summation == partition_info.iloc[0]['responses'])

            length = len(composition_by_group.loc[
                composition_by_group['{} set'.format(partition)] != 0])
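The excerpt above follows rsmtool's output naming convention, '{experiment_id}_<table>.{file_format}', and reads each table with index_col=0 so the first column (typically the response ID) becomes the index; ok_ is a nose-style truthiness assertion. A condensed sketch of the same pattern, with hypothetical values for output_dir, experiment_id, and the subgroup column:

from os.path import join

from rsmtool.reader import DataReader

output_dir = 'test_outputs/myexp/output'  # hypothetical path
experiment_id = 'myexp'                   # hypothetical experiment ID

train_metadata_file = join(output_dir,
                           '{}_train_metadata.{}'.format(experiment_id, 'csv'))
df_train_metadata = DataReader.read_from_file(train_metadata_file, index_col=0)

# each expected grouping column should survive preprocessing
assert 'L1' in df_train_metadata.columns  # 'L1' is a hypothetical subgroup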
GitHub: EducationalTestingService/rsmtool (rsmtool/test_utils.py)
    file1 : str
        Path to the first file.
    file2 : str
        Path to the second file.
    file_format : str, optional
        The format of the output files.
        Defaults to 'csv'.
    """

    # make sure that the main id columns are read as strings since
    # this may affect merging in custom notebooks
    string_columns = ['spkitemid', 'candidate']

    converter_dict = {column: str for column in string_columns}

    df1 = DataReader.read_from_file(file1, converters=converter_dict)
    df2 = DataReader.read_from_file(file2, converters=converter_dict)

    # convert all column names to strings
    # we do this to avoid any errors during sorting.
    for df in [df1, df2]:
        df.columns = df.columns.map(str)

    # if the first column is numeric, just force the index to string;
    # if it is non-numeric, assume that it is the index and force it to
    # string. We do this to ensure that string indices are preserved as such
    for df in [df1, df2]:
        if np.issubdtype(df[df.columns[0]].dtype, np.number):
            df.index = df.index.map(str)
        else:
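Passing converters={column: str} mirrors pandas.read_csv: each callable is applied to the raw column values at parse time, so IDs like '001' keep their leading zeros instead of being read as integers, which would otherwise break merges downstream. The pattern in isolation, with hypothetical file names:

from rsmtool.reader import DataReader

string_columns = ['spkitemid', 'candidate']
converter_dict = {column: str for column in string_columns}

# identical string dtypes in both files make merging on 'spkitemid' safe
df1 = DataReader.read_from_file('output1.csv', converters=converter_dict)  # hypothetical
df2 = DataReader.read_from_file('output2.csv', converters=converter_dict)  # hypothetical
merged = df1.merge(df2, on='spkitemid')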
GitHub: EducationalTestingService/rsmtool (rsmtool/test_utils.py)
    scaled_coefficients_file = join('test_outputs',
                                    source,
                                    'output',
                                    '{}_coefficients_scaled.{}'.format(experiment_id,
                                                                       file_format))
    predictions_file = join('test_outputs',
                            source,
                            'output',
                            '{}_pred_processed.{}'.format(experiment_id,
                                                          file_format))

    postprocessing_params_file = join('test_outputs',
                                      source,
                                      'output',
                                      '{}_postprocessing_params.{}'.format(experiment_id,
                                                                           file_format))

    postproc_params = DataReader.read_from_file(postprocessing_params_file).loc[0]
    df_preprocessed_test_data = DataReader.read_from_file(preprocessed_test_file)
    df_old_predictions = DataReader.read_from_file(predictions_file)
    df_old_predictions = df_old_predictions[['spkitemid', 'sc1', 'scale']]

    # create fake skll objects with new coefficients
    df_coef = DataReader.read_from_file(scaled_coefficients_file)
    learner = Modeler.create_fake_skll_learner(df_coef)
    modeler = Modeler.load_from_learner(learner)

    # generate new predictions and rename the prediction column to 'scale'
    df_new_predictions = modeler.predict(df_preprocessed_test_data,
                                         postproc_params['trim_min'],
                                         postproc_params['trim_max'])
    df_new_predictions.rename(columns={'raw': 'scale'}, inplace=True)

    # check that new predictions match the scaled old predictions
    assert_frame_equal(df_new_predictions.sort_index(axis=1),
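One detail worth noting above: the postprocessing-parameters file holds a single row, so reading it and selecting .loc[0] turns that row into a pandas Series, letting each parameter be looked up by column name. A sketch with a hypothetical file name:

from rsmtool.reader import DataReader

# a one-row table, read and reduced to a Series keyed by column name
postproc_params = DataReader.read_from_file('myexp_postprocessing_params.csv').loc[0]
trim_min = postproc_params['trim_min']
trim_max = postproc_params['trim_max']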
GitHub: EducationalTestingService/rsmtool (rsmtool/comparer.py)
        # degradation
        degradation_file = join(filedir, "{}_degradation.{}".format(experiment_id,
                                                                    file_format))

        # load if degradation file is present
        if exists(degradation_file):
            df_degradation = DataReader.read_from_file(degradation_file, index_col=0)
            files['df_degradation'] = df_degradation

        # disattenuated correlations
        dis_corr_file = join(filedir, "{}_disattenuated_correlations.{}".format(experiment_id,
                                                                                file_format))

        # load the disattenuated correlations file if it is present
        if exists(dis_corr_file):
            df_dis_corr = DataReader.read_from_file(dis_corr_file, index_col=0)
            # we only use the row for raw_trim or scale_trim score
            files['df_disattenuated_correlations'] = df_dis_corr.loc[['{}_trim'.format(prefix)]]

        # read in disattenuated correlations by group
        for group in groups_eval:
            group_dis_corr_file = join(filedir,
                                       '{}_disattenuated_correlations_by_{}.{}'.format(experiment_id,
                                                                                       group,
                                                                                       file_format))
            if exists(group_dis_corr_file):
                df_dis_cor_group = DataReader.read_from_file(group_dis_corr_file, index_col=0)
                files['df_disattenuated_correlations_by_{}'.format(group)] = df_dis_cor_group
                files['df_disattenuated_correlations_by_{}_overview'.format(group)] = self.make_summary_stat_df(df_dis_cor_group)

        # true score evaluations
        true_score_eval_file = join(filedir, "{}_true_score_eval.{}".format(experiment_id,
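Because not every rsmtool experiment produces every output table, comparer.py wraps each read in an os.path.exists check and simply skips missing files instead of raising. The guarded-read pattern on its own, with hypothetical names:

from os.path import exists, join

from rsmtool.reader import DataReader

files = {}
degradation_file = join('output', 'myexp_degradation.csv')  # hypothetical path

# load the table only if this experiment produced it
if exists(degradation_file):
    files['df_degradation'] = DataReader.read_from_file(degradation_file, index_col=0)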
GitHub: EducationalTestingService/rsmtool (rsmtool/comparer.py)
        # read in the partial correlations vs. score for all data
        pcor_score_file = join(filedir, '{}_pcor_score_all_data.{}'.format(experiment_id,
                                                                           file_format))
        if exists(pcor_score_file):
            files['df_pcor_sc1'] = DataReader.read_from_file(pcor_score_file, index_col=0)
            files['df_pcor_sc1_overview'] = self.make_summary_stat_df(files['df_pcor_sc1'])

        # read in the partial correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_pcor_file = join(filedir, '{}_pcor_score_by_{}.{}'.format(experiment_id,
                                                                            group,
                                                                            file_format))
            if exists(group_pcor_file):
                files['df_pcor_sc1_by_{}'.format(group)] = DataReader.read_from_file(group_pcor_file,
                                                                                     index_col=0)

                series = files['df_pcor_sc1_by_{}'.format(group)]
                files['df_pcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        # read in the marginal correlations vs. score for all data
        mcor_score_file = join(filedir, '{}_margcor_score_all_data.{}'.format(experiment_id,
                                                                              file_format))
        if exists(mcor_score_file):
            files['df_mcor_sc1'] = DataReader.read_from_file(mcor_score_file, index_col=0)
            files['df_mcor_sc1_overview'] = self.make_summary_stat_df(files['df_mcor_sc1'])

        # read in the marginal correlations by subgroups, if we are asked to
        for group in groups_eval:
            group_mcor_file = join(filedir,
                                   '{}_margcor_score_by_{}.{}'.format(experiment_id,
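Per-subgroup tables extend the same convention with a _by_{group} suffix, so they are located and read in a loop over the evaluation groups. In isolation, with hypothetical group names:

from os.path import exists, join

from rsmtool.reader import DataReader

files = {}
for group in ['L1', 'gender']:  # hypothetical subgroup columns
    group_file = join('output', 'myexp_pcor_score_by_{}.csv'.format(group))
    if exists(group_file):
        files['df_pcor_sc1_by_{}'.format(group)] = DataReader.read_from_file(group_file,
                                                                             index_col=0)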
GitHub: EducationalTestingService/rsmtool (rsmtool/comparer.py)
        descriptives_file = join(filedir, '{}_feature_descriptives.{}'.format(experiment_id,
                                                                              file_format))
        if exists(descriptives_file):
            # we read all files pertaining to the descriptive analysis together
            # since we merge the outputs
            files['df_descriptives'] = DataReader.read_from_file(descriptives_file, index_col=0)

            # this df keeps the N (number of responses) along with the min
            # and max for each feature; N is reused later in two other tables
            df_features_n_values = files['df_descriptives'][['N', 'min', 'max']]

            files['df_descriptives'] = files['df_descriptives'][['N', 'mean', 'std. dev.',
                                                                 'skewness', 'kurtosis']]

            outliers_file = join(filedir, '{}_feature_outliers.{}'.format(experiment_id,
                                                                          file_format))
            df_outliers = DataReader.read_from_file(outliers_file, index_col=0)
            df_outliers = df_outliers.rename(columns={'upper': 'Upper',
                                                      'lower': 'Lower',
                                                      'both': 'Both',
                                                      'upperperc': 'Upper %',
                                                      'lowerperc': 'Lower %',
                                                      'bothperc': 'Both %'})
            df_outliers_columns = df_outliers.columns.tolist()
            files['df_outliers'] = df_outliers

            # join with df_features_n_values to get the value of N
            files['df_outliers'] = pd.merge(files['df_outliers'], df_features_n_values,
                                            left_index=True,
                                            right_index=True)[['N'] + df_outliers_columns]

            # join with df_features_n_values to get the value of N
            percentiles_file = join(filedir, '{}_feature_descriptives'
GitHub: EducationalTestingService/rsmtool (rsmtool/comparer.py)
                files['df_mcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)

        pca_file = join(filedir, '{}_pca.{}'.format(experiment_id, file_format))
        if exists(pca_file):
            files['df_pca'] = DataReader.read_from_file(pca_file, index_col=0)
            files['df_pcavar'] = DataReader.read_from_file(join(filedir,
                                                                '{}_pcavar.{}'.format(experiment_id,
                                                                                      file_format)),
                                                           index_col=0)

GitHub: EducationalTestingService/rsmtool (rsmtool/comparer.py)
        feature_cors_file = join(filedir, '{}_feature_cors.{}'.format(experiment_id,
                                                                      file_format))
        if exists(feature_cors_file):
            files['df_feature_cors'] = DataReader.read_from_file(feature_cors_file, index_col=0)

        # df_scores
        scores_file = join(filedir, '{}_pred_processed.{}'.format(experiment_id,
                                                                  file_format))
        if exists(scores_file):
            df_scores = DataReader.read_from_file(scores_file, converters={'spkitemid': str})
            files['df_scores'] = df_scores[['spkitemid', 'sc1', prefix]]

        # model coefficients if present
        betas_file = join(filedir, '{}_betas.{}'.format(experiment_id,
                                                        file_format))
        if exists(betas_file):
            files['df_coef'] = DataReader.read_from_file(betas_file, index_col=0)
            files['df_coef'].index.name = None

        # read in the model fit files if present
        model_fit_file = join(filedir, '{}_model_fit.{}'.format(experiment_id,
                                                                file_format))
        if exists(model_fit_file):
            files['df_model_fit'] = DataReader.read_from_file(model_fit_file)

        # human-human agreement
        consistency_file = join(filedir, '{}_consistency.{}'.format(experiment_id,
                                                                    file_format))

        # load if consistency file is present
        if exists(consistency_file):
            df_consistency = DataReader.read_from_file(consistency_file, index_col=0)
            files['df_consistency'] = df_consistency
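Finally, note that every path in these examples ends in a file_format placeholder: read_from_file keys off the extension, so the same call reads whichever format the experiment was configured to write. A sketch; csv is the documented default in the excerpts above, while tsv and xlsx support is assumed from rsmtool's file_format option rather than shown here:

from rsmtool.reader import DataReader

# the extension determines which pandas reader handles the file
for file_format in ['csv', 'tsv', 'xlsx']:  # tsv/xlsx support assumed
    filename = 'myexp_betas.{}'.format(file_format)  # hypothetical files
    df_betas = DataReader.read_from_file(filename, index_col=0)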