    Parameters
    ----------
output_dir : str
Path to the `output` directory for a test experiment.
experiment_id : str
The experiment ID.
subgroups : list of str
List of column names that contain grouping
information.
file_format : str, optional
The format of the output files.
Defaults to 'csv'.
"""
train_preprocessed_file = join(output_dir,
'{}_train_metadata.{}'.format(experiment_id,
file_format))
train_preprocessed = DataReader.read_from_file(train_preprocessed_file, index_col=0)
test_preprocessed_file = join(output_dir,
'{}_test_metadata.{}'.format(experiment_id,
file_format))
test_preprocessed = DataReader.read_from_file(test_preprocessed_file,
index_col=0)
for group in subgroups:
ok_(group in train_preprocessed.columns)
ok_(group in test_preprocessed.columns)
    # check that the sum of N over all categories matches the total N in the
    # overall data composition, and that the number of categories matches
    # the count reported in the overall data composition
file_data_composition_all = join(output_dir,
'{}_data_composition.{}'.format(experiment_id,
file_format))
df_data_composition_all = DataReader.read_from_file(file_data_composition_all)
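    # for each subgroup, check the counts in its composition file against
    # the overall data composition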
for group in subgroups:
file_composition_by_group = join(output_dir,
'{}_data_composition_by_{}.{}'.format(experiment_id,
group,
file_format))
composition_by_group = DataReader.read_from_file(file_composition_by_group)
for partition in ['Training', 'Evaluation']:
partition_info = df_data_composition_all.loc[df_data_composition_all['partition'] ==
partition]
summation = sum(composition_by_group['{} set'
''.format(partition)])
ok_(summation == partition_info.iloc[0]['responses'])
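            # the number of categories with at least one response in this
            # partition should match the category count for this group in
            # the overall data composition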
            length = len(composition_by_group.loc[composition_by_group['{} set'
                                                                       ''.format(partition)] != 0])
            ok_(length == partition_info.iloc[0][group])
file1 : str
Path to the first file.
file2 : str
Path to the second file.
file_format : str, optional
The format of the output files.
Defaults to 'csv'.
"""
# make sure that the main id columns are read as strings since
# this may affect merging in custom notebooks
string_columns = ['spkitemid', 'candidate']
converter_dict = {column: str for column in string_columns}
df1 = DataReader.read_from_file(file1, converters=converter_dict)
df2 = DataReader.read_from_file(file2, converters=converter_dict)
# convert all column names to strings
# we do this to avoid any errors during sorting.
for df in [df1, df2]:
df.columns = df.columns.map(str)
    # if the first column is numeric, just force the index to string;
    # however, if it is non-numeric, set it as the index and then
    # force it to string. We do this to ensure string indices are
    # preserved as such
for df in [df1, df2]:
if np.issubdtype(df[df.columns[0]].dtype, np.number):
df.index = df.index.map(str)
        else:
            df.index = df[df.columns[0]].map(str)

    scaled_coefficients_file = join('test_outputs',
                                    source,
                                    'output',
                                    '{}_coefficients_scaled.{}'.format(experiment_id,
                                                                       file_format))
predictions_file = join('test_outputs',
source,
'output',
'{}_pred_processed.{}'.format(experiment_id,
file_format))
postprocessing_params_file = join('test_outputs',
source,
'output',
'{}_postprocessing_params.{}'.format(experiment_id,
file_format))
postproc_params = DataReader.read_from_file(postprocessing_params_file).loc[0]
df_preprocessed_test_data = DataReader.read_from_file(preprocessed_test_file)
df_old_predictions = DataReader.read_from_file(predictions_file)
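    # keep only the response ID, human score ('sc1'), and scaled score columns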
df_old_predictions = df_old_predictions[['spkitemid', 'sc1', 'scale']]
# create fake skll objects with new coefficients
df_coef = DataReader.read_from_file(scaled_coefficients_file)
learner = Modeler.create_fake_skll_learner(df_coef)
modeler = Modeler.load_from_learner(learner)
# generate new predictions and rename the prediction column to 'scale'
df_new_predictions = modeler.predict(df_preprocessed_test_data,
postproc_params['trim_min'],
postproc_params['trim_max'])
df_new_predictions.rename(columns={'raw': 'scale'}, inplace=True)
# check that new predictions match the scaled old predictions
    assert_frame_equal(df_new_predictions.sort_index(axis=1),
                       df_old_predictions.sort_index(axis=1))
# degradation
degradation_file = join(filedir, "{}_degradation.{}".format(experiment_id,
file_format))
        # load if the degradation file is present
if exists(degradation_file):
df_degradation = DataReader.read_from_file(degradation_file, index_col=0)
files['df_degradation'] = df_degradation
# disattenuated correlations
dis_corr_file = join(filedir, "{}_disattenuated_correlations.{}".format(experiment_id,
file_format))
        # load if the disattenuated correlations file is present
if exists(dis_corr_file):
df_dis_corr = DataReader.read_from_file(dis_corr_file, index_col=0)
# we only use the row for raw_trim or scale_trim score
files['df_disattenuated_correlations'] = df_dis_corr.loc[['{}_trim'.format(prefix)]]
# read in disattenuated correlations by group
for group in groups_eval:
group_dis_corr_file = join(filedir,
'{}_disattenuated_correlations_by_{}.{}'.format(experiment_id,
group,
file_format))
if exists(group_dis_corr_file):
df_dis_cor_group = DataReader.read_from_file(group_dis_corr_file, index_col=0)
files['df_disattenuated_correlations_by_{}'.format(group)] = df_dis_cor_group
files['df_disattenuated_correlations_by_{}_overview'.format(group)] = self.make_summary_stat_df(df_dis_cor_group)
# true score evaluations
        true_score_eval_file = join(filedir, "{}_true_score_eval.{}".format(experiment_id,
                                                                            file_format))
        if exists(true_score_eval_file):
            files['df_true_score_eval'] = DataReader.read_from_file(true_score_eval_file,
                                                                    index_col=0)
# read in the partial correlations vs. score for all data
pcor_score_file = join(filedir, '{}_pcor_score_all_data.{}'.format(experiment_id,
file_format))
if exists(pcor_score_file):
files['df_pcor_sc1'] = DataReader.read_from_file(pcor_score_file, index_col=0)
files['df_pcor_sc1_overview'] = self.make_summary_stat_df(files['df_pcor_sc1'])
# read in the partial correlations by subgroups, if we are asked to
for group in groups_eval:
group_pcor_file = join(filedir, '{}_pcor_score_by_{}.{}'.format(experiment_id,
group,
file_format))
if exists(group_pcor_file):
files['df_pcor_sc1_by_{}'
''.format(group)] = DataReader.read_from_file(group_pcor_file,
index_col=0)
series = files['df_pcor_sc1_by_{}'.format(group)]
files['df_pcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)
# read in the marginal correlations vs. score for all data
mcor_score_file = join(filedir, '{}_margcor_score_all_data.{}'.format(experiment_id,
file_format))
if exists(mcor_score_file):
files['df_mcor_sc1'] = DataReader.read_from_file(mcor_score_file, index_col=0)
files['df_mcor_sc1_overview'] = self.make_summary_stat_df(files['df_mcor_sc1'])
        # read in the marginal correlations by subgroups, if we are asked to
for group in groups_eval:
group_mcor_file = join(filedir,
                                   '{}_margcor_score_by_{}.{}'.format(experiment_id,
                                                                      group,
                                                                      file_format))
            if exists(group_mcor_file):
                files['df_mcor_sc1_by_{}'
                      ''.format(group)] = DataReader.read_from_file(group_mcor_file,
                                                                    index_col=0)
                series = files['df_mcor_sc1_by_{}'.format(group)]
                files['df_mcor_sc1_{}_overview'.format(group)] = self.make_summary_stat_df(series)
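        # principal components analysis: component loadings and explained variance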
pca_file = join(filedir, '{}_pca.{}'.format(experiment_id, file_format))
if exists(pca_file):
files['df_pca'] = DataReader.read_from_file(pca_file, index_col=0)
files['df_pcavar'] = DataReader.read_from_file(join(filedir,
'{}_pcavar.{}'.format(experiment_id,
file_format)),
index_col=0)
descriptives_file = join(filedir, '{}_feature_descriptives.{}'.format(experiment_id,
file_format))
if exists(descriptives_file):
# we read all files pertaining to the descriptive analysis together
# since we merge the outputs
files['df_descriptives'] = DataReader.read_from_file(descriptives_file, index_col=0)
            # this df contains the `N`, min, and max values for each feature;
            # `N` is merged into two other tables later on
df_features_n_values = files['df_descriptives'][['N', 'min', 'max']]
files['df_descriptives'] = files['df_descriptives'][['N', 'mean', 'std. dev.',
'skewness', 'kurtosis']]
outliers_file = join(filedir, '{}_feature_outliers.{}'.format(experiment_id,
file_format))
df_outliers = DataReader.read_from_file(outliers_file, index_col=0)
            df_outliers = df_outliers.rename(columns={'upper': 'Upper',
                                                      'lower': 'Lower',
                                                      'both': 'Both',
                                                      'upperperc': 'Upper %',
                                                      'lowerperc': 'Lower %',
                                                      'bothperc': 'Both %'})
            df_outliers_columns = df_outliers.columns.tolist()
            files['df_outliers'] = df_outliers
            # join with df_features_n_values to get the value of N
            files['df_outliers'] = pd.merge(files['df_outliers'], df_features_n_values,
                                            left_index=True,
                                            right_index=True)[['N'] + df_outliers_columns]
            # get the percentile descriptives for the features
            percentiles_file = join(filedir, '{}_feature_descriptives'
                                             'Extra.{}'.format(experiment_id, file_format))
            files['df_percentiles'] = DataReader.read_from_file(percentiles_file,
                                                                index_col=0)
        # df_feature_cors
        feature_cors_file = join(filedir, '{}_cors_processed.{}'.format(experiment_id,
                                                                        file_format))
if exists(feature_cors_file):
files['df_feature_cors'] = DataReader.read_from_file(feature_cors_file, index_col=0)
# df_scores
scores_file = join(filedir, '{}_pred_processed.{}'.format(experiment_id,
file_format))
if exists(scores_file):
df_scores = DataReader.read_from_file(scores_file, converters={'spkitemid': str})
files['df_scores'] = df_scores[['spkitemid', 'sc1', prefix]]
# model coefficients if present
betas_file = join(filedir, '{}_betas.{}'.format(experiment_id,
file_format))
if exists(betas_file):
files['df_coef'] = DataReader.read_from_file(betas_file, index_col=0)
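            # clear the index name picked up from the file's first column header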
files['df_coef'].index.name = None
# read in the model fit files if present
model_fit_file = join(filedir, '{}_model_fit.{}'.format(experiment_id,
file_format))
if exists(model_fit_file):
files['df_model_fit'] = DataReader.read_from_file(model_fit_file)
        # human-human agreement
consistency_file = join(filedir, '{}_consistency.{}'.format(experiment_id,
file_format))
        # load if the consistency file is present
if exists(consistency_file):
df_consistency = DataReader.read_from_file(consistency_file, index_col=0)
files['df_consistency'] = df_consistency