# Secure your code as it's written. Use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
# NOTE(review): fragment — the enclosing function's `def` line (and,
# presumably, the same isinstance-normalisation for `dataset` itself)
# lies outside this view. Indentation restored; code unchanged.
#
# Normalise single-path arguments to one-element lists so the loops
# below can treat every *_dataset argument uniformly.
if isinstance(included_dataset, (str, PurePath)):
    included_dataset = [included_dataset]
if isinstance(excluded_dataset, (str, PurePath)):
    excluded_dataset = [excluded_dataset]
if isinstance(prior_dataset, (str, PurePath)):
    prior_dataset = [prior_dataset]

# Merge all requested datasets into a single ASReviewData container.
as_data = ASReviewData()
# Find the URL of the datasets if the dataset is an example dataset.
for data in dataset:
    as_data.append(ASReviewData.from_file(find_data(data)))

if new:
    # Discard any existing labels: mark every record as unlabelled.
    as_data.labels = np.full((len(as_data),), LABEL_NA, dtype=int)

# Append the explicitly included/excluded/prior datasets, tagging each
# with the matching data_type so their records are labelled accordingly.
for data in included_dataset:
    as_data.append(ASReviewData.from_file(
        find_data(data), data_type="included"))
for data in excluded_dataset:
    as_data.append(ASReviewData.from_file(
        find_data(data), data_type="excluded"))
for data in prior_dataset:
    as_data.append(ASReviewData.from_file(
        find_data(data), data_type="prior"))
return as_data
# NOTE(review): fragment — tail of a dataframe-standardisation helper.
# The `def` line and the `if "abstract" not in col_names:` guard that the
# first warning below presumably belongs to are outside this view.
logging.warning("Unable to detect abstracts in dataset.")
if "title" not in col_names:
    logging.warning("Unable to detect titles in dataset.")

# Replace NA values with empty strings.
for col in ["title", "abstract", "authors", "keywords"]:
    try:
        df[all_column_spec[col]].fillna("", inplace=True)
    except KeyError:
        # Column not mapped for this dataset; nothing to clean.
        pass

# Convert labels to integers.
if "final_included" in col_names:
    try:
        col = all_column_spec["final_included"]
        df[col].fillna(LABEL_NA, inplace=True)
        df[col] = pd.to_numeric(df[col])
    except KeyError:
        # Label column mapping missing; leave labels untouched.
        pass

# If we have a record_id (for example from an ASReview export), use it.
if "record_id" in list(df):
    df.set_index('record_id', inplace=True)
if df.index.name != "record_id":
    # Otherwise generate a fresh 0..n-1 record_id and index on it.
    df["record_id"] = np.arange(len(df.index))
    df.set_index('record_id', inplace=True)
df.sort_index(inplace=True)
return df, all_column_spec
# NOTE(review): fragment — head of BaseReview.__init__; the `def` line
# (with parameters model, query_model, balance_model, feature_model,
# n_papers, n_instances, n_queries, start_idx, log_file, ...) and the
# body of the trailing `if log_file is not None:` are outside this view.
super(BaseReview, self).__init__()

# Default to Naive Bayes model
if model is None:
    model = NBModel()
if query_model is None:
    query_model = MaxQuery()
if balance_model is None:
    balance_model = SimpleBalance()
if feature_model is None:
    feature_model = Tfidf()

self.as_data = as_data
self.y = as_data.labels
if self.y is None:
    # No labels supplied: treat every record as unlabelled.
    self.y = np.full(len(as_data), LABEL_NA)

self.model = model
self.balance_model = balance_model
self.query_model = query_model
self.feature_model = feature_model

# Shared mutable state so the models can see each other's query
# bookkeeping (source of each query, queries currently in flight).
self.shared = {"query_src": {}, "current_queries": {}}
self.model.shared = self.shared
self.query_model.shared = self.shared
self.balance_model.shared = self.shared

self.n_papers = n_papers
self.n_instances = n_instances
self.n_queries = n_queries
self.start_idx = start_idx

# Body of this branch continues outside this view.
if log_file is not None:
def todict(self):
    """Create dictionary from the record.

    Returns
    -------
    dict
        The record's core fields plus any extra fields. ``label`` is
        ``None`` when the record is unlabelled (``LABEL_NA``).
    """
    label = self.label
    # Use equality, not identity: `self.label is LABEL_NA` only worked by
    # accident of CPython's small-int caching and fails for numpy integer
    # labels (e.g. np.int64(LABEL_NA) is a distinct object).
    if self.label == LABEL_NA:
        label = None
    paper_dict = {
        "title": self.title,
        "abstract": self.abstract,
        "authors": self.authors,
        "keywords": self.keywords,
        "record_id": self.record_id,
        "label": label,
    }
    paper_dict.update(self.extra_fields)
    return paper_dict
# NOTE(review): fragment — presumably the tail of a project-setup
# routine; fp_lock, project_file_path, file_name and project_id are
# defined outside this view. Indentation restored; code unchanged.
# Everything below runs under the project's "active" lock — TODO confirm
# the lock scope against the original file.
with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
    # open the projects file
    with open(project_file_path, "r") as f_read:
        project_dict = json.load(f_read)

    # add path to dict (overwrite if already exists)
    project_dict["dataset_path"] = file_name

    with open(project_file_path, "w") as f_write:
        json.dump(project_dict, f_write)

    # fill the pool of the first iteration
    as_data = read_data(project_id)

    if as_data.labels is not None:
        # Only records that are still unlabelled go into the pool.
        unlabeled = np.where(as_data.labels == LABEL_NA)[0]
        pool_indices = as_data.record_ids[unlabeled]
    else:
        pool_indices = as_data.record_ids
    # Randomise the initial presentation order of the pool.
    np.random.shuffle(pool_indices)
    write_pool(project_id, pool_indices.tolist())

    # make an empty queue for the items to label
    write_label_history(project_id, [])
def __init__(self, as_data, *args, use_cli_colors=True,
             new_review=False,
             **kwargs):
    """Set up an oracle (interactive) review session.

    When ``new_review`` is True all existing labels are wiped
    (reset to ``LABEL_NA``) and the review starts from scratch;
    otherwise every already-labelled record seeds ``start_idx``.
    """
    self.as_data = as_data

    if new_review:
        # Fresh session: forget all labels, nothing pre-labelled.
        as_data.labels = np.full_like(as_data.labels, LABEL_NA)
        start_idx = []
    else:
        # Resume: seed the review with every record that has a label.
        start_idx = np.where(as_data.labels != LABEL_NA)[0]

    super(ReviewOracle, self).__init__(
        as_data, *args, **kwargs, start_idx=start_idx)
    self.use_cli_colors = use_cli_colors
# NOTE(review): fragment — tail of a `to_dataframe`-style method; the
# `def` line (with `labels` and `ranking` parameters) and the opening of
# this docstring are outside this view. Indentation restored.
pd.DataFrame:
    Dataframe of all available record data.
"""
# Work on a copy so the stored dataframe is never modified.
new_df = pd.DataFrame.copy(self.df)
col = self.column_spec["final_included"]

if labels is not None:
    new_df[col] = labels

if ranking is not None:
    # sort the datasets based on the ranking
    new_df = new_df.iloc[ranking]
    # append a column with 1 to n
    new_df["asreview_ranking"] = np.arange(1, len(new_df) + 1)

if col in list(new_df):
    # Object dtype lets the label column hold NaN for unlabelled rows.
    new_df[col] = new_df[col].astype(object)
    new_df.loc[new_df[col] == LABEL_NA, col] = np.nan
return new_df
# NOTE(review): fragment — interior of an `append(self, as_data)`-style
# merge. The `def` line and the guard branch (presumably
# `if self.df is None:`) that makes the early `return` below a plain
# adopt-the-other-dataset shortcut are outside this view, so the
# original nesting of the first five lines cannot be reproduced here.
self.data_name = as_data.data_name
self.prior_idx = as_data.prior_idx
self.max_idx = as_data.max_idx
self.column_spec = as_data.column_spec
return

# Shift the incoming indices past our own so record ids stay unique.
reindex_val = max(self.max_idx - min(as_data.df.index.values), 0)
new_index = np.append(self.df.index.values,
                      as_data.df.index.values + reindex_val)
new_priors = np.append(self.prior_idx, as_data.prior_idx + reindex_val)
# NOTE(review): DataFrame.append was removed in pandas 2.0 — this line
# pins the file to pandas < 2 (pd.concat is the modern replacement).
new_df = self.df.append(as_data.df, sort=False)
new_df.index = new_index

# Pad whichever side lacks labels with LABEL_NA so lengths match;
# if both (or neither) have labels, the labels are left unchanged.
new_labels = None
if self.labels is None and as_data.labels is not None:
    new_labels = np.append(np.full(len(self), LABEL_NA, dtype=int),
                           as_data.labels)
elif self.labels is not None and as_data.labels is None:
    new_labels = np.append(self.labels,
                           np.full(len(as_data), LABEL_NA, dtype=int))

self.max_idx = max(self.max_idx, as_data.max_idx, max(new_index))
self.df = new_df
if new_labels is not None:
    self.labels = new_labels
self.prior_idx = new_priors
self.data_name += "_" + as_data.data_name

# Column specifications of both datasets must agree on shared keys.
for data_type, col in as_data.column_spec.items():
    if data_type in self.column_spec:
        if self.column_spec[data_type] != col:
            raise ValueError(
                "Error merging dataframes: column specifications "
                f"differ: {self.column_spec} vs {as_data.column_spec}")
def read_current_labels(project_id, as_data=None, label_history=None):
    """Return the current labels of a project as an integer array.

    Parameters
    ----------
    project_id: str
        Project identifier; only used to load the dataset and/or label
        history when the corresponding argument is not supplied.
    as_data: ASReviewData
        Dataset providing the base labels. Loaded with ``read_data`` when
        None.
    label_history: list
        Sequence of ``(row_index, inclusion)`` pairs that override the
        base labels. Loaded with ``read_label_history`` when None.

    Returns
    -------
    numpy.ndarray
        One integer label per record; ``LABEL_NA`` marks unlabelled rows.
    """
    if as_data is None:
        as_data = read_data(project_id)
    if label_history is None:
        label_history = read_label_history(project_id)

    if as_data.labels is None:
        labels = np.full(len(as_data), LABEL_NA, dtype=int)
    else:
        # Copy before applying the history: the original wrote the history
        # into `as_data.labels` in place, mutating the caller's dataset.
        labels = np.array(as_data.labels, dtype=int)

    for idx, inclusion in label_history:
        labels[idx] = inclusion
    return labels
def __init__(self, record_id, column_spec=None, **kwargs):
    """Initialize a paper record.

    Parameters
    ----------
    record_id:
        Identifier of the record.
    column_spec: dict
        Optional mapping from canonical attribute names ("title",
        "abstract", ...) to the column names used in ``kwargs``.
    kwargs:
        Column values; anything not consumed by a canonical attribute
        is kept verbatim in ``extra_fields``.
    """
    # Avoid the original's shared mutable default argument ({}).
    if column_spec is None:
        column_spec = {}

    for attr in ["title", "abstract", "authors", "keywords",
                 "final_included"]:
        if attr in column_spec:
            col = column_spec[attr]
        elif attr in kwargs:
            col = attr
        else:
            col = None
        # col may be None: kwargs.pop(None, None) simply yields None.
        setattr(self, attr, kwargs.pop(col, None))

    self.record_id = record_id
    if self.final_included is None:
        self.final_included = LABEL_NA
    else:
        self.final_included = int(self.final_included)

    # Normalise scalar NA extras to None. The isinstance guard prevents
    # the ValueError the original raised when an extra field held a
    # list/array (pd.isna then returns an array, ambiguous in `if`).
    self.extra_fields = kwargs
    for attr, val in self.extra_fields.items():
        if not isinstance(val, (list, tuple, np.ndarray)) and pd.isna(val):
            self.extra_fields[attr] = None