# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): this fragment appears truncated — the `if export_type == "csv":`
# branch on the last line has no body in this view, and the file's original
# indentation has been lost.
def export_to_string(project_id, export_type="csv"):
# Build the export ranking for a project: included records (label 1) first,
# then the unlabeled pool ordered by descending model probability, then
# excluded records (label 0).
fp_lock = get_lock_path(project_id)
as_data = read_data(project_id)
# Hold the "active" lock so proba and labels are read as a consistent pair.
with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
proba = read_proba(project_id)
if proba is None:
# No model output yet: fall back to reversed record order as a
# stand-in ranking score.
proba = np.flip(np.arange(len(as_data)))
else:
proba = np.array(proba)
labels = read_current_labels(project_id, as_data=as_data)
# Partition record indices by label state (LABEL_NA = still unlabeled).
pool_idx = np.where(labels == LABEL_NA)[0]
one_idx = np.where(labels == 1)[0]
zero_idx = np.where(labels == 0)[0]
# Negate proba so argsort yields descending probability order.
proba_order = np.argsort(-proba[pool_idx])
ranking = np.concatenate(
(one_idx, pool_idx[proba_order], zero_idx), axis=None)
if export_type == "csv":
# NOTE(review): fragment appears truncated — the final `else:` branch has no
# body in this view.
def add_dataset_to_project(project_id, file_name):
"""Add file path to the project file.
Add file to data subfolder and fill the pool of iteration 0.
"""
project_file_path = get_project_file_path(project_id)
fp_lock = get_lock_path(project_id)
# Hold the "active" lock while the project JSON is rewritten so readers
# never observe a half-written file.
with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
# open the projects file
with open(project_file_path, "r") as f_read:
project_dict = json.load(f_read)
# add path to dict (overwrite if already exists)
project_dict["dataset_path"] = file_name
with open(project_file_path, "w") as f_write:
json.dump(project_dict, f_write)
# fill the pool of the first iteration
as_data = read_data(project_id)
if as_data.labels is not None:
# Records still marked LABEL_NA form the initial (unlabeled) pool.
unlabeled = np.where(as_data.labels == LABEL_NA)[0]
pool_indices = as_data.record_ids[unlabeled]
else:
# NOTE(review): the lines below look like the tail of another function's
# docstring followed by the start of its body — the `def` line and the
# opening `"""` are missing from this view (presumably a model-training
# entry point; verify against the full file).
It uses a lock to ensure only one model is running at the same time.
Old results directories are deleted after 4 iterations.
It has one argument on the CLI, which is the base project directory.
"""
print(f"Train a new model for project {project_id}")
# get file locations
asr_kwargs_file = get_kwargs_path(project_id)
lock_file = get_lock_path(project_id)
# Lock so that only one training run is running at the same time.
# It doesn't lock the flask server/client.
with SQLiteLock(lock_file, blocking=False, lock_name="training") as lock:
# If the lock is not acquired, another training instance is running.
if not lock.locked():
logging.info("Cannot acquire lock, other instance running.")
return
# Lock the current state. We want to have a consistent active state.
# This does communicate with the flask backend; it prevents writing and
# reading to the same files at the same time.
with SQLiteLock(lock_file, blocking=True, lock_name="active") as lock:
# Get the all labels since last run. If no new labels, quit.
new_label_history = read_label_history(project_id)
data_fp = str(get_data_file_path(project_id))
as_data = read_data(project_id)
state_file = get_state_path(project_id)
def api_get_prior(project_id): # noqa: F401
"""Get all papers classified as prior documents
"""
lock_fp = get_lock_path(project_id)
# Read the label history under the "active" lock for a consistent view.
with SQLiteLock(lock_fp, blocking=True, lock_name="active"):
label_history = read_label_history(project_id)
# Each history entry is a (record_index, label) pair; collect the indices.
indices = [x[0] for x in label_history]
records = read_data(project_id).record(indices)
payload = {"result": []}
# NOTE(review): fragment appears truncated — the dict literal below is not
# closed and the function's return statement is missing from this view.
for i, record in enumerate(records):
payload["result"].append({
"id": int(record.record_id),
"title": record.title,
"abstract": record.abstract,
"authors": record.authors,
"keywords": record.keywords,
"included": int(label_history[i][1])
def get_instance(project_id):
    """Return the next record id to review, or None when the pool is empty.

    Arguments
    ---------
    project_id: str
        The id of the current project.
    """
    lock_path = get_lock_path(project_id)

    # Snapshot the pool under the "active" lock so we never race a writer.
    with SQLiteLock(lock_path, blocking=True, lock_name="active"):
        pool_idx = read_pool(project_id)

        # Guard clause: an exhausted pool means the review is finished.
        if len(pool_idx) == 0:
            logging.info(f"No more records for project {project_id}")
            return None

        logging.info(f"Requesting {pool_idx[0]} from project {project_id}")
        return pool_idx[0]
# NOTE(review): fragment appears truncated — the stats dict at the end is
# cut off in this view.
def get_statistics(project_id):
# Compute review-progress statistics for a project.
fp_lock = get_lock_path(project_id)
with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
# get the index of the active iteration
label_history = read_label_history(project_id)
current_labels = read_current_labels(
project_id, label_history=label_history)
# Count decisions made since the most recent inclusion (label == 1) by
# walking the history backwards until an inclusion is found.
n_since_last_inclusion = 0
for _, inclusion in reversed(label_history):
if inclusion == 1:
break
n_since_last_inclusion += 1
# Totals over the current label state.
n_included = len(np.where(current_labels == 1)[0])
n_excluded = len(np.where(current_labels == 0)[0])
n_papers = len(current_labels)
stats = {
"n_included": n_included,
# NOTE(review): this fragment starts mid-function (no `def` visible) and
# duplicates the training-lock logic that appears earlier in the file; it is
# also truncated at the end (`with open_state(...)` has no body here).
asr_kwargs_file = get_kwargs_path(project_id)
lock_file = get_lock_path(project_id)
# Lock so that only one training run is running at the same time.
# It doesn't lock the flask server/client.
with SQLiteLock(lock_file, blocking=False, lock_name="training") as lock:
# If the lock is not acquired, another training instance is running.
if not lock.locked():
logging.info("Cannot acquire lock, other instance running.")
return
# Lock the current state. We want to have a consistent active state.
# This does communicate with the flask backend; it prevents writing and
# reading to the same files at the same time.
with SQLiteLock(lock_file, blocking=True, lock_name="active") as lock:
# Get the all labels since last run. If no new labels, quit.
new_label_history = read_label_history(project_id)
data_fp = str(get_data_file_path(project_id))
as_data = read_data(project_id)
state_file = get_state_path(project_id)
# collect command line arguments and pass them to the reviewer
with open(asr_kwargs_file, "r") as fp:
asr_kwargs = json.load(fp)
asr_kwargs['state_file'] = str(state_file)
reviewer = get_reviewer(dataset=data_fp,
mode="minimal",
**asr_kwargs)
with open_state(state_file) as state:
def api_random_prior_papers(project_id): # noqa: F401
"""Get a selection of random papers to find exclusions.
This set of papers is extracted from the pool, but without
the already labeled items.
"""
lock_fp = get_lock_path(project_id)
with SQLiteLock(lock_fp, blocking=True, lock_name="active"):
pool = read_pool(project_id)
# with open(get_labeled_path(project_id, 0), "r") as f_label:
# prior_labeled = json.load(f_label)
# excluded the already labeled items from our random selection.
# prior_labeled_index = [int(label) for label in prior_labeled.keys()]
# pool = [i for i in pool if i not in prior_labeled_index]
# sample from the pool (this is already done atm of initializing
# the pool. But doing it again because a double shuffle is always
# best)
try:
# np.random.choice raises ValueError when the pool holds fewer than 5
# items (replace=False); presumably the except branch handles that —
# its body is missing from this view (fragment truncated).
pool_random = np.random.choice(pool, 5, replace=False)
except Exception:
def label_instance(project_id, paper_i, label, retrain_model=True):
"""Label a paper after reviewing the abstract.
"""
# Coerce to plain ints — presumably these may arrive as strings from the
# web layer; TODO confirm against the caller.
paper_i = int(paper_i)
label = int(label)
fp_lock = get_lock_path(project_id)
with SQLiteLock(fp_lock, blocking=True, lock_name="active"):
# get the index of the active iteration
if int(label) in [0, 1]:
# A real include/exclude decision: move the record out of the pool.
move_label_from_pool_to_labeled(
project_id, paper_i, label
)
else:
# Any other label value reverses a decision: return the record to
# the pool.
move_label_from_labeled_to_pool(
project_id, paper_i, label
)
if retrain_model:
# Update the model (if it isn't busy).
py_exe = _get_executable()
# NOTE(review): fragment truncated — the command list below is cut off
# in this view.
run_command = [