import logging

from farm.data_handler.processor import SquadProcessor
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def test_qa(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    batch_size = 2
    n_epochs = 1
    evaluate_every = 4
    base_LM_model = "bert-base-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=base_LM_model, do_lower_case=False
    )
    label_list = ["start_token", "end_token"]
    processor = SquadProcessor(
        tokenizer=tokenizer,
        max_seq_len=20,
        doc_stride=10,
        max_query_length=6,
        train_filename="train-sample.json",
        # ... the remaining arguments (e.g. dev_filename, data_dir,
        # label_list, metric) are cut off in the original snippet
    )
import logging

from farm.data_handler.processor import NERProcessor
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def test_ner(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 5
    batch_size = 2
    evaluate_every = 1
    lang_model = "bert-base-german-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    ner_labels = ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER",
                  "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"]
    processor = NERProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        # ... the remaining arguments (e.g. data_dir, label_list=ner_labels,
        # metric) are cut off in the original snippet
    )
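The test breaks off at the processor. For orientation, here is a minimal sketch of the NER-specific model setup that typically follows in FARM. It is a reconstruction, not the original test: it assumes the variables defined in the test above, and the exact head signatures vary between FARM releases.

from farm.data_handler.data_silo import DataSilo
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TokenClassificationHead

# Load the data and wrap the pretrained LM with a token-level head
data_silo = DataSilo(processor=processor, batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = TokenClassificationHead(num_labels=len(ner_labels))
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],  # NER predicts one label per token
    device=device,
)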
import logging

from transformers import RobertaTokenizer

from farm.data_handler.processor import TextClassificationProcessor
from farm.utils import set_all_seeds, initialize_device_settings


def test_doc_classification():
    # caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "roberta-base"
    tokenizer = RobertaTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model
    )
    processor = TextClassificationProcessor(
        tokenizer=tokenizer,
        max_seq_len=8,
        data_dir="samples/doc_class",
        train_filename="train-sample.tsv",
        label_list=["OTHER", "OFFENSE"],
        metric="f1_macro",
        dev_filename="test-sample.tsv",
        # ... the remaining arguments (e.g. test_filename, dev_split) are
        # cut off in the original snippet
    )
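The truncation point is the same as in the NER test. Compared with NER, the document-classification setup swaps the prediction head and predicts one label per sequence; a sketch under the same assumptions as above:

from farm.data_handler.data_silo import DataSilo
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TextClassificationHead

data_silo = DataSilo(processor=processor, batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
prediction_head = TextClassificationHead(num_labels=2)  # OTHER vs OFFENSE
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_sequence"],  # one label for the whole document
    device=device,
)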
import logging

from farm.data_handler.processor import BertStyleLMProcessor
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def test_lm_finetuning_no_next_sentence(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        # ... cut off in the original snippet; given the test name, it
        # presumably also passes next_sent_pred=False further on
    )
# (imports as in test_lm_finetuning_no_next_sentence above)
def test_lm_finetuning(caplog):
    # identical setup to the previous test up to the truncation point
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    processor = BertStyleLMProcessor(
        data_dir="samples/lm_finetuning",
        train_filename="train-sample.txt",
        test_filename="test-sample.txt",
        dev_filename=None,
        tokenizer=tokenizer,
        # ... the remaining arguments (e.g. max_seq_len) are cut off in the
        # original snippet
    )
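LM finetuning is the one task here that uses two prediction heads: masked-LM and next-sentence prediction. A sketch of the model setup that typically follows, again reconstructed rather than original; the head-loading helpers shown exist in newer FARM releases and their signatures vary across versions:

from farm.data_handler.data_silo import DataSilo
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import BertLMHead, NextSentenceHead

data_silo = DataSilo(processor=processor, batch_size=batch_size)
language_model = LanguageModel.load(lang_model)
lm_prediction_head = BertLMHead.load(lang_model)
next_sentence_head = NextSentenceHead.load(lang_model)
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[lm_prediction_head, next_sentence_head],
    embeds_dropout_prob=0.1,
    # the LM head predicts per token, the next-sentence head per sequence
    lm_output_types=["per_token", "per_sequence"],
    device=device,
)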
import logging

from farm.data_handler.processor import SquadProcessor
from farm.modeling.tokenization import Tokenizer
from farm.train import Trainer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(experiment_name="Public_FARM", run_name="Run_question_answering")

##########################
########## Settings
##########################
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
batch_size = 5
n_epochs = 2
evaluate_every = 500
base_LM_model = "bert-base-cased"
train_filename = "train-v2.0.json"
dev_filename = "dev-v2.0.json"

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=base_LM_model, do_lower_case=False
)

# 2. Create a DataProcessor that handles all the conversion from raw text
# into a PyTorch Dataset
label_list = ["start_token", "end_token"]
metric = "squad"
processor = SquadProcessor(
    tokenizer=tokenizer,
    label_list=label_list,
    metric=metric,
    train_filename=train_filename,
    dev_filename=dev_filename,
    # ... the remaining arguments (e.g. max_seq_len, doc_stride, data_dir)
    # are cut off in the original snippet; the keyword arguments above are
    # a plausible wiring of the variables it defines
)
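The example stops mid-call. For context, here is a minimal sketch of how FARM's question-answering example typically continues from step 3 onwards. This is a reconstruction with illustrative hyperparameter values; the initialize_optimizer and Trainer signatures changed across FARM releases:

from farm.data_handler.data_silo import DataSilo
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.optimization import initialize_optimizer
from farm.modeling.prediction_head import QuestionAnsweringHead

# 3. Create a DataSilo that loads the datasets and provides DataLoaders
data_silo = DataSilo(processor=processor, batch_size=batch_size)

# 4. Create an AdaptiveModel: a pretrained LM plus a QA prediction head
language_model = LanguageModel.load(base_LM_model)
prediction_head = QuestionAnsweringHead(layer_dims=[768, len(label_list)])
model = AdaptiveModel(
    language_model=language_model,
    prediction_heads=[prediction_head],
    embeds_dropout_prob=0.1,
    lm_output_types=["per_token"],  # QA predicts start/end logits per token
    device=device,
)

# 5. Create an optimizer and learning-rate schedule
model, optimizer, lr_schedule = initialize_optimizer(
    model=model,
    learning_rate=2e-5,
    n_batches=len(data_silo.loaders["train"]),
    n_epochs=n_epochs,
    device=device,
)

# 6. Feed everything to the Trainer and train
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    data_silo=data_silo,
    epochs=n_epochs,
    n_gpu=n_gpu,
    lr_schedule=lr_schedule,
    evaluate_every=evaluate_every,
    device=device,
)
model = trainer.train()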
import logging

from transformers.tokenization_bert import BertTokenizer

from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)

# placeholder URI: point this at your MLflow server or a local directory
ml_logger = MLFlowLogger(tracking_uri="")
ml_logger.init_experiment(experiment_name="from_scratch", run_name="debug")

##########################
########## Settings
##########################
set_all_seeds(seed=39)
device, n_gpu = initialize_device_settings(use_cuda=True)
learning_rate = 1e-5
batch_size = 45
max_seq_len = 128
n_epochs = 100
evaluate_every = 50
vocab_size = 30522
# dev_filename = None
save_dir = "../saved_models/from_scratch"
predictions_file = save_dir + "/predictions.json"
full_predictions_file = save_dir + "/full_predictions.json"
inference_multiprocessing = True
train = True
inference = True

if train:
    # ... the training branch is cut off in the original snippet
    pass
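What that branch contains is not recoverable from the snippet. The distinctive step when training from scratch is starting from randomly initialised weights instead of a pretrained checkpoint; a sketch of that step, assuming the LanguageModel.from_scratch helper available in newer FARM releases:

from farm.modeling.language_model import LanguageModel

# Randomly initialised BERT with the configured vocabulary size,
# rather than pretrained weights
language_model = LanguageModel.from_scratch("bert", vocab_size)

The rest of the pipeline (processor, DataSilo, LM and next-sentence heads, optimizer, Trainer) presumably mirrors the LM-finetuning setup sketched earlier.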
import logging

from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, MLFlowLogger, initialize_device_settings

# (the opening of this call was cut off; reconstructed to match the
# identical logging setup used in the snippets above)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
# reduce verbosity from transformers library
logging.getLogger('transformers').setLevel(logging.WARNING)
# ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
# for local logging instead:
ml_logger = MLFlowLogger(tracking_uri="logs")
# ml_logger.init_experiment(experiment_name="Public_FARM", run_name="DocClassification_ES_f1_1")
##########################
########## Settings
##########################
xval_folds = 5
xval_stratified = True
set_all_seeds(seed=42)
device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 20
batch_size = 32
evaluate_every = 100
lang_model = "bert-base-german-cased"
# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=False,
)
# The evaluation on the dev set can be done either with one of the predefined
# metrics or with a custom metric: a function from (preds, labels) to a dict
# that contains all the actual metric values. The function must be registered
# under a string name, and that name is then passed as the metric.
# For xval, we also store the actual predictions and labels in each fold's
# result, so that overall metrics can be computed across all folds afterwards.
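A minimal sketch of such a registration, using FARM's register_metrics helper; the function name mymetrics and the particular metric choices are illustrative:

from sklearn.metrics import f1_score, matthews_corrcoef

from farm.evaluation.metrics import register_metrics


def mymetrics(preds, labels):
    # Return every value we want reported for this task as one dict
    return {
        "f1_offense": f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE"),
        "f1_macro": f1_score(y_true=labels, y_pred=preds, average="macro"),
        "mcc": matthews_corrcoef(labels, preds),
    }


# Register under a string name, then refer to it by that name, e.g.
# TextClassificationProcessor(..., metric="mymetrics")
register_metrics("mymetrics", mymetrics)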