Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
steps_trained_in_current_epoch = 0
# Check if continuing training from a checkpoint
if os.path.exists(args.model_name_or_path):
# set global_step to global_step of last saved checkpoint from model path
global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
logger.info(" Continuing training from epoch %d", epochs_trained)
logger.info(" Continuing training from global step %d", global_step)
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
tr_loss, logging_loss = 0.0, 0.0
model.zero_grad()
train_iterator = trange(
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
)
set_seed(args) # Added here for reproducibility
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
# Skip past any already trained steps if resuming training
if steps_trained_in_current_epoch > 0:
steps_trained_in_current_epoch -= 1
continue
model.train()
batch = tuple(t.to(args.device) for t in batch)
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
if args.model_type != "distilbert":
def download_data(dataset: Dict[str, str]) -> io.BytesIO:
    """Download a dataset into memory and verify its MD5 checksum.

    Args:
        dataset: Mapping that provides the ``"url"`` to fetch and the
            expected ``"md5sum"`` hex digest of the downloaded payload.

    Returns:
        An ``io.BytesIO`` containing the downloaded bytes, rewound to
        position 0.

    Raises:
        RuntimeError: If the MD5 digest of the downloaded bytes differs
            from ``dataset["md5sum"]``.
    """
    from .. import session

    chunk_size = 1024

    f = session.get(dataset["url"], stream=True)
    total = int(f.headers["Content-Length"])

    # Number of chunks the progress bar should expect (ceiling division).
    # The previous expression, `total // 1024 + 1 if total % 1024 > 0 else 0`,
    # parsed as `(total // 1024 + 1) if ... else 0` and therefore reported a
    # total of 0 whenever the payload size was an exact multiple of 1024.
    n_chunks = (total + chunk_size - 1) // chunk_size

    buffer = io.BytesIO()
    for chunk in tqdm(
        f.iter_content(chunk_size),
        total=n_chunks,
        desc="download",
    ):
        buffer.write(chunk)
    buffer.seek(0)

    # Integrity check against the digest advertised in the dataset entry.
    compute_md5 = md5(buffer.getbuffer()).hexdigest()
    if compute_md5 != dataset["md5sum"]:
        raise RuntimeError(
            f"Error in MD5 check: {compute_md5} instead of {dataset['md5sum']}"
        )
    return buffer
axis = 0
elif axis == 'columns':
axis = 1
# when axis=0, total is shape[axis1]
total = df.size // df.shape[axis]
# Init bar
if deprecated_t[0] is not None:
t = deprecated_t[0]
deprecated_t[0] = None
else:
t = tclass(*targs, total=total, **tkwargs)
if len(args) > 0:
# *args intentionally not supported (see #244, #299)
TqdmDeprecationWarning(
"Except func, normal arguments are intentionally" +
" not supported by" +
" `(DataFrame|Series|GroupBy).progress_apply`." +
" Use keyword arguments instead.",
fp_write=getattr(t.fp, 'write', sys.stderr.write))
try:
func = df._is_builtin_func(func)
except TypeError:
pass
# Define bar updating wrapper
def wrapper(*args, **kwargs):
# update tbar correctly
# it seems `pandas apply` calls `func` twice
# on the first column/row to decide whether it can
axis = 0
elif axis == 'columns':
axis = 1
# when axis=0, total is shape[axis1]
total = df.size // df.shape[axis]
# Init bar
if deprecated_t[0] is not None:
t = deprecated_t[0]
deprecated_t[0] = None
else:
t = tclass(*targs, total=total, **tkwargs)
if len(args) > 0:
# *args intentionally not supported (see #244, #299)
TqdmDeprecationWarning(
"Except func, normal arguments are intentionally" +
" not supported by" +
" `(DataFrame|Series|GroupBy).progress_apply`." +
" Use keyword arguments instead.",
fp_write=getattr(t.fp, 'write', sys.stderr.write))
# Define bar updating wrapper
def wrapper(*args, **kwargs):
    # Advance the bar by one per call, but stop advancing once the bar
    # has reached its total: it seems `pandas apply` calls `func` twice
    # on the first column/row to decide whether it can take a fast or
    # slow code path, which would otherwise overshoot the total.
    if t.total and not (t.n < t.total):
        increment = 0
    else:
        increment = 1
    t.update(n=increment)
    return func(*args, **kwargs)
# Apply the provided function (in **kwargs)
def create_labels(entityset,
min_training_data='28 days',
lead='7 days',
window='28 days',
reduce='sum',
binarize=None,
iterate_by=None):
label_cols = ['quantity', 'price']
time_index = "order_date"
index = "customer_id"
df = entityset['orders'].df.merge(
entityset['order_products'].df, how='outer')
tqdm.pandas(desc="Creating Labels", unit="customer")
# # Only use data after one of the label columns has been non-null
# for i, v in df[label_cols].iterrows():
# if v.dropna(how='all').shape[0] > 0:
# df = df.loc[slice(i, None), :]
# break
grouped = df.groupby(index, as_index=True)
project_cutoff_dates = grouped.progress_apply(
lambda df: make_labels_from_windows(
df,
cols=label_cols,
min_training_data=min_training_data,
lead=lead, window=window,
index_col=index,
date_col=time_index,
def test_vectorized_math_applymap_on_large_dataframe(self):
    # Runs `math_vec_square` element-wise over a two-column frame of
    # 1_000_000 random rows, once via pandas' tqdm-backed
    # `progress_applymap` and once via swifter's `applymap`, then
    # compares the results (and the timings when self.ncores > 1).
    LOG.info("test_vectorized_math_applymap_on_large_dataframe")
    frame = pd.DataFrame(
        {
            "x": np.random.normal(size=1_000_000),
            "y": np.random.uniform(size=1_000_000),
        }
    )

    # Baseline: plain pandas with a tqdm progress bar.
    tqdm.pandas(desc="Pandas Vec math applymap ~ DF")
    began = time.time()
    expected = frame.progress_applymap(math_vec_square)
    pandas_elapsed = time.time() - began

    # Candidate: swifter with its own progress bar.
    began = time.time()
    actual = frame.swifter.progress_bar(desc="Vec math applymap ~ DF").applymap(math_vec_square)
    swifter_elapsed = time.time() - began

    self.assertEqual(expected, actual)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_elapsed, pandas_elapsed)
def test_nonvectorized_math_apply_on_large_series(self):
    # Applies `math_foo(compare_to=1)` to a series of 10_000_000 random
    # values, once via pandas' tqdm-backed `progress_apply` and once via
    # swifter's `apply`, then compares the results (and the timings when
    # self.ncores > 1).
    LOG.info("test_nonvectorized_math_apply_on_large_series")
    frame = pd.DataFrame({"x": np.random.normal(size=10_000_000)})
    values = frame["x"]

    # Baseline: plain pandas with a tqdm progress bar.
    tqdm.pandas(desc="Pandas Nonvec math apply ~ Series")
    began = time.time()
    expected = values.progress_apply(math_foo, compare_to=1)
    pandas_elapsed = time.time() - began

    # Candidate: swifter with its own progress bar.
    began = time.time()
    actual = values.swifter.progress_bar(desc="Nonvec math apply ~ Series").apply(math_foo, compare_to=1)
    swifter_elapsed = time.time() - began

    self.assertEqual(expected, actual)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_elapsed, pandas_elapsed)
def test_vectorized_math_apply_on_large_dataframe(self):
    # Applies `math_vec_multiply` with axis=1 over a two-column frame of
    # 1_000_000 random rows, once via pandas' tqdm-backed
    # `progress_apply` and once via swifter's `apply`, then compares the
    # results (and the timings when self.ncores > 1).
    LOG.info("test_vectorized_math_apply_on_large_dataframe")
    frame = pd.DataFrame(
        {
            "x": np.random.normal(size=1_000_000),
            "y": np.random.uniform(size=1_000_000),
        }
    )

    # Baseline: plain pandas with a tqdm progress bar.
    tqdm.pandas(desc="Pandas Vec math apply ~ DF")
    began = time.time()
    expected = frame.progress_apply(math_vec_multiply, axis=1)
    pandas_elapsed = time.time() - began

    # Candidate: swifter with its own progress bar.
    began = time.time()
    actual = frame.swifter.progress_bar(desc="Vec math apply ~ DF").apply(math_vec_multiply, axis=1)
    swifter_elapsed = time.time() - began

    self.assertEqual(expected, actual)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_elapsed, pandas_elapsed)
def test_vectorized_math_apply_on_large_dataframe(self):
    # Applies `math_vec_multiply` with axis=1 over a two-column frame of
    # 1_000_000 random rows, once via pandas' tqdm-backed
    # `progress_apply` and once via swifter's `apply`, then compares the
    # results (and the timings when self.ncores > 1).
    LOG.info("test_vectorized_math_apply_on_large_dataframe")
    columns = {
        "x": np.random.normal(size=1_000_000),
        "y": np.random.uniform(size=1_000_000),
    }
    data = pd.DataFrame(columns)

    # Reference run through pandas + tqdm.
    tqdm.pandas(desc="Pandas Vec math apply ~ DF")
    tick = time.time()
    reference = data.progress_apply(math_vec_multiply, axis=1)
    reference_seconds = time.time() - tick

    # Same computation through swifter.
    tick = time.time()
    swifter_bar = data.swifter.progress_bar(desc="Vec math apply ~ DF")
    result = swifter_bar.apply(math_vec_multiply, axis=1)
    swifter_seconds = time.time() - tick

    self.assertEqual(reference, result)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_seconds, reference_seconds)
def test_nonvectorized_math_applymap_on_large_dataframe(self):
    # Runs `math_foo` element-wise over a two-column frame of 5_000_000
    # random rows, once via pandas' tqdm-backed `progress_applymap` and
    # once via swifter's `applymap`, then compares the results (and the
    # timings when self.ncores > 1).
    LOG.info("test_nonvectorized_math_applymap_on_large_dataframe")
    frame = pd.DataFrame(
        {
            "x": np.random.normal(size=5_000_000),
            "y": np.random.uniform(size=5_000_000),
        }
    )

    # Baseline: plain pandas with a tqdm progress bar.
    tqdm.pandas(desc="Pandas Nonvec math applymap ~ DF")
    began = time.time()
    expected = frame.progress_applymap(math_foo)
    pandas_elapsed = time.time() - began

    # Candidate: swifter with its own progress bar.
    began = time.time()
    actual = frame.swifter.progress_bar(desc="Nonvec math applymap ~ DF").applymap(math_foo)
    swifter_elapsed = time.time() - began

    self.assertEqual(expected, actual)  # equality test
    if self.ncores > 1:  # speed test
        self.assertLess(swifter_elapsed, pandas_elapsed)