# policy regularization terms (std, pre_tanh_value, and mean_reg are assumed to
# be computed just above from the actor's forward pass)
std_reg = self.hyper_params["W_STD_REG"] * std.pow(2).mean()
pre_activation_reg = self.hyper_params["W_PRE_ACTIVATION_REG"] * (
    pre_tanh_value.pow(2).sum(dim=-1).mean()
)
actor_reg = mean_reg + std_reg + pre_activation_reg
# actor loss + regularization
actor_loss += actor_reg
# train actor
self.actor_optimizer.zero_grad()
actor_loss.backward()
self.actor_optimizer.step()
# update target networks
common_utils.soft_update(self.vf, self.vf_target, self.hyper_params["TAU"])
# update priorities
new_priorities = vf_loss_element_wise
new_priorities += self.hyper_params["LAMBDA3"] * actor_loss_element_wise.pow(2)
new_priorities += self.hyper_params["PER_EPS"]
new_priorities = new_priorities.data.cpu().numpy().squeeze()
new_priorities += eps_d
self.memory.update_priorities(indices, new_priorities)
# increase beta
fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
self.beta = self.beta + fraction * (1.0 - self.beta)
else:
    actor_loss = torch.zeros(1)
return (
    actor_loss.item(),
    qf_1_loss.item(),
    qf_2_loss.item(),
    vf_loss.item(),
    alpha_loss.item(),
)
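# A minimal sketch of the Polyak (soft) target update that common_utils.soft_update
# is assumed to perform above; the library's actual helper may differ in detail.
import torch.nn as nn


def soft_update_sketch(local: nn.Module, target: nn.Module, tau: float) -> None:
    """Blend target parameters toward local ones: target = tau * local + (1 - tau) * target."""
    for l_param, t_param in zip(local.parameters(), target.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)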
# train critic (gradient_clip_cr is assumed to be read from
# self.hyper_params["GRADIENT_CLIP_CR"] earlier in this method)
nn.utils.clip_grad_norm_(self.critic.parameters(), gradient_clip_cr)
self.critic_optimizer.step()
# train actor
gradient_clip_ac = self.hyper_params["GRADIENT_CLIP_AC"]
actions = self.actor(states)
actor_loss_element_wise = -self.critic(torch.cat((states, actions), dim=-1))
actor_loss = torch.mean(actor_loss_element_wise * weights)
self.actor_optimizer.zero_grad()
actor_loss.backward()
nn.utils.clip_grad_norm_(self.actor.parameters(), gradient_clip_ac)
self.actor_optimizer.step()
# update target networks
tau = self.hyper_params["TAU"]
common_utils.soft_update(self.actor, self.actor_target, tau)
common_utils.soft_update(self.critic, self.critic_target, tau)
# update priorities in PER
new_priorities = critic_loss_element_wise
new_priorities = new_priorities.data.cpu().numpy() + self.hyper_params["PER_EPS"]
self.memory.update_priorities(indices, new_priorities)
# increase beta
fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
self.beta = self.beta + fraction * (1.0 - self.beta)
return actor_loss.item(), critic_loss.item()
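# A hypothetical sketch of how the importance-sampling weights used above are
# usually derived from the sampled priorities and the annealed beta in prioritized
# experience replay; this repository's PrioritizedReplayBuffer may differ.
import numpy as np


def per_is_weights(sampled_priorities: np.ndarray, priority_sum: float,
                   buffer_len: int, beta: float) -> np.ndarray:
    """Return normalized weights w_i = (N * P(i))^(-beta) for the sampled transitions."""
    probs = sampled_priorities / priority_sum      # P(i) = p_i / sum_k p_k
    weights = (buffer_len * probs) ** (-beta)      # correct for non-uniform sampling
    return weights / weights.max()                 # scale so the largest weight is 1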
def main():
    """Main."""
    # env initialization
    env = gym.make("Reacher-v2")
    env_utils.set_env(env, args)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # set a random seed
    common_utils.set_random_seed(args.seed, env)

    # run
    module_path = "examples.reacher_v2." + args.algo
    example = importlib.import_module(module_path)
    example.run(env, args, state_dim, action_dim)
bc_loss = (
    torch.mul(pred_actions, qf_mask) - torch.mul(demo_actions, qf_mask)
).pow(2).sum() / n_qf_mask
# train actor: pg loss + BC loss
actor_loss = self.lambda1 * policy_loss + self.lambda2 * bc_loss
gradient_clip_ac = self.hyper_params["GRADIENT_CLIP_AC"]
self.actor_optimizer.zero_grad()
actor_loss.backward()
nn.utils.clip_grad_norm_(self.actor.parameters(), gradient_clip_ac)
self.actor_optimizer.step()
# update target networks
tau = self.hyper_params["TAU"]
common_utils.soft_update(self.actor, self.actor_target, tau)
common_utils.soft_update(self.critic, self.critic_target, tau)
return actor_loss.item(), critic_loss.item(), n_qf_mask
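# A hedged sketch of how the Q-filter mask (qf_mask, n_qf_mask) used in the BC
# loss above is typically built: imitate the demonstration only where the critic
# rates the demo action above the policy action. Names here are illustrative and
# may not match the repository's exact construction.
import torch


def q_filter_mask(critic, states, demo_actions, pred_actions):
    q_demo = critic(torch.cat((states, demo_actions), dim=-1))
    q_pred = critic(torch.cat((states, pred_actions), dim=-1))
    qf_mask = torch.gt(q_demo, q_pred).float()     # 1 where the demo action wins
    n_qf_mask = int(qf_mask.sum().item())          # number of samples kept for BC
    return qf_mask, n_qf_mask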
def main():
    """Main."""
    # env initialization
    env = gym.make("LunarLanderContinuous-v2")
    env_utils.set_env(env, args)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # set a random seed
    common_utils.set_random_seed(args.seed, env)

    # run
    module_path = "examples.lunarlander_continuous_v2." + args.algo
    example = importlib.import_module(module_path)
    example.run(env, args, state_dim, action_dim)
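# A small sketch of the entry point each example module is assumed to expose so
# that importlib.import_module(module_path).run(...) above works; the actual
# signatures in the examples package may differ.
def run(env, args, state_dim: int, action_dim: int):
    """Build an agent for `env` and start training or evaluation based on `args`."""
    agent = build_agent(args.algo, state_dim, action_dim)  # hypothetical helper
    if args.test:
        agent.test()
    else:
        agent.train()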
# q_value regularization
q_regular = torch.norm(q_values, 2).mean() * self.hyper_params["W_Q_REG"]
# total loss
loss = dq_loss + supervised_loss + q_regular
# train dqn
self.dqn_optimizer.zero_grad()
loss.backward()
clip_grad_norm_(self.dqn.parameters(), self.hyper_params["GRADIENT_CLIP"])
self.dqn_optimizer.step()
# update target networks
tau = self.hyper_params["TAU"]
common_utils.soft_update(self.dqn, self.dqn_target, tau)
# update priorities in PER
loss_for_prior = dq_loss_element_wise.detach().cpu().numpy().squeeze()
new_priorities = loss_for_prior + self.hyper_params["PER_EPS"]
new_priorities += eps_d
self.memory.update_priorities(indices, new_priorities)
# increase beta
fraction = min(float(self.i_episode) / self.args.episode_num, 1.0)
self.beta = self.beta + fraction * (1.0 - self.beta)
if self.hyper_params["USE_NOISY_NET"]:
    self.dqn.reset_noise()
    self.dqn_target.reset_noise()
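# A hedged sketch of the large-margin supervised loss that DQfD-style agents add
# for demonstration transitions (the `supervised_loss` term above); the margin
# value and masking details in this repository may differ.
import torch


def dqfd_margin_loss(q_values: torch.Tensor, demo_actions: torch.Tensor,
                     is_demo: torch.Tensor, margin: float = 0.8) -> torch.Tensor:
    """J_E(Q) = max_a [Q(s, a) + l(a_E, a)] - Q(s, a_E), averaged over demo samples.

    q_values: (B, num_actions) Q estimates; demo_actions: (B,) LongTensor of
    demonstrated action indices; is_demo: (B,) float mask, 1 for demo samples.
    """
    margins = torch.full_like(q_values, margin)            # l(a_E, a) = margin if a != a_E
    margins.scatter_(1, demo_actions.unsqueeze(1), 0.0)    # ... and 0 if a == a_E
    max_q = (q_values + margins).max(dim=1).values
    q_demo = q_values.gather(1, demo_actions.unsqueeze(1)).squeeze(1)
    return ((max_q - q_demo) * is_demo).mean()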
def _initialize(self):
    """Initialize non-common things."""
    self.use_n_step = self.hyper_params["N_STEP"] > 1

    if not self.args.test:
        # load demo replay memory
        with open(self.args.demo_path, "rb") as f:
            demos = pickle.load(f)

        if self.use_n_step:
            demos, demos_n_step = common_utils.get_n_step_info_from_demo(
                demos, self.hyper_params["N_STEP"], self.hyper_params["GAMMA"]
            )

            # replay memory for multi-steps
            self.memory_n = ReplayBuffer(
                buffer_size=self.hyper_params["BUFFER_SIZE"],
                n_step=self.hyper_params["N_STEP"],
                gamma=self.hyper_params["GAMMA"],
                demo=demos_n_step,
            )

        # replay memory for a single step
        self.beta = self.hyper_params["PER_BETA"]
        self.memory = PrioritizedReplayBuffer(
            self.hyper_params["BUFFER_SIZE"],
            self.hyper_params["BATCH_SIZE"],