describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
out['Q1'] = describe_it(Q1_vals)
def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    out = {}
    if kwargs['mode'] == 'train':
        # Reparameterized sample keeps the action differentiable for the actor loss.
        dist = self.actor(obs)
        action = dist.rsample()
        out['action'] = action
        out['action_logprob'] = dist.log_prob(action)
    elif kwargs['mode'] == 'stochastic':
        # Plain sampling for exploration; no gradients needed.
        with torch.no_grad():
            out['action'] = numpify(self.actor(obs).sample(), 'float')
    elif kwargs['mode'] == 'eval':
        # Deterministic evaluation: tanh-squashed mean of the policy distribution.
        with torch.no_grad():
            out['action'] = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
    else:
        raise NotImplementedError
    return out
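The three modes differ only in how the action is produced from the policy distribution. Below is a minimal, self-contained sketch of that distinction, where a plain diagonal Normal stands in for `self.actor(obs)` and the tanh-squashed mean plays the role of `mean_forward`; it is an illustration, not the library's own code.

import torch
from torch.distributions import Normal, Independent

mean = torch.zeros(1, 2, requires_grad=True)
dist = Independent(Normal(mean, torch.ones(1, 2)), 1)

# 'train': reparameterized sample keeps the log-probability differentiable w.r.t. `mean`
action = dist.rsample()
logprob = dist.log_prob(action)

# 'stochastic': sample without tracking gradients, e.g. for exploration rollouts
with torch.no_grad():
    noisy_action = dist.sample()

# 'eval': deterministic, tanh-squashed mean of the distribution
with torch.no_grad():
    greedy_action = torch.tanh(dist.mean)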
self.optimizer.step()
if self.config['agent.use_lr_scheduler']:
    self.lr_scheduler.step(self.total_timestep)
self.total_timestep += sum([traj.T for traj in D])
out = {}
if self.config['agent.use_lr_scheduler']:
    out['current_lr'] = self.lr_scheduler.get_lr()
out['loss'] = loss.item()
out['grad_norm'] = grad_norm
out['policy_loss'] = policy_loss.mean().item()
out['entropy_loss'] = entropy_loss.mean().item()
out['policy_entropy'] = -out['entropy_loss']
out['value_loss'] = value_loss.mean().item()
out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
return out
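The explained-variance diagnostic logged above measures how much of the variance in the value targets the critic captures. A small self-contained sketch, assuming `ev` follows the standard definition 1 - Var(y_true - y_pred) / Var(y_true):

import numpy as np

def explained_variance(y_true, y_pred):
    # 1.0 means a perfect value fit; 0.0 means no variance explained.
    var_y = np.var(y_true)
    return np.nan if var_y == 0 else 1.0 - np.var(y_true - y_pred) / var_y

y_true = np.array([1.0, 2.0, 3.0])
print(explained_variance(y_true, y_true))                 # 1.0: perfect fit
print(explained_variance(y_true, np.zeros_like(y_true)))  # 0.0: no variance explained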
# PPO-style value loss: elementwise maximum of the unclipped and clipped
# squared errors against the value targets old_Qs.
value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
                       F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
value_loss = value_loss.mean(0)
self.value_optimizer.zero_grad()
value_loss.backward()
value_grad_norm = nn.utils.clip_grad_norm_(self.value.parameters(), self.config['agent.max_grad_norm'])
self.value_optimizer.step()
out = {}
out['policy_grad_norm'] = policy_grad_norm
out['value_grad_norm'] = value_grad_norm
out['policy_loss'] = policy_loss.mean().item()
out['policy_entropy'] = entropies.mean().item()
out['value_loss'] = value_loss.mean().item()
out['explained_variance'] = ev(y_true=numpify(old_Qs, 'float'), y_pred=numpify(Vs, 'float'))
out['approx_kl'] = torch.mean(old_logprobs - logprobs).item()
out['clip_frac'] = ((ratio < 1.0 - eps) | (ratio > 1.0 + eps)).float().mean().item()
return out
value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
                       F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
value_loss = value_loss.mean(0)
self.value_optimizer.zero_grad()
value_loss.backward()
value_grad_norm = nn.utils.clip_grad_norm_(self.value.parameters(), self.config['agent.max_grad_norm'])
self.value_optimizer.step()
out = {}
out['policy_grad_norm'] = policy_grad_norm
out['value_grad_norm'] = value_grad_norm
out['policy_loss'] = policy_loss.item()
out['policy_entropy'] = entropies.mean().item()
out['value_loss'] = value_loss.item()
out['explained_variance'] = ev(y_true=numpify(old_Qs, 'float'), y_pred=numpify(Vs, 'float'))
out['approx_kl'] = (old_logprobs - logprobs).mean(0).item()
out['clip_frac'] = ((ratio < 1.0 - eps) | (ratio > 1.0 + eps)).float().mean(0).item()
return out
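For reference, a self-contained sketch of the clipped value loss used in the two snippets above. The construction of `clipped_Vs` is not shown in the source, so the clamp around the old value prediction is an assumption based on the common PPO-style formulation; the tensor values are toy data.

import torch
import torch.nn.functional as F

eps = 0.2
Vs = torch.tensor([1.0, 2.5, 0.3])        # current value predictions
old_Vs = torch.tensor([1.2, 2.0, 0.5])    # value predictions at rollout time
old_Qs = torch.tensor([1.1, 2.8, 0.0])    # value targets (e.g. returns)

# Assumed clipping: keep the new prediction within eps of the old one.
clipped_Vs = old_Vs + torch.clamp(Vs - old_Vs, -eps, eps)
value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
                       F.mse_loss(clipped_Vs, old_Qs, reduction='none')).mean(0)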
def vtrace(behavior_logprobs, target_logprobs, gamma, Rs, Vs, last_V, reach_terminal, clip_rho=1.0, clip_pg_rho=1.0):
    behavior_logprobs = numpify(behavior_logprobs, np.float32)
    target_logprobs = numpify(target_logprobs, np.float32)
    Rs = numpify(Rs, np.float32)
    Vs = numpify(Vs, np.float32)
    last_V = numpify(last_V, np.float32)
    assert all([item.ndim == 1 for item in [behavior_logprobs, target_logprobs, Rs, Vs]])
    assert np.isscalar(gamma)
    # Importance sampling ratios between the target and behavior policies.
    rhos = np.exp(target_logprobs - behavior_logprobs)
    clipped_rhos = np.minimum(clip_rho, rhos)
    cs = np.minimum(1.0, rhos)
    deltas = clipped_rhos*td0_error(gamma, Rs, Vs, last_V, reach_terminal)
    # Backward recursion accumulating the V-trace corrections v_s - V(x_s).
    vs_minus_V = []
    total = 0.0
    for delta_t, c_t in zip(deltas[::-1], cs[::-1]):
        total = delta_t + gamma*c_t*total
        vs_minus_V.append(total)
    vs_minus_V = np.asarray(vs_minus_V)[::-1]
    vs = vs_minus_V + Vs
    # Assumed continuation (standard V-trace): policy-gradient advantages with
    # clipped importance weights, bootstrapping the final step from last_V.
    vs_next = np.append(vs[1:], (1.0 - reach_terminal)*last_V)
    clipped_pg_rhos = np.minimum(clip_pg_rho, rhos)
    As = clipped_pg_rhos*(Rs + gamma*vs_next - Vs)
    return vs, As
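A toy call of `vtrace` on hand-made numpy arrays. `numpify` and `td0_error` are helpers from the surrounding codebase whose exact behavior is not shown, so minimal stand-ins (assumed semantics) are defined to make the sketch self-contained; the return value is likewise assumed to be the pair of V-trace value targets and policy-gradient advantages.

import numpy as np

# Assumed minimal stand-ins for the codebase helpers used by vtrace.
def numpify(x, dtype):
    return np.asarray(x, dtype=dtype)

def td0_error(gamma, Rs, Vs, last_V, reach_terminal):
    # One-step TD errors r_t + gamma*V(x_{t+1}) - V(x_t), bootstrapping from
    # last_V unless the trajectory reached a terminal state.
    Vs_next = np.append(Vs[1:], (1.0 - float(reach_terminal))*last_V)
    return Rs + gamma*Vs_next - Vs

behavior_logprobs = np.log([0.5, 0.4, 0.6])
target_logprobs = np.log([0.6, 0.3, 0.5])
Rs = np.array([1.0, 0.0, 1.0])
Vs = np.array([0.5, 0.4, 0.3])

vs, As = vtrace(behavior_logprobs, target_logprobs, gamma=0.99,
                Rs=Rs, Vs=Vs, last_V=0.2, reach_terminal=False)
print(vs)  # V-trace value targets, one per step
print(As)  # off-policy corrected advantages for the policy gradient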
def choose_action(self, x, **kwargs):
    obs = tensorify(x.observation, self.device).unsqueeze(0)
    with torch.no_grad():
        if kwargs['mode'] == 'train':
            action = numpify(self.actor(obs).sample(), 'float')
        elif kwargs['mode'] == 'eval':
            action = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
        else:
            # Mirror the other choose_action variants: fail loudly on unknown
            # modes instead of leaving `action` undefined below.
            raise NotImplementedError
    out = {}
    out['raw_action'] = action.squeeze(0)
    return out