How to use the lagom.utils.numpify function in lagom

To help you get started, we've selected a few examples of how lagom.utils.numpify is used in public projects.

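Judging from the calls in the examples below, numpify(x, dtype) converts a torch.Tensor (or an array-like) into a NumPy array of the requested dtype; the dtype can be given as a string like 'float' or as a NumPy dtype such as np.float32. Here is a minimal sketch of a call, assuming numpify is importable from lagom.utils as the title suggests:

import torch
from lagom.utils import numpify  # import path taken from the page title

t = torch.randn(3, 1)       # any tensor, e.g. a small batch of value predictions
a = numpify(t, 'float')     # convert to a NumPy array with a float dtype
print(type(a), a.shape)     # <class 'numpy.ndarray'> (3, 1)
print(a.squeeze())          # the examples below frequently .squeeze() the result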

github.com/zuoxingdong/lagom · baselines/sac/agent.py
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
        out['Q1'] = describe_it(Q1_vals)

github.com/zuoxingdong/lagom · baselines/sac/logs/default/source_files/agent.py
    def choose_action(self, obs, **kwargs):
        obs = tensorify(obs, self.device)
        out = {}
        if kwargs['mode'] == 'train':
            dist = self.actor(obs)
            action = dist.rsample()
            out['action'] = action
            out['action_logprob'] = dist.log_prob(action)
        elif kwargs['mode'] == 'stochastic':
            with torch.no_grad():
                out['action'] = numpify(self.actor(obs).sample(), 'float')
        elif kwargs['mode'] == 'eval':
            with torch.no_grad():
                out['action'] = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
        else:
            raise NotImplementedError
        return out
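
In choose_action above, tensorify moves the observation onto the agent's device and numpify brings the sampled action back into NumPy for logging or stepping an environment. A rough sketch of that round trip, assuming tensorify is exported from lagom.utils alongside numpify (the excerpt does not show its imports):

import numpy as np
import torch
from lagom.utils import numpify, tensorify  # tensorify location is an assumption

obs = np.random.randn(4).astype(np.float32)
obs_t = tensorify(obs, torch.device('cpu'))  # NumPy -> torch.Tensor on the given device
action = torch.tanh(obs_t)                   # stand-in for sampling from self.actor(obs)
action_np = numpify(action, 'float')         # torch.Tensor -> NumPy array
assert isinstance(action_np, np.ndarray)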

github.com/zuoxingdong/lagom · baselines/vpg/agent_lstm.py
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)
        self.total_timestep += sum([traj.T for traj in D])
        
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -out['entropy_loss']
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
        return out

github.com/zuoxingdong/lagom · baselines/ppo/logs/default/source_files/agent.py
        value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
                               F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
        value_loss = value_loss.mean(0)
        
        self.value_optimizer.zero_grad()
        value_loss.backward()
        value_grad_norm = nn.utils.clip_grad_norm_(self.value.parameters(), self.config['agent.max_grad_norm'])
        self.value_optimizer.step()
        
        out = {}
        out['policy_grad_norm'] = policy_grad_norm
        out['value_grad_norm'] = value_grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['policy_entropy'] = entropies.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['explained_variance'] = ev(y_true=numpify(old_Qs, 'float'), y_pred=numpify(Vs, 'float'))
        out['approx_kl'] = torch.mean(old_logprobs - logprobs).item()
        out['clip_frac'] = ((ratio < 1.0 - eps) | (ratio > 1.0 + eps)).float().mean().item()
        return out

github.com/zuoxingdong/lagom · baselines/ppo/agent.py
        value_loss = torch.max(F.mse_loss(Vs, old_Qs, reduction='none'),
                               F.mse_loss(clipped_Vs, old_Qs, reduction='none'))
        value_loss = value_loss.mean(0)
        
        self.value_optimizer.zero_grad()
        value_loss.backward()
        value_grad_norm = nn.utils.clip_grad_norm_(self.value.parameters(), self.config['agent.max_grad_norm'])
        self.value_optimizer.step()
        
        out = {}
        out['policy_grad_norm'] = policy_grad_norm
        out['value_grad_norm'] = value_grad_norm
        out['policy_loss'] = policy_loss.item()
        out['policy_entropy'] = entropies.mean().item()
        out['value_loss'] = value_loss.item()
        out['explained_variance'] = ev(y_true=numpify(old_Qs, 'float'), y_pred=numpify(Vs, 'float'))
        out['approx_kl'] = (old_logprobs - logprobs).mean(0).item()
        out['clip_frac'] = ((ratio < 1.0 - eps) | (ratio > 1.0 + eps)).float().mean(0).item()
        return out

github.com/zuoxingdong/lagom · lagom/rl/vtrace.py
def vtrace(behavior_logprobs, target_logprobs, gamma, Rs, Vs, last_V, reach_terminal, clip_rho=1.0, clip_pg_rho=1.0):
    behavior_logprobs = numpify(behavior_logprobs, np.float32)
    target_logprobs = numpify(target_logprobs, np.float32)
    Rs = numpify(Rs, np.float32)
    Vs = numpify(Vs, np.float32)
    last_V = numpify(last_V, np.float32)
    assert all([item.ndim == 1 for item in [behavior_logprobs, target_logprobs, Rs, Vs]])
    assert np.isscalar(gamma)

    rhos = np.exp(target_logprobs - behavior_logprobs)
    clipped_rhos = np.minimum(clip_rho, rhos)
    cs = np.minimum(1.0, rhos)
    deltas = clipped_rhos*td0_error(gamma, Rs, Vs, last_V, reach_terminal)

    vs_minus_V = []
    total = 0.0
    for delta_t, c_t in zip(deltas[::-1], cs[::-1]):
        total = delta_t + gamma*c_t*total
        vs_minus_V.append(total)
    vs_minus_V = np.asarray(vs_minus_V)[::-1]

    vs = vs_minus_V + Vs
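
Because every argument is passed through numpify up front, vtrace appears to accept torch tensors, NumPy arrays, or plain Python lists interchangeably. The excerpt is cut off before the return statement, so the sketch below only shows a hypothetical call with made-up values; the import path is inferred from the file location:

import numpy as np
from lagom.rl.vtrace import vtrace  # inferred from lagom/rl/vtrace.py

behavior_logprobs = np.log([0.5, 0.4, 0.6])
target_logprobs = np.log([0.6, 0.5, 0.5])
out = vtrace(behavior_logprobs, target_logprobs, gamma=0.99,
             Rs=[1.0, 0.0, 1.0], Vs=[0.3, 0.2, 0.4],
             last_V=0.0, reach_terminal=True)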

github.com/zuoxingdong/lagom · baselines/sac/agent.py
    def choose_action(self, x, **kwargs):
        obs = tensorify(x.observation, self.device).unsqueeze(0)
        with torch.no_grad():
            if kwargs['mode'] == 'train':
                action = numpify(self.actor(obs).sample(), 'float')
            elif kwargs['mode'] == 'eval':
                action = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
        out = {}
        out['raw_action'] = action.squeeze(0)
        return out