How to use the trfl.value_ops module in trfl

To help you get started, we've selected a few trfl.value_ops examples based on popular ways the module is used in public projects.

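All of the examples below revolve around value_ops.td_lambda, which computes a TD(lambda) baseline loss and the temporal differences that are commonly reused as advantage estimates. Before the project snippets, here is a minimal sketch of the call itself; the shapes, the TF1-style placeholder setup, and the lambda value are illustrative assumptions, not taken from any of the projects below.

from trfl import value_ops
import tensorflow as tf

# Illustrative shapes only: T timesteps per unroll, B sequences per batch.
T, B = 20, 8

# trfl's sequence ops are time-major: state values, rewards and per-step
# discounts (pcontinues) are [T, B]; the bootstrap value V(s_T) is [B].
baseline_values = tf.placeholder(tf.float32, [T, B])
rewards = tf.placeholder(tf.float32, [T, B])
pcontinues = tf.placeholder(tf.float32, [T, B])
bootstrap_value = tf.placeholder(tf.float32, [B])
lambda_ = 0.95

# td_lambda returns a namedtuple: `loss` is the per-sequence TD(lambda)
# baseline loss, shape [B]; `extra.temporal_differences` holds the TD errors,
# shape [T, B] (there is also `extra.discounted_returns`).
loss, extra = value_ops.td_lambda(
    baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

baseline_loss = tf.reduce_mean(loss)
# The TD errors double as advantage estimates for a policy-gradient term.
advantages = tf.stop_gradient(extra.temporal_differences)

Note that the sequence inputs are time-major; the pyoneer examples below keep batch-major tensors and transpose them before calling the op.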

github fomorians-oss / pyoneer / pyoneer/rl/agents/advantage_actor_critic_agent_impl.py
base_ops.assert_rank_and_shape_compatibility(
    [rewards, multi_baseline_values], 3)
multi_baseline_values = array_ops.unstack(multi_baseline_values, axis=-1)
num_values = len(multi_baseline_values)

base_shape = rewards.shape
decay = self._least_fit(decay, base_shape)
lambda_ = self._least_fit(lambda_, base_shape)
baseline_scale = self._least_fit(baseline_scale, base_shape)

# Compute a TD(lambda) baseline loss and advantage estimate per value head.
multi_advantages = []
for i in range(num_values):
    pcontinues = decay[..., i] * weights
    lambdas = lambda_[..., i] * weights
    # Bootstrap from the value estimate at each sequence's last valid step.
    bootstrap_values = indexing_ops.batched_index(
        multi_baseline_values[i],
        math_ops.cast(sequence_lengths - 1, dtypes.int32))
    # value_ops.td_lambda expects time-major [T, B] inputs.
    baseline_loss, td_lambda = value_ops.td_lambda(
        parray_ops.swap_time_major(multi_baseline_values[i]),
        parray_ops.swap_time_major(rewards[..., i]),
        parray_ops.swap_time_major(pcontinues),
        bootstrap_values,
        parray_ops.swap_time_major(lambdas))
    value_loss = pmath_ops.safe_divide(
        baseline_scale[i] * math_ops.reduce_sum(baseline_loss), total_num)
    self.value_loss.append(
        gen_array_ops.check_numerics(value_loss, 'value_loss'))
    # The TD errors serve as advantage estimates for the policy update.
    advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
    multi_advantages.append(advantages)

advantages = math_ops.add_n(multi_advantages)  # A = A[0] + A[1] + ...
if normalize_advantages:
    advantages = normalization_ops.normalize_by_moments(advantages, weights)
advantages = gen_array_ops.stop_gradient(advantages)
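
The pyoneer agent above keeps its tensors batch-major ([B, T]) and relies on two helpers: parray_ops.swap_time_major to transpose into the time-major layout value_ops.td_lambda expects, and indexing_ops.batched_index to pull out the value estimate at each sequence's last valid step as the bootstrap value. The sketch below reproduces that pattern with plain TensorFlow; the function and tensor names, and the assumption that swap_time_major is a simple transpose of the first two axes, are illustrative rather than pyoneer's actual API.

import tensorflow as tf
from trfl import indexing_ops, value_ops

def swap_time_major(t):
    # Assumed equivalent of parray_ops.swap_time_major: [B, T, ...] <-> [T, B, ...].
    perm = [1, 0] + list(range(2, t.shape.ndims))
    return tf.transpose(t, perm)

def td_lambda_batch_major(baseline_values_bt, rewards_bt, pcontinues_bt,
                          sequence_lengths, lambda_bt):
    """baseline_values_bt, rewards_bt, pcontinues_bt, lambda_bt: [B, T];
    sequence_lengths: [B]."""
    # Pick V(s) at the last valid timestep of each sequence as the bootstrap value.
    bootstrap_values = indexing_ops.batched_index(
        baseline_values_bt, tf.cast(sequence_lengths - 1, tf.int32))
    loss, extra = value_ops.td_lambda(
        swap_time_major(baseline_values_bt),
        swap_time_major(rewards_bt),
        swap_time_major(pcontinues_bt),
        bootstrap_values,
        swap_time_major(lambda_bt))
    # Convert the TD errors back to batch-major to use as advantages.
    advantages = swap_time_major(extra.temporal_differences)
    return loss, advantages
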
github deepmind / trfl / trfl/policy_gradient_ops.py
    * `extra`: a namedtuple with fields:
        * `entropy`: total policy entropy per sequence, shape `[B]`.
        * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
        * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
        * `policy_gradient_loss`: policy gradient loss per sequence,
            shape `[B]`.
        * `advantages`: advantage estimates per timestep, shape `[T, B]`.
        * `discounted_returns`: discounted returns per timestep,
            shape `[T, B]`.
  """
  flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
  scoped_values = (flat_policy_vars + nest.flatten(actions) +
                   [baseline_values, rewards, pcontinues, bootstrap_value])
  with tf.name_scope(name, values=scoped_values):
    # Loss for the baseline, summed over the time dimension.
    baseline_loss_td, td_lambda = value_ops.td_lambda(
        baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

    # The TD error provides an estimate of the advantages of the actions.
    advantages = td_lambda.temporal_differences
    baseline_loss = tf.multiply(
        tf.convert_to_tensor(baseline_cost, dtype=tf.float32),
        baseline_loss_td,
        name="baseline_loss")

    # Loss for the policy. Doesn't push additional gradients through
    # the advantages.
    pg_loss = policy_gradient_loss(
        policies, actions, advantages, policy_vars,
        name="policy_gradient_loss")

    total_loss = tf.add(pg_loss, baseline_loss, name="total_loss")
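
Inside trfl itself, the pattern is the same: td_lambda's loss is the baseline objective, its TD errors become the advantages, and gradients are stopped before the policy term. If you are not going through the higher-level op, the composition can be reproduced directly; the sketch below is a condensed version under stated assumptions (the policy_dist.log_prob call and the default cost weight are illustrative, not trfl API).

import tensorflow as tf
from trfl import value_ops

def a2c_style_losses(policy_dist, actions, baseline_values, rewards,
                     pcontinues, bootstrap_value, lambda_=1.0, baseline_cost=0.5):
    """All sequence inputs are time-major [T, B]; bootstrap_value is [B]."""
    # Per-sequence TD(lambda) baseline loss and TD errors from trfl.
    baseline_loss_td, td_extra = value_ops.td_lambda(
        baseline_values, rewards, pcontinues, bootstrap_value, lambda_)
    # Scale the baseline loss so it can be traded off against the policy term.
    baseline_loss = baseline_cost * baseline_loss_td
    # Use the TD errors as advantages without pushing gradients through them.
    advantages = tf.stop_gradient(td_extra.temporal_differences)
    # Score-function policy-gradient term, summed over the time dimension.
    pg_loss = -tf.reduce_sum(policy_dist.log_prob(actions) * advantages, axis=0)
    return pg_loss + baseline_loss, advantages
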
github fomorians-oss / pyoneer / pyoneer/rl/agents/deterministic_policy_gradient_agent_impl.py
target_policy = self.target_policy(next_states)

# Bootstrap from the target critic's value estimate at the final observed step.
bootstrap_value = gen_array_ops.reshape(
    self.target_value(next_states[:, -1:], target_policy[:, -1:]),
    [-1])

action_values = array_ops.squeeze(
    self.value(states, policy, training=True),
    axis=-1) * mask
self.policy_gradient_loss = losses_impl.compute_weighted_loss(
    -action_values, weights=weights)

lambda_ = lambda_ * weights
pcontinues = decay * weights

# value_ops.td_lambda expects time-major [T, B] inputs; `.loss` is the
# per-sequence TD(lambda) baseline loss, shape [B].
baseline_loss = value_ops.td_lambda(
    parray_ops.swap_time_major(action_values),
    parray_ops.swap_time_major(rewards),
    parray_ops.swap_time_major(pcontinues),
    gen_array_ops.stop_gradient(bootstrap_value),
    parray_ops.swap_time_major(lambda_)).loss

self.value_loss = math_ops.reduce_mean(
    baseline_loss * baseline_scale * pmath_ops.safe_divide(1., sequence_length),
    axis=0)

self.total_loss = math_ops.add_n([
    self.value_loss,
    self.policy_gradient_loss])

return self.total_loss
github fomorians-oss / pyoneer / pyoneer/rl/agents/proximal_policy_optimization_agent_impl.py
del kwargs
sequence_length = math_ops.reduce_sum(weights, axis=1)
total_num = math_ops.reduce_sum(sequence_length)

policy = self.policy(states, training=True)
behavioral_policy = self.behavioral_policy(states)
baseline_values = array_ops.squeeze(
    self.value(states, training=True),
    axis=-1) * weights

pcontinues = decay * weights
lambda_ = lambda_ * weights

# Bootstrap from the value estimate at each sequence's last valid step.
bootstrap_values = indexing_ops.batched_index(
    baseline_values, math_ops.cast(sequence_length - 1, dtypes.int32))
# value_ops.td_lambda expects time-major [T, B] inputs.
baseline_loss, td_lambda = value_ops.td_lambda(
    parray_ops.swap_time_major(baseline_values),
    parray_ops.swap_time_major(rewards),
    parray_ops.swap_time_major(pcontinues),
    bootstrap_values,
    parray_ops.swap_time_major(lambda_))

# The TD errors are the advantage estimates; normalize and stop gradients
# before they enter the clipped policy objective.
advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
advantages = normalization_ops.normalize_by_moments(advantages, weights)
advantages = gen_array_ops.stop_gradient(advantages)

# PPO-style probability ratio against the behavioral (rollout) policy.
ratio = gen_math_ops.exp(
    policy.log_prob(actions) - gen_array_ops.stop_gradient(
        behavioral_policy.log_prob(actions)))
clipped_ratio = clip_ops.clip_by_value(ratio, 1. - ratio_epsilon, 1. + ratio_epsilon)

self.policy_gradient_loss = -losses_impl.compute_weighted_loss(
github deepmind / trfl / trfl/discrete_policy_gradient_ops.py
    * `loss`: a tensor containing the total loss, shape `[B]`.
    * `extra`: a namedtuple with fields:
        * `entropy`: total policy entropy per sequence, shape `[B]`.
        * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
        * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
        * `policy_gradient_loss`: policy gradient loss per sequence,
            shape `[B]`.
        * `advantages`: advantage estimates per timestep, shape `[T, B]`.
        * `discounted_returns`: discounted returns per timestep,
            shape `[T, B]`.
  """
  scoped_values = (nest.flatten(policy_logits) + nest.flatten(actions) +
                   [baseline_values, rewards, pcontinues, bootstrap_value])
  with tf.name_scope(name, values=scoped_values):
    # Loss for the baseline, summed over the time dimension.
    baseline_loss_td, td_lambda = value_ops.td_lambda(
        baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

    # The TD error provides an estimate of the advantages of the actions.
    advantages = td_lambda.temporal_differences
    baseline_loss = tf.multiply(
        tf.convert_to_tensor(baseline_cost, dtype=tf.float32),
        baseline_loss_td,
        name="baseline_loss")

    # Loss for the policy. Doesn't push additional gradients through
    # the advantages.
    policy_gradient_loss = discrete_policy_gradient_loss(
        policy_logits, actions, advantages, name="policy_gradient_loss")

    total_loss = tf.add(policy_gradient_loss, baseline_loss, name="total_loss")
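
The snippet above is the body of trfl's discrete-action sequence advantage actor-critic loss, which wraps value_ops.td_lambda together with the policy-gradient and entropy terms. A usage sketch follows; the entry-point name sequence_advantage_actor_critic_loss, the keyword arguments, and all shapes and values are assumptions inferred from the code and docstring above, so check them against the trfl version you have installed.

import tensorflow as tf
import trfl

T, B, num_actions = 20, 8, 4   # illustrative unroll length, batch size, actions

policy_logits = tf.random.normal([T, B, num_actions])
baseline_values = tf.random.normal([T, B])
actions = tf.random.uniform([T, B], maxval=num_actions, dtype=tf.int32)
rewards = tf.random.normal([T, B])
pcontinues = 0.99 * tf.ones([T, B])      # per-step discounts
bootstrap_value = tf.zeros([B])          # V(s_T) from the critic

# Assumed entry point (see note above); returns (loss, extra) as documented.
loss, extra = trfl.sequence_advantage_actor_critic_loss(
    policy_logits, baseline_values, actions, rewards, pcontinues,
    bootstrap_value, lambda_=0.9, baseline_cost=0.5)

train_loss = tf.reduce_mean(loss)  # `loss` is per-sequence, shape [B]
# Per the docstring: extra.advantages and extra.discounted_returns are [T, B];
# extra.baseline_loss and extra.policy_gradient_loss are [B].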