How to use the trfl.sequence_ops module in trfl

To help you get started, we’ve selected a few trfl examples based on popular ways the library is used in public projects.


Example from fomorians-oss/pyoneer: pyoneer/rl/agents/vanilla_policy_gradient_agent_impl.py (view on GitHub)

from tensorflow.python.framework import dtypes
from tensorflow.python.ops import math_ops

from trfl import indexing_ops
from trfl import sequence_ops

# `parray_ops` refers to pyoneer's array-manipulation helpers, imported in the
# original file; `swap_time_major` transposes between [B, T] and [T, B].
def _discounted_returns(rewards, decay, weights):
    """Compute the discounted returns given the decay factor."""
    # Number of valid steps in each batch row, derived from the 0/1 weights.
    sequence_lengths = math_ops.reduce_sum(weights, axis=1)
    # Reward at the last valid step of each row seeds the backwards scan.
    bootstrap_values = indexing_ops.batched_index(
        rewards, math_ops.cast(sequence_lengths - 1, dtypes.int32))
    # scan_discounted_sum expects time-major inputs, hence the transposes.
    multi_step_returns = sequence_ops.scan_discounted_sum(
        parray_ops.swap_time_major(rewards * weights),
        parray_ops.swap_time_major(decay * weights),
        bootstrap_values,
        reverse=True,
        back_prop=False)
    return parray_ops.swap_time_major(multi_step_returns)
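
At its core, scan_discounted_sum computes a discounted sum along the time (major) axis: with reverse=True, result[t] = sequence[t] + decay[t] * result[t + 1], seeded at the final step by initial_value. The snippet below is a minimal sketch of that behaviour, assuming a TF1-style graph/session setup as in the excerpts on this page; the tensor values are made up purely for illustration.

import tensorflow as tf
from trfl import sequence_ops

# Time-major rewards and per-step discounts, shape [T, B] with T=3, B=1.
rewards = tf.constant([[1.0], [2.0], [3.0]])
discounts = tf.constant([[0.9], [0.9], [0.9]])
# Value used to seed the backwards recursion, shape [B].
bootstrap = tf.constant([10.0])

# result[t] = rewards[t] + discounts[t] * result[t + 1], with the value beyond
# the last step given by `bootstrap`.
returns = sequence_ops.scan_discounted_sum(
    rewards, discounts, bootstrap, reverse=True, back_prop=False)

with tf.Session() as sess:
    print(sess.run(returns))  # ~ [[12.52], [12.8], [12.0]]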

Example from deepmind/trfl: trfl/retrace_ops.py (view on GitHub)

  #   exp_q_t = 𝔼_π Q(x_{t+1},.)
  #   qa_t    = Q(x_t, a_t)
  # Hence:
  #   T_tm1   = (r_t + γ * exp_q_t - c_t * qa_t) + γ * c_t * T_t
  # Define:
  #   current = r_t + γ * (exp_q_t - c_t * qa_t)
  # Thus:
  #   T_tm1 = scan_discounted_sum(current, γ * c_t, reverse=True)
  args = [r_t, pcont_t, target_policy_t, c_t, q_t, a_t]
  with tf.name_scope(
      name, 'general_returns_based_off_policy_target', values=args):
    exp_q_t = tf.reduce_sum(target_policy_t * q_t, axis=2)
    qa_t = indexing_ops.batched_index(q_t, a_t)
    current = r_t + pcont_t * (exp_q_t - c_t * qa_t)
    initial_value = qa_t[-1]
    return sequence_ops.scan_discounted_sum(
        current,
        pcont_t * c_t,
        initial_value,
        reverse=True,
        back_prop=back_prop)
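
Here scan_discounted_sum does the heavy lifting for the recursion spelled out in the comment: with reverse=True and initial_value = qa_t[-1], it evaluates T_t = current_t + (γ * c_t) * T_{t+1} backwards in time, starting from the action value at the final step. A hand-rolled reference loop, shown purely for intuition with plain Python lists standing in for one batch element, would look like this:

# Reference implementation of the backwards recursion, for intuition only.
def backward_discounted_sum(current, decay, bootstrap):
    """result[t] = current[t] + decay[t] * result[t + 1], seeded by bootstrap."""
    result = [0.0] * len(current)
    carry = bootstrap  # plays the role of initial_value (qa_t[-1] above)
    for t in reversed(range(len(current))):
        carry = current[t] + decay[t] * carry
        result[t] = carry
    return result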

Example from deepmind/trfl: trfl/value_ops.py (view on GitHub)

    lambda_: an optional scalar or 2-D Tensor with shape `[T, B]`.
    name: Customises the name_scope for this op.

  Returns:
    2-D Tensor with shape `[T, B]`
  """
  values.get_shape().assert_has_rank(2)
  rewards.get_shape().assert_has_rank(2)
  pcontinues.get_shape().assert_has_rank(2)
  bootstrap_value.get_shape().assert_has_rank(1)
  scoped_values = [rewards, pcontinues, values, bootstrap_value, lambda_]
  with tf.name_scope(name, values=scoped_values):
    if lambda_ == 1:
      # This is actually equivalent to the branch below, just an optimisation
      # to avoid unnecessary work in this case:
      return sequence_ops.scan_discounted_sum(
          rewards,
          pcontinues,
          initial_value=bootstrap_value,
          reverse=True,
          back_prop=False,
          name="multistep_returns")
    else:
      v_tp1 = tf.concat(
          axis=0, values=[values[1:, :],
                          tf.expand_dims(bootstrap_value, 0)])
      # `back_prop=False` prevents gradients flowing into values and
      # bootstrap_value, which is what you want when using the bootstrapped
      # lambda-returns in an update as targets for values.
      return sequence_ops.multistep_forward_view(
          rewards,
          pcontinues,
          v_tp1,
          lambda_,
          back_prop=False)
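
In the general case the code falls back to multistep_forward_view, which, roughly speaking, mixes the one-step bootstrapped target with the longer multi-step return according to lambda_. A plain-Python reference of that kind of lambda-return recursion, shown only for intuition and not trfl's actual implementation, might look like this:

# Reference sketch of a lambda-return recursion over one batch element.
# rewards, pcontinues, values_tp1, lambdas are equal-length Python lists.
def lambda_returns(rewards, pcontinues, values_tp1, lambdas):
    result = [0.0] * len(rewards)
    # At the final step there is nothing further to mix in, so the target is
    # the one-step bootstrapped return.
    result[-1] = rewards[-1] + pcontinues[-1] * values_tp1[-1]
    for t in reversed(range(len(rewards) - 1)):
        result[t] = rewards[t] + pcontinues[t] * (
            lambdas[t] * result[t + 1] + (1 - lambdas[t]) * values_tp1[t])
    return result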

Example from fomorians-oss/pyoneer: pyoneer/rl/rollout_impl.py (view on GitHub)

from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops, gen_array_ops
from trfl import sequence_ops

# Method excerpted from the rollout class in rollout_impl.py; `parray_ops`
# is pyoneer's array-manipulation helper module.
def discounted_returns(self, decay):
    """Compute the discounted returns given the decay factor."""
    decay = ops.convert_to_tensor(decay)
    # trfl's sequence ops expect time-major [T, B] inputs, hence the transpose.
    sequence = parray_ops.swap_time_major(self.rewards)
    # Broadcast a scalar decay to the full shape of the reward sequence.
    decay = gen_array_ops.broadcast_to(decay, array_ops.shape(sequence))
    # Zero initial value: no bootstrapping beyond the end of the rollout.
    multi_step_returns = sequence_ops.scan_discounted_sum(
        sequence, decay, array_ops.zeros_like(sequence[0]),
        reverse=True, back_prop=False)
    return parray_ops.swap_time_major(multi_step_returns)
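
Because the initial value here is zeros_like(sequence[0]), this variant computes plain discounted returns with no bootstrap beyond the end of the rollout, and a scalar decay is simply broadcast across every step. A hypothetical call, assuming `rollout` is an instance of the class this method belongs to:

returns = rollout.discounted_returns(decay=0.99)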