How to use the trfl.base_ops module in trfl

To help you get started, we’ve selected a few trfl examples based on popular ways it is used in public projects.

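Most of the snippets below return `base_ops.LossOutput`. In trfl, `LossOutput` is a plain `collections.namedtuple` with two fields, `loss` and `extra`, so callers can unpack the result positionally or access the fields by name. The short sketch below illustrates just that pattern; the tensors are made-up stand-ins, and only the namedtuple itself comes from `trfl.base_ops` (TF1-style code, which is what trfl targets).

import tensorflow.compat.v1 as tf
from trfl import base_ops

# LossOutput is a namedtuple(loss, extra): `loss` is the tensor you minimize,
# `extra` carries op-specific diagnostics such as targets or TD errors.
output = base_ops.LossOutput(loss=tf.zeros([32]), extra=None)

loss, extra = output                      # positional unpacking
mean_loss = tf.reduce_mean(output.loss)   # or access the fields by name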

From the deepmind/trfl repository: trfl/policy_gradient_ops.py (view on GitHub)
"""
  flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
  with tf.name_scope(name, values=flat_policy_vars):
    # We want a value that we can minimize along with other losses, and where
    # minimizing means driving the policy towards a uniform distribution over
    # the actions. We thus scale it by negative one so that it can be simply
    # added to other losses.
    scale = tf.constant(-1.0, dtype=tf.float32)
    if scale_op:
      scale *= scale_op(policies)

    policies = nest.flatten(policies)
    entropy = tf.add_n(
        [policy.entropy() for policy in policies], name="entropy")
    loss = tf.multiply(scale, entropy, name="entropy_loss")
    return base_ops.LossOutput(loss, PolicyEntropyExtra(entropy))
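A minimal sketch of calling this op and unpacking its result. It assumes `policy_entropy_loss` takes a (nest of) distribution-like objects exposing `.entropy()` as its first argument, as the code above suggests, and uses the deprecated `tf.distributions.Categorical` as a stand-in policy to avoid extra dependencies.

import tensorflow.compat.v1 as tf
from trfl import policy_gradient_ops

tf.disable_v2_behavior()  # trfl is written against TF1-style graph mode.

# A toy categorical policy over 4 actions for a batch of 2 states.
policies = tf.distributions.Categorical(
    logits=tf.constant([[0.1, 0.2, 0.3, 0.4],
                        [1.0, 0.0, 0.0, 0.0]]))

# The op returns a base_ops.LossOutput: `loss` can simply be added to other
# losses, while `extra.entropy` is handy for logging the policy entropy.
loss, extra = policy_gradient_ops.policy_entropy_loss(policies)
entropy_summary = extra.entropy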
From the deepmind/trfl repository: trfl/retrace_ops.py (view on GitHub)
    # Targets are evaluated by using only Q values from the target network.
    # This provides fixed regression targets until the next target network
    # update.
    target = _general_off_policy_corrected_multistep_target(
        r_t, pcont_t, target_policy_t, c_t, targnet_q_t, a_t,
        not stop_targnet_gradients)

    if stop_targnet_gradients:
      target = tf.stop_gradient(target)
    # Regress Q values of the learning network towards the targets evaluated
    # by using the target network.
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
    delta = target - qa_tm1
    loss = 0.5 * tf.square(delta)

    return base_ops.LossOutput(
        loss, RetraceCoreExtra(retrace_weights=c_t, target=target))
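The core pattern in this excerpt is regressing the online network's Q-values onto a gradient-stopped target. Below is a standalone sketch of just that pattern, with small constant tensors standing in for the network outputs.

import tensorflow.compat.v1 as tf

# Stand-ins for the online network's Q(s, a) and the multistep target computed
# from the target network.
qa_tm1 = tf.constant([1.0, 2.0, 3.0])
target = tf.constant([1.5, 1.5, 3.5])

# Freeze the target so gradients flow only into the online network, then
# regress the online Q-values towards it with a squared-error loss, as above.
target = tf.stop_gradient(target)
delta = target - qa_tm1
loss = 0.5 * tf.square(delta)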
From the deepmind/trfl repository: trfl/pixel_control_ops.py (view on GitHub)
to crop the input observations before computing the pseudo-rewards.

  Returns:
    A namedtuple with fields:

    * `loss`: a tensor containing the batch of losses, shape [B].
    * `extra`: a namedtuple with fields:
        * `target`: batch of target values for `q_tm1[a_tm1]`, shape [B].
        * `td_error`: batch of temporal difference errors, shape [B].

  Raises:
    ValueError: if the shape of `action_values` is not compatible with that of
      the pseudo-rewards derived from the observations.
  """
  # Useful shapes.
  sequence_length, batch_size = base_ops.best_effort_shape(actions)
  num_actions = action_values.get_shape().as_list()[-1]
  height_width_q = action_values.get_shape().as_list()[2:-1]
  # Calculate rewards using the observations. Crop observations if appropriate.
  if crop_height_dim[0] is not None:
    h_low, h_high = crop_height_dim
    observations = observations[:, :, h_low:h_high, :]
  if crop_width_dim[0] is not None:
    w_low, w_high = crop_width_dim
    observations = observations[:, :, :, w_low:w_high]
  # Rescale observations by a constant factor.
  observations *= tf.constant(scale)
  # Compute pseudo-rewards and get their shape.
  pseudo_rewards = pixel_control_rewards(observations, cell_size)
  height_width = pseudo_rewards.get_shape().as_list()[2:]
  # Check that pseudo-rewards and Q-values are compatible in shape.
  if height_width != height_width_q:
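This excerpt also shows the other `base_ops` helper used throughout trfl: `best_effort_shape`, which extracts as much static shape information as possible and falls back to run-time values for unknown dimensions. The sketch below assumes the single-argument call form used above and that the result unpacks per dimension, as it does in the excerpt.

import tensorflow.compat.v1 as tf
from trfl import base_ops

tf.disable_v2_behavior()  # placeholders require TF1-style graph mode.

# The sequence length (20) is known statically; the batch size is not.
actions = tf.placeholder(tf.int32, shape=[20, None])

# Known dimensions come back as static values, unknown ones as run-time
# scalars, so downstream code can use both kinds interchangeably.
sequence_length, batch_size = base_ops.best_effort_shape(actions)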
From the deepmind/trfl repository: trfl/dist_value_ops.py (view on GitHub)
      # Convert logits to distribution, then find greedy policy action in
      # state s_t.
      q_t_probs = tf.nn.softmax(logits_q_t)
      pi_t = tf.argmax(q_t_selector, 1, output_type=tf.int32)
      # Compute distribution for greedy action.
      p_target_z = _slice_with_actions(q_t_probs, pi_t)

      # Project using the Cramer distance
      target = tf.stop_gradient(_l2_project(target_z, p_target_z, atoms_tm1))

    logit_qa_tm1 = _slice_with_actions(logits_q_tm1, a_tm1)

    loss = tf.nn.softmax_cross_entropy_with_logits(
        logits=logit_qa_tm1, labels=target)

    return base_ops.LossOutput(loss, Extra(target))
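As in the Retrace example, the distributional loss regresses the prediction onto a gradient-stopped target, here via softmax cross-entropy on the sliced logits. A minimal standalone sketch with made-up logits, target, and optimizer settings:

import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()  # variables and optimizers below use TF1 graph mode.

batch_size, num_atoms = 32, 51
# Stand-ins for the sliced logits of Q(s_tm1, a_tm1) and the projected target.
logit_qa_tm1 = tf.Variable(tf.random_normal([batch_size, num_atoms]))
target = tf.stop_gradient(tf.nn.softmax(tf.random_normal([batch_size, num_atoms])))

# Cross-entropy between predicted and (fixed) target distributions; gradients
# only flow into logit_qa_tm1, mirroring the excerpt above.
loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=target, logits=logit_qa_tm1)
train_op = tf.train.AdamOptimizer(1e-4).minimize(tf.reduce_mean(loss))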
From the deepmind/trfl repository: trfl/discrete_policy_gradient_ops.py (view on GitHub)
tf.reduce_sum(entropy_loss_op, axis=0),
          name="scaled_entropy_loss")  # [B].
      total_loss = tf.add(total_loss, entropy_loss,
                          name="total_loss_with_entropy")
    else:
      entropy = None
      entropy_loss = None

    extra = SequenceAdvantageActorCriticExtra(
        entropy=entropy, entropy_loss=entropy_loss,
        baseline_loss=baseline_loss,
        policy_gradient_loss=policy_gradient_loss,
        advantages=advantages,
        discounted_returns=td_lambda.discounted_returns)

    return base_ops.LossOutput(total_loss, extra)
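The excerpt assembles the actor-critic loss from its components and optionally adds a scaled entropy bonus before packaging everything into a LossOutput. A standalone sketch of that composition pattern, with made-up shapes and an illustrative entropy_cost:

import tensorflow.compat.v1 as tf

# Stand-ins for the per-batch component losses ([B]) and the per-timestep
# entropy loss ([T, B]) that the real function computes earlier.
policy_gradient_loss = tf.ones([8])
baseline_loss = tf.ones([8])
entropy_loss_op = -0.01 * tf.ones([5, 8])
entropy_cost = 0.01  # set to None to train without the entropy bonus

total_loss = tf.add(policy_gradient_loss, baseline_loss, name="total_loss")
if entropy_cost is not None:
  # Scale the time-summed entropy loss and fold it into the total, as above.
  entropy_loss = tf.multiply(
      entropy_cost, tf.reduce_sum(entropy_loss_op, axis=0),
      name="scaled_entropy_loss")  # [B].
  total_loss = tf.add(total_loss, entropy_loss, name="total_loss_with_entropy")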
From the deepmind/trfl repository: trfl/pixel_control_ops.py (view on GitHub)
[1, 1] + height_width)
    pcont_t = tf.reshape(tiled_pcont, [sequence_length, -1])
  else:
    raise ValueError(
        "The discount_factor must be a scalar or a tensor of rank 2."
        "instead is a tensor of shape {}".format(
            discount_factor.shape.as_list()))
  # Compute a QLambda loss of shape [T,BHW]
  loss, _ = action_value_ops.qlambda(q_tm1, a_tm1, r_t, pcont_t, q_t, lambda_=1)
  # Take sum over sequence, sum over cells.
  expanded_shape = [sequence_length, batch_size] + height_width
  spatial_loss = tf.reshape(loss, expanded_shape)  # [T,B,H,W].
  # Return.
  extra = PixelControlExtra(
      spatial_loss=spatial_loss, pseudo_rewards=pseudo_rewards)
  return base_ops.LossOutput(
      tf.reduce_sum(spatial_loss, axis=[0, 2, 3]), extra)  # [B]
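The final step reshapes the flattened Q(lambda) loss back to its spatial layout and sums over time and cells, leaving one loss per batch element. A tiny standalone sketch of that aggregation with made-up sizes:

import tensorflow.compat.v1 as tf

sequence_length, batch_size, height, width = 5, 4, 7, 7
# Stand-in for the flattened Q(lambda) loss of shape [T, B*H*W].
loss = tf.ones([sequence_length, batch_size * height * width])

# Reshape to [T, B, H, W], then sum over time and spatial cells so one scalar
# loss remains per batch element, shape [B].
spatial_loss = tf.reshape(loss, [sequence_length, batch_size, height, width])
per_batch_loss = tf.reduce_sum(spatial_loss, axis=[0, 2, 3])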