How to use trfl - 10 common examples

To help you get started, we’ve selected a few trfl examples based on popular ways it is used in public projects.

github fomorians-oss / pyoneer / pyoneer / rl / agents / advantage_actor_critic_agent_impl.py (View on Github)
        multi_baseline_values = self.value(states, training=True) * array_ops.expand_dims(weights, axis=-1)

        base_ops.assert_rank_and_shape_compatibility(
            [rewards, multi_baseline_values], 3)
        multi_baseline_values = array_ops.unstack(multi_baseline_values, axis=-1)
        num_values = len(multi_baseline_values)

        base_shape = rewards.shape
        decay = self._least_fit(decay, base_shape)
        lambda_ = self._least_fit(lambda_, base_shape)
        baseline_scale = self._least_fit(baseline_scale, base_shape)

        for i in range(num_values):
            pcontinues = decay[..., i] * weights
            lambdas = lambda_[..., i] * weights
            bootstrap_values = indexing_ops.batched_index(
                multi_baseline_values[i], math_ops.cast(sequence_lengths - 1, dtypes.int32))
            baseline_loss, td_lambda = value_ops.td_lambda(
                parray_ops.swap_time_major(multi_baseline_values[i]), 
                parray_ops.swap_time_major(rewards[..., i]), 
                parray_ops.swap_time_major(pcontinues), 
                bootstrap_values, 
                parray_ops.swap_time_major(lambdas))
            value_loss = pmath_ops.safe_divide(
                baseline_scale[i] * math_ops.reduce_sum(baseline_loss), total_num)
            self.value_loss.append(
                gen_array_ops.check_numerics(value_loss, 'value_loss'))
            advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
            multi_advantages.append(advantages)

        advantages = math_ops.add_n(multi_advantages) # A = A[0] + A[1] + ...
        if normalize_advantages:
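
The call to indexing_ops.batched_index above is what pulls one bootstrap value per sequence out of a [B, T] baseline tensor. A minimal standalone sketch of the same call through trfl's public API; the tensor values and the 0/1 weights mask are made up for illustration:

import tensorflow as tf
import trfl

# Baseline values for 2 sequences of length 4: shape [B, T].
baseline_values = tf.constant([[0.1, 0.2, 0.3, 0.4],
                               [0.5, 0.6, 0.7, 0.8]])
# 0/1 mask of valid steps; sequence lengths are its row sums, as in the snippet.
weights = tf.constant([[1., 1., 1., 0.],
                       [1., 1., 1., 1.]])
sequence_lengths = tf.reduce_sum(weights, axis=1)

# batched_index gathers values[b, indices[b]] for each batch element, here the
# last valid baseline value of every sequence, producing a [B] tensor.
bootstrap_values = trfl.batched_index(
    baseline_values, tf.cast(sequence_lengths - 1, tf.int32))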
github fomorians-oss / pyoneer / pyoneer / rl / agents / advantage_actor_critic_agent_impl.py (View on Github)
        base_ops.assert_rank_and_shape_compatibility(
            [rewards, multi_baseline_values], 3)
        multi_baseline_values = array_ops.unstack(multi_baseline_values, axis=-1)
        num_values = len(multi_baseline_values)

        base_shape = rewards.shape
        decay = self._least_fit(decay, base_shape)
        lambda_ = self._least_fit(lambda_, base_shape)
        baseline_scale = self._least_fit(baseline_scale, base_shape)

        for i in range(num_values):
            pcontinues = decay[..., i] * weights
            lambdas = lambda_[..., i] * weights
            bootstrap_values = indexing_ops.batched_index(
                multi_baseline_values[i], math_ops.cast(sequence_lengths - 1, dtypes.int32))
            baseline_loss, td_lambda = value_ops.td_lambda(
                parray_ops.swap_time_major(multi_baseline_values[i]), 
                parray_ops.swap_time_major(rewards[..., i]), 
                parray_ops.swap_time_major(pcontinues), 
                bootstrap_values, 
                parray_ops.swap_time_major(lambdas))
            value_loss = pmath_ops.safe_divide(
                baseline_scale[i] * math_ops.reduce_sum(baseline_loss), total_num)
            self.value_loss.append(
                gen_array_ops.check_numerics(value_loss, 'value_loss'))
            advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
            multi_advantages.append(advantages)

        advantages = math_ops.add_n(multi_advantages) # A = A[0] + A[1] + ...
        if normalize_advantages:
            advantages = normalization_ops.normalize_by_moments(advantages, weights)
        advantages = gen_array_ops.stop_gradient(advantages)
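
Both of the windows above ultimately wrap trfl's td_lambda, which expects time-major [T, B] inputs (hence the swap_time_major calls). A minimal sketch of calling trfl.td_lambda directly; the shapes, the 0.99 discount and the 0.95 lambda are illustrative only:

import tensorflow as tf
import trfl

T, B = 4, 2                                 # time steps and batch size (dummy)
state_values = tf.random.uniform([T, B])    # time-major baseline estimates
rewards = tf.random.uniform([T, B])
pcontinues = 0.99 * tf.ones([T, B])         # per-step discounts
bootstrap_value = tf.random.uniform([B])    # value estimate after the last step

# Returns a (loss, extra) namedtuple: `loss` is the 0.5 * squared TD(lambda)
# error, and `extra.temporal_differences` holds the TD errors that the snippet
# reuses as advantages.
loss, extra = trfl.td_lambda(
    state_values, rewards, pcontinues, bootstrap_value, lambda_=0.95)
advantages = extra.temporal_differences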
github fomorians-oss / pyoneer / pyoneer / rl / agents / vanilla_policy_gradient_agent_impl.py (View on Github)
        returns = _discounted_returns(rewards, decay, weights)
        self.value.fit(states, returns)

        action_values = (returns - array_ops.squeeze(self.value(states, training=True), axis=-1))
        action_values *= weights
        if normalize_action_values:
            action_values = normalization_ops.weighted_moments_normalize(action_values, weights)

        policy = self.policy(states, training=True)
        log_prob = policy.log_prob(actions)
        policy_gradient_loss = gen_array_ops.stop_gradient(action_values) * -log_prob
        self.policy_gradient_loss = losses_impl.compute_weighted_loss(
            policy_gradient_loss,
            weights=weights)

        entropy_loss = policy_gradient_ops.policy_entropy_loss(
            policy, 
            self.policy.trainable_variables,
            lambda policies: entropy_scale).loss
        self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
            entropy_loss,
            weights=weights)

        self.total_loss = math_ops.add_n([
            self.policy_gradient_loss, 
            self.policy_gradient_entropy_loss])

        return self.total_loss
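
The hand-rolled term above, stop_gradient(action_values) * -log_prob, is essentially what trfl's policy_gradient op provides. A minimal sketch, assuming a tensorflow_probability Normal distribution as the policy and arbitrary shapes and values:

import tensorflow as tf
import tensorflow_probability as tfp
import trfl

B = 3                                        # dummy batch size
policy = tfp.distributions.Normal(loc=tf.zeros([B]), scale=tf.ones([B]))
actions = tf.random.uniform([B])
action_values = tf.random.uniform([B])       # e.g. returns minus a baseline

# trfl.policy_gradient returns -log_prob(actions) * action_values, with a
# stop_gradient applied to the action values, i.e. the REINFORCE term that the
# snippet above assembles by hand.
pg_loss = trfl.policy_gradient(policy, actions, action_values)
loss = tf.reduce_mean(pg_loss)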
github fomorians-oss / pyoneer / pyoneer / rl / agents / advantage_actor_critic_agent_impl.py (View on Github)
            baseline_scale: scalar or Tensor of shape `[B, T]` containing the baseline loss scale.
            **kwargs: keyword arguments (unused)

        Returns:
            the total loss Tensor of shape [].
        """
        del kwargs
        base_ops.assert_rank_and_shape_compatibility([weights], 2)
        sequence_lengths = math_ops.reduce_sum(weights, axis=1)
        total_num = math_ops.reduce_sum(sequence_lengths)

        multi_advantages = []
        self.value_loss = []
        multi_baseline_values = self.value(states, training=True) * array_ops.expand_dims(weights, axis=-1)

        base_ops.assert_rank_and_shape_compatibility(
            [rewards, multi_baseline_values], 3)
        multi_baseline_values = array_ops.unstack(multi_baseline_values, axis=-1)
        num_values = len(multi_baseline_values)

        base_shape = rewards.shape
        decay = self._least_fit(decay, base_shape)
        lambda_ = self._least_fit(lambda_, base_shape)
        baseline_scale = self._least_fit(baseline_scale, base_shape)

        for i in range(num_values):
            pcontinues = decay[..., i] * weights
            lambdas = lambda_[..., i] * weights
            bootstrap_values = indexing_ops.batched_index(
                multi_baseline_values[i], math_ops.cast(sequence_lengths - 1, dtypes.int32))
            baseline_loss, td_lambda = value_ops.td_lambda(
                parray_ops.swap_time_major(multi_baseline_values[i]), 
github fomorians-oss / pyoneer / pyoneer / rl / agents / advantage_actor_critic_agent_impl.py (View on Github)
            actions: Tensor of `[B, T, ...]` containing actions.
            rewards: Tensor of `[B, T, V]` containing rewards.
            weights: Tensor of shape `[B, T]` containing weights (1. or 0.).
            decay: scalar, 1-D Tensor of shape [V], or Tensor of shape 
                `[B, T]` or `[B, T, V]` containing decays/discounts.
            lambda_: scalar, 1-D Tensor of shape [V], or Tensor of shape 
                `[B, T]` or `[B, T, V]` containing generalized lambda parameter.
            entropy_scale: scalar or Tensor of shape `[B, T]` containing the entropy loss scale.
            baseline_scale: scalar or Tensor of shape `[B, T]` containing the baseline loss scale.
            **kwargs: keyword arguments (unused)

        Returns:
            the total loss Tensor of shape [].
        """
        del kwargs
        base_ops.assert_rank_and_shape_compatibility([weights], 2)
        sequence_lengths = math_ops.reduce_sum(weights, axis=1)
        total_num = math_ops.reduce_sum(sequence_lengths)

        multi_advantages = []
        self.value_loss = []
        multi_baseline_values = self.value(states, training=True) * array_ops.expand_dims(weights, axis=-1)

        base_ops.assert_rank_and_shape_compatibility(
            [rewards, multi_baseline_values], 3)
        multi_baseline_values = array_ops.unstack(multi_baseline_values, axis=-1)
        num_values = len(multi_baseline_values)

        base_shape = rewards.shape
        decay = self._least_fit(decay, base_shape)
        lambda_ = self._least_fit(lambda_, base_shape)
        baseline_scale = self._least_fit(baseline_scale, base_shape)
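
The docstring describes rewards carrying an extra value-head dimension V, which the loop above splits into per-head [B, T] slices. A small sketch of that reshaping with plain TensorFlow; the sizes and discounts are made up, and pyoneer's _least_fit broadcasting helper is not reproduced here:

import tensorflow as tf

B, T, V = 2, 4, 3                        # dummy batch, time and value-head sizes
rewards = tf.random.uniform([B, T, V])
weights = tf.ones([B, T])                # 0/1 mask over valid steps
decay = tf.constant([0.99, 0.99, 0.9])   # one discount per value head, shape [V]

# One [B, T] reward tensor per value head, as unstack(axis=-1) produces above.
per_head_rewards = tf.unstack(rewards, axis=-1)

# Per-head continuation probabilities: the head's discount masked by weights,
# mirroring `pcontinues = decay[..., i] * weights` in the loop.
per_head_pcontinues = [decay[i] * weights for i in range(V)]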
github deepmind / trfl / trfl / action_value_ops.py (View on Github)
  # SARSALambda op.
  with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, a_t]):

    # Select head to update and build target.
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
    qa_t = indexing_ops.batched_index(q_t, a_t)
    target = sequence_ops.multistep_forward_view(
        r_t, pcont_t, qa_t, lambda_, back_prop=False)
    target = tf.stop_gradient(target)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
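
The block above is the core of trfl.sarsa_lambda. Calling it from user code looks roughly like this; the sequence length, batch size, action count and lambda value are all placeholders:

import tensorflow as tf
import trfl

T, B, num_actions = 5, 2, 4              # dummy sequence/batch/action sizes
q_tm1 = tf.random.uniform([T, B, num_actions])   # Q-values at times t-1
a_tm1 = tf.random.uniform([T, B], maxval=num_actions, dtype=tf.int32)
r_t = tf.random.uniform([T, B])
pcont_t = 0.99 * tf.ones([T, B])         # per-step discounts
q_t = tf.random.uniform([T, B, num_actions])     # Q-values at times t
a_t = tf.random.uniform([T, B], maxval=num_actions, dtype=tf.int32)

# `loss` is the 0.5 * squared TD error built above; `extra` carries the
# multistep target and the TD error.
loss, extra = trfl.sarsa_lambda(q_tm1, a_tm1, r_t, pcont_t, q_t, a_t,
                                lambda_=0.9)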
github deepmind / trfl / trfl / dpg_ops.py (View on Github)
    if dqda_clipping is not None:
      if dqda_clipping <= 0:
        raise ValueError("dqda_clipping should be bigger than 0, {} found"
                         .format(dqda_clipping))
      if clip_norm:
        dqda = tf.clip_by_norm(dqda, dqda_clipping, axes=-1)
      else:
        dqda = tf.clip_by_value(dqda, -1. * dqda_clipping, dqda_clipping)

    # Target_a ensures the correct gradient is calculated during backprop.
    target_a = dqda + a_max
    # Stop the gradient from flowing back through the Q network.
    target_a = tf.stop_gradient(target_a)
    # The gradient only flows through the actor network.
    loss = 0.5 * tf.reduce_sum(tf.square(target_a - a_max), axis=-1)
    return base_ops.LossOutput(
        loss, DPGExtra(q_max=q_max, a_max=a_max, dqda=dqda))
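
This is the tail of trfl.dpg. To use it, q_max must be the critic evaluated at the actor's own output a_max, so that gradients of q_max with respect to a_max exist. A TF1-style graph-mode sketch with toy linear actor and critic weights (all sizes and values are made up for illustration):

import tensorflow as tf
import trfl

B, state_dim, action_dim = 2, 5, 3       # dummy sizes
states = tf.random.uniform([B, state_dim])

actor_w = tf.Variable(tf.random.uniform([state_dim, action_dim]))
a_max = tf.matmul(states, actor_w)       # toy actor output, shape [B, action_dim]

critic_w = tf.Variable(tf.random.uniform([action_dim, 1]))
q_max = tf.matmul(a_max, critic_w)       # toy critic at a_max, shape [B, 1]

# `loss` pushes the actor up the critic's gradient; dqda_clipping bounds dQ/da.
loss, extra = trfl.dpg(q_max, a_max, dqda_clipping=1.0)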
github deepmind / trfl / trfl / discrete_policy_gradient_ops.py (View on Github)
        tf.reduce_sum(
            -tf.nn.softmax(scalar_policy_logits)
            * tf.nn.log_softmax(scalar_policy_logits), axis=-1)
        for scalar_policy_logits in policy_logits], name="entropy")
    # We want a value that we can minimize along with other losses, and where
    # minimizing means driving the policy towards a uniform distribution over
    # the actions. We thus scale it by negative one so that it can be simply
    # added to other losses.
    scale = tf.constant(-1.0, dtype=tf.float32)
    if normalise:
      num_actions = [tf.to_float(tf.shape(scalar_policy_logits)[-1])
                     for scalar_policy_logits in policy_logits]
      scale /= tf.reduce_sum(tf.log(tf.stack(num_actions)))
    loss = tf.multiply(scale, entropy, name="entropy_loss")

  return base_ops.LossOutput(loss, DiscretePolicyEntropyExtra(entropy))
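
The snippet is taken from inside trfl.discrete_policy_entropy_loss. Calling it on a batch of logits is a one-liner; the logits here are random placeholders:

import tensorflow as tf
import trfl

policy_logits = tf.random.uniform([2, 4])   # [B, num_actions], dummy values

# `loss` is the entropy scaled by -1 (and, with normalise=True, divided by
# log(num_actions)), ready to add to a policy-gradient loss; `extra.entropy`
# is the raw per-row entropy.
loss, extra = trfl.discrete_policy_entropy_loss(policy_logits, normalise=True)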
github deepmind / trfl / trfl / action_value_ops.py (View on Github)
  # double Q-learning op.
  with tf.name_scope(
      name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector]):

    # Build target and select head to update.
    best_action = tf.argmax(q_t_selector, 1, output_type=tf.int32)
    double_q_bootstrapped = indexing_ops.batched_index(q_t_value, best_action)
    target = tf.stop_gradient(r_t + pcont_t * double_q_bootstrapped)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(
        loss, DoubleQExtra(target, td_error, best_action))
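
The op above is exposed as trfl.double_qlearning. A minimal one-step sketch; all tensors are random stand-ins for real network outputs:

import tensorflow as tf
import trfl

B, num_actions = 2, 4                          # dummy sizes
q_tm1 = tf.random.uniform([B, num_actions])    # online Q-values at time t-1
a_tm1 = tf.constant([0, 3])                    # actions actually taken, shape [B]
r_t = tf.random.uniform([B])
pcont_t = 0.99 * tf.ones([B])                  # discounts
q_t_value = tf.random.uniform([B, num_actions])     # target-network Q-values
q_t_selector = tf.random.uniform([B, num_actions])  # online Q-values for argmax

# `extra` carries the bootstrapped target, the TD error and the greedy action
# chosen from q_t_selector.
loss, extra = trfl.double_qlearning(q_tm1, a_tm1, r_t, pcont_t,
                                    q_t_value, q_t_selector)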
github deepmind / trfl / trfl / policy_gradient_ops.py (View on Github)
  """
  flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
  with tf.name_scope(name, values=flat_policy_vars):
    # We want a value that we can minimize along with other losses, and where
    # minimizing means driving the policy towards a uniform distribution over
    # the actions. We thus scale it by negative one so that it can be simply
    # added to other losses.
    scale = tf.constant(-1.0, dtype=tf.float32)
    if scale_op:
      scale *= scale_op(policies)

    policies = nest.flatten(policies)
    entropy = tf.add_n(
        [policy.entropy() for policy in policies], name="entropy")
    loss = tf.multiply(scale, entropy, name="entropy_loss")
    return base_ops.LossOutput(loss, PolicyEntropyExtra(entropy))
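
Called from user code, trfl.policy_entropy_loss takes one or more distribution objects (for example from tensorflow_probability) plus an optional scale_op; the batch size and the 0.01 entropy scale below are arbitrary:

import tensorflow as tf
import tensorflow_probability as tfp
import trfl

B = 3                                    # dummy batch size
policy = tfp.distributions.Normal(loc=tf.zeros([B]), scale=tf.ones([B]))

# scale_op receives the policies and returns the entropy cost multiplier, as
# in the pyoneer snippet earlier (`lambda policies: entropy_scale`).
loss, extra = trfl.policy_entropy_loss(policy, scale_op=lambda policies: 0.01)

# `loss` is -0.01 * entropy, so minimising it encourages a higher-entropy
# policy; `extra.entropy` holds the raw entropy, shape [B].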