multi_baseline_values = self.value(states, training=True) * array_ops.expand_dims(weights, axis=-1)
base_ops.assert_rank_and_shape_compatibility(
    [rewards, multi_baseline_values], 3)
multi_baseline_values = array_ops.unstack(multi_baseline_values, axis=-1)
num_values = len(multi_baseline_values)
base_shape = rewards.shape
decay = self._least_fit(decay, base_shape)
lambda_ = self._least_fit(lambda_, base_shape)
baseline_scale = self._least_fit(baseline_scale, base_shape)
for i in range(num_values):
    pcontinues = decay[..., i] * weights
    lambdas = lambda_[..., i] * weights
    bootstrap_values = indexing_ops.batched_index(
        multi_baseline_values[i], math_ops.cast(sequence_lengths - 1, dtypes.int32))
    baseline_loss, td_lambda = value_ops.td_lambda(
        parray_ops.swap_time_major(multi_baseline_values[i]),
        parray_ops.swap_time_major(rewards[..., i]),
        parray_ops.swap_time_major(pcontinues),
        bootstrap_values,
        parray_ops.swap_time_major(lambdas))
    value_loss = pmath_ops.safe_divide(
        baseline_scale[i] * math_ops.reduce_sum(baseline_loss), total_num)
    self.value_loss.append(
        gen_array_ops.check_numerics(value_loss, 'value_loss'))
    advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
    multi_advantages.append(advantages)
advantages = math_ops.add_n(multi_advantages)  # A = A[0] + A[1] + ...
if normalize_advantages:
    advantages = normalization_ops.normalize_by_moments(advantages, weights)
advantages = gen_array_ops.stop_gradient(advantages)
returns = _discounted_returns(rewards, decay, weights)
self.value.fit(states, returns)
action_values = (returns - array_ops.squeeze(self.value(states, training=True), axis=-1))
action_values *= weights
if normalize_action_values:
    action_values = normalization_ops.weighted_moments_normalize(action_values, weights)
policy = self.policy(states, training=True)
log_prob = policy.log_prob(actions)
policy_gradient_loss = gen_array_ops.stop_gradient(action_values) * -log_prob
self.policy_gradient_loss = losses_impl.compute_weighted_loss(
    policy_gradient_loss,
    weights=weights)
entropy_loss = policy_gradient_ops.policy_entropy_loss(
    policy,
    self.policy.trainable_variables,
    lambda policies: entropy_scale).loss
self.policy_gradient_entropy_loss = losses_impl.compute_weighted_loss(
    entropy_loss,
    weights=weights)
self.total_loss = math_ops.add_n([
    self.policy_gradient_loss,
    self.policy_gradient_entropy_loss])
return self.total_loss
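Each iteration of the loop above delegates to a trfl-style `td_lambda` op, which returns both the 0.5-squared-error baseline loss and the lambda temporal differences that get reused as advantages. A minimal single-head sketch of that step, assuming trfl's public `td_lambda` with time-major `[T, B]` inputs (the shapes and extra-field names below are assumptions based on that API, not taken from this file):

import tensorflow as tf
import trfl

T, B = 5, 4                                  # illustrative sequence length and batch size
state_values = tf.random_uniform([T, B])     # baseline V(s_t), time-major
rewards = tf.random_uniform([T, B])
pcontinues = tf.fill([T, B], 0.99)           # per-step discount times continuation mask
bootstrap_value = tf.random_uniform([B])     # V(s_T) used to bootstrap the return

# LossOutput: .loss is the per-sequence baseline loss, .extra carries the
# lambda temporal differences used as advantages for the policy-gradient term.
loss_output = trfl.td_lambda(state_values, rewards, pcontinues, bootstrap_value, lambda_=0.95)
baseline_loss = loss_output.loss
advantages = tf.stop_gradient(loss_output.extra.temporal_differences)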
actions: Tensor of shape `[B, T, ...]` containing actions.
rewards: Tensor of shape `[B, T, V]` containing rewards.
weights: Tensor of shape `[B, T]` containing weights (1. or 0.).
decay: scalar, 1-D Tensor of shape `[V]`, or Tensor of shape
    `[B, T]` or `[B, T, V]` containing decays/discounts.
lambda_: scalar, 1-D Tensor of shape `[V]`, or Tensor of shape
    `[B, T]` or `[B, T, V]` containing the generalized lambda parameter.
entropy_scale: scalar or Tensor of shape `[B, T]` containing the entropy loss scale.
baseline_scale: scalar or Tensor of shape `[B, T]` containing the baseline loss scale.
**kwargs: keyword arguments (unused).
Returns:
    the total loss Tensor of shape `[]`.
"""
del kwargs
base_ops.assert_rank_and_shape_compatibility([weights], 2)
sequence_lengths = math_ops.reduce_sum(weights, axis=1)
total_num = math_ops.reduce_sum(sequence_lengths)
multi_advantages = []
self.value_loss = []
multi_baseline_values = self.value(states, training=True) * array_ops.expand_dims(weights, axis=-1)
base_ops.assert_rank_and_shape_compatibility(
    [rewards, multi_baseline_values], 3)
multi_baseline_values = array_ops.unstack(multi_baseline_values, axis=-1)
num_values = len(multi_baseline_values)
base_shape = rewards.shape
decay = self._least_fit(decay, base_shape)
lambda_ = self._least_fit(lambda_, base_shape)
baseline_scale = self._least_fit(baseline_scale, base_shape)
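The docstring above lets `decay` and `lambda_` arrive as a scalar, a `[V]` vector, or a full `[B, T]` / `[B, T, V]` Tensor; `self._least_fit` is not shown in this snippet, but it presumably broadcasts whatever was passed up to the `[B, T, V]` reward shape so that `decay[..., i]` is well defined inside the per-head loop. A hypothetical sketch of that kind of broadcasting (the helper name and behaviour are illustrative assumptions, not the library's implementation):

import tensorflow as tf

def broadcast_to_rewards(param, rewards):
    """Illustration only: expand a scalar, [V], [B, T] or [B, T, V] param to [B, T, V]."""
    param = tf.convert_to_tensor(param, dtype=rewards.dtype)
    if param.shape.ndims == 2:           # [B, T]: reuse the same value for every value head
        param = param[..., tf.newaxis]   # -> [B, T, 1]
    # Scalars, [V], [B, T, 1] and [B, T, V] all broadcast cleanly against [B, T, V].
    return tf.zeros_like(rewards) + param

With `rewards` of shape `[B, T, V]`, `broadcast_to_rewards(0.99, rewards)[..., i]` then yields the per-head `[B, T]` discount consumed by the loop.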
# SARSALambda op.
with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, a_t]):
    # Select head to update and build target.
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
    qa_t = indexing_ops.batched_index(q_t, a_t)
    target = sequence_ops.multistep_forward_view(
        r_t, pcont_t, qa_t, lambda_, back_prop=False)
    target = tf.stop_gradient(target)
    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(loss, QExtra(target, td_error))
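The block above is the heart of a SARSA(lambda) loss: `multistep_forward_view` builds the lambda-weighted multistep return, which is then regressed against the current action value with a 0.5-squared-error loss. A hedged usage sketch, assuming the op is exposed as `trfl.sarsa_lambda` over time-major `[T, B, ...]` sequences (the shapes here are assumptions):

import tensorflow as tf
import trfl

T, B, num_actions = 5, 4, 3                      # illustrative sizes
q_tm1 = tf.random_uniform([T, B, num_actions])   # Q-values at time t-1
a_tm1 = tf.zeros([T, B], dtype=tf.int32)         # actions taken at t-1
r_t = tf.random_uniform([T, B])                  # rewards
pcont_t = tf.fill([T, B], 0.99)                  # discount times (not done)
q_t = tf.random_uniform([T, B, num_actions])     # Q-values at time t
a_t = tf.ones([T, B], dtype=tf.int32)            # actions taken at t

loss, extra = trfl.sarsa_lambda(q_tm1, a_tm1, r_t, pcont_t, q_t, a_t, lambda_=0.9)
mean_loss = tf.reduce_mean(loss)  # feed this to an optimizer over the Q-network variables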
if dqda_clipping is not None:
    if dqda_clipping <= 0:
        raise ValueError("dqda_clipping should be bigger than 0, {} found"
                         .format(dqda_clipping))
    if clip_norm:
        dqda = tf.clip_by_norm(dqda, dqda_clipping, axes=-1)
    else:
        dqda = tf.clip_by_value(dqda, -1. * dqda_clipping, dqda_clipping)
# target_a ensures the correct gradient is calculated during backprop.
target_a = dqda + a_max
# Stop the gradient from flowing through the Q network during backprop.
target_a = tf.stop_gradient(target_a)
# The gradient flows only through the actor network.
loss = 0.5 * tf.reduce_sum(tf.square(target_a - a_max), axis=-1)
return base_ops.LossOutput(
    loss, DPGExtra(q_max=q_max, a_max=a_max, dqda=dqda))
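The deterministic policy gradient trick above converts dQ/da into a simple regression target for the actor, so only the actor receives gradients. A hedged usage sketch, assuming the op is exposed as `trfl.dpg(q_max, a_max, ...)`; the actor and critic networks below are illustrative stand-ins:

import tensorflow as tf
import trfl

B, obs_dim, act_dim = 32, 8, 2
obs = tf.random_uniform([B, obs_dim])
actor = tf.layers.Dense(act_dim, activation=tf.nn.tanh, name="actor")
critic = tf.layers.Dense(1, name="critic")

a_max = actor(obs)                                  # deterministic policy action
q_max = critic(tf.concat([obs, a_max], axis=-1))    # critic value of that action, shape [B, 1]

loss, dpg_extra = trfl.dpg(q_max, a_max, dqda_clipping=1.0)
actor_loss = tf.reduce_mean(loss)  # minimize with respect to the actor variables only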
entropy = tf.add_n([
    tf.reduce_sum(
        -tf.nn.softmax(scalar_policy_logits)
        * tf.nn.log_softmax(scalar_policy_logits), axis=-1)
    for scalar_policy_logits in policy_logits], name="entropy")
# We want a value that we can minimize along with other losses, and where
# minimizing means driving the policy towards a uniform distribution over
# the actions. We thus scale it by negative one so that it can be simply
# added to other losses.
scale = tf.constant(-1.0, dtype=tf.float32)
if normalise:
    num_actions = [tf.to_float(tf.shape(scalar_policy_logits)[-1])
                   for scalar_policy_logits in policy_logits]
    scale /= tf.reduce_sum(tf.log(tf.stack(num_actions)))
loss = tf.multiply(scale, entropy, name="entropy_loss")
return base_ops.LossOutput(loss, DiscretePolicyEntropyExtra(entropy))
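Because the returned loss is the negated (optionally normalised) entropy, adding a small multiple of it to a policy-gradient objective drives the policy toward a more uniform, exploratory distribution. A hedged usage sketch, assuming the op is exposed as `trfl.discrete_policy_entropy_loss` and accepts a single `[B, num_actions]` logits Tensor (both assumptions here):

import tensorflow as tf
import trfl

B, num_actions = 32, 4
policy_logits = tf.random_uniform([B, num_actions])

entropy_loss, extra = trfl.discrete_policy_entropy_loss(policy_logits, normalise=True)
# extra.entropy holds the per-example entropy; entropy_loss is its scaled negation.
entropy_bonus = 0.01 * tf.reduce_mean(entropy_loss)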
# double Q-learning op.
with tf.name_scope(
        name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector]):
    # Build target and select head to update.
    best_action = tf.argmax(q_t_selector, 1, output_type=tf.int32)
    double_q_bootstrapped = indexing_ops.batched_index(q_t_value, best_action)
    target = tf.stop_gradient(r_t + pcont_t * double_q_bootstrapped)
    qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
    # Temporal difference error and loss.
    # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
    td_error = target - qa_tm1
    loss = 0.5 * tf.square(td_error)
    return base_ops.LossOutput(
        loss, DoubleQExtra(target, td_error, best_action))
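Double Q-learning decouples action selection from action evaluation: `q_t_selector` picks the argmax action while `q_t_value` (typically the target network) scores it, which reduces the overestimation bias of vanilla Q-learning. A hedged usage sketch, assuming the batched op is exposed as `trfl.double_qlearning` (batch shapes are assumptions):

import tensorflow as tf
import trfl

B, num_actions = 32, 4
q_tm1 = tf.random_uniform([B, num_actions])          # online Q-values at t-1
a_tm1 = tf.zeros([B], dtype=tf.int32)                # actions taken at t-1
r_t = tf.random_uniform([B])                         # rewards
pcont_t = tf.fill([B], 0.99)                         # discount times (not done)
q_t_value = tf.random_uniform([B, num_actions])      # target network: evaluates the action
q_t_selector = tf.random_uniform([B, num_actions])   # online network: selects the action

loss, extra = trfl.double_qlearning(q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector)
mean_loss = tf.reduce_mean(loss)  # extra.td_error and extra.best_action are also exposed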
"""
flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
with tf.name_scope(name, values=flat_policy_vars):
    # We want a value that we can minimize along with other losses, and where
    # minimizing means driving the policy towards a uniform distribution over
    # the actions. We thus scale it by negative one so that it can be simply
    # added to other losses.
    scale = tf.constant(-1.0, dtype=tf.float32)
    if scale_op:
        scale *= scale_op(policies)
    policies = nest.flatten(policies)
    entropy = tf.add_n(
        [policy.entropy() for policy in policies], name="entropy")
    loss = tf.multiply(scale, entropy, name="entropy_loss")
    return base_ops.LossOutput(loss, PolicyEntropyExtra(entropy))
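Unlike the logits-based variant above, this op works on any (nest of) distribution objects exposing `.entropy()`, with `scale_op` controlling the strength of the bonus. A hedged usage sketch, assuming the op is exposed as `trfl.policy_entropy_loss` and using a `tf.distributions.Categorical` policy purely for illustration:

import tensorflow as tf
import trfl

B, num_actions = 32, 4
policy_logits = tf.random_uniform([B, num_actions])
policy = tf.distributions.Categorical(logits=policy_logits)

# scale_op weights the entropy bonus; a constant scale is the simplest choice.
entropy_loss, extra = trfl.policy_entropy_loss(
    policy, scale_op=lambda policies: tf.constant(0.01))
entropy_bonus = tf.reduce_mean(entropy_loss)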