# SARSALambda op.
with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t, a_t]):

  # Select head to update and build target.
  qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
  qa_t = indexing_ops.batched_index(q_t, a_t)
  target = sequence_ops.multistep_forward_view(
      r_t, pcont_t, qa_t, lambda_, back_prop=False)
  target = tf.stop_gradient(target)

  # Temporal difference error and loss.
  # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
  td_error = target - qa_tm1
  loss = 0.5 * tf.square(td_error)
  return base_ops.LossOutput(loss, QExtra(target, td_error))
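
# A minimal standalone sketch (plain Python, names illustrative) of the
# lambda-return target that `sequence_ops.multistep_forward_view` builds
# above, assuming the standard forward-view recursion
#   G_t = r_t + pcont_t * ((1 - lambda) * qa_t + lambda * G_{t+1}),
# evaluated backwards in time for a single batch element.
def lambda_return_sketch(rewards, pconts, qa_next, lambda_):
  """Plain-Python reference for the mixed n-step (lambda) return."""
  targets = [0.0] * len(rewards)
  bootstrap = qa_next[-1]  # beyond the final step, fall back on Q alone
  for t in reversed(range(len(rewards))):
    mixed = (1.0 - lambda_) * qa_next[t] + lambda_ * bootstrap
    targets[t] = rewards[t] + pconts[t] * mixed
    bootstrap = targets[t]
  return targets

# Three steps, discount 0.9, lambda 0.8:
# lambda_return_sketch([1., 0., 2.], [0.9, 0.9, 0.9], [3., 4., 5.], 0.8)
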
# DPG op. `dqda` is the gradient of the critic output with respect to the
# action (dq/da), computed earlier in the op (not shown here).
if dqda_clipping is not None:
  if dqda_clipping <= 0:
    raise ValueError("dqda_clipping should be bigger than 0, {} found"
                     .format(dqda_clipping))
  if clip_norm:
    dqda = tf.clip_by_norm(dqda, dqda_clipping, axes=-1)
  else:
    dqda = tf.clip_by_value(dqda, -1. * dqda_clipping, dqda_clipping)

# target_a ensures the correct gradient is calculated during backprop.
target_a = dqda + a_max
# Stop gradients from flowing through the Q network during backprop.
target_a = tf.stop_gradient(target_a)
# Gradients only flow through the actor network.
loss = 0.5 * tf.reduce_sum(tf.square(target_a - a_max), axis=-1)
return base_ops.LossOutput(
    loss, DPGExtra(q_max=q_max, a_max=a_max, dqda=dqda))
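
# A self-contained sketch (toy critic, TF 1.x assumed as in the code above) of
# why this surrogate loss works: the gradient of
# 0.5 * ||stop_gradient(dqda + a) - a||^2 with respect to the action is
# exactly -dqda, so minimising the loss performs gradient ascent on Q through
# the actor. The critic and all names below are illustrative.
import tensorflow as tf

toy_a = tf.Variable([0.5, -1.0])                 # "actor" output
toy_q = -tf.reduce_sum(tf.square(toy_a - 2.0))   # toy critic, maximal at a = 2
toy_dqda = tf.gradients([toy_q], [toy_a])[0]
toy_target_a = tf.stop_gradient(toy_dqda + toy_a)
toy_loss = 0.5 * tf.reduce_sum(tf.square(toy_target_a - toy_a))
toy_grad = tf.gradients([toy_loss], [toy_a])[0]

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run([toy_dqda, toy_grad]))          # toy_grad == -toy_dqda
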
# Discrete policy entropy op. `policy_logits` is a list of per-head logits
# tensors; the per-head entropies are summed.
entropy = tf.add_n(
    [tf.reduce_sum(
        -tf.nn.softmax(scalar_policy_logits)
        * tf.nn.log_softmax(scalar_policy_logits), axis=-1)
     for scalar_policy_logits in policy_logits], name="entropy")

# We want a value that we can minimize along with other losses, and where
# minimizing means driving the policy towards a uniform distribution over
# the actions. We thus scale it by negative one so that it can be simply
# added to other losses.
scale = tf.constant(-1.0, dtype=tf.float32)
if normalise:
  num_actions = [tf.to_float(tf.shape(scalar_policy_logits)[-1])
                 for scalar_policy_logits in policy_logits]
  scale /= tf.reduce_sum(tf.log(tf.stack(num_actions)))

loss = tf.multiply(scale, entropy, name="entropy_loss")
return base_ops.LossOutput(loss, DiscretePolicyEntropyExtra(entropy))
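
# A small sketch of what the per-head entropy term above evaluates to, and why
# the `normalise` branch divides by log(num_actions): uniform logits give the
# maximum entropy log(N), so the normalised entropy lies in [0, 1]. The logits
# below are illustrative only.
import tensorflow as tf

uniform_logits = tf.zeros([1, 4])                  # 4 equally likely actions
peaked_logits = tf.constant([[10., 0., 0., 0.]])   # nearly deterministic

def head_entropy(logits):
  return tf.reduce_sum(
      -tf.nn.softmax(logits) * tf.nn.log_softmax(logits), axis=-1)

with tf.Session() as sess:
  print(sess.run(head_entropy(uniform_logits)))    # ~log(4) = 1.386
  print(sess.run(head_entropy(peaked_logits)))     # close to 0
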
# double Q-learning op.
with tf.name_scope(
    name, values=[q_tm1, a_tm1, r_t, pcont_t, q_t_value, q_t_selector]):

  # Build target and select head to update.
  best_action = tf.argmax(q_t_selector, 1, output_type=tf.int32)
  double_q_bootstrapped = indexing_ops.batched_index(q_t_value, best_action)
  target = tf.stop_gradient(r_t + pcont_t * double_q_bootstrapped)
  qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

  # Temporal difference error and loss.
  # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
  td_error = target - qa_tm1
  loss = 0.5 * tf.square(td_error)
  return base_ops.LossOutput(
      loss, DoubleQExtra(target, td_error, best_action))
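
# A concrete sketch of the double Q-learning target for one transition: the
# online ("selector") network picks the argmax action, but the target
# ("value") network supplies the bootstrap value for that action. All numbers
# below are illustrative.
import tensorflow as tf

sel_q = tf.constant([[1.0, 3.0, 2.0]])     # online net prefers action 1
val_q = tf.constant([[0.5, 1.5, 4.0]])     # target net's estimates
rew = tf.constant([1.0])
pcont = tf.constant([0.9])

best = tf.argmax(sel_q, 1)                              # -> [1]
bootstrap = tf.reduce_sum(
    val_q * tf.one_hot(best, 3), axis=1)                # -> [1.5], not 4.0
dq_target = rew + pcont * bootstrap                     # -> [2.35]

with tf.Session() as sess:
  print(sess.run([best, bootstrap, dq_target]))
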
# Rank and compatibility checks.
base_ops.wrap_rank_shape_assert(
    [[q_tm1], [a_tm1, r_t, pcont_t, v_t]], [2, 1], name)

# QV op.
with tf.name_scope(name, values=[q_tm1, a_tm1, r_t, pcont_t, v_t]):

  # Build target and select head to update.
  with tf.name_scope("target"):
    target = tf.stop_gradient(r_t + pcont_t * v_t)
  qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)

  # Temporal difference error and loss.
  # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
  td_error = target - qa_tm1
  loss = 0.5 * tf.square(td_error)
  return base_ops.LossOutput(loss, QExtra(target, td_error))
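
# A tiny check of the comment repeated in these ops: with
# loss = 0.5 * td_error^2 and td_error = stop_gradient(target) - qa_tm1, the
# gradient of the loss with respect to qa_tm1 is -td_error, i.e. its magnitude
# is exactly the TD error. The values below are illustrative.
import tensorflow as tf

qa_ex = tf.Variable([2.0])
boot_ex = tf.constant([3.5])                   # pretend bootstrapped target
td_ex = tf.stop_gradient(boot_ex) - qa_ex      # TD error = 1.5
loss_ex = 0.5 * tf.square(td_ex)
grad_ex = tf.gradients([loss_ex], [qa_ex])[0]

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run([td_ex, grad_ex]))            # [1.5], [-1.5]
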
# SARSE (expected SARSA) op. The op can first assert that `probs_a_t` is a
# valid distribution over actions; `deps` and `almost_prob` below are a
# minimal reconstruction of that optional check.
deps = []
almost_prob = tf.less(tf.abs(tf.reduce_sum(probs_a_t, axis=1) - 1.0), 1e-6)
deps.append(tf.Assert(
    tf.reduce_all(almost_prob),
    ["probs_a_t tensor does not sum to 1", probs_a_t]))

# With dependency on possible debug ops.
with tf.control_dependencies(deps):

  # Select head to update and build target.
  qa_tm1 = indexing_ops.batched_index(q_tm1, a_tm1)
  target = tf.stop_gradient(
      r_t + pcont_t * tf.reduce_sum(tf.multiply(q_t, probs_a_t), axis=1))

  # Temporal difference error and loss.
  # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
  td_error = target - qa_tm1
  loss = 0.5 * tf.square(td_error)
  return base_ops.LossOutput(loss, QExtra(target, td_error))
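
# A concrete sketch of the expected-SARSA bootstrap used above: the next-state
# value is the probability-weighted average of Q(s', .) under the policy,
# rather than the value of a single sampled action. Numbers are illustrative.
import tensorflow as tf

q_next = tf.constant([[1.0, 2.0, 4.0]])
probs_next = tf.constant([[0.2, 0.5, 0.3]])    # policy at the next state
rew_s = tf.constant([0.5])
pcont_s = tf.constant([0.99])

expected_q = tf.reduce_sum(q_next * probs_next, axis=1)   # 0.2 + 1.0 + 1.2 = 2.4
sarse_target = rew_s + pcont_s * expected_q               # 0.5 + 0.99 * 2.4 = 2.876

with tf.Session() as sess:
  print(sess.run([expected_q, sarse_target]))
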
  * `td_error`: batch of temporal difference errors, shape `[B]`.
"""
# Rank and compatibility checks.
base_ops.wrap_rank_shape_assert([[v_tm1, r_t, pcont_t], [q_t]], [1, 2], name)

# The QVMAX op.
with tf.name_scope(name, values=[v_tm1, r_t, pcont_t, q_t]):

  # Build target.
  target = tf.stop_gradient(r_t + pcont_t * tf.reduce_max(q_t, axis=1))

  # Temporal difference error and loss.
  # Loss is MSE scaled by 0.5, so the gradient is equal to the TD error.
  td_error = target - v_tm1
  loss = 0.5 * tf.square(td_error)
  return base_ops.LossOutput(loss, TDExtra(target, td_error))
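
# A concrete sketch of the QVMAX target: unlike the QV op above, which
# bootstraps the Q update from the learned state value v_t, here the
# state-value estimate v_tm1 is regressed towards r_t + pcont_t * max_a Q(s', a).
# Numbers are illustrative only.
import tensorflow as tf

v_prev = tf.constant([1.0])
q_next2 = tf.constant([[0.5, 2.0, 1.5]])
rew2 = tf.constant([1.0])
pcont2 = tf.constant([0.9])

qvmax_target = rew2 + pcont2 * tf.reduce_max(q_next2, axis=1)  # 1 + 0.9*2 = 2.8
qvmax_td = qvmax_target - v_prev                               # 1.8

with tf.Session() as sess:
  print(sess.run([qvmax_target, qvmax_td]))
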