# Check shapes, then split the stacked baselines into one tensor per value head.
base_ops.assert_rank_and_shape_compatibility(
    [rewards, multi_baseline_values], 3)
multi_baseline_values = array_ops.unstack(multi_baseline_values, axis=-1)
num_values = len(multi_baseline_values)

# Broadcast the per-head hyperparameters against the rewards shape.
base_shape = rewards.shape
decay = self._least_fit(decay, base_shape)
lambda_ = self._least_fit(lambda_, base_shape)
baseline_scale = self._least_fit(baseline_scale, base_shape)

# One TD(lambda) baseline loss and one advantage estimate per value head.
for i in range(num_values):
    pcontinues = decay[..., i] * weights
    lambdas = lambda_[..., i] * weights
    bootstrap_values = indexing_ops.batched_index(
        multi_baseline_values[i],
        math_ops.cast(sequence_lengths - 1, dtypes.int32))
    baseline_loss, td_lambda = value_ops.td_lambda(
        parray_ops.swap_time_major(multi_baseline_values[i]),
        parray_ops.swap_time_major(rewards[..., i]),
        parray_ops.swap_time_major(pcontinues),
        bootstrap_values,
        parray_ops.swap_time_major(lambdas))
    value_loss = pmath_ops.safe_divide(
        baseline_scale[i] * math_ops.reduce_sum(baseline_loss), total_num)
    self.value_loss.append(
        gen_array_ops.check_numerics(value_loss, 'value_loss'))
    advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
    multi_advantages.append(advantages)

# Combine the per-head advantages and treat the result as a constant
# for the policy update.
advantages = math_ops.add_n(multi_advantages)  # A = A[0] + A[1] + ...
if normalize_advantages:
    advantages = normalization_ops.normalize_by_moments(advantages, weights)
advantages = gen_array_ops.stop_gradient(advantages)
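The loop above delegates the heavy lifting to value_ops.td_lambda. As a rough guide to what such a helper produces, here is a minimal NumPy sketch of lambda-returns, their temporal differences (the advantage estimates used above), and a squared-error baseline loss; shapes are time-major [T, B], and the exact argument order and reduction used by the library may differ.

import numpy as np

def td_lambda_sketch(state_values, rewards, pcontinues, bootstrap_value, lambdas):
    """Sketch of TD(lambda): lambda-returns, TD errors, per-sequence loss."""
    T, _ = rewards.shape
    returns = np.zeros_like(rewards)
    next_return = bootstrap_value            # G_T approximated by V(s_T)
    next_value = bootstrap_value             # V(s_T)
    for t in reversed(range(T)):
        # G_t = r_t + gamma_t * ((1 - lambda_t) * V(s_{t+1}) + lambda_t * G_{t+1})
        returns[t] = rewards[t] + pcontinues[t] * (
            (1.0 - lambdas[t]) * next_value + lambdas[t] * next_return)
        next_return = returns[t]
        next_value = state_values[t]
    temporal_differences = returns - state_values            # advantages, [T, B]
    loss = 0.5 * np.sum(np.square(temporal_differences), axis=0)  # per sequence, [B]
    return loss, temporal_differences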
* `extra`: a namedtuple with fields:
    * `entropy`: total policy entropy per sequence, shape `[B]`.
    * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
    * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
    * `policy_gradient_loss`: policy gradient loss per sequence,
        shape `[B]`.
    * `advantages`: advantage estimates per timestep, shape `[T, B]`.
    * `discounted_returns`: discounted returns per timestep,
        shape `[T, B]`.
"""
flat_policy_vars = nest.flatten(policy_vars) if policy_vars else list()
scoped_values = (flat_policy_vars + nest.flatten(actions) +
                 [baseline_values, rewards, pcontinues, bootstrap_value])
with tf.name_scope(name, values=scoped_values):
    # Loss for the baseline, summed over the time dimension.
    baseline_loss_td, td_lambda = value_ops.td_lambda(
        baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

    # The TD error provides an estimate of the advantages of the actions.
    advantages = td_lambda.temporal_differences
    baseline_loss = tf.multiply(
        tf.convert_to_tensor(baseline_cost, dtype=tf.float32),
        baseline_loss_td,
        name="baseline_loss")

    # Loss for the policy. Doesn't push additional gradients through
    # the advantages.
    pg_loss = policy_gradient_loss(
        policies, actions, advantages, policy_vars,
        name="policy_gradient_loss")

    total_loss = tf.add(pg_loss, baseline_loss, name="total_loss")
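Since the returned `loss` is reported per sequence (shape `[B]`), a training step typically reduces it over the batch before handing it to an optimizer. A minimal TF1-style sketch, where `a2c_loss_fn` and the input tensors are hypothetical placeholders for whichever loss function returns the `(loss, extra)` structure documented above:

import tensorflow as tf

# `a2c_loss_fn` and its inputs are placeholders, standing in for a loss
# function with the (loss, extra) return structure described above.
loss, extra = a2c_loss_fn(policy_logits, baseline_values, actions,
                          rewards, pcontinues, bootstrap_value)
train_loss = tf.reduce_mean(loss)                       # reduce [B] -> scalar
train_op = tf.train.AdamOptimizer(1e-4).minimize(train_loss)
# extra.advantages and extra.baseline_loss are available for logging.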
# Bootstrap from the target critic evaluated at the final next state.
target_policy = self.target_policy(next_states)
bootstrap_value = gen_array_ops.reshape(
    self.target_value(next_states[:, -1:], target_policy[:, -1:]),
    [-1])

# Actor term: maximize the critic's value of the policy's actions by
# minimizing its negative weighted mean.
action_values = array_ops.squeeze(
    self.value(states, policy, training=True),
    axis=-1) * mask
self.policy_gradient_loss = losses_impl.compute_weighted_loss(
    -action_values, weights=weights)

# Critic term: TD(lambda) loss on the action values, averaged per sequence.
lambda_ = lambda_ * weights
pcontinues = decay * weights
baseline_loss = value_ops.td_lambda(
    parray_ops.swap_time_major(action_values),
    parray_ops.swap_time_major(rewards),
    parray_ops.swap_time_major(pcontinues),
    gen_array_ops.stop_gradient(bootstrap_value),
    parray_ops.swap_time_major(lambda_)).loss
self.value_loss = math_ops.reduce_mean(
    baseline_loss * baseline_scale * pmath_ops.safe_divide(1., sequence_length),
    axis=0)

self.total_loss = math_ops.add_n([
    self.value_loss,
    self.policy_gradient_loss])
return self.total_loss
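Because the actor and critic terms are kept as separate attributes before being summed, they can also be minimized by separate optimizers. A minimal TF1-style sketch, where `agent`, `actor_vars`, and `critic_vars` are hypothetical placeholders for however the agent exposes its losses and variable sets:

# Hypothetical split update; all names below are placeholders.
actor_op = tf.train.AdamOptimizer(1e-4).minimize(
    agent.policy_gradient_loss, var_list=actor_vars)
critic_op = tf.train.AdamOptimizer(1e-3).minimize(
    agent.value_loss, var_list=critic_vars)
train_op = tf.group(actor_op, critic_op)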
del kwargs
sequence_length = math_ops.reduce_sum(weights, axis=1)
total_num = math_ops.reduce_sum(sequence_length)

policy = self.policy(states, training=True)
behavioral_policy = self.behavioral_policy(states)
baseline_values = array_ops.squeeze(
    self.value(states, training=True),
    axis=-1) * weights

# TD(lambda) baseline loss and advantage estimates, with the advantages
# normalized and treated as constants for the policy update.
pcontinues = decay * weights
lambda_ = lambda_ * weights
bootstrap_values = indexing_ops.batched_index(
    baseline_values, math_ops.cast(sequence_length - 1, dtypes.int32))
baseline_loss, td_lambda = value_ops.td_lambda(
    parray_ops.swap_time_major(baseline_values),
    parray_ops.swap_time_major(rewards),
    parray_ops.swap_time_major(pcontinues),
    bootstrap_values,
    parray_ops.swap_time_major(lambda_))
advantages = parray_ops.swap_time_major(td_lambda.temporal_differences)
advantages = normalization_ops.normalize_by_moments(advantages, weights)
advantages = gen_array_ops.stop_gradient(advantages)

# Importance ratio between the current and behavioral policies, clipped
# for the PPO-style surrogate objective.
ratio = gen_math_ops.exp(
    policy.log_prob(actions) - gen_array_ops.stop_gradient(
        behavioral_policy.log_prob(actions)))
clipped_ratio = clip_ops.clip_by_value(
    ratio, 1. - ratio_epsilon, 1. + ratio_epsilon)
self.policy_gradient_loss = -losses_impl.compute_weighted_loss(
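The snippet above is cut off inside the weighted-loss call. For orientation, a clipped ratio like this usually enters a PPO-style surrogate that takes the elementwise minimum of the unclipped and clipped terms; a minimal sketch using the same TF modules, where the completion is an assumption rather than the original code:

# Assumed completion (not the original code): standard clipped surrogate,
# min(ratio * A, clipped_ratio * A), negated so that minimizing the loss
# ascends the surrogate objective.
surrogate = math_ops.minimum(ratio * advantages, clipped_ratio * advantages)
policy_gradient_loss = -losses_impl.compute_weighted_loss(
    surrogate, weights=weights)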
* `loss`: a tensor containing the total loss, shape `[B]`.
* `extra`: a namedtuple with fields:
    * `entropy`: total policy entropy per sequence, shape `[B]`.
    * `entropy_loss`: scaled entropy loss per sequence, shape `[B]`.
    * `baseline_loss`: scaled baseline loss per sequence, shape `[B]`.
    * `policy_gradient_loss`: policy gradient loss per sequence,
        shape `[B]`.
    * `advantages`: advantage estimates per timestep, shape `[T, B]`.
    * `discounted_returns`: discounted returns per timestep,
        shape `[T, B]`.
"""
scoped_values = (nest.flatten(policy_logits) + nest.flatten(actions) +
                 [baseline_values, rewards, pcontinues, bootstrap_value])
with tf.name_scope(name, values=scoped_values):
    # Loss for the baseline, summed over the time dimension.
    baseline_loss_td, td_lambda = value_ops.td_lambda(
        baseline_values, rewards, pcontinues, bootstrap_value, lambda_)

    # The TD error provides an estimate of the advantages of the actions.
    advantages = td_lambda.temporal_differences
    baseline_loss = tf.multiply(
        tf.convert_to_tensor(baseline_cost, dtype=tf.float32),
        baseline_loss_td,
        name="baseline_loss")

    # Loss for the policy. Doesn't push additional gradients through
    # the advantages.
    policy_gradient_loss = discrete_policy_gradient_loss(
        policy_logits, actions, advantages, name="policy_gradient_loss")

    total_loss = tf.add(policy_gradient_loss, baseline_loss, name="total_loss")
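For reference, a discrete policy-gradient term like the one combined above can be written as the softmax cross-entropy of the taken actions weighted by the (constant) advantages. A minimal sketch assuming integer actions of shape `[T, B]` and logits of shape `[T, B, num_actions]`; this is illustrative, not the library's implementation:

# -sum_t log pi(a_t | s_t) * A_t, expressed via cross-entropy on the taken
# actions; stop_gradient keeps the baseline out of this term.
xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=actions, logits=policy_logits)                   # [T, B]
pg_loss_sketch = tf.reduce_sum(
    xent * tf.stop_gradient(advantages), axis=0)            # per sequence, [B]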