if (
    env_state.agent.problem_type == MODE_SIMPLIFY_POLYNOMIAL
    and not has_like_terms(root)
):
    term_nodes = get_terms(root)
    is_win = True
    for term in term_nodes:
        if not is_preferred_term_form(term):
            is_win = False
    if is_win:
        return time_step.termination(features, self.get_win_signal(env_state))
# Check the turn count last: if the move that pushed the turn count over the
# limit also produced a win condition, the win should be honored.
if env_state.agent.moves_remaining <= 0:
    return time_step.termination(features, GameRewards.LOSE)
if len(env_state.agent.history) > 0:
    last_timestep = env_state.agent.history[-1]
    rule = self.get_rule_from_timestep(last_timestep)
    # The rewarding_actions list can be user specified
    for rewarding_class in self.rewarding_actions:
        if isinstance(rule, rewarding_class):
            return time_step.transition(
                features,
                reward=GameRewards.HELPFUL_MOVE,
                discount=self.discount,
            )
# The agent is penalized for returning to a previous state.
for key, group in groupby(
    sorted([f"{h.raw}" for h in env_state.agent.history])
):
    list_group = list(group)
    list_count = len(list_group)
    if list_count <= 1:
        continue
    return time_step.transition(
        features, reward=GameRewards.PREVIOUS_LOCATION, discount=self.discount
    )
# We're in a new state, and the agent is a little older.
return time_step.transition(
    features, reward=GameRewards.TIMESTEP, discount=self.discount
)
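# The checks above only rely on GameRewards.WIN and HELPFUL_MOVE being rewards
# and TIMESTEP, PREVIOUS_LOCATION, and LOSE being penalties of increasing
# severity. The real constants live in the library's GameRewards; the class
# below is a made-up, illustrative stand-in, not the actual values.
class IllustrativeRewards:
    WIN = 1.0
    HELPFUL_MOVE = 0.01
    TIMESTEP = -0.01
    PREVIOUS_LOCATION = -0.25
    LOSE = -1.0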
def get_win_signal(self, env_state: MathEnvironmentState) -> float:
    """Calculate the reward value for completing the episode. This is done
    so that the reward signal can be scaled based on the time it took to
    complete the episode."""
    total_moves = env_state.max_moves
    # Guard against divide-by-zero by clamping to a small positive value.
    current_move = round(
        max(3e-10, total_moves - env_state.agent.moves_remaining), 3
    )
    bonus = (total_moves / current_move) / total_moves
    # Double the bonus if the episode finished in the first half of the moves.
    if current_move < total_moves / 2:
        bonus *= 2
    return GameRewards.WIN + bonus
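# Worked example of the win bonus above, with made-up numbers: if max_moves is
# 20 and the agent wins after 5 moves (15 remaining), current_move = 5 and
# bonus = (20 / 5) / 20 = 0.2; because 5 < 20 / 2 the bonus is doubled to 0.4,
# so faster solutions add a larger bonus on top of GameRewards.WIN.
total_moves = 20
current_move = 5
bonus = (total_moves / current_move) / total_moves  # 0.2
if current_move < total_moves / 2:
    bonus *= 2  # 0.4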
def add_frame(self, frame: ExperienceFrame):
    if frame.terminal and len(self._frames) > 0 and self._frames[-1].terminal:
        # Discard consecutive terminal frames
        return
    frame_index = self._top_frame_index + len(self._frames)
    was_full = self.is_full()
    # Append the frame
    self._frames.append(frame)
    # Only index frames once there are enough of them, because reward
    # prediction replays sequences of 4 frames at a time.
    if frame_index >= 3:
        # UNREAL uses 0 or non-zero, but we have a penalty timestep, so
        # consider anything at or below that penalty to be zero.
        if frame.reward <= GameRewards.TIMESTEP:
            self._zero_reward_indices.append(frame_index)
        else:
            self._non_zero_reward_indices.append(frame_index)
    if was_full:
        self._top_frame_index += 1
        cut_frame_index = self._top_frame_index + 3
        # Drop indices whose 4-frame history now falls outside the buffer.
        if (
            len(self._zero_reward_indices) > 0
            and self._zero_reward_indices[0] < cut_frame_index
        ):
            self._zero_reward_indices.popleft()
        if (
            len(self._non_zero_reward_indices) > 0
            and self._non_zero_reward_indices[0] < cut_frame_index
        ):
            self._non_zero_reward_indices.popleft()
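# The zero/non-zero split above exists so reward-prediction sampling can draw
# the two classes with roughly equal probability (the skewed sampling idea from
# UNREAL) instead of being dominated by the mostly zero-reward frames. A
# minimal sketch of that idea; sample_balanced is a hypothetical helper, and
# the real logic lives in Experience.sample_rp_sequence():
import random
from typing import List, Optional

def sample_balanced(zero_idx: List[int], non_zero_idx: List[int]) -> Optional[int]:
    prefer_zero = random.random() < 0.5
    pool = zero_idx if prefer_zero else non_zero_idx
    # Fall back to the other deque if the preferred one is empty.
    if not pool:
        pool = non_zero_idx if prefer_zero else zero_idx
    return random.choice(pool) if pool else None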
def rp_samples(self, max_samples=2) -> Tuple[MathyBatchObservation, List[float]]:
    output: MathyBatchObservation = MathyBatchObservation([], [], [], [[], []])
    rewards: List[float] = []
    if not self.experience.is_full():
        return output, rewards
    windows: List[MathyWindowObservation] = []
    for i in range(max_samples):
        # Sample a sequence of 4 frames: the first 3 are the observation
        # window, and the reward on the final frame is the prediction target.
        frames = self.experience.sample_rp_sequence()
        states = [frame.state for frame in frames[:-1]]
        target_reward = frames[-1].reward
        if math.isclose(target_reward, GameRewards.TIMESTEP):
            sample_label = 0  # zero
        elif target_reward > 0:
            sample_label = 1  # positive
        else:
            sample_label = 2  # negative
        windows.append(observations_to_window(states))
        # The returned "rewards" are the class labels for the prediction task.
        rewards.append(sample_label)
    return windows_to_batch(windows), rewards
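# Hedged usage sketch: the labels returned by rp_samples (0 = zero reward,
# 1 = positive, 2 = negative) could serve as targets for a 3-class
# reward-prediction head. numpy is an assumption here; the real training code
# may build its targets differently.
import numpy as np

labels = np.array([0, 2, 1])          # e.g. labels collected from rp_samples()
one_hot_targets = np.eye(3)[labels]   # shape (3, 3), one row per sampled window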