# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def step(self, state):
    ''' Predict the action for generating training data.

    Args:
        state (dict): the current state, with keys 'obs' (numpy.array)
            and 'legal_actions' (list of legal action ids)

    Returns:
        action (int): an action id sampled from the predicted distribution
    '''
    A = self.predict(state['obs'])
    A = remove_illegal(A, state['legal_actions'])
    # Sample stochastically (rather than argmax) to retain exploration
    # while generating training data.
    action = np.random.choice(np.arange(len(A)), p=A)
    return action
def eval_step(self, state):
    ''' Greedily choose an action for evaluation purposes.

    Args:
        state (dict): current state with keys 'obs' and 'legal_actions'

    Returns:
        best_action (int): the legal action with the highest Q-value
    '''
    normalized_obs = np.expand_dims(self.normalizer.normalize(state['obs']), 0)
    q_values = self.q_estimator.predict(self.sess, normalized_obs)[0]
    # exp keeps every score positive, so zeroed-out (illegal) entries
    # can never win the argmax below.
    masked_scores = remove_illegal(np.exp(q_values), state['legal_actions'])
    return np.argmax(masked_scores)
def step(self, state):
    ''' Predict the action given the current state for training.

    In best-response mode the underlying RL agent produces the policy and
    the (obs, probs) pair is recorded for supervised-learning replay; in
    average-policy mode the learned average policy is used directly.

    Args:
        state (dict): The current state with keys 'obs' and 'legal_actions'

    Returns:
        action (int): An action id
    '''
    obs = state['obs']
    legal_actions = state['legal_actions']
    if self._mode == MODE.best_response:
        probs = self._rl_agent.predict(obs)
        self._add_transition(obs, probs)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
    # NOTE(review): if self._mode is neither value, `probs` is unbound and
    # the next line raises NameError — confirm MODE has only these members.
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)
    return action
def eval_step(self, state):
    ''' Predict the action given state for evaluation.

    Args:
        state (dict): current state with keys 'obs' and 'legal_actions'

    Returns:
        action (int): an action id sampled from the renormalized policy
    '''
    observation = state['obs']
    legal = state['legal_actions']
    probs = self.action_probabilities(observation)
    probs = remove_illegal(probs, legal)
    # Renormalize so the masked probabilities sum to exactly 1 for sampling.
    probs /= probs.sum()
    return np.random.choice(np.arange(len(probs)), p=probs)
def eval_step(self, state):
    ''' Use the configured policy for evaluation purposes.

    Args:
        state (dict): The current state.

    Returns:
        action (int): An action id.

    Raises:
        ValueError: if `self.evaluate_with` is not a recognized mode.
    '''
    mode = self.evaluate_with
    if mode == 'best_response':
        # Delegate evaluation entirely to the inner RL agent.
        return self._rl_agent.eval_step(state)
    if mode == 'average_policy':
        probs = self._act(state['obs'])
        probs = remove_illegal(probs, state['legal_actions'])
        return np.random.choice(len(probs), p=probs)
    raise ValueError("'evaluate_with' should be either 'average_policy' or 'best_response'.")
def step(self, state):
    ''' Predict the action given the current state for training.

    In best-response mode the underlying RL agent produces the policy and
    the (obs, probs) pair is recorded for supervised-learning replay; in
    average-policy mode the learned average policy is used directly.

    Args:
        state (dict): The current state with keys 'obs' and 'legal_actions'

    Returns:
        action (int): An action id
    '''
    obs = state['obs']
    legal_actions = state['legal_actions']
    if self._mode == MODE.best_response:
        probs = self._rl_agent.predict(obs)
        self._add_transition(obs, probs)
    elif self._mode == MODE.average_policy:
        probs = self._act(obs)
    # NOTE(review): if self._mode is neither value, `probs` is unbound and
    # the next line raises NameError — confirm MODE has only these members.
    probs = remove_illegal(probs, legal_actions)
    action = np.random.choice(len(probs), p=probs)
    return action
def eval_step(self, state):
    ''' Pick the greedy action for evaluation purposes.

    Args:
        state (dict): current state with keys 'obs' and 'legal_actions'

    Returns:
        best_action (int): the legal action with the highest Q-value
    '''
    obs_batch = np.expand_dims(self.normalizer.normalize(state['obs']), 0)
    q_values = self.q_estimator.predict_nograd(obs_batch)[0]
    # exp makes all scores positive so masked illegal actions never win argmax
    scores = remove_illegal(np.exp(q_values), state['legal_actions'])
    return np.argmax(scores)
def step(self, state):
    ''' Predict the action for generating training data, with the
    predictions disconnected from the computation graph.

    Args:
        state (dict): current state with keys 'obs' and 'legal_actions'

    Returns:
        action (int): an action id sampled from the predicted distribution
    '''
    A = self.predict(state['obs'])
    A = remove_illegal(A, state['legal_actions'])
    # Sample stochastically (rather than argmax) to retain exploration
    # while generating training data.
    action = np.random.choice(np.arange(len(A)), p=A)
    return action