def get_value_func_dict(self, pol: Policy) -> VFType:
    # TD(lambda) prediction of the state-value function for the given policy,
    # using accumulating eligibility traces.
    sa_dict = self.state_action_dict
    s_uniform_dict = {s: 1. / len(sa_dict) for s in sa_dict.keys()}
    start_gen_f = get_rv_gen_func(s_uniform_dict)
    vf_dict = {s: 0. for s in sa_dict.keys()}
    act_gen_dict = {s: get_rv_gen_func(pol.get_state_probabilities(s))
                    for s in self.state_action_dict.keys()}
    episodes = 0

    while episodes < self.num_episodes:
        et_dict = {s: 0. for s in sa_dict.keys()}
        state = start_gen_f(1)[0]
        steps = 0
        terminate = False

        while not terminate:
            action = act_gen_dict[state](1)[0]
            next_state, reward = self.state_reward_gen_dict[state][action]()
            delta = reward + self.gamma * vf_dict[next_state] - vf_dict[state]
            et_dict[state] += 1
            for s in self.state_action_dict.keys():
                vf_dict[s] += self.alpha * delta * et_dict[s]
                et_dict[s] *= self.gamma * self.lambd
            state = next_state
            steps += 1
            terminate = steps >= self.max_steps or\
                state in self.terminal_states

        episodes += 1

    return vf_dict
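
# Illustrative sketch, independent of the class above: the same TD(lambda)
# prediction update on a hypothetical two-state chain, with gamma, lambda
# and alpha values chosen here purely for demonstration.
import random

def td_lambda_prediction_sketch(num_episodes: int = 1000) -> dict:
    gamma, lambd, alpha = 0.9, 0.8, 0.1
    # States 'A' and 'B'; 'T' is terminal. Transitions and rewards are toy values.
    transitions = {'A': [('B', 1.0)], 'B': [('T', 2.0)]}
    vf = {'A': 0.0, 'B': 0.0, 'T': 0.0}
    for _ in range(num_episodes):
        et = {s: 0.0 for s in vf}          # accumulating eligibility traces
        state = random.choice(['A', 'B'])
        while state != 'T':
            next_state, reward = random.choice(transitions[state])
            delta = reward + gamma * vf[next_state] - vf[state]
            et[state] += 1.0
            for s in vf:                   # every state shares the TD error, scaled by its trace
                vf[s] += alpha * delta * et[s]
                et[s] *= gamma * lambd     # decay traces after each step
            state = next_state
    return vf
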
def get_act_value_func_dict(self, pol: Policy) -> QVFType:
    # SARSA (TD(0)) prediction of the action-value function for the given policy.
    sa_dict = self.state_action_dict
    sa_uniform_dict = {(s, a): 1. / sum(len(v) for v in sa_dict.values())
                       for s, v1 in sa_dict.items() for a in v1}
    start_gen_f = get_rv_gen_func(sa_uniform_dict)
    qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
    act_gen_dict = {s: get_rv_gen_func(pol.get_state_probabilities(s))
                    for s in self.state_action_dict.keys()}
    episodes = 0

    while episodes < self.num_episodes:
        state, action = start_gen_f(1)[0]
        steps = 0
        terminate = False

        while not terminate:
            next_state, reward = self.state_reward_gen_dict[state][action]()
            next_action = act_gen_dict[next_state](1)[0]
            qf_dict[state][action] += self.alpha *\
                (reward + self.gamma * qf_dict[next_state][next_action] -
                 qf_dict[state][action])
            state = next_state
            action = next_action
            steps += 1
            terminate = steps >= self.max_steps or\
                state in self.terminal_states

        episodes += 1

    return qf_dict
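
# Illustrative sketch, independent of the class above: the SARSA(0) update
# applied to a single (s, a, r, s', a') transition on a plain dict-of-dicts
# Q-table. All names and default values here are hypothetical.
def sarsa_update(q, s, a, reward, s_next, a_next, alpha=0.1, gamma=0.9):
    # Q(s, a) <- Q(s, a) + alpha * (r + gamma * Q(s', a') - Q(s, a))
    q[s][a] += alpha * (reward + gamma * q[s_next][a_next] - q[s][a])

# Example: q = {'s0': {'a': 0.0}, 's1': {'a': 0.0}}; sarsa_update(q, 's0', 'a', 1.0, 's1', 'a')
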
def get_act_value_func_dict(self, pol: Policy) -> QVFType:
    # SARSA(lambda) prediction of the action-value function for the given
    # policy, using accumulating eligibility traces.
    sa_dict = self.state_action_dict
    sa_uniform_dict = {(s, a): 1. / sum(len(v) for v in sa_dict.values())
                       for s, v1 in sa_dict.items() for a in v1}
    start_gen_f = get_rv_gen_func(sa_uniform_dict)
    qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
    act_gen_dict = {s: get_rv_gen_func(pol.get_state_probabilities(s))
                    for s in self.state_action_dict.keys()}
    episodes = 0

    while episodes < self.num_episodes:
        et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        state, action = start_gen_f(1)[0]
        steps = 0
        terminate = False

        while not terminate:
            next_state, reward = self.state_reward_gen_dict[state][action]()
            next_action = act_gen_dict[next_state](1)[0]
            delta = reward + self.gamma * qf_dict[next_state][next_action] -\
                qf_dict[state][action]
            et_dict[state][action] += 1
            for s, a_set in self.state_action_dict.items():
                for a in a_set:
                    qf_dict[s][a] += self.alpha * delta * et_dict[s][a]
                    et_dict[s][a] *= self.gamma * self.lambd
            state = next_state
            action = next_action
            steps += 1
            terminate = steps >= self.max_steps or\
                state in self.terminal_states

        episodes += 1

    return qf_dict
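
# Illustrative sketch, independent of the class above: one SARSA(lambda)
# step with accumulating traces, operating on plain dict-of-dict tables.
# All names and default values here are hypothetical.
def sarsa_lambda_step(q, et, s, a, reward, s_next, a_next,
                      alpha=0.1, gamma=0.9, lambd=0.8):
    delta = reward + gamma * q[s_next][a_next] - q[s][a]
    et[s][a] += 1.0                        # accumulating trace for the visited pair
    for state, actions in q.items():       # every (state, action) pair shares the TD error
        for action in actions:
            q[state][action] += alpha * delta * et[state][action]
            et[state][action] *= gamma * lambd
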
def get_optimal(self) -> Tuple[DetPolicy, VFType]:
    pol = self.get_init_policy()
    sa_dict = self.state_action_dict
    s_uniform_dict = {s: 1. / len(sa_dict) for s in sa_dict.keys()}
    start_gen_f = get_rv_gen_func(s_uniform_dict)
    qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
    episodes = 0
    max_steps = 10000

    while episodes < self.num_episodes:
        state = start_gen_f(1)[0]
        steps = 0
        terminate = False

        while not terminate:
            action = get_rv_gen_func(pol.get_state_probabilities(state))(1)[0]
            next_state, reward = self.state_reward_gen_dict[state][action]()
            qf_dict[state][action] += self.alpha *\
                (reward + self.gamma * max(qf_dict[next_state][a]
                                           for a in sa_dict[next_state]) -
                 qf_dict[state][action])
            state = next_state
            steps += 1
            terminate = steps >= max_steps or state in self.terminal_states

        pol = get_soft_policy_from_qf(qf_dict, self.softmax, self.epsilon)
        episodes += 1

    pol = get_det_policy_from_qf(qf_dict)
    vf_dict = self.get_value_func_dict(pol)
    return pol, vf_dict
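
# Illustrative sketch, not the class above: tabular Q-learning with an
# epsilon-greedy behaviour policy on a hypothetical two-state MDP. The model,
# states, actions and hyperparameters are all made up for demonstration.
import random

def q_learning_sketch(num_episodes: int = 2000) -> dict:
    gamma, alpha, epsilon = 0.9, 0.1, 0.1
    # (state, action) -> (next_state, reward); 'T' is terminal.
    model = {('A', 'x'): ('B', 0.0), ('A', 'y'): ('T', 1.0),
             ('B', 'x'): ('T', 2.0), ('B', 'y'): ('A', 0.0)}
    q = {'A': {'x': 0.0, 'y': 0.0}, 'B': {'x': 0.0, 'y': 0.0}, 'T': {None: 0.0}}
    for _ in range(num_episodes):
        state = random.choice(['A', 'B'])
        while state != 'T':
            # Epsilon-greedy action selection from the current Q estimates.
            if random.random() < epsilon:
                action = random.choice(list(q[state]))
            else:
                action = max(q[state], key=q[state].get)
            next_state, reward = model[(state, action)]
            # Off-policy target: bootstrap from the greedy next-state value.
            target = reward + gamma * max(q[next_state].values())
            q[state][action] += alpha * (target - q[state][action])
            state = next_state
    return q
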
def get_optimal_det_policy(self) -> DetPolicy:
    # Q(lambda)-style control: greedy (max) TD target combined with
    # accumulating eligibility traces and a soft behaviour policy.
    pol = self.get_init_policy()
    sa_dict = self.state_action_dict
    s_uniform_dict = {s: 1. / len(sa_dict) for s in sa_dict.keys()}
    start_gen_f = get_rv_gen_func(s_uniform_dict)
    qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
    episodes = 0

    while episodes < self.num_episodes:
        et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        state = start_gen_f(1)[0]
        steps = 0
        terminate = False

        while not terminate:
            action = get_rv_gen_func(pol.get_state_probabilities(state))(1)[0]
            next_state, reward = self.state_reward_gen_dict[state][action]()
            delta = reward + self.gamma * max(qf_dict[next_state][a]
                                              for a in sa_dict[next_state])\
                - qf_dict[state][action]
            et_dict[state][action] += 1
            for s, a_set in self.state_action_dict.items():
                for a in a_set:
                    qf_dict[s][a] += self.alpha * delta * et_dict[s][a]
                    et_dict[s][a] *= self.gamma * self.lambd
            pol = get_soft_policy_from_qf(qf_dict, self.softmax, self.epsilon)
            state = next_state
            steps += 1
            terminate = steps >= self.max_steps or\
                state in self.terminal_states

        episodes += 1

    return get_det_policy_from_qf(qf_dict)
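
# Illustrative sketch, independent of the class above: one Q(lambda)-style
# step, where the TD target bootstraps from the greedy next-state action while
# traces spread credit over previously visited pairs. Like the method above,
# this sketch does not cut traces on exploratory actions. Names are hypothetical.
def q_lambda_step(q, et, s, a, reward, s_next,
                  alpha=0.1, gamma=0.9, lambd=0.8):
    delta = reward + gamma * max(q[s_next].values()) - q[s][a]
    et[s][a] += 1.0
    for state, actions in q.items():
        for action in actions:
            q[state][action] += alpha * delta * et[state][action]
            et[state][action] *= gamma * lambd
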
def get_optimal_det_policy(self) -> DetPolicy:
    # Expected SARSA(lambda) control: the TD target is the policy-weighted
    # expectation over next-state action values, with eligibility traces.
    pol = self.get_init_policy()
    sa_dict = self.state_action_dict
    s_uniform_dict = {s: 1. / len(sa_dict) for s in sa_dict.keys()}
    start_gen_f = get_rv_gen_func(s_uniform_dict)
    qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
    episodes = 0

    while episodes < self.num_episodes:
        # Eligibility traces are reset at the start of each episode.
        et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        state = start_gen_f(1)[0]
        steps = 0
        terminate = False

        while not terminate:
            action = get_rv_gen_func(pol.get_state_probabilities(state))(1)[0]
            next_state, reward = self.state_reward_gen_dict[state][action]()
            delta = reward + self.gamma * sum(
                pol.get_state_action_probability(next_state, a) *
                qf_dict[next_state][a] for a in sa_dict[next_state])\
                - qf_dict[state][action]
            et_dict[state][action] += 1
            for s, a_set in self.state_action_dict.items():
                for a in a_set:
                    qf_dict[s][a] += self.alpha * delta * et_dict[s][a]
                    et_dict[s][a] *= self.gamma * self.lambd
            pol = get_soft_policy_from_qf(qf_dict, self.softmax, self.epsilon)
            state = next_state
            steps += 1
            terminate = steps >= self.max_steps or\
                state in self.terminal_states

        episodes += 1

    return get_det_policy_from_qf(qf_dict)
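
# Illustrative sketch, independent of the class above: the Expected SARSA
# target, which bootstraps from the policy-weighted average of next-state
# action values instead of a single sampled next action. Names are hypothetical.
def expected_sarsa_target(q, policy_probs, s_next, reward, gamma=0.9):
    # policy_probs maps each action available in s_next to its probability.
    expected_q = sum(policy_probs[a] * q[s_next][a] for a in q[s_next])
    return reward + gamma * expected_q
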