    },
    # Each state maps actions to {next_state: (transition_probability, reward)}
    2: {
        'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)},
        'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)}
    },
    3: {
        'a': {3: (1.0, 0.0)},
        'b': {3: (1.0, 0.0)}
    }
}
gamma_val = 0.9
mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular()
exploring_start_val = False
algorithm_type = TDAlgorithm.ExpectedSARSA
softmax_flag = False
epsilon_val = 0.1
epsilon_half_life_val = 1000
learning_rate_val = 0.1
lambda_val = 0.7
episodes_limit = 10000
batch_size_val = 20
max_steps_val = 1000
offline_val = True
# Feature functions for the function approximator: a numeric state feature,
# and state-action features (numeric state plus one-hot action indicators).
state_ff = [lambda s: float(s)]
sa_ff = [
    lambda x: float(x[0]),
    lambda x: 1. if x[1] == 'a' else 0.,
    lambda x: 1. if x[1] == 'b' else 0.,
    lambda x: 1. if x[1] == 'c' else 0.,
]
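The feature functions above are plain Python callables, so they can be checked directly. A minimal, dependency-free illustration (the sample state and state-action pair below are arbitrary, not taken from the original test):

sample_state = 2
sample_pair = (2, 'c')
print([f(sample_state) for f in state_ff])  # [2.0]
print([f(sample_pair) for f in sa_ff])      # [2.0, 0.0, 0.0, 1.0]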
# Sample the initial action for the episode from the current policy.
action = get_rv_gen_func_single(
    this_pol.get_state_probabilities(state)
)()
steps = 0
terminate = False
while not terminate:
    # Sample the next state and reward, then the next action from the
    # current policy.
    next_state, reward = \
        self.mdp_rep.state_reward_gen_dict[state][action]()
    next_action = get_rv_gen_func_single(
        this_pol.get_state_probabilities(next_state)
    )()
    if self.algorithm == TDAlgorithm.QLearning and control:
        # Q-Learning: bootstrap off the greedy action's value.
        next_qv = max(qf_dict[next_state][a] for a in
                      qf_dict[next_state])
    elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
        # Expected SARSA: bootstrap off the expected action value under
        # the current policy.
        # next_qv = sum(this_pol.get_state_action_probability(
        #     next_state,
        #     a
        # ) * qf_dict[next_state][a] for a in qf_dict[next_state])
        next_qv = get_expected_action_value(
            qf_dict[next_state],
            self.softmax,
            self.epsilon_func(episodes)
        )
    else:
        # SARSA: bootstrap off the sampled next action's value.
        next_qv = qf_dict[next_state][next_action]
    # TD error, eligibility-trace increment, and decaying step size.
    delta = reward + self.mdp_rep.gamma * next_qv -\
        qf_dict[state][action]
    et_dict[state][action] += 1
    alpha = self.learning_rate * (updates / self.learning_rate_decay
                                  + 1) ** -0.5
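The Expected SARSA branch above calls get_expected_action_value to average the successor state's Q-values under the behaviour policy. The helper below is a hedged, self-contained sketch of that idea for a softmax or epsilon-greedy policy; the name expected_action_value_sketch is invented for this illustration and it is not the library's confirmed implementation.

import math
from typing import Mapping


def expected_action_value_sketch(action_values: Mapping[str, float],
                                 softmax: bool,
                                 epsilon: float) -> float:
    if softmax:
        # Softmax policy: action probabilities proportional to exp(Q).
        exp_q = {a: math.exp(q) for a, q in action_values.items()}
        total = sum(exp_q.values())
        return sum((exp_q[a] / total) * q for a, q in action_values.items())
    # Epsilon-greedy policy: epsilon / |A| on every action, the remaining
    # 1 - epsilon on the greedy action.
    num_actions = len(action_values)
    greedy_q = max(action_values.values())
    return (epsilon / num_actions) * sum(action_values.values()) \
        + (1.0 - epsilon) * greedy_q


# With epsilon = 0.1 the expectation leans heavily on the best action:
print(expected_action_value_sketch({'a': 1.0, 'b': 3.0, 'c': 2.0}, False, 0.1))
# 0.1/3 * (1 + 3 + 2) + 0.9 * 3 = 2.9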
    },
    2: {
        'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)},
        'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)}
    },
    3: {
        'a': {3: (1.0, 0.0)},
        'b': {3: (1.0, 0.0)}
    }
}
gamma_val = 0.9
mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular()
exploring_start_val = False
algorithm_type = TDAlgorithm.ExpectedSARSA
softmax_flag = False
epsilon_val = 0.1
epsilon_half_life_val = 1000
learning_rate_val = 0.1
learning_rate_decay_val = 1e6
lambda_val = 0.7
episodes_limit = 10000
batch_size_val = 20
max_steps_val = 1000
state_ff = [lambda s: float(s)]
sa_ff = [
    lambda x: float(x[0]),
    lambda x: 1. if x[1] == 'a' else 0.,
    lambda x: 1. if x[1] == 'b' else 0.,
    lambda x: 1. if x[1] == 'c' else 0.,
]
    epsilon_half_life=epsilon_half_life,
    num_episodes=num_episodes,
    max_steps=max_steps,
    fa_spec=FuncApproxSpec(
        state_feature_funcs=state_feature_funcs,
        sa_feature_funcs=sa_feature_funcs,
        dnn_spec=None,
        learning_rate=learning_rate,
        add_unit_feature=False
    )
)
# Weight vectors for the state-value and action-value function
# approximators, registered as the approximators' parameters.
self.vf_w: np.ndarray = np.zeros(self.vf_fa.num_features)
self.qvf_w: np.ndarray = np.zeros(self.qvf_fa.num_features)
self.vf_fa.params = [self.vf_w]
self.qvf_fa.params = [self.qvf_w]
self.algorithm: TDAlgorithm = algorithm
self.gamma_lambda: float = self.mdp_rep.gamma * lambd
self.batch_size: int = batch_size
self.learning_rate_decay: float = learning_rate_decay
def get_tabular_sarsa(self) -> td0_tabular.TD0:
    return td0_tabular.TD0(
        self.get_mdp_rep_for_rl_tabular(),
        self.exploring_start,
        TDAlgorithm.SARSA,
        self.softmax,
        self.epsilon,
        self.epsilon_half_life,
        self.learning_rate,
        self.learning_rate_decay,
        self.num_episodes,
        self.max_steps
    )
# Sample the initial action for the episode from the current policy.
action = get_rv_gen_func_single(
    this_pol.get_state_probabilities(state)
)()
steps = 0
terminate = False
while not terminate:
    # Sample the next state and reward, then the next action from the
    # current policy.
    next_state, reward = \
        self.mdp_rep.state_reward_gen_dict[state][action]()
    next_action = get_rv_gen_func_single(
        this_pol.get_state_probabilities(next_state)
    )()
    if self.algorithm == TDAlgorithm.QLearning and control:
        # Q-Learning: bootstrap off the greedy action's value.
        next_qv = max(qf_dict[next_state][a] for a in
                      qf_dict[next_state])
    elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
        # Expected SARSA: bootstrap off the expected action value under
        # the current policy.
        # next_qv = sum(this_pol.get_state_action_probability(
        #     next_state,
        #     a
        # ) * qf_dict[next_state][a] for a in qf_dict[next_state])
        next_qv = get_expected_action_value(
            qf_dict[next_state],
            self.softmax,
            self.epsilon_func(episodes)
        )
    else:
        # SARSA: bootstrap off the sampled next action's value.
        next_qv = qf_dict[next_state][next_action]
    # TD(0) update with a step size that decays with the update count.
    qf_dict[state][action] += self.learning_rate *\
        (updates / self.learning_rate_decay + 1) ** -0.5 *\
        (reward + self.mdp_rep.gamma * next_qv -
         qf_dict[state][action])
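The update above scales its step size by (updates / learning_rate_decay + 1) ** -0.5, an inverse-square-root decay in the number of updates. A standalone arithmetic illustration with the learning_rate and learning_rate_decay values from the settings snippet earlier:

learning_rate = 0.1
learning_rate_decay = 1e6

for updates in (0, 1_000_000, 3_000_000):
    alpha = learning_rate * (updates / learning_rate_decay + 1) ** -0.5
    print(updates, round(alpha, 4))
# 0 0.1
# 1000000 0.0707
# 3000000 0.05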
def get_fa_qlearning_lambda(self) -> tdl_fa.TDLambda:
    return tdl_fa.TDLambda(
        self.get_mdp_rep_for_rl_fa(),
        self.exploring_start,
        TDAlgorithm.QLearning,
        self.softmax,
        self.epsilon,
        self.epsilon_half_life,
        self.lambd,
        self.num_episodes,
        self.batch_size,
        self.max_steps,
        self.fa_spec,
        self.tdl_fa_offline
    )
        learning_rate: float,
        learning_rate_decay: float,
        num_episodes: int,
        max_steps: int
    ) -> None:
        super().__init__(
            mdp_rep_for_rl=mdp_rep_for_rl,
            exploring_start=exploring_start,
            softmax=softmax,
            epsilon=epsilon,
            epsilon_half_life=epsilon_half_life,
            num_episodes=num_episodes,
            max_steps=max_steps
        )
        self.algorithm: TDAlgorithm = algorithm
        self.learning_rate: float = learning_rate
        self.learning_rate_decay: float = learning_rate_decay
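The constructors here take epsilon together with epsilon_half_life, and the learning loops call self.epsilon_func(episodes) for the exploration rate of the current episode. One schedule consistent with a "half-life" parameter is exponential decay; the sketch below (the name epsilon_schedule_sketch is invented here) is an assumption for illustration, not the library's confirmed epsilon_func.

def epsilon_schedule_sketch(epsilon: float,
                            epsilon_half_life: float,
                            episodes: int) -> float:
    # Exploration rate halves every epsilon_half_life episodes.
    return epsilon * 2.0 ** (-episodes / epsilon_half_life)


# With epsilon = 0.1 and epsilon_half_life = 1000, as in the settings snippets:
print(epsilon_schedule_sketch(0.1, 1000, 0))     # 0.1
print(epsilon_schedule_sketch(0.1, 1000, 1000))  # 0.05
print(epsilon_schedule_sketch(0.1, 1000, 2000))  # 0.025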
# print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in
#        self.mdp_rep.state_action_func(state))))
# print(self.qvf_fa.params)
steps = 0
terminate = False
while not terminate:
    # Sample the next state and reward, then the next action from the
    # current policy (this_polf maps a state to action probabilities).
    next_state, reward = \
        self.mdp_rep.state_reward_gen_func(state, action)
    next_action = get_rv_gen_func_single(this_polf(next_state))()
    if self.algorithm == TDAlgorithm.QLearning and control:
        # Q-Learning: bootstrap off the greedy action's approximate value.
        next_qv = max(self.qvf_fa.get_func_eval((next_state, a)) for a in
                      self.state_action_func(next_state))
    elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
        # Expected SARSA: bootstrap off the policy-expected approximate value.
        # next_qv = sum(this_polf(next_state).get(a, 0.) *
        #               self.qvf_fa.get_func_eval((next_state, a))
        #               for a in self.state_action_func(next_state))
        next_qv = get_expected_action_value(
            {a: self.qvf_fa.get_func_eval((next_state, a)) for a in
             self.state_action_func(next_state)},
            self.softmax,
            self.epsilon_func(episodes)
        )
    else:
        # SARSA: bootstrap off the sampled next action's approximate value.
        next_qv = self.qvf_fa.get_func_eval((next_state, next_action))
    target = reward + self.mdp_rep.gamma * next_qv
    # TD is an online update, so the policy improves at every time step.
    self.qvf_fa.update_params([(state, action)], [target])
    if control:
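The qvf_fa object above exposes get_func_eval and update_params, and the earlier __init__ fragment initialises its weights as a flat numpy vector. The sketch below shows what a single semi-gradient TD(0) step looks like for a linear approximator over state-action features; the feature values, step size and target are illustrative, and this is not the library's actual update code.

import numpy as np

# Hypothetical feature vector for one (state, action) pair, in the spirit of
# the sa_ff functions earlier: [float(state), is_a, is_b, is_c].
phi = np.array([2.0, 0.0, 0.0, 1.0])
w = np.zeros(4)      # weight vector, analogous to qvf_w above
alpha = 0.1          # step size
target = 5.3         # reward + gamma * next_qv, as computed in the loop above

# One semi-gradient TD(0) step toward the target:
prediction = float(w @ phi)
w += alpha * (target - prediction) * phi

print(prediction)       # 0.0 before the update
print(float(w @ phi))   # 2.65 after one step: alpha * target * (phi @ phi)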
def get_tabular_expected_sarsa_lambda(self) -> tdl_tabular.TDLambda:
    return tdl_tabular.TDLambda(
        self.get_mdp_rep_for_rl_tabular(),
        self.exploring_start,
        TDAlgorithm.ExpectedSARSA,
        self.softmax,
        self.epsilon,
        self.epsilon_half_life,
        self.learning_rate,
        self.learning_rate_decay,
        self.lambd,
        self.num_episodes,
        self.max_steps
    )