How to use the algorithms.td_algo_enum.TDAlgorithm enum in algorithms

To help you get started, we've selected a few TDAlgorithm examples based on popular ways it is used in public projects. A minimal usage sketch follows, and the excerpts below are drawn from the coverdrive/MDP-DP-RL repository.
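TDAlgorithm is an enum whose members (SARSA, ExpectedSARSA, QLearning) select the temporal-difference control variant. A minimal sketch of how the member drives the bootstrap target, assuming only the member names and import path that appear on this page:

from algorithms.td_algo_enum import TDAlgorithm

def describe_bootstrap(algorithm: TDAlgorithm) -> str:
    # Mirrors the branching used in the td0.py / tdlambda.py excerpts below
    if algorithm == TDAlgorithm.QLearning:
        return "bootstrap with max_a Q(s', a)"
    elif algorithm == TDAlgorithm.ExpectedSARSA:
        return "bootstrap with the policy-weighted expectation of Q(s', .)"
    else:  # TDAlgorithm.SARSA
        return "bootstrap with Q(s', a') for the sampled next action"

print(describe_bootstrap(TDAlgorithm.ExpectedSARSA))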


github coverdrive / MDP-DP-RL / src / algorithms / rl_func_approx / tdlambda.py (View on GitHub)
        # (excerpt starts partway through the mdp_refined_data dict literal)
        },
        2: {
            'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)},
            'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)}
        },
        3: {
            'a': {3: (1.0, 0.0)},
            'b': {3: (1.0, 0.0)}
        }
    }
    gamma_val = 0.9
    mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
    mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular()

    exploring_start_val = False
    algorithm_type = TDAlgorithm.ExpectedSARSA
    softmax_flag = False
    epsilon_val = 0.1
    epsilon_half_life_val = 1000
    learning_rate_val = 0.1
    lambda_val = 0.7
    episodes_limit = 10000
    batch_size_val = 20
    max_steps_val = 1000
    offline_val = True
    state_ff = [lambda s: float(s)]
    sa_ff = [
        lambda x: float(x[0]),
        lambda x: 1. if x[1] == 'a' else 0.,
        lambda x: 1. if x[1] == 'b' else 0.,
        lambda x: 1. if x[1] == 'c' else 0.,
    ]
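The configuration values above are typically passed into the function-approximation TDLambda constructor together with a FuncApproxSpec. The sketch below follows the argument order visible in the run_all_algorithms.py and tdlambda_exact.py excerpts further down this page and assumes TDLambda and FuncApproxSpec are imported from the repository; treat it as an illustration rather than the exact test code:

    fa_spec_val = FuncApproxSpec(
        state_feature_funcs=state_ff,
        sa_feature_funcs=sa_ff,
        dnn_spec=None,
        learning_rate=learning_rate_val,
        add_unit_feature=False
    )
    td_lambda_obj = TDLambda(
        mdp_rep_obj,
        exploring_start_val,
        algorithm_type,          # TDAlgorithm.ExpectedSARSA
        softmax_flag,
        epsilon_val,
        epsilon_half_life_val,
        lambda_val,
        episodes_limit,
        batch_size_val,
        max_steps_val,
        fa_spec_val,
        offline_val
    )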
github coverdrive / MDP-DP-RL / src / algorithms / rl_tabular / tdlambda.py (View on GitHub)
                action = get_rv_gen_func_single(
                    this_pol.get_state_probabilities(state)
                )()
            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_dict[state][action]()
                next_action = get_rv_gen_func_single(
                    this_pol.get_state_probabilities(next_state)
                )()
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qv = max(qf_dict[next_state][a] for a in
                                  qf_dict[next_state])
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qv = sum(this_pol.get_state_action_probability(
                    #     next_state,
                    #     a
                    # ) * qf_dict[next_state][a] for a in qf_dict[next_state])
                    next_qv = get_expected_action_value(
                        qf_dict[next_state],
                        self.softmax,
                        self.epsilon_func(episodes)
                    )
                else:
                    next_qv = qf_dict[next_state][next_action]

                delta = reward + self.mdp_rep.gamma * next_qv -\
                    qf_dict[state][action]
                et_dict[state][action] += 1
                alpha = self.learning_rate * (updates / self.learning_rate_decay
                                              + 1) ** -0.5
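In the ExpectedSARSA branch above, get_expected_action_value combines the action values with the exploration policy's probabilities. Its internals are not shown on this page; a minimal sketch of the epsilon-greedy (softmax=False) case, using the standard epsilon-greedy expectation formula, looks like this:

def eps_greedy_expected_value(action_values: dict, epsilon: float) -> float:
    # E[Q(s', a)] under an epsilon-greedy policy: the greedy action gets
    # probability 1 - epsilon + epsilon/|A|, every other action epsilon/|A|
    num_actions = len(action_values)
    greedy_value = max(action_values.values())
    mean_value = sum(action_values.values()) / num_actions
    return (1.0 - epsilon) * greedy_value + epsilon * mean_value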
github coverdrive / MDP-DP-RL / src / algorithms / rl_func_approx / tdlambda_exact.py (View on GitHub)
        # (excerpt starts partway through the mdp_refined_data dict literal)
        },
        2: {
            'a': {1: (0.3, 9.8), 2: (0.6, 6.7), 3: (0.1, 1.8)},
            'c': {1: (0.2, 4.8), 2: (0.4, 9.2), 3: (0.4, -8.2)}
        },
        3: {
            'a': {3: (1.0, 0.0)},
            'b': {3: (1.0, 0.0)}
        }
    }
    gamma_val = 0.9
    mdp_ref_obj1 = MDPRefined(mdp_refined_data, gamma_val)
    mdp_rep_obj = mdp_ref_obj1.get_mdp_rep_for_rl_tabular()

    exploring_start_val = False
    algorithm_type = TDAlgorithm.ExpectedSARSA
    softmax_flag = False
    epsilon_val = 0.1
    epsilon_half_life_val = 1000
    learning_rate_val = 0.1
    learning_rate_decay_val = 1e6
    lambda_val = 0.7
    episodes_limit = 10000
    batch_size_val = 20
    max_steps_val = 1000
    state_ff = [lambda s: float(s)]
    sa_ff = [
        lambda x: float(x[0]),
        lambda x: 1. if x[1] == 'a' else 0.,
        lambda x: 1. if x[1] == 'b' else 0.,
        lambda x: 1. if x[1] == 'c' else 0.,
    ]
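learning_rate_decay_val controls how quickly the step size shrinks; the td0.py excerpt further down applies learning_rate * (updates / learning_rate_decay + 1) ** -0.5. A quick check of that schedule with the values above:

    for updates in (0, 1_000_000, 4_000_000):
        alpha = learning_rate_val * (updates / learning_rate_decay_val + 1) ** -0.5
        print(updates, round(alpha, 4))   # 0.1, then ~0.0707, then ~0.0447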
github coverdrive / MDP-DP-RL / src / algorithms / rl_func_approx / tdlambda_exact.py (View on GitHub)
            epsilon_half_life=epsilon_half_life,
            num_episodes=num_episodes,
            max_steps=max_steps,
            fa_spec=FuncApproxSpec(
                state_feature_funcs=state_feature_funcs,
                sa_feature_funcs=sa_feature_funcs,
                dnn_spec=None,
                learning_rate=learning_rate,
                add_unit_feature=False
            )
        )
        self.vf_w: np.ndarray = np.zeros(self.vf_fa.num_features)
        self.qvf_w: np.ndarray = np.zeros(self.qvf_fa.num_features)
        self.vf_fa.params = [self.vf_w]
        self.qvf_fa.params = [self.qvf_w]
        self.algorithm: TDAlgorithm = algorithm
        self.gamma_lambda: float = self.mdp_rep.gamma * lambd
        self.batch_size: int = batch_size
        self.learning_rate_decay: float = learning_rate_decay
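The gamma_lambda product stored in the constructor is the standard per-step decay applied to eligibility traces in TD(lambda). A generic accumulating-trace step, shown only as textbook background rather than code from the repository:

import numpy as np

def accumulating_trace_step(trace: np.ndarray, features: np.ndarray,
                            gamma_lambda: float) -> np.ndarray:
    # Decay the existing trace by gamma * lambda, then add the current
    # feature vector for the visited (state, action) pair
    return gamma_lambda * trace + features

trace = accumulating_trace_step(np.zeros(4), np.ones(4), 0.9 * 0.7)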
github coverdrive / MDP-DP-RL / src / examples / run_all_algorithms.py (View on GitHub)
    def get_tabular_sarsa(self) -> td0_tabular.TD0:
        return td0_tabular.TD0(
            self.get_mdp_rep_for_rl_tabular(),
            self.exploring_start,
            TDAlgorithm.SARSA,
            self.softmax,
            self.epsilon,
            self.epsilon_half_life,
            self.learning_rate,
            self.learning_rate_decay,
            self.num_episodes,
            self.max_steps
        )
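The sibling factory methods in run_all_algorithms.py differ only in the TDAlgorithm member they pass. A hypothetical parameterised version (not part of the repository) makes that explicit:

    def get_tabular_td0(self, algorithm: TDAlgorithm) -> td0_tabular.TD0:
        # Same wiring as get_tabular_sarsa, with the enum member as an argument
        return td0_tabular.TD0(
            self.get_mdp_rep_for_rl_tabular(),
            self.exploring_start,
            algorithm,  # e.g. TDAlgorithm.QLearning or TDAlgorithm.ExpectedSARSA
            self.softmax,
            self.epsilon,
            self.epsilon_half_life,
            self.learning_rate,
            self.learning_rate_decay,
            self.num_episodes,
            self.max_steps
        )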
github coverdrive / MDP-DP-RL / src / algorithms / rl_tabular / td0.py (View on GitHub)
                action = get_rv_gen_func_single(
                    this_pol.get_state_probabilities(state)
                )()
            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_dict[state][action]()
                next_action = get_rv_gen_func_single(
                    this_pol.get_state_probabilities(next_state)
                )()
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qv = max(qf_dict[next_state][a] for a in
                                  qf_dict[next_state])
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qv = sum(this_pol.get_state_action_probability(
                    #     next_state,
                    #     a
                    # ) * qf_dict[next_state][a] for a in qf_dict[next_state])
                    next_qv = get_expected_action_value(
                        qf_dict[next_state],
                        self.softmax,
                        self.epsilon_func(episodes)
                    )
                else:
                    next_qv = qf_dict[next_state][next_action]

                qf_dict[state][action] += self.learning_rate *\
                    (updates / self.learning_rate_decay + 1) ** -0.5 *\
                    (reward + self.mdp_rep.gamma * next_qv -
                     qf_dict[state][action])
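The last four lines above are the tabular TD(0) control update written inline. Factored out as a standalone function (same arithmetic, just isolated for readability):

def td0_q_update(q_sa: float, alpha: float, reward: float,
                 gamma: float, next_qv: float) -> float:
    # Q(s, a) <- Q(s, a) + alpha * (r + gamma * next_qv - Q(s, a))
    return q_sa + alpha * (reward + gamma * next_qv - q_sa)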
github coverdrive / MDP-DP-RL / src / examples / run_all_algorithms.py (View on GitHub)
    def get_fa_qlearning_lambda(self) -> tdl_fa.TDLambda:
        return tdl_fa.TDLambda(
            self.get_mdp_rep_for_rl_fa(),
            self.exploring_start,
            TDAlgorithm.QLearning,
            self.softmax,
            self.epsilon,
            self.epsilon_half_life,
            self.lambd,
            self.num_episodes,
            self.batch_size,
            self.max_steps,
            self.fa_spec,
            self.tdl_fa_offline
        )
github coverdrive / MDP-DP-RL / src / algorithms / rl_tabular / td0.py (View on GitHub)
            learning_rate: float,
            learning_rate_decay: float,
            num_episodes: int,
            max_steps: int
    ) -> None:

        super().__init__(
            mdp_rep_for_rl=mdp_rep_for_rl,
            exploring_start=exploring_start,
            softmax=softmax,
            epsilon=epsilon,
            epsilon_half_life=epsilon_half_life,
            num_episodes=num_episodes,
            max_steps=max_steps
        )
        self.algorithm: TDAlgorithm = algorithm
        self.learning_rate: float = learning_rate
        self.learning_rate_decay: Optional[float] = learning_rate_decay
github coverdrive / MDP-DP-RL / src / algorithms / rl_func_approx / td0.py (View on GitHub)
            # print((episodes, max(self.qvf_fa.get_func_eval((state, a)) for a in
            #        self.mdp_rep.state_action_func(state))))
            # print(self.qvf_fa.params)

            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_func(state, action)
                next_action = get_rv_gen_func_single(this_polf(next_state))()
                if self.algorithm == TDAlgorithm.QLearning and control:
                    next_qv = max(self.qvf_fa.get_func_eval((next_state, a)) for a in
                                  self.state_action_func(next_state))
                elif self.algorithm == TDAlgorithm.ExpectedSARSA and control:
                    # next_qv = sum(this_polf(next_state).get(a, 0.) *
                    #               self.qvf_fa.get_func_eval((next_state, a))
                    #               for a in self.state_action_func(next_state))
                    next_qv = get_expected_action_value(
                        {a: self.qvf_fa.get_func_eval((next_state, a)) for a in
                         self.state_action_func(next_state)},
                        self.softmax,
                        self.epsilon_func(episodes)
                    )
                else:
                    next_qv = self.qvf_fa.get_func_eval((next_state, next_action))

                target = reward + self.mdp_rep.gamma * next_qv
                # TD is an online update, so the policy improves at every time step
                self.qvf_fa.update_params([(state, action)], [target])
                if control:
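The excerpt cuts off at the control branch; the key point is that the function-approximation version evaluates Q through qvf_fa.get_func_eval and learns through qvf_fa.update_params instead of indexing a dict. The repository's function-approximation class is not shown on this page; a toy linear stand-in with the same two calls illustrates the contract:

import numpy as np

class TinyLinearQ:
    # Hypothetical stand-in for the repository's function approximator
    def __init__(self, feature_funcs, learning_rate: float) -> None:
        self.feature_funcs = feature_funcs
        self.learning_rate = learning_rate
        self.w = np.zeros(len(feature_funcs))

    def _phi(self, sa) -> np.ndarray:
        return np.array([f(sa) for f in self.feature_funcs])

    def get_func_eval(self, sa) -> float:
        # Linear value estimate: phi(s, a) . w
        return float(self._phi(sa) @ self.w)

    def update_params(self, sa_list, targets) -> None:
        # One semi-gradient step per (state, action) toward its TD target
        for sa, target in zip(sa_list, targets):
            phi = self._phi(sa)
            self.w += self.learning_rate * (target - phi @ self.w) * phi

q = TinyLinearQ(sa_ff, learning_rate=0.1)   # sa_ff as defined in the excerpts above
q.update_params([(1, 'a')], [5.0])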
github coverdrive / MDP-DP-RL / src / examples / run_all_algorithms.py (View on GitHub)
    def get_tabular_expected_sarsa_lambda(self) -> tdl_tabular.TDLambda:
        return tdl_tabular.TDLambda(
            self.get_mdp_rep_for_rl_tabular(),
            self.exploring_start,
            TDAlgorithm.ExpectedSARSA,
            self.softmax,
            self.epsilon,
            self.epsilon_half_life,
            self.learning_rate,
            self.learning_rate_decay,
            self.lambd,
            self.num_episodes,
            self.max_steps
        )