How to use the algorithms.helper_funcs.get_rv_gen_func function

To help you get started, we've selected a few examples of get_rv_gen_func, based on popular ways the function is used in public projects. All of the snippets below come from the coverdrive/MDP-DP-RL repository: get_rv_gen_func is given a dict that maps outcomes to probabilities and returns a sampling function, so that start_gen_f(1)[0] draws a single random start state (or state-action pair) from that distribution.

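The snippets below do not show get_rv_gen_func itself, but its contract is clear from how it is called: it takes a dict mapping outcomes to their probabilities and returns a sampler, and calling that sampler with an integer n yields a sequence of n draws from the distribution (so start_gen_f(1)[0] is a single random draw). Here is a minimal sketch of such a function, assuming NumPy is available; it illustrates the contract and is not the repository's actual implementation:

import numpy as np

def get_rv_gen_func_sketch(prob_dict):
    # Split the dict into parallel lists of outcomes and probabilities.
    outcomes = list(prob_dict.keys())
    probs = list(prob_dict.values())

    def gen(n):
        # Draw n indices according to the probabilities and map them back to
        # the outcomes (which may be tuples, e.g. (state, action) pairs).
        idx = np.random.choice(len(outcomes), size=n, p=probs)
        return [outcomes[i] for i in idx]

    return gen

# Usage mirroring the snippets below:
start_gen_f = get_rv_gen_func_sketch({'s1': 0.5, 's2': 0.5})
state = start_gen_f(1)[0]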

From coverdrive/MDP-DP-RL, src/algorithms/opt_learning_tdl_base.py:
def get_value_func_dict(self, pol: Policy) -> VFType:
        sa_dict = self.state_action_dict
        s_uniform_dict = {s: 1. / len(sa_dict) for s in sa_dict.keys()}
        start_gen_f = get_rv_gen_func(s_uniform_dict)
        vf_dict = {s: 0. for s in sa_dict.keys()}
        act_gen_dict = {s: get_rv_gen_func(pol.get_state_probabilities(s))
                        for s in self.state_action_dict.keys()}
        episodes = 0

        while episodes < self.num_episodes:
            et_dict = {s: 0. for s in sa_dict.keys()}
            state = start_gen_f(1)[0]
            steps = 0
            terminate = False

            while not terminate:
                action = act_gen_dict[state](1)[0]
                next_state, reward = self.state_reward_gen_dict[state][action]()
                delta = reward + self.gamma * vf_dict[next_state] - vf_dict[state]
                et_dict[state] += 1
                for s in self.state_action_dict.keys():
                    vf_dict[s] += self.alpha * delta * et_dict[s]
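                    # (the excerpt above is cut off here; based on the TD(lambda)
                    # pattern in the later snippets on this page, the loop
                    # presumably continues roughly as follows -- a sketch, not
                    # necessarily the repository's exact code)
                    et_dict[s] *= self.gamma * self.lambd
                state = next_state
                steps += 1
                terminate = steps >= self.max_steps or \
                    state in self.terminal_states

            episodes += 1

        return vf_dict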
From coverdrive/MDP-DP-RL, src/algorithms/opt_learning_td0_base.py:
def get_act_value_func_dict(self, pol: Policy) -> QVFType:
        sa_dict = self.state_action_dict
        sa_uniform_dict = {(s, a): 1. / sum(len(v) for v in sa_dict.values())
                           for s, v1 in sa_dict.items() for a in v1}
        start_gen_f = get_rv_gen_func(sa_uniform_dict)
        qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        act_gen_dict = {s: get_rv_gen_func(pol.get_state_probabilities(s))
                        for s in self.state_action_dict.keys()}
        episodes = 0

        while episodes < self.num_episodes:
            state, action = start_gen_f(1)[0]
            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = self.state_reward_gen_dict[state][action]()
                next_action = act_gen_dict[next_state](1)[0]
                qf_dict[state][action] += self.alpha *\
                    (reward + self.gamma * qf_dict[next_state][next_action] -
                     qf_dict[state][action])
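                # (the excerpt above is cut off here; note that start_gen_f
                # yields (state, action) pairs because the keys of
                # sa_uniform_dict are pairs. Following the bookkeeping pattern
                # of the other snippets on this page, the loop presumably ends
                # roughly like this -- a sketch, not necessarily the
                # repository's exact code)
                state, action = next_state, next_action
                steps += 1
                terminate = steps >= self.max_steps or \
                    state in self.terminal_states

            episodes += 1

        return qf_dict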
From coverdrive/MDP-DP-RL, src/algorithms/opt_learning_tdl_base.py:
def get_act_value_func_dict(self, pol: Policy) -> QVFType:
        sa_dict = self.state_action_dict
        sa_uniform_dict = {(s, a): 1. / sum(len(v) for v in sa_dict.values())
                           for s, v1 in sa_dict.items() for a in v1}
        start_gen_f = get_rv_gen_func(sa_uniform_dict)
        qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        act_gen_dict = {s: get_rv_gen_func(pol.get_state_probabilities(s))
                        for s in self.state_action_dict.keys()}
        episodes = 0

        while episodes < self.num_episodes:
            et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
            state, action = start_gen_f(1)[0]
            steps = 0
            terminate = False

            while not terminate:
                next_state, reward = self.state_reward_gen_dict[state][action]()
                next_action = act_gen_dict[next_state](1)[0]
                delta = reward + self.gamma * qf_dict[next_state][next_action] -\
                    qf_dict[state][action]
                et_dict[state][action] += 1
                for s, a_set in self.state_action_dict.items():
From coverdrive/MDP-DP-RL, src/algorithms/opt_learning_qlearning.py:
def get_optimal(self) -> Tuple[DetPolicy, VFType]:
        pol = self.get_init_policy()
        sa_dict = self.state_action_dict
        s_uniform_dict = {s: 1. / len(sa_dict) for s in sa_dict.keys()}
        start_gen_f = get_rv_gen_func(s_uniform_dict)
        qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        episodes = 0
        max_steps = 10000

        while episodes < self.num_episodes:
            state = start_gen_f(1)[0]
            steps = 0
            terminate = False

            while not terminate:
                action = get_rv_gen_func(pol.get_state_probabilities(state))(1)[0]
                next_state, reward = self.state_reward_gen_dict[state][action]()
                qf_dict[state][action] += self.alpha *\
                    (reward + self.gamma * max(qf_dict[next_state][a]
                                               for a in sa_dict[next_state]) -
                     qf_dict[state][action])
                state = next_state
                steps += 1
                terminate = steps >= max_steps or state in self.terminal_states

            pol = get_soft_policy_from_qf(qf_dict, self.softmax, self.epsilon)
            episodes += 1

        pol = get_det_policy_from_qf(qf_dict)
        vf_dict = self.get_value_func_dict(pol)
        return pol, vf_dict
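
This snippet shows the page's other calling style: instead of precomputing a dict of per-state samplers (the act_gen_dict built in the prediction snippets above), it constructs a fresh sampler from the current policy on every step via get_rv_gen_func(pol.get_state_probabilities(state))(1)[0]. That is because pol is re-derived from qf_dict as learning proceeds, so samplers built once at the start would go stale. When the policy is fixed, the construction can be hoisted out of the loops, as the prediction snippets do -- a sketch of that pattern:

act_gen_dict = {s: get_rv_gen_func(pol.get_state_probabilities(s))
                for s in sa_dict}      # built once, before the episode loop
action = act_gen_dict[state](1)[0]     # used inside the step loop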
From coverdrive/MDP-DP-RL, src/algorithms/opt_learning_qlearning_lambda.py:
def get_optimal_det_policy(self) -> DetPolicy:
        pol = self.get_init_policy()
        sa_dict = self.state_action_dict
        s_uniform_dict = {s: 1. / len(sa_dict) for s in sa_dict.keys()}
        start_gen_f = get_rv_gen_func(s_uniform_dict)
        qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        episodes = 0

        while episodes < self.num_episodes:
            et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
            state = start_gen_f(1)[0]
            steps = 0
            terminate = False

            while not terminate:
                action = get_rv_gen_func(pol.get_state_probabilities(state))(1)[0]
                next_state, reward = self.state_reward_gen_dict[state][action]()
                delta = reward + self.gamma * max(qf_dict[next_state][a]
                                                  for a in sa_dict[next_state])\
                    - qf_dict[state][action]
                et_dict[state][action] += 1
                for s, a_set in self.state_action_dict.items():
                    for a in a_set:
                        qf_dict[s][a] += self.alpha * delta * et_dict[s][a]
                        et_dict[s][a] *= self.gamma * self.lambd
                pol = get_soft_policy_from_qf(qf_dict, self.softmax, self.epsilon)
                state = next_state
                steps += 1
                terminate = steps >= self.max_steps or\
                    state in self.terminal_states

            episodes += 1
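        # (the excerpt ends here; given the DetPolicy return type and the
        # closing lines of get_optimal above, the method presumably finishes by
        # extracting the greedy policy -- a sketch, not necessarily the
        # repository's exact code)
        return get_det_policy_from_qf(qf_dict)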
From coverdrive/MDP-DP-RL, src/algorithms/opt_learning_expsarsa_lambda.py:
def get_optimal_det_policy(self) -> DetPolicy:
        pol = self.get_init_policy()
        sa_dict = self.state_action_dict
        s_uniform_dict = {s: 1. / len(sa_dict) for s in sa_dict.keys()}
        start_gen_f = get_rv_gen_func(s_uniform_dict)
        qf_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
        episodes = 0

        while episodes < self.num_episodes:
            state = start_gen_f(1)[0]
            steps = 0
            terminate = False

            while not terminate:
                et_dict = {s: {a: 0.0 for a in v} for s, v in sa_dict.items()}
                action = get_rv_gen_func(pol.get_state_probabilities(state))(1)[0]
                next_state, reward = self.state_reward_gen_dict[state][action]()
                delta = reward + self.gamma * sum(
                    pol.get_state_action_probability(next_state, a) *
                    qf_dict[next_state][a] for a in sa_dict[next_state])\
                    - qf_dict[state][action]
                et_dict[state][action] += 1
                for s, a_set in self.state_action_dict.items():
                    for a in a_set:
                        qf_dict[s][a] += self.alpha * delta * et_dict[s][a]
                        et_dict[s][a] *= self.gamma * self.lambd
                pol = get_soft_policy_from_qf(qf_dict, self.softmax, self.epsilon)
                state = next_state
                steps += 1
                terminate = steps >= self.max_steps or\
                    state in self.terminal_states
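
For clarity, here is the bootstrap target used above in isolation: the behaviour action is still drawn with get_rv_gen_func from the soft policy, but the update backs up the expectation of Q under the policy rather than the value of a single sampled next action. A standalone sketch with assumed argument names:

def expected_sarsa_target(reward, gamma, pol, qf_dict, next_state, actions):
    # Average Q(next_state, a) over the policy's action probabilities instead
    # of bootstrapping from the one sampled next action, as plain SARSA would.
    return reward + gamma * sum(
        pol.get_state_action_probability(next_state, a) * qf_dict[next_state][a]
        for a in actions
    )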