How to use the tianshou.data.advantage_estimation module in tianshou

To help you get started, we’ve selected a few tianshou examples based on popular ways advantage_estimation is used in public projects.

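Across these snippets the pattern is the same: pick a processing function from advantage_estimation (full_return, nstep_return, nstep_q_return, gae_lambda, or ddpg_return) and hand it to the data-collection object, which applies it to the trajectories it gathers. The minimal sketch below illustrates that pattern with the DataCollector API used in ppo.py and actor_critic.py further down; it assumes a policy pi already built as in those examples and omits the losses, optimizers, and training loop.

import gym
import tianshou as ts

env = gym.make('CartPole-v0')  # any gym environment, as in the cartpole examples
# pi: a tianshou policy, assumed here to be built as in the snippets below
# (e.g. policy.Distributional(...) for PPO or policy.DQN(...) for Q-learning)

data_buffer = ts.data.BatchSet()  # on-policy buffer; the DQN/DDPG examples use VanillaReplayBuffer instead
data_collector = ts.data.DataCollector(
    env=env,
    policy=pi,
    data_buffer=data_buffer,
    # any advantage_estimation function(s) can be plugged in here
    process_functions=[ts.data.advantage_estimation.full_return],
    managed_networks=[pi],
)

data_collector.collect(num_episodes=50)  # gather episodes; full_return is applied to the collected data
data_collector.statistics()              # print return statistics, as the examples do each epoch

The older Batch(env, pi, advantage_estimation.full_return) calls in some of the snippets play the same role as DataCollector here.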

github thu-ml / tianshou / examples / ppo_cartpole_gym.py
    # to allow network sharing between policy and value networks. This makes 'policy' and 'value_function'
    # imbalanced semantically (though they are naturally imbalanced since 'policy' is required to interact
    # with the environment and 'value_function' is not). I have an idea to solve this imbalance, which is
    # not based on passing function or overriding function.

    ### 2. build policy, loss, optimizer
    pi = policy.OnehotCategorical(my_policy, observation_placeholder=observation_ph, weight_update=0)

    ppo_loss_clip = losses.ppo_clip(pi, clip_param)

    total_loss = ppo_loss_clip
    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=pi.trainable_variables)

    ### 3. define data collection
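    # full_return processes each collected episode into per-step cumulative returns (no critic needed)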
    training_data = Batch(env, pi, advantage_estimation.full_return)

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign pi to pi_old
        pi.sync_weights()  # TODO: automate this for policies with target network

        start_time = time.time()
        for i in range(100):
            # collect data
            training_data.collect(num_episodes=50)

            # print current return
github thu-ml / tianshou / examples / dqn.py
        return None, action_values  # no policy head

    ### 2. build policy, loss, optimizer
    dqn = value_function.DQN(my_network, observation_placeholder=observation_ph, has_old_net=True)
    pi = policy.DQN(dqn)

    dqn_loss = losses.qlearning(dqn)

    total_loss = dqn_loss
    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=list(dqn.trainable_variables))

    ### 3. define data collection
    replay_buffer = VanillaReplayBuffer(capacity=2e4, nstep=1)

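    # nstep_q_return(1, dqn) builds 1-step Q-learning targets from dqn's target network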
    process_functions = [advantage_estimation.nstep_q_return(1, dqn)]
    managed_networks = [dqn]

    data_collector = DataCollector(
        env=env,
        policy=pi,
        data_buffer=replay_buffer,
        process_functions=process_functions,
        managed_networks=managed_networks
    )

    ### 4. start training
    # hyper-parameters
    batch_size = 32
    replay_buffer_warmup = 1000
    epsilon_decay_interval = 500
    epsilon = 0.6
github thu-ml / tianshou / examples / dqn_example.py
        action_values = tf.layers.dense(net, action_dim, activation=None)

        return None, action_values  # no policy head

    ### 2. build policy, loss, optimizer
    dqn = value_function.DQN(my_network, observation_placeholder=observation_ph, weight_update=100)
    pi = policy.DQN(dqn)

    dqn_loss = losses.qlearning(dqn)

    total_loss = dqn_loss
    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=dqn.trainable_variables)

    ### 3. define data collection
    data_collector = Batch(env, pi, [advantage_estimation.nstep_q_return(1, dqn)], [dqn])

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign the current network to the old (target) network
        pi.sync_weights()  # TODO: automate this for policies with target network

        start_time = time.time()
        # TODO: repeat_num should be defined in some configuration file
        repeat_num = 100
        for i in range(repeat_num):
            # collect data
            data_collector.collect(num_episodes=50, epsilon_greedy=(repeat_num - i + 0.0) / repeat_num)
github thu-ml / tianshou / examples / actor_critic_separate_cartpole.py
    ### 2. build policy, critic, loss, optimizer
    actor = policy.OnehotCategorical(my_network, observation_placeholder=observation_ph, weight_update=1)
    critic = value_function.StateValue(my_network, observation_placeholder=observation_ph)

    actor_loss = losses.REINFORCE(actor)
    critic_loss = losses.value_mse(critic)

    actor_optimizer = tf.train.AdamOptimizer(1e-4)
    actor_train_op = actor_optimizer.minimize(actor_loss, var_list=actor.trainable_variables)

    critic_optimizer = tf.train.RMSPropOptimizer(1e-4)
    critic_train_op = critic_optimizer.minimize(critic_loss, var_list=critic.trainable_variables)

    ### 3. define data collection
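    # gae_lambda(1, critic) estimates advantages for the actor; nstep_return(1, critic) builds 1-step targets for the critic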
    data_collector = Batch(env, actor,
                           [advantage_estimation.gae_lambda(1, critic), advantage_estimation.nstep_return(1, critic)],
                           [actor, critic])

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        start_time = time.time()
        for i in range(100):
            # collect data
            data_collector.collect(num_episodes=20)

            # print current return
            print('Epoch {}:'.format(i))
            data_collector.statistics()
github thu-ml / tianshou / examples / ppo.py
    pi = policy.Distributional(my_policy, observation_placeholder=observation_ph, has_old_net=True)

    ppo_loss_clip = losses.ppo_clip(pi, clip_param)

    total_loss = ppo_loss_clip
    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=list(pi.trainable_variables))

    ### 3. define data collection
    data_buffer = BatchSet()

    data_collector = DataCollector(
        env=env,
        policy=pi,
        data_buffer=data_buffer,
        process_functions=[advantage_estimation.full_return],
        managed_networks=[pi],
    )

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign actor to pi_old
        pi.sync_weights()

        start_time = time.time()
        for i in range(1000):
            # collect data
            data_collector.collect(num_episodes=50)
github thu-ml / tianshou / examples / actor_critic_fail_cartpole.py
    ### 2. build policy, critic, loss, optimizer
    print('actor and critic will share the first two layers in this case, and the third layer will cause error')
    actor = policy.OnehotCategorical(my_actor, observation_placeholder=observation_ph, weight_update=1)
    critic = value_function.StateValue(my_critic, observation_placeholder=observation_ph)

    actor_loss = losses.vanilla_policy_gradient(actor)
    critic_loss = losses.value_mse(critic)
    total_loss = actor_loss + critic_loss

    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=actor.trainable_variables)

    ### 3. define data collection
    training_data = Batch(env, actor, advantage_estimation.full_return)

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign actor to pi_old
        actor.sync_weights()  # TODO: automate this for policies with target network

        start_time = time.time()
        for i in range(100):
            # collect data
            training_data.collect(num_episodes=20)

            # print current return
github thu-ml / tianshou / examples / ddpg.py
    critic = ts.value_function.ActionValue(my_network, observation_placeholder=observation_ph,
                                        action_placeholder=action_ph, has_old_net=True)
    soft_update_op = ts.get_soft_update_op(1e-2, [actor, critic])

    critic_loss = ts.losses.value_mse(critic)
    critic_optimizer = tf.train.AdamOptimizer(1e-3)
    critic_train_op = critic_optimizer.minimize(critic_loss, var_list=list(critic.trainable_variables))

    dpg_grads_vars = ts.opt.DPG(actor, critic)
    actor_optimizer = tf.train.AdamOptimizer(1e-3)
    actor_train_op = actor_optimizer.apply_gradients(dpg_grads_vars)

    ### 3. define data collection
    data_buffer = ts.data.VanillaReplayBuffer(capacity=10000, nstep=1)

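    # ddpg_return(actor, critic) computes DDPG target values from the target actor and target critic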
    process_functions = [ts.data.advantage_estimation.ddpg_return(actor, critic)]

    data_collector = ts.data.DataCollector(
        env=env,
        policy=actor,
        data_buffer=data_buffer,
        process_functions=process_functions,
        managed_networks=[actor, critic]
    )

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign actor to pi_old
github thu-ml / tianshou / examples / ppo_example.py
    # 2. build losses, optimizers
    pi = policy.OnehotCategorical(action_logits, observation_placeholder=observation) # YongRen: policy.Gaussian (could reference the policy in TRPO paper, my code is adapted from zhusuan.distributions) policy.DQN etc.
    # for continuous action space, you may need to change an environment to run
    pi_old = policy.OnehotCategorical(action_logits_old, observation_placeholder=observation)

    action = tf.placeholder(dtype=tf.int32, shape=[None]) # batch of integer actions
    advantage = tf.placeholder(dtype=tf.float32, shape=[None]) # advantage values used in the Gradients

    ppo_loss_clip = losses.ppo_clip(action, advantage, clip_param, pi, pi_old) # TongzhengRen: losses.vpg ... management of placeholders and feed_dict

    total_loss = ppo_loss_clip
    optimizer = tf.train.AdamOptimizer(1e-3)
    train_op = optimizer.minimize(total_loss, var_list=train_var_list)

    # 3. define data collection
    training_data = Batch(env, pi, advantage_estimation.full_return) # YouQiaoben: finish and polish Batch, advantage_estimation.gae_lambda as in PPO paper
                                                             # ShihongSong: Replay(), see dqn_example.py
    # maybe a dict to manage the elements to be collected

    # 4. start training
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        minibatch_count = 0
        collection_count = 0
        while True: # until some stopping criterion met...
            # collect data
            training_data.collect(num_episodes=2) # YouQiaoben, ShihongSong
            collection_count += 1
            print('Collected {} times.'.format(collection_count))

            # update network
github thu-ml / tianshou / examples / ppo_cartpole_alternative.py
    is_training_ph = tf.placeholder(tf.bool, shape=())
    keep_prob_ph = tf.placeholder(tf.float32, shape=())

    my_policy = MyPolicy(observation_ph, is_training_ph, keep_prob_ph, action_dim)

    ### 2. build policy, loss, optimizer
    pi = policy.Normal(my_policy, observation_placeholder=observation_ph, weight_update=0)

    ppo_loss_clip = losses.ppo_clip(pi, clip_param)

    total_loss = ppo_loss_clip
    optimizer = tf.train.AdamOptimizer(1e-4)
    train_op = optimizer.minimize(total_loss, var_list=pi.trainable_variables)

    ### 3. define data collection
    training_data = Batch(env, pi, advantage_estimation.full_return)

    ### 4. start training
    feed_dict_train = {is_training_ph: True, keep_prob_ph: 0.8}
    feed_dict_test = {is_training_ph: False, keep_prob_ph: 1}

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        # assign pi to pi_old
        pi.sync_weights()  # TODO: automate this for policies with target network

        start_time = time.time()
        for i in range(100):
            # collect data
github thu-ml / tianshou / examples / actor_critic.py
    optimizer = tf.train.AdamOptimizer(1e-4)

    # this hack would be unnecessary if we have a `SharedPolicyValue` class, or hack the trainable_variables management
    var_list = list(actor.trainable_variables | critic.trainable_variables)

    train_op = optimizer.minimize(total_loss, var_list=var_list)

    ### 3. define data collection
    data_buffer = ts.data.BatchSet()

    data_collector = ts.data.DataCollector(
        env=env,
        policy=actor,
        data_buffer=data_buffer,
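        # 3-step returns bootstrapped from the critic; return_advantage=True also yields advantages for the actor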
        process_functions=[ts.data.advantage_estimation.nstep_return(n=3, value_function=critic, return_advantage=True)],
        managed_networks=[actor, critic],
    )

    ### 4. start training
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        start_time = time.time()
        for i in range(1000):
            # collect data
            data_collector.collect(num_episodes=50)

            # print current return
            print('Epoch {}:'.format(i))