Gradient-free RL (CartPole-v1)

We can even use PyHopper to treat the weights of a neural network as hyperparameters and optimize them directly, without computing any gradients. For instance:

import numpy as np
import gym
import pyhopper

def replay_policy(policy, render=False):
    """Play one episode with an MLP policy and return the total reward.

    The policy is a one-hidden-layer MLP with tanh activation; the action
    taken at each step is the argmax over the output units. The episode is
    played in the module-level ``env``.

    Works with both Gym step/reset conventions: the older one
    (``reset() -> obs``, ``step() -> (obs, reward, done, info)``) and the
    newer one (``reset() -> (obs, info)``,
    ``step() -> (obs, reward, terminated, truncated, info)``).

    Args:
        policy: Mapping holding the weight arrays under the keys
            ``"w1"``, ``"b1"``, ``"w2"`` and ``"b2"``.
        render: If True, call ``env.render()`` after every step.

    Returns:
        The sum of all rewards collected until the episode ended.
    """
    reset_out = env.reset()
    # Newer Gym versions return (observation, info) instead of the bare observation.
    obs = reset_out[0] if isinstance(reset_out, tuple) else reset_out

    total_reward = 0
    done = False
    while not done:
        # Forward pass of MLP with tanh activation
        hidden = np.tanh(np.dot(obs, policy["w1"]) + policy["b1"])
        # Second layer with 2 output classes; pick the highest-scoring action
        action = int(np.argmax(np.dot(hidden, policy["w2"]) + policy["b2"]))

        step_out = env.step(action)
        if len(step_out) == 5:
            # Newer API splits the end-of-episode flag into terminated/truncated.
            obs, reward, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            obs, reward, done, _ = step_out

        total_reward += reward
        if render:
            env.render()
    return total_reward

# Environment used by replay_policy (read there as a module-level global).
env = gym.make("CartPole-v1")

# Search space: every weight matrix and bias vector of the MLP
# (4 inputs -> 64 hidden units -> 2 outputs) is a float hyperparameter.
search = pyhopper.Search(
    dict(
        w1=pyhopper.float(shape=(4, 64)),
        b1=pyhopper.float(shape=(64,)),
        w2=pyhopper.float(shape=(64, 2)),
        b2=pyhopper.float(shape=(2,)),
    )
)

# Maximize the episode reward for one second on four parallel jobs.
# NOTE(review): wrap_n_times presumably evaluates each candidate 5 times and
# aggregates the results to reduce noise -- confirm against the pyhopper docs.
policy = search.run(
    pyhopper.wrap_n_times(replay_policy, n=5),
    direction="max",
    runtime="1s",
    n_jobs=4,
)

The script above trains an MLP with one hidden layer to the maximum reward of 500 in less than a second.