The computations are run on an Ubuntu Linux computer, with the following conda environment:

conda create --name mountain-car-continuous python==3.7 --no-default-packages -y
conda activate mountain-car-continuous
pip install "gym[classic_control]"
sudo apt-get install xvfb
pip install jupyterlab
pip install pyvirtualdisplay
pip install matplotlib
pip install cma
pip install scipy
import os
import numpy as np
import gym
import cma
from copy import copy
import concurrent.futures
import matplotlib.pylab as plt
from matplotlib import animation
from itertools import product
import argparse
import pickle
# Build the environment with rgb_array rendering so episode frames can be captured later.
env = gym.make('MountainCarContinuous-v0', render_mode='rgb_array')
state = env.reset()  # NOTE(review): with gym>=0.26 reset() returns (obs, info); play_episode below unpacks it
np.random.seed(0)  # seed numpy for reproducibility (env and CMA-ES carry their own seeds)
state_shape = env.observation_space.shape[0]  # observation dimension (position, velocity for this env)
action_shape = env.action_space.shape[0]  # action dimension (a single continuous force)
N_HIDDEN_1 = 25  # hidden-layer width of the policy network
POPULATION_SIZE = 16  # CMA-ES population size per generation
def reshape(theta, state_shape, action_shape, n_hidden=None):
    """Unpack a flat parameter vector into the weights of a one-hidden-layer MLP.

    Parameters
    ----------
    theta : array-like
        Flat vector of length
        ``state_shape*n_hidden + n_hidden + n_hidden*action_shape + action_shape``.
    state_shape : int
        Input (observation) dimension.
    action_shape : int
        Output (action) dimension.
    n_hidden : int, optional
        Hidden-layer width; defaults to the module-level ``N_HIDDEN_1``
        for backward compatibility.

    Returns
    -------
    tuple of ndarray
        ``(w1, b1, w2, b2)`` with shapes ``(state_shape, n_hidden)``,
        ``(n_hidden,)``, ``(n_hidden, action_shape)``, ``(action_shape,)``.
        Each piece is a fresh copy, so mutating it does not alias ``theta``.
    """
    if n_hidden is None:
        n_hidden = N_HIDDEN_1
    theta = np.asarray(theta)

    # Cumulative slice boundaries for the four parameter groups.
    w1_end = state_shape * n_hidden
    b1_end = w1_end + n_hidden
    w2_end = b1_end + n_hidden * action_shape
    b2_end = w2_end + action_shape

    # Copy each slice exactly once (the original double-copied b2).
    w1_theta = theta[:w1_end].copy().reshape(state_shape, n_hidden)
    b1_theta = theta[w1_end:b1_end].copy().reshape(n_hidden)
    w2_theta = theta[b1_end:w2_end].copy().reshape(n_hidden, action_shape)
    b2_theta = theta[w2_end:b2_end].copy().reshape(action_shape)
    return (w1_theta, b1_theta, w2_theta, b2_theta)
def play_episode(theta, render=False):
    """Roll out one episode of the policy encoded by the flat vector ``theta``.

    Parameters
    ----------
    theta : array-like
        Flat parameter vector; unpacked by ``reshape`` into MLP weights.
    render : bool
        When True, collect an rgb frame per step (plus the initial frame).

    Returns
    -------
    (rewards, frames)
        ``rewards`` is a list of per-step rewards (seeded with 0.0);
        ``frames`` is a list of rendered frames, or None when render=False.
    """
    # reshape() already copies each slice, so no extra np.copy is needed.
    weights = reshape(theta, state_shape, action_shape)

    rewards = [0.0]

    state, _ = env.reset()
    frames = [env.render()] if render else None
    done = False
    while not done:
        state = np.squeeze(state)

        # Policy forward pass: linear hidden layer (no activation), tanh output
        # to keep the action within [-1, 1].
        h1 = np.matmul(np.expand_dims(state, axis=0), weights[0]) + weights[1]
        action = np.tanh(np.matmul(h1, weights[2]) + weights[3])

        # gym>=0.26 step() returns (obs, reward, terminated, truncated, info);
        # the original mislabeled these as (..., done, terminated, ...).
        # `action` has shape (1, action_shape); pass its first row so the shape
        # matches the action space and avoids the NumPy DeprecationWarning the
        # (1, 1) array triggered inside the env.
        next_state, reward, terminated, truncated, _ = env.step(action[0])
        done = terminated or truncated

        if render:
            frames.append(env.render())

        rewards.append(reward)
        state = next_state

        # Hard cap to avoid pathologically long episodes.
        if len(rewards) > 1000:
            break

    return rewards, frames
# Total parameter count of the policy: w1 + b1 + w2 + b2.
N_THETA = state_shape * N_HIDDEN_1 + N_HIDDEN_1 + N_HIDDEN_1 * action_shape + action_shape
print(f"# CPUs: {os.cpu_count()}, # parameters: {N_THETA}, population size: {POPULATION_SIZE}")
# CPUs: 8, # parameters: 101, population size: 16
# CMA-ES over the flat parameter vector: start at the zero vector, sigma=0.5.
es = cma.CMAEvolutionStrategy(N_THETA * [0], 0.5, {'popsize': POPULATION_SIZE, 'seed': 42})

history = []
for i in range(2_000):
    solutions = es.ask()  # sample POPULATION_SIZE candidate parameter vectors
    outputs = list(map(play_episode, solutions))  # one full episode per candidate
    # we minimize, not maximize
    total_rewards = -np.array(list(map(lambda x: sum(x[0]), outputs)))
    es.tell(solutions, total_rewards)  # update the search distribution
    history.append(total_rewards)
    # print(f'\tMin: {total_rewards.min()}, max; {total_rewards.max()}, mean: {total_rewards.mean()}')
    es.disp()

env.close()
(8_w,16)-aCMA-ES (mu_w=4.8,w_1=32%) in dimension 101 (seed=42, Thu Sep 29 19:49:49 2022)


/opt/conda/envs/mountain-car-continuous/lib/python3.7/site-packages/gym/utils/passive_env_checker.py:165: UserWarning: WARN: The obs returned by the `step()` method is not within the observation space.
  logger.warn(f"{pre} is not within the observation space.")


Iterat #Fevals   function value  axis ratio  sigma  min&max std  t[m:s]
    1     16 1.502839754904724e+00 1.0e+00 4.81e-01  5e-01  5e-01 0:00.6
    2     32 -6.646012667324604e+01 1.0e+00 4.67e-01  5e-01  5e-01 0:01.3
    3     48 -5.465133991109310e+01 1.0e+00 4.55e-01  5e-01  5e-01 0:02.0
    8    128 -3.068559179630321e+01 1.0e+00 4.25e-01  4e-01  4e-01 0:05.2
   15    240 -9.552544701060445e+01 1.1e+00 4.19e-01  4e-01  4e-01 0:09.6


/opt/conda/envs/mountain-car-continuous/lib/python3.7/site-packages/gym/envs/classic_control/continuous_mountain_car.py:171: DeprecationWarning: setting an array element with a sequence. This was supported in some cases where the elements are arrays with a single element. For example `np.array([1, np.array([2])], dtype=int)`. In the future this will raise the same ValueError as `np.array([1, [2]], dtype=int)`.
  self.state = np.array([position, velocity], dtype=np.float32)


   24    384 -8.786137542985512e+01 1.1e+00 4.20e-01  4e-01  4e-01 0:14.8
   35    560 -9.520639168347108e+01 1.2e+00 4.28e-01  4e-01  4e-01 0:20.9
   48    768 -7.462872998594121e+01 1.2e+00 4.24e-01  4e-01  4e-01 0:28.1
   64   1024 -9.345241306785253e+01 1.3e+00 4.42e-01  4e-01  5e-01 0:36.2
   83   1328 -8.912790851050210e+01 1.4e+00 5.09e-01  5e-01  5e-01 0:45.7
  100   1600 -8.998464581098267e+01 1.4e+00 4.85e-01  5e-01  5e-01 0:54.2
  123   1968 -9.253289330772405e+01 1.5e+00 4.22e-01  4e-01  4e-01 1:05.2
  149   2384 -9.160463847218981e+01 1.5e+00 4.07e-01  4e-01  4e-01 1:17.3
  181   2896 -9.451325729414420e+01 1.6e+00 4.58e-01  4e-01  5e-01 1:30.3
  200   3200 -9.074752236387545e+01 1.6e+00 4.50e-01  4e-01  5e-01 1:37.1
  243   3888 -9.315807137483985e+01 1.7e+00 4.48e-01  4e-01  5e-01 1:52.4
  283   4528 -9.335987300123774e+01 1.8e+00 4.44e-01  4e-01  5e-01 2:08.5
  300   4800 -9.257067658096769e+01 1.8e+00 4.47e-01  4e-01  5e-01 2:13.6
  358   5728 -9.317723918847366e+01 1.9e+00 3.98e-01  4e-01  4e-01 2:31.7
  400   6400 -9.257488748293781e+01 2.0e+00 3.60e-01  3e-01  4e-01 2:45.3
  483   7728 -9.353893549681915e+01 2.1e+00 3.07e-01  3e-01  3e-01 3:05.4
  500   8000 -9.328339664349933e+01 2.1e+00 3.16e-01  3e-01  3e-01 3:08.9
  597   9552 -9.351628460627350e+01 2.3e+00 3.24e-01  3e-01  3e-01 3:31.0
  600   9600 -9.300784346591743e+01 2.3e+00 3.27e-01  3e-01  3e-01 3:31.6
  690  11040 -9.281270587728579e+01 2.4e+00 3.60e-01  3e-01  4e-01 3:55.8
  700  11200 -9.137431534209358e+01 2.4e+00 3.59e-01  3e-01  4e-01 3:58.6
  800  12800 -9.257380300057699e+01 2.6e+00 2.82e-01  3e-01  3e-01 4:22.8
  900  14400 -9.352347228619102e+01 2.7e+00 2.73e-01  3e-01  3e-01 4:41.8
 1000  16000 -9.349452192157889e+01 2.8e+00 2.07e-01  2e-01  2e-01 4:56.9
Iterat #Fevals   function value  axis ratio  sigma  min&max std  t[m:s]
 1100  17600 -9.345542787490389e+01 3.0e+00 1.91e-01  2e-01  2e-01 5:09.7
 1200  19200 -9.336087239567482e+01 3.1e+00 2.04e-01  2e-01  2e-01 5:21.2
 1300  20800 -9.327746855232169e+01 3.3e+00 1.93e-01  2e-01  2e-01 5:32.0
 1400  22400 -9.381875731368403e+01 3.5e+00 1.87e-01  2e-01  2e-01 5:42.9
 1500  24000 -9.344324633721889e+01 3.6e+00 1.76e-01  2e-01  2e-01 5:53.4
 1600  25600 -9.358110100892283e+01 3.9e+00 1.68e-01  2e-01  2e-01 6:03.9
 1700  27200 -9.338956351808618e+01 4.0e+00 1.78e-01  2e-01  2e-01 6:14.9
 1800  28800 -9.381769311707737e+01 4.2e+00 1.62e-01  1e-01  2e-01 6:25.4
 1900  30400 -9.364299196548910e+01 4.4e+00 1.98e-01  2e-01  2e-01 6:35.9
 2000  32000 -9.349874932676293e+01 4.6e+00 2.02e-01  2e-01  2e-01 6:46.3
# CMA-ES minimized the negated return, so flip the sign back to plot true rewards.
history = -np.array(history)
def smooth(x, window_len=11, window='hanning'):
    """Smooth a 1-D signal by convolving it with a normalized window.

    The signal is reflected at both ends before convolving to reduce
    boundary transients, so the result is *longer* than the input:
    ``len(x) + window_len - 1`` samples.

    Parameters
    ----------
    x : array-like
        1-D signal; should contain at least ``window_len`` samples.
    window_len : int
        Length of the smoothing window.
    window : str
        ``'flat'`` for a moving average, or the name of a NumPy window
        function (``'hanning'``, ``'hamming'``, ``'bartlett'``, ``'blackman'``).

    Returns
    -------
    ndarray
        The smoothed signal.
    """
    x = np.asarray(x)
    # Mirror the signal at both edges.
    s = np.r_[x[window_len - 1:0:-1], x, x[-2:-window_len - 1:-1]]
    if window == 'flat':  # moving average
        w = np.ones(window_len, 'd')
    else:
        # Look the window function up by name instead of eval()-ing a string.
        w = getattr(np, window)(window_len)

    return np.convolve(w / w.sum(), s, mode='valid')
# NOTE(review): butter/filtfilt are imported but never used below — candidates for removal.
from scipy.signal import butter, filtfilt
plt.figure(figsize=(12, 4))
# Plot smoothed per-iteration reward quantiles across the population.
for q in [0.1, 0.25, 0.5, 0.75, 0.9]:
    line = np.quantile(history, q, axis=1)
    plt.plot(smooth(line), label=f'Q: {q:.2f}')
plt.xlabel('Iteration')
plt.ylabel('Total Reward')
plt.legend(loc='upper left');

png

# Replay the best parameter vector found by CMA-ES and record the frames.
rewards, frames = play_episode(es.result.xbest, render=True)
total_rewards = [0] + np.cumsum(rewards).tolist()
# np array with shape (frames, height, width, channels)
video = np.array(frames[:])  # NOTE(review): frames[:] copies the list; np.array alone would suffice

fig, ax = plt.subplots(figsize=(4, 4))
im = ax.imshow(video[0,:,:,:])
ax.set_axis_off()
text = ax.text(30, 60, '', color='red')  # step/reward overlay, updated each frame
plt.close() # this is required to not display the generated image

def init():
    """Reset the image to the first frame (FuncAnimation init hook)."""
    im.set_data(video[0,:,:,:])

def animate(i):
    """Show frame i and update the step/cumulative-reward caption."""
    im.set_data(video[i,:,:,:])
    text.set_text(f'Step {i}, total reward: {total_rewards[i]:.2f}')
    return im

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=100)
anim.save('mountain-car-continuous-video.gif', writer='pillow', dpi=80, fps=24)