We’re having a lot of trouble hacking together a reinforcement learning version of this, so we are taking an alternative approacg, inspired by wtaching the MIT underactuated robotics course.

http://underactuated.csail.mit.edu/underactuated.html?chapter=acrobot

It took some pen and paper to get the equations of motion (which are maybe right?).

openai gym has

We switch over to LQR when the y position of the pole is above a certain height

This scipy function solves the algebriac ricatti equation in the ocntinous time infite horizon section

https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.linalg.solve_continuous_are.html

Things that helped: Trying to balance pole first from upright position then from downright.

Hacked in the LQR control by adjusting force_mag variable. Nasty.

Put it some slight compensation for a delayed observation, which reflects our actual sensor system

from custom_cartpole_delay import CartPoleEnv

import gym

import numpy as np
import scipy.linalg as linalg
lqr = linalg.solve_continuous_are

env = CartPoleEnv()
env.buffer_size = 5
gravity = 9.8
masscart = 1.0
masspole = 0.1
total_mass = (masspole + masscart)
length = 0.5 # actually half the pole's length
polemass_length = (masspole * length)
force_mag = 10.0
tau = 0.02

def E(x):
return 1/2 * masspole * (2 * length)**2 / 3 *  x**2 + np.cos(x) * polemass_length * gravity
def u(x):
#print(E(x)-Ed)
return 1.0 * (E(x)-Ed) * x * np.cos(x)

H = np.array([
[1,0,0,0],
[0,total_mass, 0, - polemass_length],
[0,0,1,0],
[0,- polemass_length, 0, (2 * length)**2 * masspole /3]
])

Hinv = np.linalg.inv(H)

A = Hinv @ np.array([
[0,1,0,0],
[0,0,0,0],
[0,0,0,1],
[0,0, - polemass_length * gravity, 0]
])
B = Hinv @ np.array([0,1.0,0,0]).reshape((4,1))
Q = np.diag([0.1, 1.0, 100.0, 5.0])
R = np.array([[0.1]])

P = lqr(A,B,Q,R)
Rinv = np.linalg.inv(R)
K = Rinv @ B.T @ P
print(K)
def ulqr(x):
x1 = np.copy(x)
x1 = np.sin(x1)
return np.dot(K, x1)

Ed = E([0,0,0,0])

observation = env.reset()
for _ in range(1000):
observation = np.copy(observation)
env.render()
observation += observation * tau * env.buffer_size
observation += observation * tau * env.buffer_size
observation += np.sin(observation) * tau * env.buffer_size * gravity  / (length * 2) / 2
#action = env.action_space.sample() # your agent here (this takes random actions)
a = u(observation) - 0.3 * observation -  0.1 * observation
if  abs(E(observation)-Ed) < 0.1 and np.cos(observation) > 0.6: #abs(E(observation)-Ed) < 0.1 and

a = ulqr(observation)
env.force_mag = min(abs(a), 10)
#print(a)
else:
env.force_mag = 10.0
if a < 0:
action = 0
else:
action = 1

observation, reward, done, info = env.step(action)

import math
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import logging
logger = logging.getLogger(__name__)

class CartPoleEnv(gym.Env):
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second' : 50
}

def __init__(self):
self.gravity = 9.8
self.masscart = 1.0
self.masspole = 0.1
self.total_mass = (self.masspole + self.masscart)
self.length = 0.5 # actually half the pole's length
self.polemass_length = (self.masspole * self.length)
self.force_mag = 3.0
self.tau = 0.02  # seconds between state updates

# Angle at which to fail the episode
# we expect full swings
self.theta_threshold_radians =  np.pi  #12 * 2 * math.pi / 360
self.x_threshold = 2.4

# Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
high = np.array([
self.x_threshold * 2,
np.finfo(np.float32).max,
np.finfo(np.float32).max])
high2 = np.array()
self.action_space = spaces.Discrete(2) #spaces.Box(-high2, high2)#
self.observation_space = spaces.Box(-high, high)

self._seed()
self.viewer = None
self.state = None
self.buffer_size = 0

self.steps_beyond_done = None
self.steps = 0
self.num_envs = 1
self.viewer = None
viewer = None
def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]

def _step(self, action):
#assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
#print(action)
#action =action # max(-1, min(action,1))
state = self.state
x, x_dot, theta, theta_dot = state
force = self.force_mag if action==1 else -self.force_mag
#force = self.force_mag * action
#print(action)
#print(state)

x  = x + self.tau * x_dot
theta = theta + self.tau * theta_dot
costheta = math.cos(theta)
sintheta = math.sin(theta)
temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
thetaacc = (self.gravity * sintheta - costheta* temp) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass))
xacc  = temp - self.polemass_length * thetaacc * costheta / self.total_mass

x_dot = x_dot + self.tau * xacc

theta_dot = theta_dot + self.tau * thetaacc
self.state = (x,x_dot,theta,theta_dot)

done =  x < -self.x_threshold \
or x > self.x_threshold \
or theta < -np.pi * 5 \
or theta > np.pi * 5 \
or self.steps > 1024
done = bool(done)

self.steps += 1
limit = 200

reward = 0.0
if  x < -self.x_threshold or x > self.x_threshold:
reward -= 1.0
reward += (np.cos(theta)+1)**2 / 4
#reward -= 0.1 * action**2
reward += -0.1 * x**2
if np.cos(theta) > 0.95:
reward += 1
#reward = reward/2048
'''
if not done:
reward = 1.0
elif self.steps_beyond_done is None:
# Pole just fell!
self.steps_beyond_done = 0
reward = 1.0
else:
if self.steps_beyond_done == 0:
logger.warning("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
self.steps_beyond_done += 1
reward = 0.0
'''
#print(np.array(self.state).reshape((1,-1)), reward, done, {})

#obs = np.array([x,x_dot,np.cos(theta),np.sin(theta),theta_dot])# + self.action_buffer)
self.buffer.append(np.copy(self.state))
obs2 = self.buffer.pop(0)
return obs2, reward, done, {}

def _reset(self):
#self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
self.state = np.array([0, 0, np.pi+0.2 , 0]) # np.pi+0.2 + self.np_random.uniform(low=-1.0, high=1.0, size=(4,))
self.steps_beyond_done = None
self.steps = 0
self.buffer = []
for i in range(self.buffer_size):
self.buffer.append(self.state)
return np.array(self.state)

def _render(self, mode='human', close=False):
if close:
if self.viewer is not None:
self.viewer.close()
self.viewer = None
return

screen_width = 600
screen_height = 400

world_width = self.x_threshold*2
scale = screen_width/world_width
carty = 100 # TOP OF CART
polewidth = 10.0
polelen = scale * 1.0
cartwidth = 50.0
cartheight = 30.0

if self.viewer is None:
from gym.envs.classic_control import rendering
self.viewer = rendering.Viewer(screen_width, screen_height)
l,r,t,b = -cartwidth/2, cartwidth/2, cartheight/2, -cartheight/2
axleoffset =cartheight/4.0
cart = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
self.carttrans = rendering.Transform()
l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2
pole = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
pole.set_color(.8,.6,.4)
self.poletrans = rendering.Transform(translation=(0, axleoffset))
self.axle = rendering.make_circle(polewidth/2)
self.axle.set_color(.5,.5,.8)