# More Reinforcement Learning with cvxpy

So I spent thanksgiving doing this and playing Zelda. Even though that sounds like a hell of a day, seems a little sad for thanksgiving :(. I should probably make more of an effort to go home next year.

I tried implementing a more traditional q-learning pipeline using cvxpy (rather than the inequality trick of the last time). Couldn’t get it to work as well. And it’s still kind of slow despite a lot of rearrangement to vectorize operations (through batching basically).

I guess I’m still entranced with the idea of avoiding neural networks. In a sense, that is the old boring way of doing things. The Deep RL is the new stuff. Using ordinary function approximators is way older I think. But I feel like it takes a problem out of the equation (dealing with training neural nets). Also I like modeling languages/libraries.

I kept finding show stopping bugs throughout the day (incorrectly written maxaction functions, typos, cross episode data points, etc.), so I wouldn’t be surprised if there is one still in here. It’s very surprising how one can convince oneself that it is kind of working when it is actually impossible it’s working. All these environments are so simple, that I suspect I could randomly sample controllers out of a sack for the time I’ve been fiddling with this stuff and find a good one.

```import gym
import numpy as np
import cvxpy as cvx
from numpy.polynomial.chebyshev import chebval
env = gym.make('Pendulum-v0')
print(env.action_space)

print(env.observation_space)
print(env.observation_space.high)

print(env.observation_space.low) # [1,1,8]
print(env.action_space.high) # -2
print(env.action_space.low)

chebdeg = 2
alpha = cvx.Variable(3*chebdeg**3)
alpha.value = np.zeros(3*chebdeg**3)
gamma = 1. - 1./50
def basis(s,a):
n = np.arange(4)
f1 = chebval(s/1.,np.eye(chebdeg)).reshape(-1,1,1,1)
f2 = chebval(s/1.,np.eye(chebdeg)).reshape(1,-1,1,1)
f3 = chebval(s/8.,np.eye(chebdeg)).reshape(1,1,-1,1)
f4 = ((a/2)**np.arange(3)).reshape(1,1,1,-1)
return f1*f2*f3*f4
def vbasis(s,a):
#n = np.arange(4)
N = s.shape

#print(chebval(s[:,0]/1.,np.eye(chebdeg)).shape)
f1 = chebval(s[:,0]/1.,np.eye(chebdeg)).T.reshape(N,-1,1,1,1)
f2 = chebval(s[:,1]/1.,np.eye(chebdeg)).T.reshape(N,1,-1,1,1)
f3 = chebval(s[:,2]/8.,np.eye(chebdeg)).T.reshape(N,1,1,-1,1)
if type(a) == np.ndarray:
f4 = (a.reshape(-1,1,1,1,1)/2)**(np.arange(3).reshape(1,1,1,1,-1))
else:
f4 = (a/2)**(np.arange(3).reshape(1,1,1,1,-1))
return (f1*f2*f3*f4).reshape(N,-1)

def qmaxaction(alpha,s):
#n = np.arange(4)
N = s.shape

#print(chebval(s[:,0]/1.,np.eye(chebdeg)).shape)
f1 = chebval(s[:,0]/1.,np.eye(chebdeg)).T.reshape(N,-1,1,1,1)
f2 = chebval(s[:,1]/1.,np.eye(chebdeg)).T.reshape(N,1,-1,1,1)
f3 = chebval(s[:,2]/8.,np.eye(chebdeg)).T.reshape(N,1,1,-1,1)
z = f1*f2*f3
a = (np.array([0,0,0.25]).reshape(1,1,1,1,-1)*z).reshape(N,-1)@alpha.value
b = (np.array([0,0.5,0]).reshape(1,1,1,1,-1)*z).reshape(N,-1)@alpha.value

acts = np.clip(-b/2/(a+1e-7), -2,2)
q2 = vbasis(s,2)@alpha.value
print(acts)
qa = vbasis(s,acts)@alpha.value
qn2 = vbasis(s,-2)@alpha.value
return np.maximum(qa,np.maximum(qn2,q2))

def evalb(alpha, s,a):
f = basis(s,a)
return alpha*f.flatten()

def maxaction(alpha, obs):
f1 = np.sum(evalb(alpha.value, observation, 2))
f2 = np.sum(evalb(alpha.value, observation, 0))
f3 = np.sum(evalb(alpha.value, observation, -2))
coeff = np.polyfit([2,0,-2], [f1,f2,f3], deg=2)
if coeff < 0:
action = -coeff/2/coeff
action = min(max(action,-2),2)
elif f1 > f3:
#print("huh")
action = 2
else:
#print("how bout dat")
action = -2
return np.array([action])

constraints = []
objective = 0

for x in range(4):

constraints = []
epobs = []
eprewards = []
epactions = []
objective = 0
epsilon = 1.2/(x+1)
print("epsilon: ", epsilon)
epNum = 50
for i_episode in range(epNum):
observations = []
rewards = []
actions = []
observation = env.reset()
reward = -100
for t in range(100):

prev_obs = observation
prev_reward = reward
if np.random.rand() < epsilon:
action = env.action_space.sample()
else:
action = maxaction(alpha, observation)
observation, reward, done, info = env.step(action)
observations.append(observation)
rewards.append(reward)
actions.append(action)

#constraints.append(evalb(alpha, prev_obs, action) >= prev_reward + gamma*evalb(alpha, observation, action + np.random.randn()*0.2))#maxaction(alpha, observation)))
#objective += evalb(alpha, observation, env.action_space.sample())
if done:
print("Episode finished after {} timesteps".format(t+1))
break
epobs.append(observations)
epactions.append(actions)
eprewards.append(rewards)
for qiter in range(20):
print("Q iteration: ", qiter)
objective = 0
for (observations,actions, rewards) in zip(epobs,epactions,eprewards):
obs = np.array(observations)
act = np.array(actions)
bprev = vbasis(obs[:-1],act[1:]) #   np.array([basis(s,a).flatten() for (s,a) in zip(observations[:-1],actions[1:])])
#bnext = np.array([basis(s,a+np.random.randn()*0.2).flatten() for (s,a) in zip(observations[1:],actions[1:])])
#obs = np.array(observations)[:-1]
#q2 = np.array([basis(s,2).flatten() for s in observations[:-1]])@alpha.value
#q2 = vbasis(obs[1:],2)@alpha.value
#q0 = vbasis(obs[1:],0)@alpha.value
#qn2 = vbasis(obs[1:],-2)@alpha.value
#q1 = vbasis(obs[1:],1)@alpha.value
#qn1 = vbasis(obs[1:],-1)@alpha.value
#qs = []
#for a in np.linspace(-2,2,5):
#    qs.append(vbasis(obs[1:],a+np.random.randn()*0.5)@alpha.value)
#for a in np.linspace(-0.1,0.1,3):
#    qs.append(vbasis(obs[1:],a)@alpha.value)

#qmax = np.max(np.stack([q2,q0,qn2,q1,qn1],axis=1),axis=1)
#qmax = np.max(np.stack(qs,axis=1),axis=1)
qmax = qmaxaction(alpha, obs[1:])

#q0 = np.array([basis(s,0).flatten() for s in observations[:-1]])@alpha.value
#qn2 = np.array([basis(s,-2).flatten() for s in observations[:-1]])@alpha.value

#qmax = np.maximum(np.maximum(q0,q2),qn2)#np.array([np.dot(basis(s, maxaction(alpha,s)).flatten(),alpha.value) for s in observations[1:]])
rewards = np.array(rewards)[:-1]
objective += cvx.sum(cvx.huber(bprev@alpha - (rewards + gamma*qmax))) #cvx.sum_squares(bprev@alpha - (rewards + gamma*qmax))

#bcost = np.array([basis(s,env.action_space.sample()).flatten() for s in observations])
#objective = cvx.sum(bcost@alpha) # np.sum(bnext,axis=0) * alpha
loss = 0.001*cvx.sum_squares(alpha-alpha.value) + objective
prob = cvx.Problem(cvx.Minimize(loss), constraints)
# The optimal objective value is returned by `prob.solve()`.
print("solving problem")
result = prob.solve(verbose=True)
print(result)
# The optimal value for x is stored in `x.value`.
print(alpha.value)
#inspection loop
for i_episode in range(4):
observation = env.reset()
#env.env.state = np.random.randn()*sigma
for t in range(200):
env.render()
#print(observation)
prev_obs = observation
action = maxaction(alpha, observation)
#print(action)

observation, reward, done, info = env.step(action)
if done:
print("Episode finished after {} timesteps".format(t+1))
break
```

I also did the easy cartpole environment using the inequality trick.  Seems to work pretty well.

```import gym
import numpy as np
import cvxpy as cvx
from numpy.polynomial.chebyshev import chebval
env = gym.make('CartPole-v1')
print(env.action_space)

print(env.observation_space)
print(env.observation_space.high)

print(env.observation_space.low) # [1,1,8]
#print(env.action_space.high) # -2
#print(env.action_space.low)

chebdeg = 3
alpha = cvx.Variable(2*chebdeg**4)
alpha.value = np.zeros(2*chebdeg**4)
gamma = 1. - 1./25
def basis(s,a):
n = np.arange(4)
f1 = chebval(s/4.,np.eye(chebdeg)).reshape(-1,1,1,1,1)
f2 = chebval(s/10.,np.eye(chebdeg)).reshape(1,-1,1,1,1)
f3 = chebval(s/0.4,np.eye(chebdeg)).reshape(1,1,-1,1,1)
f4 = chebval(s/10.,np.eye(chebdeg)).reshape(1,1,1,-1,1)
f5 = ((a/2)**np.arange(2)).reshape(1,1,1,1,-1)
return f1*f2*f3*f4*f5

def evalb(alpha, s,a):
f = basis(s,a)
return alpha*f.flatten()

def maxaction(alpha, obs):
f1 = np.sum(evalb(alpha.value, observation, 0))
f2 = np.sum(evalb(alpha.value, observation, 1))
if f1 > f2:
action = 0
else:
action = 1
return action

constraints = []
objective = 0

for x in range(4):
constraints = []
objective = 0

epsilon = 1.2/(x+1)
print("epsilon: ", epsilon)
for i_episode in range(200):
observation = env.reset()
observations = []
rewards = []
actions = []
reward = -100
for t in range(50):

prev_obs = observation
prev_reward = reward
if np.random.rand() < epsilon:
action = env.action_space.sample()
else:
action = maxaction(alpha, observation)
observation, reward, done, info = env.step(action)
observations.append(observation)
rewards.append(reward)
actions.append(action)

#constraints.append(evalb(alpha, prev_obs, action) >= prev_reward + gamma*evalb(alpha, observation, action + np.random.randn()*0.2))#maxaction(alpha, observation)))
#objective += evalb(alpha, observation, env.action_space.sample())
if done:
print("Episode finished after {} timesteps".format(t+1))
break
#print(rewards)
bprev = np.array([basis(s,a).flatten() for (s,a) in zip(observations[:-1],actions[1:])])
bnext = np.array([basis(s,env.action_space.sample()).flatten() for (s,a) in zip(observations[1:],actions[1:])])
rewards = np.array(rewards)[:-1]
constraints += [(bprev@alpha) >= rewards + gamma*(bnext)@alpha]

bcost = np.array([basis(s,env.action_space.sample()).flatten() for s in observations])
objective += cvx.sum(bcost@alpha) # np.sum(bnext,axis=0) * alpha
loss = 1*cvx.sum(cvx.abs(alpha-alpha.value)) + objective
prob = cvx.Problem(cvx.Minimize(loss), constraints)

print("solving problem")
result = prob.solve(verbose=True)
print(result)
print(alpha.value)
#inspection loop
for i_episode in range(4):
observation = env.reset()
#env.env.state = np.random.randn()*sigma
for t in range(200):
env.render()
#print(observation)
prev_obs = observation
action = maxaction(alpha, observation)
#print(action)

observation, reward, done, info = env.step(action)
if done:
print("Episode finished after {} timesteps".format(t+1))
break
```

I also have some Work in Progress on getting full swingup cartpole. Currently is not really working. Seems to kind of be pumping about right? The continuous force control easy cartpole does work though.

```import gym
import numpy as np
import cvxpy as cvx
from numpy.polynomial.chebyshev import chebval
env = gym.make('CartPole-v1')
print(env.action_space)

print(env.observation_space)
print(env.observation_space.high)

print(env.observation_space.low) # [1,1,8]
#print(env.action_space.high) # -2
#print(env.action_space.low)

chebdeg = 2
varNum = 2*2*4*2*2*3
alpha = cvx.Variable(varNum)
alpha.value = np.zeros(varNum)
gamma = 1. - 1./150
def basis(s,a):
n = np.arange(4)
f1 = chebval(s/4.,np.eye(2) ).reshape(-1,1,1,1,1,1)
f2 = chebval(s/10.,np.eye(2)).reshape(1,-1,1,1,1,1)
f3 = chebval(s/1,np.eye(4)  ).reshape(1,1,-1,1,1,1)
f4 = chebval(s/1,np.eye(2)  ).reshape(1,1,1,-1,1,1)
f5 = chebval(s/10.,np.eye(2)).reshape(1,1,1,1,-1,1)
f6 = ((a/10)**np.arange(3)).reshape(1,1,1,1,1,-1)
return f1*f2*f3*f4*f5*f6

def evalb(alpha, s,a):
f = basis(s,a)
return alpha*f.flatten()
'''
def maxaction(alpha, obs):
f1 = np.sum(evalb(alpha.value, observation, 0))
f2 = np.sum(evalb(alpha.value, observation, 1))
if f1 > f2:
action = 0
else:
action = 1
return action
'''
def maxaction(alpha, obs):
f1 = np.sum(evalb(alpha.value, observation, 10))
f2 = np.sum(evalb(alpha.value, observation, 0))
f3 = np.sum(evalb(alpha.value, observation, -10))
coeff = np.polyfit([10,0,-10], [f1,f2,f3], deg=2)
if coeff < 0:
action = -coeff/2/coeff
action = min(max(action,-10),10)
elif f1 > f3:
action = 10
else:
action = -10
return np.array([action])

constraints = []
objective = 0

for x in range(4):
constraints = []
objective = 0

epsilon = 1.2/(x+1)
print("epsilon: ", epsilon)
for i_episode in range(100):
observation = env.reset()
observations = []
rewards = []
actions = []
reward = -100
env.env.state = np.random.rand()*2*np.pi
for t in range(100):

prev_obs = observation
prev_reward = reward
if np.random.rand() < epsilon or len(observation) < 5:
action = np.random.rand()*20-10
else:
action = maxaction(alpha, observation)
a = action
env.env.force_mag = abs(a) #min(100, abs(a))
#print(a)
if a < 0:
daction = 0
else:
daction = 1
observation, reward, done, info = env.step(daction)
o = observation
observation = np.array([o,o,np.cos(o),np.sin(o),o])

observations.append(observation)
bonus = 0
if observation > 0.9:
bonus = 1

rewards.append( (bonus + observation + 1)/2)#- observation**2) #reward
#print(rewards[-1])
actions.append(action)

#constraints.append(evalb(alpha, prev_obs, action) >= prev_reward + gamma*evalb(alpha, observation, action + np.random.randn()*0.2))#maxaction(alpha, observation)))
#objective += evalb(alpha, observation, env.action_space.sample())
x = observation
if x < -4 or x > 4:
break
#if done:
#    print("Episode finished after {} timesteps".format(t+1))
#    break
#print(rewards)
bprev = np.array([basis(s,a).flatten() for (s,a) in zip(observations[:-1],actions[1:])])
bnext = np.array([basis(s,a+np.random.randn()*0.3).flatten() for (s,a) in zip(observations[1:],actions[1:])])
rewards = np.array(rewards)[:-1]
#print(bprev.shape)
#print(bnext.shape)

constraints += [(bprev@alpha) >= rewards + gamma*(bnext)@alpha]

bcost = np.array([basis(s,a+np.random.randn()*0.3).flatten() for s in observations])
objective += cvx.sum(bcost@alpha) # np.sum(bnext,axis=0) * alpha
loss = 1*cvx.sum(cvx.abs(alpha-alpha.value)) + objective
prob = cvx.Problem(cvx.Minimize(loss), constraints)

print("solving problem")
result = prob.solve(verbose=True)
print(result)
print(alpha.value)
#inspection loop
for i_episode in range(4):
observation = env.reset()
env.env.state = np.random.rand()*2*np.pi
#env.env.state = np.random.randn()*sigma
o = observation
observation = np.array([o,o,np.cos(o),np.sin(o),o])
for t in range(200):
env.render()
#print(observation)
prev_obs = observation
o = observation
observation = np.array([o,o,np.cos(o),np.sin(o),o])
action = maxaction(alpha, observation)
a = action
env.env.force_mag = abs(a) #min(100, abs(a))
#print(a)
if a < 0:
daction = 0
else:
daction = 1
print(action)

observation, reward, done, info = env.step(daction)

if x < -4 or x > 4:
break
```

Now I feel that a thing that matters quite a bit is what is your choice of action for the next time step. Hypothetically you want a ton of samples here. I now think that using an action that is just slightly perturbed from the actual action works well because the actual action is tending to become roughly the optimal one. Subsequent time steps have roughly the same data in them.

One advantage of discrete action space is that you can really search it all.

Does that mean I should seriously investigate the sum of squares form? A semidefinite variable per data point sounds bad. I feel like I’d have to seriously limit the amount of data I’m using. Maybe I’ll be pleasantly surprised.

I haven’t even gotten to playing with different polynomials yet. The current implementation is exponentially sized in the number of variables. But in kind of a silly way. I think it would be better to use all terms of a bounded total degree.