import gym

import numpy as np

import cvxpy as cvx

from numpy.polynomial.chebyshev import chebval

# Environment: classic pendulum swing-up task (continuous torque control).
env = gym.make('Pendulum-v0')

# Print the spaces so the feature scalings used below can be sanity-checked:
# observation is [cos(theta), sin(theta), thetadot] with |thetadot| <= 8,
# action is a single torque in [-2, 2].
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low) # [1,1,8]
print(env.action_space.high) # 2
print(env.action_space.low)  # -2

# Chebyshev degree used per state dimension in the Q-function feature map.
chebdeg = 2

# Q-function weight vector: one weight per (cheb1, cheb2, cheb3, action-power)
# feature — chebdeg**3 state features times 3 action powers (1, a/2, (a/2)**2).
alpha = cvx.Variable(3*chebdeg**3)
# Warm-start at zero so alpha.value is usable before the first solve.
alpha.value = np.zeros(3*chebdeg**3)

# Discount factor (effective horizon of ~50 steps).
gamma = 1. - 1./50

def basis(s, a, deg=None):
    """Tensor-product polynomial features for a single (state, action) pair.

    Parameters
    ----------
    s : array-like, length 3
        Pendulum observation [cos(theta), sin(theta), thetadot]; the third
        component is scaled by its bound of 8 so all inputs lie in [-1, 1].
    a : float
        Torque action; scaled by its bound of 2.
    deg : int, optional
        Chebyshev degree per state dimension. Defaults to the module-level
        ``chebdeg`` (backward compatible with the original two-arg call).

    Returns
    -------
    numpy.ndarray of shape (deg, deg, deg, 3)
        Outer product of the per-dimension Chebyshev features with the
        action powers (1, a/2, (a/2)**2).
    """
    d = chebdeg if deg is None else deg
    # chebval(x, eye(d)) evaluates T_0 .. T_{d-1} at x in one call.
    f1 = chebval(s[0]/1., np.eye(d)).reshape(-1, 1, 1, 1)
    f2 = chebval(s[1]/1., np.eye(d)).reshape(1, -1, 1, 1)
    f3 = chebval(s[2]/8., np.eye(d)).reshape(1, 1, -1, 1)
    # Quadratic in the (scaled) action: powers 0, 1, 2 of a/2.
    f4 = ((a/2)**np.arange(3)).reshape(1, 1, 1, -1)
    # Broadcasting builds the full rank-4 feature tensor.
    return f1*f2*f3*f4

def vbasis(s, a, deg=None):
    """Vectorized feature map: one flattened feature row per state in ``s``.

    Parameters
    ----------
    s : numpy.ndarray of shape (N, 3)
        Batch of pendulum observations.
    a : float or numpy.ndarray of shape (N,)
        A single action shared by every row, or one action per row.
    deg : int, optional
        Chebyshev degree per state dimension. Defaults to the module-level
        ``chebdeg`` (backward compatible with the original two-arg call).

    Returns
    -------
    numpy.ndarray of shape (N, 3*deg**3)
        Row i equals ``basis(s[i], a_i).flatten()``.
    """
    d = chebdeg if deg is None else deg
    N = s.shape[0]
    # chebval over a length-N vector returns shape (d, N); transpose to (N, d)
    # before inserting broadcast axes.
    f1 = chebval(s[:, 0]/1., np.eye(d)).T.reshape(N, -1, 1, 1, 1)
    f2 = chebval(s[:, 1]/1., np.eye(d)).T.reshape(N, 1, -1, 1, 1)
    f3 = chebval(s[:, 2]/8., np.eye(d)).T.reshape(N, 1, 1, -1, 1)
    if isinstance(a, np.ndarray):
        # One action per row: align the batch axis for broadcasting.
        f4 = (a.reshape(-1, 1, 1, 1, 1)/2)**(np.arange(3).reshape(1, 1, 1, 1, -1))
    else:
        # Scalar action broadcast across the whole batch.
        f4 = (a/2)**(np.arange(3).reshape(1, 1, 1, 1, -1))
    return (f1*f2*f3*f4).reshape(N, -1)

def qmaxaction(alpha, s):
    """Approximate ``max_a Q(s, a)`` for a batch of states.

    Because the features are quadratic in the (scaled) action — powers
    (1, a/2, (a/2)**2) — Q(s, a) is a per-state quadratic in ``a``.  The
    candidate maximizers are the quadratic's vertex (clipped to the torque
    bounds) and the two endpoints a = +/-2; the elementwise max over those
    candidates is returned.

    Parameters
    ----------
    alpha : cvxpy Variable with a populated ``.value``
        Current Q-function weights.
    s : numpy.ndarray of shape (N, 3)
        Batch of observations.

    Returns
    -------
    numpy.ndarray of shape (N,)
        Approximate max-over-actions Q value per state.
    """
    N = s.shape[0]
    # State-only Chebyshev feature tensor (action factor left out).
    f1 = chebval(s[:, 0]/1., np.eye(chebdeg)).T.reshape(N, -1, 1, 1, 1)
    f2 = chebval(s[:, 1]/1., np.eye(chebdeg)).T.reshape(N, 1, -1, 1, 1)
    f3 = chebval(s[:, 2]/8., np.eye(chebdeg)).T.reshape(N, 1, 1, -1, 1)
    z = f1*f2*f3
    # Q(s, a) = const + b*a + a2*a**2: the (a/2)**2 features contribute
    # 0.25 to the a**2 coefficient and the (a/2) features 0.5 to the a
    # coefficient.
    a = (np.array([0, 0, 0.25]).reshape(1, 1, 1, 1, -1)*z).reshape(N, -1)@alpha.value
    b = (np.array([0, 0.5, 0]).reshape(1, 1, 1, 1, -1)*z).reshape(N, -1)@alpha.value
    # Vertex of the quadratic, clipped to the torque bounds; the 1e-7 guards
    # against division by a zero curvature.  (Removed a leftover debug
    # print(acts) that spammed stdout on every fitting pass.)
    acts = np.clip(-b/2/(a+1e-7), -2, 2)
    q2 = vbasis(s, 2)@alpha.value
    qa = vbasis(s, acts)@alpha.value
    qn2 = vbasis(s, -2)@alpha.value
    return np.maximum(qa, np.maximum(qn2, q2))

def evalb(alpha, s, a):
    """Elementwise product of weights with the flattened (s, a) features."""
    return alpha * basis(s, a).flatten()

def maxaction(alpha, obs):
    """Return the approximately Q-maximizing torque for one observation.

    Samples Q at actions {2, 0, -2}, fits an exact quadratic through the
    three points, and takes the quadratic's vertex when it is a maximum
    (negative leading coefficient), clipped to [-2, 2]; otherwise the
    better endpoint.

    Parameters
    ----------
    alpha : cvxpy Variable with a populated ``.value``
        Current Q-function weights.
    obs : array-like, length 3
        A single pendulum observation.

    Returns
    -------
    numpy.ndarray of shape (1,)
        Action in the format ``env.step`` expects.
    """
    # BUG FIX: the original body read the module-level ``observation``
    # instead of the ``obs`` parameter, silently ignoring its argument.
    f1 = np.sum(evalb(alpha.value, obs, 2))
    f2 = np.sum(evalb(alpha.value, obs, 0))
    f3 = np.sum(evalb(alpha.value, obs, -2))
    coeff = np.polyfit([2, 0, -2], [f1, f2, f3], deg=2)
    if coeff[0] < 0:
        # Concave quadratic: interior vertex, clipped to the torque bounds.
        action = -coeff[1]/2/coeff[0]
        action = min(max(action, -2), 2)
    elif f1 > f3:
        # Convex (or flat): maximum sits at one of the endpoints.
        action = 2
    else:
        action = -2
    return np.array([action])

# Fitted Q-iteration: alternate between collecting episodes with an
# epsilon-greedy policy and refitting the Q weights ``alpha`` by solving a
# convex (Huber-penalized Bellman residual) problem.
constraints = []
objective = 0

for x in range(4):
    constraints = []
    # Replay storage for this round: one inner list per episode.
    epobs = []
    eprewards = []
    epactions = []
    objective = 0
    # Exploration rate decays across rounds: 1.2, 0.6, 0.4, 0.3.
    epsilon = 1.2/(x+1)
    print("epsilon: ", epsilon)
    epNum = 50
    # --- data collection phase ---
    for i_episode in range(epNum):
        observations = []
        rewards = []
        actions = []
        observation = env.reset()
        reward = -100  # placeholder before the first step
        for t in range(100):
            prev_obs = observation
            prev_reward = reward
            # Epsilon-greedy: random torque vs. greedy w.r.t. current Q.
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = maxaction(alpha, observation)
            observation, reward, done, info = env.step(action)
            observations.append(observation)
            rewards.append(reward)
            actions.append(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
        epobs.append(observations)
        epactions.append(actions)
        eprewards.append(rewards)
    # --- fitting phase: repeated Q iterations on the collected data ---
    for qiter in range(20):
        print("Q iteration: ", qiter)
        objective = 0
        for (observations, actions, rewards) in zip(epobs, epactions, eprewards):
            obs = np.array(observations)
            act = np.array(actions)
            # observations[t] is the state reached AFTER actions[t], so
            # obs[:-1] paired with act[1:] gives (state, action taken there).
            bprev = vbasis(obs[:-1], act[1:])
            # Bootstrapped value of the successor states under current alpha.
            qmax = qmaxaction(alpha, obs[1:])
            # NOTE(review): rewards[:-1] pairs reward t with action t+1;
            # rewards[1:] would match the (obs[:-1], act[1:]) transitions —
            # confirm whether this off-by-one is intended.
            rewards = np.array(rewards)[:-1]
            # Huber loss on the Bellman residual Q(s,a) - (r + gamma*maxQ(s')).
            objective += cvx.sum(cvx.huber(bprev@alpha - (rewards + gamma*qmax)))
        # Small proximal term keeps the new weights near the previous iterate.
        loss = 0.001*cvx.sum_squares(alpha-alpha.value) + objective
        prob = cvx.Problem(cvx.Minimize(loss), constraints)
        print("solving problem")
        # prob.solve() returns the optimal objective value and writes the
        # optimizer into alpha.value.
        result = prob.solve(verbose=True)
        print(result)
        print(alpha.value)

# Inspection loop: roll out the learned greedy policy with rendering.
for ep in range(4):
    obs = env.reset()
    for step in range(200):
        env.render()
        obs, _reward, finished, _info = env.step(maxaction(alpha, obs))
        if finished:
            print("Episode finished after {} timesteps".format(step+1))
            break