# mountaincar.py
import numpy as np


class MountainCar:
    """Mountain-car environment with a multi-resolution RBF feature map
    for linear function approximation."""

    def __init__(self, rbf_eps):
        # Physical constants and Euler integration step.
        self.g = 9.8   # gravity
        self.k = 0.3   # friction coefficient
        self.dt = 0.1  # time step
        self.m = 0.2   # car mass
        # State: position x and velocity v.
        self.x = -0.5
        self.v = 0.0
        # Three discrete actions, mapped to applied forces {-2, 0, 2}.
        self.n_actions = 3
        self.action_list = [-2, 0, 2]
        # Valid state ranges; the goal is the right boundary x = 0.5.
        self.x_space = [-1.2, 0.5]
        self.v_space = [-1.5, 1.5]
        # Multi-resolution grid of RBF centers over (x, v): n_r x n_r centers
        # per resolution, 2^2 + 4^2 + 8^2 + 16^2 + 32^2 = 1364 centers total.
        nrbf = [2, 4, 8, 16, 32]
        rbf_mean_x = []
        rbf_mean_v = []
        for n_r in nrbf:
            xm = np.linspace(self.x_space[0], self.x_space[1], num=n_r)
            vm = np.linspace(self.v_space[0], self.v_space[1], num=n_r)
            for k1 in range(n_r):
                for k2 in range(n_r):
                    rbf_mean_x.append(xm[k1])
                    rbf_mean_v.append(vm[k2])
        self.rbf_mean_x = np.array(rbf_mean_x)
        self.rbf_mean_v = np.array(rbf_mean_v)
        self.rbf_eps = rbf_eps
        # Per-action feature block: 1364 RBF features plus one bias term.
        self.number_of_features = 1365

    def randomstate(self):
        # Sample a state uniformly from the valid position/velocity ranges.
        self.x = np.random.uniform(self.x_space[0], self.x_space[1])
        self.v = np.random.uniform(self.v_space[0], self.v_space[1])
        return self.x, self.v

    def RBF(self, x, v):
        # Gaussian radial basis features centered on the (x, v) grid.
        return np.multiply(np.exp(-self.rbf_eps * ((x - self.rbf_mean_x) ** 2)),
                           np.exp(-self.rbf_eps * ((v - self.rbf_mean_v) ** 2)))

    def phi(self, x, v, a):
        # State-action features: bias + RBF vector placed in the block that
        # belongs to action a; the other two action blocks stay zero.
        if a == 0:
            return np.concatenate([[1], self.RBF(x, v), np.zeros(2 * self.number_of_features)], axis=0)
        elif a == 1:
            return np.concatenate(
                [np.zeros(self.number_of_features), [1], self.RBF(x, v), np.zeros(self.number_of_features)], axis=0)
        else:
            return np.concatenate([np.zeros(2 * self.number_of_features), [1], self.RBF(x, v)], axis=0)

    def step(self, action):
        # Euler-integrate the dynamics: gravity along the slope cos(3x),
        # applied force, and friction.
        action_t = self.action_list[action]
        v2 = self.v + (-self.g * self.m * np.cos(3 * self.x) + (action_t / self.m) - (self.k * self.v)) * self.dt
        # Clip velocity to its valid range.
        if v2 < self.v_space[0]:
            v2 = self.v_space[0]
        if v2 > self.v_space[1]:
            v2 = self.v_space[1]
        x2 = self.x + (v2 * self.dt)
        # Hitting the left wall stops the car.
        if x2 < self.x_space[0]:
            x2 = self.x_space[0]
            v2 = 0
        self.x = x2
        self.v = v2
        # Reaching the right boundary is the goal; each other step costs 0.01.
        if x2 >= self.x_space[1]:
            reward = 1.0
            done = True
        else:
            reward = -0.01
            done = False
        return x2, v2, reward, done

    def phi_f(self, D, w):
        # Features for a transition D = {x1, v1, a, x2, v2}: phi1 for the
        # action actually taken, phi2 for the greedy action under weights w.
        x = D['x1']
        v = D['v1']
        a = D['a']
        phi1 = self.phi(x, v, a)
        x2 = D['x2']
        v2 = D['v2']
        a2 = self.linear_policy(w, x2, v2)
        phi2 = self.phi(x2, v2, a2)
        return np.array(phi1), np.array(phi2)

    def linear_policy(self, w, x, v):
        # Greedy action: argmax over the linear Q-values w . phi(x, v, a).
        return np.argmax([np.dot(w, self.phi(x, v, a)) for a in range(self.n_actions)])
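

# Minimal usage sketch, not part of the original file: roll out one episode
# with a random policy and sanity-check the feature dimensions. The values
# rbf_eps=5.0 and the 500-step cap are illustrative assumptions.
if __name__ == "__main__":
    env = MountainCar(rbf_eps=5.0)
    x, v = env.randomstate()
    done = False
    total_reward = 0.0
    steps = 0
    while not done and steps < 500:
        a = np.random.randint(env.n_actions)  # random action index in {0, 1, 2}
        x, v, total, done_flag = env.step(a)[0], env.step(a)[1], env.step(a)[2], env.step(a)[3]
        x, v, reward, done = env.step(a)
        total_reward += reward
        steps += 1
    print(f"episode finished: steps={steps}, return={total_reward:.2f}, reached_goal={done}")
    # phi stacks one (bias + 1364 RBF) block per action: 3 * 1365 = 4095 entries.
    assert env.phi(x, v, 0).shape == (3 * env.number_of_features,)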