eligibility_traces.py
"""Implementation of different methods using eligibility traces with function approximation"""
import argparse
import numpy as np
import plot as plt
import random
from env import Action, Easy21, State
from policy import EpsilonGreedyApproximationPolicy
from tqdm import tqdm

# For reproducibility
random.seed(0)
np.random.seed(0)


def encode(s: State, a: Action) -> np.ndarray:
"""
Encodes the given state-action pair using coarse coding as specified in the Easy21 assignment:
    A binary feature vector rho(s, a) with 3 * 6 * 2 = 36 features. Each binary feature
has a value of 1 iff (s, a) lies within the cuboid of state-space corresponding to
that feature, and the action corresponding to that feature. The cuboids have the
following overlapping intervals:
- dealer(s) = {[1, 4], [4, 7], [7, 10]}
- player(s) = {[1, 6], [4, 9], [7, 12], [10, 15], [13, 18], [16, 21]}
- a = {hit, stick}
:param State s: The state to encode
:param Action a: The action to encode
:return: A binary feature vector representing the encoded state-action pair
:rtype: np.ndarray
"""
    # `range` is end-exclusive, so we add 1 to the upper bounds to capture the intervals' inclusive ends
dealer = [range(1, 5), range(4, 8), range(7, 11)]
player = [range(1, 7), range(4, 10), range(7, 13), range(10, 16), range(13, 19), range(16, 22)]
encoded = np.zeros((3, 6, 2))
for i, d in enumerate(dealer):
for j, p in enumerate(player):
for k, action in enumerate([Action.hit, Action.stick]):
if s.dealer_first_card in d and s.player_sum in p and a == action:
encoded[i, j, k] = 1
return encoded.flatten()
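

# A minimal, hypothetical sanity check for the coarse coding above (not part of the original
# assignment code). With dealer_first_card=4 and player_sum=7, the dealer value falls in the two
# overlapping cuboids [1, 4] and [4, 7] and the player value in [4, 9] and [7, 12], so exactly
# 2 * 2 * 1 = 4 of the 36 features should be active for a single action. This assumes State can
# be constructed positionally as State(dealer_first_card, player_sum), as in the value-function
# loops below.
def _check_encoding() -> None:
    x = encode(State(4, 7), Action.hit)
    assert x.shape == (36,)
    assert x.sum() == 4

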
class SemiGradientTDLambda:
"""Semi-gradient TD(lambda) with epsilon-soft policy"""
def __init__(self):
self._env = Easy21(seed=24)
def learn(self, epochs=200, alpha=0.01, gamma=0.9, l=1.0, verbose=False, **kwargs) -> np.ndarray:
"""
Learns the optimal value function.
        :param int epochs: The number of epochs to run when learning the value function
:param float alpha: The learning rate
:param float gamma: The discount factor
:param float l: The lambda value for the eligibility trace update
:param bool verbose: Whether to use verbose mode or not
:return: The optimal value function
:rtype: np.ndarray
"""
w = np.random.rand(36)
approximator = lambda s: [np.dot(w, encode(s, a)) for a in [Action.hit, Action.stick]]
# Constant exploration as in the Easy21 assignment
pi = EpsilonGreedyApproximationPolicy(epsilon=0.05, approximator=approximator, seed=24)
for _ in tqdm(range(epochs), disable=not verbose):
s = self._env.reset()
done = False
# The eligibility trace vector
z = np.zeros_like(w)
while not done:
a = pi[s]
s_prime, r, done = self._env.step(a)
# Compute the TD target
if done:
td_target = r
else:
td_target = r + gamma * np.max(approximator(s_prime))
x = encode(s, a)
# Update the trace
z = l * gamma * z + x
# SGD update
w += alpha * (td_target - np.dot(w, x)) * z
s = s_prime
        # Compute the optimal value function, which is simply the value of the best action in each state
values = np.zeros(self._env.state_space)
for d in range(self._env.state_space[0]):
for p in range(self._env.state_space[1]):
values[d, p] = np.max(approximator(State(d, p)))
return values
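

# Minimal sketch of the accumulating-trace update used in the training loops, factored out here
# purely for illustration (a hypothetical helper, not referenced by the classes). The names
# mirror the loop variables: `w` is the weight vector, `z` the eligibility trace, `x` the
# encoded state-action features, and `l` the lambda parameter.
def _trace_update(w, z, x, td_target, alpha, gamma, l):
    z = l * gamma * z + x  # accumulate the eligibility trace
    w = w + alpha * (td_target - np.dot(w, x)) * z  # semi-gradient step toward the TD target
    return w, z

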
class SemiGradientSarsaLambda:
"""On-policy semi-gradient SARSA(lambda) with epsilon-soft policy"""
def __init__(self):
self._env = Easy21(seed=24)
def learn(self, epochs=200, alpha=0.01, gamma=0.9, l=1.0, verbose=False, **kwargs) -> np.ndarray:
"""
Learns the optimal value function.
        :param int epochs: The number of epochs to run when learning the value function
:param float alpha: The learning rate
:param float gamma: The discount factor
:param float l: The lambda value for the eligibility trace update
:param bool verbose: Whether to use verbose mode or not
:return: The optimal value function
:rtype: np.ndarray
"""
w = np.random.rand(36)
approximator = lambda s: [np.dot(w, encode(s, a)) for a in [Action.hit, Action.stick]]
# Constant exploration as in the Easy21 assignment
pi = EpsilonGreedyApproximationPolicy(epsilon=0.05, approximator=approximator, seed=24)
for _ in tqdm(range(epochs), disable=not verbose):
s = self._env.reset()
a = pi[s]
done = False
# The eligibility trace vector
z = np.zeros_like(w)
while not done:
            # Take a step to generate the (S, A, R, S', A') transition
s_prime, r, done = self._env.step(a)
# Compute the TD target
if done:
a_prime = None
td_target = r
else:
a_prime = pi[s_prime]
td_target = r + gamma * np.dot(w, encode(s_prime, a_prime))
x = encode(s, a)
# Update the trace
z = l * gamma * z + x
# SGD update
w += alpha * (td_target - np.dot(w, x)) * z
s = s_prime
a = a_prime
        # Compute the optimal value function, which is simply the value of the best action in each state
values = np.zeros(self._env.state_space)
for d in range(self._env.state_space[0]):
for p in range(self._env.state_space[1]):
values[d, p] = np.max(approximator(State(d, p)))
return values
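

# Illustrative usage sketch (a hypothetical helper, not wired into the CLI below): both learners
# expose the same `learn` interface and return a dealer x player array of state values whose
# shape matches Easy21.state_space.
def _demo_sarsa_lambda() -> np.ndarray:
    return SemiGradientSarsaLambda().learn(epochs=10, l=0.5)

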
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run eligibility trace methods")
parser.add_argument(
"--td-lambda", action="store_true", help="Execute Semi-gradient TD(lambda) with epsilon-soft policy"
)
parser.add_argument(
"--sarsa-lambda", action="store_true", help="Execute Semi-gradient SARSA(lambda) with epsilon-soft policy"
)
parser.add_argument("--epochs", type=int, default=200, help="Epochs to train")
parser.add_argument("--alpha", type=float, default=0.01, help="Learning rate to use")
parser.add_argument("--gamma", type=float, default=0.9, help="Discount factor")
parser.add_argument("--l", type=float, default=1.0, help="Lambda to use")
parser.add_argument("--verbose", action="store_true", help="Run in verbose mode")
args = parser.parse_args()
# The optimal value function obtained
V = None
# The algorithm to run
eligibility = None
# The title of the plot
title = None
if args.td_lambda:
print("Running Semi-gradient TD(lambda)")
eligibility = SemiGradientTDLambda()
title = "semi_grad_td_lambda"
elif args.sarsa_lambda:
print("Running Semi-gradient SARSA(lambda)")
eligibility = SemiGradientSarsaLambda()
title = "semi_grad_sarsa_lambda"
if eligibility is not None:
V = eligibility.learn(epochs=args.epochs, alpha=args.alpha, gamma=args.gamma, l=args.l, verbose=args.verbose)
if V is not None:
# Plot the value function as a surface
        # Remove the states where the dealer's first card or the player's sum is 0 because these are not possible;
        # they were kept in the value function to avoid having to deal with 0-indexing vs 1-indexing
plt.plot_value_function(range(1, Easy21.state_space[0]), range(1, Easy21.state_space[1]), V[1:, 1:], title)
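
# Example invocations (using the flags defined above), e.g.:
#   python eligibility_traces.py --td-lambda --epochs 1000 --verbose
#   python eligibility_traces.py --sarsa-lambda --alpha 0.05 --l 0.5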