value_iteration.py
import numpy as np


def BellmanOperator(Q, env, gamma=0.99):
    """Apply the Bellman optimality operator to Q.

    Parameters
    ----------
    Q : array of shape (S, A)
        Action-value function/matrix, with S the state space dimension and
        A the action space dimension.
    env : gym-like env
        Environment with known rewards env.R (indexed as env.R[s, a]) and
        transition probabilities env.P (env.P[s, a] is a distribution over
        next states).
    gamma : float
        Discount factor.

    Returns
    -------
    TQ : array of shape (S, A)
        Result of the Bellman operator applied to the action-value
        function/matrix Q.
    """
    S = env.observation_space.n
    A = env.action_space.n
    TQ = np.zeros((S, A))
    for s in range(S):
        for a in range(A):
            # TQ(s, a) = R(s, a) + gamma * sum_s' P(s' | s, a) * max_a' Q(s', a')
            TQ[s, a] = env.R[s, a] + gamma * env.P[s, a] @ Q.max(axis=1)
    return TQ
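

# Equivalent vectorised form (a sketch, not part of the original file): it
# assumes, as the loops above imply, that env.R is an array of shape (S, A)
# and env.P an array of shape (S, A, S). The name BellmanOperatorVectorized
# is illustrative only.
def BellmanOperatorVectorized(Q, env, gamma=0.99):
    """Same computation as BellmanOperator, done in a single einsum call."""
    V = Q.max(axis=1)  # V(s') = max_a' Q(s', a')
    return env.R + gamma * np.einsum("sap,p->sa", env.P, V)
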

def ValueIteration(env, gamma=0.99, epsilon=1e-6):
    """Value iteration.

    Return the optimal action-value function given the rewards env.R and the
    transition probabilities env.P.

    Parameters
    ----------
    env : gym-like env
        Environment with known rewards and transition probabilities, and
        finite state and action spaces.
    gamma : float
        Discount factor ensuring the convergence of the iterations.
    epsilon : float
        Threshold used to stop the iteration.

    Returns
    -------
    Q : array of shape (S, A)
        Optimal action-value function/matrix, i.e. the fixed point of the
        Bellman operator T, with S the state space dimension and A the
        action space dimension.
    """
    S = env.observation_space.n
    A = env.action_space.n
    Q = np.zeros((S, A))
    while True:
        TQ = BellmanOperator(Q, env, gamma)
        if np.abs(TQ - Q).max() <= epsilon:
            break
        Q = TQ
    return Q
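

# Usage sketch (not part of the original file): a minimal run on a hand-built
# 2-state / 2-action MDP. The env attributes used here (observation_space.n,
# action_space.n, R, P) are the ones the functions above read; the concrete
# numbers and the SimpleNamespace wrapper are illustrative assumptions only.
if __name__ == "__main__":
    from types import SimpleNamespace

    # R[s, a]: immediate reward; P[s, a]: distribution over next states.
    R = np.array([[0.0, 1.0],
                  [2.0, 0.0]])
    P = np.array([[[0.9, 0.1], [0.1, 0.9]],
                  [[0.5, 0.5], [1.0, 0.0]]])
    env = SimpleNamespace(
        observation_space=SimpleNamespace(n=2),
        action_space=SimpleNamespace(n=2),
        R=R,
        P=P,
    )

    Q_star = ValueIteration(env, gamma=0.9, epsilon=1e-8)
    print("Optimal Q:\n", Q_star)
    print("Greedy policy:", Q_star.argmax(axis=1))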