(Incorrect) experience-replay MC control algorithm. #62

Open · wants to merge 1 commit into base: master
14 changes: 7 additions & 7 deletions rl/markov_decision_process.py
@@ -44,8 +44,8 @@ class FinitePolicy(Policy[S, A]):
    policy_map: Mapping[S, Optional[FiniteDistribution[A]]]

    def __init__(
-        self,
-        policy_map: Mapping[S, Optional[FiniteDistribution[A]]]
+            self,
+            policy_map: Mapping[S, Optional[FiniteDistribution[A]]]
    ):
        self.policy_map = policy_map

@@ -110,8 +110,8 @@ def apply_policy(self, policy: Policy[S, A]) -> MarkovRewardProcess[S]:

        class RewardProcess(MarkovRewardProcess[S]):
            def transition_reward(
-                self,
-                state: S
+                    self,
+                    state: S
            ) -> Optional[Distribution[Tuple[S, float]]]:
                actions: Optional[Distribution[A]] = policy.act(state)

@@ -146,9 +146,9 @@ def is_terminal(self, state: S) -> bool:

    @abstractmethod
    def step(
-        self,
-        state: S,
-        action: A
+            self,
+            state: S,
+            action: A
    ) -> Optional[Distribution[Tuple[S, float]]]:
        pass

35 changes: 12 additions & 23 deletions rl/monte_carlo.py
@@ -5,11 +5,9 @@

from typing import Iterable, Iterator, Tuple, TypeVar

-from rl.distribution import Distribution
from rl.function_approx import FunctionApprox
import rl.markov_process as mp
-import rl.markov_decision_process as markov_decision_process
-from rl.markov_decision_process import (MarkovDecisionProcess)
+import rl.markov_decision_process as mdp
from rl.returns import returns

S = TypeVar('S')
@@ -38,7 +36,8 @@ def evaluate_mrp(
    function after each episode.

    '''
-    episodes = (returns(trace, γ, tolerance) for trace in traces)
+    episodes: Iterable[Iterable[mp.ReturnStep[S]]] =\
+        (returns(trace, γ, tolerance) for trace in traces)

    return approx_0.iterate_updates(
        ((step.state, step.return_) for step in episode)
@@ -47,11 +46,9 @@ def evaluate_mrp(


def evaluate_mdp(
-        mdp: MarkovDecisionProcess[S, A],
-        states: Distribution[S],
+        traces: Iterable[Iterable[mdp.TransitionStep[S, A]]],
        approx_0: FunctionApprox[Tuple[S, A]],
        γ: float,
-        ϵ: float,
        tolerance: float = 1e-6
) -> Iterator[FunctionApprox[Tuple[S, A]]]:
    '''Evaluate an MRP using the monte carlo method, simulating episodes
@@ -61,27 +58,19 @@ def evaluate_mdp(
    function for the MRP after one additional epsiode.

    Arguments:
-      mrp -- the Markov Reward Process to evaluate
-      states -- distribution of states to start episodes from
+      traces -- an iterator of simulation traces from an MDP
      approx_0 -- initial approximation of value function
      γ -- discount rate (0 < γ ≤ 1)
-      ϵ -- the fraction of the actions where we explore rather
-        than following the optimal policy
      tolerance -- a small value—we stop iterating once γᵏ ≤ tolerance

    Returns an iterator with updates to the approximated Q function
    after each episode.

    '''
-    q = approx_0
-    p = markov_decision_process.policy_from_q(q, mdp)
-
-    while True:
-        trace: Iterable[markov_decision_process.TransitionStep[S, A]] =\
-            mdp.simulate_actions(states, p)
-        q = q.update(
-            ((step.state, step.action), step.return_)
-            for step in returns(trace, γ, tolerance)
-        )
-        p = markov_decision_process.policy_from_q(q, mdp, ϵ)
-        yield q
+    episodes: Iterable[Iterable[mdp.ReturnStep[S, A]]] =\
+        (returns(trace, γ, tolerance) for trace in traces)
+
+    return approx_0.iterate_updates(
+        (((step.state, step.action), step.return_) for step in episode)
+        for episode in episodes
+    )
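For orientation, the rewritten evaluate_mdp no longer runs its own ε-greedy control loop: the caller generates the experience (for example with the action_traces helper under a fixed behaviour policy, as the test below does) and the function only fits Q to the sampled returns. Below is a minimal sketch of driving the new signature; the MDP and helpers are taken from this PR's rl/test_monte_carlo.py, while the variable names (flip_flop, behaviour, q_20) are only for the sketch and not part of the diff.

# Sketch only (not part of the diff): drive the rewritten evaluate_mdp with
# externally generated traces, mirroring rl/test_monte_carlo.py.
import itertools
from typing import Iterable, Tuple

from rl.distribution import Categorical, Choose
from rl.function_approx import Tabular
import rl.markov_decision_process as mdp
import rl.monte_carlo as mc

# The two-state flip-flop MDP built in this PR's test file.
flip_flop = mdp.FiniteMarkovDecisionProcess({
    True: {
        True: Categorical({(True, 1.0): 0.7, (False, 2.0): 0.3}),
        False: Categorical({(True, 1.0): 0.3, (False, 2.0): 0.7}),
    },
    False: {
        True: Categorical({(False, 1.0): 0.7, (True, 2.0): 0.3}),
        False: Categorical({(False, 1.0): 0.3, (True, 2.0): 0.7}),
    }
})

# Fixed behaviour policy: choose uniformly among the actions in each state.
behaviour: mdp.FinitePolicy[bool, bool] = mdp.FinitePolicy({
    s: Choose(flip_flop.actions(s)) for s in flip_flop.states()
})

# The replayed experience: episodes sampled under the behaviour policy,
# each starting from a uniformly chosen state.
traces: Iterable[Iterable[mdp.TransitionStep[bool, bool]]] = \
    flip_flop.action_traces(Choose(flip_flop.states()), behaviour)

q_0: Tabular[Tuple[bool, bool]] = Tabular(
    {(s, a): 0.0 for s in flip_flop.states() for a in flip_flop.actions(s)},
    count_to_weight_func=lambda _: 0.1
)

# evaluate_mdp now only fits Q to the sampled returns; each yielded
# approximation is Q after one more episode.
qs = mc.evaluate_mdp(traces, q_0, γ=0.99)
q_20 = next(itertools.islice(qs, 19, None))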
3 changes: 2 additions & 1 deletion rl/td.py
@@ -36,7 +36,8 @@ def step(v, transition):
            transition.reward + γ * v(transition.next_state))])

    return itertools.accumulate(transitions, step, initial=approx_0)
-
+
+
A = TypeVar('A')


74 changes: 58 additions & 16 deletions rl/test_monte_carlo.py
@@ -1,29 +1,33 @@
import unittest

+import itertools
+from typing import cast, Iterable, Tuple

from rl.distribution import Categorical, Choose
from rl.function_approx import Tabular
-from rl.markov_process import FiniteMarkovRewardProcess
+import rl.iterate as iterate
+import rl.markov_decision_process as mdp
import rl.monte_carlo as mc


-class FlipFlop(FiniteMarkovRewardProcess[bool]):
-    '''A version of FlipFlop implemented with the FiniteMarkovProcess
-    machinery.
-
-    '''
-
-    def __init__(self, p: float):
-        transition_reward_map = {
-            b: Categorical({(not b, 2.0): p, (b, 1.0): 1 - p})
-            for b in (True, False)
-        }
-        super().__init__(transition_reward_map)
-
-
class TestEvaluate(unittest.TestCase):
    def setUp(self):
-        self.finite_flip_flop = FlipFlop(0.7)
+        self.finite_mdp = mdp.FiniteMarkovDecisionProcess({
+            True: {
+                True: Categorical({(True, 1.0): 0.7, (False, 2.0): 0.3}),
+                False: Categorical({(True, 1.0): 0.3, (False, 2.0): 0.7}),
+            },
+            False: {
+                True: Categorical({(False, 1.0): 0.7, (True, 2.0): 0.3}),
+                False: Categorical({(False, 1.0): 0.3, (True, 2.0): 0.7}),
+            }
+        })
+
+        optimal = mdp.FinitePolicy({
+            True: Choose({False}),
+            False: Choose({False})
+        })
+        self.finite_flip_flop = self.finite_mdp.apply_finite_policy(optimal)

    def test_evaluate_finite_mrp(self):
        start = Tabular({s: 0.0 for s in self.finite_flip_flop.states()})
@@ -40,3 +44,41 @@ def test_evaluate_finite_mrp(self):
            # Intentionally loose bound—otherwise test is too slow.
            # Takes >1s on my machine otherwise.
            self.assertLess(abs(v(s) - 170), 1.0)
+
+    def test_evaluate_finite_mdp(self) -> None:
+        q_0: Tabular[Tuple[bool, bool]] = Tabular(
+            {(s, a): 0.0
+             for s in self.finite_mdp.states()
+             for a in self.finite_mdp.actions(s)},
+            count_to_weight_func=lambda _: 0.1
+        )
+
+        uniform_policy: mdp.FinitePolicy[bool, bool] =\
+            mdp.FinitePolicy({
+                s: Choose(self.finite_mdp.actions(s))
+                for s in self.finite_mdp.states()
+            })
+
+        transitions: Iterable[Iterable[mdp.TransitionStep[bool, bool]]] =\
+            self.finite_mdp.action_traces(
+                Choose(self.finite_mdp.states()),
+                uniform_policy
+            )
+
+        qs = mc.evaluate_mdp(
+            transitions,
+            q_0,
+            γ=0.99
+        )
+
+        q = iterate.last(itertools.islice(qs, 20))
+
+        if q is not None:
+            q = cast(Tabular[Tuple[bool, bool]], q)
+            self.assertEqual(len(q.values_map), 4)
+
+            for s in [True, False]:
+                self.assertLess(abs(q((s, False)) - 170.0), 2)
+                self.assertGreater(q((s, False)), q((s, True)))
+        else:
+            assert False
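Note that after this change nothing inside evaluate_mdp improves the policy, so the fitted Q reflects whatever behaviour policy produced the traces (the uniform one in this test). If a control-style result is wanted, it has to be recovered outside the function. A sketch, continuing the example after rl/monte_carlo.py above and assuming policy_from_q keeps the two-argument form used in the code this PR deletes (presumably greedy when ϵ is omitted):

# Sketch: derive a policy from the fitted Q outside evaluate_mdp, using the
# policy_from_q helper that the removed ε-greedy loop called.
# Assumes q_20 and flip_flop from the earlier sketch.
import rl.markov_decision_process as mdp

greedy_policy = mdp.policy_from_q(q_20, flip_flop)
print(greedy_policy)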