q_learning.py
"""
Let's design a simulation of a self-driving cab. The major goal is to demonstrate, in a simplified environment, how you
can use RL techniques to develop an efficient and safe approach for tackling this problem.
The Smartcab's job is to pick up the passenger at one location and drop them off in another. Here are a few things that
we'd love our Smartcab to take care of:
Drop off the passenger to the right location.
Save passenger's time by taking minimum time possible to drop off
Take care of passenger's safety and traffic rules
"""
import os
import gym
import numpy as np
class QLearning:
    # Learnt from:
    # https://www.learndatasci.com/tutorials/reinforcement-q-learning-scratch-python-openai-gym/

    # Hyperparameters
    ALPHA = 0.1
    """α (alpha) is the learning rate (0 < α ≤ 1) - just like in supervised learning settings, α is the extent
    to which our Q-values are updated in every iteration."""
    GAMMA = 0.6
    """
    γ (gamma) is the discount factor (0 ≤ γ ≤ 1) - it determines how much importance we want to give to future
    rewards. A high value for the discount factor (close to 1) captures the long-term effective reward, whereas a
    discount factor of 0 makes our agent consider only the immediate reward, hence making it greedy.
    """
    EPSILON = 0.1
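    """
    ε (epsilon) is the exploration rate for the ε-greedy policy used during training: with probability ε the agent
    picks a random action (explore), otherwise it picks the action with the highest Q-value for the current state
    (exploit). This keeps the agent from locking onto a suboptimal policy too early.
    """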
    ENV = gym.make("Taxi-v3")
    ACTION_SPACE = ENV.action_space.n
    STATE_SPACE = ENV.observation_space.n
    Q_TABLE = np.zeros((STATE_SPACE, ACTION_SPACE))
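    # For Taxi-v3 this works out to a 500 x 6 table (500 discrete states, 6 actions), initialised to zeros so every
    # state-action pair starts out equally (un)attractive.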
    def __init__(self):
        os.environ.setdefault('TERM', 'xterm-color')
    def update_q_table(self, state, action, reward, next_state):
        old_q = self.Q_TABLE[state, action]
        # Get the maximum expected reward achievable from the next state
        next_max = np.max(self.Q_TABLE[next_state])
        # Get the new Q-value by running it through the Q-learning formula
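        # Q(s, a) <- (1 - α) * Q(s, a) + α * (reward + γ * max_a' Q(s', a'))
        # e.g. with ALPHA=0.1, GAMMA=0.6, old_q=0, reward=-1 and next_max=0 this gives 0.9*0 + 0.1*(-1 + 0) = -0.1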
        new_q = ((1 - self.ALPHA) * old_q) + (self.ALPHA * (reward + self.GAMMA * next_max))
        self.Q_TABLE[state, action] = new_q
    def train(self):
        print('Training the agent to play the taxi game')
        for i in range(5000):
            done = False
            state = self.ENV.reset()
            penalties = 0
            while not done:
                if np.random.uniform(0, 1) < self.EPSILON:
                    # Explore: sample a random action
                    action = self.ENV.action_space.sample()
                else:
                    # Exploit: take the row of the state in the Q-table and pick the action with the max value
                    # (argmax returns the index of the best action in that state's row)
                    action = np.argmax(self.Q_TABLE[state])
                next_state, reward, done, info = self.ENV.step(action)
                # Update the Q-table
                self.update_q_table(state, action, reward, next_state)
                if reward == -10:
                    penalties += 1
                state = next_state
            if (i + 1) % 100 == 0:
                print('Episode {} reached'.format(i + 1))
    def play(self):
        print('Now the agent will play 100 episodes using the knowledge gained')
        episodes = 100
        total_epoch, total_penalties = 0, 0
        for i in range(episodes):
            state = self.ENV.reset()
            epoch, penalties = 0, 0
            done = False
            while not done:
                action = np.argmax(self.Q_TABLE[state])
                state, reward, done, info = self.ENV.step(action)
                if reward == -10:
                    penalties += 1
                epoch += 1
            total_epoch += epoch
            total_penalties += penalties
            print(f'Made {epoch} moves to complete episode {i} with {penalties} penalties')
        print(f'Results after {episodes} episodes:')
        print(f'Average timesteps per episode: {total_epoch / episodes}')
        print(f'Average penalties per episode: {total_penalties / episodes}')
if __name__ == '__main__':
    # Train first, then evaluate; Q_TABLE is a class attribute, so the learnt values are kept on the same agent
    agent = QLearning()
    agent.train()
    agent.play()