rl_custom_policy.py
import numpy as np

from rl.policy import Policy


class CustomPolicy(Policy):
    """Implement a custom Q policy.

    This policy combines the Epsilon Greedy Policy and the Boltzmann Q Policy,
    switching between them based on a parameter zeta that decays by zeta_delta
    after every action selection.
    """

    def __init__(self, zeta_start=1.0, zeta_delta=0.1, eps=.1, tau=1., clip=(-500., 500.)):
        super(CustomPolicy, self).__init__()
        self.zeta = zeta_start
        self.zeta_delta = zeta_delta
        self.eps = eps
        self.tau = tau
        self.clip = clip
    def select_action(self, q_values):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        q_values = q_values.astype('float64')
        nb_actions = q_values.shape[0]

        def get_action_from_boltzmann(q, tau, clip):
            # Sample an action from the Boltzmann (softmax) distribution over the Q-values.
            exp_values = np.exp(np.clip(q / tau, clip[0], clip[1]))
            probs = exp_values / np.sum(exp_values)
            a = np.random.choice(range(nb_actions), p=probs)
            return a

        if self.zeta < .5:
            # Late phase: always sample from the Boltzmann distribution.
            action = get_action_from_boltzmann(q_values, self.tau, self.clip)
        else:
            # Early phase: epsilon-greedy, with the exploratory move drawn from
            # the Boltzmann distribution instead of uniformly at random.
            if np.random.uniform() < self.eps:
                action = get_action_from_boltzmann(q_values, self.tau, self.clip)
            else:
                action = np.argmax(q_values)

        # Decay zeta towards zero after each selection.
        if self.zeta > 0:
            self.zeta -= self.zeta_delta
        return action
    def get_config(self):
        """Return configurations of CustomPolicy.

        # Returns
            Dict of config
        """
        config = super(CustomPolicy, self).get_config()
        config['zeta'] = self.zeta
        config['zeta_delta'] = self.zeta_delta
        config['eps'] = self.eps
        config['tau'] = self.tau
        config['clip'] = self.clip
        return config

class CustomPolicyBasedOnMaxBoltzmann(Policy):
    """Implement a custom Q policy based on Max Boltzmann.

    This policy combines the Epsilon Greedy Policy and the Boltzmann Q Policy,
    switching between them based on a parameter zeta that is annealed linearly
    from zeta_start to zeta_end over zeta_nb_steps agent steps.
    """

    def __init__(self, zeta_start=1.0, zeta_end=.1, zeta_nb_steps=1000000, eps=.1, tau=1., clip=(-500., 500.)):
        super(CustomPolicyBasedOnMaxBoltzmann, self).__init__()
        self.zeta_start = zeta_start
        self.zeta_end = zeta_end
        self.zeta_nb_steps = zeta_nb_steps
        self.eps = eps
        self.tau = tau
        self.clip = clip
        print("Initializing CustomPolicyBasedOnMaxBoltzmann: \nzeta_start: {}\nzeta_end: {}\nzeta_nb_steps: {}".format(
            self.zeta_start, self.zeta_end, self.zeta_nb_steps))

    def get_zeta(self):
        # Anneal zeta linearly from zeta_start to zeta_end over zeta_nb_steps,
        # based on the step counter of the attached agent.
        a = -((self.zeta_start - self.zeta_end) / float(self.zeta_nb_steps))
        b = self.zeta_start
        return max(self.zeta_end, a * self.agent.step + b)
    def select_action(self, q_values):
        """Return the selected action.

        # Arguments
            q_values (np.ndarray): List of the estimations of Q for each action

        # Returns
            Selected action
        """
        assert q_values.ndim == 1
        q_values = q_values.astype('float64')
        nb_actions = q_values.shape[0]

        def get_action_from_boltzmann(q, tau, clip):
            # Sample an action from the Boltzmann (softmax) distribution over the Q-values.
            exp_values = np.exp(np.clip(q / tau, clip[0], clip[1]))
            probs = exp_values / np.sum(exp_values)
            a = np.random.choice(range(nb_actions), p=probs)
            return a

        def get_action_from_maxboltzmann(q, tau, clip, eps):
            # Max Boltzmann: explore with probability eps by sampling from the
            # Boltzmann distribution, otherwise act greedily.
            if np.random.uniform() < eps:
                a = get_action_from_boltzmann(q, tau, clip)
            else:
                a = np.argmax(q)
            return a

        zeta = self.get_zeta()
        if zeta > self.zeta_end:
            # While zeta is still annealing, use the Max Boltzmann rule.
            action = get_action_from_maxboltzmann(q_values, self.tau, self.clip, self.eps)
        else:
            # Once zeta has fully annealed, fall back to plain epsilon-greedy.
            if np.random.uniform() < self.eps:
                action = np.random.randint(0, nb_actions)
            else:
                action = np.argmax(q_values)
        return action
    def get_config(self):
        """Return configurations of CustomPolicyBasedOnMaxBoltzmann.

        # Returns
            Dict of config
        """
        config = super(CustomPolicyBasedOnMaxBoltzmann, self).get_config()
        config['zeta_start'] = self.zeta_start
        config['zeta_end'] = self.zeta_end
        config['zeta_nb_steps'] = self.zeta_nb_steps
        config['eps'] = self.eps
        config['tau'] = self.tau
        config['clip'] = self.clip
        return config
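

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original policies): exercises both
# classes with a dummy Q-value vector. CustomPolicyBasedOnMaxBoltzmann reads
# self.agent.step inside get_zeta(), which keras-rl normally wires up when the
# policy is attached to an agent; here a stand-in object with a `step`
# attribute is assigned by hand purely for this smoke test.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class _DummyAgent(object):
        """Stand-in for a keras-rl agent; only the step counter is needed."""
        step = 0

    q = np.array([0.1, 0.5, 0.2])

    policy = CustomPolicy(zeta_start=1.0, zeta_delta=0.1)
    print("CustomPolicy action:", policy.select_action(q))

    max_boltzmann = CustomPolicyBasedOnMaxBoltzmann(zeta_nb_steps=1000)
    max_boltzmann.agent = _DummyAgent()  # normally set by the agent itself
    print("CustomPolicyBasedOnMaxBoltzmann action:",
          max_boltzmann.select_action(q))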