# mctsking.py
"""
Monte Carlo Tree Search.
Kingdomino player strategy in the style of AlphaGo.
"""
"""
Monte Carlo tree search first trains, and then plays.
> Save the trained state somewhere.
Recall that Thompson sampling worked better on the CSE 312 homework (a selection variant is sketched just below).
"""
"""
Theoretical calculations
Game depth d: 48 cards / 4 players = 12 turns.
Game breadth b: max legal moves possible: 8 placements per 2x2 segment * (4*4-1) 2x2 segments = 8*15 = 120.
b^d possible move sequences at most (overestimate): 120^12 ~= 8.916e24 (checked numerically below).
Pretty large.
Also, claiming is itself a strategic choice, with up to 4 options per turn.
Two principles can reduce effective search space:
> Depth: Positional evaluation, truncate and replace subtree with value approximation function.
> Breadth: Sample from Policy p(a|s) distribution over legal moves a in state s.
- Monte Carlo rollouts: search to max depth w/o branching, sample long sequences of actions for all players.
* Effective to average over these rollouts, seen in backgammon and Scrabble.
- Monte Carlo Tree Search (MCTS) uses rollouts to estimate value. With more simulation, accuracy improves.
- Asymptotic policy convergence to optimal play and evaluation to optimal value function.
- AlphaGo noted prior work had linear feature combinations (shallow) for policy and value functions.
- AlphaGo used a value network and a policy network.
However, would Kingdomino benefit from reduced depth? It is already a small, fixed-depth game.
# TODO Future work, after Monte Carlo rollout/playout, try nonlinear feature combinations
"""
"""
{{I did this with Simple Machine Learning project in high school :) }}
Pure Monte Carlo Game/Tree Search has rounds of 4 steps:
1) Selection - start at the root and select successive children until a leaf is reached.
Root R is the current game state; leaf L is any node with a potential child that has had no simulation (playout) yet.
Clever selection, for example with Thompson sampling, does better than uniform choice.
2) Expansion
Unless L ends the game, create one or more child nodes (valid moves from L) and choose a node C from them.
3) Simulation
Complete 1 random playout from node C.
Simplest: choose uniformly random moves until the game ends.
4) Backpropagation
Use playout result to update information in nodes on path from C to R.
Rounds are repeated while time remains; then the move with the most simulations (largest visit count, i.e. the denominator) is chosen.
Play out k games, record the scores, and play the move leading to the best score. This converges to optimal play (as k -> inf) in board-filling games with random turn order (Hex, Kingdomino).
AlphaZero replaces the simulation step with an evaluation from a neural network.
"""
"""
Per Wikipedia and its cited sources: at each node of the game tree, choose the move maximizing (wi/ni) + c*sqrt(ln(Ni)/ni), where
wi = # wins for the node considered after the ith move
ni = # simulations for the node considered after the ith move
Ni = total # simulations after the ith move run by the parent of the node considered
c = exploration parameter; theoretically sqrt(2), chosen empirically in practice.
The formula is [exploit] + [explore]; a standalone version is sketched below.
Beware of pruning moves that would later have led to a significantly different outcome.
"""
"""
Improvements:
# TODO
Priors
Progressive bias
Rapid Action Value Estimation (<- interesting, for when multiple permutations of a move sequence lead to the same position)
"""
"""
References: Wikipedia (and its cited sources), ai-boson.github.io
"""
import numpy as np
from collections import defaultdict
from kingdomino import *
"""
Tree looks like:
> Place Castle
Loop
- Claim card
- Place card
"""
class MCTSKingdominoNode():
    def __init__(self, state, parent=None, parent_action=None,
                 board:Board=None, claimable=None, d_to_place:Domino=None, turn:str=None):
        assert state, 'Illegal state was reached'
        self.state = state # Board state
        self.parent = parent # Root has no parent
        self.parent_action = parent_action
        self.children = [] # Child nodes, one per expanded legal move
        self._number_of_visits = 0
        self._results = defaultdict(int) # Outcome -> count; keys are +1 (win) and -1 (loss)
        self._results[1] = self._results[-1] = 0 # defaultdict(int) already defaults missing keys to 0
        # Kingdomino has an alternating move type
        self.FOUND = 'found'
        self.CLAIM = 'claim'
        self.PLACE = 'place'
        if not parent: self.turn = self.FOUND
        else: self.turn = turn
        # TODO use Union here, this string thing is messy
        self.d_to_place = d_to_place # Domino claimed earlier, to place this turn; may be passed as parameter
        self.cards_to_claim = claimable # Assume [Domino], 1 <= len <= 4
        self.board = board
        # Computed last: untried_actions() reads turn, board, cards_to_claim, and d_to_place
        self._untried_actions = self.untried_actions() # Legal moves remaining to try
def untried_actions(self):
""" Returns different types depending on self.turn TODO fix this terrible style
Founding: list of coordinates at which to act
        Claiming: the dominoes available to claim (one is picked per action)
Placing: the coordinates at which to place a domino
"""
if self.turn == self.FOUND:
castle_xy = []
for row in range(5):
for col in range(5):
                    castle_xy.append((row, col)) # append a coordinate tuple
self._untried_actions = castle_xy
elif self.turn == self.CLAIM:
            self._untried_actions = list(self.cards_to_claim) # copy, so expand()'s pop() does not mutate the shared list
elif self.turn == self.PLACE:
self._untried_actions = self.board.get_legal_coords(self.d_to_place) # Replace this with forking for either claim or place
else:
print("Error, nonmatching recur code in untried actions") # TODO fail better
return self._untried_actions
def q(self): # TODO more descriptive name? Or is this common
wins = self._results[1]
losses = self._results[-1]
return wins - losses
def n(self):
return self._number_of_visits
def expand(self):
action = self._untried_actions.pop()
next_board = self.board
next_d = self.d_to_place
next_turn = None
next_state = True
if self.turn == self.FOUND:
assert not self.board.has_castle, "Board already has a castle."
next_board = self.board.put_castle(*action)
next_turn = self.PLACE # Assume defined turn order Found -> {Place, Claim} -> Place
elif self.turn == self.CLAIM:
next_d = action
next_turn = self.PLACE
elif self.turn == self.PLACE:
next_board = self.board.put_domino(self.d_to_place, *action)
# TODO need to claim here
next_turn = self.CLAIM
else:
            assert False, 'Failed to warn: expand() received nonmatched turn code.'
next_state = False
# next_state = self.state.move(action)
        child_node = MCTSKingdominoNode(
            next_state, parent=self, parent_action=action, board=next_board,
            claimable=self.cards_to_claim, d_to_place=next_d, turn=next_turn # TODO after a PLACE turn, claimable should be the next round's drawn cards
        )
self.children.append(child_node)
return child_node
def is_terminal_node(self):
return self.state.is_game_over()
def rollout(self):
""" Fully simulate a game. Return outcome. """
current_rollout_state = self.state
        while not current_rollout_state.is_game_over(): # TODO self.state is currently a bool placeholder, not a game state providing these methods
possible_moves = current_rollout_state.get_legal_actions()
action = self.rollout_policy(possible_moves)
current_rollout_state = current_rollout_state.move(action)
return current_rollout_state.game_result()
def backpropagate(self, result):
self._number_of_visits += 1
self._results[result] += 1
if self.parent:
self.parent.backpropagate(result)
def is_fully_expanded(self):
return len(self._untried_actions) == 0
    def best_child(self, c_param=0.1):
        choices_weights = [(c.q() / c.n()) + c_param * np.sqrt(2 * np.log(self.n()) / c.n())
                           for c in self.children]
        return self.children[np.argmax(choices_weights)]
def rollout_policy(self, possible_moves):
return possible_moves[np.random.randint(len(possible_moves))]
def _tree_policy(self):
""" Selects the node on which to run rollout. """
current_node = self
while not current_node.is_terminal_node():
if not current_node.is_fully_expanded(): # TODO could improve boolean semantics here
return current_node.expand()
else:
current_node = current_node.best_child()
return current_node
def best_action(self):
simulation_n = 100 # TODO adjust this by tuning, make it easier to access
for i in range(simulation_n):
v = self._tree_policy()
reward = v.rollout()
v.backpropagate(reward)
return self.best_child(c_param=0)
def get_legal_actions(self):
pass # TODO from KD
def is_game_over(self):
pass # TODO from KD
def game_result(self):
pass # TODO from KD
def move(self, action):
pass # TODO from KD
def main():
    root = MCTSKingdominoNode(state=initial_state) # TODO construct initial_state from the kingdomino module
selected_node = root.best_action()