import torch
from alphazeroquixo import train, QuixoDataset, get_data_loader
from utilities import (
    convert_board_to_binary,
    generate_pol_labels,
    generate_val_labels,
    get_all_move,
    possible_moves_for_piece,
    move_to_index,
)
from game import Player, Move
from game_wrapper import GameWrapper as Game
import math
import numpy as np
from tqdm.auto import tqdm
from copy import deepcopy

# Use CUDA if available, then Apple's MPS backend, otherwise fall back to the CPU
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

class GameState:
    def __init__(self, board, player):
        self.board = board
        self.player = player

class MCTSNode:
    def __init__(self, state, move=None, move_index=None, move_prob=None, parent=None):
        # Initialize a node for the Monte Carlo Tree Search (MCTS) algorithm
        self.state = state  # The current state of the game
        self.parent = parent  # The parent node in the MCTS tree
        self.move = move  # The move made to reach this state
        # If the move index is not provided but a move is, calculate the move index
        if move_index is None and move is not None:
            move_index = move_to_index(move)
        self.move_index = move_index  # Index of the move
        # Prior probability of this move (from the policy network); default to 0
        self.move_prob = move_prob if move_prob is not None else 0
        self.children = []  # Child nodes in the MCTS tree
        self.visits = 0  # Number of times this node has been visited during MCTS
        self.value = 0  # Accumulated value of the node, updated during backpropagation
        # Generate the list of untried moves from this state
        self.untried_moves = self.get_possible_moves()
    def get_possible_moves(self):
        # Get all possible moves in the game
        all_moves = get_all_move()
        # Keep only the moves that are legal in the current state
        legal_moves = [
            move
            for move in all_moves
            if self.is_legal_move(move[0][0], move[0][1], move[1])
        ]
        return legal_moves

    def is_legal_move(self, x, y, direction):
        # Determine whether a move is legal in the current state
        player = self.state.player  # The current player
        board = self.state.board  # The game board
        # Get the possible moves for the piece at the specified position
        possible_moves = possible_moves_for_piece(x, y)
        # A piece can be taken only from an empty spot (-1) or the player's own piece
        if board[y][x] == player or board[y][x] == -1:
            # Check that the requested move is among the possible moves
            if ((x, y), direction) in possible_moves:
                return True
        return False
    def select_child(self, C=1.41):
        # Select the best child node using the Upper Confidence Bound (UCB) formula
        best_child = None
        best_ucb = float("-inf")  # Initialize the best UCB as negative infinity
        # Iterate through the children to find the one with the highest UCB
        for child in self.children:
            P = child.move_prob  # Prior probability of the move, predicted by the model
            N = self.visits if self.visits > 0 else 1  # Use at least 1 to avoid log(0)
            n = child.visits if child.visits > 0 else 1  # Use at least 1 to avoid division by 0
            V = child.value / n  # Mean value of the child node
            # UCB: mean value plus a prior-weighted exploration bonus
            ucb = V + C * P * math.sqrt(math.log(N) / n)
            if ucb > best_ucb:
                best_ucb = ucb
                best_child = child
        return best_child  # Return the best child
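    # Worked example of the rule above (illustrative numbers, not from a real
    # game): a child with mean value V = 0.4, prior P = 0.3, parent visits
    # N = 100, and child visits n = 10 scores
    #   0.4 + 1.41 * 0.3 * sqrt(ln(100) / 10) ≈ 0.69,
    # so a strong prior keeps a rarely visited child competitive.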
    def expand(self, policies):
        while len(self.untried_moves) > 0:
            move = self.untried_moves.pop()
            # Double-check that the move is legal and, if so, add a new child
            if self.is_legal_move(move[0][0], move[0][1], move[1]):
                new_state = self.apply_move(move)  # Apply the move to get a new state
                index = move_to_index(move)
                new_child = MCTSNode(
                    state=new_state, move=move, move_prob=policies[index], parent=self
                )
                self.children.append(new_child)  # Add the new child to the children list
    def apply_move(self, move):
        # Apply a move and return the resulting game state
        game = Game(deepcopy(self.state.board), self.state.player)  # New game instance
        game.move(move[0], move[1], self.state.player)  # Apply the move
        # Switch to the other player
        game.current_player_idx += 1
        game.current_player_idx %= 2
        # Read back the new board and player
        board = game.get_board()
        player = game.get_current_player()
        return GameState(board, player)  # Return the new game state
    def update_value(self, value):
        # Update the node's statistics with the given value
        self.visits += 1  # Increment the visit count
        self.value += value  # Accumulate the value

    def update_move_prob(self, move_prob):
        # Update the prior probability of choosing this move
        self.move_prob = move_prob

    def is_leaf(self):
        # Check whether the node is a leaf node (no children)
        return len(self.children) == 0

    def is_terminal(self):
        # Check whether the game is over in this state
        game = Game(deepcopy(self.state.board), self.state.player)
        return game.check_winner() >= 0

    def print(self):
        # Print the node's information
        print(f"Move: {self.move}")
        print(f"Move Index: {self.move_index}")
        print(f"Move Probability: {self.move_prob}")
        print(f"Visits: {self.visits}")
        print(f"Value: {self.value}")
        print(f"Untried Moves: {self.untried_moves}")
        print(f"Children: {self.children}")
        print(f"Is Leaf: {self.is_leaf()}")
        print(f"Is Terminal: {self.is_terminal()}")

class MonteCarloTreeSearch:
    def __init__(self, neural_network):
        # Initialize the Monte Carlo Tree Search with a neural network
        self.neural_network = neural_network  # Network for evaluating game states
        self.neural_network.to(device)
        self.dataset = QuixoDataset()  # Dataset of game states and labels

    def get_model(self):
        # Return the neural network model
        return self.neural_network

    def get_dataset(self):
        # Return the dataset
        return self.dataset
    def train(
        self,
        n_games,
        epochs,
        simulation_per_game,
        epochs_per_game=2,
        max_moves_per_game=100,
        max_moves_for_dataset=10_000,
    ):
        # Train the neural network through self-play: simulate games with MCTS,
        # collect (state, policy, value) samples, and fit the network on them
        games = []  # Game states
        policies = []  # Policy labels
        values = []  # Value labels
        # Simulate games and collect data
        for i in range(n_games):
            print(f"Game {i+1}")
            # Initialize a new game board and set the starting player
            board = np.ones((5, 5), dtype=np.uint8) * -1
            player = 0
            root = MCTSNode(GameState(board, player))  # Root node for MCTS
            # Simulate a game using MCTS and collect the results
            (
                result,
                game_states,
                game_moves,
                game_state_with_player,
            ) = self.simulate_game(root, simulation_per_game, max_moves_per_game)
            games.extend(game_states)  # Add the game states to the dataset
            # Generate policy and value labels for the game states
            policy = generate_pol_labels(game_moves)
            value = generate_val_labels(game_state_with_player, result)
            policies.extend(policy)
            values.extend(value)
            # Add the collected game data to the dataset, keeping at most
            # max_moves_for_dataset samples
            self.dataset.add_data_and_keep_fixed(
                games, values, policies, fixed_num=max_moves_for_dataset
            )
            # Create a DataLoader and fine-tune the network after each game
            batch_size = min(64, len(games))
            train_loader = get_data_loader(self.dataset, batch_size)
            train(self.neural_network, train_loader, epochs=epochs_per_game)
        # Final training pass over the full collected dataset
        train(self.neural_network, train_loader, epochs=epochs)
    def simulate_game(self, node, num_simulations=500, max_moves=100):
        # Simulate a full self-play game starting from a given node
        board = node.state.board
        player = node.state.player
        game = Game(board, player)  # Initialize the game state
        game_states = []  # Game states (binary boards)
        game_moves = []  # Moves made during the game
        game_state_with_player = []  # (state, current player) pairs
        progress_bar = tqdm(total=max_moves, desc="Game")
        # Continue until there is a winner or the move limit is reached
        while game.check_winner() < 0 and len(game_moves) < max_moves:
            # Run multiple MCTS simulations for each move
            for _ in range(num_simulations):
                self.run_mcts_simulation(node)
            # After the simulations, choose the best move based on the MCTS results
            best_node = self.choose_best_node(node)
            best_move = best_node.move
            progress_bar.update(1)
            progress_bar.set_postfix(
                {
                    "Value": best_node.value,
                    "Player": game.get_current_player(),
                    "Move": best_move,
                }
            )
            game_moves.append(best_move)
            game.move(best_move[0], best_move[1], game.get_current_player())
            # Switch to the other player
            game.current_player_idx += 1
            game.current_player_idx %= 2
            # The chosen child becomes the root for the next round of simulations
            node = best_node
            # Convert the board to the binary representation used by the network
            binary_board = convert_board_to_binary(
                game.get_board(), game.get_current_player()
            )
            game_states.append(binary_board)
            game_state_with_player.append((binary_board, game.get_current_player()))
        progress_bar.close()
        # Return the game result plus the states, moves, and state-player pairs
        return game.check_winner(), game_states, game_moves, game_state_with_player
    def make_move(self, node, num_simulations=500):
        # Run MCTS simulations from the given node and return the best move
        for _ in range(num_simulations):
            self.run_mcts_simulation(node)
        # Use a lower exploration constant when actually playing, to favor
        # exploitation of the accumulated statistics
        best_node = self.choose_best_node(node, C=0.71)
        best_move = best_node.move
        return best_move

    def choose_best_node(self, node, C=1.41):
        return node.select_child(C)
    def run_mcts_simulation(self, node):
        path = []
        current_node = node
        # Step 1: Selection - descend the tree following the UCB rule
        while not current_node.is_leaf():
            current_node = current_node.select_child()
            path.append(current_node)
        # Step 2: Expansion - only if the game is not over at this node
        if not current_node.is_terminal():
            policies = self.get_policies(current_node)
            current_node.expand(policies)
            new_child = current_node.select_child()  # Select one of the new children
            path.append(new_child)
            current_node = new_child
        # Step 3: Simulation/Rollout - as in AlphaZero, the network's value head
        # replaces a random or heuristic rollout
        game_outcome = self.rollout(current_node)
        # Step 4: Backpropagation
        self.backpropagate_value(path, game_outcome)
    def get_policies(self, node):
        board = node.state.board
        player = node.state.player
        binary_board = convert_board_to_binary(board, player)
        # Convert the board to a tensor for the neural network
        board_tensor = torch.tensor(binary_board).float().unsqueeze(0).to(device)
        # Predict the policy for the current state without tracking gradients
        with torch.no_grad():
            self.neural_network.eval()
            policy, _ = self.neural_network(board_tensor)
        # The network outputs log-probabilities, so exponentiate them
        return policy.squeeze(0).exp().cpu().numpy()

    def rollout(self, node):
        # Evaluate the position with the network's value head instead of
        # playing the game out to the end
        board = node.state.board
        player = node.state.player
        binary_board = convert_board_to_binary(board, player)
        board_tensor = torch.tensor(binary_board).float().unsqueeze(0).to(device)
        with torch.no_grad():
            self.neural_network.eval()
            _, val = self.neural_network(board_tensor)
        return val.item()

    def backpropagate_value(self, path, val):
        for node in reversed(path):
            node.update_value(val)
            val = -val  # Flip the sign: players alternate along the path
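    # Example of the sign flip above (illustrative numbers): if the rollout
    # returns +0.8 from the leaf's perspective, the leaf is credited +0.8,
    # its parent -0.8, its grandparent +0.8, and so on up the path.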
    def update_children(self, node, policy):
        # Update the move probabilities for the children of a node
        for child in node.children:
            # Update the child's prior based on the policy prediction
            child.update_move_prob(policy[child.move_index])

class AlphaZeroPlayer(Player):
    def __init__(self, mtcs, simulation=500) -> None:
        super().__init__()
        self.mtcs = mtcs
        self.simulation = simulation

    def make_move(self, game: "Game") -> tuple[tuple[int, int], Move]:
        board = game.get_board()
        player = game.get_current_player()
        node = MCTSNode(GameState(board, player))
        best_move = self.mtcs.make_move(node, num_simulations=self.simulation)
        return best_move
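
# A minimal usage sketch, appended for illustration; assumptions are marked.
# `DummyNet` below is a hypothetical stand-in for the project's real
# policy/value network (defined elsewhere): it returns a uniform log-policy
# over all moves and a value of 0 for every position.
if __name__ == "__main__":
    import torch.nn as nn

    class DummyNet(nn.Module):
        # Illustrative stand-in, not the project's network
        def __init__(self, n_moves=len(get_all_move())):
            super().__init__()
            # Assumes move_to_index maps moves into range(len(get_all_move()))
            self.n_moves = n_moves

        def forward(self, x):
            batch = x.shape[0]
            # Uniform log-probabilities, matching the .exp() in get_policies
            log_policy = torch.full((batch, self.n_moves), -math.log(self.n_moves))
            value = torch.zeros((batch, 1))  # Neutral evaluation everywhere
            return log_policy, value

    mcts = MonteCarloTreeSearch(DummyNet())
    # Full self-play training would be, e.g.:
    #   mcts.train(n_games=10, epochs=5, simulation_per_game=100)
    # Here we just pick one move from the empty starting position.
    board = np.ones((5, 5), dtype=np.uint8) * -1
    root = MCTSNode(GameState(board, player=0))
    print(mcts.make_move(root, num_simulations=50))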