From 21009ef87e9b1a4b66ca768213b878a20c3dd86a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stig-Arne=20Gr=C3=B6nroos?= Date: Tue, 19 Dec 2023 21:13:07 +0200 Subject: [PATCH] Parameter --time_budget_s to end gpu assignment early --- tools/config_config.py | 5 +++++ tools/gpu_assignment.py | 19 +++++++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/tools/config_config.py b/tools/config_config.py index a7c14721..c0d23eb5 100644 --- a/tools/config_config.py +++ b/tools/config_config.py @@ -167,6 +167,10 @@ def add_allocate_device_args(parser): parser.add_argument('--n_gpus_per_node', type=int) parser.add_argument('--n_slots_per_gpu', type=int) parser.add_argument('--log_name', type=str) + parser.add_argument( + '--time_budget_s', type=int, + help='time budget for GPU assignment, in seconds', + ) def add_set_transforms_args(parser): @@ -533,6 +537,7 @@ def allocate_devices(opts): lang_to_group_mapping=cc_opts['groups'], lps_ready_to_start=lps_ready_to_start, log_name=opts.log_name, + time_budget_s=opts.time_budget_s, ) for gpu_slot, lp in assignment.items(): diff --git a/tools/gpu_assignment.py b/tools/gpu_assignment.py index 8406627c..b135bb68 100644 --- a/tools/gpu_assignment.py +++ b/tools/gpu_assignment.py @@ -361,9 +361,13 @@ def swap_all_slots_once(self, assignment, current_cost, slot_subset=None): slot_subset = self.gpu_slots if slot_subset is None else slot_subset for i, slot_a in enumerate(tqdm(slot_subset, desc='swap_all_slots_once', leave=False)): current_cost, assignment = self.best_swap_for(slot_a, assignment, current_cost, slot_subset) + if self.deadline and time.time() > self.deadline: + print('Time budget exceeded, finishing early mid-iteration', flush=True) + break return current_cost, assignment - def optimize(self, assignment, current_cost, iterations=10, patience=1): + def optimize(self, assignment, current_cost, iterations=10, patience=1, time_budget_s=None): + self.deadline = time.time() + time_budget_s if time_budget_s else None prev_cost = None stalled = 0 print(f'initial cost: {current_cost}', flush=True) @@ -376,6 +380,9 @@ def optimize(self, assignment, current_cost, iterations=10, patience=1): current_cost, slot_subset ) + if self.deadline and time.time() > self.deadline: + print('Time budget exceeded, finishing early', flush=True) + break print(f'\niteration {i} least_favorite cost: {current_cost}', flush=True) # Random subsets slot_subsets = self.slot_subsets(self.gpu_slots, n=100) @@ -393,6 +400,9 @@ def optimize(self, assignment, current_cost, iterations=10, patience=1): if stalled > patience: print('No improvement, finishing early', flush=True) break + if self.deadline and time.time() > self.deadline: + print('Time budget exceeded, finishing early', flush=True) + break return current_cost, assignment, i def slot_subsets(self, slots, n=100): @@ -435,6 +445,7 @@ def optimize_gpu_assignment( lang_to_group_mapping: Dict[str, str], lps_ready_to_start: Optional[Set[Tuple[str, str]]], log_name: Optional[str] = None, + time_budget_s: Optional[int] = None, ): optimizer = AssignmentOptimizer( n_nodes=n_nodes, @@ -447,7 +458,11 @@ def optimize_gpu_assignment( initial = optimizer.initial_assignment(lang_pairs) initial_cost = optimizer.cost(initial) start = time.time() - best_cost, assignment, iterations = optimizer.optimize(initial, initial_cost) + best_cost, assignment, iterations = optimizer.optimize( + initial, + initial_cost, + time_budget_s=time_budget_s + ) duration_s = time.time() - start print_assignment(assignment, lang_to_group_mapping, ready_to_start=lps_ready_to_start) print(f'assignment cost {best_cost}', flush=True)