fix uneven batch nums across ranks
bastiscode committed Jun 26, 2024
1 parent 82c11fa commit c633811
Showing 1 changed file with 7 additions and 0 deletions.
python/text_utils/api/trainer.py: 7 additions & 0 deletions
@@ -1036,6 +1036,7 @@ def _train_one_epoch(self):
     self.info.device,
     output_op="sum"
 )
+min_num_batches = torch.zeros(1, dtype=torch.long, device=self.info.device)

 metrics = []
 for name, cfg in self.cfg["train"].get("metrics", {}).items():
@@ -1075,6 +1076,10 @@ def _train_one_epoch(self):
     batches.append(batch)

 end_batch = time.perf_counter()
+min_num_batches[0] = len(batches)
+dist.all_reduce(min_num_batches, op=dist.ReduceOp.MIN)
+batches = batches[:min_num_batches.item()]
+min_num_batches[0] = 0

 if len(batches) == 0:
     self.logger.info(
@@ -1123,6 +1128,8 @@ def _train_one_epoch(self):
 if first_outputs is None:
     first_outputs = outputs.detach()

+dist.barrier()
+
 if self.clip_gradient_norm is not None:
     self.grad_scaler.unscale_(self.optimizer)
     if isinstance(self.model, FSDP):
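For context, here is a minimal, self-contained sketch of the pattern this commit applies, written against torch.distributed directly rather than the trainer's own helpers. The idea: when ranks end up with different numbers of batches, a rank that runs out early skips the collective operations (gradient all-reduce, barrier) that the other ranks are still waiting on, and training hangs. All-reducing the local batch count with ReduceOp.MIN and truncating every rank's batch list to that count keeps the ranks in lockstep. The function names (run_worker, train_step), the gloo backend, and the TCP rendezvous address below are illustrative assumptions, not taken from the repository.

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def train_step(rank: int) -> None:
    # Each rank pretends to have collected a different number of batches,
    # mimicking the uneven split the commit guards against.
    batches = [torch.randn(4, 8) for _ in range(2 + rank)]

    # All-reduce the local count with MIN so every rank keeps the same
    # number of batches; otherwise the rank with fewer batches would skip
    # a collective (e.g. the gradient all-reduce) the others still expect.
    min_num_batches = torch.zeros(1, dtype=torch.long)
    min_num_batches[0] = len(batches)
    dist.all_reduce(min_num_batches, op=dist.ReduceOp.MIN)
    batches = batches[:min_num_batches.item()]

    # ... forward/backward over `batches` would go here ...

    # Barrier so all ranks reach the same point before the next step.
    dist.barrier()
    print(f"rank {rank}: kept {len(batches)} batches")

def run_worker(rank: int, world_size: int) -> None:
    # gloo backend so the sketch runs on CPU-only machines (hypothetical setup).
    dist.init_process_group(
        "gloo",
        init_method="tcp://127.0.0.1:29500",
        rank=rank,
        world_size=world_size,
    )
    train_step(rank)
    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = 2
    mp.spawn(run_worker, args=(world_size,), nprocs=world_size)

Resetting min_num_batches back to zero afterwards, as the diff does, lets the same preallocated tensor be reused on the next step without allocating a fresh one.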
