fix uneven batch nums across ranks
bastiscode committed Jun 26, 2024
1 parent 82c11fa commit c633811
Showing 1 changed file with 7 additions and 0 deletions.
python/text_utils/api/trainer.py: 7 additions & 0 deletions
@@ -1036,6 +1036,7 @@ def _train_one_epoch(self):
     self.info.device,
     output_op="sum"
 )
+min_num_batches = torch.zeros(1, dtype=torch.long, device=self.info.device)

 metrics = []
 for name, cfg in self.cfg["train"].get("metrics", {}).items():
@@ -1075,6 +1076,10 @@ def _train_one_epoch(self):
     batches.append(batch)

 end_batch = time.perf_counter()
+min_num_batches[0] = len(batches)
+dist.all_reduce(min_num_batches, op=dist.ReduceOp.MIN)
+batches = batches[:min_num_batches.item()]
+min_num_batches[0] = 0

 if len(batches) == 0:
     self.logger.info(
@@ -1123,6 +1128,8 @@ def _train_one_epoch(self):
 if first_outputs is None:
     first_outputs = outputs.detach()

+dist.barrier()
+
 if self.clip_gradient_norm is not None:
     self.grad_scaler.unscale_(self.optimizer)
     if isinstance(self.model, FSDP):
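For context, here is a minimal, self-contained sketch of the pattern this commit applies, written against torch.distributed directly rather than the trainer's own helpers. The idea: when ranks end up with different numbers of batches, a rank that runs out early skips the collective operations (gradient all-reduce, barrier) that the other ranks are still waiting on, and training hangs. All-reducing the local batch count with ReduceOp.MIN and truncating every rank's batch list to that count keeps the ranks in lockstep. The function names (run_worker, train_step), the gloo backend, and the TCP rendezvous address below are illustrative assumptions, not taken from the repository.

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def train_step(rank: int) -> None:
    # Each rank pretends to have collected a different number of batches,
    # mimicking the uneven split the commit guards against.
    batches = [torch.randn(4, 8) for _ in range(2 + rank)]

    # All-reduce the local count with MIN so every rank keeps the same
    # number of batches; otherwise the rank with fewer batches would skip
    # a collective (e.g. the gradient all-reduce) the others still expect.
    min_num_batches = torch.zeros(1, dtype=torch.long)
    min_num_batches[0] = len(batches)
    dist.all_reduce(min_num_batches, op=dist.ReduceOp.MIN)
    batches = batches[:min_num_batches.item()]

    # ... forward/backward over `batches` would go here ...

    # Barrier so all ranks reach the same point before the next step.
    dist.barrier()
    print(f"rank {rank}: kept {len(batches)} batches")

def run_worker(rank: int, world_size: int) -> None:
    # gloo backend so the sketch runs on CPU-only machines (hypothetical setup).
    dist.init_process_group(
        "gloo",
        init_method="tcp://127.0.0.1:29500",
        rank=rank,
        world_size=world_size,
    )
    train_step(rank)
    dist.destroy_process_group()

if __name__ == "__main__":
    world_size = 2
    mp.spawn(run_worker, args=(world_size,), nprocs=world_size)

Resetting min_num_batches back to zero afterwards, as the diff does, lets the same preallocated tensor be reused on the next step without allocating a fresh one.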
