Skip to content

Commit

Permalink
fix flaky distributed tests with barriers (#963)
Browse files: browse the repository at this point in the history
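Summary: Adds a `dist.barrier()` call in the `finally` cleanup of several distributed tests, immediately before rank 0 deletes the temp directory, so that rank 0 does not remove it until every rank has reached the cleanup step. Pull Request resolved: #963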

Reviewed By: anshulverma, diego-urgell

Differential Revision: D68466282

fbshipit-source-id: 836ba686237f243823410ae5b60242fedd705a62
  • Loading branch information
JKSenthil authored and facebook-github-bot committed Jan 22, 2025
1 parent edf6f85 commit 9984243
Show file tree
Hide file tree
Showing 6 changed files with 10 additions and 0 deletions.
1 change: 1 addition & 0 deletions tests/framework/callbacks/test_base_checkpointer.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@ def _directory_sync_collective() -> None:
tc.assertTrue("tmp" in dirpath)
tc.assertFalse("foo" in dirpath)
finally:
dist.barrier() # avoid race condition
if get_global_rank() == 0:
shutil.rmtree(temp_dir) # delete temp directory

Expand Down
2 changes: 2 additions & 0 deletions tests/framework/callbacks/test_dcp_saver.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from unittest.mock import MagicMock, patch

import torch
import torch.distributed as dist
from pyre_extensions import none_throws
from torch import nn
from torch.distributed.checkpoint import FileSystemReader, FileSystemWriter
Expand Down Expand Up @@ -309,6 +310,7 @@ def _save_restore_ddp() -> None:
tc, my_new_unit.module.state_dict(), my_unit.module.state_dict()
)
finally:
dist.barrier() # avoid race condition
if get_global_rank() == 0:
shutil.rmtree(temp_dir) # delete temp directory

Expand Down
2 changes: 2 additions & 0 deletions tests/framework/callbacks/test_dcp_saver_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ def _save_restore_fsdp() -> None:
my_new_unit.optimizer.state_dict(), my_unit.optimizer.state_dict()
)
finally:
dist.barrier() # avoid race condition
if get_global_rank() == 0:
shutil.rmtree(temp_dir) # delete temp directory

Expand Down Expand Up @@ -165,5 +166,6 @@ def _save_restore_fsdp_with_id() -> None:
my_new_unit.optimizer.state_dict(), my_unit.optimizer.state_dict()
)
finally:
dist.barrier() # avoid race condition
if get_global_rank() == 0:
shutil.rmtree(temp_dir) # delete temp directory
2 changes: 2 additions & 0 deletions tests/framework/callbacks/test_torchsnapshot_saver.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from unittest.mock import MagicMock, patch

import torch
import torch.distributed as dist
from torch import nn
from torch.utils.data import DataLoader
from torchsnapshot.test_utils import assert_state_dict_eq, check_state_dict_eq
Expand Down Expand Up @@ -322,6 +323,7 @@ def _save_restore_ddp() -> None:
tc, my_new_unit.module.state_dict(), my_unit.module.state_dict()
)
finally:
dist.barrier() # avoid race condition
if get_global_rank() == 0:
shutil.rmtree(temp_dir) # delete temp directory

Expand Down
2 changes: 2 additions & 0 deletions tests/framework/callbacks/test_torchsnapshot_saver_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import unittest

import torch
import torch.distributed as dist
from torchtnt.framework._test_utils import DummyAutoUnit, generate_random_dataloader
from torchtnt.framework.callbacks.torchsnapshot_saver import TorchSnapshotSaver
from torchtnt.framework.train import train
Expand Down Expand Up @@ -68,5 +69,6 @@ def _save_restore_fsdp() -> None:
my_new_unit.optimizer.state_dict(), my_unit.optimizer.state_dict()
)
finally:
dist.barrier() # avoid race condition
if get_global_rank() == 0:
shutil.rmtree(temp_dir) # delete temp directory
1 change: 1 addition & 0 deletions tests/utils/test_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -1324,6 +1324,7 @@ def create_tmp_dir() -> str:
get_checkpoint_dirpaths(temp_dir, metadata_fname=".metadata"), []
)
finally:
dist.barrier() # avoid race condition
if get_global_rank() == 0:
shutil.rmtree(temp_dir) # delete temp directory

Expand Down

0 comments on commit 9984243

Please sign in to comment.