Skip to content

Commit

Permalink
feat(runners): auto resume
Browse files: browse the repository at this point in the history
  • Loading branch information
LutingWang committed Jan 13, 2024
1 parent f333b60 commit 5fb2ade
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
2 changes: 2 additions & 0 deletions todd/runners/base.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -40,10 +40,12 @@ def __init__(
name: str,
*args,
load_from: str | None = None,
auto_resume: bool = False,
**kwargs,
) -> None:
self._name = name
self._load_from = load_from
self._auto_resume = auto_resume

self._iter = 0
self._build(*args, **kwargs)
Expand Down
9 changes: 8 additions & 1 deletion todd/runners/callbacks/checkpoint.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -40,9 +40,16 @@ def init(self) -> None:
self._latest_checkpoint_dir = self._checkpoint_dir / 'latest'

self._checkpoint_dir.mkdir(parents=True, exist_ok=True)
if self._runner.load_from is not None:

if self._runner._auto_resume and self._latest_checkpoint_dir.exists():
load_from = self._latest_checkpoint_dir
elif self._runner.load_from is not None:
load_from = pathlib.Path(self._runner.load_from)
assert load_from.exists()
else:
load_from = None

if load_from is not None:
if get_rank() == 0:
self._runner.logger.info("Loading from %s", load_from)
state_dict = {
Expand Down

0 comments on commit 5fb2ade

Please sign in to comment.