From a6e222a78b6d1a21c2af15c7a0b7591e0b52cd2e Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Tue, 22 Mar 2022 16:55:08 +0100 Subject: [PATCH 01/43] doc --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index e4db764..ad4215f 100644 --- a/README.md +++ b/README.md @@ -544,9 +544,13 @@ rules = dora.conf.SubmitRules(retry=True) # Should we reschedule failed jobs? # each sheep has 2 attributes: sheep.xp and sheep.job_id. sheeps = dora.grid.run_grid(main, explorer, grid_name='jupy', rules=rules, args=args) args.monitor = True +args.jupyter = True # The jupyter flag will make the grid API use the display API to clear the cell # output and update it regularly. This one will not return until all jobs # are done or failed. +# In the following, `grid_name` should be unique. It will be used +# to determine which experiments were previously scheduled with that grid +# and should potentially be cancelled if no longer needed. dora.grid.run_grid(main, explorer, grid_name='jupy', rules=rules, args=args) # You can retrieve the short names by using `main.get_names()` short_names, ref_name = main.get_names([sheep.xp for sheep in sheeps]) From 4e16e06695927eac7ebea33020ada3b8fcf6e6ad Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Mon, 11 Apr 2022 10:06:40 +0200 Subject: [PATCH 02/43] fixing issue --- CHANGELOG.md | 2 ++ dora/__init__.py | 2 +- dora/lightning.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5f9a5a..ca316fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,8 @@ More reliable passing of arguments of Hydra (before, setting None would actually Allow for empty `mem` constraint in Slurm. +Fixing `callbacks` default value in PL. + ## [0.1.9] - 2022-02-28 Reliable rmtree used to avoid `--clear` being blocked by some locking issues on NFS. 
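The `callbacks` fix recorded in the changelog above comes down to how `dict.pop` handles defaults: the fallback is only used when the key is absent, not when the stored value is `None`. A minimal standalone sketch of the failure mode (illustrative only, not the actual Dora code):

```python
# A caller passes callbacks=None explicitly, e.g. get_trainer(callbacks=None).
kwargs = {"callbacks": None}
callbacks = kwargs.pop("callbacks", [])
# Old behavior: callbacks is None here, so callbacks.append(...) raises
# AttributeError: 'NoneType' object has no attribute 'append'.

kwargs = {"callbacks": None}
callbacks = kwargs.pop("callbacks", None) or []  # the patched form below
callbacks.append("DoraCheckpointSync")  # safe: callbacks is always a list
```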
diff --git a/dora/__init__.py b/dora/__init__.py index 91ca0f2..a75fbe4 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.10a7" +__version__ = "0.1.10a8" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/lightning.py b/dora/lightning.py index f976f4d..b1aa561 100644 --- a/dora/lightning.py +++ b/dora/lightning.py @@ -170,7 +170,7 @@ def get_trainer(*args, auto_resume=True, add_dora_logger=True, no_unfinished_epo plugins += [env, 'ddp'] kwargs['plugins'] = plugins - callbacks = kwargs.pop("callbacks", []) + callbacks = kwargs.pop("callbacks", None) or [] callbacks.append(DoraCheckpointSync()) kwargs['callbacks'] = callbacks From d31aee19811a51d1a06576dcc8cc244d011b7b1d Mon Sep 17 00:00:00 2001 From: Louis Martin Date: Thu, 14 Apr 2022 10:06:07 +0100 Subject: [PATCH 03/43] Add an example of grid with hydra config like "model.hidden_dim=512" (#24) --- examples/mnist_hydra/conf/config.yaml | 2 ++ examples/mnist_hydra/grids/test.py | 3 +++ 2 files changed, 5 insertions(+) diff --git a/examples/mnist_hydra/conf/config.yaml b/examples/mnist_hydra/conf/config.yaml index 50e0fab..2724657 100644 --- a/examples/mnist_hydra/conf/config.yaml +++ b/examples/mnist_hydra/conf/config.yaml @@ -11,6 +11,8 @@ dry_run: false seed: 1234 log_interval: 100 save_model: false +model: + hidden_dim: 512 dora: exclude: ["save_model", "no_cuda", "dry_run", "log_interval", "data_root"] diff --git a/examples/mnist_hydra/grids/test.py b/examples/mnist_hydra/grids/test.py index 40ca3b0..d000833 100644 --- a/examples/mnist_hydra/grids/test.py +++ b/examples/mnist_hydra/grids/test.py @@ -22,6 +22,9 @@ def explorer(launcher): for bs in [32, 64, 128]: launcher(batch_size=bs) + for hidden_dim in [512, 1024]: + launcher.bind_([f"model.hidden_dim={hidden_dim}"]) + launcher.bind_(gamma=0.6) launcher.slurm_(mem_per_gpu=20) launcher() From a0a65582ca416d022e0251c711510fc8f99280e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?= Date: Thu, 14 Apr 2022 11:10:30 +0200 Subject: [PATCH 04/43] Update test.py --- examples/mnist_hydra/grids/test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/mnist_hydra/grids/test.py b/examples/mnist_hydra/grids/test.py index d000833..1ead022 100644 --- a/examples/mnist_hydra/grids/test.py +++ b/examples/mnist_hydra/grids/test.py @@ -23,7 +23,14 @@ def explorer(launcher): launcher(batch_size=bs) for hidden_dim in [512, 1024]: - launcher.bind_([f"model.hidden_dim={hidden_dim}"]) + # here we get a sub launcher with `bind()`. All XPs scheduled with it + # will retain the bound params but it won't impact the parent launcher. + sub = launcher.bind({"model.hidden_dim": hidden_dim}) + # Or, the two are equivalent + # sub = launcher.bind([f"model.hidden_dim={hidden_dim}"]) + sub() + sub(gamma=0.6) + launcher.bind_(gamma=0.6) launcher.slurm_(mem_per_gpu=20) From dbfc738241dafe67d9bba31c081c607661902ebb Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Fri, 15 Apr 2022 17:55:15 +0200 Subject: [PATCH 05/43] allow extra config keys in hydra, allow to customize grid package --- CHANGELOG.md | 4 ++++ dora/conf.py | 8 ++++++-- dora/git_save.py | 2 +- dora/grid.py | 23 ++++++++++++++--------- dora/hydra.py | 10 +++++++++- 5 files changed, 34 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca316fd..53254b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,10 @@ Allow for empty `mem` constraint in Slurm. 
Fixing `callbacks` default value in PL. +Extra "keys" in Hydra config files are now allowed (i.e. overrides with `+something=12`). + +The package where Dora looks for grids can be customized, in Hydra with `dora.grid_package` in the base config or passing `grid_package='...'` to `argparse_main`. + ## [0.1.9] - 2022-02-28 Reliable rmtree used to avoid `--clear` being blocked by some locking issues on NFS. diff --git a/dora/conf.py b/dora/conf.py index c3ebe42..f197ad4 100644 --- a/dora/conf.py +++ b/dora/conf.py @@ -141,11 +141,14 @@ class DoraConfig: shared (Path or None): if provided, the path to a central repository of XPs. For the moment, this only supports sharing hyper-params, logs etc. will stay in the per user folder. + grid_package (str or None): if provided, package to look for grids. Default + to the package with the `train.py` module followed by `.grids`. """ dir: Path = Path("./outputs") # where everything will be stored exclude: tp.List[str] = field(default_factory=list) git_save: bool = False shared: tp.Optional[Path] = None # Optional path for shared XPs. + grid_package: tp.Optional[str] = None # Those are internal config values and are unlikely to be changed history: str = "history.json" # where metrics will be stored @@ -154,8 +157,9 @@ class DoraConfig: shep: ShepConfig = field(default_factory=ShepConfig) rendezvous_file: str = "rendezvous.txt" use_rendezvous: bool = False - grids: str = "grids" - codes: str = "codes" + # Filenames used in various places, you shouldn't edit that + _grids: str = "grids" + _codes: str = "codes" def is_excluded(self, arg_name: str) -> bool: """Return True if the given argument name should be excluded from diff --git a/dora/git_save.py b/dora/git_save.py index 5cad173..7da9e47 100644 --- a/dora/git_save.py +++ b/dora/git_save.py @@ -102,7 +102,7 @@ def get_new_clone(main: DecoratedMain) -> Path: source = get_git_root() commit = get_git_commit() check_repo_clean(source, main) - codes = main.dora.dir / main.dora.codes + codes = main.dora.dir / main.dora._codes codes.mkdir(parents=True, exist_ok=True) target = codes / commit if not target.exists(): diff --git a/dora/grid.py b/dora/grid.py index 05fadab..ea12970 100644 --- a/dora/grid.py +++ b/dora/grid.py @@ -17,7 +17,6 @@ from functools import partial import os from pathlib import Path -import pkgutil import typing as tp import shutil import sys @@ -91,24 +90,30 @@ class RunGridArgs: def _get_explore(args, main): # Finds the explorer. - root_name = main.package + ".grids" - grids = import_or_fatal(root_name) + grid_package = main.dora.grid_package + if grid_package is None: + grid_package = main.package + ".grids" + + grids = import_or_fatal(grid_package) if args.grid is not None: grid_filename = args.grid.replace('.', '/') + '.py' grid_file = Path(grids.__file__).parent / grid_filename if args.grid is None or not grid_file.exists(): candidates = [] - for info in pkgutil.walk_packages([Path(grids.__file__).parent]): - if not info.name.startswith('_'): - candidates.append(info.name) + pkg_root = Path(grids.__file__).parent + for root, folders, files in os.walk(pkg_root): + for file in files: + fullpath = (Path(root) / file).relative_to(pkg_root) + if fullpath.name.endswith('.py') and not fullpath.name.starswith('_'): + candidates.append(str(fullpath).replace('/', '.')) if args.grid is not None and not grid_file.exists(): - log(f'No grid file {grid_filename} in package {root_name}. ' + log(f'No grid file {grid_filename} in package {grid_package}. 
' 'Maybe you made a typo?') log(f"Potential grids are: {', '.join(candidates)}") sys.exit(0) - grid_name = root_name + "." + args.grid + grid_name = grid_package + "." + args.grid grid = import_or_fatal(grid_name) try: @@ -157,7 +162,7 @@ def run_grid(main: DecoratedMain, explorer: Explorer, grid_name: str, if slurm is None: slurm = main.get_slurm_config() - grid_folder = main.dora.dir / main.dora.grids / grid_name + grid_folder = main.dora.dir / main.dora._grids / grid_name grid_folder.mkdir(exist_ok=True, parents=True) herd = Herd() diff --git a/dora/hydra.py b/dora/hydra.py index be7e8a4..e1c7299 100644 --- a/dora/hydra.py +++ b/dora/hydra.py @@ -46,6 +46,13 @@ def _no_copy(self: tp.Any, memo: tp.Any): _Difference = namedtuple("_Difference", "path key ref other ref_value other_value") +class _NotThere: + pass + + +NotThere = _NotThere() + + def _compare_config(ref, other, path=[]): """ Given two configs, gives an iterator over all the differences. For each difference, @@ -58,7 +65,8 @@ def _compare_config(ref, other, path=[]): for key in keys: path[-1] = key ref_value = ref[key] - assert key in other, f"Structure of config should be identical between XPs. Extra key {key}" + if key not in other: + other_value = NotThere() other_value = other[key] if isinstance(ref_value, DictConfig): From 780a492b40842574c3a89b723d3cb85fcf24e14e Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Fri, 15 Apr 2022 18:01:15 +0200 Subject: [PATCH 06/43] fixing listing of files --- dora/grid.py | 3 ++- dora/hydra.py | 10 ++++++---- examples/mnist_hydra/grids/test.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/dora/grid.py b/dora/grid.py index ea12970..5c22724 100644 --- a/dora/grid.py +++ b/dora/grid.py @@ -105,7 +105,8 @@ def _get_explore(args, main): for root, folders, files in os.walk(pkg_root): for file in files: fullpath = (Path(root) / file).relative_to(pkg_root) - if fullpath.name.endswith('.py') and not fullpath.name.starswith('_'): + if fullpath.name.endswith('.py') and not fullpath.name.startswith('_'): + fullpath = fullpath.parent / fullpath.stem candidates.append(str(fullpath).replace('/', '.')) if args.grid is not None and not grid_file.exists(): log(f'No grid file {grid_filename} in package {grid_package}. ' diff --git a/dora/hydra.py b/dora/hydra.py index e1c7299..075d88b 100644 --- a/dora/hydra.py +++ b/dora/hydra.py @@ -65,8 +65,7 @@ def _compare_config(ref, other, path=[]): for key in keys: path[-1] = key ref_value = ref[key] - if key not in other: - other_value = NotThere() + assert key in other, f"XP config shouldn't be missing any key. Missing key {key}" other_value = other[key] if isinstance(ref_value, DictConfig): @@ -76,8 +75,11 @@ def _compare_config(ref, other, path=[]): yield from _compare_config(ref_value, other_value, path) elif other_value != ref_value: yield _Difference(list(path), key, ref, other, ref_value, other_value) - assert len(remaining) == 0, "Structure of config should be identical between XPs. 
"\ - f"Missing keys: {remaining}" + + for key in remaining: + path[-1] = key + other_value = other[key] + yield _Difference(list(path), key, ref, other, NotThere, other_value) path.pop(-1) return delta diff --git a/examples/mnist_hydra/grids/test.py b/examples/mnist_hydra/grids/test.py index 1ead022..04e174e 100644 --- a/examples/mnist_hydra/grids/test.py +++ b/examples/mnist_hydra/grids/test.py @@ -30,7 +30,7 @@ def explorer(launcher): # sub = launcher.bind([f"model.hidden_dim={hidden_dim}"]) sub() sub(gamma=0.6) - + sub({'+new_param': 'whatever'}) # you can define extra keys with '+' if required launcher.bind_(gamma=0.6) launcher.slurm_(mem_per_gpu=20) From 2890532f27ab7af5dbae17088aab8a480bee7b6c Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Fri, 15 Apr 2022 18:24:01 +0200 Subject: [PATCH 07/43] docs --- CHANGELOG.md | 2 ++ README.md | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53254b7..6d694b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,8 @@ Extra "keys" in Hydra config files are now allowed (i.e. overrides with `+someth The package where Dora looks for grids can be customized, in Hydra with `dora.grid_package` in the base config or passing `grid_package='...'` to `argparse_main`. +Better doc for launcher API. + ## [0.1.9] - 2022-02-28 Reliable rmtree used to avoid `--clear` being blocked by some locking issues on NFS. diff --git a/README.md b/README.md index ad4215f..3f2bebd 100644 --- a/README.md +++ b/README.md @@ -412,6 +412,38 @@ This will do 3 thing: - A table containing job status and metadata as well as the latest metrics will be printed every 5 minutes. +### The Launcher API + +Here is a more comprehensive description of what `Launcher` object can do. + +- `launcher.bind_(...)`: remember the given parameters (command line option for argparse based +project, or overrides for Hydra based ones) for future scheduling, i.e. all experiments +later scheduled with that launcher will have those parameters set. +- `sub = launcher.bind(...)`: same as bind, but returns a new "sub" launcher, i.e. the object +`launcher` is not changed, only experiments scheduled with `sub` will use the given params. +`sub` also inherits from all the params already bound to its parent launcher (i.e. previous call to `launcher.bind_`). +Creating a sub-launcher is especially recommended inside loops, to avoid leaking params to the next loop iteration. +- `launcher(...)`: schedules an experiment with the given params, plus all the ones that have +been aggregated through the various calls to `bind_` and to `bind`. This is equivalent to +`launcher.bind(...)()`. +- `launcher.slurm_(key=value, ... and `launcher.slurm(key=value, ...)`: same as `bind_` and `bind` +but for the slurm config (nb of GPUs etc). For a list of possible options, checkout +[SlurmConf](https://facebookresearch.github.io/dora/dora/conf.html#dora.conf.SlurmConfig). + + +Now let us describe the format for passing parameters overrides or command line flags to +`launcher.bind_()`, `launcher.bind()` or `launcher()`: + +- Simple parameters (i.e. not nested) can be passed as kwargs, for instance if you have a `--batch_size` flag, you can +do `launcher.bind(batch_size=64)`. +- Command line flags can be explicitely passed as a list of strings, for instance `launcher.bind(['--lr=1e-4'])`. +- A dictionary of overrides can be passed, for instance `launcher.bind({'batch_size': 64})`. 
Note that this +also allows for nested keys in Hydra: `launcher.bind({'model.channels': 256})`. With Hydra, you can +also define new keys with `{'+model.activation': 'relu'}`. You must not remove keys though. +- Finally you can combine all of those (for a Hydra project here): +`launcher.bind(['optim.lr=1e-4'], {'model.channels': 256, 'seed': 42}, {'+model.activation': 'relu'}, batch_size=64)`. + + ### Flags The `dora grid` command supports the following flags: From 7e5d8942189126afbd4cf716b10bbd35aab12fbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?= Date: Tue, 19 Apr 2022 09:50:46 +0200 Subject: [PATCH 08/43] Customize grid package, better doc, and allows adding keys in hydra (#25) * allow extra config keys in hydra, allow to customize grid package * fixing listing of files * docs * bump version * doc * Update README.md * plop * plop * fix unit --- CHANGELOG.md | 6 +++++ README.md | 35 ++++++++++++++++++++++++++++++ dora/__init__.py | 2 +- dora/conf.py | 8 +++++-- dora/git_save.py | 2 +- dora/grid.py | 24 ++++++++++++-------- dora/hydra.py | 16 +++++++++++--- dora/tests/test_hydra.py | 4 ++-- examples/mnist_hydra/grids/test.py | 2 +- 9 files changed, 80 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ca316fd..6d694b5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,12 @@ Allow for empty `mem` constraint in Slurm. Fixing `callbacks` default value in PL. +Extra "keys" in Hydra config files are now allowed (i.e. overrides with `+something=12`). + +The package where Dora looks for grids can be customized, in Hydra with `dora.grid_package` in the base config or passing `grid_package='...'` to `argparse_main`. + +Better doc for launcher API. + ## [0.1.9] - 2022-02-28 Reliable rmtree used to avoid `--clear` being blocked by some locking issues on NFS. diff --git a/README.md b/README.md index ad4215f..2816ca7 100644 --- a/README.md +++ b/README.md @@ -412,6 +412,41 @@ This will do 3 things: - A table containing job status and metadata as well as the latest metrics will be printed every 5 minutes. +### The Launcher API + +Here is a more comprehensive description of what the `Launcher` object can do. + +- `launcher.bind_(...)`: remember the given parameters (command line options for argparse based +projects, or overrides for Hydra based ones) for future scheduling, i.e. all experiments +later scheduled with that launcher will have those parameters set. +- `sub = launcher.bind(...)`: same as `bind_`, but returns a new "sub" launcher, i.e. the object +`launcher` is not changed, only experiments scheduled with `sub` will use the given params. +`sub` also inherits from all the params already bound to its parent launcher (i.e. previous call to `launcher.bind_`). +Creating a sub-launcher is especially recommended inside loops, to avoid leaking params to the next loop iteration. +- `launcher(...)`: schedules an experiment with the given params, plus all the ones that have +been aggregated through the various calls to `bind_` and to `bind`. This is equivalent to +`launcher.bind(...)()`. +- `launcher.slurm_(key=value, ...)` and `launcher.slurm(key=value, ...)`: same as `bind_` and `bind` +but for the slurm config (nb of GPUs etc). For a list of possible options, check out +[SlurmConf](https://facebookresearch.github.io/dora/dora/conf.html#dora.conf.SlurmConfig). + + +Now let us describe the format for passing parameter overrides or command line flags to +`launcher.bind_()`, `launcher.bind()` or `launcher()`: + +- Simple parameters (i.e. 
not nested) can be passed as kwargs, for instance if you have a `--batch_size` flag, you can +do `launcher.bind(batch_size=64)`. +- Command line flags can be explicitly passed as a list of strings, for instance `launcher.bind(['--lr=1e-4'])`. +- A dictionary of overrides can be passed, for instance `launcher.bind({'batch_size': 64})`. Note that this +also allows for nested keys in Hydra: `launcher.bind({'model.channels': 256})`. With Hydra, you can +also define new keys with `{'+model.activation': 'relu'}`. You must not remove keys though. +- Finally you can combine all of those (for a Hydra project here): + +```python +launcher.bind(['optim.lr=1e-4'], {'model.channels': 256, 'seed': 42}, {'+model.activation': 'relu'}, batch_size=64) +``` + + ### Flags The `dora grid` command supports the following flags: diff --git a/dora/__init__.py b/dora/__init__.py index a75fbe4..7f2b7b3 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.10a8" +__version__ = "0.1.10a9" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/conf.py b/dora/conf.py index c3ebe42..f197ad4 100644 --- a/dora/conf.py +++ b/dora/conf.py @@ -141,11 +141,14 @@ class DoraConfig: shared (Path or None): if provided, the path to a central repository of XPs. For the moment, this only supports sharing hyper-params, logs etc. will stay in the per user folder. + grid_package (str or None): if provided, package to look for grids. Default + to the package with the `train.py` module followed by `.grids`. """ dir: Path = Path("./outputs") # where everything will be stored exclude: tp.List[str] = field(default_factory=list) git_save: bool = False shared: tp.Optional[Path] = None # Optional path for shared XPs. + grid_package: tp.Optional[str] = None # Those are internal config values and are unlikely to be changed history: str = "history.json" # where metrics will be stored @@ -154,8 +157,9 @@ class DoraConfig: shep: ShepConfig = field(default_factory=ShepConfig) rendezvous_file: str = "rendezvous.txt" use_rendezvous: bool = False - grids: str = "grids" - codes: str = "codes" + # Filenames used in various places, you shouldn't edit that + _grids: str = "grids" + _codes: str = "codes" def is_excluded(self, arg_name: str) -> bool: """Return True if the given argument name should be excluded from diff --git a/dora/git_save.py b/dora/git_save.py index 5cad173..7da9e47 100644 --- a/dora/git_save.py +++ b/dora/git_save.py @@ -102,7 +102,7 @@ def get_new_clone(main: DecoratedMain) -> Path: source = get_git_root() commit = get_git_commit() check_repo_clean(source, main) - codes = main.dora.dir / main.dora.codes + codes = main.dora.dir / main.dora._codes codes.mkdir(parents=True, exist_ok=True) target = codes / commit if not target.exists(): diff --git a/dora/grid.py b/dora/grid.py index 05fadab..5c22724 100644 --- a/dora/grid.py +++ b/dora/grid.py @@ -17,7 +17,6 @@ from functools import partial import os from pathlib import Path -import pkgutil import typing as tp import shutil import sys @@ -91,24 +90,31 @@ class RunGridArgs: def _get_explore(args, main): # Finds the explorer. 
- root_name = main.package + ".grids" - grids = import_or_fatal(root_name) + grid_package = main.dora.grid_package + if grid_package is None: + grid_package = main.package + ".grids" + + grids = import_or_fatal(grid_package) if args.grid is not None: grid_filename = args.grid.replace('.', '/') + '.py' grid_file = Path(grids.__file__).parent / grid_filename if args.grid is None or not grid_file.exists(): candidates = [] - for info in pkgutil.walk_packages([Path(grids.__file__).parent]): - if not info.name.startswith('_'): - candidates.append(info.name) + pkg_root = Path(grids.__file__).parent + for root, folders, files in os.walk(pkg_root): + for file in files: + fullpath = (Path(root) / file).relative_to(pkg_root) + if fullpath.name.endswith('.py') and not fullpath.name.startswith('_'): + fullpath = fullpath.parent / fullpath.stem + candidates.append(str(fullpath).replace('/', '.')) if args.grid is not None and not grid_file.exists(): - log(f'No grid file {grid_filename} in package {root_name}. ' + log(f'No grid file {grid_filename} in package {grid_package}. ' 'Maybe you made a typo?') log(f"Potential grids are: {', '.join(candidates)}") sys.exit(0) - grid_name = root_name + "." + args.grid + grid_name = grid_package + "." + args.grid grid = import_or_fatal(grid_name) try: @@ -157,7 +163,7 @@ def run_grid(main: DecoratedMain, explorer: Explorer, grid_name: str, if slurm is None: slurm = main.get_slurm_config() - grid_folder = main.dora.dir / main.dora.grids / grid_name + grid_folder = main.dora.dir / main.dora._grids / grid_name grid_folder.mkdir(exist_ok=True, parents=True) herd = Herd() diff --git a/dora/hydra.py b/dora/hydra.py index be7e8a4..075d88b 100644 --- a/dora/hydra.py +++ b/dora/hydra.py @@ -46,6 +46,13 @@ def _no_copy(self: tp.Any, memo: tp.Any): _Difference = namedtuple("_Difference", "path key ref other ref_value other_value") +class _NotThere: + pass + + +NotThere = _NotThere() + + def _compare_config(ref, other, path=[]): """ Given two configs, gives an iterator over all the differences. For each difference, @@ -58,7 +65,7 @@ def _compare_config(ref, other, path=[]): for key in keys: path[-1] = key ref_value = ref[key] - assert key in other, f"Structure of config should be identical between XPs. Extra key {key}" + assert key in other, f"XP config shouldn't be missing any key. Missing key {key}" other_value = other[key] if isinstance(ref_value, DictConfig): @@ -68,8 +75,11 @@ def _compare_config(ref, other, path=[]): yield from _compare_config(ref_value, other_value, path) elif other_value != ref_value: yield _Difference(list(path), key, ref, other, ref_value, other_value) - assert len(remaining) == 0, "Structure of config should be identical between XPs. 
"\ - f"Missing keys: {remaining}" + + for key in remaining: + path[-1] = key + other_value = other[key] + yield _Difference(list(path), key, ref, other, NotThere, other_value) path.pop(-1) return delta diff --git a/dora/tests/test_hydra.py b/dora/tests/test_hydra.py index a749eb4..2944b97 100644 --- a/dora/tests/test_hydra.py +++ b/dora/tests/test_hydra.py @@ -98,8 +98,8 @@ def test_hydra(tmpdir): assert name == "opt.loss=l1" argv = ["+k=youpi"] - with pytest.raises(AssertionError): - xp2 = call(main, argv) + xp2 = call(main, argv) + assert xp2.cfg.k == 'youpi' with pytest.raises(ValueError): main.value_to_argv(0.5) diff --git a/examples/mnist_hydra/grids/test.py b/examples/mnist_hydra/grids/test.py index 1ead022..04e174e 100644 --- a/examples/mnist_hydra/grids/test.py +++ b/examples/mnist_hydra/grids/test.py @@ -30,7 +30,7 @@ def explorer(launcher): # sub = launcher.bind([f"model.hidden_dim={hidden_dim}"]) sub() sub(gamma=0.6) - + sub({'+new_param': 'whatever'}) # you can define extra keys with '+' if required launcher.bind_(gamma=0.6) launcher.slurm_(mem_per_gpu=20) From 00be644115e37139a18de4997fb4924ee851f9ee Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Wed, 27 Apr 2022 18:40:09 +0200 Subject: [PATCH 09/43] fix git_save with custom grid --- dora/__init__.py | 2 +- dora/git_save.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/dora/__init__.py b/dora/__init__.py index 7f2b7b3..57f1d36 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.10a9" +__version__ = "0.1.10a10" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/git_save.py b/dora/git_save.py index 7da9e47..1604cd3 100644 --- a/dora/git_save.py +++ b/dora/git_save.py @@ -40,7 +40,9 @@ def check_repo_clean(root: Path, main: DecoratedMain): # Here we try to detect the grids package and allow uncommitted changes # only to that folder. The rational is that as we edit the grid file, it is a pain # to constantly be commiting change to it and it should not impact the actual run code. - grid_name = main.name + ".grids" + grid_name = main.dora.grid_package + if grid_name is None: + grid_name = main.package + ".grids" spec = importlib.util.find_spec(grid_name) grid_path: tp.Optional[Path] = None if spec is not None: From c0b466cb7aa9aabb5c574767912fc2c9206aefd9 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Tue, 31 May 2022 16:49:10 +0200 Subject: [PATCH 10/43] testing complex types --- dora/tests/test_conf/test_conf.yaml | 4 ++++ dora/tests/test_hydra.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/dora/tests/test_conf/test_conf.yaml b/dora/tests/test_conf/test_conf.yaml index d8c6bc3..d991d30 100644 --- a/dora/tests/test_conf/test_conf.yaml +++ b/dora/tests/test_conf/test_conf.yaml @@ -7,6 +7,10 @@ optim: loss: mse num_workers: 10 +complex_types: + a: [1, 2, 3] + b: {"a": 1, "b": 2} + useless: a: 5 b: true diff --git a/dora/tests/test_hydra.py b/dora/tests/test_hydra.py index 2944b97..e053645 100644 --- a/dora/tests/test_hydra.py +++ b/dora/tests/test_hydra.py @@ -117,3 +117,13 @@ def test_hydra(tmpdir): argv = ["group=lapin", "plop.b=5"] with pytest.raises(Exception): xp2 = call(main, argv) + + +def test_complex_types(tmpdir): + # Test complex types parsing (e.g. 
lists and dict) + _main.__module__ = __name__ + main = get_main(tmpdir) + xp = call(main, ['complex.a=[0]']) + assert xp.cfg.complex.a == [0] + xp = call(main, ['complex.b={"c": "hello"}']) + assert xp.cfg.complex.b == {"c": "hello"} From 252f73c643596acb4162d9b32eda0c22ec202ff8 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Wed, 1 Jun 2022 14:39:26 +0200 Subject: [PATCH 11/43] fixing dict support in dora --- .github/workflows/documentation.yml | 1 + CHANGELOG.md | 2 ++ dora/__init__.py | 2 +- dora/hydra.py | 52 ++++++++++++++++++++++++--- dora/tests/test_conf/test_conf.yaml | 6 +++- dora/tests/test_grid.py | 36 +++++++++++++++++++ dora/tests/test_hydra.py | 12 +++++-- examples/mnist_hydra/conf/config.yaml | 2 ++ setup.py | 2 -- 9 files changed, 104 insertions(+), 11 deletions(-) diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml index 9869d87..baeead7 100644 --- a/.github/workflows/documentation.yml +++ b/.github/workflows/documentation.yml @@ -18,6 +18,7 @@ jobs: pip install '.[dev]' git config --global user.email "defossez@fb.com" git config --global user.name "Alexandre Défossez (autodoc)" + export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python - name: Reset branch diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d694b5..b02adea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ The package where Dora looks for grids can be customized, in Hydra with `dora.gr Better doc for launcher API. +Fix dict support with Hydra. Okay it is time that I release a new version now... + ## [0.1.9] - 2022-02-28 Reliable rmtree used to avoid `--clear` being blocked by some locking issues on NFS. diff --git a/dora/__init__.py b/dora/__init__.py index 57f1d36..36fb978 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.10a10" +__version__ = "0.1.10a11" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/hydra.py b/dora/hydra.py index 075d88b..f487ac5 100644 --- a/dora/hydra.py +++ b/dora/hydra.py @@ -99,12 +99,43 @@ def _simplify_argv(argv: tp.Sequence[str]) -> tp.List[str]: return simplified[::-1] +def _dump_key(key): + if key is None: + return "null" + elif isinstance(key, (bool, int, float)): + return str(key) + elif isinstance(key, str): + assert ":" not in key + return key + else: + raise TypeError(f"Unsupported dict key type {type(key)} for key {key}") + + +def _hydra_value_as_override(value): + # hydra doesn't support parsing dict with the json format, so for now + # we have to use a custom function to dump a value. 
+ if value is None: + return "null" + elif isinstance(value, (bool, int, float, str)): + return json.dumps(value) + elif isinstance(value, dict): + return "{" + ", ".join( + f"{_dump_key(key)}: {_hydra_value_as_override(val)}" + for key, val in value.items() + ) + "}" + elif isinstance(value, (list, tuple)): + return "[" + ", ".join(_hydra_value_as_override(val) for val in value) + "]" + else: + raise TypeError(f"Unsupported value type {type(value)} for value {value}") + + class HydraMain(DecoratedMain): _slow = True - def __init__(self, main: MainFun, config_name: str, config_path: str): + def __init__(self, main: MainFun, config_name: str, config_path: str, **kwargs): self.config_name = config_name self.config_path = config_path + self.hydra_kwargs = kwargs module = main.__module__ if module == "__main__": @@ -161,13 +192,19 @@ def get_xp(self, argv: tp.Sequence[str]): return xp def value_to_argv(self, arg: tp.Any) -> tp.List[str]: + # Here we get the raw stuff from what is passed to the grid launcher. + # arg is either a str (in which case it is a raw override) + # or a dict, in which case each entry is an override, + # or a list of dict or a list of str. argv = [] if isinstance(arg, str): argv.append(arg) elif isinstance(arg, dict): for key, value in arg.items(): if key not in self._config_groups: - value = json.dumps(value) + # We need to convert the value using a custom function + # to respect how Hydra parses overrides. + value = _hydra_value_as_override(value) argv.append(f"{key}={value}") elif isinstance(arg, (list, tuple)): for part in arg: @@ -190,7 +227,8 @@ def _main(self): try: return hydra.main( config_name=self.config_name, - config_path=self.config_path)(self.main)() + config_path=self.config_path, + **self.hydra_kwargs)(self.main)() finally: if is_xp(): sys.argv.remove(run_dir) @@ -259,7 +297,11 @@ def _get_delta(self, init: DictConfig, other: DictConfig): return delta -def hydra_main(config_name: str, config_path: str): +def hydra_main(config_name: str, config_path: str, **kwargs): + """Wrap your main function with this. + You can pass extra kwargs, e.g. `version_base` introduced in 1.2. 
+ """ def _decorator(main: MainFun): - return HydraMain(main, config_name=config_name, config_path=config_path) + return HydraMain(main, config_name=config_name, config_path=config_path, + **kwargs) return _decorator diff --git a/dora/tests/test_conf/test_conf.yaml b/dora/tests/test_conf/test_conf.yaml index d991d30..cb976d0 100644 --- a/dora/tests/test_conf/test_conf.yaml +++ b/dora/tests/test_conf/test_conf.yaml @@ -7,7 +7,7 @@ optim: loss: mse num_workers: 10 -complex_types: +complex: a: [1, 2, 3] b: {"a": 1, "b": 2} @@ -21,3 +21,7 @@ dora: slurm: cpus_per_task: 5 + +hydra: + job: + chdir: true \ No newline at end of file diff --git a/dora/tests/test_grid.py b/dora/tests/test_grid.py index 3fa1974..15caec0 100644 --- a/dora/tests/test_grid.py +++ b/dora/tests/test_grid.py @@ -6,9 +6,11 @@ from ..conf import SubmitRules from ..explore import Explorer, Launcher +from ..hydra import HydraMain from ..grid import run_grid, RunGridArgs from .fake_shep import mock_shep from .test_main import get_main +from .test_hydra import get_main as get_main_hydra _ret = None @@ -70,3 +72,37 @@ def rgrid(explore): assert sheeps[0].state() == "UNKNOWN" assert sheeps[0].job.job_id == "2" assert old_sheep.state() == "CANCELLED" + + +def explore_hydra(launcher: Launcher): + launcher.bind_({'epochs': 50, 'optim.loss': '123', 'num_workers': None}) + launcher({'complex.a': [{"test": "weird"}]}) + launcher({'complex.b': {"a": 21, "b": 4}}) + launcher({'+complex.b': {"a": 21, "b": 4, "c": 13}}) + + +def test_shep_hydra(tmpdir): + def rgrid(explore): + return run_grid(main, Explorer(explore), "unittest", + rules=rules, args=args) + HydraMain._slow = False + with mock_shep(): + main = get_main_hydra(tmpdir) + rules = SubmitRules() + args = RunGridArgs() + args.monitor = False + args.dry_run = True + + sheeps = rgrid(explore_hydra) + assert len(sheeps) == 3 + cfg = sheeps[0].xp.cfg + assert cfg.epochs == 50 + assert cfg.optim.loss == '123' + assert cfg.num_workers is None + assert cfg.complex.a == [{"test": "weird"}] + + cfg = sheeps[1].xp.cfg + assert cfg.complex.b == {"a": 21, "b": 4} + + cfg = sheeps[2].xp.cfg + assert cfg.complex.b == {"a": 21, "b": 4, "c": 13} diff --git a/dora/tests/test_hydra.py b/dora/tests/test_hydra.py index e053645..e8ef58f 100644 --- a/dora/tests/test_hydra.py +++ b/dora/tests/test_hydra.py @@ -123,7 +123,15 @@ def test_complex_types(tmpdir): # Test complex types parsing (e.g. 
lists and dict) _main.__module__ = __name__ main = get_main(tmpdir) + xp = call(main, []) + print(xp.cfg.complex) + assert xp.cfg.complex.a == [1, 2, 3] xp = call(main, ['complex.a=[0]']) assert xp.cfg.complex.a == [0] - xp = call(main, ['complex.b={"c": "hello"}']) - assert xp.cfg.complex.b == {"c": "hello"} + xp = call(main, ['complex.b.a=50']) + assert xp.cfg.complex.b == {"a": 50, "b": 2} + xp = call(main, ['complex.b={a:21}']) + assert xp.cfg.complex.b == {"a": 21, "b": 2} + argv = main.value_to_argv({"complex.b": {"a": 21, "b": 52}}) + xp = call(main, argv) + assert xp.cfg.complex.b == {"a": 21, "b": 52} diff --git a/examples/mnist_hydra/conf/config.yaml b/examples/mnist_hydra/conf/config.yaml index 2724657..4c0dc32 100644 --- a/examples/mnist_hydra/conf/config.yaml +++ b/examples/mnist_hydra/conf/config.yaml @@ -23,6 +23,8 @@ slurm: cpus_per_gpu: 10 hydra: + job: + chdir: true job_logging: handlers: file: diff --git a/setup.py b/setup.py index 1bf794a..7e74723 100644 --- a/setup.py +++ b/setup.py @@ -27,8 +27,6 @@ HERE = Path(__file__).parent -REQUIRED = [i.strip() for i in open("requirements.txt") if '/' not in i] - try: with open(HERE / "README.md", encoding='utf-8') as f: long_description = '\n' + f.read() From ae0b919de0015e99963e3dec2b4c559f8aab3ead Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Thu, 9 Jun 2022 14:49:21 +0200 Subject: [PATCH 12/43] doc --- CHANGELOG.md | 2 +- README.md | 2 +- dora/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b02adea..e17bc14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [0.1.10a] - ... +## [0.1.10] - 2022-06-09 Updated and simplified PyTorch Lightning distributed integration. Improved overall integration with PL, in particular with PLLogProgress and simplified diff --git a/README.md b/README.md index 2816ca7..591bee6 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ pip install -U dora-search See [the changelog](CHANGELOG.md) for details on releases. -- TBD: version 0.1.10: adding HiPlot support ! Updated PL support. +- 2022-06-09: version 0.1.10: adding HiPlot support ! Updated PL support, many small fixes. - 2022-02-28: version 0.1.9 - 2021-12-10: version 0.1.8: see changelog, many of small changes. - 2021-11-08: version 0.1.7: support for job arrays added. 
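As background for the dict-support fix in patch 11 above: Hydra's override grammar is not JSON (it parses `{a: 1}` but not `{"a": 1}`), so the grid launcher serializes dict values with `_hydra_value_as_override` instead of `json.dumps`. A sketch of the expected behavior, assuming the implementation shown in that patch:

```python
from dora.hydra import _hydra_value_as_override

# Scalars are JSON-encoded, with None mapped to Hydra's `null`.
assert _hydra_value_as_override(None) == "null"
assert _hydra_value_as_override("mse") == '"mse"'
# Dict keys stay unquoted so that Hydra can parse the override.
assert _hydra_value_as_override({"a": 1, "b": [2, 3]}) == "{a: 1, b: [2, 3]}"
# Hence launcher.bind({'complex.b': {'a': 21, 'b': 4}}) in a grid becomes
# the command line override: complex.b={a: 21, b: 4}
```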
diff --git a/dora/__init__.py b/dora/__init__.py index 36fb978..385c65f 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.10a11" +__version__ = "0.1.10" # flake8: noqa from .explore import Explorer, Launcher From 2544537108856f5437223eadcbf0988584e82df1 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Mon, 13 Jun 2022 15:22:13 +0200 Subject: [PATCH 13/43] fixing hiplot support with custom grids --- dora/__init__.py | 2 +- dora/hiplot.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dora/__init__.py b/dora/__init__.py index 385c65f..c96fae2 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.10" +__version__ = "0.1.11a1" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/hiplot.py b/dora/hiplot.py index 8836261..a2dd29a 100644 --- a/dora/hiplot.py +++ b/dora/hiplot.py @@ -77,19 +77,22 @@ def load(uri: str) -> tp.Any: explorer_module: tp.Optional[str] = None explorer_name = "HiPlotExplorer" value: tp.Any + grids_name = main.dora.grid_package + if grids_name is None: + grids_name = main.package + ".grids" for token in shlex.split(uri): if '=' in token: key, value = token.split('=', 1) if key == 'explorer': explorer_name = value if explorer_module is None: - explorer_module = main.package + '.grids._hiplot' + explorer_module = grids_name + '._hiplot' elif key == 'explorer_module': explorer_module = value else: raise ValueError(f"Invalid param {key}") continue - grid_folder = main.dora.dir / main.dora.grids / token + grid_folder = main.dora.dir / main.dora._grids / token if grid_folder.exists(): for child in grid_folder.iterdir(): sigs.add(child.name) From 817c4763057bc8238bedfbf59ca1cdf8c3de7ae7 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Mon, 13 Jun 2022 15:32:04 +0200 Subject: [PATCH 14/43] support more config in hiplot --- dora/hiplot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dora/hiplot.py b/dora/hiplot.py index a2dd29a..59d0448 100644 --- a/dora/hiplot.py +++ b/dora/hiplot.py @@ -144,7 +144,7 @@ def load(uri: str) -> tp.Any: if isinstance(value, BaseContainer): value = OmegaConf.to_container(value, resolve=True) if isinstance(value, list): - value = ', '.join(value) + value = ', '.join(map(str, value)) values[key] = value values['sig'] = xp.sig from_uid: tp.Optional[str] = None From 8532cae63c75d391667fa145c5960ed5a7864db6 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Mon, 27 Jun 2022 17:45:14 +0200 Subject: [PATCH 15/43] adding exclude for slurm --- dora/__init__.py | 2 +- dora/conf.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dora/__init__.py b/dora/__init__.py index c96fae2..9647845 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11a1" +__version__ = "0.1.11a2" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/conf.py b/dora/conf.py index f197ad4..9ebf61a 100644 --- a/dora/conf.py +++ b/dora/conf.py @@ -87,6 +87,7 @@ class SlurmConfig: constraint: str = "" one_task_per_node: bool = False array_parallelism: int = 256 + exclude: tp.Optional[str] = None @dataclass From 4b49997108383ca5b31c93e88d3d3892a23a800f Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Thu, 21 Jul 2022 10:48:03 +0200 Subject: [PATCH 16/43] using job id when available as part of the master port seed --- CHANGELOG.md | 4 
++++ dora/distrib.py | 12 ++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e17bc14..d77542c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [0.1.11a] - TBD + +Use job id based seed to avoid systematic failures with port allocation for distributed. + ## [0.1.10] - 2022-06-09 Updated and simplified PyTorch Lightning distributed integration. diff --git a/dora/distrib.py b/dora/distrib.py index 1f9edb9..3a19b5c 100644 --- a/dora/distrib.py +++ b/dora/distrib.py @@ -19,7 +19,7 @@ DistribSpec = namedtuple( - "DistribSpec", "rank world_size local_rank node_rank num_nodes source") + "DistribSpec", "rank world_size local_rank node_rank num_nodes source job_id") def set_distrib_env(): @@ -42,7 +42,15 @@ def set_distrib_env(): xp = get_xp() # Note that running twice the same XP on the same node will crash, # but that shouldn't really happen - rng = random.Random(int(xp.sig, 16)) + seed = xp.sig + # If we are in a Slurm job, let us use the Slurm job id. + try: + env = submitit.JobEnvironment() + except RuntimeError: + pass + else: + seed += env.job_id + rng = random.Random(seed) master_port = rng.randint(20000, 60000) os.environ['MASTER_PORT'] = str(master_port) if 'WORLD_SIZE' not in os.environ: From b19a1aeceb9a8c153aba2a78e58d7e59417a7e3b Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Thu, 21 Jul 2022 09:03:36 +0000 Subject: [PATCH 17/43] fix --- dora/distrib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dora/distrib.py b/dora/distrib.py index 3a19b5c..eb60145 100644 --- a/dora/distrib.py +++ b/dora/distrib.py @@ -19,7 +19,7 @@ DistribSpec = namedtuple( - "DistribSpec", "rank world_size local_rank node_rank num_nodes source job_id") + "DistribSpec", "rank world_size local_rank node_rank num_nodes source") def set_distrib_env(): From 99b81caa5bc5d1a16229b6f0d3caeae21c610f97 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Thu, 21 Jul 2022 09:58:34 +0000 Subject: [PATCH 18/43] allow os env to override submitit env --- dora/distrib.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/dora/distrib.py b/dora/distrib.py index eb60145..4e83c39 100644 --- a/dora/distrib.py +++ b/dora/distrib.py @@ -64,30 +64,30 @@ def get_distrib_spec(): This can be used even before distributed training is initialized, which is useful for PytorchLightning for instance. 
""" - try: - env = submitit.JobEnvironment() - except RuntimeError: - if 'WORLD_SIZE' in os.environ: - rank = int(os.environ['RANK']) - world_size = int(os.environ['WORLD_SIZE']) - local_rank = rank - node_rank = 0 - num_nodes = 1 - source = "env" - else: + if 'WORLD_SIZE' in os.environ: + rank = int(os.environ['RANK']) + world_size = int(os.environ['WORLD_SIZE']) + local_rank = rank + node_rank = 0 + num_nodes = 1 + source = "env" + else: + try: + env = submitit.JobEnvironment() + except RuntimeError: rank = 0 world_size = 1 local_rank = 0 node_rank = 0 num_nodes = 1 source = "empty" - else: - rank = env.global_rank - world_size = env.num_tasks - local_rank = env.local_rank - node_rank = env.node - num_nodes = env.num_nodes - source = "submitit" + else: + rank = env.global_rank + world_size = env.num_tasks + local_rank = env.local_rank + node_rank = env.node + num_nodes = env.num_nodes + source = "submitit" return DistribSpec(rank, world_size, local_rank, node_rank, num_nodes, source) From 1c69fd1727fc21ec7bb0e9cfb13bf3baa0cae503 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Fri, 22 Jul 2022 14:46:18 +0200 Subject: [PATCH 19/43] bump version --- dora/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dora/__init__.py b/dora/__init__.py index 9647845..75418cc 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11a2" +__version__ = "0.1.11a3" # flake8: noqa from .explore import Explorer, Launcher From 1d210f4b5cc4f5ff786eb61516bd178b749bd9bc Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Thu, 28 Jul 2022 16:55:01 +0200 Subject: [PATCH 20/43] fixing new hydra version warnings --- dora/__init__.py | 2 +- dora/hydra.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/dora/__init__.py b/dora/__init__.py index 75418cc..33af282 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11a3" +__version__ = "0.1.11a4" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/hydra.py b/dora/hydra.py index f487ac5..641593f 100644 --- a/dora/hydra.py +++ b/dora/hydra.py @@ -234,7 +234,8 @@ def _main(self): sys.argv.remove(run_dir) def _get_config_groups(self) -> tp.List[str]: - with initialize_config_dir(str(self.full_config_path), job_name=self._job_name): + with initialize_config_dir(str(self.full_config_path), job_name=self._job_name, + **self.hydra_kwargs): gh = GlobalHydra.instance().hydra assert gh is not None return list(gh.list_all_config_groups()) @@ -251,7 +252,8 @@ def _get_base_config( Return base config based on composition, along with delta for the composition overrides. """ - with initialize_config_dir(str(self.full_config_path), job_name=self._job_name): + with initialize_config_dir(str(self.full_config_path), job_name=self._job_name, + **self.hydra_kwargs): gh = GlobalHydra.instance().hydra assert gh is not None to_keep = [] @@ -274,7 +276,8 @@ def _get_config(self, Internal method, returns the config for the given override, but without the dora.sig field filled. 
""" - with initialize_config_dir(str(self.full_config_path), job_name=self._job_name): + with initialize_config_dir(str(self.full_config_path), job_name=self._job_name, + **self.hydra_kwargs): return self._get_config_noinit(overrides) def _get_config_noinit(self, overrides: tp.List[str] = []) -> DictConfig: From 55fb7161f62d487db85fcc864291d7c148a75988 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Fri, 29 Jul 2022 13:16:23 +0200 Subject: [PATCH 21/43] allowing non cuda based ddp for unit tests --- dora/__init__.py | 2 +- dora/__main__.py | 2 ++ dora/distrib.py | 6 +++++- dora/executor.py | 5 +++-- dora/run.py | 2 +- 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/dora/__init__.py b/dora/__init__.py index 33af282..1dd72c0 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11a4" +__version__ = "0.1.11a5" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/__main__.py b/dora/__main__.py index dee5baa..5ee8fe9 100644 --- a/dora/__main__.py +++ b/dora/__main__.py @@ -101,6 +101,8 @@ def get_parser(): run = subparsers.add_parser("run", help="Run locally the given command.") run.add_argument("-f", "--from_sig", help="Signature of job to use as baseline.") run.add_argument("-d", "--ddp", action="store_true", help="Distributed training.") + run.add_argument("--ddp_workers", type=int, + help="Nb of workers for distributed, default to nb of GPUs.") run.add_argument("--git_save", action="store_true", default=False, help="Run from a clean git clone.") run.add_argument("--clear", action='store_true', diff --git a/dora/distrib.py b/dora/distrib.py index 4e83c39..5685585 100644 --- a/dora/distrib.py +++ b/dora/distrib.py @@ -102,7 +102,11 @@ def init(backend='nccl'): logger.info("world_size is 1, skipping init.") return xp = get_xp() - torch.cuda.set_device(spec.local_rank) + if torch.cuda.is_available(): + torch.cuda.set_device(spec.local_rank) + else: + assert backend != 'nccl' + if xp.dora.use_rendezvous: init_method = 'file://' + os.path.abspath(xp.rendezvous_file) else: diff --git a/dora/executor.py b/dora/executor.py index 0a0a12d..34ead74 100644 --- a/dora/executor.py +++ b/dora/executor.py @@ -11,6 +11,7 @@ import os import subprocess as sp import sys +import typing as tp from .log import simple_log, fatal @@ -55,10 +56,10 @@ def __exit__(self, exc_type, exc_value, traceback): log("All workers completed successfully") -def start_ddp_workers(main, argv): +def start_ddp_workers(main, argv, num_workers: tp.Optional[int] = None): import torch as th - world_size = th.cuda.device_count() + world_size = num_workers or th.cuda.device_count() if not world_size: fatal( "DDP is only available on GPU. 
Make sure GPUs are properly configured with cuda.") diff --git a/dora/run.py b/dora/run.py index d985f31..cc1d8c4 100644 --- a/dora/run.py +++ b/dora/run.py @@ -58,7 +58,7 @@ def run_action(args, main: DecoratedMain): os.execv(sys.executable, [sys.executable, "-m", "dora"] + sys.argv[1:]) if args.ddp and not os.environ.get('RANK'): check_job_and_clear(args.argv, main, args.clear) - start_ddp_workers(main, args.argv) + start_ddp_workers(main, args.argv, args.ddp_workers) else: if 'WORLD_SIZE' not in os.environ: check_job_and_clear(args.argv, main, args.clear) From 3490b0c4fc58feec99d1f089886b802ae1e358e0 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Fri, 29 Jul 2022 13:48:19 +0200 Subject: [PATCH 22/43] fix typing --- dora/executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dora/executor.py b/dora/executor.py index 34ead74..90800da 100644 --- a/dora/executor.py +++ b/dora/executor.py @@ -72,7 +72,7 @@ def start_ddp_workers(main, argv, num_workers: tp.Optional[int] = None): log(f"Starting {world_size} worker processes for DDP.") with ChildrenManager() as manager: for rank in range(world_size): - kwargs = {} + kwargs: tp.Dict[str, tp.Any] = {} env = dict(os.environ) env['RANK'] = str(rank) env['WORLD_SIZE'] = str(world_size) From 1f75ed7703e4dbc868f59b114a05625c630fdf2d Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Sun, 31 Jul 2022 16:04:55 +0000 Subject: [PATCH 23/43] Merge --- dora/__init__.py | 2 +- dora/shep.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dora/__init__.py b/dora/__init__.py index 1dd72c0..6599ec9 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11a5" +__version__ = "0.1.11a6" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/shep.py b/dora/shep.py index 8ecd2b0..ce1af86 100644 --- a/dora/shep.py +++ b/dora/shep.py @@ -203,7 +203,8 @@ def maybe_submit_lazy(self, sheep: Sheep, slurm_config: SlurmConfig, rules: Subm if rules.replace_done: logger.debug(f"Ignoring previously completed job {sheep.job.job_id}") sheep.job = None - elif state in ["FAILED", "CANCELLED", "OUT_OF_MEMORY", "TIMEOUT", "MISSING"]: + elif state in ["FAILED", "CANCELLED", "OUT_OF_MEMORY", "TIMEOUT", "MISSING", + "NODE_FAIL"]: logger.debug(f"Previous job {sheep.job.job_id} failed or was canceled") if rules.retry: sheep.job = None From c787b2a9f564424e1a64d937bfc3ddf7e1a4eb28 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Thu, 4 Aug 2022 18:11:22 +0200 Subject: [PATCH 24/43] always keep the same object for link history --- dora/__init__.py | 2 +- dora/link.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dora/__init__.py b/dora/__init__.py index 1dd72c0..6599ec9 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11a5" +__version__ = "0.1.11a6" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/link.py b/dora/link.py index 34ec86a..655449c 100644 --- a/dora/link.py +++ b/dora/link.py @@ -54,7 +54,7 @@ def update_history(self, history: tp.List[dict]): history = utils.jsonable(history) if not isinstance(history, list): raise ValueError(f"history must be a list, but got {type(history)}") - self.history = history + self.history[:] = history self._commit() def push_metrics(self, metrics: dict): From ccaf534a990008f7ba2912413ecd54c16810d5f0 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Thu, 18 Aug 
2022 10:27:30 +0200 Subject: [PATCH 25/43] removed automatic export of world size that interfered with get distrib spec --- CHANGELOG.md | 2 ++ dora/__init__.py | 2 +- dora/shep.py | 3 --- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d77542c..f9fed02 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). Use job id based seed to avoid systematic failures with port allocation for distributed. +Remove automatic export of WORLD_SIZE and RANK inside submitit job target, which seemed irrelevant. Use `dora.distrib.set_distrib_env` if you relied on it. + ## [0.1.10] - 2022-06-09 Updated and simplified PyTorch Lightning distributed integration. diff --git a/dora/__init__.py b/dora/__init__.py index 6599ec9..8125dd2 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11a6" +__version__ = "0.1.11a7" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/shep.py b/dora/shep.py index ce1af86..3fa56bb 100644 --- a/dora/shep.py +++ b/dora/shep.py @@ -33,9 +33,6 @@ class _SubmitItTarget: def __call__(self, main: DecoratedMain, argv: tp.Sequence[str]): - spec = get_distrib_spec() - os.environ['RANK'] = str(spec.rank) - os.environ['WORLD_SIZE'] = str(spec.world_size) self.xp = main.get_xp(argv) sys.argv[1:] = argv main() From 0d279f828164153c44c59742b7a22b41dd87b039 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Mon, 22 Aug 2022 14:26:22 +0200 Subject: [PATCH 26/43] adding back RANK only --- dora/__init__.py | 2 +- dora/shep.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/dora/__init__.py b/dora/__init__.py index 8125dd2..b084300 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11a7" +__version__ = "0.1.11a8" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/shep.py b/dora/shep.py index 3fa56bb..c3d2ffb 100644 --- a/dora/shep.py +++ b/dora/shep.py @@ -34,6 +34,10 @@ class _SubmitItTarget: def __call__(self, main: DecoratedMain, argv: tp.Sequence[str]): self.xp = main.get_xp(argv) + spec = get_distrib_spec() + # We export the RANK as it can be used to customize logging early on + # in the called program (e.g. using Hydra). + os.environ['RANK'] = str(spec.rank) sys.argv[1:] = argv main() From 81b5a8c1b3e98313f1bab3ace48b5104f7e1189e Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Thu, 22 Sep 2022 14:36:28 +0200 Subject: [PATCH 27/43] release --- CHANGELOG.md | 7 +++++-- dora/__init__.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f9fed02..627a19d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,11 +4,14 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [0.1.11a] - TBD +## [0.1.11] - 2022-09-22 Use job id based seed to avoid systematic failures with port allocation for distributed. -Remove automatic export of WORLD_SIZE and RANK inside submitit job target, which seemed irrelevant. Use `dora.distrib.set_distrib_env` if you relied on it. +Remove automatic export of WORLD_SIZE inside submitit job target, +use `dora.distrib.set_distrib_env` if you relied on it. + +Fixed version_base parameter support that appeared in Hydra. 
## [0.1.10] - 2022-06-09 diff --git a/dora/__init__.py b/dora/__init__.py index b084300..b1180cb 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11a8" +__version__ = "0.1.11" # flake8: noqa from .explore import Explorer, Launcher From dfac43ce0d6821833e0f31f4f905eb68d43f0c58 Mon Sep 17 00:00:00 2001 From: Jean-Remi King Date: Mon, 26 Sep 2022 16:57:43 +0200 Subject: [PATCH 28/43] up --- dora/lightning.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dora/lightning.py b/dora/lightning.py index b1aa561..b72697d 100644 --- a/dora/lightning.py +++ b/dora/lightning.py @@ -85,6 +85,15 @@ def local_rank(self) -> int: def node_rank(self) -> int: return self.spec.node_rank + def detect(self) -> bool: + return False + + def main_address(self) -> str: + return os.environ['MAIN_ADDR'] + + def main_port(self) -> int: + return int(os.environ['MAIN_PORT']) + class DoraCheckpointSync(Callback): """Make sure Dora history, and checkpoint state are in sync. From e6e64379f65f9e17c7fe30ec7c4f04b2063ea88e Mon Sep 17 00:00:00 2001 From: Jean-Remi King Date: Tue, 27 Sep 2022 14:39:46 +0200 Subject: [PATCH 29/43] up --- dora/lightning.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/dora/lightning.py b/dora/lightning.py index b72697d..bd5ca9c 100644 --- a/dora/lightning.py +++ b/dora/lightning.py @@ -61,12 +61,6 @@ def creates_children(self) -> bool: def creates_processes_externally(self) -> bool: return True - def master_address(self) -> str: - return os.environ['MASTER_ADDR'] - - def master_port(self) -> int: - return int(os.environ['MASTER_PORT']) - def world_size(self) -> int: return self.spec.world_size @@ -85,14 +79,17 @@ def local_rank(self) -> int: def node_rank(self) -> int: return self.spec.node_rank - def detect(self) -> bool: + @staticmethod + def detect() -> bool: return False + @staticmethod def main_address(self) -> str: - return os.environ['MAIN_ADDR'] + return os.environ["MAIN_ADDR"] + @staticmethod def main_port(self) -> int: - return int(os.environ['MAIN_PORT']) + return int(os.environ["MAIN_PORT"]) class DoraCheckpointSync(Callback): From d42413a109a805a9e5de0ea59f458e63f391db75 Mon Sep 17 00:00:00 2001 From: Jean-Remi King Date: Tue, 27 Sep 2022 14:49:15 +0200 Subject: [PATCH 30/43] up --- dora/lightning.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dora/lightning.py b/dora/lightning.py index bd5ca9c..84c6872 100644 --- a/dora/lightning.py +++ b/dora/lightning.py @@ -83,11 +83,11 @@ def node_rank(self) -> int: def detect() -> bool: return False - @staticmethod + @property def main_address(self) -> str: return os.environ["MAIN_ADDR"] - @staticmethod + @property def main_port(self) -> int: return int(os.environ["MAIN_PORT"]) From ffa3bf7a79cc8797059f51ec6a8c0ad87b4ef4d3 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Mon, 14 Nov 2022 14:32:02 -0500 Subject: [PATCH 31/43] version bump --- CHANGELOG.md | 4 ++++ dora/__init__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 627a19d..6ae1317 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [0.1.12a] - TBD + +Fixed bug with PL (Thanks @kingjr). + ## [0.1.11] - 2022-09-22 Use job id based seed to avoid systematic failures with port allocation for distributed. 
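The PL fix credited in the changelog entry above (patches 28-30) came down to decorator choice: Lightning reads `main_address`/`main_port` as plain attributes, so they must be properties, while `detect` uses no instance state and is a static method. A standalone illustration of the final, correct form (hypothetical class, not the Dora code itself):

```python
import os

class Env:
    @property
    def main_address(self) -> str:
        # Read as `env.main_address` with no call, hence @property.
        return os.environ.get("MAIN_ADDR", "127.0.0.1")

    @property
    def main_port(self) -> int:
        return int(os.environ.get("MAIN_PORT", "29500"))

    @staticmethod
    def detect() -> bool:
        # No instance state needed, hence @staticmethod.
        return False

env = Env()
assert isinstance(env.main_address, str) and isinstance(env.main_port, int)
assert Env.detect() is False
```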
diff --git a/dora/__init__.py b/dora/__init__.py index b1180cb..d9d15a6 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.11" +__version__ = "0.1.12a1" # flake8: noqa from .explore import Explorer, Launcher From 457d8e36dd46e5af546c87351d57cbd64d7200df Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Wed, 16 Nov 2022 10:22:07 -0500 Subject: [PATCH 32/43] fix linter --- dora/lightning.py | 4 ++-- dora/shep.py | 2 +- dora/xp.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dora/lightning.py b/dora/lightning.py index 84c6872..c0b3f78 100644 --- a/dora/lightning.py +++ b/dora/lightning.py @@ -226,13 +226,13 @@ class PLLogProgress(ProgressBarBase): """ - def __init__(self, logger, **kwargs): + def __init__(self, logger, **kwargs) -> None: super().__init__() # don't forget this :) self.logger = logger self.kwargs = kwargs self._pl_module: tp.Optional[LightningModule] = None - def setup(self, trainer, pl_module, stage: tp.Optional[str] = None) -> None: + def setup(self, trainer, pl_module, stage: str) -> None: super().setup(trainer, pl_module, stage) self._pl_module = pl_module self._replay_history: tp.List[tp.Any] = [] diff --git a/dora/shep.py b/dora/shep.py index c3d2ffb..ab07e86 100644 --- a/dora/shep.py +++ b/dora/shep.py @@ -54,7 +54,7 @@ class Sheep: A Sheep is a specific run for a given XP. Sheeps are managed by the Shepherd. """ - def __init__(self, xp: XP, job: SlurmJob = None): + def __init__(self, xp: XP): self.xp = xp self.job: tp.Optional[submitit.SlurmJob] = None # Other jobs contain the list of other jobs in the array diff --git a/dora/xp.py b/dora/xp.py index d5c58cb..35bb350 100644 --- a/dora/xp.py +++ b/dora/xp.py @@ -120,7 +120,7 @@ def enter(self, stack: bool = False): class _Context: # Used to keep track of a running XP and be able to provide # it on demand with `get_xp`. - def __init__(self): + def __init__(self) -> None: self._xps: tp.List[XP] = [] @contextmanager From 81989f879af5ccb4d63e200f0a59475f271efd0a Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Tue, 22 Nov 2022 13:11:34 +0000 Subject: [PATCH 33/43] fixing errors in config in hiplot --- dora/hiplot.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dora/hiplot.py b/dora/hiplot.py index 59d0448..17c732f 100644 --- a/dora/hiplot.py +++ b/dora/hiplot.py @@ -137,7 +137,10 @@ def load(uri: str) -> tp.Any: exp.parameters_definition[sname].label_css = STYLE.params for key in all_columns: if key not in parts: - value = eval('xp.cfg.' + key, {'xp': xp}) + try: + value = eval('xp.cfg.' 
+ key, {'xp': xp}) + except AttributeError: + value = None sname = main.short_name_part(key, value).split('=', 1)[0] values[sname] = value for key, value in values.items(): From 8e1f1a979a98e4dca657d659e84fc58e9bba2450 Mon Sep 17 00:00:00 2001 From: Alexandre Defossez Date: Tue, 22 Nov 2022 13:20:26 +0000 Subject: [PATCH 34/43] fix argparse param assigned to wrong group --- dora/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dora/__main__.py b/dora/__main__.py index 5ee8fe9..66ed5aa 100644 --- a/dora/__main__.py +++ b/dora/__main__.py @@ -86,8 +86,8 @@ def get_parser(): group.add_argument("-t", "--tail", type=int, help="Show the log for the job with the given index") - group.add_argument("--init", action='store_true', - help="Init the given XPs so that their signature can be referenced.") + grid.add_argument("--init", action='store_true', + help="Init the given XPs so that their signature can be referenced.") grid.add_argument( 'grid', nargs='?', From 042ce7dd0329df7160b82170a37e2c3299d45ae9 Mon Sep 17 00:00:00 2001 From: Jade Copet Date: Wed, 23 Nov 2022 10:04:06 +0000 Subject: [PATCH 35/43] Update SLURM config to use GRES --- dora/shep.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dora/shep.py b/dora/shep.py index ab07e86..e5a67c3 100644 --- a/dora/shep.py +++ b/dora/shep.py @@ -277,13 +277,12 @@ def _get_submitit_executor(self, name: str, folder: Path, if mem_per_gpu: mem = slurm_config.mem_per_gpu * gpus_per_node kwargs['mem'] = f"{mem}GB" + kwargs['gres'] = f'gpu:{gpus}' if slurm_config.one_task_per_node: - kwargs['gpus_per_task'] = gpus_per_node kwargs['ntasks_per_node'] = 1 if slurm_config.cpus_per_task is None: kwargs['cpus_per_task'] = gpus_per_node * slurm_config.cpus_per_gpu else: - kwargs['gpus_per_task'] = 1 kwargs['ntasks_per_node'] = gpus_per_node if slurm_config.cpus_per_task is None: kwargs['cpus_per_task'] = slurm_config.cpus_per_gpu From 6f607097777a7cf9338d92c370d848129763077d Mon Sep 17 00:00:00 2001 From: Jade Copet Date: Wed, 23 Nov 2022 10:48:07 +0000 Subject: [PATCH 36/43] Bump version --- dora/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dora/__init__.py b/dora/__init__.py index d9d15a6..70ad819 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.12a1" +__version__ = "0.1.12a2" # flake8: noqa from .explore import Explorer, Launcher @@ -76,4 +76,3 @@ from .main import argparse_main from .shep import Sheep from .xp import get_xp, is_xp, XP - From 941b0bedcf2ead0c6558b9503ddf68646960b3db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?= Date: Fri, 2 Dec 2022 13:23:12 +0000 Subject: [PATCH 37/43] fixing bug with azure --- CHANGELOG.md | 2 ++ dora/__init__.py | 2 +- dora/shep.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ae1317..5eddb9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). Fixed bug with PL (Thanks @kingjr). +Added support for the Azure cluster (thanks @JadeCopet). + ## [0.1.11] - 2022-09-22 Use job id based seed to avoid systematic failures with port allocation for distributed. 
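The Azure-related fix in the `shep.py` hunk that follows corrects patch 35's GRES request: Slurm's `--gres=gpu:N` is a per-node count, so the job's total GPU count must first be divided by the number of nodes. A small illustrative helper (hypothetical, not part of Dora's API):

```python
def gres_for(total_gpus: int, nodes: int) -> str:
    """Build a Slurm --gres value; GRES counts are per node, not per job."""
    if total_gpus % nodes:
        raise ValueError("total_gpus must divide evenly across nodes")
    return f"gpu:{total_gpus // nodes}"

# e.g. 16 GPUs over 2 nodes -> request 8 GPUs on each node
assert gres_for(16, 2) == "gpu:8"
```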
diff --git a/dora/__init__.py b/dora/__init__.py index 70ad819..8e4d26b 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.12a2" +__version__ = "0.1.12a3" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/shep.py b/dora/shep.py index e5a67c3..25ba19f 100644 --- a/dora/shep.py +++ b/dora/shep.py @@ -277,7 +277,7 @@ def _get_submitit_executor(self, name: str, folder: Path, if mem_per_gpu: mem = slurm_config.mem_per_gpu * gpus_per_node kwargs['mem'] = f"{mem}GB" - kwargs['gres'] = f'gpu:{gpus}' + kwargs['gres'] = f'gpu:{gpus_per_node}' if slurm_config.one_task_per_node: kwargs['ntasks_per_node'] = 1 if slurm_config.cpus_per_task is None: From aadbdfff51a8b95e16da2a8ab5d212f52a33567e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?= Date: Mon, 6 Feb 2023 12:03:44 +0100 Subject: [PATCH 38/43] fix local rank bug --- CHANGELOG.md | 2 ++ dora/__init__.py | 2 +- dora/distrib.py | 5 ++++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5eddb9a..e4c0437 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ Fixed bug with PL (Thanks @kingjr). Added support for the Azure cluster (thanks @JadeCopet). +Fixed local rank bug. + ## [0.1.11] - 2022-09-22 Use job id based seed to avoid systematic failures with port allocation for distributed. diff --git a/dora/__init__.py b/dora/__init__.py index 8e4d26b..ff49420 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.12a3" +__version__ = "0.1.12a4" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/distrib.py b/dora/distrib.py index 5685585..f7b7bcb 100644 --- a/dora/distrib.py +++ b/dora/distrib.py @@ -67,7 +67,10 @@ def get_distrib_spec(): if 'WORLD_SIZE' in os.environ: rank = int(os.environ['RANK']) world_size = int(os.environ['WORLD_SIZE']) - local_rank = rank + if 'LOCAL_RANK' in os.environ: + local_rank = int(os.environ['LOCAL_RANK']) + else: + local_rank = rank node_rank = 0 num_nodes = 1 source = "env" From 36d8b8cd8dd5bf6ad1fdd17ffffd13644e7916ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?= Date: Mon, 20 Feb 2023 16:48:07 +0100 Subject: [PATCH 39/43] avoid using resolve --- CHANGELOG.md | 2 ++ dora/__init__.py | 2 +- dora/git_save.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4c0437..f2fd3c6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ Added support for the Azure cluster (thanks @JadeCopet). Fixed local rank bug. +Minor speed improvement if processing a lot of files with `to_absolute_path`. + ## [0.1.11] - 2022-09-22 Use job id based seed to avoid systematic failures with port allocation for distributed. 
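Patch 38 above stops assuming `local_rank == rank`, which only holds on a single node; launchers such as torchrun export `LOCAL_RANK` separately from the global `RANK`. The same fallback logic as a standalone sketch (assumes `RANK` is set, as in the `WORLD_SIZE` branch of `get_distrib_spec`):

```python
import os

def env_local_rank() -> int:
    # Mirrors the fallback in patch 38: prefer LOCAL_RANK when the launcher
    # provides it; otherwise fall back to RANK, correct only on one node.
    rank = int(os.environ["RANK"])
    return int(os.environ.get("LOCAL_RANK", rank))
```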
diff --git a/dora/__init__.py b/dora/__init__.py index ff49420..150a7d0 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.12a4" +__version__ = "0.1.12a5" # flake8: noqa from .explore import Explorer, Launcher diff --git a/dora/git_save.py b/dora/git_save.py index 1604cd3..828d4d7 100644 --- a/dora/git_save.py +++ b/dora/git_save.py @@ -168,7 +168,8 @@ def to_absolute_path(path: AnyPath) -> AnyPath: try: import hydra.utils except ImportError: - _path = _path.resolve() + if not _path.is_absolute(): + _path = Path(os.getcwd()) / _path else: _path = Path(hydra.utils.to_absolute_path(str(_path))) return klass(_path) From b2e1758576269118e7acfd9ae1066d479d1c1623 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?= Date: Sat, 8 Apr 2023 10:14:19 -0700 Subject: [PATCH 40/43] adding qos --- dora/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dora/conf.py b/dora/conf.py index 9ebf61a..7118823 100644 --- a/dora/conf.py +++ b/dora/conf.py @@ -70,6 +70,7 @@ class SlurmConfig: per node, otherwise, will schedule one task per gpu (default is False). array_parallelism (int): when using job arrays, how many tasks can run in parallel. + qos: (str or None): qos param for slurm. ..warning:: this assumes one task per GPU. Set `one_task_per_node` if you do not want that. @@ -88,6 +89,7 @@ class SlurmConfig: one_task_per_node: bool = False array_parallelism: int = 256 exclude: tp.Optional[str] = None + qos: tp.Optional[str] = None @dataclass From ea32349b90834d683f00fedab0d98a0a3ba6212a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Ramamonjisoa?= Date: Thu, 27 Apr 2023 14:55:16 +0200 Subject: [PATCH 41/43] Add account parameter for slurm --- dora/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dora/conf.py b/dora/conf.py index 7118823..ff76395 100644 --- a/dora/conf.py +++ b/dora/conf.py @@ -71,6 +71,7 @@ class SlurmConfig: array_parallelism (int): when using job arrays, how many tasks can run in parallel. qos: (str or None): qos param for slurm. + account: (str or None): account param for slurm. ..warning:: this assumes one task per GPU. Set `one_task_per_node` if you do not want that. @@ -90,6 +91,7 @@ class SlurmConfig: array_parallelism: int = 256 exclude: tp.Optional[str] = None qos: tp.Optional[str] = None + acccount: tp.Optional[str] = None @dataclass From 8ddd0e5677f410cb1f80cb3a83138fc2c9fe5997 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Ramamonjisoa?= Date: Thu, 27 Apr 2023 14:57:40 +0200 Subject: [PATCH 42/43] Fix typo --- dora/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dora/conf.py b/dora/conf.py index ff76395..ab0611b 100644 --- a/dora/conf.py +++ b/dora/conf.py @@ -91,7 +91,7 @@ class SlurmConfig: array_parallelism: int = 256 exclude: tp.Optional[str] = None qos: tp.Optional[str] = None - acccount: tp.Optional[str] = None + account: tp.Optional[str] = None @dataclass From 53ba0966edcf4d0ec60eae59e0ce86e5c2a424cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20D=C3=A9fossez?= Date: Tue, 23 May 2023 16:27:44 +0200 Subject: [PATCH 43/43] preparing release --- CHANGELOG.md | 4 +++- dora/__init__.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f2fd3c6..20c0e0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
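Regarding patch 39's `git_save.py` change above: `Path.resolve()` stats the filesystem and follows symlinks, which is costly over NFS when many paths are converted, whereas joining a relative path onto the current directory needs no I/O. The equivalent logic as a self-contained sketch:

```python
import os
from pathlib import Path

def absolutize(path: Path) -> Path:
    # Cheaper than Path.resolve(): no filesystem access, no symlink following.
    if path.is_absolute():
        return path
    return Path(os.getcwd()) / path
```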
-## [0.1.12a] - TBD +## [0.1.12] - 2023-05-23 Fixed bug with PL (Thanks @kingjr). @@ -14,6 +14,8 @@ Fixed local rank bug. Minor speed improvement if processing a lot of files with `to_absolute_path`. +Added `qos`, and `account` slurm params. + ## [0.1.11] - 2022-09-22 Use job id based seed to avoid systematic failures with port allocation for distributed. diff --git a/dora/__init__.py b/dora/__init__.py index 150a7d0..4d232fb 100644 --- a/dora/__init__.py +++ b/dora/__init__.py @@ -60,7 +60,7 @@ __pdoc__ = {} __pdoc__['tests'] = False -__version__ = "0.1.12a5" +__version__ = "0.1.12" # flake8: noqa from .explore import Explorer, Launcher
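To round off the series: patches 40-42 introduced the `qos` and `account` Slurm parameters released here in 0.1.12. A hedged usage sketch; the field names match the `SlurmConfig` diffs above, while the concrete QOS and account values are placeholders for whatever your cluster defines:

```python
from dora.conf import SlurmConfig

config = SlurmConfig(
    mem_per_gpu=40,     # in GB, multiplied by GPUs per node in shep.py
    qos="high",         # new in 0.1.12 (patch 40)
    account="my_team",  # new in 0.1.12 (patches 41-42)
)
```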