Commit

Merge branch 'main' of github.com:facebookresearch/dora

adefossez committed Jul 7, 2023
2 parents 33bf261 + 53ba096 commit 5b615d3
Showing 23 changed files with 327 additions and 79 deletions.
1 change: 1 addition & 0 deletions .github/workflows/documentation.yml
@@ -18,6 +18,7 @@ jobs:
pip install '.[dev]'
git config --global user.email "defossez@fb.com"
git config --global user.name "Alexandre Défossez (autodoc)"
export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
- name: Reset branch
33 changes: 32 additions & 1 deletion CHANGELOG.md
@@ -4,7 +4,28 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.1.10a] - ...
## [0.1.12] - 2023-05-23

Fixed a bug with PL (thanks @kingjr).

Added support for the Azure cluster (thanks @JadeCopet).

Fixed a local rank bug.

Minor speed improvement when processing a lot of files with `to_absolute_path`.

Added `qos` and `account` Slurm params.

## [0.1.11] - 2022-09-22

Use a job-id-based seed to avoid systematic failures with port allocation for distributed training.

Removed the automatic export of WORLD_SIZE inside the submitit job target;
use `dora.distrib.set_distrib_env` if you relied on it.

Fixed support for the `version_base` parameter introduced in Hydra.

## [0.1.10] - 2022-06-09

Updated and simplified PyTorch Lightning distributed integration.
Improved overall integration with PL, in particular with PLLogProgress and simplified
@@ -20,6 +41,16 @@ More reliable passing of arguments of Hydra (before, setting None would actually

Allow for empty `mem` constraint in Slurm.

Fixed the `callbacks` default value in PL.

Extra keys in Hydra config files are now allowed (i.e. overrides with `+something=12`).

The package where Dora looks for grids can be customized: in Hydra, set `dora.grid_package` in the base config; otherwise, pass `grid_package='...'` to `argparse_main`.

Better doc for the launcher API.

Fixed dict support with Hydra. Okay, it is time to release a new version now...

## [0.1.9] - 2022-02-28

Reliable rmtree used to avoid `--clear` being blocked by some locking issues on NFS.
41 changes: 40 additions & 1 deletion README.md
@@ -42,7 +42,7 @@ pip install -U dora-search

See [the changelog](CHANGELOG.md) for details on releases.

- TBD: version 0.1.10: adding HiPlot support ! Updated PL support.
- 2022-06-09: version 0.1.10: adding HiPlot support ! Updated PL support, many small fixes.
- 2022-02-28: version 0.1.9
- 2021-12-10: version 0.1.8: see changelog, many small changes.
- 2021-11-08: version 0.1.7: support for job arrays added.
@@ -412,6 +412,41 @@ This will do 3 things:
- A table containing job status and metadata as well as the latest metrics will
be printed every 5 minutes.

### The Launcher API

Here is a more comprehensive description of what the `Launcher` object can do.

- `launcher.bind_(...)`: remembers the given parameters (command-line options for an argparse-based
  project, or overrides for a Hydra-based one) for future scheduling, i.e. all experiments
  later scheduled with that launcher will have those parameters set.
- `sub = launcher.bind(...)`: same as `bind_`, but returns a new "sub" launcher, i.e. the object
  `launcher` is not changed; only experiments scheduled with `sub` will use the given params.
  `sub` also inherits all the params already bound to its parent launcher (i.e. previous calls to `launcher.bind_`).
  Creating a sub-launcher is especially recommended inside loops, to avoid leaking params into the next loop iteration.
- `launcher(...)`: schedules an experiment with the given params, plus all the ones that have
  been aggregated through the various calls to `bind_` and `bind`. This is equivalent to
  `launcher.bind(...)()`.
- `launcher.slurm_(key=value, ...)` and `launcher.slurm(key=value, ...)`: same as `bind_` and `bind`,
  but for the Slurm config (number of GPUs, etc.); a usage sketch follows this list. For the possible options, check out
  [SlurmConfig](https://facebookresearch.github.io/dora/dora/conf.html#dora.conf.SlurmConfig).
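
To make this concrete, here is a minimal, hypothetical sketch of a grid explorer body using those calls. The Slurm options and override keys below are made up for illustration and are not taken from this repository.

```python
def explorer(launcher):
    # In-place variants (`bind_`, `slurm_`) apply to every experiment scheduled below.
    launcher.slurm_(gpus=4, partition='my_partition')  # placeholder Slurm settings
    launcher.bind_({'optim.lr': 1e-4})                  # placeholder override

    for channels in [64, 128, 256]:
        # A sub-launcher avoids leaking `model.channels` into later iterations.
        sub = launcher.bind({'model.channels': channels})
        sub()  # schedule one experiment with everything bound so far
```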


Now let us describe the format for passing parameter overrides or command-line flags to
`launcher.bind_()`, `launcher.bind()` or `launcher()`:

- Simple parameters (i.e. not nested) can be passed as kwargs; for instance, if you have a `--batch_size` flag, you can
  do `launcher.bind(batch_size=64)`.
- Command-line flags can be explicitly passed as a list of strings, for instance `launcher.bind(['--lr=1e-4'])`.
- A dictionary of overrides can be passed, for instance `launcher.bind({'batch_size': 64})`. Note that this
  also allows for nested keys in Hydra: `launcher.bind({'model.channels': 256})`. With Hydra, you can
  also define new keys with `{'+model.activation': 'relu'}`. You cannot remove keys, though.
- Finally, you can combine all of those (for a Hydra project here):

```python
launcher.bind(['optim.lr=1e-4'], {'model.channels': 256, 'seed': 42}, {'+model.activation': 'relu'}, batch_size=64)
```
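
Since bindings accumulate, the combined call above should be equivalent to spreading the same arguments over successive sub-launchers, for instance:

```python
sub = launcher.bind(['optim.lr=1e-4'])
sub = sub.bind({'model.channels': 256, 'seed': 42})
sub = sub.bind({'+model.activation': 'relu'})
sub.bind_(batch_size=64)  # in-place, so `sub` now carries all four bindings
```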


### Flags

The `dora grid` command supports the following flags:
@@ -544,9 +579,13 @@ rules = dora.conf.SubmitRules(retry=True)  # Should we reschedule failed jobs?
# Each sheep has 2 attributes: sheep.xp and sheep.job_id.
sheeps = dora.grid.run_grid(main, explorer, grid_name='jupy', rules=rules, args=args)
args.monitor = True
args.jupyter = True
# The jupyter flag will make the grid API use the display API to clear the cell
# output and update it regularly. The following call will not return until all jobs
# are done or have failed.
# In the following, `grid_name` should be unique. It will be used
# to determine which experiments were previously scheduled with that grid
# and should potentially be cancelled if no longer needed.
dora.grid.run_grid(main, explorer, grid_name='jupy', rules=rules, args=args)
# You can retrieve the short names by using `main.get_names()`
short_names, ref_name = main.get_names([sheep.xp for sheep in sheeps])
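# A hypothetical follow-up (not part of the original snippet): print one line per
# scheduled experiment using the short names computed above.
for name, sheep in zip(short_names, sheeps):
    print(name, sheep.xp.sig, sheep.job_id)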
3 changes: 1 addition & 2 deletions dora/__init__.py
@@ -60,7 +60,7 @@
__pdoc__ = {}
__pdoc__['tests'] = False

__version__ = "0.1.10a8"
__version__ = "0.1.12"

# flake8: noqa
from .explore import Explorer, Launcher
@@ -76,4 +76,3 @@
from .main import argparse_main
from .shep import Sheep
from .xp import get_xp, is_xp, XP

6 changes: 4 additions & 2 deletions dora/__main__.py
@@ -86,8 +86,8 @@ def get_parser():
group.add_argument("-t", "--tail", type=int,
help="Show the log for the job with the given index")

group.add_argument("--init", action='store_true',
help="Init the given XPs so that their signature can be referenced.")
grid.add_argument("--init", action='store_true',
help="Init the given XPs so that their signature can be referenced.")

grid.add_argument(
'grid', nargs='?',
@@ -101,6 +101,8 @@
run = subparsers.add_parser("run", help="Run locally the given command.")
run.add_argument("-f", "--from_sig", help="Signature of job to use as baseline.")
run.add_argument("-d", "--ddp", action="store_true", help="Distributed training.")
run.add_argument("--ddp_workers", type=int,
help="Nb of workers for distributed, default to nb of GPUs.")
run.add_argument("--git_save", action="store_true", default=False,
help="Run from a clean git clone.")
run.add_argument("--clear", action='store_true',
13 changes: 11 additions & 2 deletions dora/conf.py
@@ -70,6 +70,8 @@ class SlurmConfig:
per node, otherwise, will schedule one task per gpu (default is False).
array_parallelism (int): when using job arrays, how many tasks can run
in parallel.
qos (str or None): qos param for Slurm.
account (str or None): account param for Slurm.
..warning:: this assumes one task per GPU.
Set `one_task_per_node` if you do not want that.
@@ -87,6 +89,9 @@
constraint: str = ""
one_task_per_node: bool = False
array_parallelism: int = 256
exclude: tp.Optional[str] = None
qos: tp.Optional[str] = None
account: tp.Optional[str] = None


@dataclass
@@ -141,11 +146,14 @@ class DoraConfig:
shared (Path or None): if provided, the path to a central repository of XPs.
For the moment, this only supports sharing hyper-params, logs etc. will stay
in the per user folder.
grid_package (str or None): if provided, the package in which to look for grids. Defaults
to the package containing the `train.py` module, followed by `.grids`.
"""
dir: Path = Path("./outputs") # where everything will be stored
exclude: tp.List[str] = field(default_factory=list)
git_save: bool = False
shared: tp.Optional[Path] = None # Optional path for shared XPs.
grid_package: tp.Optional[str] = None

# Those are internal config values and are unlikely to be changed
history: str = "history.json" # where metrics will be stored
@@ -154,8 +162,9 @@
shep: ShepConfig = field(default_factory=ShepConfig)
rendezvous_file: str = "rendezvous.txt"
use_rendezvous: bool = False
grids: str = "grids"
codes: str = "codes"
# Filenames used in various places, you shouldn't edit that
_grids: str = "grids"
_codes: str = "codes"

def is_excluded(self, arg_name: str) -> bool:
"""Return True if the given argument name should be excluded from
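For context, the new `exclude`, `qos` and `account` fields behave like any other Slurm option, and `grid_package` like any other Dora config value. A hedged sketch of how one might set them from a grid file (all values below are placeholders):

```python
# Illustrative only: the QOS, account and node names are placeholders.
launcher.slurm_(qos='dev', account='my_team', exclude='node001')
```

For `grid_package`, set `dora.grid_package` in the Hydra base config, or pass `grid_package='my_project.my_grids'` to `argparse_main` (the package name is a placeholder).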
53 changes: 34 additions & 19 deletions dora/distrib.py
@@ -42,7 +42,15 @@ def set_distrib_env():
xp = get_xp()
# Note that running twice the same XP on the same node will crash,
# but that shouldn't really happen
rng = random.Random(int(xp.sig, 16))
seed = xp.sig
# If we are in a Slurm job, let us use the Slurm job id.
try:
env = submitit.JobEnvironment()
except RuntimeError:
pass
else:
seed += env.job_id
rng = random.Random(seed)
master_port = rng.randint(20000, 60000)
os.environ['MASTER_PORT'] = str(master_port)
if 'WORLD_SIZE' not in os.environ:
@@ -56,30 +64,33 @@ def get_distrib_spec():
This can be used even before distributed training is initialized, which is useful for
PytorchLightning for instance.
"""
try:
env = submitit.JobEnvironment()
except RuntimeError:
if 'WORLD_SIZE' in os.environ:
rank = int(os.environ['RANK'])
world_size = int(os.environ['WORLD_SIZE'])
local_rank = rank
node_rank = 0
num_nodes = 1
source = "env"
if 'WORLD_SIZE' in os.environ:
rank = int(os.environ['RANK'])
world_size = int(os.environ['WORLD_SIZE'])
if 'LOCAL_RANK' in os.environ:
local_rank = int(os.environ['LOCAL_RANK'])
else:
local_rank = rank
node_rank = 0
num_nodes = 1
source = "env"
else:
try:
env = submitit.JobEnvironment()
except RuntimeError:
rank = 0
world_size = 1
local_rank = 0
node_rank = 0
num_nodes = 1
source = "empty"
else:
rank = env.global_rank
world_size = env.num_tasks
local_rank = env.local_rank
node_rank = env.node
num_nodes = env.num_nodes
source = "submitit"
else:
rank = env.global_rank
world_size = env.num_tasks
local_rank = env.local_rank
node_rank = env.node
num_nodes = env.num_nodes
source = "submitit"
return DistribSpec(rank, world_size, local_rank, node_rank, num_nodes, source)


@@ -94,7 +105,11 @@ def init(backend='nccl'):
logger.info("world_size is 1, skipping init.")
return
xp = get_xp()
torch.cuda.set_device(spec.local_rank)
if torch.cuda.is_available():
torch.cuda.set_device(spec.local_rank)
else:
assert backend != 'nccl'

if xp.dora.use_rendezvous:
init_method = 'file://' + os.path.abspath(xp.rendezvous_file)
else:
7 changes: 4 additions & 3 deletions dora/executor.py
@@ -11,6 +11,7 @@
import os
import subprocess as sp
import sys
import typing as tp

from .log import simple_log, fatal

@@ -55,10 +56,10 @@ def __exit__(self, exc_type, exc_value, traceback):
log("All workers completed successfully")


def start_ddp_workers(main, argv):
def start_ddp_workers(main, argv, num_workers: tp.Optional[int] = None):
import torch as th

world_size = th.cuda.device_count()
world_size = num_workers or th.cuda.device_count()
if not world_size:
fatal(
"DDP is only available on GPU. Make sure GPUs are properly configured with cuda.")
@@ -71,7 +72,7 @@ def start_ddp_workers(main, argv):
log(f"Starting {world_size} worker processes for DDP.")
with ChildrenManager() as manager:
for rank in range(world_size):
kwargs = {}
kwargs: tp.Dict[str, tp.Any] = {}
env = dict(os.environ)
env['RANK'] = str(rank)
env['WORLD_SIZE'] = str(world_size)
9 changes: 6 additions & 3 deletions dora/git_save.py
@@ -40,7 +40,9 @@ def check_repo_clean(root: Path, main: DecoratedMain):
# Here we try to detect the grids package and allow uncommitted changes
# only to that folder. The rationale is that as we edit the grid file, it is a pain
# to constantly be committing changes to it, and it should not impact the actual run code.
grid_name = main.name + ".grids"
grid_name = main.dora.grid_package
if grid_name is None:
grid_name = main.package + ".grids"
spec = importlib.util.find_spec(grid_name)
grid_path: tp.Optional[Path] = None
if spec is not None:
@@ -102,7 +104,7 @@ def get_new_clone(main: DecoratedMain) -> Path:
source = get_git_root()
commit = get_git_commit()
check_repo_clean(source, main)
codes = main.dora.dir / main.dora.codes
codes = main.dora.dir / main.dora._codes
codes.mkdir(parents=True, exist_ok=True)
target = codes / commit
if not target.exists():
@@ -166,7 +168,8 @@ def to_absolute_path(path: AnyPath) -> AnyPath:
try:
import hydra.utils
except ImportError:
_path = _path.resolve()
if not _path.is_absolute():
_path = Path(os.getcwd()) / _path
else:
_path = Path(hydra.utils.to_absolute_path(str(_path)))
return klass(_path)
24 changes: 15 additions & 9 deletions dora/grid.py
@@ -17,7 +17,6 @@
from functools import partial
import os
from pathlib import Path
import pkgutil
import typing as tp
import shutil
import sys
@@ -91,24 +90,31 @@ class RunGridArgs:

def _get_explore(args, main):
# Finds the explorer.
root_name = main.package + ".grids"
grids = import_or_fatal(root_name)
grid_package = main.dora.grid_package
if grid_package is None:
grid_package = main.package + ".grids"

grids = import_or_fatal(grid_package)

if args.grid is not None:
grid_filename = args.grid.replace('.', '/') + '.py'
grid_file = Path(grids.__file__).parent / grid_filename
if args.grid is None or not grid_file.exists():
candidates = []
for info in pkgutil.walk_packages([Path(grids.__file__).parent]):
if not info.name.startswith('_'):
candidates.append(info.name)
pkg_root = Path(grids.__file__).parent
for root, folders, files in os.walk(pkg_root):
for file in files:
fullpath = (Path(root) / file).relative_to(pkg_root)
if fullpath.name.endswith('.py') and not fullpath.name.startswith('_'):
fullpath = fullpath.parent / fullpath.stem
candidates.append(str(fullpath).replace('/', '.'))
if args.grid is not None and not grid_file.exists():
log(f'No grid file {grid_filename} in package {root_name}. '
log(f'No grid file {grid_filename} in package {grid_package}. '
'Maybe you made a typo?')
log(f"Potential grids are: {', '.join(candidates)}")
sys.exit(0)

grid_name = root_name + "." + args.grid
grid_name = grid_package + "." + args.grid
grid = import_or_fatal(grid_name)

try:
@@ -157,7 +163,7 @@ def run_grid(main: DecoratedMain, explorer: Explorer, grid_name: str,
if slurm is None:
slurm = main.get_slurm_config()

grid_folder = main.dora.dir / main.dora.grids / grid_name
grid_folder = main.dora.dir / main.dora._grids / grid_name
grid_folder.mkdir(exist_ok=True, parents=True)

herd = Herd()