bump lightning dev sha, set max numpy req in examples extra for pytorch `2.2` compatibility, adjust model parallel import encapsulation testing, update fts_superglue_nb example, refine torchtitan_llama customization, remove deprecated pkg_resources references, change dynamo testing to use python 3.12, prune no-longer-required warnings
speediedan committed Oct 13, 2024
1 parent d65f077 commit f803064
Showing 14 changed files with 67 additions and 83 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -149,7 +149,7 @@ To ensure maximum stability, the latest Lightning patch release fully tested wit
| :---------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Linux \[GPUs\*\*\] | - | [![Build Status](https://dev.azure.com//speediedan/finetuning-scheduler/_apis/build/status/Multi-GPU%20&%20Example%20Tests?branchName=main)](https://dev.azure.com/speediedan/finetuning-scheduler/_build/latest?definitionId=1&branchName=main) |
| Linux (Ubuntu 22.04) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) |
| OSX (11) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) |
| OSX (14) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) |
| Windows (2022) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) |

- \*\* tests run on one RTX 4090 and one RTX 2070
10 changes: 9 additions & 1 deletion pyproject.toml
@@ -81,7 +81,15 @@ norecursedirs = [
"build",
"docs",
]
addopts = "--strict-markers --doctest-modules --color=yes --disable-pytest-warnings --ignore-glob='src/fts_examples/ipynb_src/*.py' --ignore='.actions/assistant.py'"

# two sets of default options to allow periodic warnings pruning
################################################################
addopts = """--strict-markers --doctest-modules --color=yes --ignore-glob='src/fts_examples/ipynb_src/*.py'
--ignore='.actions/assistant.py' --disable-pytest-warnings""" # comment during warnings pruning, otherwise uncomment
# addopts = """--strict-markers --doctest-modules --color=yes --ignore-glob='src/fts_examples/ipynb_src/*.py'
# --ignore='.actions/assistant.py'""" # uncomment during warnings pruning, otherwise comment
################################################################

junit_duration_report = "call"

[tool.jupytext]
2 changes: 1 addition & 1 deletion requirements/base.txt
@@ -1,4 +1,4 @@
#lightning>=2.5.0,<2.5.1
# the below is uncommented when master is targeting a specific pl dev master commit
git+https://github.com/Lightning-AI/lightning.git@1551a16b94f5234a4a78801098f64d0732ef5cb5#egg=lightning
git+https://github.com/Lightning-AI/lightning.git@8ad3e29816a63d8ce5c00ac104b14729a4176f4f#egg=lightning
torch>=2.2.0
1 change: 1 addition & 0 deletions requirements/examples.txt
@@ -6,3 +6,4 @@ sentencepiece
tensorboardX>=2.2
tabulate
psutil
numpy<2.0 # to avoid issues with oldest supported pytorch (2.2)
2 changes: 1 addition & 1 deletion requirements/standalone_base.txt
@@ -1,4 +1,4 @@
#pytorch-lightning>=2.5.0,<2.5.1
# the below is uncommented when master is targeting a specific pl dev master commit
git+https://github.com/Lightning-AI/pytorch-lightning.git@1551a16b94f5234a4a78801098f64d0732ef5cb5#egg=pytorch-lightning
git+https://github.com/Lightning-AI/pytorch-lightning.git@8ad3e29816a63d8ce5c00ac104b14729a4176f4f#egg=pytorch-lightning
torch>=2.2.0
2 changes: 1 addition & 1 deletion setup.py
@@ -138,7 +138,7 @@ def _setup_args(standalone: bool = False) -> Dict[str, Any]:
_INSTALL_PATHS["require"],
file_name=base_reqs,
standalone=standalone,
pl_commit="1551a16b94f5234a4a78801098f64d0732ef5cb5",
pl_commit="8ad3e29816a63d8ce5c00ac104b14729a4176f4f",
)
base_setup["install_requires"] = install_requires
return base_setup
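
For context, the `pl_commit` value above ultimately ends up in a `git+https` pin like the requirements-file lines earlier in this diff. A tiny, hypothetical illustration of composing such a pin (the helper name is made up and is not part of setup.py):

```python
def pin_to_commit(org_repo: str, pkg: str, commit_sha: str) -> str:
    """Compose a PEP 508 direct-reference requirement pinning `pkg` to an exact git commit."""
    return f"{pkg} @ git+https://github.com/{org_repo}.git@{commit_sha}"


print(pin_to_commit("Lightning-AI/pytorch-lightning", "pytorch-lightning",
                    "8ad3e29816a63d8ce5c00ac104b14729a4176f4f"))
# pytorch-lightning @ git+https://github.com/Lightning-AI/pytorch-lightning.git@8ad3e298...
```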
2 changes: 1 addition & 1 deletion src/finetuning_scheduler/strategy_adapters/_mp_imports.py
@@ -25,4 +25,4 @@
"RowwiseParallel", "SequenceParallel", "implicit_replication", "parallelize_module", "loss_parallel",
"FSDPModule", "fully_shard", "checkpoint", "checkpoint_wrapper", "offload_wrapper", "ActivationWrapper",
"CPUOffloadPolicy", "sdpa_kernel", "FSDPMemTracker"]:
globals()[mp_obj] = None
globals()[mp_obj] = object
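
The `None` → `object` change above matters because code importing these names may subclass them or pass them to `isinstance`/`issubclass` even when the model-parallel APIs are unavailable; `object` is an inert stand-in that keeps those paths working, while `None` raises. A minimal sketch of the guarded-import pattern (the symbol names and import path are illustrative and vary across PyTorch releases, not the exact FTS internals):

```python
# Illustrative guarded-import module: bind unavailable optional symbols to `object`
# rather than `None` so importing modules can still subclass or issubclass-check them.
try:
    # the exact location of the composable FSDP APIs differs across PyTorch versions
    from torch.distributed._composable.fsdp import FSDPModule, fully_shard
    _MODEL_PARALLEL_AVAILABLE = True
except ImportError:
    _MODEL_PARALLEL_AVAILABLE = False
    for _mp_obj in ("FSDPModule", "fully_shard"):
        globals()[_mp_obj] = object  # `None` here would break `issubclass(x, FSDPModule)` checks
```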
2 changes: 2 additions & 0 deletions src/fts_examples/ipynb_src/fts_superglue_nb.py
@@ -147,6 +147,8 @@
#
# - ``ddp`` (and aliases ``ddp_find_unused_parameters_false``, ``ddp_find_unused_parameters_true``, ``ddp_spawn``, ``ddp_fork``, ``ddp_notebook``)
# - ``fsdp`` (and alias ``fsdp_cpu_offload``)
# - **NEW** ``ModelParallelStrategy`` (enabling use of PyTorch's composable distributed (e.g. ``fully_shard``, ``checkpoint``) and Tensor Parallelism (TP) APIs)
# - [See this example](https://finetuning-scheduler.readthedocs.io/en/latest/distributed/model_parallel_scheduled_fine_tuning.html)
#
# Custom or officially unsupported strategies can be used by setting [FinetuningScheduler.allow_untested](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.fts.html?highlight=allow_untested#finetuning_scheduler.fts.FinetuningScheduler.params.allow_untested) to ``True``.
# Note that most currently unsupported strategies are so because they require varying degrees of modification to be compatible. For example, ``deepspeed`` will require a [StrategyAdapter](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.strategy_adapters.html#finetuning_scheduler.strategy_adapters.StrategyAdapter) to be written (similar to the one for ``FSDP``, [FSDPStrategyAdapter](https://finetuning-scheduler.readthedocs.io/en/stable/api/finetuning_scheduler.strategy_adapters.html#finetuning_scheduler.strategy_adapters.FSDPStrategyAdapter)) before support can be added (PRs welcome!),
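
As a quick usage sketch of the points above (assuming `finetuning-scheduler` and a Lightning release providing `ModelParallelStrategy` are installed; the commented Trainer wiring needs multiple devices and a real LightningModule, both placeholders here):

```python
from finetuning_scheduler import FinetuningScheduler

# Default configuration (implicit schedule) with one of the supported strategies above:
fts = FinetuningScheduler()

# Opting into a custom or officially unsupported strategy instead:
fts_untested = FinetuningScheduler(allow_untested=True)

# Wiring the callback into a Trainer with the newly supported ModelParallelStrategy
# (commented out since it requires multiple devices and your own LightningModule):
# import lightning.pytorch as pl
# from lightning.pytorch.strategies import ModelParallelStrategy
# trainer = pl.Trainer(strategy=ModelParallelStrategy(), devices=2, callbacks=[fts])
# trainer.fit(MyLitModule())
```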
12 changes: 2 additions & 10 deletions src/fts_examples/model_parallel/torchtitan_llama.py
@@ -11,7 +11,8 @@
# The only changes are:
# - we fix use of `RMSNorm` rather than parameterizing the `norm_type` to avoid the need to import another
# torchtitan module (`torchtitan.models.norms`: https://bit.ly/torchtitan_norms_module)
# - we add `reset_parameters` methods that invoke `init_weights` for more convenient deferred materialization
# - we add a `reset_parameters` method to `Transformer` that invokes `init_weights`, for more convenient deferred
# materialization (required for the `freqs_cis` buffer)

from dataclasses import dataclass
from typing import Optional, Tuple
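
The new comment above refers to deferred (meta-device) materialization: when a model is constructed on the `meta` device and materialized later, the materialization flow conventionally calls each module's `reset_parameters()`, so buffers computed in `__init__` (like `freqs_cis`) must be recomputable there. A rough, self-contained sketch of the pattern (illustrative only, not the torchtitan code):

```python
import torch
import torch.nn as nn


class TinyRoPEBlock(nn.Module):
    """Illustrative module with a derived buffer that must be rebuilt on materialization."""

    def __init__(self, dim: int = 8, seq_len: int = 16):
        super().__init__()
        self.proj = nn.Linear(dim, dim, bias=False)
        self.register_buffer("freqs_cis", self._compute_freqs(dim, seq_len), persistent=False)

    @staticmethod
    def _compute_freqs(dim: int, seq_len: int) -> torch.Tensor:
        freqs = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        angles = torch.outer(torch.arange(seq_len, dtype=torch.float32), freqs)
        return torch.stack((angles.cos(), angles.sin()), dim=-1)

    def init_weights(self) -> None:
        nn.init.trunc_normal_(self.proj.weight, std=0.02)
        # buffers created in __init__ hold uninitialized memory after to_empty(), so recompute them
        self.freqs_cis = self._compute_freqs(self.proj.in_features, self.freqs_cis.shape[0])

    def reset_parameters(self) -> None:
        # hooks into deferred-materialization flows that call module.reset_parameters()
        self.init_weights()


# deferred materialization: construct on the meta device, then allocate and (re)initialize
with torch.device("meta"):
    block = TinyRoPEBlock()
block = block.to_empty(device="cpu")
block.reset_parameters()
```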
@@ -191,9 +192,6 @@ def __init__(self, model_args: ModelCfg):
model_args.n_heads * self.head_dim, model_args.dim, bias=False
)

def reset_parameters(self):
self.init_weights()

def init_weights(self, init_std: float):
for linear in (self.wq, self.wk, self.wv):
nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02)
@@ -278,9 +276,6 @@ def __init__(
def forward(self, x):
return self.w2(F.silu(self.w1(x)) * self.w3(x))

def reset_parameters(self):
self.init_weights()

def init_weights(self, init_std: float):
nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02)
for linear in (self.w2, self.w3):
@@ -345,9 +340,6 @@ def forward(
out = h + self.feed_forward(self.ffn_norm(h))
return out

def reset_parameters(self):
self.init_weights()

def init_weights(self):
for norm in (self.attention_norm, self.ffn_norm):
norm.reset_parameters()
11 changes: 4 additions & 7 deletions src/fts_examples/test_examples.py
@@ -14,12 +14,12 @@
from unittest import mock
from copy import copy
from itertools import chain
from packaging.version import Version
import importlib.metadata as metadata
import re

import pytest
from lightning.pytorch.callbacks import ModelCheckpoint
from packaging.version import Version
from pkg_resources import get_distribution

from fts_examples import _HF_AVAILABLE
from tests.test_model_parallel import MODEL_PARALLEL_BASE_WARNS
@@ -64,11 +64,8 @@
MIN_VERSION_WARNS = "2.2"
MAX_VERSION_WARNS = "2.5"
# torch version-specific warns go here
EXPECTED_VERSION_WARNS = {MIN_VERSION_WARNS: [],
MAX_VERSION_WARNS: [
'PairwiseParallel is deprecated and will be removed soon.',
]}
torch_version = get_distribution("torch").version
EXPECTED_VERSION_WARNS = {MIN_VERSION_WARNS: [], MAX_VERSION_WARNS:[] }
torch_version = metadata.distribution('torch').version
extended_torch_ver = EXTENDED_VER_PAT.match(torch_version).group() or torch_version
if Version(extended_torch_ver) < Version(MAX_VERSION_WARNS):
EXPECTED_WARNS.extend(EXPECTED_VERSION_WARNS[MIN_VERSION_WARNS])
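
The `pkg_resources.get_distribution` → `importlib.metadata.distribution` swap above is the standard replacement for the deprecated setuptools API. A condensed sketch of the version gating it feeds (constants mirror those above; assumes `torch` and `packaging` are installed, and uses an explicit `None` check rather than the `or` shortcut):

```python
import importlib.metadata as metadata
import re

from packaging.version import Version

EXTENDED_VER_PAT = re.compile(r"([0-9]+\.){2}[0-9]+")
MAX_VERSION_WARNS = "2.5"

torch_version = metadata.distribution("torch").version  # e.g. "2.6.0.dev20241013+cu124"
match = EXTENDED_VER_PAT.match(torch_version)
extended_torch_ver = match.group() if match else torch_version  # drop dev/local suffixes

if Version(extended_torch_ver) < Version(MAX_VERSION_WARNS):
    # extend the expected-warnings list with entries expected only on older torch versions
    pass
```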
15 changes: 8 additions & 7 deletions tests/helpers/runif.py
@@ -14,13 +14,14 @@
import re
import sys
from typing import Optional, Set, Union
from packaging.version import Version
import importlib.metadata as metadata

import pytest
import torch
from lightning.fabric.accelerators.cuda import num_cuda_devices
from lightning.pytorch.strategies.deepspeed import _DEEPSPEED_AVAILABLE
from packaging.version import Version
from pkg_resources import get_distribution

from fts_examples.patching.dep_patch_shim import ExpPatch, _ACTIVE_PATCHES

EXTENDED_VER_PAT = re.compile(r"([0-9]+\.){2}[0-9]+")
@@ -30,9 +31,9 @@
"min2_5": {"min_torch": "2.5.0"},
"alone": {"standalone": True},
"bf16_alone": {"bf16_cuda": True, "standalone": True},
"min2_2": {"min_torch": "2.2.0"},
"max3_12_min2_3": {"max_python": "3.12", "min_torch": "2.3.0"},
"max3_12_min2_2": {"max_python": "3.12", "min_torch": "2.2.0"},
#"min2_2": {"min_torch": "2.2.0"},
#"max3_12_min2_3": {"max_python": "3.12", "min_torch": "2.3.0"},
#"max3_12_min2_2": {"max_python": "3.12", "min_torch": "2.2.0"},
"einsum_exp": {"exp_patch": {ExpPatch.EINSUM_STRATEGIES}},
}

@@ -92,13 +93,13 @@ def __new__(
kwargs["min_cuda_gpus"] = True

if min_torch:
torch_version = get_distribution("torch").version
torch_version = metadata.distribution('torch').version
extended_torch_ver = EXTENDED_VER_PAT.match(torch_version).group() or torch_version
conditions.append(Version(extended_torch_ver) < Version(min_torch))
reasons.append(f"torch>={min_torch}, {extended_torch_ver} installed.")

if max_torch:
torch_version = get_distribution("torch").version
torch_version = metadata.distribution('torch').version
extended_torch_ver = EXTENDED_VER_PAT.match(torch_version).group() or torch_version
conditions.append(Version(extended_torch_ver) > Version(max_torch))
reasons.append(f"torch<={max_torch}, {extended_torch_ver} installed.")
36 changes: 7 additions & 29 deletions tests/test_finetuning_scheduler_callback.py
@@ -1248,7 +1248,7 @@ def test_fts_enforce_p0(tmpdir, init_lr_key, p0_params):
}


@RunIf(skip_windows=True, skip_mac_os=True, min_torch="2.2.0", max_python="3.12")
@RunIf(skip_windows=True, skip_mac_os=True, min_torch="2.4.0", min_python="3.12")
def test_fts_dynamo_enforce_p0(tmpdir, boring_ft_schedule):
"""Inspect the scheduled fine-tuning training path in the context of dynamo to ensure thawing schedule phase 0
is enforced."""
@@ -1354,11 +1354,11 @@ def test_fts_decay(tmpdir, boring_ft_schedule, explicit_mode: bool, nodecay_mode
"`max_epochs` was not", # required for all PyTorch/Lightning versions
"The dirpath has changed from", # required for all PyTorch/Lightning versions
# allowing below until https://github.com/pytorch/pytorch/pull/123619 is resolved wrt `ZeroRedundancyOptimizer`
"TorchScript support for functional optimizers is",
# "TorchScript support for functional optimizers is", # suppressed, can delete with next push
# required w/ PT 2.4 (until Lightning changes `weights_only` default value or offers a way to override it)
"You are using `torch.load` with `weights_only=False`",
# "You are using `torch.load` with `weights_only=False`", # can delete with next push
# still required for pytorch 2.1
"Conversion of an array with ndim > 0"
#"Conversion of an array with ndim > 0" # can delete with next push

]
EXPECTED_DIRPATH = "is not empty."
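
The warning strings above (and the pruned ones left as comments) are matched against warnings captured during test runs; the matching is essentially substring containment, along the lines of this hypothetical helper (not the repo's actual implementation):

```python
import warnings


def unmatched_warns(captured, expected_substrings):
    """Return captured warning messages not covered by any expected substring."""
    return [
        str(w.message)
        for w in captured
        if not any(exp in str(w.message) for exp in expected_substrings)
    ]


with warnings.catch_warnings(record=True) as captured:
    warnings.simplefilter("always")
    warnings.warn("You are using `torch.load` with `weights_only=False`")
    warnings.warn("some brand new warning")

assert unmatched_warns(captured, ["`weights_only=False`"]) == ["some brand new warning"]
```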
@@ -1440,7 +1440,7 @@ def test_fts_callback_resume(tmpdir, ckpt_set, recwarn, diff_dirpath: bool, trai
EXPECTED_CKPT_WARNS = ["Be aware that when using `ckpt_path`, callbacks"]


@RunIf(skip_windows=True, skip_mac_os=True, min_torch="2.2.0", max_python="3.12")
@RunIf(skip_windows=True, skip_mac_os=True, min_torch="2.4.0", min_python="3.12")
def test_fts_dynamo_resume(tmpdir, ckpt_set, boring_ft_schedule, recwarn):
"""Validate scheduled fine-tuning resumption functions as expected with a default dynamo configuration."""
resume_warns = copy(EXPECTED_WARNS) + copy(DYNAMO_EXPECTED_WARNS) + copy(EXPECTED_CKPT_WARNS) + [EXPECTED_DIRPATH]
@@ -1551,7 +1551,7 @@ def test_fts_intrafit(tmpdir, restore_best: bool):
}


@RunIf(skip_windows=True, skip_mac_os=True, min_torch="2.2.0", max_python="3.12")
@RunIf(skip_windows=True, skip_mac_os=True, min_torch="2.4.0", min_python="3.12")
@pytest.mark.parametrize("restore_best", [True, False], ids=["default", "norestorebest"])
def test_fts_dynamo_intrafit(tmpdir, boring_ft_schedule, restore_best: bool):
"""Inspect scheduled fine-tuning state within the training process to ensure it is taking the expected path in
@@ -2838,7 +2838,7 @@ def test_fts_multi_ddp(tmpdir, boring_ft_schedule, explicit_mode):
assert finetuningscheduler_callback.curr_depth == finetuningscheduler_callback.max_depth


@RunIf(standalone=True, min_cuda_gpus=2, skip_windows=True, skip_mac_os=True, min_torch="2.2.0", max_python="3.12")
@RunIf(standalone=True, min_cuda_gpus=2, skip_windows=True, skip_mac_os=True, min_torch="2.4.0", min_python="3.12")
def test_fts_multi_ddp_dynamo(tmpdir, boring_ft_schedule):
"""Validate :class:`~finetuning_scheduler.FinetuningScheduler` functions properly in a supported 'ddp'
distributed context with default dynamo usage."""
@@ -2882,25 +2882,3 @@ def test_fts_multi_ddp_fork(tmpdir):
trainer = Trainer(default_root_dir=tmpdir, callbacks=callbacks, strategy="ddp_fork", devices=2)
trainer.fit(model)
assert trainer.callback_metrics["val_loss"] < 0.1


# @RunIf(standalone=False, min_cuda_gpus=2, min_torch="2.4.0")
# @pytest.mark.parametrize("explicit_mode", [True, False], ids=["explicit", "implicit"])
# def test_fts_multi_model_parallel(tmpdir, boring_ft_schedule, explicit_mode):
# """Validate :class:`~finetuning_scheduler.FinetuningScheduler` functions properly in a supported 'ddp'
# distributed context."""
# seed_everything(42)
# ft_schedule = boring_ft_schedule[12] if explicit_mode else None
# #expected_depth = 2 if explicit_mode else 3
# callbacks = [FinetuningScheduler(ft_schedule=ft_schedule), FTSEarlyStopping(monitor="val_loss", patience=1)]
# strategy = ModelParallelStrategy()
# trainer = Trainer(default_root_dir=tmpdir, callbacks=callbacks, strategy=strategy, devices=2,
# num_sanity_val_steps=0)
# finetuningscheduler_callback = get_fts(trainer)
# #with trainer.init_module(empty_init=True):
# model = FTSModelParallelBoringModel()
# trainer.fit(model)
# pass
#assert finetuningscheduler_callback.depth_remaining == 0
#assert finetuningscheduler_callback.curr_depth == expected_depth
#assert finetuningscheduler_callback.curr_depth == finetuningscheduler_callback.max_depth
15 changes: 5 additions & 10 deletions tests/test_fsdp.py
@@ -49,27 +49,24 @@
CheckpointImpl,
)
from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload, FullyShardedDataParallel, MixedPrecision
from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, wrap
from torch.distributed.fsdp.wrap import wrap
else:
FullyShardedDataParallel = None # type: ignore[misc,assignment]
MixedPrecision = None # type: ignore[misc,assignment]
BackwardPrefetch = None # type: ignore[misc,assignment]
CPUOffload = None # type: ignore[misc,assignment]
size_based_auto_wrap_policy = object
wrap = object

from torch.distributed.fsdp.wrap import CustomPolicy

DISABLE_USE_ORIG = {"use_orig_params": False}
#_FSDPPolicy = object


additional_fsdp_warns = [
"The number of training batches", # minimizing cost of training for these tests
"Please use torch.distributed.all_gather_into_tensor", # still required for PyTorch/Lightning <=2.1
"Please use torch.distributed.reduce_scatter_tensor", # still required for PyTorch/Lightning <=2.1
# "Please use torch.distributed.all_gather_into_tensor", # can delete with next push
# "Please use torch.distributed.reduce_scatter_tensor", # can delete with next push
"when logging on epoch level in distributed", # validating FTS handling in this scenario
"torch.cpu.amp.autocast", # required as of PT 2.4
# "torch.cpu.amp.autocast", # can delete with next push
"FSDP.state_dict_type", # temporarily required until Lightning uses new FSDP state dict API with PT 2.4
"of Tensor.pin_memory", # required as of PT 2.5 nightly for FSDP1 `_flat_param` internal usage
"Tensor.is_pinned", # required as of PT 2.5 nightly for FSDP1 `_flat_param` internal usage
@@ -596,9 +593,7 @@ def warn_custom_auto_wrap_policy(


# RunIf aliases
runif_map = {
"min2_2": {"min_torch": "2.2.0"},
}
runif_map = {} # none currently necessary

# auto-wrap policy aliases
cust_awp = custom_auto_wrap_policy