diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index e490b99..415d9a0 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -11,7 +11,7 @@ trigger: include: - "main" - "release/*" - - "model_parallel_exp_support" # temporarily add for new test infra enhancement validation + # - "model_parallel_exp_support" # temporarily add for specific feature branch validation - "refs/tags/*" paths: include: @@ -46,7 +46,7 @@ jobs: strategy: matrix: 'PyTorch | latest': - image: "speediedan/finetuning-scheduler:py3.12-pt2.5.1-pl2.5-azpl-init" + image: "speediedan/finetuning-scheduler:py3.12-pt2.6.0-pl2.6-azpl-init" scope: "" # how long to run the job before automatically cancelling timeoutInMinutes: "100" diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 626cec1..de095d6 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -39,14 +39,14 @@ python collect_env_details.py You can also fill out the list below manually. 
--> -- Fine-Tuning Scheduler Version (e.g., 2.5.0): -- Lightning Version (e.g., 2.5.0): -- PyTorch Version (e.g., 2.5.1): +- Fine-Tuning Scheduler Version (e.g., 2.6.0): +- Lightning Version (e.g., 2.6.0): +- PyTorch Version (e.g., 2.6.0): - Python version (e.g., 3.12): - OS (e.g., Linux): - CUDA/cuDNN version: - GPU models and configuration: -- How you installed PyTorch (`conda`, `pip`, source): +- How you installed PyTorch (`pip`, source): - If compiling from source, the output of `torch.__config__.show()`: - Any other relevant information: diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index cc77c44..a9a4d79 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -31,9 +31,9 @@ jobs: matrix: # initially building only the latest supported configuration python_version: ["3.12"] - pytorch_version: ["2.5.1"] - cust_base: ["cu12.4.0-"] - pl_version: ["2.5"] + pytorch_version: ["2.6.0"] + cust_base: ["cu12.6.2-"] + pl_version: ["2.6"] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index b4b683c..fe3eb35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+## [2.6.0] - 2024-XX-XX + +### Added + +- Support for Lightning and PyTorch ``2.6.0`` + +### Deprecated + +- removed support for PyTorch `2.2` +- removed use of conda builds (aligning with upstream PyTorch) + ## [2.5.0] - 2024-XX-XX ### Added diff --git a/CITATION.cff b/CITATION.cff index f132fb0..fce2bb5 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -6,7 +6,7 @@ date-released: 2022-02-04 authors: - family-names: "Dale" given-names: "Dan" -version: 2.5.0 +version: 2.6.0 identifiers: - description: "Fine-Tuning Scheduler (all versions)" type: doi diff --git a/README.md b/README.md index d9d4599..21780a6 100644 --- a/README.md +++ b/README.md @@ -145,7 +145,7 @@ To ensure maximum stability, the latest Lightning patch release fully tested wit
Current build statuses for Fine-Tuning Scheduler -| System / (PyTorch/Python ver) | 2.2.2/3.9 | 2.5.1/3.9, 2.5.1/3.12 | +| System / (PyTorch/Python ver) | 2.3.1/3.9 | 2.6.0/3.9, 2.6.0/3.12 | | :---------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | Linux \[GPUs\*\*\] | - | [![Build Status](https://dev.azure.com//speediedan/finetuning-scheduler/_apis/build/status/Multi-GPU%20&%20Example%20Tests?branchName=main)](https://dev.azure.com/speediedan/finetuning-scheduler/_build/latest?definitionId=1&branchName=main) | | Linux (Ubuntu 22.04) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) | [![Test](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml/badge.svg?branch=main&event=push)](https://github.com/speediedan/finetuning-scheduler/actions/workflows/ci_test-full.yml) | diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index b07aa81..d8577a2 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -11,13 +11,13 @@ # limitations under the License. 
# initially based on https://bit.ly/3pdAf1G -ARG CUDA_VERSION=12.4.0 +ARG CUDA_VERSION=12.6.2 ARG OS_VER=ubuntu22.04 FROM nvidia/cuda:${CUDA_VERSION}-devel-${OS_VER} ARG PYTHON_VERSION=3.12 -ARG PYTORCH_VERSION=2.5.1 +ARG PYTORCH_VERSION=2.6.0 ARG CUST_BUILD=0 ARG MKL_THREADING_LAYER=GNU @@ -85,13 +85,13 @@ RUN \ else \ # or target a specific cuda build, by specifying a particular index url w/... # ... default channel - pip install torch torchvision --index-url https://download.pytorch.org/whl/cu124; \ + #pip install torch torchvision --index-url https://download.pytorch.org/whl/cu126; \ # ... pytorch patch version # pip install torch==1.11.1+cu113 torchvision==0.11.3+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html; \ # ... pytorch nightly dev version - #pip install --pre torch==2.5.0.dev20240827 torchvision==0.20.0.dev20240827 --index-url https://download.pytorch.org/whl/nightly/cu124; \ + pip install --pre torch==2.6.0.dev20241121 torchvision==0.20.0.dev20241121 --index-url https://download.pytorch.org/whl/nightly/cu126; \ # ... test channel - #pip install --pre torch==2.5.0 torchvision --index-url https://download.pytorch.org/whl/test/cu124; \ + #pip install --pre torch==2.6.0 torchvision --index-url https://download.pytorch.org/whl/test/cu126; \ fi && \ # Install all requirements pip install -r requirements/devel.txt --no-cache-dir && \ diff --git a/dockers/build_image_version.sh b/dockers/build_image_version.sh index b8eabfc..69bb5df 100755 --- a/dockers/build_image_version.sh +++ b/dockers/build_image_version.sh @@ -19,3 +19,9 @@ build_version(){ --build-arg PYTORCH_VERSION=${iv_ref["pytorch"]} --no-cache . 
>> $docker_build_log docker tag ${azpl_name} ${registry_name}:${azpl_name} >> $docker_build_log } + +maybe_deactivate(){ + if [ -n "$VIRTUAL_ENV" ]; then + deactivate + fi +} diff --git a/dockers/docker_images_main.sh b/dockers/docker_images_main.sh index fa2fa15..d64e515 100755 --- a/dockers/docker_images_main.sh +++ b/dockers/docker_images_main.sh @@ -12,14 +12,12 @@ registry_name=$2 build_new="${3:-1}" push_remote="${4:-1}" -eval "$(conda shell.bash hook)" # setup shell functions for conda, uses conda's .bashrc resident defined hook to execute conda init setup to enable subsequent conda command usage -conda deactivate +maybe_deactivate d=`date +%Y%m%d%H%M%S` tmp_docker_build_log_dir="/tmp" docker_build_log="${tmp_docker_build_log_dir}/fts_update_docker_main_images_${d}.log" - maybe_push(){ if [[ $push_remote -ne 0 ]]; then echo "Beginning upload of built images..." >> $docker_build_log @@ -43,7 +41,8 @@ maybe_build(){ build_eval(){ # latest PyTorch image supported by release - declare -A iv=(["cuda"]="12.4.0" ["python"]="3.12" ["pytorch"]="2.5.1" ["lightning"]="2.5" ["cust_build"]="1") + # see CUDA_ARCHES_FULL_VERSION for the full version of the pytorch-provided toolkit + declare -A iv=(["cuda"]="12.6.2" ["python"]="3.12" ["pytorch"]="2.6.0" ["lightning"]="2.6" ["cust_build"]="1") export latest_pt="base-cu${iv["cuda"]}-py${iv["python"]}-pt${iv["pytorch"]}-pl${iv["lightning"]}" export latest_azpl="py${iv["python"]}-pt${iv["pytorch"]}-pl${iv["lightning"]}-azpl-init" maybe_build iv "${latest_pt}" "${latest_azpl}" diff --git a/dockers/docker_images_release.sh b/dockers/docker_images_release.sh index 4cf8353..4857b0f 100755 --- a/dockers/docker_images_release.sh +++ b/dockers/docker_images_release.sh @@ -12,10 +12,7 @@ registry_name=$2 build_new="${3:-1}" push_remote="${4:-1}" -# setup shell functions for conda, uses conda's .bashrc resident defined hook to execute conda init setup to enable -# subsequent conda command usage -eval "$(conda shell.bash hook)" -conda 
deactivate +maybe_deactivate d=`date +%Y%m%d%H%M%S` tmp_docker_build_log_dir="/tmp" @@ -44,7 +41,7 @@ maybe_build(){ build_eval(){ # latest PyTorch image supported by release - declare -A iv=(["cuda"]="12.4.0" ["python"]="3.12" ["pytorch"]="2.5.1" ["lightning"]="2.5" ["cust_build"]="0") + declare -A iv=(["cuda"]="12.6.2" ["python"]="3.12" ["pytorch"]="2.6.0" ["lightning"]="2.6" ["cust_build"]="0") export latest_pt="base-cu${iv["cuda"]}-py${iv["python"]}-pt${iv["pytorch"]}-pl${iv["lightning"]}" export latest_azpl="py${iv["python"]}-pt${iv["pytorch"]}-pl${iv["lightning"]}-azpl-init" maybe_build iv "${latest_pt}" "${latest_azpl}" diff --git a/dockers/fts-az-base/Dockerfile b/dockers/fts-az-base/Dockerfile index a26cfc9..32ec9a8 100644 --- a/dockers/fts-az-base/Dockerfile +++ b/dockers/fts-az-base/Dockerfile @@ -11,8 +11,8 @@ # limitations under the License. ARG PYTHON_VERSION=3.12 -ARG PYTORCH_VERSION=2.5.1 -ARG LIGHTNING_VERSION=2.5 +ARG PYTORCH_VERSION=2.6.0 +ARG LIGHTNING_VERSION=2.6 ARG CUST_BASE FROM speediedan/finetuning-scheduler:base-${CUST_BASE}py${PYTHON_VERSION}-pt${PYTORCH_VERSION}-pl${LIGHTNING_VERSION} diff --git a/dockers/release-conda/Dockerfile b/dockers/release-conda/Dockerfile deleted file mode 100644 index d49b307..0000000 --- a/dockers/release-conda/Dockerfile +++ /dev/null @@ -1,88 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -## NB: This image is no longer actively maintained and kept here only as a reference for users. 
- -ARG CUDA_VERSION=11.8 - -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 - -ARG CUDATOOLKIT_VERSION=12.4 -ARG PYTHON_VERSION=3.12 -ARG PYTORCH_VERSION=2.5.1 -ARG CONDA_VERSION=4.13.0 - -SHELL ["/bin/bash", "-c"] -ENV \ - PATH="$PATH:/root/.local/bin" \ - DEBIAN_FRONTEND=noninteractive \ - TZ=US/Pacific - -RUN apt-get update -qq --fix-missing && \ - apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - wget \ - curl \ - unzip \ - ca-certificates \ - && \ - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py310_${CONDA_VERSION}-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b && \ - rm ~/miniconda.sh && \ -# Cleaning - apt-get autoremove -y && \ - apt-get clean && \ - rm -rf /root/.cache && \ - rm -rf /var/lib/apt/lists/* -ENV \ - PATH="/root/miniconda3/bin:$PATH" \ - # LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" \ - CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \ - MAKEFLAGS="-j2" \ - TORCH_CUDA_ARCH_LIST="6.0;7.0;7.5;8.0;8.6:9.0" \ - CONDA_ENV=finetuning-scheduler \ - CONDA_DEFAULT_ENV=${CONDA_ENV} - -LABEL maintainer="Dan Dale " - -WORKDIR /home/finetuning-scheduler -COPY ./tests ./tests -COPY ./requirements ./requirements -COPY ./src ./src -COPY ./.actions ./.actions -COPY ./requirements.txt ./setup.py ./pyproject.toml ./README.md ./ - -ENV PATH=/root/miniconda3/envs/${CONDA_ENV}/bin:$PATH \ - CONDA_DEFAULT_ENV=${CONDA_ENV} -# conda init -RUN conda update -n base -c defaults conda && \ - conda create -y --name $CONDA_ENV && \ - conda init bash - -SHELL ["conda", "run", "--no-capture-output", "-n", "finetuning-scheduler", "/bin/bash", "-c"] - -RUN conda install -c pytorch -c nvidia python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} cudatoolkit=${CUDATOOLKIT_VERSION} && \ - pip install ".[all]" && \ - conda clean -ya && \ - rm -rf requirements.* requirements/ - -COPY ./dockers/release-conda/conda_entrypoint.sh ./conda_entrypoint.sh -RUN echo "conda activate ${CONDA_ENV}" >> 
~/.bashrc -RUN pip --version && \ - conda info && \ - pip list && \ - python -c "import lightning as L; print(L.__version__)" && \ - python -c "import finetuning_scheduler as fts; print(fts.__version__)" -ENTRYPOINT ["./conda_entrypoint.sh"] diff --git a/dockers/release-conda/conda_entrypoint.sh b/dockers/release-conda/conda_entrypoint.sh deleted file mode 100755 index 205b967..0000000 --- a/dockers/release-conda/conda_entrypoint.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash --login -set -e -LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 -. ~/.bashrc -exec "$@" diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index 76cb2a1..df424a3 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -11,8 +11,8 @@ # limitations under the License. ARG PYTHON_VERSION=3.12 -ARG PYTORCH_VERSION=2.5.1 -ARG LIGHTNING_VERSION=2.5 +ARG PYTORCH_VERSION=2.6.0 +ARG LIGHTNING_VERSION=2.6 ARG CUST_BASE FROM speediedan/finetuning-scheduler:base-${CUST_BASE}py${PYTHON_VERSION}-pt${PYTORCH_VERSION}-pl${LIGHTNING_VERSION} diff --git a/requirements/base.txt b/requirements/base.txt index c5cb302..01a825b 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,4 +1,4 @@ -#lightning>=2.5.0,<2.5.1 +#lightning>=2.6.0,<2.6.1 # the below is uncommented when master is targeting a specific pl dev master commit git+https://github.com/Lightning-AI/lightning.git@8ce52876ad6e5eb05e0965f72e034ffe46b327ba#egg=lightning -torch>=2.2.0 +torch>=2.3.0 diff --git a/requirements/examples.txt b/requirements/examples.txt index 3dc8271..5619e2b 100644 --- a/requirements/examples.txt +++ b/requirements/examples.txt @@ -6,4 +6,4 @@ sentencepiece tensorboardX>=2.2 tabulate psutil -numpy<2.0 # to avoid issues with oldest supported pytorch (2.2) +#numpy<2.0 # to avoid issues with oldest supported pytorch (2.3) diff --git a/requirements/pl_adjust_versions.py b/requirements/pl_adjust_versions.py index 659b347..b72bb3e 100644 --- 
a/requirements/pl_adjust_versions.py +++ b/requirements/pl_adjust_versions.py @@ -5,7 +5,7 @@ # IMPORTANT: this list needs to be sorted in reverse VERSIONS = [ - dict(torch="2.6.0", torchvision="0.21.0"), # nightly + dict(torch="2.6.0", torchvision="0.20.1"), # nightly (torchvision nightly not yet bumped as of 20241124) dict(torch="2.5.1", torchvision="0.20.1"), # stable dict(torch="2.5.0", torchvision="0.20.0"), dict(torch="2.4.0", torchvision="0.19.0"), diff --git a/requirements/standalone_base.txt b/requirements/standalone_base.txt index e5afeb8..1c38bdc 100644 --- a/requirements/standalone_base.txt +++ b/requirements/standalone_base.txt @@ -1,4 +1,4 @@ -#pytorch-lightning>=2.5.0,<2.5.1 +#pytorch-lightning>=2.6.0,<2.6.1 # the below is uncommented when master is targeting a specific pl dev master commit git+https://github.com/Lightning-AI/pytorch-lightning.git@8ce52876ad6e5eb05e0965f72e034ffe46b327ba#egg=pytorch-lightning -torch>=2.2.0 +torch>=2.3.0 diff --git a/src/finetuning_scheduler/__about__.py b/src/finetuning_scheduler/__about__.py index 5118bcb..f15dd05 100644 --- a/src/finetuning_scheduler/__about__.py +++ b/src/finetuning_scheduler/__about__.py @@ -1,7 +1,7 @@ import time _this_year = time.strftime("%Y") -__version__ = "2.5.0.dev0" +__version__ = "2.6.0.dev0" __author__ = "Dan Dale" __author_email__ = "danny.dale@gmail.com" __license__ = "Apache-2.0" diff --git a/src/fts_examples/cli_experiment_utils.py b/src/fts_examples/cli_experiment_utils.py index 7237265..d128f00 100644 --- a/src/fts_examples/cli_experiment_utils.py +++ b/src/fts_examples/cli_experiment_utils.py @@ -65,13 +65,9 @@ def instantiate_class(init: Dict[str, Any], args: Optional[Union[Any, Tuple[Any, # override PyTorch default, extending it to capture additional salient packages for reproducability # https://github.com/pytorch/pytorch/blob/7c2489bdae5a96dc122c3bb7b42c18528bcfdc86/torch/utils/collect_env.py#L271 def get_pip_packages(run_lambda): - """Returns `pip list` output. 
- - Note: will also find conda-installed pytorch - and numpy packages. - """ + """Returns `pip list` output.""" # People generally have `pip` as `pip` or `pip3` - # But here it is incoved as `python -mpip` + # But here it is invoked as `python -mpip` def run_with_pip(pip): if collect_env.get_platform() == "win32": system_root = os.environ.get("SYSTEMROOT", "C:\\Windows") @@ -126,7 +122,6 @@ def get_env_info(): "miopen_runtime_version": miopen_runtime_version, "pip_version": pip_version, "pip_packages": pip_list_output, - "conda_packages": collect_env.get_conda_packages(run_lambda), "os": collect_env.get_os(run_lambda), "libc_version": collect_env.get_libc_version(), "gcc_version": collect_env.get_gcc_version(run_lambda), @@ -167,7 +162,6 @@ def collect_env_info() -> Dict: "cudnn_version", "pip_version", # 'pip' or 'pip3' "pip_packages", - "conda_packages", "hip_compiled_version", "hip_runtime_version", "miopen_runtime_version", diff --git a/src/fts_examples/patching/dep_patch_shim.py b/src/fts_examples/patching/dep_patch_shim.py index e2d4f76..74316df 100644 --- a/src/fts_examples/patching/dep_patch_shim.py +++ b/src/fts_examples/patching/dep_patch_shim.py @@ -54,7 +54,7 @@ def _patch_triton(): sys.modules.get(target_mod).__dict__.get('JITFunction').__init__ = _new_init -# required for `torch==2.5.x`, TBD wrt subsequent versions +# remove once `torch==2.6.x` is minimum (only required for `torch==2.5.x`) einsum_strategies_patch = DependencyPatch( condition=(lwt_compare_version("torch", operator.le, "2.5.2"), lwt_compare_version("torch", operator.ge, "2.5.0"),), @@ -71,13 +71,14 @@ def _patch_triton(): patched_package='datasets', description='Adjust `NumpyArrowExtractor` to properly use `numpy` 2.0 copy semantics') -# only required for `torch==2.4.x` +# TODO: remove once `torch==2.5.x` is minimum (only required for `torch==2.4.x`) triton_codgen_patch = DependencyPatch( condition=(lwt_compare_version("pytorch-triton", operator.eq, "3.0.0", "45fff310c8"),), 
env_flag=OSEnvToggle("ENABLE_FTS_TRITON_CODEGEN_PATCH", default="1"), function=_patch_triton, patched_package='pytorch-triton', description='Address `triton` #3564 until PyTorch pins the upstream fix') + class ExpPatch(Enum): EINSUM_STRATEGIES = einsum_strategies_patch NUMPY_EXTRACTOR = datasets_numpy_extractor_patch diff --git a/src/fts_examples/patching/patched_einsum_strategies.py b/src/fts_examples/patching/patched_einsum_strategies.py index 99fc847..2f9d587 100644 --- a/src/fts_examples/patching/patched_einsum_strategies.py +++ b/src/fts_examples/patching/patched_einsum_strategies.py @@ -5,7 +5,7 @@ # ruff: noqa: F821 # pyright: reportUndefinedVariable=false -if lwt_compare_version("torch", operator.ge, "2.5.0"): +if lwt_compare_version("torch", operator.ge, "2.5.0") and lwt_compare_version("torch", operator.le, "2.5.2"): globals().update(_prepare_module_ctx('torch.distributed.tensor._ops._einsum_strategy', globals())) diff --git a/src/fts_examples/test_examples.py b/src/fts_examples/test_examples.py index b70b8d1..5bedc92 100644 --- a/src/fts_examples/test_examples.py +++ b/src/fts_examples/test_examples.py @@ -61,8 +61,8 @@ EXPECTED_WARNS.extend(ALL_EXAMPLE_EXPECTED) # min/max versions only applies to base examples, TODO: consider for deprecation -MIN_VERSION_WARNS = "2.2" -MAX_VERSION_WARNS = "2.5" +MIN_VERSION_WARNS = "2.3" +MAX_VERSION_WARNS = "2.6" # torch version-specific warns go here EXPECTED_VERSION_WARNS = {MIN_VERSION_WARNS: [], MAX_VERSION_WARNS:[] } torch_version = metadata.distribution('torch').version diff --git a/tests/__init__.py b/tests/__init__.py index 63ec9af..86c629e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -19,7 +19,7 @@ _PATH_DATASETS = os.path.join(_PROJECT_ROOT, "datasets") _PATH_LEGACY = os.path.join(_PROJECT_ROOT, "legacy") -# TODO: this setting `PYTHONPATH` may not be used by other evns like Conda for import packages +# TODO: this setting `PYTHONPATH` may not be used by other envs like Conda for import packages if 
_PROJECT_ROOT not in os.getenv("PYTHONPATH", ""): splitter = ":" if os.environ.get("PYTHONPATH", "") else "" os.environ["PYTHONPATH"] = f'{_PROJECT_ROOT}{splitter}{os.environ.get("PYTHONPATH", "")}' diff --git a/tests/helpers/runif.py b/tests/helpers/runif.py index 072633f..1f5e965 100644 --- a/tests/helpers/runif.py +++ b/tests/helpers/runif.py @@ -13,7 +13,7 @@ import os import re import sys -from typing import Optional, Set, Union +from typing import Optional, Set, Union, Dict from packaging.version import Version import importlib.metadata as metadata @@ -26,15 +26,26 @@ EXTENDED_VER_PAT = re.compile(r"([0-9]+\.){2}[0-9]+") +def maybe_mark_exp(exp_patch_set: Set[ExpPatch], mark_if_false: Optional[Dict] = None): + """This allows us to evaluate whether an experimental patch set that is conditionally required for a given test + is required in the current execution context. + + If the experimental patch set is not required, we mark the + test with the provided `mark_if_false` dictionary directive (or an empty dictionary). 
+ """ + + exp_patch_set = {ep for ep in exp_patch_set if all(ep.value.condition)} + if any(exp_patch_set): + return {"exp_patch": exp_patch_set} + else: + return mark_if_false or {} + # RunIf aliases RUNIF_MAP = { "min2_5": {"min_torch": "2.5.0"}, "alone": {"standalone": True}, "bf16_alone": {"bf16_cuda": True, "standalone": True}, - #"min2_2": {"min_torch": "2.2.0"}, - #"max3_12_min2_3": {"max_python": "3.12", "min_torch": "2.3.0"}, - #"max3_12_min2_2": {"max_python": "3.12", "min_torch": "2.2.0"}, - "einsum_exp": {"exp_patch": {ExpPatch.EINSUM_STRATEGIES}}, + "einsum_exp": maybe_mark_exp({ExpPatch.EINSUM_STRATEGIES}, {"standalone": True}), } @@ -62,7 +73,7 @@ def __new__( skip_mac_os: bool = False, standalone: bool = False, deepspeed: bool = False, - exp_patch: Optional[Union[ExpPatch,Set[ExpPatch]]] = None, + exp_patch: Optional[Union[ExpPatch, Set[ExpPatch]]] = None, **kwargs, ): """ diff --git a/tests/special_tests.sh b/tests/special_tests.sh index b81d9cb..8f79b3e 100755 --- a/tests/special_tests.sh +++ b/tests/special_tests.sh @@ -108,7 +108,7 @@ define_configuration(){ export PL_RUN_STANDALONE_TESTS=1 ;; exp_patch) - echo "Collecting and running only experimental patch tests supported w/ provided patch mask (${experiment_patch_mask[@]})." | tee -a $special_test_session_log + echo "Collecting and running only experimental patch tests that currently require the provided patch mask (${experiment_patch_mask[@]})." 
| tee -a $special_test_session_log export FTS_EXPERIMENTAL_PATCH_TESTS=1 ;; *) diff --git a/tests/test_fsdp.py b/tests/test_fsdp.py index dc414dd..375bf69 100644 --- a/tests/test_fsdp.py +++ b/tests/test_fsdp.py @@ -68,6 +68,7 @@ "of Tensor.pin_memory", # required with PT 2.5 for FSDP1 `_flat_param` internal usage "Tensor.is_pinned", # required with PT 2.5 for FSDP1 `_flat_param` internal usage "Deallocating Tensor ", # required with PT 2.5 + "`_get_pg_default_device` will be deprecated", # required with PT 2.6 20241121 nightly ] EXPECTED_WARNS.extend(additional_fsdp_warns) FSDP_BASE_WARNS = EXPECTED_WARNS diff --git a/tests/test_model_parallel.py b/tests/test_model_parallel.py index 30420ce..d645015 100644 --- a/tests/test_model_parallel.py +++ b/tests/test_model_parallel.py @@ -46,6 +46,8 @@ "The number of training batches", # minimizing cost of training for these tests "when logging on epoch level in distributed", # validating FTS handling in this scenario "You are using `torch.load` with `weights_only=False`", # known required w/ Lightning 2.4 + "of Tensor.pin_memory", # required with PT 2.6 nightly 2024.11.21 + "of Tensor.is_pinned" # required with PT 2.6 nightly 2024.11.21 ] MODEL_PARALLEL_BASE_WARNS.extend(additional_model_parallel_warns) MODEL_PARALLEL_DYNAMO_EXPECTED_WARNS = [] @@ -362,7 +364,6 @@ def gen_apply_transformer_tp_plan(model: nn.Module, device_mesh: DeviceMesh, los desired_input_layouts=Replicate(), ), "attention_norm": SequenceParallel(), - "attention.wq": ColwiseParallel(use_local_output=False), "attention.wk": ColwiseParallel(use_local_output=False), "attention.wv": ColwiseParallel(use_local_output=False),