diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index 0412c5f37952d..e29eb78a9f945 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -2,8 +2,11 @@ import sys import zipfile -# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB -VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250)) +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB +# Note that we have 400 MiB quota, please use it wisely. +# See https://github.com/pypi/support/issues/3792 . +# Please also sync the value with the one in Dockerfile. +VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300)) def print_top_10_largest_files(zip_file): diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 686f70dbece6c..69b6b146b3549 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -43,7 +43,7 @@ main() { - # The figures should be genereated by a separate process outside the CI/CD pipeline + # The figures should be generated by a separate process outside the CI/CD pipeline # # generate figures # python3 -m pip install tabulate pandas matplotlib diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 189714ebb6d75..1ad77cf50f612 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -25,8 +25,11 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then last_build=$(cat /tmp/neuron-docker-build-timestamp) current_time=$(date +%s) if [ $((current_time - last_build)) -gt 86400 ]; then + # Remove dangling images (those that are not tagged and not used by any container) docker image prune -f - docker system prune -f + # Remove unused volumes / force the system prune for old images as well. 
+ docker volume prune -f && docker system prune -f + # Remove huggingface model artifacts and compiler cache rm -rf "${HF_MOUNT:?}/*" rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*" echo "$current_time" > /tmp/neuron-docker-build-timestamp @@ -51,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \ -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \ --name "${container_name}" \ ${image_name} \ - /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py" + /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys" diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh old mode 100644 new mode 100755 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bff557d7fc92f..d5d02fdeb7f4b 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -52,7 +52,6 @@ steps: - tests/worker - tests/standalone_tests/lazy_torch_compile.py commands: - - pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git # Used by multimoda processing test - python3 standalone_tests/lazy_torch_compile.py - pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s async_engine # AsyncLLMEngine @@ -77,7 +76,9 @@ steps: - tests/basic_correctness/test_basic_correctness - tests/basic_correctness/test_cpu_offload - tests/basic_correctness/test_preemption + - tests/basic_correctness/test_cumem.py commands: + - pytest -v -s basic_correctness/test_cumem.py - pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_cpu_offload.py - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py @@ -182,7 +183,16 @@ steps: - vllm/ - tests/v1 commands: - - VLLM_USE_V1=1 pytest -v -s v1 + # split the test to avoid interference + - VLLM_USE_V1=1 pytest -v -s v1/core + - VLLM_USE_V1=1 pytest -v -s v1/engine + - VLLM_USE_V1=1 pytest -v -s v1/sample + - VLLM_USE_V1=1 pytest -v -s v1/worker + - VLLM_USE_V1=1 pytest -v -s v1/test_stats.py + - VLLM_USE_V1=1 pytest -v -s v1/test_utils.py + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - VLLM_USE_V1=1 pytest -v -s v1/e2e - label: Examples Test # 25min working_dir: "/vllm-workspace/examples" @@ -478,7 +488,9 @@ steps: - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + # this test fails consistently. + # TODO: investigate and fix + # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py @@ -516,7 +528,9 @@ steps: - vllm/engine - tests/multi_step commands: - - pytest -v -s multi_step/test_correctness_async_llm.py + # this test is quite flaky + # TODO: investigate and fix. 
+ # - pytest -v -s multi_step/test_correctness_async_llm.py - pytest -v -s multi_step/test_correctness_llm.py - label: Pipeline Parallelism Test # 45min diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml deleted file mode 100644 index d139f625d98ab..0000000000000 --- a/.github/workflows/actionlint.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: Lint GitHub Actions workflows -on: - push: - branches: - - "habana_main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - pull_request: - branches: - - "habana_main" - paths: - - '.github/workflows/*.ya?ml' - - '.github/workflows/actionlint.*' - - '.github/workflows/matchers/actionlint.json' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run actionlint" - run: | - echo "::add-matcher::.github/workflows/matchers/actionlint.json" - tools/actionlint.sh -color diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml deleted file mode 100644 index 2a4655b9cee05..0000000000000 --- a/.github/workflows/clang-format.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: clang-format - -on: - # Trigger the workflow on push or pull request, - # but only for the habana_main branch - push: - branches: - - habana_main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - pull_request: - branches: - - habana_main - paths: - - '**/*.h' - - '**/*.cpp' - - '**/*.cu' - - '**/*.cuh' - - '.github/workflows/clang-format.yml' - -jobs: - clang-format: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.11"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install clang-format==18.1.5 - - name: Running clang-format - run: | - EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' - ) - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \ - | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml deleted file mode 100644 index 72e732d878e61..0000000000000 --- a/.github/workflows/codespell.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: codespell - -on: - # Trigger the workflow on push or pull request, - # but only for the main branch - push: - branches: - - habana_main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - pull_request: - branches: - - habana_main - paths: - - "**/*.py" - - "**/*.md" - - "**/*.rst" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/codespell.yml - -jobs: - codespell: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: 
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Spelling check with codespell - run: | - codespell --toml pyproject.toml diff --git a/.github/workflows/cpu-test.yml b/.github/workflows/cpu-test.yml index b900239463323..cc40973934672 100644 --- a/.github/workflows/cpu-test.yml +++ b/.github/workflows/cpu-test.yml @@ -18,9 +18,9 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/doc-lint.yml b/.github/workflows/doc-lint.yml deleted file mode 100644 index 2a156f627196e..0000000000000 --- a/.github/workflows/doc-lint.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: Lint documentation - -on: - push: - branches: - - habana_main - paths: - - "docs/**" - pull_request: - branches: - - habana_main - paths: - - "docs/**" - -jobs: - doc-lint: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Linting docs - run: tools/doc-lint.sh diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json deleted file mode 100644 index f6d4479ee1996..0000000000000 --- a/.github/workflows/matchers/ruff.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "problemMatcher": [ - { - "owner": "ruff", - "pattern": [ - { - "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", - "file": 1, - "line": 2, - "column": 3, - "code": 4, - "message": 5 - } - ] - } - ] - } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml deleted file mode 100644 index f436e0d8336d4..0000000000000 --- a/.github/workflows/mypy.yaml +++ /dev/null @@ -1,51 +0,0 @@ -name: mypy - -on: - # Trigger the workflow on push or pull request, - # but only for the habana_main branch - push: - branches: - - habana_main - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'pyproject.toml' - pull_request: - branches: - - habana_main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - '**/*.py' - # - '.github/workflows/mypy.yaml' - # - 'tools/mypy.sh' - # - 'pyproject.toml' - -jobs: - mypy: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.9", "3.10", "3.11", "3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install mypy==1.11.1 - pip install types-setuptools - pip install types-PyYAML - pip install types-requests - pip install types-setuptools - - name: Mypy - run: | - echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/png-lint.yml b/.github/workflows/png-lint.yml deleted file mode 100644 index 140cb5e050a6a..0000000000000 --- a/.github/workflows/png-lint.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint PNG exports from excalidraw -on: - push: - branches: - - "habana_main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - pull_request: - branches: - - "habana_main" - paths: - - '*.excalidraw.png' - - '.github/workflows/png-lint.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - actionlint: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Run png-lint.sh to check excalidraw exported images" - run: | - tools/png-lint.sh diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000000..06564969dc778 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,19 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: "3.12" + - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json" + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 + with: + extra_args: --all-files --hook-stage manual diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml deleted file mode 100644 index 42385ef947502..0000000000000 --- a/.github/workflows/ruff.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: ruff - -on: - # Trigger the workflow on push or pull request, - # but only for the habana_main branch - push: - branches: - - habana_main - paths: - - "**/*.py" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/matchers/ruff.json - - .github/workflows/ruff.yml - pull_request: - branches: - - habana_main - # This workflow is only relevant when one of the following files changes. - # However, we have github configured to expect and require this workflow - # to run and pass before github with auto-merge a pull request. Until github - # allows more flexible auto-merge policy, we can just run this on every PR. - # It doesn't take that long to run, anyway. 
- #paths: - # - "**/*.py" - # - pyproject.toml - # - requirements-lint.txt - # - .github/workflows/matchers/ruff.json - # - .github/workflows/ruff.yml - -jobs: - ruff: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install -r requirements-lint.txt - - name: Analysing the code with ruff - run: | - echo "::add-matcher::.github/workflows/matchers/ruff.json" - ruff check --output-format github . - - name: Run isort - run: | - isort . --check-only diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml deleted file mode 100644 index f6931150c795d..0000000000000 --- a/.github/workflows/shellcheck.yml +++ /dev/null @@ -1,37 +0,0 @@ -name: Lint shell scripts -on: - push: - branches: - - "habana_main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - pull_request: - branches: - - "habana_main" - paths: - - '**/*.sh' - - '.github/workflows/shellcheck.yml' - -env: - LC_ALL: en_US.UTF-8 - -defaults: - run: - shell: bash - -permissions: - contents: read - -jobs: - shellcheck: - runs-on: ubuntu-latest - steps: - - name: "Checkout" - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: "Check shell scripts" - run: | - tools/shellcheck.sh diff --git a/.github/workflows/trigger_jenkins.yml b/.github/workflows/trigger_jenkins.yml index 6a8e2f6bed1ca..240444c5c0a71 100644 --- a/.github/workflows/trigger_jenkins.yml +++ b/.github/workflows/trigger_jenkins.yml @@ -81,7 +81,7 @@ jobs: elif [[ $TARGET_BRANCH =~ v*.*.* ]]; then synapse_version=${TARGET_BRANCH#v} else - echo "Cant Calculate Synapse Version, Failing The Test" + echo "Can't Calculate Synapse Version, Failing The Test" exit 1 fi synapse_build=$(curl "https://dms.habana-labs.com/api/v1.1/branch/info/v$synapse_version" | jq -r ".release_id") diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml deleted file mode 100644 index 554150da97c02..0000000000000 --- a/.github/workflows/yapf.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: yapf - -on: - # Trigger the workflow on push or pull request, - # but only for the habana_main branch - push: - branches: - - habana_main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - pull_request: - branches: - - habana_main - paths: - - "**/*.py" - - .github/workflows/yapf.yml - -jobs: - yapf: - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.12"] - steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install yapf==0.32.0 - pip install toml==0.10.2 - - name: Running yapf - run: | - yapf --diff --recursive . 
diff --git a/.jenkins/requirements-test-hpu.txt b/.jenkins/requirements-test-hpu.txt index 523eb0d39d145..3986eface0a40 100644 --- a/.jenkins/requirements-test-hpu.txt +++ b/.jenkins/requirements-test-hpu.txt @@ -1,3 +1,4 @@ lm_eval pytest tokenizers<0.20.2 +transformers<=4.46.3 diff --git a/.jenkins/test_config.yaml b/.jenkins/test_config.yaml index 14ed9067ae0a1..c74b018f1ff14 100644 --- a/.jenkins/test_config.yaml +++ b/.jenkins/test_config.yaml @@ -68,4 +68,26 @@ stages: command: VLLM_SKIP_WARMUP=true pytest -v tests/lora/test_multilora_hpu.py::test_llama_multilora_1x - name: test_long_context flavor: g2 - command: VLLM_SKIP_WARMUP=true pytest -v tests/lora/test_long_context_hpu.py::test_quality \ No newline at end of file + command: VLLM_SKIP_WARMUP=true pytest -v tests/lora/test_long_context_hpu.py::test_quality + - name: tests_multimodal + steps: + - name: multimodal_small_g3_tp1 + flavor: g3 + command: cd .jenkins/vision && bash run-tests.sh -c configs/models-small.txt -t 1 + - name: multimodal_small_g3_tp2 + flavor: g3.s + command: cd .jenkins/vision && bash run-tests.sh -c configs/models-small.txt -t 2 + - name: multimodal_small_g3_tp1_mss + flavor: g3 + command: cd .jenkins/vision && bash run-tests.sh -c configs/models-mss.txt -t 1 + - name: multimodal_small_g3_tp2_mss + flavor: g3.s + command: cd .jenkins/vision && bash run-tests.sh -c configs/models-mss.txt -t 2 + - name: tests_int4_quantization + steps: + - name: test_awq + flavor: g2 + command: VLLM_SKIP_WARMUP=true pytest -v tests/quantization/test_awq.py::test_awq + - name: test_gptq + flavor: g2 + command: VLLM_SKIP_WARMUP=true pytest -v tests/quantization/test_gptq.py::test_gptq diff --git a/.jenkins/vision/configs/Meta-Llama-3.2-11B-Vision-Instruct-mss.yaml b/.jenkins/vision/configs/Meta-Llama-3.2-11B-Vision-Instruct-mss.yaml new file mode 100644 index 0000000000000..22b76c5376d95 --- /dev/null +++ b/.jenkins/vision/configs/Meta-Llama-3.2-11B-Vision-Instruct-mss.yaml @@ -0,0 +1,6 @@ +model_name: "/mnt/weka/data/pytorch/llama3.2/Meta-Llama-3.2-11B-Vision-Instruct" +dtype: "bfloat16" +max_model_len: 1024 +max_num_seqs: 32 +num_prompts: 4 +num_scheduler_steps: 10 diff --git a/.jenkins/vision/configs/Meta-Llama-3.2-11B-Vision-Instruct.yaml b/.jenkins/vision/configs/Meta-Llama-3.2-11B-Vision-Instruct.yaml new file mode 100644 index 0000000000000..b40a6ff809bba --- /dev/null +++ b/.jenkins/vision/configs/Meta-Llama-3.2-11B-Vision-Instruct.yaml @@ -0,0 +1,5 @@ +model_name: "/mnt/weka/data/pytorch/llama3.2/Meta-Llama-3.2-11B-Vision-Instruct" +dtype: "bfloat16" +max_model_len: 1024 +max_num_seqs: 32 +num_prompts: 4 \ No newline at end of file diff --git a/.jenkins/vision/configs/models-mss.txt b/.jenkins/vision/configs/models-mss.txt new file mode 100644 index 0000000000000..3a2950ecf896e --- /dev/null +++ b/.jenkins/vision/configs/models-mss.txt @@ -0,0 +1 @@ +Meta-Llama-3.2-11B-Vision-Instruct-mss.yaml \ No newline at end of file diff --git a/.jenkins/vision/configs/models-small.txt b/.jenkins/vision/configs/models-small.txt new file mode 100644 index 0000000000000..8227bd495f0d3 --- /dev/null +++ b/.jenkins/vision/configs/models-small.txt @@ -0,0 +1 @@ +Meta-Llama-3.2-11B-Vision-Instruct.yaml \ No newline at end of file diff --git a/.jenkins/vision/data/cherry_blossom.jpg b/.jenkins/vision/data/cherry_blossom.jpg new file mode 100644 index 0000000000000..63173db0da768 Binary files /dev/null and b/.jenkins/vision/data/cherry_blossom.jpg differ diff --git a/.jenkins/vision/run-tests.sh b/.jenkins/vision/run-tests.sh new file 
mode 100644 index 0000000000000..667ad6da272cf --- /dev/null +++ b/.jenkins/vision/run-tests.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +usage() { + echo`` + echo "Runs simple request check on multimodal models using vllm" + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. configs/small-models.txt)" + echo " -t - tensor parallel size" + echo +} + +SUCCESS=0 + +while getopts "c:t:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Parse list of configs. +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE===" + + export TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} + export TP_SIZE=$TP_SIZE + export PT_HPU_ENABLE_LAZY_COLLECTIVES=true + export VLLM_SKIP_WARMUP=true + export TQDM_BAR_FORMAT="{desc}: {percentage:3.0f}% {bar:10} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]" + RANDOM_SUFFIX=$(tr -dc A-Za-z0-9 =8' \ - torchvision==0.20.0.dev20241113+rocm6.2 \ - --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \ +RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. +# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ *) ;; esac -ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer -ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: -ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: - -ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} -ENV CCACHE_DIR=/root/.cache/ccache - - -### AMD-SMI build stage -FROM base AS build_amdsmi -# Build amdsmi wheel always -RUN cd /opt/rocm/share/amd_smi \ - && python3 -m pip wheel . --wheel-dir=/install - - -### Flash-Attention wheel build stage -FROM base AS build_fa -ARG BUILD_FA -ARG FA_GFX_ARCHS -ARG FA_BRANCH -# Build ROCm flash-attention wheel if `BUILD_FA = 1` -RUN --mount=type=cache,target=${CCACHE_DIR} \ - if [ "$BUILD_FA" = "1" ]; then \ - mkdir -p libs \ - && cd libs \ - && git clone https://github.com/ROCm/flash-attention.git \ - && cd flash-attention \ - && git checkout "${FA_BRANCH}" \ - && git submodule update --init \ - && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ - # Create an empty directory otherwise as later build stages expect one - else mkdir -p /install; \ - fi - - -### Triton wheel build stage -FROM base AS build_triton -ARG BUILD_TRITON -ARG TRITON_BRANCH -# Build triton wheel if `BUILD_TRITON = 1` -RUN --mount=type=cache,target=${CCACHE_DIR} \ - if [ "$BUILD_TRITON" = "1" ]; then \ - mkdir -p libs \ - && cd libs \ - && python3 -m pip install ninja cmake wheel pybind11 \ - && git clone https://github.com/OpenAI/triton.git \ - && cd triton \ - && git checkout "${TRITON_BRANCH}" \ - && cd python \ - && python3 setup.py bdist_wheel --dist-dir=/install; \ - # Create an empty directory otherwise as later build stages expect one - else mkdir -p /install; \ - fi - - -### Final vLLM build stage -FROM base AS final -# Import the vLLM development directory from the build context -COPY . . 
-ARG GIT_REPO_CHECK=0 -RUN --mount=type=bind,source=.git,target=.git \ - if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi +RUN python3 -m pip install --upgrade huggingface-hub[cli] +ARG BUILD_RPD +RUN if [ ${BUILD_RPD} -eq "1" ]; then \ + git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \ + && cd rocmProfileData/rpd_tracer \ + && pip install -r requirements.txt && cd ../ \ + && make && make install \ + && cd hipMarker && python3 setup.py install ; fi -RUN python3 -m pip install --upgrade pip +# Install vLLM +RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + cd /install \ + && pip install -U -r requirements-rocm.txt \ + && pip uninstall -y vllm \ + && pip install *.whl -# Package upgrades for useful functionality or to avoid dependency issues -RUN --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard +ARG COMMON_WORKDIR +# Copy over the benchmark scripts as well +COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks +COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples -# Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 -# Silences the HF Tokenizers warning ENV TOKENIZERS_PARALLELISM=false -RUN --mount=type=cache,target=${CCACHE_DIR} \ - --mount=type=bind,source=.git,target=.git \ - --mount=type=cache,target=/root/.cache/pip \ - python3 -m pip install -Ur requirements-rocm.txt \ - && python3 setup.py clean --all \ - && python3 setup.py develop - -# Copy amdsmi wheel into final image -RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ - mkdir -p libs \ - && cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y amdsmi; - -# Copy triton wheel(s) into final image if they were built -RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ - mkdir -p libs \ - && if ls /install/*.whl; then \ - cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y triton; fi - -# Copy flash-attn wheel(s) into final image if they were built -RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ - mkdir -p libs \ - && if ls /install/*.whl; then \ - cp /install/*.whl libs \ - # Preemptively uninstall to avoid same-version no-installs - && python3 -m pip uninstall -y flash-attn; fi - -# Install wheels that were built to the final image -RUN --mount=type=cache,target=/root/.cache/pip \ - if ls libs/*.whl; then \ - python3 -m pip install libs/*.whl; fi - -# install development dependencies (for testing) -RUN python3 -m pip install -e tests/vllm_test_utils +# Performance environment variable. 
+ENV HIP_FORCE_DEV_KERNARG=1 CMD ["/bin/bash"] + diff --git a/Dockerfile.rocm_base b/Dockerfile.rocm_base new file mode 100644 index 0000000000000..5bbe98b0c2204 --- /dev/null +++ b/Dockerfile.rocm_base @@ -0,0 +1,158 @@ +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete +ARG HIPBLASLT_BRANCH="4d40e36" +ARG HIPBLAS_COMMON_BRANCH="7c1566b" +ARG LEGACY_HIPBLASLT_OPTION= +ARG RCCL_BRANCH="648a58d" +ARG RCCL_REPO="https://github.com/ROCm/rccl" +ARG TRITON_BRANCH="e5be006" +ARG TRITON_REPO="https://github.com/triton-lang/triton.git" +ARG PYTORCH_BRANCH="8d4926e" +ARG PYTORCH_VISION_BRANCH="v0.19.1" +ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" +ARG FA_BRANCH="b7d29fb" +ARG FA_REPO="https://github.com/ROCm/flash-attention.git" + +FROM ${BASE_IMAGE} AS base + +ENV PATH=/opt/rocm/llvm/bin:$PATH +ENV ROCM_PATH=/opt/rocm +ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: +ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942 +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} + +ARG PYTHON_VERSION=3.12 + +RUN mkdir -p /app +WORKDIR /app +ENV DEBIAN_FRONTEND=noninteractive + +# Install Python and other dependencies +RUN apt-get update -y \ + && apt-get install -y software-properties-common git curl sudo vim less \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ + python${PYTHON_VERSION}-lib2to3 python-is-python3 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ + && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ + && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ + && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ + && python3 --version && python3 -m pip --version + +RUN pip install -U packaging cmake ninja wheel setuptools pybind11 Cython + +FROM base AS build_hipblaslt +ARG HIPBLASLT_BRANCH +ARG HIPBLAS_COMMON_BRANCH +# Set to "--legacy_hipblas_direct" for ROCm<=6.2 +ARG LEGACY_HIPBLASLT_OPTION +RUN git clone https://github.com/ROCm/hipBLAS-common.git +RUN cd hipBLAS-common \ + && git checkout ${HIPBLAS_COMMON_BRANCH} \ + && mkdir build \ + && cd build \ + && cmake .. \ + && make package \ + && dpkg -i ./*.deb +RUN git clone https://github.com/ROCm/hipBLASLt +RUN cd hipBLASLt \ + && git checkout ${HIPBLASLT_BRANCH} \ + && ./install.sh -d --architecture ${PYTORCH_ROCM_ARCH} ${LEGACY_HIPBLASLT_OPTION} \ + && cd build/release \ + && make package +RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install + +FROM base AS build_rccl +ARG RCCL_BRANCH +ARG RCCL_REPO +RUN git clone ${RCCL_REPO} +RUN cd rccl \ + && git checkout ${RCCL_BRANCH} \ + && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} +RUN mkdir -p /app/install && cp /app/rccl/build/release/*.deb /app/install + +FROM base AS build_triton +ARG TRITON_BRANCH +ARG TRITON_REPO +RUN git clone ${TRITON_REPO} +RUN cd triton \ + && git checkout ${TRITON_BRANCH} \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/triton/python/dist/*.whl /app/install + +FROM base AS build_amdsmi +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=dist +RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install + +FROM base AS build_pytorch +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN git clone ${PYTORCH_REPO} pytorch +RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ + pip install -r requirements.txt && git submodule update --init --recursive \ + && python3 tools/amd_build/build_amd.py \ + && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${PYTORCH_VISION_REPO} vision +RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && pip install dist/*.whl +RUN git clone ${FA_REPO} +RUN cd flash-attention \ + && git checkout ${FA_BRANCH} \ + && git submodule update --init \ + && MAX_JOBS=64 GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist +RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ + && cp /app/vision/dist/*.whl /app/install \ + && cp /app/flash-attention/dist/*.whl /app/install + +FROM base AS final +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ + && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ + dpkg -i /install/*deb \ + && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ + && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + pip install /install/*.whl +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + pip install /install/*.whl + +ARG BASE_IMAGE +ARG HIPBLASLT_BRANCH +ARG LEGACY_HIPBLASLT_OPTION +ARG RCCL_BRANCH +ARG RCCL_REPO +ARG TRITON_BRANCH +ARG TRITON_REPO +ARG PYTORCH_BRANCH +ARG PYTORCH_VISION_BRANCH +ARG PYTORCH_REPO +ARG PYTORCH_VISION_REPO +ARG FA_BRANCH +ARG FA_REPO +RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ + && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ + && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ + && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ + && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \ + && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \ + && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ + && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \ + && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ + && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ + && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ + && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt diff --git a/Dockerfile.tpu b/Dockerfile.tpu index b617932a85b47..e268b39476665 100644 --- a/Dockerfile.tpu +++ b/Dockerfile.tpu @@ -1,4 +1,4 @@ -ARG NIGHTLY_DATE="20241017" +ARG NIGHTLY_DATE="20250124" ARG 
BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" FROM $BASE_IMAGE diff --git a/README.md b/README.md index 80d36e4cccc0c..3f748095135c0 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,11 @@ Easy, fast, and cheap LLM serving for everyone --- > [!NOTE] -> For Intel Gaudi specific setup instructions and examples, please refer [Intel® Gaudi® README](https://github.com/HabanaAI/vllm-fork/blob/habana_main/README_GAUDI.md). For jupyter notebook based quickstart tutorials refer [Getting Started with vLLM](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/Getting_Started_with_vLLM/Getting_Started_with_vLLM.ipynb) and [Understanding vLLM on Gaudi](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/Understanding_vLLM_on_Gaudi/Understanding_vLLM_on_Gaudi.ipynb). - -The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui) - ---- +> For Intel Gaudi specific setup instructions and examples, please refer [Intel® Gaudi® README](https://github.com/HabanaAI/vllm-fork/blob/habana_main/README_GAUDI.md). For jupyter notebook based quickstart tutorials refer [Getting Started with vLLM](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/vLLM_Tutorials/Getting_Started_with_vLLM/Getting_Started_with_vLLM.ipynb) and [Understanding vLLM on Gaudi](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/vLLM_Tutorials/Understanding_vLLM_on_Gaudi/Understanding_vLLM_on_Gaudi.ipynb). *Latest News* 🔥 +- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html). +- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing). - [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone! - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! 
diff --git a/README_GAUDI.md b/README_GAUDI.md index 74d742e815df5..b98a067c03ef1 100644 --- a/README_GAUDI.md +++ b/README_GAUDI.md @@ -12,7 +12,7 @@ To achieve the best performance, please follow the methods outlined in the - Ubuntu 22.04 LTS OS - Python 3.10 -- Intel Gaudi accelerator +- Intel Gaudi 2 and 3 AI accelerators - Intel Gaudi software version 1.19.0 and above ## Quick Start Using Dockerfile @@ -81,6 +81,7 @@ To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), r $ git clone https://github.com/HabanaAI/vllm-fork.git $ cd vllm-fork $ git checkout habana_main +$ pip install --upgrade pip $ pip install -r requirements-hpu.txt $ python setup.py develop ``` @@ -109,15 +110,17 @@ $ python setup.py develop | Inference with torch.compile (experimental) | vLLM HPU backend experimentally supports inference with torch.compile. | [vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes) | | Attention with Linear Biases (ALiBi) | vLLM HPU backend supports models utilizing Attention with Linear Biases (ALiBi) such as mpt-7b. | [vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) | | INC quantization | vLLM HPU backend supports FP8 model and KV cache quantization and calibration with Intel Neural Compressor (INC). | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html) | +| AutoAWQ quantization | vLLM HPU backend supports the inference with models quantized using AutoAWQ library. | [Library](https://github.com/casper-hansen/AutoAWQ) | +| AutoGPTQ quantization | vLLM HPU backend supports the inference with models quantized using AutoGPTQ library. | [Library](https://github.com/AutoGPTQ/AutoGPTQ) | | LoRA/MultiLoRA support | vLLM HPU backend includes support for LoRA and MultiLoRA on supported models. | [Documentation](https://docs.vllm.ai/en/stable/models/lora.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/multilora_inference.html)
[vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) | | Multi-step scheduling support | vLLM HPU backend includes multi-step scheduling support for host overhead reduction, configurable by standard `--num-scheduler-steps` parameter. | [Feature RFC](https://github.com/vllm-project/vllm/issues/6854) | | Automatic prefix caching (experimental) | vLLM HPU backend includes automatic prefix caching (APC) support for more efficient prefills, configurable by standard `--enable-prefix-caching` parameter. | [Documentation](https://docs.vllm.ai/en/stable/automatic_prefix_caching/apc.html)
[Details](https://docs.vllm.ai/en/stable/automatic_prefix_caching/details.html) | | Speculative decoding (experimental) | vLLM HPU backend includes experimental speculative decoding support for improving inter-token latency in some scenarios, configurable via standard `--speculative_model` and `--num_speculative_tokens` parameters. | [Documentation](https://docs.vllm.ai/en/stable/models/spec_decode.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference_mlpspeculator.html) | + # Unsupported Features - Beam search -- AWQ quantization - Prefill chunking (mixed-batch inferencing) # Supported Configurations @@ -138,6 +141,7 @@ The following configurations have been validated to be function with Gaudi2 devi - [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) on single HPU or with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling - [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling +- [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) on single HPU or with tensor parallelism on 8x HPU, BF16 datatype # Performance Tuning @@ -365,6 +369,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used. `1` is the default. - `PT_HPU_ENABLE_LAZY_COLLECTIVES` must be set to `true` for tensor parallel inference with HPU Graphs. +- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava model. # Quantization, FP8 Inference and Model Calibration Process @@ -374,7 +379,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM Once you have completed the model calibration process and collected the measurements, you can run FP8 inference with vLLM using the following command: ```bash -export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_measure_g3.json +export QUANT_CONFIG=/path/to/quant/config/inc/meta-llama-3.1-405b-instruct/maxabs_quant_g3.json vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --weights-load-device cpu --tensor_paralel_size 8 ``` @@ -392,10 +397,8 @@ measurements for a given model. The quantization configuration is used during in # Troubleshooting -If you encounter device out-of-memory issues or want to attempt inference with higher batch sizes, try tweaking HPU Graphs as follows: - -- Tweak `gpu_memory_utilization` knob. This will decrease the allocation of KV cache, leaving some headroom for capturing graphs with larger batch size. By default, `gpu_memory_utilization` is set to 0.9. - It attempts to allocate ~90% of HBM left for KV cache after short profiling run. Note that this reduces the number of KV cache blocks you have available, and therefore reduces the effective maximum - number of tokens handled at a given time. -- If this method is not efficient, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. - You can do that by adding `--enforce-eager` flag to the server (for online inference), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). +The following steps address Out of Memory related errors: +- Increase gpu_memory_utilization - This addresses insufficient overall memory. The vLLM pre-allocates HPU cache by using gpu_memory_utilization% of device memory. By default, gpu_memory_utilization is set to 0.9. 
By increasing this utilization, you can provide more KV cache space. +- Decrease max_num_seqs or max_num_batched_tokens - This may reduce the number of concurrent requests in a batch, thereby requiring less KV cache space when overall usable memory is limited. +- Increase tensor_parallel_size - This approach shards model weights, so each GPU has more memory available for KV cache. +- For maximizing memory available for KV cache, you can disable `HPUGraph` completely. With HPU Graphs disabled, you are trading latency and throughput at lower batches for potentially higher throughput on higher batches. You can do that by adding `--enforce-eager` flag to the server (for online inference), or by passing `enforce_eager=True` argument to LLM constructor (for offline inference). diff --git a/SECURITY.md b/SECURITY.md index de0032d26c87b..47196a1f1221e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -4,7 +4,7 @@ If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. -Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/contributing/vulnerability_management/). +Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). --- diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 9d71e4ecc4a37..0612e8778aca5 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -22,6 +22,7 @@ class RequestFuncInput: prompt_len: int output_len: int model: str + model_name: Optional[str] = None best_of: int = 1 logprobs: Optional[int] = None extra_body: Optional[dict] = None @@ -34,6 +35,7 @@ class RequestFuncOutput: generated_text: str = "" success: bool = False latency: float = 0.0 + output_tokens: int = 0 ttft: float = 0.0 # Time to first token itl: List[float] = field( default_factory=list) # List of inter-token latencies @@ -49,7 +51,8 @@ async def async_request_tgi( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: params = { "best_of": request_func_input.best_of, "max_new_tokens": request_func_input.output_len, @@ -78,7 +81,7 @@ async def async_request_tgi( continue chunk_bytes = chunk_bytes.decode("utf-8") - #NOTE: Sometimes TGI returns a ping response without + # NOTE: Sometimes TGI returns a ping response without # any data, we should skip it. 
if chunk_bytes.startswith(":"): continue @@ -121,7 +124,8 @@ async def async_request_trt_llm( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { "accumulate_tokens": True, @@ -155,7 +159,7 @@ async def async_request_trt_llm( timestamp = time.perf_counter() # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -185,7 +189,8 @@ async def async_request_deepspeed_mii( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: assert request_func_input.best_of == 1 payload = { @@ -233,17 +238,23 @@ async def async_request_openai_completions( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "prompt": request_func_input.prompt, "temperature": 0.0, "best_of": request_func_input.best_of, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos if request_func_input.extra_body: payload.update(request_func_input.extra_body) headers = { @@ -254,7 +265,6 @@ async def async_request_openai_completions( output.prompt_len = request_func_input.prompt_len generated_text = "" - ttft = 0.0 st = time.perf_counter() most_recent_timestamp = st try: @@ -269,15 +279,16 @@ async def async_request_openai_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + if chunk != "[DONE]": data = json.loads(chunk) # NOTE: Some completion API might have a last # usage summary response without a token so we # want to check a token was generated - if data["choices"][0]["text"]: + if choices := data.get("choices"): + # Note that text could be empty here + # e.g. for special tokens + text = choices[0].get("text") timestamp = time.perf_counter() # First token if not first_chunk_received: @@ -291,7 +302,10 @@ async def async_request_openai_completions( most_recent_timestamp) most_recent_timestamp = timestamp - generated_text += data["choices"][0]["text"] + generated_text += text or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") if first_chunk_received: output.success = True else: @@ -300,7 +314,7 @@ async def async_request_openai_completions( "Never received a valid chunk to calculate TTFT." 
"This response will be marked as failed!") output.generated_text = generated_text - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False @@ -323,12 +337,14 @@ async def async_request_openai_chat_completions( "chat/completions" ), "OpenAI Chat Completions API URL must end with 'chat/completions'." - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, + timeout=AIOHTTP_TIMEOUT) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) payload = { - "model": request_func_input.model, + "model": request_func_input.model_name \ + if request_func_input.model_name else request_func_input.model, "messages": [ { "role": "user", @@ -338,8 +354,12 @@ async def async_request_openai_chat_completions( "temperature": 0.0, "max_completion_tokens": request_func_input.output_len, "stream": True, - "ignore_eos": request_func_input.ignore_eos, + "stream_options": { + "include_usage": True, + }, } + if request_func_input.ignore_eos: + payload["ignore_eos"] = request_func_input.ignore_eos if request_func_input.extra_body: payload.update(request_func_input.extra_body) headers = { @@ -365,17 +385,15 @@ async def async_request_openai_chat_completions( chunk = chunk_bytes.decode("utf-8").removeprefix( "data: ") - if chunk == "[DONE]": - latency = time.perf_counter() - st - else: + if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) - delta = data["choices"][0]["delta"] - if delta.get("content", None): + if choices := data.get("choices"): + content = choices[0]["delta"].get("content") # First token if ttft == 0.0: - ttft = time.perf_counter() - st + ttft = timestamp - st output.ttft = ttft # Decoding phase @@ -383,13 +401,16 @@ async def async_request_openai_chat_completions( output.itl.append(timestamp - most_recent_timestamp) - generated_text += delta["content"] + generated_text += content or "" + elif usage := data.get("usage"): + output.output_tokens = usage.get( + "completion_tokens") most_recent_timestamp = timestamp output.generated_text = generated_text output.success = True - output.latency = latency + output.latency = most_recent_timestamp - st else: output.error = response.reason or "" output.success = False diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4eb0e1f8ac903..8b3212831e7e0 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,6 +25,7 @@ import argparse import asyncio import base64 +import gc import io import json import os @@ -199,7 +200,7 @@ def sample_sonnet_requests( return sampled_requests -def sample_mmmu_pro_vision_requests( +def sample_vision_arena_requests( dataset, num_requests: int, tokenizer: PreTrainedTokenizerBase, @@ -211,13 +212,7 @@ def sample_mmmu_pro_vision_requests( if len(sampled_requests) == num_requests: break - # MMMU-Pro vision direct prompt - # Ref: https://github.com/MMMU-Benchmark/MMMU/blob/6ce42f4d8f70c1841c67867152648974415b5cac/mmmu-pro/prompts.yaml#L5 - prompt = ( - "Answer with the option letter from the given choices directly. 
" - "The last line of your response should be of the following " - "format: 'Answer: $LETTER' (without quotes) where LETTER is one of " - "options.") + prompt = data["turns"][0][0]['content'] prompt_token_ids = tokenizer(prompt).input_ids if fixed_output_len is None: @@ -229,10 +224,10 @@ def sample_mmmu_pro_vision_requests( output_len = fixed_output_len assert isinstance( - data["image"], + data["images"][0], Image), ("Input image format must be `PIL.Image.Image`, " f"given {type(data['image'])}.") - image: Image = data["image"] + image: Image = data["images"][0] image = image.convert("RGB") image_data = io.BytesIO() image.save(image_data, format='JPEG') @@ -251,7 +246,7 @@ def sample_mmmu_pro_vision_requests( def sample_hf_requests( dataset_path: str, - dataset_subset: str, + dataset_subset: Optional[str], dataset_split: str, num_requests: int, tokenizer: PreTrainedTokenizerBase, @@ -259,19 +254,17 @@ def sample_hf_requests( fixed_output_len: Optional[int] = None, ) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]: - # Special case for MMMU-Pro vision dataset - if dataset_path == 'MMMU/MMMU_Pro' and dataset_subset == 'vision': - assert dataset_split == "test" + # Special case for vision_arena dataset + if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \ + and dataset_subset is None: + assert dataset_split == "train" dataset = load_dataset(dataset_path, name=dataset_subset, split=dataset_split, streaming=True) - assert "image" in dataset.features, ( - "MMMU/MMMU_Pro vision dataset must have 'image' column.") - filter_func = lambda x: isinstance(x["image"], Image) - dataset = dataset.shuffle(seed=random_seed).filter(filter_func) - return sample_mmmu_pro_vision_requests(dataset, num_requests, - tokenizer, fixed_output_len) + dataset = dataset.shuffle(seed=random_seed) + return sample_vision_arena_requests(dataset, num_requests, tokenizer, + fixed_output_len) dataset = load_dataset(dataset_path, name=dataset_subset, @@ -423,7 +416,7 @@ def calculate_metrics( tokenizer: PreTrainedTokenizerBase, selected_percentile_metrics: List[str], selected_percentiles: List[float], - gootput_config_dict: Dict[str, float], + goodput_config_dict: Dict[str, float], ) -> Tuple[BenchmarkMetrics, List[int]]: actual_output_lens: List[int] = [] total_input = 0 @@ -436,19 +429,23 @@ def calculate_metrics( e2els: List[float] = [] for i in range(len(outputs)): if outputs[i].success: - # We use the tokenizer to count the number of output tokens for all - # serving backends instead of looking at len(outputs[i].itl) since - # multiple output tokens may be bundled together - # Note : this may inflate the output token count slightly - output_len = len( - tokenizer(outputs[i].generated_text, - add_special_tokens=False).input_ids) + output_len = outputs[i].output_tokens + + if output_len is None: + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + output_len = len( + tokenizer(outputs[i].generated_text, + add_special_tokens=False).input_ids) actual_output_lens.append(output_len) total_input += input_requests[i][1] tpot = 0 if output_len > 1: - tpot = (outputs[i].latency - outputs[i].ttft) / (output_len - - 1) + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) tpots.append(tpot) # Note: if output_len <= 1, we regard tpot as 0 for goodput 
all_tpots.append(tpot) @@ -459,21 +456,21 @@ def calculate_metrics( else: actual_output_lens.append(0) - if gootput_config_dict: + if goodput_config_dict: valid_metrics = [] slo_values = [] - if "ttft" in gootput_config_dict: + if "ttft" in goodput_config_dict: valid_metrics.append(ttfts) - slo_values.append(gootput_config_dict["ttft"] / + slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION) - if "tpot" in gootput_config_dict: + if "tpot" in goodput_config_dict: valid_metrics.append(all_tpots) - slo_values.append(gootput_config_dict["tpot"] / + slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION) - if "e2el" in gootput_config_dict: + if "e2el" in goodput_config_dict: valid_metrics.append(e2els) - slo_values.append(gootput_config_dict["e2el"] / + slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION) for req_metric in zip(*valid_metrics): @@ -525,6 +522,7 @@ async def benchmark( api_url: str, base_url: str, model_id: str, + model_name: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], logprobs: Optional[int], @@ -536,7 +534,7 @@ async def benchmark( selected_percentile_metrics: List[str], selected_percentiles: List[str], ignore_eos: bool, - gootput_config_dict: Dict[str, float], + goodput_config_dict: Dict[str, float], max_concurrency: Optional[int], ): if backend in ASYNC_REQUEST_FUNCS: @@ -553,6 +551,7 @@ async def benchmark( "Multi-modal content is only supported on 'openai-chat' backend.") test_input = RequestFuncInput( model=model_id, + model_name=model_name, prompt=test_prompt, api_url=api_url, prompt_len=test_prompt_len, @@ -573,6 +572,7 @@ async def benchmark( if profile: print("Starting profiler...") profile_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=test_prompt, api_url=base_url + "/start_profile", prompt_len=test_prompt_len, @@ -616,6 +616,7 @@ async def limited_request_func(request_func_input, pbar): async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request request_func_input = RequestFuncInput(model=model_id, + model_name=model_name, prompt=prompt, api_url=api_url, prompt_len=prompt_len, @@ -657,7 +658,7 @@ async def limited_request_func(request_func_input, pbar): tokenizer=tokenizer, selected_percentile_metrics=selected_percentile_metrics, selected_percentiles=selected_percentiles, - gootput_config_dict=gootput_config_dict, + goodput_config_dict=goodput_config_dict, ) print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) @@ -669,7 +670,7 @@ async def limited_request_func(request_func_input, pbar): metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) - if gootput_config_dict: + if goodput_config_dict: print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", @@ -684,7 +685,7 @@ async def limited_request_func(request_func_input, pbar): "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, "request_goodput:": - metrics.request_goodput if gootput_config_dict else None, + metrics.request_goodput if goodput_config_dict else None, "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -740,11 +741,11 @@ def process_one_metric( def 
check_goodput_args(args): # Check and parse goodput arguments - gootput_config_dict = {} + goodput_config_dict = {} VALID_NAMES = ["ttft", "tpot", "e2el"] if args.goodput: - gootput_config_dict = parse_goodput(args.goodput) - for slo_name, slo_val in gootput_config_dict.items(): + goodput_config_dict = parse_goodput(args.goodput) + for slo_name, slo_val in goodput_config_dict.items(): if slo_name not in VALID_NAMES: raise ValueError( f"Invalid metric name found, {slo_name}: {slo_val}. " @@ -755,22 +756,22 @@ def check_goodput_args(args): f"Invalid value found, {slo_name}: {slo_val}. " "The service level objective value should be " "non-negative.") - return gootput_config_dict + return goodput_config_dict def parse_goodput(slo_pairs): - gootput_config_dict = {} + goodput_config_dict = {} try: for slo_pair in slo_pairs: slo_name, slo_val = slo_pair.split(":") - gootput_config_dict[slo_name] = float(slo_val) + goodput_config_dict[slo_name] = float(slo_val) except ValueError as err: raise argparse.ArgumentTypeError( "Invalid format found for service level objectives. " "Specify service level objectives for goodput as \"KEY:VALUE\" " "pairs, where the key is a metric name, and the value is a " "number in milliseconds.") from err - return gootput_config_dict + return goodput_config_dict def main(args: argparse.Namespace): @@ -780,6 +781,7 @@ def main(args: argparse.Namespace): backend = args.backend model_id = args.model + model_name = args.served_model_name tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model tokenizer_mode = args.tokenizer_mode @@ -869,7 +871,11 @@ def main(args: argparse.Namespace): else: raise ValueError(f"Unknown dataset: {args.dataset_name}") - gootput_config_dict = check_goodput_args(args) + goodput_config_dict = check_goodput_args(args) + + # Avoid GC processing "static" data - reduce pause times. + gc.collect() + gc.freeze() benchmark_result = asyncio.run( benchmark( @@ -877,6 +883,7 @@ def main(args: argparse.Namespace): api_url=api_url, base_url=base_url, model_id=model_id, + model_name=model_name, tokenizer=tokenizer, input_requests=input_requests, logprobs=args.logprobs, @@ -890,7 +897,7 @@ def main(args: argparse.Namespace): float(p) for p in args.metric_percentiles.split(",") ], ignore_eos=args.ignore_eos, - gootput_config_dict=gootput_config_dict, + goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, )) @@ -919,8 +926,8 @@ def main(args: argparse.Namespace): ) # Traffic - result_json["request_rate"] = ( - args.request_rate if args.request_rate < float("inf") else "inf") + result_json["request_rate"] = (args.request_rate if args.request_rate + < float("inf") else "inf") result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency @@ -1222,5 +1229,12 @@ def main(args: argparse.Namespace): 'always use the slow tokenizer. \n* ' '"mistral" will always use the `mistral_common` tokenizer.') + parser.add_argument("--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. 
") + args = parser.parse_args() main(args) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 1d59a01422412..1fa0da75c79d2 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -12,10 +12,10 @@ from vllm.model_executor.layers.fused_moe.fused_moe import * from vllm.platforms import current_platform -from vllm.utils import FlexibleArgumentParser, is_navi +from vllm.utils import FlexibleArgumentParser FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm( -) and not is_navi() else torch.float8_e4m3fn +) else torch.float8_e4m3fn class BenchmarkConfig(TypedDict): diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 14eef00b855ac..219013a38134b 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -98,7 +98,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: start_time = time.perf_counter() # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, + dtype=torch.float32, + device=device) for _ in range(num_iters): if version == "v1": diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 15b09395a889f..1c1c539819d05 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -259,7 +259,7 @@ endmacro() # in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`. # We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is # in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add -# 9.0a to the result. +# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS). # The result is stored in `OUT_CUDA_ARCHS`. # # Example: @@ -270,34 +270,47 @@ endmacro() # function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) + set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) # if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should # remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS set(_CUDA_ARCHS) if ("9.0a" IN_LIST SRC_CUDA_ARCHS) list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") - if ("9.0" IN_LIST TGT_CUDA_ARCHS) + if ("9.0" IN_LIST TGT_CUDA_ARCHS_) + list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0") set(_CUDA_ARCHS "9.0a") endif() endif() list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) - # for each ARCH in CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that is - # less or eqault to ARCH - foreach(_ARCH ${CUDA_ARCHS}) - set(_TMP_ARCH) - foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) - if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) - set(_TMP_ARCH ${_SRC_ARCH}) - else() - break() + # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that + # is less or equal to ARCH (but has the same major version since SASS binary + # compatibility is only forward compatible within the same major version). 
+ foreach(_ARCH ${TGT_CUDA_ARCHS_}) + set(_TMP_ARCH) + # Extract the major version of the target arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}") + foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) + # Extract the major version of the source arch + string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}") + # Check major-version match AND version-less-or-equal + if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) + if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) + set(_TMP_ARCH "${_SRC_ARCH}") + endif() + else() + # If we hit a version greater than the target, we can break + break() + endif() + endforeach() + + # If we found a matching _TMP_ARCH, append it to _CUDA_ARCHS + if (_TMP_ARCH) + list(APPEND _CUDA_ARCHS "${_TMP_ARCH}") endif() endforeach() - if (_TMP_ARCH) - list(APPEND _CUDA_ARCHS ${_TMP_ARCH}) - endif() - endforeach() list(REMOVE_DUPLICATES _CUDA_ARCHS) set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index 563e1438f0b01..eb216dc8baf10 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -105,7 +105,7 @@ __device__ void paged_attention_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float k_scale, const float v_scale, const int tp_rank, + const float* k_scale, const float* v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { const int seq_idx = blockIdx.y; @@ -285,7 +285,7 @@ __device__ void paged_attention_kernel( Quant_vec k_vec_quant = *reinterpret_cast( k_ptr + offset1 * BLOCK_SIZE * x + offset2); k_vecs[j] = fp8::scaled_convert( - k_vec_quant, k_scale); + k_vec_quant, *k_scale); } } @@ -415,7 +415,7 @@ __device__ void paged_attention_kernel( *reinterpret_cast(v_ptr + offset); // Vector conversion from V_quant_vec to V_vec. 
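An aside on the signature change running through these attention and cache kernels: `k_scale` and `v_scale` now arrive as pointers into 1-element float32 device tensors instead of host doubles, so callers construct them as tensors. A minimal host-side sketch, matching the `benchmark_paged_attention.py` change earlier in this diff (the device name here is illustrative):

    import torch

    # Unit scales for an FP8 KV cache, passed as 1-element float32 tensors on
    # the same device as the cache; the kernels read them via data_ptr().
    k_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
    v_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")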
v_vec = fp8::scaled_convert(v_quant_vec, - v_scale); + *v_scale); } if (block_idx == num_seq_blocks - 1) { // NOTE(woosuk): When v_vec contains the tokens that are out of the @@ -513,7 +513,7 @@ __global__ void paged_attention_v1_kernel( const int max_num_blocks_per_seq, const float* __restrict__ alibi_slopes, // [num_heads] const int q_stride, const int kv_block_stride, const int kv_head_stride, - const float k_scale, const float v_scale, const int tp_rank, + const float* k_scale, const float* v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { paged_attention_kernel& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -80,6 +80,8 @@ void paged_attention_v1_launcher( CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* seq_lens_ptr = seq_lens.data_ptr(); + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; int padded_max_seq_len = @@ -177,8 +179,9 @@ void paged_attention_v1( torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index a453b2243e48c..9935359e02fb1 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -37,7 +37,7 @@ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ + kv_block_stride, kv_head_stride, k_scale_ptr, v_scale_ptr, tp_rank, \ blocksparse_local_blocks, blocksparse_vert_stride, \ blocksparse_block_size, blocksparse_head_sliding_step); \ vllm::paged_attention_v2_reduce_kernel& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { + const std::optional& alibi_slopes, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int tp_rank, + const int blocksparse_local_blocks, const int blocksparse_vert_stride, + const int blocksparse_block_size, const int blocksparse_head_sliding_step) { int num_seqs = query.size(0); int 
num_heads = query.size(1); int head_size = query.size(2); @@ -84,6 +84,8 @@ void paged_attention_v2_launcher( CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* seq_lens_ptr = seq_lens.data_ptr(); + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); @@ -188,8 +190,9 @@ void paged_attention_v2( torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { const bool is_block_sparse = (blocksparse_vert_stride > 1); diff --git a/csrc/cache.h b/csrc/cache.h index 11c4c5001daaa..eedad9fafa3c0 100644 --- a/csrc/cache.h +++ b/csrc/cache.h @@ -18,15 +18,15 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, const double k_scale, - const double v_scale); + const std::string& kv_cache_dtype, + torch::Tensor& k_scale, torch::Tensor& v_scale); void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, const std::string& kv_cache_dtype, - const double k_scale, const double v_scale); + torch::Tensor& k_scale, torch::Tensor& v_scale); // Just for unittest void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache, diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu index 8a95279f9a25a..21a0aec0ececc 100644 --- a/csrc/cache_kernels.cu +++ b/csrc/cache_kernels.cu @@ -159,8 +159,8 @@ __global__ void reshape_and_cache_kernel( // block_size] const int64_t* __restrict__ slot_mapping, // [num_tokens] const int key_stride, const int value_stride, const int num_heads, - const int head_size, const int block_size, const int x, const float k_scale, - const float v_scale) { + const int head_size, const int block_size, const int x, + const float* k_scale, const float* v_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; if (slot_idx < 0) { @@ -196,9 +196,9 @@ __global__ void reshape_and_cache_kernel( value_cache[tgt_value_idx] = tgt_value; } else { key_cache[tgt_key_idx] = - fp8::scaled_convert(tgt_key, k_scale); + fp8::scaled_convert(tgt_key, *k_scale); value_cache[tgt_value_idx] = - fp8::scaled_convert(tgt_value, v_scale); + fp8::scaled_convert(tgt_value, *v_scale); } } } @@ -214,7 +214,7 @@ __global__ void reshape_and_cache_flash_kernel( const int64_t* __restrict__ slot_mapping, // [num_tokens] const int block_stride, const int key_stride, const int value_stride, const int num_heads, const int head_size, const int block_size, - const float k_scale, const float v_scale) { + const float* k_scale, const float* v_scale) { const int64_t token_idx = blockIdx.x; const int64_t slot_idx = slot_mapping[token_idx]; // NOTE: slot_idx can be -1 if the token is padded @@ 
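The note above describes the padding contract between `key` and `slot_mapping`, and the `slot_idx < 0` early-exit seen in the kernels is the other half of it. A small illustrative sketch in plain Python (hypothetical names, not the real op interface):

    def write_kv_cache(key_rows, slot_mapping, cache):
        # With CUDA-graph padding (vLLM V1), key_rows may hold more rows than
        # there are scheduled tokens; writes are driven by slot_mapping, and
        # padded positions carry a negative slot index and are skipped.
        for token_idx, slot in enumerate(slot_mapping):
            if slot < 0:
                continue
            cache[slot] = key_rows[token_idx]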
-239,9 +239,9 @@ __global__ void reshape_and_cache_flash_kernel( value_cache[tgt_key_value_idx] = tgt_value; } else { key_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_key, k_scale); + fp8::scaled_convert(tgt_key, *k_scale); value_cache[tgt_key_value_idx] = - fp8::scaled_convert(tgt_value, v_scale); + fp8::scaled_convert(tgt_value, *v_scale); } } } @@ -258,7 +258,9 @@ __global__ void reshape_and_cache_flash_kernel( reinterpret_cast(key_cache.data_ptr()), \ reinterpret_cast(value_cache.data_ptr()), \ slot_mapping.data_ptr(), key_stride, value_stride, \ - num_heads, head_size, block_size, x, k_scale, v_scale); + num_heads, head_size, block_size, x, \ + reinterpret_cast(k_scale.data_ptr()), \ + reinterpret_cast(v_scale.data_ptr())); void reshape_and_cache( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -268,8 +270,8 @@ void reshape_and_cache( torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size] torch::Tensor& slot_mapping, // [num_tokens] - const std::string& kv_cache_dtype, const double k_scale, - const double v_scale) { + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); @@ -299,7 +301,9 @@ void reshape_and_cache( reinterpret_cast(key_cache.data_ptr()), \ reinterpret_cast(value_cache.data_ptr()), \ slot_mapping.data_ptr(), block_stride, key_stride, \ - value_stride, num_heads, head_size, block_size, k_scale, v_scale); + value_stride, num_heads, head_size, block_size, \ + reinterpret_cast(k_scale.data_ptr()), \ + reinterpret_cast(v_scale.data_ptr())); void reshape_and_cache_flash( torch::Tensor& key, // [num_tokens, num_heads, head_size] @@ -308,8 +312,8 @@ void reshape_and_cache_flash( torch::Tensor& value_cache, // [num_blocks, block_size, num_heads, head_size] torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens] - const std::string& kv_cache_dtype, const double k_scale, - const double v_scale) { + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale) { // NOTE(woosuk): In vLLM V1, key.size(0) can be different from // slot_mapping.size(0) because of padding for CUDA graphs. 
// In vLLM V0, key.size(0) is always equal to slot_mapping.size(0) because diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp index 408e736d5bc0f..c2ae554c9f8e8 100644 --- a/csrc/core/scalar_type.hpp +++ b/csrc/core/scalar_type.hpp @@ -32,7 +32,7 @@ class ScalarType { signed_(signed_), bias(bias), finite_values_only(finite_values_only), - nan_repr(nan_repr){}; + nan_repr(nan_repr) {}; static constexpr ScalarType int_(uint8_t size_bits, int32_t bias = 0) { return ScalarType(0, size_bits - 1, true, bias); diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index ef5b14088c63b..b9764056e8a2d 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -460,11 +460,11 @@ void paged_attention_v1( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl", @@ -782,11 +782,11 @@ void paged_attention_v2( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step) { - TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); TORCH_CHECK(blocksparse_vert_stride <= 1, "CPU backend does not support blocksparse attention yet."); VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl", diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp index 31d454328b2c1..e3809acad7453 100644 --- a/csrc/cpu/cache.cpp +++ b/csrc/cpu/cache.cpp @@ -107,10 +107,8 @@ void copy_blocks(std::vector const& key_caches, void reshape_and_cache(torch::Tensor& key, torch::Tensor& value, torch::Tensor& key_cache, torch::Tensor& value_cache, torch::Tensor& slot_mapping, - const std::string& kv_cache_dtype, double k_scale, - double v_scale) { - TORCH_CHECK(k_scale == 1.0f && v_scale == 1.0f); - + const std::string& kv_cache_dtype, + torch::Tensor& k_scale, torch::Tensor& v_scale) { int num_tokens = key.size(0); int num_heads = key.size(1); int head_size = key.size(2); diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 28db0479748bf..a71815106133a 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -2,13 +2,13 @@ #define CPU_TYPES_HPP #if defined(__x86_64__) - //x86 implementation + // x86 implementation #include "cpu_types_x86.hpp" #elif defined(__POWER9_VECTOR__) - //ppc implementation + // ppc implementation #include "cpu_types_vsx.hpp" #elif defined(__aarch64__) - 
//arm implementation + // arm implementation #include "cpu_types_arm.hpp" #else #warning "unsupported vLLM cpu implementation" diff --git a/csrc/cpu/cpu_types_arm.hpp b/csrc/cpu/cpu_types_arm.hpp index ae062a5b86892..990e99f2fc069 100644 --- a/csrc/cpu/cpu_types_arm.hpp +++ b/csrc/cpu/cpu_types_arm.hpp @@ -1,48 +1,50 @@ #include -#include +#include #include namespace vec_op { #ifdef ARM_BF16_SUPPORT - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) #else - #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #endif -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." 
<< std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { - template - constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); - }; -}; +template +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { + (f(std::integral_constant{}), ...); +}; +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }; }; @@ -54,127 +56,124 @@ struct FP16Vec8 : public Vec { float16x8_t reg; - explicit FP16Vec8(const void *ptr) - : reg(vld1q_f16(static_cast(ptr))) {}; + explicit FP16Vec8(const void* ptr) + : reg(vld1q_f16(static_cast(ptr))) {}; - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { - vst1q_f16(static_cast<__fp16 *>(ptr), reg); - } + void save(void* ptr) const { vst1q_f16(static_cast<__fp16*>(ptr), reg); } }; struct FP16Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - float16x8x2_t reg; - - explicit FP16Vec16(const void *ptr) { - reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); - reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); - } - - explicit FP16Vec16(const FP32Vec16& vec); - - void save(void *ptr) const { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + constexpr static int VEC_ELEM_NUM = 16; + + float16x8x2_t reg; + + explicit FP16Vec16(const void* ptr) { + reg.val[0] = vld1q_f16(reinterpret_cast(ptr)); + reg.val[1] = vld1q_f16(reinterpret_cast(ptr) + 8); + } + + explicit FP16Vec16(const FP32Vec16& vec); + + void save(void* ptr) const { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } + + void save(void* ptr, const int elem_num) const { + int full_blocks = elem_num / 8; + int remainder = elem_num % 8; + + if (full_blocks > 0) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); + if (full_blocks > 1) { + vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); + } } - - void save(void *ptr, const int elem_num) const { - int full_blocks = elem_num / 8; - int remainder = elem_num % 8; - - if (full_blocks > 0) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr), reg.val[0]); - if (full_blocks > 1) { - vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]); - } - } - - // Note: below is the unrolled version of the following code: - // - // for (int i = 0; i < remainder; ++i) { - // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = - // vgetq_lane_f16(temp, i); - // } - // - // For macOS build (Clang), the arm/neon intrinsics function - // `vgetq_lane_f16` needs the parameter `i` to be constant at compile - // time. 
- - if (remainder > 0) { - float16x8_t temp = reg.val[full_blocks]; - __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); - switch (remainder) - { - case 1: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - break; - case 2: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - break; - case 3: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - break; - case 4: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - break; - case 5: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - break; - case 6: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - break; - case 7: - fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); - fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); - fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); - fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); - fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); - fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); - fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); - break; - - default: - break; - } - } + + // Note: below is the unrolled version of the following code: + // + // for (int i = 0; i < remainder; ++i) { + // reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = + // vgetq_lane_f16(temp, i); + // } + // + // For macOS build (Clang), the arm/neon intrinsics function + // `vgetq_lane_f16` needs the parameter `i` to be constant at compile + // time. 
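The comment above explains why the tail of the store is unrolled into a switch. The underlying pattern of `save(ptr, elem_num)` is simple; this illustrative Python restatement uses lists in place of the destination buffer and the 16 fp16 lanes:

    def save_partial(dst, lanes, elem_num):
        # Write whole 8-lane blocks first, then the remaining 0-7 lanes one by
        # one; the intrinsic version must unroll this tail because
        # vgetq_lane_f16 requires a compile-time constant lane index.
        full_blocks, remainder = divmod(elem_num, 8)
        for b in range(full_blocks):
            dst[b * 8:(b + 1) * 8] = lanes[b * 8:(b + 1) * 8]
        for i in range(remainder):
            dst[full_blocks * 8 + i] = lanes[full_blocks * 8 + i]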
+ + if (remainder > 0) { + float16x8_t temp = reg.val[full_blocks]; + __fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr); + switch (remainder) { + case 1: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + break; + case 2: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + break; + case 3: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + break; + case 4: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + break; + case 5: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + break; + case 6: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + break; + case 7: + fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0); + fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1); + fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2); + fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3); + fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4); + fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5); + fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6); + break; + + default: + break; + } } + } }; - #ifdef ARM_BF16_SUPPORT struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; bfloat16x8_t reg; - explicit BF16Vec8(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec8(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec8(bfloat16x8_t data) : reg(data) {}; - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - explicit BF16Vec8(float32x4x2_t v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; + explicit BF16Vec8(float32x4x2_t v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1])) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } }; struct BF16Vec16 : public Vec { @@ -182,19 +181,18 @@ struct BF16Vec16 : public Vec { bfloat16x8x2_t reg; - explicit BF16Vec16(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec16(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec16(bfloat16x8x2_t data) : reg(data) {}; - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - explicit BF16Vec16(float32x4x4_t v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3]) - }){}; + explicit BF16Vec16(float32x4x4_t v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[0]), v.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.val[2]), v.val[3])}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { 
*reinterpret_cast(ptr) = reg; }; }; struct BF16Vec32 : public Vec { @@ -202,19 +200,15 @@ struct BF16Vec32 : public Vec { bfloat16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {}; + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {}; explicit BF16Vec32(bfloat16x8x4_t data) : reg(data) {}; - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {}; + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}; - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; }; + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; }; }; #endif @@ -232,11 +226,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(vdupq_n_f32(0.0f)) {}; - explicit FP32Vec4(const float *ptr) : reg(vld1q_f32(ptr)) {}; + explicit FP32Vec4(const float* ptr) : reg(vld1q_f32(ptr)) {}; explicit FP32Vec4(float32x4_t data) : reg(data) {}; - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}; + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}; }; struct FP32Vec8 : public Vec { @@ -252,32 +246,37 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {}; - explicit FP32Vec8(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; + explicit FP32Vec8(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4)}) {}; explicit FP32Vec8(float32x4x2_t data) : reg(data) {}; - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}; + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {}; - explicit FP32Vec8(const FP16Vec8 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); - }; + explicit FP32Vec8(const FP16Vec8& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg)); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg)); + }; - explicit FP32Vec8(float16x8_t v) : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; + explicit FP32Vec8(float16x8_t v) + : reg({vcvt_f32_f16(vget_low_f16(v)), vcvt_f32_f16(vget_high_f16(v))}) {}; - #ifdef ARM_BF16_SUPPORT +#ifdef ARM_BF16_SUPPORT - explicit FP32Vec8(bfloat16x8_t v) : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; + explicit FP32Vec8(bfloat16x8_t v) + : reg({vcvtq_low_f32_bf16(v), vcvtq_high_f32_bf16(v)}) {}; - explicit FP32Vec8(const BF16Vec8 &v) : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; + explicit FP32Vec8(const BF16Vec8& v) + : reg({vcvtq_low_f32_bf16(v.reg), vcvtq_high_f32_bf16(v.reg)}) {}; - #endif +#endif float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop( + [&answer, &ar](int i) { answer += ar.values[i]; }); return answer; } @@ -324,10 +323,14 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; - float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), static_cast(erf(ar.values[1]))}; - float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), static_cast(erf(ar.values[3]))}; - float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), static_cast(erf(ar.values[5]))}; - float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), static_cast(erf(ar.values[7]))}; + float32x2_t er_vec0 = {static_cast(erf(ar.values[0])), + static_cast(erf(ar.values[1]))}; + float32x2_t er_vec1 = {static_cast(erf(ar.values[2])), + static_cast(erf(ar.values[3]))}; + float32x2_t er_vec2 = {static_cast(erf(ar.values[4])), + 
static_cast(erf(ar.values[5]))}; + float32x2_t er_vec3 = {static_cast(erf(ar.values[6])), + static_cast(erf(ar.values[7]))}; float32x4_t result0 = vcombine_f32(er_vec0, er_vec1); float32x4_t result1 = vcombine_f32(er_vec2, er_vec3); @@ -337,25 +340,29 @@ struct FP32Vec8 : public Vec { result.val[1] = result1; return FP32Vec8(result); - } + } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), vmulq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), vaddq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), vsubq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1])})); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), vdivq_f32(reg.val[1], b.reg.val[1])})); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8(float32x4x2_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1])})); } - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); } @@ -370,103 +377,100 @@ struct FP32Vec16 : public Vec { float32x4x4_t reg; - explicit FP32Vec16(float v) : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} + explicit FP32Vec16(float v) + : reg({vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v), vmovq_n_f32(v)}) {} - explicit FP32Vec16() : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0)}) {} + explicit FP32Vec16() + : reg({vmovq_n_f32(0.0), vmovq_n_f32(0.0), vmovq_n_f32(0.0), + vmovq_n_f32(0.0)}) {} - explicit FP32Vec16(const float *ptr) : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), vld1q_f32(ptr + 12)}) {} + explicit FP32Vec16(const float* ptr) + : reg({vld1q_f32(ptr), vld1q_f32(ptr + 4), vld1q_f32(ptr + 8), + vld1q_f32(ptr + 12)}) {} explicit FP32Vec16(float32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec8 &data) { - reg.val[0] = data.reg.val[0]; - reg.val[1] = data.reg.val[1]; - reg.val[2] = data.reg.val[0]; - reg.val[3] = data.reg.val[1]; + explicit FP32Vec16(const FP32Vec8& data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v.reg)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v.reg)) {} - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(bfloat16x8x2_t v) : reg({ - vcvtq_low_f32_bf16(v.val[0]), - vcvtq_high_f32_bf16(v.val[0]), - vcvtq_low_f32_bf16(v.val[1]), - vcvtq_high_f32_bf16(v.val[1]) - }) {}; - #endif +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(bfloat16x8x2_t v) + : reg({vcvtq_low_f32_bf16(v.val[0]), vcvtq_high_f32_bf16(v.val[0]), + 
vcvtq_low_f32_bf16(v.val[1]), vcvtq_high_f32_bf16(v.val[1])}) {}; +#endif - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; }; - #ifdef ARM_BF16_SUPPORT - explicit FP32Vec16(const BF16Vec16 &v) : reg({ - vcvtq_low_f32_bf16(v.reg.val[0]), - vcvtq_high_f32_bf16(v.reg.val[0]), - vcvtq_low_f32_bf16(v.reg.val[1]), - vcvtq_high_f32_bf16(v.reg.val[1]) - }) {}; - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}; - #endif - - explicit FP32Vec16(const FP16Vec16 &v) { - reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); - reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); - reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); - reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); +#ifdef ARM_BF16_SUPPORT + explicit FP32Vec16(const BF16Vec16& v) + : reg({vcvtq_low_f32_bf16(v.reg.val[0]), + vcvtq_high_f32_bf16(v.reg.val[0]), + vcvtq_low_f32_bf16(v.reg.val[1]), + vcvtq_high_f32_bf16(v.reg.val[1])}) {}; + + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}; +#endif + + explicit FP32Vec16(const FP16Vec16& v) { + reg.val[0] = vcvt_f32_f16(vget_low_f16(v.reg.val[0])); + reg.val[1] = vcvt_f32_f16(vget_high_f16(v.reg.val[0])); + reg.val[2] = vcvt_f32_f16(vget_low_f16(v.reg.val[1])); + reg.val[3] = vcvt_f32_f16(vget_high_f16(v.reg.val[1])); }; - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vaddq_f32(reg.val[0], b.reg.val[0]), - vaddq_f32(reg.val[1], b.reg.val[1]), - vaddq_f32(reg.val[2], b.reg.val[2]), - vaddq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vaddq_f32(reg.val[0], b.reg.val[0]), + vaddq_f32(reg.val[1], b.reg.val[1]), + vaddq_f32(reg.val[2], b.reg.val[2]), + vaddq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vmulq_f32(reg.val[0], b.reg.val[0]), - vmulq_f32(reg.val[1], b.reg.val[1]), - vmulq_f32(reg.val[2], b.reg.val[2]), - vmulq_f32(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vmulq_f32(reg.val[0], b.reg.val[0]), + vmulq_f32(reg.val[1], b.reg.val[1]), + vmulq_f32(reg.val[2], b.reg.val[2]), + vmulq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vsubq_f32(reg.val[0], b.reg.val[0]), - vsubq_f32(reg.val[1], b.reg.val[1]), - vsubq_f32(reg.val[2], b.reg.val[2]), - vsubq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vsubq_f32(reg.val[0], b.reg.val[0]), + vsubq_f32(reg.val[1], b.reg.val[1]), + vsubq_f32(reg.val[2], b.reg.val[2]), + vsubq_f32(reg.val[3], b.reg.val[3])})); }; - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(float32x4x4_t({ - vdivq_f32(reg.val[0], b.reg.val[0]), - vdivq_f32(reg.val[1], b.reg.val[1]), - vdivq_f32(reg.val[2], b.reg.val[2]), - vdivq_f32(reg.val[3], b.reg.val[3]) - })); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(float32x4x4_t({vdivq_f32(reg.val[0], b.reg.val[0]), + vdivq_f32(reg.val[1], b.reg.val[1]), + vdivq_f32(reg.val[2], b.reg.val[2]), + vdivq_f32(reg.val[3], b.reg.val[3])})); }; float reduce_sum() const { AliasReg ar; ar.reg = reg; float answer = 0; - unroll_loop([&answer, &ar](int i) { answer += ar.values[i]; }); + unroll_loop( + [&answer, 
&ar](int i) { answer += ar.values[i]; }); return answer; }; - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -479,7 +483,7 @@ struct FP32Vec16 : public Vec { return answer; }; - void save(float *ptr) const { + void save(float* ptr) const { vst1q_f32(ptr, reg.val[0]); vst1q_f32(ptr + 4, reg.val[1]); vst1q_f32(ptr + 8, reg.val[2]); @@ -487,43 +491,59 @@ struct FP32Vec16 : public Vec { }; }; -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = FP16Vec8; }; +template <> +struct VecType { + using vec_type = FP16Vec8; +}; #ifdef ARM_BF16_SUPPORT -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; #endif -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<__fp16 *>(ptr) = v; +template <> +inline void storeFP32(float v, c10::Half* ptr) { + *reinterpret_cast<__fp16*>(ptr) = v; } -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) { - float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); - float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); - float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); - float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) { + float16x4_t low_0 = vcvt_f16_f32(v.reg.val[0]); + float16x4_t high_0 = vcvt_f16_f32(v.reg.val[1]); + float16x4_t low_1 = vcvt_f16_f32(v.reg.val[2]); + float16x4_t high_1 = vcvt_f16_f32(v.reg.val[3]); - reg.val[0] = vcombine_f16(low_0, high_0); - reg.val[1] = vcombine_f16(low_1, high_1); + reg.val[0] = vcombine_f16(low_0, high_0); + reg.val[1] = vcombine_f16(low_1, high_1); }; -inline FP16Vec8 :: FP16Vec8(const FP32Vec8 &v) { - float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); - float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); +inline FP16Vec8 ::FP16Vec8(const FP32Vec8& v) { + float16x4_t lower_half = vcvt_f16_f32(v.reg.val[0]); + float16x4_t upper_half = vcvt_f16_f32(v.reg.val[1]); - reg = vcombine_f16(lower_half, upper_half); + reg = vcombine_f16(lower_half, upper_half); }; -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc.reg.val[0] = vfmaq_f32(acc.reg.val[0], a.reg.val[0], b.reg.val[0]); acc.reg.val[1] = vfmaq_f32(acc.reg.val[1], a.reg.val[1], b.reg.val[1]); acc.reg.val[2] = vfmaq_f32(acc.reg.val[2], a.reg.val[2], b.reg.val[2]); @@ -531,8 +551,7 @@ inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { }; #ifdef ARM_BF16_SUPPORT -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { float32x4_t a0_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[0])); float32x4_t a0_high = vcvt_f32_bf16(vget_high_bf16(a.reg.val[0])); float32x4_t a1_low = vcvt_f32_bf16(vget_low_bf16(a.reg.val[1])); @@ -551,22 +570,22 @@ inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { #endif #ifdef ARM_BF16_SUPPORT -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) {}; - 
-inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) : reg({ - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), - vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), v.reg.val[3]) - }){}; +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) + : reg(vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1])) { + }; + +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) + : reg({vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[0]), v.reg.val[1]), + vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(v.reg.val[2]), + v.reg.val[3])}) {}; #endif -inline void prefetch(const void *addr) { - __builtin_prefetch(addr, 0, 1); -}; +inline void prefetch(const void* addr) { __builtin_prefetch(addr, 0, 1); }; #ifdef ARM_BF16_SUPPORT template <> -inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bf16 *>(ptr) = vcvth_bf16_f32(v); +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bf16*>(ptr) = vcvth_bf16_f32(v); }; #endif -}; \ No newline at end of file +}; // namespace vec_op \ No newline at end of file diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp index b50bdadc5713d..a8e1be37eb418 100644 --- a/csrc/cpu/cpu_types_vsx.hpp +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -9,38 +9,40 @@ namespace vec_op { // FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; + #define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; + #define CPU_KERNEL_GUARD_OUT(NAME) \ + std::cout << #NAME << " exit." 
<< std::endl; #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { (f(std::integral_constant{}), ...); } -}; // namespace +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -68,12 +70,14 @@ struct BF16Vec8 : public Vec { __vector signed short reg; - explicit BF16Vec8(const void *ptr) - : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } + void save(void* ptr) const { + *reinterpret_cast<__vector signed short*>(ptr) = reg; + } }; struct BF16Vec16 : public Vec { @@ -81,18 +85,18 @@ struct BF16Vec16 : public Vec { ss16x8x2_t reg; - explicit BF16Vec16(const void *ptr) { + explicit BF16Vec16(const void* ptr) { // Load 256 bits in two parts - reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); - reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr); } - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { + void save(void* ptr) const { // Save 256 bits in two parts - vec_xst(reg.val[0], 0, (signed short *)ptr); - vec_xst(reg.val[1], 16, (signed short *)ptr); + vec_xst(reg.val[0], 0, (signed short*)ptr); + vec_xst(reg.val[1], 16, (signed short*)ptr); } }; @@ -102,19 +106,15 @@ struct BF16Vec32 : public Vec { constexpr static int VEC_ELEM_NUM = 32; ss16x8x4_t reg; - explicit BF16Vec32(const void *ptr) - : reg(*reinterpret_cast(ptr)) {} + explicit BF16Vec32(const void* ptr) + : reg(*reinterpret_cast(ptr)) {} explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} - explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ - vec8_data.reg, - vec8_data.reg, - vec8_data.reg, - vec8_data.reg - }) {} + explicit BF16Vec32(const BF16Vec8& vec8_data) + : reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {} - void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast(ptr) = reg; } }; struct FP32Vec4 : public Vec { @@ -130,11 +130,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(vec_splats(0.0f)) {} - explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {} explicit FP32Vec4(__vector float data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec { @@ -156,19 +156,19 @@ struct FP32Vec8 : public Vec { reg.val[1] = vec_splats(0.0f); } - explicit FP32Vec8(const float *ptr) { + explicit FP32Vec8(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); } explicit FP32Vec8(f32x4x2_t data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) { + explicit FP32Vec8(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = 
data.reg.val[1]; } - explicit FP32Vec8(const BF16Vec8 &v) { + explicit FP32Vec8(const BF16Vec8& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); reg.val[1] = (__vector float)vec_mergel(zero, v.reg); } @@ -177,7 +177,8 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -230,23 +231,27 @@ struct FP32Vec8 : public Vec { return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); } - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator*(const FP32Vec8& b) const { + return FP32Vec8( + {vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator+(const FP32Vec8& b) const { + return FP32Vec8( + {vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator-(const FP32Vec8& b) const { + return FP32Vec8( + {vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); } - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + FP32Vec8 operator/(const FP32Vec8& b) const { + return FP32Vec8( + {vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); } @@ -275,7 +280,7 @@ struct FP32Vec16 : public Vec { reg.val[3] = vec_splats(0.0f); } - explicit FP32Vec16(const float *ptr) { + explicit FP32Vec16(const float* ptr) { reg.val[0] = vec_xl(0, ptr); reg.val[1] = vec_xl(16, ptr); reg.val[2] = vec_xl(32, ptr); @@ -284,78 +289,76 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(f32x4x4_t data) : reg(data) {} - explicit FP32Vec16(const FP32Vec16 &data) { + explicit FP32Vec16(const FP32Vec16& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[2]; reg.val[3] = data.reg.val[3]; } - explicit FP32Vec16(const FP32Vec4 &data) { + explicit FP32Vec16(const FP32Vec4& data) { reg.val[0] = data.reg; reg.val[1] = data.reg; reg.val[2] = data.reg; reg.val[3] = data.reg; } - explicit FP32Vec16(const FP32Vec8 &data) { + explicit FP32Vec16(const FP32Vec8& data) { reg.val[0] = data.reg.val[0]; reg.val[1] = data.reg.val[1]; reg.val[2] = data.reg.val[0]; reg.val[3] = data.reg.val[1]; } - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]); reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_mul(reg.val[0], b.reg.val[0]), - vec_mul(reg.val[1], b.reg.val[1]), - vec_mul(reg.val[2], b.reg.val[2]), - vec_mul(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator*(const FP32Vec16& b) const { + return 
FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_add(reg.val[0], b.reg.val[0]), - vec_add(reg.val[1], b.reg.val[1]), - vec_add(reg.val[2], b.reg.val[2]), - vec_add(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator+(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_sub(reg.val[0], b.reg.val[0]), - vec_sub(reg.val[1], b.reg.val[1]), - vec_sub(reg.val[2], b.reg.val[2]), - vec_sub(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator-(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); } - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(f32x4x4_t({ - vec_div(reg.val[0], b.reg.val[0]), - vec_div(reg.val[1], b.reg.val[1]), - vec_div(reg.val[2], b.reg.val[2]), - vec_div(reg.val[3], b.reg.val[3])})); + FP32Vec16 operator/(const FP32Vec16& b) const { + return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); } float reduce_sum() const { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); AliasReg ar; @@ -368,7 +371,7 @@ struct FP32Vec16 : public Vec { return result; } - void save(float *ptr) const { + void save(float* ptr) const { vec_xst(reg.val[0], 0, ptr); vec_xst(reg.val[1], 16, ptr); vec_xst(reg.val[2], 32, ptr); @@ -376,43 +379,62 @@ struct FP32Vec16 : public Vec { } }; -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); *ptr = *(v_ptr + 1); } #ifndef __VEC_CLASS_FP_NAN -#define __VEC_CLASS_FP_NAN (1 << 6) + #define __VEC_CLASS_FP_NAN (1 << 6) #endif -const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +const static __vector unsigned char omask = {0, 1, 4, 5, 8, 9, 12, 13, + 16, 17, 
20, 21, 24, 25, 28, 29}; #ifndef _ARCH_PWR10 -const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; -const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; -const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; -const static __vector unsigned int one = { 1, 1, 1, 1 }; +const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff, + 0x00007fff}; +const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000, + 0x7fc00000}; +const static __vector unsigned int sh16 = {16, 16, 16, 16}; +const static __vector unsigned int one = {1, 1, 1, 1}; #endif -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) { #ifdef _ARCH_PWR10 __vector signed short ret[2]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); reg = vec_perm(ret[0], ret[1], omask); #elif defined(_ARCH_PWR9) __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); @@ -425,8 +447,10 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { __vector unsigned int rnd1 = vec_add(lsb1, bias); inp0 = vec_add(inp0, rnd0); inp1 = vec_add(inp1, rnd1); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp0 = vec_sr(inp0, sh16); @@ -435,13 +459,17 @@ inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { #endif } -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { #ifdef _ARCH_PWR10 __vector signed short ret[4]; - ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); - ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); - ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); - ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16( + (__vector unsigned char)v.reg.val[3]); reg.val[0] = vec_perm(ret[0], ret[1], omask); reg.val[1] = vec_perm(ret[2], ret[3], omask); #elif defined(_ARCH_PWR9) @@ -465,10 +493,14 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { inp1 = vec_add(inp1, rnd1); inp2 = vec_add(inp2, rnd2); inp3 = vec_add(inp3, rnd3); - __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); - __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); - __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); - __vector __bool int sel3 = 
vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + __vector __bool int sel0 = + vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = + vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = + vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = + vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); inp0 = vec_sel(inp0, nan, sel0); inp1 = vec_sel(inp1, nan, sel1); inp2 = vec_sel(inp2, nan, sel2); @@ -482,10 +514,10 @@ inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { #endif } -inline void prefetch(const void *addr) { +inline void prefetch(const void* addr) { __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 4bb4eb0f491ac..a4ef2be2a58ca 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,39 +11,40 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) #ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) + #define CPU_KERNEL_GUARD_OUT(NAME) #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - RECORD_FUNCTION(#NAME, c10::ArrayRef({})); -#define CPU_KERNEL_GUARD_OUT(NAME) + #define CPU_KERNEL_GUARD_IN(NAME) \ + RECORD_FUNCTION(#NAME, c10::ArrayRef({})); + #define CPU_KERNEL_GUARD_OUT(NAME) #endif #define FORCE_INLINE __attribute__((always_inline)) inline namespace { template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { +constexpr void unroll_loop_item(std::integer_sequence, F&& f) { (f(std::integral_constant{}), ...); } -}; // namespace +}; // namespace template >> -constexpr void unroll_loop(F &&f) { +constexpr void unroll_loop(F&& f) { unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); } -template struct Vec { +template +struct Vec { constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } }; @@ -55,12 +56,12 @@ struct FP16Vec8 : public Vec { __m128i reg; - explicit FP16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit FP16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit FP16Vec8(const FP32Vec8 &); + explicit FP16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct FP16Vec16 : public Vec { @@ -68,12 +69,12 @@ struct FP16Vec16 : public Vec { __m256i reg; - explicit FP16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit FP16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit FP16Vec16(const FP32Vec16 &); + explicit FP16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) 
const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -87,12 +88,12 @@ struct BF16Vec8 : public Vec { __m128i reg; - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + explicit BF16Vec8(const void* ptr) + : reg((__m128i)_mm_loadu_si128((__m128i*)ptr)) {} - explicit BF16Vec8(const FP32Vec8 &); + explicit BF16Vec8(const FP32Vec8&); - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m128i*>(ptr) = reg; } }; struct BF16Vec16 : public Vec { @@ -100,12 +101,12 @@ struct BF16Vec16 : public Vec { __m256i reg; - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + explicit BF16Vec16(const void* ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {} - explicit BF16Vec16(const FP32Vec16 &); + explicit BF16Vec16(const FP32Vec16&); - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m256i*>(ptr) = reg; } void save(void* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -120,11 +121,11 @@ struct BF16Vec32 : public Vec { __m512i reg; - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + explicit BF16Vec32(const void* ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} explicit BF16Vec32(__m512i data) : reg(data) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg((__m512i)_mm512_inserti32x4( _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( (__m128i)vec8_data.reg), @@ -132,7 +133,7 @@ struct BF16Vec32 : public Vec { (__m128i)vec8_data.reg, 2), (__m128i)vec8_data.reg, 3)) {} - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } + void save(void* ptr) const { *reinterpret_cast<__m512i*>(ptr) = reg; } }; #else struct BF16Vec32 : public Vec { @@ -141,24 +142,24 @@ struct BF16Vec32 : public Vec { __m256i reg_low; __m256i reg_high; - explicit BF16Vec32(const void *ptr) - : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), - reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + explicit BF16Vec32(const void* ptr) + : reg_low(_mm256_loadu_si256((__m256i const*)ptr)), + reg_high(_mm256_loadu_si256((__m256i const*)ptr + 1)) {} - explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), - reg_high(high) {} + explicit BF16Vec32(__m256i low, __m256i high) + : reg_low(low), reg_high(high) {} - explicit BF16Vec32(BF16Vec8 &vec8_data) + explicit BF16Vec32(BF16Vec8& vec8_data) : reg_low((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)), + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), reg_high((__m256i)_mm256_inserti32x4( - _mm256_castsi128_si256((__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1)) {} + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} - void save(void *ptr) const { - *reinterpret_cast<__m256i *>(ptr) = reg_low; - *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + void save(void* ptr) const { + *reinterpret_cast<__m256i*>(ptr) = reg_low; + *reinterpret_cast<__m256i*>((__m256i*)ptr + 1) = reg_high; } }; #endif @@ -176,11 +177,11 @@ struct FP32Vec4 : public Vec { explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + explicit FP32Vec4(const float* ptr) : reg(_mm_loadu_ps(ptr)) 
{} explicit FP32Vec4(__m128 data) : reg(data) {} - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} + explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {} }; struct FP32Vec8 : public Vec { @@ -196,15 +197,15 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + explicit FP32Vec8(const float* ptr) : reg(_mm256_loadu_ps(ptr)) {} explicit FP32Vec8(__m256 data) : reg(data) {} - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + explicit FP32Vec8(const FP32Vec8& data) : reg(data.reg) {} - explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} + explicit FP32Vec8(const FP16Vec8& v) : reg(_mm256_cvtph_ps(v.reg)) {} - explicit FP32Vec8(const BF16Vec8 &v) + explicit FP32Vec8(const BF16Vec8& v) : reg(_mm256_castsi256_ps( _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} @@ -212,7 +213,8 @@ struct FP32Vec8 : public Vec { AliasReg ar; ar.reg = reg; float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + unroll_loop( + [&result, &ar](int i) { result += ar.values[i]; }); return result; } @@ -244,27 +246,27 @@ struct FP32Vec8 : public Vec { erf(ar.values[1]), erf(ar.values[0]))); } - FP32Vec8 operator*(const FP32Vec8 &b) const { + FP32Vec8 operator*(const FP32Vec8& b) const { return FP32Vec8(_mm256_mul_ps(reg, b.reg)); } - FP32Vec8 operator+(const FP32Vec8 &b) const { + FP32Vec8 operator+(const FP32Vec8& b) const { return FP32Vec8(_mm256_add_ps(reg, b.reg)); } - FP32Vec8 operator-(const FP32Vec8 &b) const { + FP32Vec8 operator-(const FP32Vec8& b) const { return FP32Vec8(_mm256_sub_ps(reg, b.reg)); } - FP32Vec8 operator/(const FP32Vec8 &b) const { + FP32Vec8 operator/(const FP32Vec8& b) const { return FP32Vec8(_mm256_div_ps(reg, b.reg)); } - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg); } }; #ifdef __AVX512F__ -struct INT32Vec16: public Vec { +struct INT32Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m512i reg; @@ -272,12 +274,11 @@ struct INT32Vec16: public Vec { }; __m512i reg; - - explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {} - void save(int32_t* ptr) const { - _mm512_storeu_epi32(ptr, reg); - } + explicit INT32Vec16(const void* data_ptr) + : reg(_mm512_loadu_epi32(data_ptr)) {} + + void save(int32_t* ptr) const { _mm512_storeu_epi32(ptr, reg); } void save(int32_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -301,11 +302,11 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {} explicit FP32Vec16(__m512 data) : reg(data) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg((__m512)_mm512_inserti32x4( _mm512_inserti32x4( _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), @@ -313,36 +314,37 @@ struct FP32Vec16 : public Vec { (__m128i)data.reg, 2), (__m128i)data.reg, 3)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg((__m512)_mm512_inserti32x8( _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - explicit FP32Vec16(const BF16Vec16 &v) + explicit FP32Vec16(const BF16Vec16& v) : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - explicit 
FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + explicit FP32Vec16(const FP16Vec16& v) : reg(_mm512_cvtph_ps(v.reg)) {} - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const INT32Vec16 &v) - : reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {} + explicit FP32Vec16(const INT32Vec16& v) + : reg(_mm512_cvt_roundepi32_ps( + v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm512_mul_ps(reg, b.reg)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm512_add_ps(reg, b.reg)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm512_sub_ps(reg, b.reg)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm512_div_ps(reg, b.reg)); } @@ -370,9 +372,7 @@ struct FP32Vec16 : public Vec { return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg)); } - FP32Vec16 abs() const { - return FP32Vec16(_mm512_abs_ps(reg)); - } + FP32Vec16 abs() const { return FP32Vec16(_mm512_abs_ps(reg)); } float reduce_sum() const { return _mm512_reduce_add_ps(reg); } @@ -380,14 +380,15 @@ struct FP32Vec16 : public Vec { float reduce_min() const { return _mm512_reduce_min_ps(reg); } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); return _mm512_mask_reduce_add_ps(mask, reg); } - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } + void save(float* ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -407,32 +408,30 @@ struct FP32Vec16 : public Vec { __m256 reg_low; __m256 reg_high; - explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), - reg_high(_mm256_set1_ps(v)) {} + explicit FP32Vec16(float v) + : reg_low(_mm256_set1_ps(v)), reg_high(_mm256_set1_ps(v)) {} - explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), - reg_high(_mm256_set1_ps(0.0)) {} + explicit FP32Vec16() + : reg_low(_mm256_set1_ps(0.0)), reg_high(_mm256_set1_ps(0.0)) {} - explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), - reg_high(_mm256_loadu_ps(ptr + 8)) {} + explicit FP32Vec16(const float* ptr) + : reg_low(_mm256_loadu_ps(ptr)), reg_high(_mm256_loadu_ps(ptr + 8)) {} explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} - explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), - reg_high(data.reg_high) {} + explicit FP32Vec16(const FP32Vec16& data) + : reg_low(data.reg_low), reg_high(data.reg_high) {} - explicit FP32Vec16(const FP32Vec4 &data) + explicit FP32Vec16(const FP32Vec4& data) : reg_low((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)), + _mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)), reg_high((__m256)_mm256_inserti128_si256( - _mm256_castsi128_si256((__m128i)data.reg), - (__m128i)data.reg, 1)) {} + 
_mm256_castsi128_si256((__m128i)data.reg), (__m128i)data.reg, 1)) {} - explicit FP32Vec16(const FP32Vec8 &data) + explicit FP32Vec16(const FP32Vec8& data) : reg_low(data.reg), reg_high(data.reg) {} - explicit FP32Vec16(const FP16Vec16 &v) { + explicit FP32Vec16(const FP16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -440,9 +439,9 @@ struct FP32Vec16 : public Vec { reg_high = _mm256_cvtph_ps(high); } - explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const FP16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - explicit FP32Vec16(const BF16Vec16 &v) { + explicit FP32Vec16(const BF16Vec16& v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); @@ -456,24 +455,24 @@ struct FP32Vec16 : public Vec { reg_high = _mm256_castsi256_ps(v_high_shifted); } - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {} - FP32Vec16 operator*(const FP32Vec16 &b) const { + FP32Vec16 operator*(const FP32Vec16& b) const { return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), _mm256_mul_ps(reg_high, b.reg_high)); } - FP32Vec16 operator+(const FP32Vec16 &b) const { + FP32Vec16 operator+(const FP32Vec16& b) const { return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), _mm256_add_ps(reg_high, b.reg_high)); } - FP32Vec16 operator-(const FP32Vec16 &b) const { + FP32Vec16 operator-(const FP32Vec16& b) const { return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), _mm256_sub_ps(reg_high, b.reg_high)); } - FP32Vec16 operator/(const FP32Vec16 &b) const { + FP32Vec16 operator/(const FP32Vec16& b) const { return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), _mm256_div_ps(reg_high, b.reg_high)); } @@ -484,7 +483,8 @@ struct FP32Vec16 : public Vec { return low.reduce_sum() + high.reduce_sum(); } - template float reduce_sub_sum(int idx) { + template + float reduce_sub_sum(int idx) { float sum = 0.0; static_assert(VEC_ELEM_NUM % group_size == 0); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); @@ -507,7 +507,7 @@ struct FP32Vec16 : public Vec { return sum; } - void save(float *ptr) const { + void save(float* ptr) const { _mm256_storeu_ps(ptr, reg_low); _mm256_storeu_ps(ptr + 8, reg_high); } @@ -515,7 +515,7 @@ struct FP32Vec16 : public Vec { #endif #ifdef __AVX512F__ -struct INT8Vec16: public Vec { +struct INT8Vec16 : public Vec { constexpr static int VEC_ELEM_NUM = 16; union AliasReg { __m128i reg; @@ -523,14 +523,12 @@ struct INT8Vec16: public Vec { }; __m128i reg; - - explicit INT8Vec16(const FP32Vec16& vec) : reg( - _mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) - ) {} - void save(int8_t* ptr) const { - _mm_storeu_epi8(ptr, reg); - } + explicit INT8Vec16(const FP32Vec16& vec) + : reg(_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32( + vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))) {} + + void save(int8_t* ptr) const { _mm_storeu_epi8(ptr, reg); } void save(int8_t* ptr, const int elem_num) const { constexpr uint32_t M = 0xFFFFFFFF; @@ -540,71 +538,92 @@ struct INT8Vec16: public Vec { }; #endif -template struct VecType { using vec_type = void; }; +template +struct VecType { + using vec_type = void; +}; -template using vec_t = typename VecType::vec_type; +template +using vec_t = typename VecType::vec_type; -template <> struct VecType { using vec_type = FP32Vec8; }; +template <> +struct VecType { + using vec_type = FP32Vec8; +}; -template 
<> struct VecType { using vec_type = FP16Vec8; }; +template <> +struct VecType { + using vec_type = FP16Vec8; +}; -template <> struct VecType { using vec_type = BF16Vec8; }; +template <> +struct VecType { + using vec_type = BF16Vec8; +}; -template void storeFP32(float v, T *ptr) { *ptr = v; } +template +void storeFP32(float v, T* ptr) { + *ptr = v; +} -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { +inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) { acc = acc + a * b; } -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast(ptr) = +template <> +inline void storeFP32(float v, c10::Half* ptr) { + *reinterpret_cast(ptr) = _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); } -inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) +inline FP16Vec8::FP16Vec8(const FP32Vec8& v) : reg(_mm256_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #ifdef __AVX512F__ -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) : reg(_mm512_cvtps_ph(v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} #else -inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) - : reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +inline FP16Vec16::FP16Vec16(const FP32Vec16& v) + : reg(_mm256_insertf128_si256( + _mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), + FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} #endif #ifdef __AVX512BF16__ -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + *reinterpret_cast<__bfloat16*>(ptr) = _mm_cvtness_sbh(v); } -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { +inline void fma(FP32Vec16& acc, BF16Vec32& a, BF16Vec32& b) { acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); } #else -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); +template <> +inline void storeFP32(float v, c10::BFloat16* ptr) { + c10::BFloat16 __attribute__((__may_alias__))* v_ptr = + reinterpret_cast(&v); *ptr = *(v_ptr + 1); } -#ifdef __AVX512F__ -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + #ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(_mm256_cvtepi32_epi16( _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) : reg(_mm512_cvtepi32_epi16( _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#else -namespace{ + #else +namespace { __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { __m256i ai = _mm256_castps_si256(a); ai = _mm256_srli_epi32(ai, 16); @@ -612,21 +631,21 @@ __m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { ai = _mm256_permute4x64_epi64(ai, 0b00111001); return _mm256_extracti128_si256(ai, 0); } -} +} // namespace -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) +inline BF16Vec8::BF16Vec8(const FP32Vec8& v) : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +inline BF16Vec16::BF16Vec16(const FP32Vec16& v) { BF16Vec8 low 
= BF16Vec8(FP32Vec8(v.reg_low)); BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); } -#endif // __AVX512F__ -#endif // __AVX512BF16__ + #endif // __AVX512F__ +#endif // __AVX512BF16__ -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } +inline void prefetch(const void* addr) { _mm_prefetch(addr, _MM_HINT_T1); } -}; // namespace vec_op +}; // namespace vec_op #endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 74e4d8189d403..5d1c5f4c83d3e 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -30,7 +30,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -44,7 +44,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -148,7 +148,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " Tensor k_scale, Tensor v_scale) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache); } diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp new file mode 100644 index 0000000000000..e8555d853b7ac --- /dev/null +++ b/csrc/cumem_allocator.cpp @@ -0,0 +1,310 @@ +// A CUDAPluggableAllocator based on cumem* APIs. +// Important: allocation size, CUdeviceptr and CUmemGenericAllocationHandle* +// need to be unsigned long long +#include + +extern "C" { + +#define PY_SSIZE_T_CLEAN +#include + +#include +#include +#include + +#define CUDA_CHECK(condition) \ + do { \ + CUresult error = condition; \ + if (error != 0) { \ + char* error_string; \ + cuGetErrorString(error, (const char**)&error_string); \ + std::cerr << "CUDA Error: " << error_string << " at " << __FILE__ << ":" \ + << __LINE__ << std::endl; \ + } \ + } while (0) + +// Global references to Python callables +// NOTE: this is borrowed reference, so we don't need to DECREF them. +// This brings the limitation that the allocator needs to be singleton. +static PyObject* g_python_malloc_callback = nullptr; +static PyObject* g_python_free_callback = nullptr; + +// --------------------------------------------------------------------------- +// Helper functions: + +void ensure_context(unsigned long long device) { + CUcontext pctx; + CUDA_CHECK(cuCtxGetCurrent(&pctx)); + if (!pctx) { + // Ensure device context. 
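// The malloc/free callbacks can be invoked on threads that have never touched
// the CUDA runtime, in which case cuCtxGetCurrent() reports no current context
// while the driver-API calls used below (cuMemCreate, cuMemMap, cuMemSetAccess)
// need one. Retaining the device's primary context, the same context the CUDA
// runtime binds, and making it current covers that case. A teardown path, if
// one were ever added, would presumably balance the retain with something like
//   CUDA_CHECK(cuDevicePrimaryCtxRelease(device));
// but here the context is simply kept for the life of the process.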
+ CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device)); + CUDA_CHECK(cuCtxSetCurrent(pctx)); + } +} + +void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + ensure_context(device); + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Allocate memory using cuMemCreate + CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + CUDA_CHECK(cuMemMap(d_mem, size, 0, *p_memHandle, 0)); + + CUmemAccessDesc accessDesc = {}; + accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + accessDesc.location.id = device; + accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + + CUDA_CHECK(cuMemSetAccess(d_mem, size, &accessDesc, 1)); + // std::cout << "create_and_map: device=" << device << ", size=" << size << ", + // d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; +} + +void unmap_and_release(unsigned long long device, ssize_t size, + CUdeviceptr d_mem, + CUmemGenericAllocationHandle* p_memHandle) { + // std::cout << "unmap_and_release: device=" << device << ", size=" << size << + // ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl; + ensure_context(device); + CUDA_CHECK(cuMemUnmap(d_mem, size)); + CUDA_CHECK(cuMemRelease(*p_memHandle)); +} + +PyObject* create_tuple_from_c_integers(unsigned long long a, + unsigned long long b, + unsigned long long c, + unsigned long long d) { + // Create a new tuple of size 4 + PyObject* tuple = PyTuple_New(4); + if (!tuple) { + return NULL; // Return NULL on failure + } + + // Convert integers to Python objects and set them in the tuple + PyTuple_SetItem( + tuple, 0, + PyLong_FromUnsignedLongLong(a)); // Steals reference to the PyLong + PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b)); + PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c)); + PyTuple_SetItem(tuple, 3, PyLong_FromUnsignedLongLong(d)); + + // Note: PyTuple_SetItem "steals" a reference to each object, + // so we do not need to Py_DECREF the PyLong objects explicitly. 
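// The (device, alignedSize, d_mem, p_memHandle) tuple built above is the
// allocator's bookkeeping record: my_malloc hands it to the Python-side malloc
// callback, which is expected to keep it keyed by the returned pointer, and
// my_free later asks the free callback for the same 4-tuple so it can unmap and
// release the allocation. python_create_and_map and python_unmap_and_release
// accept the identical tuple layout, which is what lets Python unmap the
// physical memory and later re-map it into the reserved address range without
// a new malloc/free cycle.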
+ + return tuple; // Return the created tuple +} + +// --------------------------------------------------------------------------- +// Our exported C functions that call Python: + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void* my_malloc(ssize_t size, int device, CUstream stream) { + ensure_context(device); + + // first allocation, align the size, and reserve an address, and also allocate + // a CUmemGenericAllocationHandle + + // Define memory allocation properties + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.allocFlags.compressionType = CU_MEM_ALLOCATION_COMP_NONE; + + // Check if the allocation is supported + size_t granularity; + CUDA_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, + CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + + size_t alignedSize = ((size + granularity - 1) / granularity) * granularity; + + CUdeviceptr d_mem; + CUDA_CHECK(cuMemAddressReserve(&d_mem, alignedSize, 0, 0, 0)); + + // allocate the CUmemGenericAllocationHandle + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)malloc( + sizeof(CUmemGenericAllocationHandle)); + + if (!g_python_malloc_callback) { + std::cerr << "ERROR: g_python_malloc_callback not set.\n"; + return nullptr; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* arg_tuple = create_tuple_from_c_integers( + (unsigned long long)device, (unsigned long long)alignedSize, + (unsigned long long)d_mem, (unsigned long long)p_memHandle); + + // Call g_python_malloc_callback + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL); + Py_DECREF(arg_tuple); + + if (!py_result) { + PyErr_Print(); + PyGILState_Release(gstate); + return nullptr; + } + + PyGILState_Release(gstate); + + // do the final mapping + create_and_map(device, alignedSize, d_mem, p_memHandle); + + return (void*)d_mem; +} + +// use CUstream instead of cudaStream_t, to avoid including cuda_runtime_api.h +void my_free(void* ptr, ssize_t size, int device, CUstream stream) { + // get memory handle from the pointer + if (!g_python_free_callback) { + std::cerr << "ERROR: g_python_free_callback not set.\n"; + return; + } + + // Acquire GIL (not in stable ABI officially, but often works) + PyGILState_STATE gstate = PyGILState_Ensure(); + + PyObject* py_ptr = + PyLong_FromUnsignedLongLong(reinterpret_cast(ptr)); + + PyObject* py_result = + PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL); + + if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size, + &recv_d_mem, &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return; + } + + PyGILState_Release(gstate); + + // recv_size == size + // recv_device == device + + // Free memory + + CUdeviceptr d_mem = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + unmap_and_release(device, size, d_mem, p_memHandle); + + // free address and the handle + CUDA_CHECK(cuMemAddressFree(d_mem, size)); + free(p_memHandle); +} + +// 
--------------------------------------------------------------------------- +// Python extension boilerplate: + +// Python-exposed function: init_module(python_malloc, python_free) +static PyObject* py_init_module(PyObject* self, PyObject* args) { + PyObject* malloc_callback = nullptr; + PyObject* free_callback = nullptr; + + if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) { + return nullptr; + } + + if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) { + PyErr_SetString(PyExc_TypeError, "Both arguments must be callables"); + return nullptr; + } + + // Save the Python callables + // This module does not handle GC of these objects, so they must be kept alive + // outside of this module. + g_python_malloc_callback = malloc_callback; + g_python_free_callback = free_callback; + + Py_RETURN_NONE; +} + +static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyObject* python_create_and_map(PyObject* self, PyObject* args) { + if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) { + PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4"); + return nullptr; + } + + unsigned long long recv_device, recv_size; + unsigned long long recv_d_mem, recv_p_memHandle; + // Unpack the tuple into four C integers + if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem, + &recv_p_memHandle)) { + // PyArg_ParseTuple sets an error if it fails + return nullptr; + } + + CUdeviceptr d_mem_ptr = (CUdeviceptr)recv_d_mem; + CUmemGenericAllocationHandle* p_memHandle = + (CUmemGenericAllocationHandle*)recv_p_memHandle; + + create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle); + + Py_RETURN_NONE; +} + +static PyMethodDef module_methods[] = { + {"init_module", (PyCFunction)py_init_module, METH_VARARGS, + "Initialize module with python_malloc and python_free callables."}, + {"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS, + "Create and map memory on the device."}, + {"python_unmap_and_release", (PyCFunction)python_unmap_and_release, + METH_VARARGS, "Unmap and release memory on the device."}, + {NULL, NULL, 0, NULL} // sentinel +}; + +static struct PyModuleDef cumem_allocator_module = { + PyModuleDef_HEAD_INIT, "cumem_allocator", + "cumem-based allocator for CUDAPluggableAllocator", -1, module_methods}; + +PyMODINIT_FUNC PyInit_cumem_allocator(void) { + // Initialize the module + PyObject* module = PyModule_Create(&cumem_allocator_module); + if (!module) { + return NULL; + } + return module; +} +} // extern "C" diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index 6be4d4f2b2eb8..b9df4ed160b03 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -38,9 +38,13 @@ struct Signal { alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; }; -struct 
__align__(16) RankData { const void* __restrict__ ptrs[8]; }; +struct __align__(16) RankData { + const void* __restrict__ ptrs[8]; +}; -struct __align__(16) RankSignals { Signal* signals[8]; }; +struct __align__(16) RankSignals { + Signal* signals[8]; +}; // like std::array, but aligned template diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp index 85e359aa57113..07c9e46c27b06 100644 --- a/csrc/cutlass_extensions/common.hpp +++ b/csrc/cutlass_extensions/common.hpp @@ -27,8 +27,7 @@ inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { int max_shared_mem_per_block_opt_in = 0; cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, - cudaDevAttrMaxSharedMemoryPerBlockOptin, - device); + cudaDevAttrMaxSharedMemoryPerBlockOptin, device); return max_shared_mem_per_block_opt_in; } diff --git a/csrc/moe/marlin_kernels/marlin_moe_kernel.h b/csrc/moe/marlin_kernels/marlin_moe_kernel.h index a217401b3d7c2..47ecf109d0f53 100644 --- a/csrc/moe/marlin_kernels/marlin_moe_kernel.h +++ b/csrc/moe/marlin_kernels/marlin_moe_kernel.h @@ -138,8 +138,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -182,8 +182,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
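// Formatting note for the two calls below: since lop3 is a template, the '<'
// that follows its name always opens a template-argument list, so the
// reformatted spelling "lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX)" parses
// exactly the same as "lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX)"; only the
// whitespace changes, not the semantics. The immediate (0xf0 & 0xcc) | 0xaa
// evaluates to 0xEA, which is the LOP3 truth table for f(a, b, c) = (a & b) | c.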
- int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index 24341d63fb1f8..8b6fe72ad743b 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -21,7 +21,7 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, } } // namespace -template +template __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids, int32_t* expert_ids, @@ -32,12 +32,10 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, const size_t start_idx = threadIdx.x * tokens_per_thread; extern __shared__ int32_t shared_mem[]; - - int32_t* tokens_cnts = - shared_mem; // 2d tensor with shape (blockDim.x + 1, num_experts) - int32_t* cumsum = - shared_mem + - (blockDim.x + 1) * num_experts; // 1d tensor with shape (num_experts + 1) + int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1) + token_cnts_t* tokens_cnts = + (token_cnts_t*)(shared_mem + num_experts + + 1); // 2d tensor with shape (blockDim.x + 1, num_experts) for (int i = 0; i < num_experts; ++i) { tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; @@ -74,7 +72,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids, block_size) * block_size; } - *total_tokens_post_pad = cumsum[num_experts]; + *total_tokens_post_pad = static_cast(cumsum[num_experts]); } __syncthreads(); @@ -224,26 +222,46 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, torch::Tensor num_tokens_post_pad) { const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - // If we have very large number of experts, we can no longer use shared - // memory. - // TODO(simon): the right solution should be calculating the exact right - // amount of shared memory and use that. The num_experts >= 256 is just a - // temporary solution to unblock Deepseek V3. - if (num_experts >= 256) { + int device_max_shared_mem; + auto dev = topk_ids.get_device(); + cudaDeviceGetAttribute(&device_max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + + const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); + const int32_t shared_mem_i32 = + ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t); + const int32_t shared_mem_i16 = + ((num_thread + 1) * num_experts) * sizeof(uint16_t) + + (num_experts + 1) * sizeof(int32_t); + + bool use_global_memory = false; + bool use_i16 = false; // Use uint16_t for shared memory token counts + if (shared_mem_i32 < device_max_shared_mem) { + // Do nothing in this case. 
We're all set to use int32_t token counts + } else if (shared_mem_i16 < device_max_shared_mem && + topk_ids.numel() <= 65535) { + // when nelements of topk_ids is smaller than 65535 (max value of uint16), + // element value of token_cnts would also smaller than 65535, + // so we can use uint16 as dtype of token_cnts + use_i16 = true; + } else { + use_global_memory = true; + } + + if (use_global_memory) { VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] { // calc needed amount of shared mem for `tokens_cnts` and `cumsum` // tensors const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - const int32_t mem_tokens_cnts = - ((num_experts + 1) * num_experts) * sizeof(int32_t); - const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t); - // allocate global memory - int32_t* tokens_cnts; - int32_t* cumsum; - cudaMalloc(&tokens_cnts, mem_tokens_cnts); - cudaMalloc(&cumsum, mem_cumsum); + auto options_int = torch::TensorOptions() + .dtype(torch::kInt) + .device(topk_ids.device()); + torch::Tensor token_cnts_buffer = + torch::empty({(num_experts + 1) * num_experts}, options_int); + torch::Tensor cumsum_buffer = + torch::empty({num_experts + 1}, options_int); auto kernel = vllm::moe::moe_align_block_size_global_mem_kernel; @@ -252,25 +270,32 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, sorted_token_ids.data_ptr(), experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), num_experts, block_size, - topk_ids.numel(), tokens_cnts, cumsum); - cudaFree(tokens_cnts); - cudaFree(cumsum); + topk_ids.numel(), token_cnts_buffer.data_ptr(), + cumsum_buffer.data_ptr()); }); - } else { + } else if (use_i16) { VLLM_DISPATCH_INTEGRAL_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { - // calc needed amount of shared mem for `tokens_cnts` and `cumsum` - // tensors - const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE); - const int32_t shared_mem = - ((num_thread + 1) * num_experts + (num_experts + 1)) * - sizeof(int32_t); - // set dynamic shared mem - auto kernel = vllm::moe::moe_align_block_size_kernel; + auto kernel = + vllm::moe::moe_align_block_size_kernel; + AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( + (void*)kernel, shared_mem_i16)); + kernel<<<1, num_thread, shared_mem_i16, stream>>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), + experts_ids.data_ptr(), + num_tokens_post_pad.data_ptr(), num_experts, block_size, + topk_ids.numel()); + }); + } else { + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { + auto kernel = + vllm::moe::moe_align_block_size_kernel; AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( - (void*)kernel, shared_mem)); - kernel<<<1, num_thread, shared_mem, stream>>>( + (void*)kernel, shared_mem_i32)); + kernel<<<1, num_thread, shared_mem_i32, stream>>>( topk_ids.data_ptr(), sorted_token_ids.data_ptr(), experts_ids.data_ptr(), diff --git a/csrc/ops.h b/csrc/ops.h index 5a194a0dd3654..346898964010d 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -34,8 +34,9 @@ void paged_attention_v1( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& 
v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); @@ -45,8 +46,9 @@ void paged_attention_v2( torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale, const int64_t tp_rank, + const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step); diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 04ef842fbdf95..7c33fea93d6ae 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -173,8 +173,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; @@ -197,9 +197,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; @@ -221,8 +221,8 @@ dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); const int SUB = 0x64006400; const int MUL = 0x2c002c00; @@ -244,9 +244,9 @@ dequant(int q) { // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); q >>= 4; - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, MASK, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, MASK, EX); typename ScalarType::FragB frag_b; static constexpr uint32_t MUL = 0x3F803F80; diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu index c03fef886e4db..4db8f5dcdabf6 100644 --- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu +++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu @@ -96,8 +96,8 @@ __device__ inline FragB dequant(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. 
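// The lop3 helper itself is defined elsewhere in these kernels rather than in
// this hunk; a sketch of the usual Marlin-style definition, shown here only
// for context, is roughly:
//
//   template <int lut>
//   __device__ inline int lop3(int a, int b, int c) {
//     int res;
//     asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n"
//                  : "=r"(res)
//                  : "r"(a), "r"(b), "r"(c), "n"(lut));
//     return res;
//   }
//
// i.e. the template parameter is baked in as the immediate truth table of a
// single LOP3.B32 instruction, so calling the helper guarantees one LOP3
// rather than leaving the (a & b) | c expression to the optimizer.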
- int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. const int SUB = 0x64086408; diff --git a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu index 103a6444f3a21..048a3f736fb71 100644 --- a/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu +++ b/csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu @@ -141,8 +141,8 @@ __device__ inline FragB dequant_per_group(int q, FragS_GROUP& frag_s, int i) { static constexpr uint32_t HI = 0x00f000f0; static constexpr uint32_t EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - uint32_t t0 = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - uint32_t t1 = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + uint32_t t0 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + uint32_t t1 = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. static constexpr uint32_t SUB = 0x64086408; diff --git a/csrc/quantization/marlin/sparse/common/mma.h b/csrc/quantization/marlin/sparse/common/mma.h index b26505f771c8b..49eee4128ee7c 100644 --- a/csrc/quantization/marlin/sparse/common/mma.h +++ b/csrc/quantization/marlin/sparse/common/mma.h @@ -127,8 +127,8 @@ __device__ inline FragB dequant_4bit(int q) { const int HI = 0x00f000f0; const int EX = 0x64006400; // Guarantee that the `(a & b) | c` operations are LOP3s. - int lo = lop3<(0xf0 & 0xcc) | 0xaa>(q, LO, EX); - int hi = lop3<(0xf0 & 0xcc) | 0xaa>(q, HI, EX); + int lo = lop3 < (0xf0 & 0xcc) | 0xaa > (q, LO, EX); + int hi = lop3 < (0xf0 & 0xcc) | 0xaa > (q, HI, EX); // We want signed int4 outputs, hence we fuse the `-8` symmetric zero point // directly into `SUB` and `ADD`. 
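// A worked example of the magic-number arithmetic in this function: each
// 16-bit lane of EX (0x6400) is the fp16 encoding of 1024.0, so OR-ing a
// 4-bit quantized value q into the low mantissa bits (the lop3 with the LO
// mask above) yields the fp16 number 1024 + q. Each lane of SUB (0x6408)
// encodes 1032.0, hence (1024 + q) - 1032 = q - 8, i.e. the signed int4 value
// with the symmetric zero point already folded in. For q = 5:
// 0x6400 | 0x5 = 0x6405, which is 1029.0, and 1029.0 - 1032.0 = -3.0 = 5 - 8.
// The 32-bit constant simply packs this half-precision value into both lanes
// of a __half2.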
const int SUB = 0x64086408; diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index 0fec9624c457e..ffa9d44610a7f 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -218,7 +218,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, // head_size] scalar_t* __restrict__ final_out, // [num_seqs, num_heads, head_size] - int max_ctx_blocks, float k_scale, float v_scale) { + int max_ctx_blocks, const float* k_scale_ptr, const float* v_scale_ptr) { constexpr int NWARPS = NUM_THREADS / WARP_SIZE; const int warpid = threadIdx.x / WARP_SIZE; const int laneid = threadIdx.x % WARP_SIZE; @@ -406,7 +406,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel( // Vlocalb8[h][b * BLOCK_SIZE / 8 + d] = v_ptrh8be[d]; const _B8x8 Vlocalb8 = v_ptrh8be[d]; Vlocal[h][b * BLOCK_SIZE / 8 + d] = - scaled_convert_b8x8(Vlocalb8, v_scale); + scaled_convert_b8x8(Vlocalb8, *v_scale_ptr); } } } @@ -416,7 +416,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel( #pragma unroll for (int d = 0; d < KHELOOP; d++) { Klocal[d] = - scaled_convert_b8x8(Klocalb8[d], k_scale); + scaled_convert_b8x8(Klocalb8[d], *k_scale_ptr); } } @@ -890,7 +890,7 @@ __global__ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_kernel( scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, // head_size] scalar_t* __restrict__ final_out, // [num_seqs, num_heads, head_size] - int max_ctx_blocks, float k_scale, float v_scale) { + int max_ctx_blocks, const float* k_scale, const float* v_scale) { UNREACHABLE_CODE } @@ -907,7 +907,9 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, // max_num_partitions, head_size] const int* __restrict__ context_lens, // [num_seqs] - const int max_num_partitions){UNREACHABLE_CODE} + const int max_num_partitions) { + UNREACHABLE_CODE +} #endif // defined(__HIP__MI300_MI250__) TODO: Add NAVI support @@ -919,7 +921,7 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel( block_tables_ptr, context_lens_ptr, max_num_blocks_per_seq, \ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr, max_ctx_blocks, \ - k_scale, v_scale); + k_scale_ptr, v_scale_ptr); template @@ -929,7 +931,7 @@ void paged_attention_custom_launcher( torch::Tensor& value_cache, const int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& context_lens, int max_context_len, const std::optional& alibi_slopes, - float k_scale, float v_scale) { + torch::Tensor& k_scale, torch::Tensor& v_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -953,6 +955,8 @@ void paged_attention_custom_launcher( KVT* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); int* block_tables_ptr = block_tables.data_ptr(); int* context_lens_ptr = context_lens.data_ptr(); + const float* k_scale_ptr = reinterpret_cast(k_scale.data_ptr()); + const float* v_scale_ptr = reinterpret_cast(v_scale.data_ptr()); const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE); const int max_num_partitions = @@ -1087,7 +1091,8 @@ void paged_attention( torch::Tensor& context_lens, // [num_seqs] int64_t block_size, int64_t max_context_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, 
double v_scale) { + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale) { const int head_size = query.size(2); if (kv_cache_dtype == "auto") { if (query.dtype() == at::ScalarType::Half) { diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index 34b2f9ce8a4c4..ba161951772ad 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -10,5 +10,5 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& context_lens, int64_t block_size, int64_t max_context_len, const std::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, - double v_scale); + const std::string& kv_cache_dtype, torch::Tensor& k_scale, + torch::Tensor& v_scale); diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index a283d4263d293..a5d2e2f97a3ed 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -27,7 +27,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { " int max_context_len," " Tensor? alibi_slopes," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " Tensor k_scale, Tensor v_scale) -> ()"); rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention); } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index fb53d122487d3..ec63170d511f0 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -30,7 +30,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -44,7 +44,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor value_cache, int num_kv_heads, float scale," " Tensor block_tables, Tensor seq_lens, int block_size," " int max_seq_len, Tensor? alibi_slopes," - " str kv_cache_dtype, float k_scale, float v_scale," + " str kv_cache_dtype, Tensor k_scale, Tensor v_scale," " int tp_rank, int blocksparse_local_blocks," " int blocksparse_vert_stride, int blocksparse_block_size," " int blocksparse_head_sliding_step) -> ()"); @@ -449,7 +449,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! key_cache, Tensor! value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " Tensor k_scale, Tensor v_scale) -> ()"); cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache); // Reshape the key and value tensors and cache them. @@ -459,7 +459,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { " Tensor! 
value_cache," " Tensor slot_mapping," " str kv_cache_dtype," - " float k_scale, float v_scale) -> ()"); + " Tensor k_scale, Tensor v_scale) -> ()"); cache_ops.impl("reshape_and_cache_flash", torch::kCUDA, &reshape_and_cache_flash); diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 8217bc3ba3ded..1d669699f4b2a 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,10 +1,10 @@ sphinx==6.2.1 +sphinx-argparse==0.4.0 sphinx-book-theme==1.0.1 sphinx-copybutton==0.5.2 -myst-parser==3.0.1 -sphinx-argparse==0.4.0 sphinx-design==0.6.1 sphinx-togglebutton==0.3.2 +myst-parser==3.0.1 msgspec cloudpickle diff --git a/docs/source/api/engine/index.md b/docs/source/api/engine/index.md index 701cb95d3be33..b6544d94afdf8 100644 --- a/docs/source/api/engine/index.md +++ b/docs/source/api/engine/index.md @@ -8,10 +8,10 @@ .. currentmodule:: vllm.engine ``` -```{toctree} +:::{toctree} :caption: Engines :maxdepth: 2 llm_engine async_llm_engine -``` +::: diff --git a/docs/source/api/model/index.md b/docs/source/api/model/index.md index 113792147be7c..8fee3a55c93de 100644 --- a/docs/source/api/model/index.md +++ b/docs/source/api/model/index.md @@ -2,10 +2,10 @@ ## Submodules -```{toctree} +:::{toctree} :maxdepth: 1 interfaces_base interfaces adapters -``` +::: diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md index 14efdb506d76f..069ed53e545c5 100644 --- a/docs/source/api/multimodal/index.md +++ b/docs/source/api/multimodal/index.md @@ -17,7 +17,7 @@ Looking to add your own multi-modal model? Please follow the instructions listed ## Submodules -```{toctree} +:::{toctree} :maxdepth: 1 inputs @@ -25,4 +25,4 @@ parse processing profiling registry -``` +::: diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md index 76b2fb95a5009..21bd938be9e89 100644 --- a/docs/source/api/multimodal/inputs.md +++ b/docs/source/api/multimodal/inputs.md @@ -43,7 +43,7 @@ ``` ```{eval-rst} -.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2 +.. autoclass:: vllm.multimodal.inputs.MultiModalInputs :members: :show-inheritance: ``` diff --git a/docs/source/api/offline_inference/index.md b/docs/source/api/offline_inference/index.md index c32f99d59e3db..ec2cc599d923c 100644 --- a/docs/source/api/offline_inference/index.md +++ b/docs/source/api/offline_inference/index.md @@ -1,9 +1,9 @@ # Offline Inference -```{toctree} +:::{toctree} :caption: Contents :maxdepth: 1 llm llm_inputs -``` +::: diff --git a/docs/source/community/blog.md b/docs/source/community/blog.md new file mode 100644 index 0000000000000..e8030edfa02ee --- /dev/null +++ b/docs/source/community/blog.md @@ -0,0 +1,3 @@ +# vLLM Blog + +vLLM blog posts are published [here](https://blog.vllm.ai/). diff --git a/docs/source/community/meetups.md b/docs/source/community/meetups.md index 43fa9ee616096..ab5ea147f4c6a 100644 --- a/docs/source/community/meetups.md +++ b/docs/source/community/meetups.md @@ -4,6 +4,7 @@ We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- [The eighth vLLM meetup](https://lu.ma/zep56hui), with Google Cloud, January 22nd 2025. [[Slides]](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing) - [The seventh vLLM meetup](https://lu.ma/h0qvrajz), with Snowflake, November 14th 2024. 
[[Slides]](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing) - [The sixth vLLM meetup](https://lu.ma/87q3nvnh), with NVIDIA, September 9th 2024. [[Slides]](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing) - [The fifth vLLM meetup](https://lu.ma/lp0gyjqr), with AWS, July 24th 2024. [[Slides]](https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing) diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index cb142318b8724..96674805df534 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -17,11 +17,11 @@ The edges of the build graph represent: - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) - > ```{figure} /assets/contributing/dockerfile-stages-dependency.png + > :::{figure} /assets/contributing/dockerfile-stages-dependency.png > :align: center > :alt: query > :width: 100% - > ``` + > ::: > > Made using: > diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md index b9b92fd027f6e..180fdd59e9a64 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/source/contributing/model/basic.md @@ -10,9 +10,9 @@ First, clone the PyTorch model code from the source repository. For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. -```{warning} +:::{warning} Make sure to review and adhere to the original code's copyright and licensing terms! -``` +::: ## 2. Make your code compatible with vLLM @@ -80,10 +80,10 @@ def forward( ... ``` -```{note} +:::{note} Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. -``` +::: For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md index fe018b61b08cf..721ee3cd2047c 100644 --- a/docs/source/contributing/model/index.md +++ b/docs/source/contributing/model/index.md @@ -4,7 +4,7 @@ This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM. -```{toctree} +:::{toctree} :caption: Contents :maxdepth: 1 @@ -12,16 +12,16 @@ basic registration tests multimodal -``` +::: -```{note} +:::{note} The complexity of adding a new model depends heavily on the model's architecture. The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. -``` +::: -```{tip} +:::{tip} If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) or ask on our [developer slack](https://slack.vllm.ai). We will be happy to help you out! 
-``` +::: diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md index e5fd9a2877ceb..6c6f3b701cd28 100644 --- a/docs/source/contributing/model/multimodal.md +++ b/docs/source/contributing/model/multimodal.md @@ -48,9 +48,9 @@ Further update the model as follows: return vision_embeddings ``` - ```{important} + :::{important} The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request. - ``` + ::: - Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings. @@ -89,10 +89,10 @@ Further update the model as follows: + class YourModelForImage2Seq(nn.Module, SupportsMultiModal): ``` - ```{note} + :::{note} The model class does not have to be named {code}`*ForCausalLM`. Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples. - ``` + ::: ## 2. Specify processing information @@ -120,8 +120,8 @@ When calling the model, the output embeddings from the visual encoder are assign containing placeholder feature tokens. Therefore, the number of placeholder feature tokens should be equal to the size of the output embeddings. -::::{tab-set} -:::{tab-item} Basic example: LLaVA +:::::{tab-set} +::::{tab-item} Basic example: LLaVA :sync: llava Looking at the code of HF's `LlavaForConditionalGeneration`: @@ -254,12 +254,12 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self.get_max_image_tokens()} ``` -```{note} +:::{note} Our [actual code](gh-file:vllm/model_executor/models/llava.py) is more abstracted to support vision encoders other than CLIP. -``` - ::: + :::: +::::: ## 3. Specify dummy inputs @@ -315,17 +315,17 @@ def get_dummy_processor_inputs( Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to fill in the missing details about HF processing. -```{seealso} +:::{seealso} [Multi-Modal Data Processing](#mm-processing) -``` +::: ### Multi-modal fields Override {class}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items. -::::{tab-set} -:::{tab-item} Basic example: LLaVA +:::::{tab-set} +::::{tab-item} Basic example: LLaVA :sync: llava Looking at the model's `forward` method: @@ -367,13 +367,13 @@ def _get_mm_fields_config( ) ``` -```{note} +:::{note} Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports pre-computed image embeddings, which can be passed to be model via the `image_embeds` argument. 
-``` - ::: + :::: +::::: ### Prompt replacements diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md index d6c9e4181dfee..64cd25b53807e 100644 --- a/docs/source/contributing/model/registration.md +++ b/docs/source/contributing/model/registration.md @@ -17,17 +17,17 @@ After you have implemented your model (see [tutorial](#new-model-basic)), put it Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. Finally, update our [list of supported models](#supported-models) to promote your model! -```{important} +:::{important} The list of models in each section should be maintained in alphabetical order. -``` +::: ## Out-of-tree models You can load an external model using a plugin without modifying the vLLM codebase. -```{seealso} +:::{seealso} [vLLM's Plugin System](#plugin-system) -``` +::: To register the model, use the following code: @@ -45,11 +45,11 @@ from vllm import ModelRegistry ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") ``` -```{important} +:::{important} If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. Read more about that [here](#supports-multimodal). -``` +::: -```{note} +:::{note} Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. -``` +::: diff --git a/docs/source/contributing/model/tests.md b/docs/source/contributing/model/tests.md index 74c933b2f45da..68d51d89f7cff 100644 --- a/docs/source/contributing/model/tests.md +++ b/docs/source/contributing/model/tests.md @@ -14,14 +14,14 @@ Without them, the CI for your PR will fail. Include an example HuggingFace repository for your model in . This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM. -```{important} +:::{important} The list of models in each section should be maintained in alphabetical order. -``` +::: -```{tip} +:::{tip} If your model requires a development version of HF Transformers, you can set `min_transformers_version` to skip the test in CI until the model is released. -``` +::: ## Optional Tests diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index e92104399342d..908c7cb4d38ee 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -25,25 +25,27 @@ Check out the [building from source](#build-from-source) documentation for detai ```bash pip install -r requirements-dev.txt -# linting and formatting -bash format.sh -# Static type checking -mypy +# Linting, formatting and static type checking +pre-commit install + +# You can manually run pre-commit with +pre-commit run --all-files + # Unit tests pytest tests/ ``` -```{note} +:::{note} Currently, the repository is not fully checked by `mypy`. -``` +::: ## Issues If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. 
-```{important} +:::{important} If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability). -``` +::: ## Pull Requests & Code Reviews @@ -79,16 +81,17 @@ appropriately to indicate the type of change. Please use one of the following: - `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. -```{note} +:::{note} If the PR spans more than one category, please include all relevant prefixes. -``` +::: ### Code Quality The PR needs to meet the following code quality standards: - We adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). -- Pass all linter checks. Please use to format your code. +- Pass all linter checks. Please use `pre-commit` to format your code. See + if `pre-commit` is new to you. - The code needs to be well-documented to ensure future contributors can easily understand the code. - Include sufficient tests to ensure the project stays correct and robust. This diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/source/contributing/profiling/profiling_index.md index 001db86bdf555..79aeb292a9b73 100644 --- a/docs/source/contributing/profiling/profiling_index.md +++ b/docs/source/contributing/profiling/profiling_index.md @@ -6,21 +6,21 @@ The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` en When using `benchmarks/benchmark_serving.py`, you can enable profiling by passing the `--profile` flag. -```{warning} +:::{warning} Only enable profiling in a development environment. -``` +::: Traces can be visualized using . -```{tip} +:::{tip} Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. -``` +::: -```{tip} +:::{tip} To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100. Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes. `export VLLM_RPC_TIMEOUT=1800000` -``` +::: ## Example commands and usage diff --git a/docs/source/contributing/vulnerability_management.md b/docs/source/contributing/vulnerability_management.md index 422dc13e6a644..a9bbfde2af770 100644 --- a/docs/source/contributing/vulnerability_management.md +++ b/docs/source/contributing/vulnerability_management.md @@ -41,3 +41,20 @@ You may use the `#security` channel in the [VLLM Slack](https://slack.vllm.ai) to discuss security-related topics. However, please do not disclose any vulnerabilities in this channel. If you need to report a vulnerability, please use the GitHub security advisory system or contact a VMT member privately. + +## Vulnerability Disclosure + +The process for disclosing vulnerabilities is the following: + +- The VMT will work with the project maintainers to develop a fix for the + vulnerability. +- The VMT will coordinate with the reporter and project maintainers to prepare a + security advisory that adequately describes the vulnerability and its impact. +- The VMT will coordinate with the project maintainers to publish a fix and + release an update that includes that fix. +- The VMT will publish the security advisory on GitHub. Release notes will be + updated to include a reference to the security advisory. 
+ +The VMT and project maintainers will work to minimize the amount of time in +between disclosing any public information about the vulnerability and making a +release and advisory available. diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md index 438be47316f3b..334c02225bd6b 100644 --- a/docs/source/deployment/docker.md +++ b/docs/source/deployment/docker.md @@ -21,11 +21,11 @@ $ docker run --runtime nvidia --gpus all \ You can add any other you need after the image tag (`vllm/vllm-openai:latest`). -```{note} +:::{note} You can either use the `ipc=host` flag or `--shm-size` flag to allow the container to access the host's shared memory. vLLM uses PyTorch, which uses shared memory to share data between processes under the hood, particularly for tensor parallel inference. -``` +::: (deployment-docker-build-image-from-source)= @@ -38,25 +38,25 @@ You can build and run vLLM from source via the provided . To DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai ``` -```{note} +:::{note} By default vLLM will build for all GPU types for widest distribution. If you are just building for the current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""` for vLLM to find the current GPU type and build for that. If you are using Podman instead of Docker, you might need to disable SELinux labeling by adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184). -``` +::: ## Building for Arm64/aarch64 A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At time of this writing, this requires the use of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64. -```{note} +:::{note} Multiple modules must be compiled, so this process can take a while. Recommend using `--build-arg max_jobs=` & `--build-arg nvcc_threads=` flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits. Keep an eye on memory usage with parallel jobs as it can be substantial (see example below). -``` +::: ```console # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB) @@ -85,6 +85,6 @@ $ docker run --runtime nvidia --gpus all \ The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command). -```{note} +:::{note} **For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . -``` +::: diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/source/deployment/frameworks/cerebrium.md index 5787c4a407bfb..b20c95137b6e7 100644 --- a/docs/source/deployment/frameworks/cerebrium.md +++ b/docs/source/deployment/frameworks/cerebrium.md @@ -2,11 +2,11 @@ # Cerebrium -```{raw} html +:::{raw} html

vLLM_plus_cerebrium

-``` +::: vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/source/deployment/frameworks/dstack.md index b42a34125c6d7..a16e28f2d8983 100644 --- a/docs/source/deployment/frameworks/dstack.md +++ b/docs/source/deployment/frameworks/dstack.md @@ -2,11 +2,11 @@ # dstack -```{raw} html +:::{raw} html

vLLM_plus_dstack

-``` +::: vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment. @@ -97,6 +97,6 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -```{note} +:::{note} dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm) -``` +::: diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md index 18ed293191468..e4fc5e1313079 100644 --- a/docs/source/deployment/frameworks/helm.md +++ b/docs/source/deployment/frameworks/helm.md @@ -38,213 +38,213 @@ chart **including persistent volumes** and deletes the release. ## Architecture -```{image} /assets/deployment/architecture_helm_deployment.png -``` +:::{image} /assets/deployment/architecture_helm_deployment.png +::: ## Values -```{list-table} +:::{list-table} :widths: 25 25 25 25 :header-rows: 1 -* - Key - - Type - - Default - - Description -* - autoscaling - - object - - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} - - Autoscaling configuration -* - autoscaling.enabled - - bool - - false - - Enable autoscaling -* - autoscaling.maxReplicas - - int - - 100 - - Maximum replicas -* - autoscaling.minReplicas - - int - - 1 - - Minimum replicas -* - autoscaling.targetCPUUtilizationPercentage - - int - - 80 - - Target CPU utilization for autoscaling -* - configs - - object - - {} - - Configmap -* - containerPort - - int - - 8000 - - Container port -* - customObjects - - list - - [] - - Custom Objects configuration -* - deploymentStrategy - - object - - {} - - Deployment strategy configuration -* - externalConfigs - - list - - [] - - External configuration -* - extraContainers - - list - - [] - - Additional containers configuration -* - extraInit - - object - - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} - - Additional configuration for the init container -* - extraInit.pvcStorage - - string - - "50Gi" - - Storage size of the s3 -* - extraInit.s3modelpath - - string - - "relative_s3_model_path/opt-125m" - - Path of the model on the s3 which hosts model weights and config files -* - extraInit.awsEc2MetadataDisabled - - boolean - - true - - Disables the use of the Amazon EC2 instance metadata service -* - extraPorts - - list - - [] - - Additional ports configuration -* - gpuModels - - list - - ["TYPE_GPU_USED"] - - Type of gpu used -* - image - - object - - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} - - Image configuration -* - image.command - - list - - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] - - Container launch command -* - image.repository - - string - - "vllm/vllm-openai" - - Image repository -* - image.tag - - string - - "latest" - - Image tag -* - livenessProbe - - object - - 
{"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} - - Liveness probe configuration -* - livenessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive -* - livenessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server -* - livenessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server -* - livenessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening -* - livenessProbe.initialDelaySeconds - - int - - 15 - - Number of seconds after the container has started before liveness probe is initiated -* - livenessProbe.periodSeconds - - int - - 10 - - How often (in seconds) to perform the liveness probe -* - maxUnavailablePodDisruptionBudget - - string - - "" - - Disruption Budget Configuration -* - readinessProbe - - object - - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} - - Readiness probe configuration -* - readinessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready -* - readinessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server -* - readinessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server -* - readinessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening -* - readinessProbe.initialDelaySeconds - - int - - 5 - - Number of seconds after the container has started before readiness probe is initiated -* - readinessProbe.periodSeconds - - int - - 5 - - How often (in seconds) to perform the readiness probe -* - replicaCount - - int - - 1 - - Number of replicas -* - resources - - object - - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} - - Resource configuration -* - resources.limits."nvidia.com/gpu" - - int - - 1 - - Number of gpus used -* - resources.limits.cpu - - int - - 4 - - Number of CPUs -* - resources.limits.memory - - string - - "16Gi" - - CPU memory configuration -* - resources.requests."nvidia.com/gpu" - - int - - 1 - - Number of gpus used -* - resources.requests.cpu - - int - - 4 - - Number of CPUs -* - resources.requests.memory - - string - - "16Gi" - - CPU memory configuration -* - secrets - - object - - {} - - Secrets configuration -* - serviceName - - string - - - - Service name -* - servicePort - - int - - 80 - - Service port -* - labels.environment - - string - - test - - Environment name -* - labels.release - - string - - test - - Release name -``` +- * Key + * Type + * Default + * Description +- * autoscaling + * object + * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} + * Autoscaling configuration +- * autoscaling.enabled + * bool + * false + * Enable autoscaling +- * autoscaling.maxReplicas + * int + * 100 + * Maximum replicas +- * autoscaling.minReplicas + * int + * 1 + * Minimum replicas +- * autoscaling.targetCPUUtilizationPercentage + * int + * 80 + * Target CPU utilization for autoscaling +- * configs + * object + * {} + * Configmap +- * containerPort + * int + * 
8000 + * Container port +- * customObjects + * list + * [] + * Custom Objects configuration +- * deploymentStrategy + * object + * {} + * Deployment strategy configuration +- * externalConfigs + * list + * [] + * External configuration +- * extraContainers + * list + * [] + * Additional containers configuration +- * extraInit + * object + * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} + * Additional configuration for the init container +- * extraInit.pvcStorage + * string + * "50Gi" + * Storage size of the s3 +- * extraInit.s3modelpath + * string + * "relative_s3_model_path/opt-125m" + * Path of the model on the s3 which hosts model weights and config files +- * extraInit.awsEc2MetadataDisabled + * boolean + * true + * Disables the use of the Amazon EC2 instance metadata service +- * extraPorts + * list + * [] + * Additional ports configuration +- * gpuModels + * list + * ["TYPE_GPU_USED"] + * Type of gpu used +- * image + * object + * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} + * Image configuration +- * image.command + * list + * ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] + * Container launch command +- * image.repository + * string + * "vllm/vllm-openai" + * Image repository +- * image.tag + * string + * "latest" + * Image tag +- * livenessProbe + * object + * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} + * Liveness probe configuration +- * livenessProbe.failureThreshold + * int + * 3 + * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive +- * livenessProbe.httpGet + * object + * {"path":"/health","port":8000} + * Configuration of the Kubelet http request on the server +- * livenessProbe.httpGet.path + * string + * "/health" + * Path to access on the HTTP server +- * livenessProbe.httpGet.port + * int + * 8000 + * Name or number of the port to access on the container, on which the server is listening +- * livenessProbe.initialDelaySeconds + * int + * 15 + * Number of seconds after the container has started before liveness probe is initiated +- * livenessProbe.periodSeconds + * int + * 10 + * How often (in seconds) to perform the liveness probe +- * maxUnavailablePodDisruptionBudget + * string + * "" + * Disruption Budget Configuration +- * readinessProbe + * object + * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} + * Readiness probe configuration +- * readinessProbe.failureThreshold + * int + * 3 + * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready +- * readinessProbe.httpGet + * object + * {"path":"/health","port":8000} + * Configuration of the Kubelet http request on the server +- * readinessProbe.httpGet.path + * string + * "/health" + * Path to access on the HTTP server +- * readinessProbe.httpGet.port + * int + * 8000 + * Name or number of the port to access on the container, on which the server is listening +- * readinessProbe.initialDelaySeconds + * int + * 5 + * Number of seconds after the container has started before readiness probe is initiated +- * readinessProbe.periodSeconds + * int + * 5 + * How often (in seconds) to perform the readiness probe +- * replicaCount + 
* int + * 1 + * Number of replicas +- * resources + * object + * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} + * Resource configuration +- * resources.limits."nvidia.com/gpu" + * int + * 1 + * Number of gpus used +- * resources.limits.cpu + * int + * 4 + * Number of CPUs +- * resources.limits.memory + * string + * "16Gi" + * CPU memory configuration +- * resources.requests."nvidia.com/gpu" + * int + * 1 + * Number of gpus used +- * resources.requests.cpu + * int + * 4 + * Number of CPUs +- * resources.requests.memory + * string + * "16Gi" + * CPU memory configuration +- * secrets + * object + * {} + * Secrets configuration +- * serviceName + * string + * + * Service name +- * servicePort + * int + * 80 + * Service port +- * labels.environment + * string + * test + * Environment name +- * labels.release + * string + * test + * Release name +::: diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md index 964782763f6b3..cb758d3e6d2e4 100644 --- a/docs/source/deployment/frameworks/index.md +++ b/docs/source/deployment/frameworks/index.md @@ -1,6 +1,6 @@ # Using other frameworks -```{toctree} +:::{toctree} :maxdepth: 1 bentoml @@ -11,4 +11,4 @@ lws modal skypilot triton -``` +::: diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/source/deployment/frameworks/skypilot.md index 051fc2f2a8d4e..5e101b9001033 100644 --- a/docs/source/deployment/frameworks/skypilot.md +++ b/docs/source/deployment/frameworks/skypilot.md @@ -2,11 +2,11 @@ # SkyPilot -```{raw} html +:::{raw} html

vLLM

-``` +::: vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html). @@ -104,10 +104,10 @@ service: max_completion_tokens: 1 ``` -```{raw} html +:::{raw} html
Click to see the full recipe YAML -``` +::: ```yaml service: @@ -153,9 +153,9 @@ run: | 2>&1 | tee api_server.log ``` -```{raw} html +:::{raw} html
-``` +::: Start the serving the Llama-3 8B model on multiple replicas: @@ -169,10 +169,10 @@ Wait until the service is ready: watch -n10 sky serve status vllm ``` -```{raw} html +:::{raw} html
Example outputs: -``` +::: ```console Services @@ -185,9 +185,9 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 ``` -```{raw} html +:::{raw} html
-``` +::: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: @@ -223,10 +223,10 @@ service: This will scale the service up to when the QPS exceeds 2 for each replica. -```{raw} html +:::{raw} html
Click to see the full recipe YAML -``` +::: ```yaml service: @@ -275,9 +275,9 @@ run: | 2>&1 | tee api_server.log ``` -```{raw} html +:::{raw} html
-``` +::: To update the service with the new config: @@ -295,10 +295,10 @@ sky serve down vllm It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas. -```{raw} html +:::{raw} html
Click to see the full GUI YAML -``` +::: ```yaml envs: @@ -328,9 +328,9 @@ run: | --stop-token-ids 128009,128001 | tee ~/gradio.log ``` -```{raw} html +:::{raw} html
-``` +::: 1. Start the chat web UI: diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md index d47ede8967547..c286edb4d7bc1 100644 --- a/docs/source/deployment/integrations/index.md +++ b/docs/source/deployment/integrations/index.md @@ -1,9 +1,9 @@ # External Integrations -```{toctree} +:::{toctree} :maxdepth: 1 kserve kubeai llamastack -``` +::: diff --git a/docs/source/deployment/nginx.md b/docs/source/deployment/nginx.md index a58f791c2997b..87feb48856853 100644 --- a/docs/source/deployment/nginx.md +++ b/docs/source/deployment/nginx.md @@ -105,9 +105,9 @@ docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-si docker run -itd --ipc host --privileged --network vllm_nginx --gpus all --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf ``` -```{note} +:::{note} If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. -``` +::: (nginxloadbalancer-nginx-launch-nginx)= diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index cec503ef2f77d..04886e5981eef 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -4,19 +4,19 @@ This document provides an overview of the vLLM architecture. -```{contents} Table of Contents +:::{contents} Table of Contents :depth: 2 :local: true -``` +::: ## Entrypoints vLLM provides a number of entrypoints for interacting with the system. The following diagram shows the relationship between them. -```{image} /assets/design/arch_overview/entrypoints.excalidraw.png +:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png :alt: Entrypoints Diagram -``` +::: ### LLM Class @@ -84,9 +84,9 @@ More details on the API server can be found in the [OpenAI-Compatible Server](#o The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of the vLLM system, handling model inference and asynchronous request processing. -```{image} /assets/design/arch_overview/llm_engine.excalidraw.png +:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png :alt: LLMEngine Diagram -``` +::: ### LLMEngine @@ -144,11 +144,11 @@ configurations affect the class we ultimately get. The following figure shows the class hierarchy of vLLM: -> ```{figure} /assets/design/hierarchy.png +> :::{figure} /assets/design/hierarchy.png > :align: center > :alt: query > :width: 100% -> ``` +> ::: There are several important design choices behind this class hierarchy: @@ -178,7 +178,7 @@ of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. -````{note} +:::{note} To support this change, all vLLM models' signatures have been updated to: ```python @@ -215,7 +215,7 @@ else: ``` This way, the model can work with both old and new versions of vLLM. -```` +::: 3\. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. 
For example, tensor parallelism needs to shard the diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md index f896f903c78f5..5f2582877260a 100644 --- a/docs/source/design/kernel/paged_attention.md +++ b/docs/source/design/kernel/paged_attention.md @@ -139,26 +139,26 @@ const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE; ``` - ```{figure} ../../assets/kernel/query.png + :::{figure} ../../assets/kernel/query.png :align: center :alt: query :width: 70% Query data of one token at one head - ``` + ::: - Each thread defines its own `q_ptr` which points to the assigned query token data on global memory. For example, if `VEC_SIZE` is 4 and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains total of 128 elements divided into 128 / 4 = 32 vecs. - ```{figure} ../../assets/kernel/q_vecs.png + :::{figure} ../../assets/kernel/q_vecs.png :align: center :alt: q_vecs :width: 70% `q_vecs` for one thread group - ``` + ::: ```cpp __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD]; @@ -195,13 +195,13 @@ points to key token data based on `k_cache` at assigned block, assigned head and assigned token. - ```{figure} ../../assets/kernel/key.png + :::{figure} ../../assets/kernel/key.png :align: center :alt: key :width: 70% Key data of all context tokens at one head - ``` + ::: - The diagram above illustrates the memory layout for key data. It assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is @@ -214,13 +214,13 @@ elements for one token) that will be processed by 2 threads (one thread group) separately. - ```{figure} ../../assets/kernel/k_vecs.png + :::{figure} ../../assets/kernel/k_vecs.png :align: center :alt: k_vecs :width: 70% `k_vecs` for one thread - ``` + ::: ```cpp K_vec k_vecs[NUM_VECS_PER_THREAD] @@ -289,14 +289,14 @@ should be performed across the entire thread block, encompassing results between the query token and all context key tokens. - ```{math} + :::{math} :nowrap: true \begin{gather*} m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\ \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)} \end{gather*} - ``` + ::: ### `qk_max` and `logits` @@ -379,29 +379,29 @@ ## Value -```{figure} ../../assets/kernel/value.png +:::{figure} ../../assets/kernel/value.png :align: center :alt: value :width: 70% Value data of all context tokens at one head -``` +::: -```{figure} ../../assets/kernel/logits_vec.png +:::{figure} ../../assets/kernel/logits_vec.png :align: center :alt: logits_vec :width: 50% `logits_vec` for one thread -``` +::: -```{figure} ../../assets/kernel/v_vec.png +:::{figure} ../../assets/kernel/v_vec.png :align: center :alt: v_vec :width: 70% List of `v_vec` for one thread -``` +::: - Now we need to retrieve the value data and perform dot multiplication with `logits`. Unlike query and key, there is no thread group diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index c2cdb75ea08a7..55dae0bb92d4e 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -7,9 +7,9 @@ page for information on known issues and how to solve them. ## Introduction -```{important} +:::{important} The source code references are to the state of the code at the time of writing in December, 2024. 
-``` +::: The use of Python multiprocessing in vLLM is complicated by: diff --git a/docs/source/features/automatic_prefix_caching.md b/docs/source/features/automatic_prefix_caching.md index 3d70cbb29c385..59016d7fcf6b3 100644 --- a/docs/source/features/automatic_prefix_caching.md +++ b/docs/source/features/automatic_prefix_caching.md @@ -6,9 +6,9 @@ Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. -```{note} +:::{note} Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching). -``` +::: ## Enabling APC in vLLM diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md index 86a82eb36df33..b0018ebccf5ba 100644 --- a/docs/source/features/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -4,13 +4,13 @@ The tables below show mutually exclusive features and the support on some hardware. -```{note} +:::{note} Check the '✗' with links to see tracking issue for unsupported feature/hardware combination. -``` +::: ## Feature x Feature -```{raw} html +:::{raw} html -``` +::: -```{list-table} - :header-rows: 1 - :stub-columns: 1 - :widths: auto +:::{list-table} +:header-rows: 1 +:stub-columns: 1 +:widths: auto - * - Feature - - [CP](#chunked-prefill) - - [APC](#automatic-prefix-caching) - - [LoRA](#lora-adapter) - - prmpt adptr - - [SD](#spec_decode) - - CUDA graph - - pooling - - enc-dec - - logP - - prmpt logP - - async output - - multi-step - - mm - - best-of - - beam-search - - guided dec - * - [CP](#chunked-prefill) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - [APC](#automatic-prefix-caching) - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - [LoRA](#lora-adapter) - - [✗](gh-pr:9057) - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - * - prmpt adptr - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - - - * - [SD](#spec_decode) - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - - - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - - - - - * - pooling - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - - - - - - - - - - - - - - - - - - - - * - enc-dec - - ✗ - - [✗](gh-issue:7366) - - ✗ - - ✗ - - [✗](gh-issue:7366) - - ✅ - - ✅ - - - - - - - - - - - - - - - - - - - * - logP - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - - - - - - - - - - - - - - - - * - prmpt logP - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-pr:8199) - - ✅ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - - - * - async output - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - ✅ - - - - - - - - - - - - - * - multi-step - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✅ - - ✗ - - ✗ - - ✅ - - [✗](gh-issue:8198) - - ✅ - - - - - - - - - - - * - mm - - ✅ - - [✗](gh-pr:8348) - - [✗](gh-pr:7199) - - ? - - ? - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? - - - - - - - - - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:6137) - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - [✗](gh-issue:7968) - - ✅ - - - - - - - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:6137) - - ✅ - - ✗ - - ✅ - - ✅ - - ✅ - - ? - - [✗](gh-issue:7968>) - - ? - - ✅ - - - - - * - guided dec - - ✅ - - ✅ - - ? - - ? - - ✅ - - ✅ - - ✗ - - ? - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:9893) - - ? 
- - ✅ - - ✅ - - - -``` +- * Feature + * [CP](#chunked-prefill) + * [APC](#automatic-prefix-caching) + * [LoRA](#lora-adapter) + * prmpt adptr + * [SD](#spec_decode) + * CUDA graph + * pooling + * enc-dec + * logP + * prmpt logP + * async output + * multi-step + * mm + * best-of + * beam-search + * guided dec +- * [CP](#chunked-prefill) + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * +- * [APC](#automatic-prefix-caching) + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * +- * [LoRA](#lora-adapter) + * [✗](gh-pr:9057) + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * + * + * +- * prmpt adptr + * ✅ + * ✅ + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * + * +- * [SD](#spec_decode) + * ✅ + * ✅ + * ✗ + * ✅ + * + * + * + * + * + * + * + * + * + * + * + * +- * CUDA graph + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * + * + * + * + * + * + * + * + * + * + * +- * pooling + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * + * + * + * + * + * + * + * + * + * +- * enc-dec + * ✗ + * [✗](gh-issue:7366) + * ✗ + * ✗ + * [✗](gh-issue:7366) + * ✅ + * ✅ + * + * + * + * + * + * + * + * + * +- * logP + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✅ + * + * + * + * + * + * + * + * +- * prmpt logP + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-pr:8199) + * ✅ + * ✗ + * ✅ + * ✅ + * + * + * + * + * + * + * +- * async output + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✅ + * ✗ + * ✗ + * ✅ + * ✅ + * + * + * + * + * + * +- * multi-step + * ✗ + * ✅ + * ✗ + * ✅ + * ✗ + * ✅ + * ✗ + * ✗ + * ✅ + * [✗](gh-issue:8198) + * ✅ + * + * + * + * + * +- * mm + * ✅ + * [✗](gh-pr:8348) + * [✗](gh-pr:7199) + * ? + * ? + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ? + * + * + * + * +- * best-of + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:6137) + * ✅ + * ✗ + * ✅ + * ✅ + * ✅ + * ? + * [✗](gh-issue:7968) + * ✅ + * + * + * +- * beam-search + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:6137) + * ✅ + * ✗ + * ✅ + * ✅ + * ✅ + * ? + * [✗](gh-issue:7968>) + * ? + * ✅ + * + * +- * guided dec + * ✅ + * ✅ + * ? + * ? + * [✗](gh-issue:11484) + * ✅ + * ✗ + * ? + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:9893) + * ? + * ✅ + * ✅ + * +::: (feature-x-hardware)= ## Feature x Hardware -```{list-table} - :header-rows: 1 - :stub-columns: 1 - :widths: auto +:::{list-table} +:header-rows: 1 +:stub-columns: 1 +:widths: auto - * - Feature - - Volta - - Turing - - Ampere - - Ada - - Hopper - - CPU - - AMD - * - [CP](#chunked-prefill) - - [✗](gh-issue:2729) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - [APC](#automatic-prefix-caching) - - [✗](gh-issue:3687) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - [LoRA](#lora-adapter) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - prmpt adptr - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:8475) - - ✅ - * - [SD](#spec_decode) - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - CUDA graph - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✅ - * - pooling - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ? 
- * - enc-dec - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - * - mm - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - logP - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - prmpt logP - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - async output - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✗ - - ✗ - * - multi-step - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - [✗](gh-issue:8477) - - ✅ - * - best-of - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - beam-search - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - * - guided dec - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ - - ✅ -``` +- * Feature + * Volta + * Turing + * Ampere + * Ada + * Hopper + * CPU + * AMD +- * [CP](#chunked-prefill) + * [✗](gh-issue:2729) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * [APC](#automatic-prefix-caching) + * [✗](gh-issue:3687) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * [LoRA](#lora-adapter) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * prmpt adptr + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:8475) + * ✅ +- * [SD](#spec_decode) + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * CUDA graph + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✅ +- * pooling + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ? +- * enc-dec + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ +- * mm + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * logP + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * prmpt logP + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * async output + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✗ + * ✗ +- * multi-step + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * [✗](gh-issue:8477) + * ✅ +- * best-of + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * beam-search + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +- * guided dec + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ + * ✅ +::: diff --git a/docs/source/features/disagg_prefill.md b/docs/source/features/disagg_prefill.md index efa2efc66192e..52d253b9c2b18 100644 --- a/docs/source/features/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -4,9 +4,9 @@ This page introduces you the disaggregated prefilling feature in vLLM. -```{note} +:::{note} This feature is experimental and subject to change. -``` +::: ## Why disaggregated prefilling? @@ -15,9 +15,9 @@ Two main reasons: - **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT. - **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL. -```{note} +:::{note} Disaggregated prefill DOES NOT improve throughput. -``` +::: ## Usage example @@ -39,21 +39,21 @@ Key abstractions for disaggregated prefilling: - **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer. - **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`. 
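To make the `insert` / `drop_select` semantics listed above concrete, here is a minimal, self-contained sketch. It only illustrates the contract (a producer inserts KV blocks keyed by their token ids; a consumer blocks until a matching entry exists, receives it, and removes it from the buffer). The class name, signatures, and matching rule are invented for this illustration and are not vLLM's actual `LookupBuffer` API.

```python
# Toy illustration of the buffer contract only; not vLLM's implementation.
from dataclasses import dataclass, field
import threading

import torch


@dataclass
class ToyLookupBuffer:
    _entries: list = field(default_factory=list)
    _cv: threading.Condition = field(default_factory=threading.Condition)

    def insert(self, input_ids: torch.Tensor, kv: torch.Tensor) -> None:
        # Non-blocking: stash the KV cache keyed by the tokens it belongs to.
        with self._cv:
            self._entries.append((input_ids, kv))
            self._cv.notify_all()

    def drop_select(self, input_ids: torch.Tensor) -> torch.Tensor:
        # Blocking: wait for a matching entry, return it, and drop it.
        with self._cv:
            while True:
                for i, (ids, kv) in enumerate(self._entries):
                    if torch.equal(ids, input_ids):
                        del self._entries[i]
                        return kv
                self._cv.wait()


buf = ToyLookupBuffer()
prompt_ids = torch.tensor([1, 2, 3])
buf.insert(prompt_ids, torch.randn(2, 3, 8))     # "prefill" instance produces KV
kv = buf.drop_select(torch.tensor([1, 2, 3]))    # "decode" instance consumes it
print(kv.shape)                                  # torch.Size([2, 3, 8])
```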
-```{note} +:::{note} `insert` is non-blocking operation but `drop_select` is blocking operation. -``` +::: Here is a figure illustrating how the above 3 abstractions are organized: -```{image} /assets/features/disagg_prefill/abstraction.jpg +:::{image} /assets/features/disagg_prefill/abstraction.jpg :alt: Disaggregated prefilling abstractions -``` +::: The workflow of disaggregated prefilling is as follows: -```{image} /assets/features/disagg_prefill/overview.jpg +:::{image} /assets/features/disagg_prefill/overview.jpg :alt: Disaggregated prefilling workflow -``` +::: The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer. diff --git a/docs/source/features/lora.md b/docs/source/features/lora.md index b00d05147bb32..fb5a7a0d519cb 100644 --- a/docs/source/features/lora.md +++ b/docs/source/features/lora.md @@ -60,9 +60,9 @@ vllm serve meta-llama/Llama-2-7b-hf \ --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/ ``` -```{note} +:::{note} The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one. -``` +::: The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`, etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along diff --git a/docs/source/features/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md index 404505eb3890e..30735b1161ff3 100644 --- a/docs/source/features/quantization/auto_awq.md +++ b/docs/source/features/quantization/auto_awq.md @@ -2,11 +2,11 @@ # AutoAWQ -```{warning} +:::{warning} Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version. -``` +::: To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ). Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. diff --git a/docs/source/features/quantization/fp8.md b/docs/source/features/quantization/fp8.md index 1398e8a324201..a62e0124b7706 100644 --- a/docs/source/features/quantization/fp8.md +++ b/docs/source/features/quantization/fp8.md @@ -14,10 +14,10 @@ The FP8 types typically supported in hardware have two distinct representations, - **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`. - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values. -```{note} +:::{note} FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper). FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin. 
-``` +::: ## Quick Start with Online Dynamic Quantization @@ -32,9 +32,9 @@ model = LLM("facebook/opt-125m", quantization="fp8") result = model.generate("Hello, my name is") ``` -```{warning} +:::{warning} Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model. -``` +::: ## Installation @@ -110,9 +110,9 @@ model.generate("Hello my name is") Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`): -```{note} +:::{note} Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations. -``` +::: ```console $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic @@ -137,10 +137,10 @@ If you encounter any issues or have feature requests, please open an issue on th ## Deprecated Flow -```{note} +:::{note} The following information is preserved for reference and search purposes. The quantization method described below is deprecated in favor of the `llmcompressor` method described above. -``` +::: For static per-tensor offline quantization to FP8, please install the [AutoFP8 library](https://github.com/neuralmagic/autofp8). diff --git a/docs/source/features/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md deleted file mode 100644 index 1cd67cb8fd336..0000000000000 --- a/docs/source/features/quantization/fp8_e4m3_kvcache.md +++ /dev/null @@ -1,44 +0,0 @@ -(fp8-e4m3-kvcache)= - -# FP8 E4M3 KV Cache - -Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, -improving throughput. OCP (Open Compute Project www.opencompute.org) specifies two common 8-bit floating point data formats: E5M2 -(5 exponent bits and 2 mantissa bits) and E4M3FN (4 exponent bits and 3 mantissa bits), often shortened as E4M3. One benefit of -the E4M3 format over E5M2 is that floating point numbers are represented in higher precision. However, the small dynamic range of -FP8 E4M3 (±240.0 can be represented) typically necessitates the use of a higher-precision (typically FP32) scaling factor alongside -each quantized tensor. For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling -factors of a finer granularity (e.g. per-channel). - -These scaling factors can be specified by passing an optional quantization param JSON to the LLM engine at load time. If -this JSON is not specified, scaling factors default to 1.0. These scaling factors are typically obtained when running an -unquantized model through a quantizer tool (e.g. AMD quantizer or NVIDIA AMMO). - -To install AMMO (AlgorithMic Model Optimization): - -```console -pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo -``` - -Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy. The most recent silicon -offerings e.g. AMD MI300, NVIDIA Hopper or later support native hardware conversion to and from fp32, fp16, bf16, etc. -Thus, LLM inference is greatly accelerated with minimal accuracy loss. - -Here is an example of how to enable this feature: - -```python -# two float8_e4m3fn kv cache scaling factor files are provided under tests/fp8_kv, please refer to -# https://github.com/vllm-project/vllm/blob/main/examples/other/fp8/README.md to generate kv_cache_scales.json of your own. 
- -from vllm import LLM, SamplingParams -sampling_params = SamplingParams(temperature=1.3, top_p=0.8) -llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", - kv_cache_dtype="fp8", - quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") -prompt = "London is the capital of" -out = llm.generate(prompt, sampling_params)[0].outputs[0].text -print(out) - -# output w/ scaling factors: England, the United Kingdom, and one of the world's leading financial, -# output w/o scaling factors: England, located in the southeastern part of the country. It is known -``` diff --git a/docs/source/features/quantization/fp8_e5m2_kvcache.md b/docs/source/features/quantization/fp8_e5m2_kvcache.md deleted file mode 100644 index 3a81ab17f332f..0000000000000 --- a/docs/source/features/quantization/fp8_e5m2_kvcache.md +++ /dev/null @@ -1,31 +0,0 @@ -(fp8-kv-cache)= - -# FP8 E5M2 KV Cache - -The int8/int4 quantization scheme requires additional scale GPU memory storage, which reduces the expected GPU memory benefits. -The FP8 data format retains 2~3 mantissa bits and can convert float/fp16/bfloat16 and fp8 to each other. - -Here is an example of how to enable this feature: - -```python -from vllm import LLM, SamplingParams -# Sample prompts. -prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", -] -# Create a sampling params object. -sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -# Create an LLM. -llm = LLM(model="facebook/opt-125m", kv_cache_dtype="fp8") -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts, sampling_params) -# Print the outputs. -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") -``` diff --git a/docs/source/features/quantization/gguf.md b/docs/source/features/quantization/gguf.md index 640997cf4bc39..65c181900f9be 100644 --- a/docs/source/features/quantization/gguf.md +++ b/docs/source/features/quantization/gguf.md @@ -2,13 +2,13 @@ # GGUF -```{warning} +:::{warning} Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team. -``` +::: -```{warning} +:::{warning} Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model. -``` +::: To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command: @@ -25,9 +25,9 @@ You can also add `--tensor-parallel-size 2` to enable tensor parallelism inferen vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2 ``` -```{warning} +:::{warning} We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size. 
-``` +::: You can also use the GGUF model directly through the LLM entrypoint: diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md index 58f9c4d42947f..c1e817fa55fca 100644 --- a/docs/source/features/quantization/index.md +++ b/docs/source/features/quantization/index.md @@ -4,7 +4,7 @@ Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. -```{toctree} +:::{toctree} :caption: Contents :maxdepth: 1 @@ -15,6 +15,5 @@ gguf inc int8 fp8 -fp8_e5m2_kvcache -fp8_e4m3_kvcache -``` +quantized_kvcache +::: diff --git a/docs/source/features/quantization/int8.md b/docs/source/features/quantization/int8.md index 592a60d3988b2..fedb16f4350e5 100644 --- a/docs/source/features/quantization/int8.md +++ b/docs/source/features/quantization/int8.md @@ -7,9 +7,9 @@ This quantization method is particularly useful for reducing model size while ma Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415). -```{note} +:::{note} INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper). -``` +::: ## Prerequisites @@ -119,9 +119,9 @@ $ lm_eval --model vllm \ --batch_size 'auto' ``` -```{note} +:::{note} Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations. -``` +::: ## Best Practices diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/source/features/quantization/quantized_kvcache.md new file mode 100644 index 0000000000000..9f36c2949e0dd --- /dev/null +++ b/docs/source/features/quantization/quantized_kvcache.md @@ -0,0 +1,147 @@ +(quantized-kvcache)= + +# Quantized KV Cache + +## FP8 KV Cache + +Quantizing the KV cache to FP8 reduces its memory footprint. This increases the number of tokens that can be stored in the cache, improving throughput. + +### FP8 Formats + +[OCP (Open Compute Project)](https://www.opencompute.org) specifies two common 8-bit floating point data formats: + +- E5M2 (5 exponent bits and 2 mantissa bits) +- E4M3FN (4 exponent bits and 3 mantissa bits, often shortened as E4M3) + +The E4M3 format offers higher precision compared to E5M2. However, due to its small dynamic range (±240.0), E4M3 typically requires a higher-precision (FP32) scaling factor alongside each quantized tensor. + +### Current Limitations + +For now, only per-tensor (scalar) scaling factors are supported. Development is ongoing to support scaling factors of a finer granularity (e.g. per-channel). + +### Performance Impact + +The current FP8 KV cache implementation primarily benefits throughput by allowing approximately double the amount of space for KV cache allocation. This enables either: + +- Processing longer context lengths for individual requests, or +- Handling more concurrent request batches + +However, there are currently no latency improvements as the implementation does not yet include fused dequantization and attention operations. Future releases will support quantized attention with hardware acceleration, which should provide additional performance benefits. While the most recent silicon offerings (e.g. AMD MI300, NVIDIA Hopper or later) support native hardware conversion between FP8 and other formats (fp32, fp16, bf16), this benefit is not yet fully realized. 
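The dynamic-range difference between the two formats described under "FP8 Formats" above can be inspected directly in PyTorch. This is only an illustrative snippet and assumes a PyTorch build that ships the FP8 dtypes (2.1 or newer):

```python
import torch

# Print the representable range of the two OCP FP8 formats.
for dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
    info = torch.finfo(dtype)
    print(f"{dtype}: max={info.max}, smallest normal={info.tiny}")
# Roughly: E4M3 tops out around 448, E5M2 around 57344.
```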
+ +Studies have shown that FP8 E4M3 quantization typically only minimally degrades inference accuracy, making it a practical choice for throughput optimization. + +## Usage Example + +Here is an example of how to enable FP8 quantization: + +```python +# To calculate kv cache scales on the fly enable the calculate_kv_scales +# parameter + +from vllm import LLM, SamplingParams + +sampling_params = SamplingParams(temperature=0.7, top_p=0.8) +llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", + kv_cache_dtype="fp8", + calculate_kv_scales=True) +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) +``` + +The `kv_cache_dtype` argument specifies the data type for KV cache storage: +- `"auto"`: Uses the model's default "unquantized" data type +- `"fp8"` or `"fp8_e4m3"`: Supported on CUDA 11.8+ and ROCm (AMD GPU) +- `"fp8_e5m2"`: Supported on CUDA 11.8+ + +## Calibrated Scales for Better Accuracy + +For optimal model quality when using FP8 KV Cache, we recommend using calibrated scales tuned to representative inference data. [LLM Compressor](https://github.com/vllm-project/llm-compressor/) is the recommended tool for this process. + +### Installation + +First, install the required dependencies: + +```console +pip install llmcompressor +``` + +### Example Usage + +Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern): + +```python +from datasets import load_dataset +from transformers import AutoModelForCausalLM, AutoTokenizer +from llmcompressor.transformers import oneshot + +# Select model and load it +MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct" +model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto") +tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) + +# Select calibration dataset +DATASET_ID = "HuggingFaceH4/ultrachat_200k" +DATASET_SPLIT = "train_sft" + +# Configure calibration parameters +NUM_CALIBRATION_SAMPLES = 512 # 512 samples is a good starting point +MAX_SEQUENCE_LENGTH = 2048 + +# Load and preprocess dataset +ds = load_dataset(DATASET_ID, split=DATASET_SPLIT) +ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES)) + +def process_and_tokenize(example): + text = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return tokenizer( + text, + padding=False, + max_length=MAX_SEQUENCE_LENGTH, + truncation=True, + add_special_tokens=False, + ) + +ds = ds.map(process_and_tokenize, remove_columns=ds.column_names) + +# Configure quantization settings +recipe = """ +quant_stage: + quant_modifiers: + QuantizationModifier: + kv_cache_scheme: + num_bits: 8 + type: float + strategy: tensor + dynamic: false + symmetric: true +""" + +# Apply quantization +oneshot( + model=model, + dataset=ds, + recipe=recipe, + max_seq_length=MAX_SEQUENCE_LENGTH, + num_calibration_samples=NUM_CALIBRATION_SAMPLES, +) + +# Save quantized model +SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV" +model.save_pretrained(SAVE_DIR, save_compressed=True) +tokenizer.save_pretrained(SAVE_DIR) +``` + +The above script will create a folder in your current directory containing your quantized model (e.g., `Llama-3.1-8B-Instruct-FP8-KV`) with calibrated scales. + +When running the model you must specify `kv_cache_dtype="fp8"` in order to enable the kv cache quantization and use the scales. 
+ +```python +from vllm import LLM, SamplingParams + +sampling_params = SamplingParams(temperature=0.7, top_p=0.8) +llm = LLM(model="Llama-3.1-8B-Instruct-FP8-KV", kv_cache_dtype="fp8") +prompt = "London is the capital of" +out = llm.generate(prompt, sampling_params)[0].outputs[0].text +print(out) +``` diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md index c375d044dd64b..76726f3cfda22 100644 --- a/docs/source/features/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -4,150 +4,150 @@ The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: -```{list-table} +:::{list-table} :header-rows: 1 :widths: 20 8 8 8 8 8 8 8 8 8 8 8 -* - Implementation - - Volta - - Turing - - Ampere - - Ada - - Hopper - - AMD GPU - - Intel GPU - - Intel Gaudi - - x86 CPU - - AWS Inferentia - - Google TPU -* - AWQ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✗ - - ✅︎ - - ✗ - - ✗ -* - GPTQ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✗ - - ✅︎ - - ✗ - - ✗ -* - Marlin (GPTQ/AWQ/FP8) - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - INT8 (W8A8) - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✗ - - ✗ -* - FP8 (W8A8) - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - AQLM - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - bitsandbytes - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - DeepSpeedFP - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - GGUF - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ -* - INC (W8A8) - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✗ - - ✗ - - ✗ -``` +- * Implementation + * Volta + * Turing + * Ampere + * Ada + * Hopper + * AMD GPU + * Intel GPU + * Intel Gaudi + * x86 CPU + * AWS Inferentia + * Google TPU +- * AWQ + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✅︎ + * ✗ + * ✅︎ + * ✗ + * ✗ +- * GPTQ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✅︎ + * ✗ + * ✅︎ + * ✗ + * ✗ +- * Marlin (GPTQ/AWQ/FP8) + * ✗ + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * INT8 (W8A8) + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✅︎ + * ✗ + * ✗ +- * FP8 (W8A8) + * ✗ + * ✗ + * ✗ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✅︎ + * ✗ + * ✗ + * ✗ +- * AQLM + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * bitsandbytes + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * DeepSpeedFP + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * GGUF + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✅︎ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ +- * INC (W8A8) + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✗ + * ✅︎ + * ✗ + * ✗ + * ✗ +::: - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - "✅︎" indicates that the quantization method is supported on the specified hardware. - "✗" indicates that the quantization method is not supported on the specified hardware. -```{note} +:::{note} This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. 
-``` +::: diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md new file mode 100644 index 0000000000000..e39bbacf1138d --- /dev/null +++ b/docs/source/features/reasoning_outputs.md @@ -0,0 +1,151 @@ +(reasoning-outputs)= + +# Reasoning Outputs + +vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions. + +Reasoning models return a additional `reasoning_content` field in their outputs, which contains the reasoning steps that led to the final conclusion. This field is not present in the outputs of other models. + +## Supported Models + +vLLM currently supports the following reasoning models: + +- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) (`deepseek_r1`, which looks for ` ... `) + +## Quickstart + +To use reasoning models, you need to specify the `--enable-reasoning` and `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output. + +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +Next, make a request to the model that should return the reasoning content in the response. + +```python +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +# Round 1 +messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +response = client.chat.completions.create(model=model, messages=messages) + +reasoning_content = response.choices[0].message.reasoning_content +content = response.choices[0].message.content + +print("reasoning_content:", reasoning_content) +print("content:", content) +``` + +The `reasoning_content` field contains the reasoning steps that led to the final conclusion, while the `content` field contains the final conclusion. + +## Streaming chat completions + +Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming). + +```json +{ + "id": "chatcmpl-123", + "object": "chat.completion.chunk", + "created": 1694268190, + "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", + "system_fingerprint": "fp_44709d6fcb", + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + "reasoning_content": "is", + }, + "logprobs": null, + "finish_reason": null + } + ] +} +``` + +Please note that it is not compatible with the OpenAI Python client library. You can use the `requests` library to make streaming requests. + +## How to support a new reasoning model + +You can add a new `ReasoningParser` similar to `vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py`. 
+ +```python +# import the required packages + +from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import ( + ReasoningParser, ReasoningParserManager) +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) + +# define a reasoning parser and register it to vllm +# the name list in register_module can be used +# in --reasoning-parser. +@ReasoningParserManager.register_module(["example"]) +class ExampleParser(ReasoningParser): + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Instance method that should be implemented for extracting reasoning + from an incomplete response; for use when handling reasoning calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> Tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from a complete model-generated string. + + Used for non-streaming responses where we have the entire model response + available before sending to the client. + + Parameters: + model_output: str + The model-generated string to extract reasoning content from. + + request: ChatCompletionRequest + The request object that was used to generate the model_output. + + Returns: + Tuple[Optional[str], Optional[str]] + A tuple containing the reasoning content and the content. + """ +``` + +After defining the reasoning parser, you can use it by specifying the `--reasoning-parser` flag when making a request to the chat completion endpoint. + +```bash +vllm serve \ + --enable-reasoning --reasoning-parser example +``` + +## Limitations + +- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`). +- It is not compatible with the [`structured_outputs`](#structured_outputs) and [`tool_calling`](#tool_calling) features. +- The reasoning content is not available for all models. Check the model's documentation to see if it supports reasoning. diff --git a/docs/source/features/spec_decode.md b/docs/source/features/spec_decode.md index ab7b2f302bd13..da87127057dc5 100644 --- a/docs/source/features/spec_decode.md +++ b/docs/source/features/spec_decode.md @@ -2,15 +2,15 @@ # Speculative Decoding -```{warning} +:::{warning} Please note that speculative decoding in vLLM is not yet optimized and does not usually yield inter-token latency reductions for all prompt datasets or sampling parameters. The work to optimize it is ongoing and can be followed here: -``` +::: -```{warning} +:::{warning} Currently, speculative decoding in vLLM is not compatible with pipeline parallelism. -``` +::: This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM. Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference. 
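For context, a minimal offline sketch of draft-model speculative decoding is shown below. The `speculative_model` and `num_speculative_tokens` argument names are assumed from vLLM's API at this point in time; treat them as an assumption and refer to the rest of this page for the authoritative usage.

```python
from vllm import LLM, SamplingParams

# Sketch only: argument names are assumed, not guaranteed to be stable.
llm = LLM(
    model="facebook/opt-6.7b",
    speculative_model="facebook/opt-125m",  # small draft model that proposes tokens
    num_speculative_tokens=5,               # tokens proposed per decoding step
)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(["The future of AI is"], sampling_params)
print(outputs[0].outputs[0].text)
```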
diff --git a/docs/source/features/structured_outputs.md b/docs/source/features/structured_outputs.md index 1d77c7339a33f..90c880e8cfa46 100644 --- a/docs/source/features/structured_outputs.md +++ b/docs/source/features/structured_outputs.md @@ -95,10 +95,10 @@ completion = client.chat.completions.create( print(completion.choices[0].message.content) ``` -```{tip} +:::{tip} While not strictly necessary, normally it´s better to indicate in the prompt that a JSON needs to be generated and which fields and how should the LLM fill them. This can improve the results notably in most cases. -``` +::: Finally we have the `guided_grammar`, which probably is the most difficult one to use but it´s really powerful, as it allows us to define complete languages like SQL queries. It works by using a context free EBNF grammar, which for example we can use to define a specific format of simplified SQL queries, like in the example below: diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py index aaa13d0fb6d3f..ac592e22328da 100644 --- a/docs/source/generate_examples.py +++ b/docs/source/generate_examples.py @@ -57,9 +57,9 @@ class Index: def generate(self) -> str: content = f"# {self.title}\n\n{self.description}\n\n" - content += "```{toctree}\n" + content += ":::{toctree}\n" content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n" - content += "\n".join(self.documents) + "\n```\n" + content += "\n".join(self.documents) + "\n:::\n" return content diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md index 8ac6e7045f780..d0bc8208d21a5 100644 --- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md @@ -55,6 +55,7 @@ vLLM releases are being performed periodically to align with Intel® Gaudi® sof git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork git checkout v0.6.4.post2+Gaudi-1.19.0 +pip install --upgrade pip pip install -r requirements-hpu.txt python setup.py develop ``` @@ -67,6 +68,7 @@ Currently, the latest features and performance optimizations are being developed git clone https://github.com/HabanaAI/vllm-fork.git cd vllm-fork git checkout habana_main +pip install --upgrade pip pip install -r requirements-hpu.txt python setup.py develop ``` @@ -78,6 +80,7 @@ If you prefer to build and install directly from the main vLLM source, where per ```console git clone https://github.com/vllm-project/vllm.git cd vllm +pip install --upgrade pip pip install -r requirements-hpu.txt python setup.py develop ``` @@ -97,9 +100,9 @@ docker build -f Dockerfile.hpu -t vllm-hpu-env . docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env ``` -```{tip} +:::{tip} If you are facing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Optional Packages" section of [Install Driver and Software](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#install-driver-and-software) and "Configure Container Runtime" section of [Docker Installation](https://docs.habana.ai/en/latest/Installation_Guide/Installation_Methods/Docker_Installation.html#configure-container-runtime).. Make sure you have `habanalabs-container-runtime` package installed and that `habana` container runtime is registered. 
-``` +::: ## Extra information @@ -146,6 +149,7 @@ The following configurations have been validated to be function with Gaudi2 devi - [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) with tensor parallelism on 8x HPU, BF16 datatype with random or greedy sampling - [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) on single HPU or with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling - [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) with tensor parallelism on 2x HPU, BF16 datatype with random or greedy sampling +- [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) on single HPU or with tensor parallelism on 8x HPU, BF16 datatype ## Performance Tuning @@ -160,17 +164,16 @@ Currently in vLLM for HPU we support four execution modes, depending on selected | 1 | 0 | HPU Graphs | | 1 | 1 | PyTorch lazy mode | -```{warning} -All modes using PT_HPU_LAZY_MODE=0 are experimental and should only be used for validating functional correctness. To achieve the best performance, use HPU Graphs or PyTorch Lazy Mode. Performance improvements are planned for future releases. -``` +> [!WARNING] +> All modes using PT_HPU_LAZY_MODE=0 are experimental and should only be used for validating functional correctness. To achieve the best performance, use HPU Graphs or PyTorch Lazy Mode. Performance improvements are planned for future releases. ### Bucketing Mechanism Intel Gaudi accelerators perform best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) generates optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be highly dependent on input and output tensor shapes, requiring graph recompilation when encountering tensors with different shapes within the same topology. While these binaries efficiently utilize Gaudi, the compilation process itself can introduce noticeable overhead in end-to-end execution. In dynamic inference serving scenarios, it is important to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently, this is achieved by "bucketing" the model's forward pass across two dimensions: `batch_size` and `sequence_length`. -```{note} +:::{note} Bucketing helps significantly reduce the number of required graphs, but it does not handle graph compilation or device code generation. These tasks are performed during the warmup and HPUGraph capture phase. -``` +::: Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup: @@ -203,15 +206,15 @@ min = 128, step = 128, max = 512 In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket. 
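As a rough illustration of the padding rule described above (not the actual HPU model-runner code), each dimension of a batch is rounded up to the smallest configured bucket that fits it, and falls back to the raw value once the largest bucket is exceeded:

```python
# Illustrative sketch of bucket padding. The bucket lists are assumptions,
# loosely based on the logged configuration (seq: min=128, step=128, max=512).

def pad_to_bucket(value: int, buckets: list[int]) -> int:
    # Smallest bucket that fits `value`; past the largest bucket the request
    # runs unpadded, which may trigger a fresh graph compilation.
    fitting = [b for b in sorted(buckets) if b >= value]
    return fitting[0] if fitting else value

batch_size_buckets = [1, 2, 4, 8, 16, 32, 64]  # assumed batch-size buckets
seq_len_buckets = [128, 256, 384, 512]         # min=128, step=128, max=512

print(pad_to_bucket(3, batch_size_buckets))    # -> 4
print(pad_to_bucket(412, seq_len_buckets))     # -> 512
```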
-```{warning} +:::{warning} If a request exceeds the maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario. -``` +::: For example, if a request with 3 sequences, each having a maximum sequence length of 412, is sent to an idle vLLM server, it will be padded and executed as a `(4, 512)` prefill bucket. This is because the `batch_size` (number of sequences) will be padded to 4 (the nearest batch size dimension higher than 3), and the maximum sequence length will be padded to 512 (the nearest sequence length dimension higher than 412). After the prefill stage, it will be executed as a `(4, 512)` decode bucket and will remain in this bucket until either the batch dimension changes (e.g., due to a request being completed), in which case it will become a `(2, 512)` bucket, or the context length increases beyond 512 tokens, at which point it will become a `(4, 640)` bucket. -```{note} +:::{note} Bucketing is transparent to the user – padding in the sequence length dimension is never returned, and padding in the batch dimension does not create new requests. -``` +::: ### Warmup @@ -233,9 +236,9 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` -```{tip} +:::{tip} Compiling all the buckets may take some time and can be disabled by setting the VLLM_SKIP_WARMUP=true environment variable. Keep in mind that if you do this, you may encounter graph compilations when executing a given bucket for the first time. Disabling warmup is fine for development, but it is highly recommended to enable it in deployment. -``` +::: ### HPU Graph Capture @@ -244,9 +247,9 @@ Compiling all the buckets may take some time and can be disabled by setting the When HPU Graphs are used, they share the common memory pool ("usable memory") with the KV cache, as determined by the `gpu_memory_utilization` flag (default value is `0.9`). Before the KV cache is allocated, the model weights are loaded onto the device, and a forward pass of the model is executed on dummy data to estimate memory usage. Only after that, the `gpu_memory_utilization` flag is applied. At its default value, it marks 90% of the free device memory at that point as usable. Next, the KV cache is allocated, the model is warmed up, and HPU Graphs are captured. The `VLLM_GRAPH_RESERVED_MEM` environment variable defines the ratio of memory reserved for HPU Graph capture. With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of the usable memory will be reserved for graph capture (referred to as "usable graph memory"), and the remaining 90% will be used for the KV cache. The environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages share equal memory constraints. A lower value corresponds to less usable graph memory reserved for the prefill stage. For example, setting `VLLM_GRAPH_PROMPT_RATIO=0.2` reserves 20% of usable graph memory for prefill graphs, while 80% is allocated for decode graphs. -```{note} +:::{note} `gpu_memory_utilization` does not represent the absolute memory usage across the HPU. 
Instead, it specifies the memory margin after loading the model and running a profile. For example, if a device has 100 GiB of total memory and 50 GiB of free memory after loading the model weights and executing the profiling run, the default value of `gpu_memory_utilization` will mark 90% of the 50 GiB as usable, leaving 5 GiB as a margin, regardless of the total device memory. -``` +::: You can also configure the strategy for capturing HPU graphs separately for the prompt and decode stages. The strategy affects the order in which graphs are captured. Two strategies are implemented: @@ -255,9 +258,9 @@ You can also configure the strategy for capturing HPU graphs separately for the When a large number of requests are pending, the vLLM scheduler attempts to fill the maximum batch size for decoding as quickly as possible. Once a request is finished, the decode batch size decreases. When this happens, vLLM attempts to schedule a prefill iteration for requests in the waiting queue to restore the decode batch size to its previous state. In a fully loaded scenario, the decode batch size is often at its maximum, making large-batch HPU graphs critical to capture, as indicated by the `max_bs` strategy. Conversely, prefill iterations will typically be executed with very low batch sizes (1-4), as reflected in the `min_tokens` strategy. -```{note} +:::{note} `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on the memory allocated for graphs in each stage (prefill and decode). vLLM first attempts to use the entire usable prefill graph memory (usable graph memory * VLLM_GRAPH_PROMPT_RATIO) for capturing prefill HPU Graphs. It will then attempt to do the same for decode graphs and the usable decode graph memory pool. If one stage is fully captured and there is unused memory remaining in the usable graph memory pool, vLLM will attempt to capture more graphs for the other stage, until no more HPU Graphs can be captured without exceeding the reserved memory pool. The behavior of this mechanism is illustrated in the example below. -``` +::: Each step outlined is logged by the vLLM server, with negative values indicating memory release: @@ -322,13 +325,13 @@ INFO 08-02 17:38:43 hpu_executor.py:91] init_cache_engine took 37.92 GiB of devi - `VLLM_{phase}_{dim}_BUCKET_{param}` - collection of 12 environment variables configuring ranges of bucketing mechanism. - - `{phase}` is either `PROMPT` or `DECODE` + * `{phase}` is either `PROMPT` or `DECODE` - - `{dim}` is either `BS`, `SEQ` or `BLOCK` + * `{dim}` is either `BS`, `SEQ` or `BLOCK` - - `{param}` is either `MIN`, `STEP` or `MAX` + * `{param}` is either `MIN`, `STEP` or `MAX` - - Default values: + * Default values: - Prompt: @@ -356,6 +359,7 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - `PT_HPU_LAZY_MODE`: if `0`, PyTorch Eager backend for Gaudi will be used, if `1` PyTorch Lazy backend for Gaudi will be used. `1` is the default. - `PT_HPU_ENABLE_LAZY_COLLECTIVES` must be set to `true` for tensor parallel inference with HPU Graphs. +- `PT_HPUGRAPH_DISABLE_TENSOR_CACHE` must be set to `false` for llava model. 
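Below is a sketch of how a few of these knobs could be set before constructing the engine. The values are arbitrary examples rather than tuned recommendations, the bucketing variable name is composed from the `VLLM_{phase}_{dim}_BUCKET_{param}` pattern above, and the model name is only a placeholder.

```python
import os

# Example values only; not recommendations.
os.environ["VLLM_GRAPH_RESERVED_MEM"] = "0.1"          # share of usable memory kept for HPU Graphs
os.environ["VLLM_GRAPH_PROMPT_RATIO"] = "0.3"          # prefill/decode split of the graph memory pool
os.environ["VLLM_PROMPT_SEQ_BUCKET_MAX"] = "1024"      # name built from VLLM_{phase}_{dim}_BUCKET_{param}
os.environ["PT_HPU_ENABLE_LAZY_COLLECTIVES"] = "true"  # needed for tensor parallel inference with HPU Graphs

from vllm import LLM  # import after the environment is configured

llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",  # placeholder model
          tensor_parallel_size=2)
```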
## Quantization, FP8 Inference and Model Calibration Process diff --git a/docs/source/getting_started/installation/ai_accelerator/index.md b/docs/source/getting_started/installation/ai_accelerator/index.md index a6c4c44305a4c..88352f639567b 100644 --- a/docs/source/getting_started/installation/ai_accelerator/index.md +++ b/docs/source/getting_started/installation/ai_accelerator/index.md @@ -2,374 +2,374 @@ vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: :::: +::::: + ## Requirements -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "## Requirements" :end-before: "## Configure a new environment" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "## Requirements" :end-before: "## Configure a new environment" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "## Requirements" :end-before: "## Configure a new environment" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: :::: +::::: + ## Configure a new environment -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "## Configure a new environment" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "## Configure a new environment" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "## Configure a new environment" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} OpenVINO -:sync: openvino +:::: -```{include} ../python_env_setup.inc.md -``` +::::{tab-item} OpenVINO +:sync: openvino +:::{include} ../python_env_setup.inc.md ::: :::: +::::: + ## Set up using Python ### Pre-built wheels -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: 
-:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: :::: +::::: + ### Build wheel from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: :::: +::::: + ## Set up using Docker ### Pre-built images -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: :::: +::::: + ### Build image from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "### Build image from source" :end-before: "## Extra information" -``` 
- ::: :::: +::::: + ## Extra information -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} TPU +::::{tab-item} TPU :sync: tpu -```{include} tpu.inc.md +:::{include} tpu.inc.md :start-after: "## Extra information" -``` - ::: -:::{tab-item} Intel Gaudi +:::: + +::::{tab-item} Intel Gaudi :sync: hpu-gaudi -```{include} hpu-gaudi.inc.md +:::{include} hpu-gaudi.inc.md :start-after: "## Extra information" -``` - ::: -:::{tab-item} Neuron +:::: + +::::{tab-item} Neuron :sync: neuron -```{include} neuron.inc.md +:::{include} neuron.inc.md :start-after: "## Extra information" -``` - ::: -:::{tab-item} OpenVINO +:::: + +::::{tab-item} OpenVINO :sync: openvino -```{include} openvino.inc.md +:::{include} openvino.inc.md :start-after: "## Extra information" -``` - ::: :::: + +::::: diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md index 575a9f9c2e2f0..145cc9d668efd 100644 --- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md @@ -67,9 +67,9 @@ Currently, there are no pre-built Neuron wheels. ### Build wheel from source -```{note} +:::{note} The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. -``` +::: Following instructions are applicable to Neuron SDK 2.16 and beyond. diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md index 6a911cc6b9eba..6827afc805fd8 100644 --- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md +++ b/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md @@ -47,10 +47,10 @@ When you request queued resources, the request is added to a queue maintained by the Cloud TPU service. When the requested resource becomes available, it's assigned to your Google Cloud project for your immediate exclusive use. -```{note} +:::{note} In all of the following commands, replace the ALL CAPS parameter names with appropriate values. See the parameter descriptions table for more information. -``` +::: ### Provision Cloud TPUs with GKE @@ -75,33 +75,33 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ --service-account SERVICE_ACCOUNT ``` -```{list-table} Parameter descriptions +:::{list-table} Parameter descriptions :header-rows: 1 -* - Parameter name - - Description -* - QUEUED_RESOURCE_ID - - The user-assigned ID of the queued resource request. -* - TPU_NAME - - The user-assigned name of the TPU which is created when the queued +- * Parameter name + * Description +- * QUEUED_RESOURCE_ID + * The user-assigned ID of the queued resource request. +- * TPU_NAME + * The user-assigned name of the TPU which is created when the queued resource request is allocated. -* - PROJECT_ID - - Your Google Cloud project -* - ZONE - - The GCP zone where you want to create your Cloud TPU. The value you use +- * PROJECT_ID + * Your Google Cloud project +- * ZONE + * The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see `TPU regions and zones `_ -* - ACCELERATOR_TYPE - - The TPU version you want to use. 
Specify the TPU version, for example +- * ACCELERATOR_TYPE + * The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, see `TPU versions `_. -* - RUNTIME_VERSION - - The TPU VM runtime version to use. For more information see `TPU VM images `_. -* - SERVICE_ACCOUNT - - The email address for your service account. You can find it in the IAM +- * RUNTIME_VERSION + * The TPU VM runtime version to use. For more information see `TPU VM images `_. +- * SERVICE_ACCOUNT + * The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@.iam.gserviceaccount.com` -``` +::: Connect to your TPU using SSH: @@ -178,15 +178,15 @@ Run the Docker image with the following command: docker run --privileged --net host --shm-size=16G -it vllm-tpu ``` -```{note} +:::{note} Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each shape. The compilation time may take 20~30 minutes in the first run. However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default). -``` +::: -````{tip} +:::{tip} If you encounter the following error: ```console @@ -198,9 +198,10 @@ file or directory Install OpenBLAS with the following command: ```console -$ sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev +sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev ``` -```` + +::: ## Extra information diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/source/getting_started/installation/cpu/apple.inc.md index 56545253b1ef7..0808b869fdb7b 100644 --- a/docs/source/getting_started/installation/cpu/apple.inc.md +++ b/docs/source/getting_started/installation/cpu/apple.inc.md @@ -25,9 +25,9 @@ pip install -r requirements-cpu.txt pip install -e . ``` -```{note} +:::{note} On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device. -``` +::: #### Troubleshooting diff --git a/docs/source/getting_started/installation/cpu/index.md b/docs/source/getting_started/installation/cpu/index.md index 4ec907c0e9fda..2f549ede0cf48 100644 --- a/docs/source/getting_started/installation/cpu/index.md +++ b/docs/source/getting_started/installation/cpu/index.md @@ -2,86 +2,86 @@ vLLM is a Python library that supports the following CPU variants. 
Select your CPU type to see vendor specific instructions: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} x86 +::::{tab-item} x86 :sync: x86 -```{include} x86.inc.md +:::{include} x86.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} ARM +:::: + +::::{tab-item} ARM :sync: arm -```{include} arm.inc.md +:::{include} arm.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} Apple silicon +:::: + +::::{tab-item} Apple silicon :sync: apple -```{include} apple.inc.md +:::{include} apple.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: :::: +::::: + ## Requirements - Python: 3.9 -- 3.12 -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} x86 +::::{tab-item} x86 :sync: x86 -```{include} x86.inc.md +:::{include} x86.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} ARM +:::: + +::::{tab-item} ARM :sync: arm -```{include} arm.inc.md +:::{include} arm.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} Apple silicon +:::: + +::::{tab-item} Apple silicon :sync: apple -```{include} apple.inc.md +:::{include} apple.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: :::: +::::: + ## Set up using Python ### Create a new Python environment -```{include} ../python_env_setup.inc.md -``` +:::{include} ../python_env_setup.inc.md +::: ### Pre-built wheels @@ -89,41 +89,41 @@ Currently, there are no pre-built CPU wheels. ### Build wheel from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} x86 +::::{tab-item} x86 :sync: x86 -```{include} x86.inc.md +:::{include} x86.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} ARM +:::: + +::::{tab-item} ARM :sync: arm -```{include} arm.inc.md +:::{include} arm.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} Apple silicon +:::: + +::::{tab-item} Apple silicon :sync: apple -```{include} apple.inc.md +:::{include} apple.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: :::: +::::: + ## Set up using Docker ### Pre-built images @@ -142,9 +142,9 @@ $ docker run -it \ vllm-cpu-env ``` -:::{tip} +::::{tip} For ARM or Apple silicon, use `Dockerfile.arm` -::: +:::: ## Supported features diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md index e4f99d3cebdf2..f146ae0918b44 100644 --- a/docs/source/getting_started/installation/cpu/x86.inc.md +++ b/docs/source/getting_started/installation/cpu/x86.inc.md @@ -17,10 +17,10 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform, :::{include} build.inc.md ::: -```{note} -- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. +:::{note} +- AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. 
- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. -``` +::: ## Set up using Docker diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/source/getting_started/installation/gpu/cuda.inc.md index 4cce65278c069..5c2ea30dbfde1 100644 --- a/docs/source/getting_started/installation/gpu/cuda.inc.md +++ b/docs/source/getting_started/installation/gpu/cuda.inc.md @@ -10,9 +10,9 @@ vLLM contains pre-compiled C++ and CUDA (12.1) binaries. ### Create a new Python environment -```{note} +:::{note} PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See for more details. -``` +::: In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations. @@ -100,10 +100,10 @@ pip install --editable . You can find more information about vLLM's wheels in . -```{note} +:::{note} There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors. It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to for instructions on how to install a specified wheel. -``` +::: #### Full build (with compilation) @@ -115,7 +115,7 @@ cd vllm pip install -e . ``` -```{tip} +:::{tip} Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results. For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` . @@ -123,7 +123,7 @@ As long as `which ccache` command can find the `ccache` binary, it will be used [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments. The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`. -``` +::: ##### Use an existing PyTorch installation diff --git a/docs/source/getting_started/installation/gpu/index.md b/docs/source/getting_started/installation/gpu/index.md index 6c007382b2c3d..0a61f889753a3 100644 --- a/docs/source/getting_started/installation/gpu/index.md +++ b/docs/source/getting_started/installation/gpu/index.md @@ -2,299 +2,299 @@ vLLM is a Python library that supports the following GPU variants. 
Select your GPU type to see vendor specific instructions: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "# Installation" :end-before: "## Requirements" -``` - ::: :::: +::::: + ## Requirements - OS: Linux - Python: 3.9 -- 3.12 -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "## Requirements" :end-before: "## Set up using Python" -``` - ::: :::: +::::: + ## Set up using Python ### Create a new Python environment -```{include} ../python_env_setup.inc.md -``` +:::{include} ../python_env_setup.inc.md +::: -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "## Create a new Python environment" :end-before: "### Pre-built wheels" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm There is no extra information on creating a new Python environment for this device. -::: +:::: -:::{tab-item} XPU +::::{tab-item} XPU :sync: xpu There is no extra information on creating a new Python environment for this device. 
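Spelling out the environment-setup step above as commands, a minimal sketch using the `conda` instructions from the shared `python_env_setup` snippet (per that snippet, `conda` is only used to create the environment; packages are installed with `pip`):

```bash
# Create and activate an isolated environment.
conda create -n myenv python=3.12 -y
conda activate myenv

# Install vLLM with pip inside that environment.
pip install vllm
```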
-::: - :::: +::::: + ### Pre-built wheels -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Pre-built wheels" :end-before: "### Build wheel from source" -``` - ::: :::: +::::: + (build-from-source)= ### Build wheel from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Build wheel from source" :end-before: "## Set up using Docker" -``` - ::: :::: +::::: + ## Set up using Docker ### Pre-built images -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Pre-built images" :end-before: "### Build image from source" -``` - ::: :::: +::::: + ### Build image from source -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "### Build image from source" :end-before: "## Supported features" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "### Build image from source" :end-before: "## Supported features" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "### Build image from source" :end-before: "## Supported features" -``` - ::: :::: +::::: + ## Supported features -::::{tab-set} +:::::{tab-set} :sync-group: device -:::{tab-item} CUDA +::::{tab-item} CUDA :sync: cuda -```{include} cuda.inc.md +:::{include} cuda.inc.md :start-after: "## Supported features" -``` - ::: -:::{tab-item} ROCm +:::: + +::::{tab-item} ROCm :sync: rocm -```{include} rocm.inc.md +:::{include} rocm.inc.md :start-after: "## Supported features" -``` - ::: -:::{tab-item} XPU +:::: + +::::{tab-item} XPU :sync: xpu -```{include} xpu.inc.md +:::{include} xpu.inc.md :start-after: "## Supported features" -``` - ::: :::: + +::::: diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/source/getting_started/installation/gpu/rocm.inc.md index f6f9d3c303f89..131ad1704ea11 
100644 --- a/docs/source/getting_started/installation/gpu/rocm.inc.md +++ b/docs/source/getting_started/installation/gpu/rocm.inc.md @@ -13,6 +13,14 @@ vLLM supports AMD GPUs with ROCm 6.2. Currently, there are no pre-built ROCm wheels. +However, the [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized +docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator. + +:::{tip} +Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html) +for instructions on how to use this prebuilt docker image. +::: + ### Build wheel from source 0. Install prerequisites (skip if you are already in an environment/docker with the following installed): @@ -39,9 +47,9 @@ Currently, there are no pre-built ROCm wheels. cd ../.. ``` - ```{note} - - If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. - ``` + :::{note} + If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent. + ::: 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention/tree/ck_tile) @@ -59,9 +67,9 @@ Currently, there are no pre-built ROCm wheels. cd .. ``` - ```{note} - - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - ``` + :::{note} + You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) + ::: 3. Build vLLM. For example, vLLM on ROCM 6.2 can be built with the following steps: @@ -70,7 +78,7 @@ Currently, there are no pre-built ROCm wheels. # Install PyTorch $ pip uninstall torch -y - $ pip install --no-cache-dir --pre torch==2.6.0.dev20241024 --index-url https://download.pytorch.org/whl/nightly/rocm6.2 + $ pip install --no-cache-dir --pre torch --index-url https://download.pytorch.org/whl/rocm6.2 # Build & install AMD SMI $ pip install /opt/rocm/share/amd_smi @@ -87,17 +95,18 @@ Currently, there are no pre-built ROCm wheels. This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation. - ```{tip} + + :::{tip} - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support. - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention. - The ROCm version of PyTorch, ideally, should match the ROCm driver version. - ``` + ::: -```{tip} +:::{tip} - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level. For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization). -``` +::: ## Set up using Docker @@ -123,11 +132,10 @@ It is important that the user kicks off the docker build using buildkit. 
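Before moving on to the Docker-based setup, here is a hedged sketch of the attention-backend switch described in the tips above; the model name is only a placeholder:

```bash
# Triton flash attention is the default; run a warm-up request before collecting
# performance numbers, as recommended above.

# To fall back to CK flash-attention (e.g. for sliding-window models in half precision)
# or PyTorch naive attention, turn the Triton kernel off:
export VLLM_USE_TRITON_FLASH_ATTN=0

# Placeholder model, for illustration only.
vllm serve meta-llama/Meta-Llama-3.1-70B
```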
Either uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches. It provides flexibility to customize the build of docker image using the following arguments: -- `BASE_IMAGE`: specifies the base image used when running `docker build`, specifically the PyTorch on ROCm base image. -- `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For [Radeon RX 7900 series (gfx1100)](https://rocm.docs.amd.com/projects/radeon/en/latest/index.html), this should be set to 0 before flash-attention supports this target. -- `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942` -- `FA_BRANCH`: specifies the branch used to build the CK flash-attention in [ROCm's flash-attention repo](https://github.com/ROCmSoftwarePlatform/flash-attention). The default is `ae7928c` -- `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1. +- `BASE_IMAGE`: specifies the base image used when running `docker build`. The default value `rocm/vllm-dev:base` is an image published and maintained by AMD. It is being built using +- `USE_CYTHON`: An option to run cython compilation on a subset of python files upon docker build +- `BUILD_RPD`: Include RocmProfileData profiling tool in the image +- `ARG_PYTORCH_ROCM_ARCH`: Allows to override the gfx architecture values from the base docker image Their values can be passed in when running `docker build` with `--build-arg` options. @@ -137,10 +145,10 @@ To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default: DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . ``` -To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify `BUILD_FA` as below: +To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should pick the alternative base image: ```console -DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm . +DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f Dockerfile.rocm -t vllm-rocm . ``` To run the above docker image `vllm-rocm`, use the below command: diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/source/getting_started/installation/gpu/xpu.inc.md index 577986eba74fd..bc01c6000bc07 100644 --- a/docs/source/getting_started/installation/gpu/xpu.inc.md +++ b/docs/source/getting_started/installation/gpu/xpu.inc.md @@ -30,10 +30,10 @@ pip install -v -r requirements-xpu.txt VLLM_TARGET_DEVICE=xpu python setup.py install ``` -```{note} +:::{note} - FP16 is the default data type in the current XPU backend. The BF16 data type will be supported in the future. 
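As a worked example of the build arguments listed above, the sketch below builds the ROCm image with a few of them overridden; the particular values (enabling Cython compilation, pinning the gfx architectures) are illustrative assumptions rather than recommended defaults:

```bash
# Build the ROCm image with BuildKit, passing the documented --build-arg options.
DOCKER_BUILDKIT=1 docker build \
    --build-arg BASE_IMAGE="rocm/vllm-dev:base" \
    --build-arg USE_CYTHON=1 \
    --build-arg ARG_PYTORCH_ROCM_ARCH="gfx90a;gfx942" \
    -f Dockerfile.rocm \
    -t vllm-rocm .
```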
-``` +::: ## Set up using Docker diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md index bc1d268bf0c7e..0f5e013ce071a 100644 --- a/docs/source/getting_started/installation/index.md +++ b/docs/source/getting_started/installation/index.md @@ -4,10 +4,10 @@ vLLM supports the following hardware platforms: -```{toctree} +:::{toctree} :maxdepth: 1 gpu/index cpu/index ai_accelerator/index -``` +::: diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md index 25cfac5f58aa7..cb73914c9c75e 100644 --- a/docs/source/getting_started/installation/python_env_setup.inc.md +++ b/docs/source/getting_started/installation/python_env_setup.inc.md @@ -6,9 +6,9 @@ conda create -n myenv python=3.12 -y conda activate myenv ``` -```{note} +:::{note} [PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create Python environment rather than installing packages. -``` +::: Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command: diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index a20e315f791e4..83df317103e4a 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -34,9 +34,9 @@ conda activate myenv pip install vllm ``` -```{note} +:::{note} For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM. -``` +::: (quickstart-offline)= @@ -71,9 +71,9 @@ The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model]( llm = LLM(model="facebook/opt-125m") ``` -```{note} +:::{note} By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. -``` +::: Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. @@ -99,10 +99,10 @@ Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instru vllm serve Qwen/Qwen2.5-1.5B-Instruct ``` -```{note} +:::{note} By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it [here](#chat-template). -``` +::: This server can be queried in the same format as OpenAI API. For example, to list the models: diff --git a/docs/source/getting_started/troubleshooting.md b/docs/source/getting_started/troubleshooting.md index 1e290d2b4c0bd..2f41fa3b6b19e 100644 --- a/docs/source/getting_started/troubleshooting.md +++ b/docs/source/getting_started/troubleshooting.md @@ -4,9 +4,9 @@ This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. 
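Tying back to the quickstart section above: once `vllm serve Qwen/Qwen2.5-1.5B-Instruct` is running, the OpenAI-compatible endpoints can be queried directly. Port 8000 is vLLM's default, and the request body is only a minimal illustration:

```bash
# List the models served by the running instance.
curl http://localhost:8000/v1/models

# Send a minimal chat completion request in the OpenAI-compatible format.
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "Qwen/Qwen2.5-1.5B-Instruct",
          "messages": [{"role": "user", "content": "Hello!"}]
        }'
```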
If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. -```{note} +:::{note} Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. -``` +::: ## Hangs downloading a model @@ -18,13 +18,13 @@ It's recommended to download the model first using the [huggingface-cli](https:/ If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. -```{note} +:::{note} To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. -``` +::: -## Model is too large +## Out of memory -If the model is too large to fit in a single GPU, you might want to [consider tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. +If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider [using tensor parallelism](#distributed-serving) to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. ## Enable more logging @@ -132,14 +132,14 @@ If the script runs successfully, you should see the message `sanity check is suc If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. -```{note} +:::{note} A multi-node environment is more complicated than a single-node one. 
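A short sketch combining the loading-related suggestions above: pre-download the checkpoint with `huggingface-cli`, then start vLLM with dummy weights to check whether downloading and loading are the bottleneck. The model name and local path are placeholders:

```bash
# Download the model to a local directory first (placeholder names).
huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct --local-dir ./models/qwen2.5-1.5b

# Start vLLM without loading the real weights, to isolate download/loading issues.
vllm serve ./models/qwen2.5-1.5b --load-format dummy
```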
If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: - In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`. - In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`. Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. -``` +::: (troubleshooting-python-multiprocessing)= @@ -197,6 +197,63 @@ if __name__ == '__main__': llm = vllm.LLM(...) ``` +## `torch.compile` Error + +vLLM heavily depends on `torch.compile` to optimize the model for better performance, which introduces the dependency on the `torch.compile` functionality and the `triton` library. By default, we use `torch.compile` to [optimize some functions](https://github.com/vllm-project/vllm/pull/10406) in the model. Before running vLLM, you can check if `torch.compile` is working as expected by running the following script: + +```python +import torch + +@torch.compile +def f(x): + # a simple function to test torch.compile + x = x + 1 + x = x * 2 + x = x.sin() + return x + +x = torch.randn(4, 4).cuda() +print(f(x)) +``` + +If it raises errors from `torch/_inductor` directory, usually it means you have a custom `triton` library that is not compatible with the version of PyTorch you are using. See [this issue](https://github.com/vllm-project/vllm/issues/12219) for example. + +## Model failed to be inspected + +If you see an error like: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details. +``` + +It means that vLLM failed to import the model file. +Usually, it is related to missing dependencies or outdated binaries in the vLLM build. +Please read the logs carefully to determine the root cause of the error. + +## Model not supported + +If you see an error like: + +```text +Traceback (most recent call last): +... + File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls + for arch in architectures: +TypeError: 'NoneType' object is not iterable +``` + +or: + +```text + File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported + raise ValueError( +ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] +``` + +But you are sure that the model is in the [list of supported models](#supported-models), there may be some issue with vLLM's model resolution. In that case, please follow [these steps](#model-resolution) to explicitly specify the vLLM implementation for the model. + ## Known Issues - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). 
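When the inspection or "not supported" errors above appear, it can help to first confirm which architecture the checkpoint actually declares before following the model-resolution steps. The one-liner below is an assumed convenience, not part of vLLM, and the model name is a placeholder:

```bash
# Print the "architectures" field from the checkpoint's config.json (placeholder model).
curl -sL https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct/resolve/main/config.json \
    | python3 -c 'import json, sys; print(json.load(sys.stdin)["architectures"])'
```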
diff --git a/docs/source/index.md b/docs/source/index.md index d7a1117df9c27..e90e81c72860a 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -1,13 +1,13 @@ # Welcome to vLLM -```{figure} ./assets/logos/vllm-logo-text-light.png +:::{figure} ./assets/logos/vllm-logo-text-light.png :align: center :alt: vLLM :class: no-scaled-link :width: 60% -``` +::: -```{raw} html +:::{raw} html

Easy, fast, and cheap LLM serving for everyone
@@ -19,7 +19,7 @@
Watch
Fork

-``` +::: vLLM is a fast and easy-to-use library for LLM inference and serving. @@ -58,7 +58,7 @@ For more information, check out the following: % How to start using vLLM? -```{toctree} +:::{toctree} :caption: Getting Started :maxdepth: 1 @@ -67,11 +67,11 @@ getting_started/quickstart getting_started/examples/examples_index getting_started/troubleshooting getting_started/faq -``` +::: % What does vLLM support? -```{toctree} +:::{toctree} :caption: Models :maxdepth: 1 @@ -79,27 +79,28 @@ models/generative_models models/pooling_models models/supported_models models/extensions/index -``` +::: % Additional capabilities -```{toctree} +:::{toctree} :caption: Features :maxdepth: 1 features/quantization/index features/lora features/tool_calling +features/reasoning_outputs features/structured_outputs features/automatic_prefix_caching features/disagg_prefill features/spec_decode features/compatibility_matrix -``` +::: % Details about running vLLM -```{toctree} +:::{toctree} :caption: Inference and Serving :maxdepth: 1 @@ -112,11 +113,11 @@ serving/engine_args serving/env_vars serving/usage_stats serving/integrations/index -``` +::: % Scaling up vLLM for production -```{toctree} +:::{toctree} :caption: Deployment :maxdepth: 1 @@ -125,21 +126,21 @@ deployment/k8s deployment/nginx deployment/frameworks/index deployment/integrations/index -``` +::: % Making the most out of vLLM -```{toctree} +:::{toctree} :caption: Performance :maxdepth: 1 performance/optimization performance/benchmarks -``` +::: % Explanation of vLLM internals -```{toctree} +:::{toctree} :caption: Design Documents :maxdepth: 2 @@ -150,11 +151,11 @@ design/kernel/paged_attention design/mm_processing design/automatic_prefix_caching design/multiprocessing -``` +::: % How to contribute to the vLLM project -```{toctree} +:::{toctree} :caption: Developer Guide :maxdepth: 2 @@ -163,11 +164,11 @@ contributing/profiling/profiling_index contributing/dockerfile/dockerfile contributing/model/index contributing/vulnerability_management -``` +::: % Technical API specifications -```{toctree} +:::{toctree} :caption: API Reference :maxdepth: 2 @@ -176,17 +177,18 @@ api/engine/index api/inference_params api/multimodal/index api/model/index -``` +::: % Latest news and acknowledgements -```{toctree} +:::{toctree} :caption: Community :maxdepth: 1 +community/blog community/meetups community/sponsors -``` +::: ## Indices and tables diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md index cff09d12eba47..69faf472e5300 100644 --- a/docs/source/models/extensions/index.md +++ b/docs/source/models/extensions/index.md @@ -1,8 +1,8 @@ # Built-in Extensions -```{toctree} +:::{toctree} :maxdepth: 1 runai_model_streamer tensorizer -``` +::: diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/source/models/extensions/runai_model_streamer.md index 75f7a9fcad416..99c37876a01b3 100644 --- a/docs/source/models/extensions/runai_model_streamer.md +++ b/docs/source/models/extensions/runai_model_streamer.md @@ -48,6 +48,6 @@ You can read further about CPU buffer memory limiting [here](https://github.com/ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}' ``` -```{note} +:::{note} For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md). 
-``` +::: diff --git a/docs/source/models/extensions/tensorizer.md b/docs/source/models/extensions/tensorizer.md index ae17e3437bca6..830c579d91bae 100644 --- a/docs/source/models/extensions/tensorizer.md +++ b/docs/source/models/extensions/tensorizer.md @@ -11,6 +11,6 @@ For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see the [vLLM example script](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference/tensorize_vllm_model.html). -```{note} +:::{note} Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. -``` +::: diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index e4b4cd03a90d2..4abe6b776eea3 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -70,10 +70,10 @@ The {class}`~vllm.LLM.chat` method implements chat functionality on top of {clas In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat) and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt. -```{important} +:::{important} In general, only instruction-tuned models have a chat template. Base models may perform poorly as they are not trained to respond to the chat conversation. -``` +::: ```python llm = LLM(model="meta-llama/Meta-Llama-3-8B-Instruct") diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 91db694be29a4..9704ccee745c4 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -8,54 +8,54 @@ In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmMo These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input before returning them. -```{note} +:::{note} We currently support pooling models primarily as a matter of convenience. As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. -``` +::: For pooling models, we support the following `--task` options. The selected option sets the default pooler used to extract the final hidden states: -```{list-table} +:::{list-table} :widths: 50 25 25 25 :header-rows: 1 -* - Task - - Pooling Type - - Normalization - - Softmax -* - Embedding (`embed`) - - `LAST` - - ✅︎ - - ✗ -* - Classification (`classify`) - - `LAST` - - ✗ - - ✅︎ -* - Sentence Pair Scoring (`score`) - - \* - - \* - - \* -* - Reward Modeling (`reward`) - - `ALL` - - ✗ - - ✗ -``` +- * Task + * Pooling Type + * Normalization + * Softmax +- * Embedding (`embed`) + * `LAST` + * ✅︎ + * ✗ +- * Classification (`classify`) + * `LAST` + * ✗ + * ✅︎ +- * Sentence Pair Scoring (`score`) + * \* + * \* + * \* +- * Reward Modeling (`reward`) + * `ALL` + * ✗ + * ✗ +::: \*The default pooler is always defined by the model. -```{note} +:::{note} If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. 
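To make the `--task` table above concrete, a minimal sketch of serving pooling models with an explicit task so the corresponding default pooler is applied; the model names are taken from examples elsewhere on this page and the commands are illustrative:

```bash
# Embedding: LAST pooling with normalization by default.
vllm serve BAAI/bge-base-en-v1.5 --task embed

# Sentence pair scoring with a cross-encoder reranker.
vllm serve BAAI/bge-reranker-v2-m3 --task score
```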
-``` +::: When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). -```{tip} +:::{tip} You can customize the model's pooling method via the `--override-pooler-config` option, which takes priority over both the model's and Sentence Transformers's defaults. -``` +::: ## Offline Inference @@ -111,10 +111,10 @@ The {class}`~vllm.LLM.score` method outputs similarity scores between sentence p It is primarily designed for [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html). These types of models serve as rerankers between candidate query-document pairs in RAG systems. -```{note} +:::{note} vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG. To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain). -``` +::: ```python llm = LLM(model="BAAI/bge-reranker-v2-m3", task="score") diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index d07cde3db5c6e..afaad8818bdcb 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -17,7 +17,7 @@ By default, vLLM loads models from [HuggingFace (HF) Hub](https://huggingface.co To determine whether a given model is supported, you can check the `config.json` file inside the HF repository. If the `"architectures"` field contains a model architecture listed below, then it should be supported in theory. -````{tip} +:::{tip} The easiest way to check if your model is really supported at runtime is to run the program below: ```python @@ -35,7 +35,7 @@ print(output) ``` If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. -```` +::: Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. @@ -72,364 +72,364 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `AquilaForCausalLM` - - Aquila, Aquila2 - - `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. - - ✅︎ - - ✅︎ -* - `ArcticForCausalLM` - - Arctic - - `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. - - - - ✅︎ -* - `BaiChuanForCausalLM` - - Baichuan2, Baichuan - - `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. - - ✅︎ - - ✅︎ -* - `BloomForCausalLM` - - BLOOM, BLOOMZ, BLOOMChat - - `bigscience/bloom`, `bigscience/bloomz`, etc. - - - - ✅︎ -* - `BartForConditionalGeneration` - - BART - - `facebook/bart-base`, `facebook/bart-large-cnn`, etc. - - - - -* - `ChatGLMModel` - - ChatGLM - - `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. - - ✅︎ - - ✅︎ -* - `CohereForCausalLM`, `Cohere2ForCausalLM` - - Command-R - - `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. - - ✅︎ - - ✅︎ -* - `DbrxForCausalLM` - - DBRX - - `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. 
- - - - ✅︎ -* - `DeciLMForCausalLM` - - DeciLM - - `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. - - - - ✅︎ -* - `DeepseekForCausalLM` - - DeepSeek - - `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. - - - - ✅︎ -* - `DeepseekV2ForCausalLM` - - DeepSeek-V2 - - `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. - - - - ✅︎ -* - `DeepseekV3ForCausalLM` - - DeepSeek-V3 - - `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. - - - - ✅︎ -* - `ExaoneForCausalLM` - - EXAONE-3 - - `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. - - ✅︎ - - ✅︎ -* - `FalconForCausalLM` - - Falcon - - `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. - - - - ✅︎ -* - `FalconMambaForCausalLM` - - FalconMamba - - `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. - - ✅︎ - - ✅︎ -* - `GemmaForCausalLM` - - Gemma - - `google/gemma-2b`, `google/gemma-7b`, etc. - - ✅︎ - - ✅︎ -* - `Gemma2ForCausalLM` - - Gemma2 - - `google/gemma-2-9b`, `google/gemma-2-27b`, etc. - - ✅︎ - - ✅︎ -* - `GlmForCausalLM` - - GLM-4 - - `THUDM/glm-4-9b-chat-hf`, etc. - - ✅︎ - - ✅︎ -* - `GPT2LMHeadModel` - - GPT-2 - - `gpt2`, `gpt2-xl`, etc. - - - - ✅︎ -* - `GPTBigCodeForCausalLM` - - StarCoder, SantaCoder, WizardCoder - - `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. - - ✅︎ - - ✅︎ -* - `GPTJForCausalLM` - - GPT-J - - `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. - - - - ✅︎ -* - `GPTNeoXForCausalLM` - - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - - `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. - - - - ✅︎ -* - `GraniteForCausalLM` - - Granite 3.0, Granite 3.1, PowerLM - - `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. - - ✅︎ - - ✅︎ -* - `GraniteMoeForCausalLM` - - Granite 3.0 MoE, PowerMoE - - `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. - - ✅︎ - - ✅︎ -* - `GritLM` - - GritLM - - `parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ -* - `InternLMForCausalLM` - - InternLM - - `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. - - ✅︎ - - ✅︎ -* - `InternLM2ForCausalLM` - - InternLM2 - - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. - - ✅︎ - - ✅︎ -* - `InternLM3ForCausalLM` - - InternLM3 - - `internlm/internlm3-8b-instruct`, etc. - - ✅︎ - - ✅︎ -* - `JAISLMHeadModel` - - Jais - - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. - - - - ✅︎ -* - `JambaForCausalLM` - - Jamba - - `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. - - ✅︎ - - ✅︎ -* - `LlamaForCausalLM` - - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - - `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. - - ✅︎ - - ✅︎ -* - `MambaForCausalLM` - - Mamba - - `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. - - - - ✅︎ -* - `MiniCPMForCausalLM` - - MiniCPM - - `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. - - ✅︎ - - ✅︎ -* - `MiniCPM3ForCausalLM` - - MiniCPM3 - - `openbmb/MiniCPM3-4B`, etc. 
- - ✅︎ - - ✅︎ -* - `MistralForCausalLM` - - Mistral, Mistral-Instruct - - `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. - - ✅︎ - - ✅︎ -* - `MixtralForCausalLM` - - Mixtral-8x7B, Mixtral-8x7B-Instruct - - `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. - - ✅︎ - - ✅︎ -* - `MPTForCausalLM` - - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - - `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. - - - - ✅︎ -* - `NemotronForCausalLM` - - Nemotron-3, Nemotron-4, Minitron - - `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. - - ✅︎ - - ✅︎ -* - `OLMoForCausalLM` - - OLMo - - `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. - - - - ✅︎ -* - `OLMo2ForCausalLM` - - OLMo2 - - `allenai/OLMo2-7B-1124`, etc. - - - - ✅︎ -* - `OLMoEForCausalLM` - - OLMoE - - `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. - - ✅︎ - - ✅︎ -* - `OPTForCausalLM` - - OPT, OPT-IML - - `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. - - - - ✅︎ -* - `OrionForCausalLM` - - Orion - - `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. - - - - ✅︎ -* - `PhiForCausalLM` - - Phi - - `microsoft/phi-1_5`, `microsoft/phi-2`, etc. - - ✅︎ - - ✅︎ -* - `Phi3ForCausalLM` - - Phi-3 - - `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. - - ✅︎ - - ✅︎ -* - `Phi3SmallForCausalLM` - - Phi-3-Small - - `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. - - - - ✅︎ -* - `PhiMoEForCausalLM` - - Phi-3.5-MoE - - `microsoft/Phi-3.5-MoE-instruct`, etc. - - ✅︎ - - ✅︎ -* - `PersimmonForCausalLM` - - Persimmon - - `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. - - - - ✅︎ -* - `QWenLMHeadModel` - - Qwen - - `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForCausalLM` - - QwQ, Qwen2 - - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2MoeForCausalLM` - - Qwen2MoE - - `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. - - - - ✅︎ -* - `StableLmForCausalLM` - - StableLM - - `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. - - - - ✅︎ -* - `Starcoder2ForCausalLM` - - Starcoder2 - - `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. - - - - ✅︎ -* - `SolarForCausalLM` - - Solar Pro - - `upstage/solar-pro-preview-instruct`, etc. - - ✅︎ - - ✅︎ -* - `TeleChat2ForCausalLM` - - TeleChat2 - - `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. - - ✅︎ - - ✅︎ -* - `XverseForCausalLM` - - XVERSE - - `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. - - ✅︎ - - ✅︎ -``` - -```{note} +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `AquilaForCausalLM` + * Aquila, Aquila2 + * `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. + * ✅︎ + * ✅︎ +- * `ArcticForCausalLM` + * Arctic + * `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. + * + * ✅︎ +- * `BaiChuanForCausalLM` + * Baichuan2, Baichuan + * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. + * ✅︎ + * ✅︎ +- * `BloomForCausalLM` + * BLOOM, BLOOMZ, BLOOMChat + * `bigscience/bloom`, `bigscience/bloomz`, etc. + * + * ✅︎ +- * `BartForConditionalGeneration` + * BART + * `facebook/bart-base`, `facebook/bart-large-cnn`, etc. 
+ * + * +- * `ChatGLMModel` + * ChatGLM + * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. + * ✅︎ + * ✅︎ +- * `CohereForCausalLM`, `Cohere2ForCausalLM` + * Command-R + * `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. + * ✅︎ + * ✅︎ +- * `DbrxForCausalLM` + * DBRX + * `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. + * + * ✅︎ +- * `DeciLMForCausalLM` + * DeciLM + * `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. + * + * ✅︎ +- * `DeepseekForCausalLM` + * DeepSeek + * `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. + * + * ✅︎ +- * `DeepseekV2ForCausalLM` + * DeepSeek-V2 + * `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. + * + * ✅︎ +- * `DeepseekV3ForCausalLM` + * DeepSeek-V3 + * `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. + * + * ✅︎ +- * `ExaoneForCausalLM` + * EXAONE-3 + * `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. + * ✅︎ + * ✅︎ +- * `FalconForCausalLM` + * Falcon + * `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. + * + * ✅︎ +- * `FalconMambaForCausalLM` + * FalconMamba + * `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. + * ✅︎ + * ✅︎ +- * `GemmaForCausalLM` + * Gemma + * `google/gemma-2b`, `google/gemma-7b`, etc. + * ✅︎ + * ✅︎ +- * `Gemma2ForCausalLM` + * Gemma2 + * `google/gemma-2-9b`, `google/gemma-2-27b`, etc. + * ✅︎ + * ✅︎ +- * `GlmForCausalLM` + * GLM-4 + * `THUDM/glm-4-9b-chat-hf`, etc. + * ✅︎ + * ✅︎ +- * `GPT2LMHeadModel` + * GPT-2 + * `gpt2`, `gpt2-xl`, etc. + * + * ✅︎ +- * `GPTBigCodeForCausalLM` + * StarCoder, SantaCoder, WizardCoder + * `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. + * ✅︎ + * ✅︎ +- * `GPTJForCausalLM` + * GPT-J + * `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. + * + * ✅︎ +- * `GPTNeoXForCausalLM` + * GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM + * `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. + * + * ✅︎ +- * `GraniteForCausalLM` + * Granite 3.0, Granite 3.1, PowerLM + * `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. + * ✅︎ + * ✅︎ +- * `GraniteMoeForCausalLM` + * Granite 3.0 MoE, PowerMoE + * `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. + * ✅︎ + * ✅︎ +- * `GritLM` + * GritLM + * `parasail-ai/GritLM-7B-vllm`. + * ✅︎ + * ✅︎ +- * `InternLMForCausalLM` + * InternLM + * `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. + * ✅︎ + * ✅︎ +- * `InternLM2ForCausalLM` + * InternLM2 + * `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. + * ✅︎ + * ✅︎ +- * `InternLM3ForCausalLM` + * InternLM3 + * `internlm/internlm3-8b-instruct`, etc. + * ✅︎ + * ✅︎ +- * `JAISLMHeadModel` + * Jais + * `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. + * + * ✅︎ +- * `JambaForCausalLM` + * Jamba + * `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. + * ✅︎ + * ✅︎ +- * `LlamaForCausalLM` + * Llama 3.1, Llama 3, Llama 2, LLaMA, Yi + * `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. 
+ * ✅︎ + * ✅︎ +- * `MambaForCausalLM` + * Mamba + * `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. + * + * ✅︎ +- * `MiniCPMForCausalLM` + * MiniCPM + * `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. + * ✅︎ + * ✅︎ +- * `MiniCPM3ForCausalLM` + * MiniCPM3 + * `openbmb/MiniCPM3-4B`, etc. + * ✅︎ + * ✅︎ +- * `MistralForCausalLM` + * Mistral, Mistral-Instruct + * `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. + * ✅︎ + * ✅︎ +- * `MixtralForCausalLM` + * Mixtral-8x7B, Mixtral-8x7B-Instruct + * `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. + * ✅︎ + * ✅︎ +- * `MPTForCausalLM` + * MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter + * `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. + * + * ✅︎ +- * `NemotronForCausalLM` + * Nemotron-3, Nemotron-4, Minitron + * `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + * ✅︎ + * ✅︎ +- * `OLMoForCausalLM` + * OLMo + * `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. + * + * ✅︎ +- * `OLMo2ForCausalLM` + * OLMo2 + * `allenai/OLMo2-7B-1124`, etc. + * + * ✅︎ +- * `OLMoEForCausalLM` + * OLMoE + * `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. + * ✅︎ + * ✅︎ +- * `OPTForCausalLM` + * OPT, OPT-IML + * `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. + * + * ✅︎ +- * `OrionForCausalLM` + * Orion + * `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. + * + * ✅︎ +- * `PhiForCausalLM` + * Phi + * `microsoft/phi-1_5`, `microsoft/phi-2`, etc. + * ✅︎ + * ✅︎ +- * `Phi3ForCausalLM` + * Phi-4, Phi-3 + * `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. + * ✅︎ + * ✅︎ +- * `Phi3SmallForCausalLM` + * Phi-3-Small + * `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. + * + * ✅︎ +- * `PhiMoEForCausalLM` + * Phi-3.5-MoE + * `microsoft/Phi-3.5-MoE-instruct`, etc. + * ✅︎ + * ✅︎ +- * `PersimmonForCausalLM` + * Persimmon + * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. + * + * ✅︎ +- * `QWenLMHeadModel` + * Qwen + * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForCausalLM` + * QwQ, Qwen2 + * `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2MoeForCausalLM` + * Qwen2MoE + * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. + * + * ✅︎ +- * `StableLmForCausalLM` + * StableLM + * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. + * + * ✅︎ +- * `Starcoder2ForCausalLM` + * Starcoder2 + * `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. + * + * ✅︎ +- * `SolarForCausalLM` + * Solar Pro + * `upstage/solar-pro-preview-instruct`, etc. + * ✅︎ + * ✅︎ +- * `TeleChat2ForCausalLM` + * TeleChat2 + * `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. + * ✅︎ + * ✅︎ +- * `XverseForCausalLM` + * XVERSE + * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. + * ✅︎ + * ✅︎ +::: + +:::{note} Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096. -``` +::: ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. 
-```{important} +:::{important} Since some model architectures support both generative and pooling tasks, you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -``` +::: #### Text Embedding (`--task embed`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `BertModel` - - BERT-based - - `BAAI/bge-base-en-v1.5`, etc. - - - - -* - `Gemma2Model` - - Gemma2-based - - `BAAI/bge-multilingual-gemma2`, etc. - - - - ✅︎ -* - `GritLM` - - GritLM - - `parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ -* - `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. - - Llama-based - - `intfloat/e5-mistral-7b-instruct`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2Model`, `Qwen2ForCausalLM` - - Qwen2-based - - `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - - ✅︎ - - ✅︎ -* - `RobertaModel`, `RobertaForMaskedLM` - - RoBERTa-based - - `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc. - - - - -* - `XLMRobertaModel` - - XLM-RoBERTa-based - - `intfloat/multilingual-e5-large`, etc. - - - - -``` - -```{note} +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `BertModel` + * BERT-based + * `BAAI/bge-base-en-v1.5`, etc. + * + * +- * `Gemma2Model` + * Gemma2-based + * `BAAI/bge-multilingual-gemma2`, etc. + * + * ✅︎ +- * `GritLM` + * GritLM + * `parasail-ai/GritLM-7B-vllm`. + * ✅︎ + * ✅︎ +- * `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. + * Llama-based + * `intfloat/e5-mistral-7b-instruct`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2Model`, `Qwen2ForCausalLM` + * Qwen2-based + * `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. + * ✅︎ + * ✅︎ +- * `RobertaModel`, `RobertaForMaskedLM` + * RoBERTa-based + * `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc. + * + * +- * `XLMRobertaModel` + * XLM-RoBERTa-based + * `intfloat/multilingual-e5-large`, etc. + * + * +::: + +:::{note} `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config. You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`. -``` +::: -```{note} +:::{note} Unlike base Qwen2, `Alibaba-NLP/gte-Qwen2-7B-instruct` uses bi-directional attention. You can set `--hf-overrides '{"is_causal": false}'` to change the attention mask accordingly. @@ -438,7 +438,7 @@ despite being described otherwise on its model card. Regardless of the variant, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded. See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882). -``` +::: If your model is not in the above list, we will try to automatically convert the model using {func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings @@ -446,93 +446,98 @@ of the whole prompt are extracted from the normalized hidden state corresponding #### Reward Modeling (`--task reward`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `InternLM2ForRewardModel` - - InternLM2-based - - `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. 
- - ✅︎ - - ✅︎ -* - `LlamaForCausalLM` - - Llama-based - - `peiyi9979/math-shepherd-mistral-7b-prm`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForRewardModel` - - Qwen2-based - - `Qwen/Qwen2.5-Math-RM-72B`, etc. - - ✅︎ - - ✅︎ -``` +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `InternLM2ForRewardModel` + * InternLM2-based + * `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. + * ✅︎ + * ✅︎ +- * `LlamaForCausalLM` + * Llama-based + * `peiyi9979/math-shepherd-mistral-7b-prm`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForRewardModel` + * Qwen2-based + * `Qwen/Qwen2.5-Math-RM-72B`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForProcessRewardModel` + * Qwen2-based + * `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc. + * ✅︎ + * ✅︎ +::: If your model is not in the above list, we will try to automatically convert the model using {func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. -```{important} +:::{important} For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`. -``` +::: #### Classification (`--task classify`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `JambaForSequenceClassification` - - Jamba - - `ai21labs/Jamba-tiny-reward-dev`, etc. - - ✅︎ - - ✅︎ -* - `Qwen2ForSequenceClassification` - - Qwen2-based - - `jason9693/Qwen2.5-1.5B-apeach`, etc. - - ✅︎ - - ✅︎ -``` +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `JambaForSequenceClassification` + * Jamba + * `ai21labs/Jamba-tiny-reward-dev`, etc. + * ✅︎ + * ✅︎ +- * `Qwen2ForSequenceClassification` + * Qwen2-based + * `jason9693/Qwen2.5-1.5B-apeach`, etc. + * ✅︎ + * ✅︎ +::: If your model is not in the above list, we will try to automatically convert the model using {func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) -```{list-table} +:::{list-table} :widths: 25 25 50 5 5 :header-rows: 1 -* - Architecture - - Models - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `BertForSequenceClassification` - - BERT-based - - `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. - - - - -* - `RobertaForSequenceClassification` - - RoBERTa-based - - `cross-encoder/quora-roberta-base`, etc. - - - - -* - `XLMRobertaForSequenceClassification` - - XLM-RoBERTa-based - - `BAAI/bge-reranker-v2-m3`, etc. - - - - -``` +- * Architecture + * Models + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `BertForSequenceClassification` + * BERT-based + * `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. + * + * +- * `RobertaForSequenceClassification` + * RoBERTa-based + * `cross-encoder/quora-roberta-base`, etc. + * + * +- * `XLMRobertaForSequenceClassification` + * XLM-RoBERTa-based + * `BAAI/bge-reranker-v2-m3`, etc. + * + * +::: (supported-mm-models)= @@ -555,11 +560,12 @@ On the other hand, modalities separated by `/` are mutually exclusive. See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model. 
-````{important} +:::{important} To enable multiple multi-modal items per text prompt, you have to set `limit_mm_per_prompt` (offline inference) or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt: Offline inference: + ```python llm = LLM( model="Qwen/Qwen2-VL-7B-Instruct", @@ -568,14 +574,16 @@ llm = LLM( ``` Online serving: + ```bash vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt image=4 ``` -```` -```{note} +::: + +:::{note} vLLM currently only supports adding LoRA to the language backbone of multimodal models. -``` +::: ### Generative Models @@ -583,262 +591,263 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{list-table} +:::{list-table} :widths: 25 25 15 20 5 5 5 :header-rows: 1 -* - Architecture - - Models - - Inputs - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) - - [V1](gh-issue:8779) -* - `AriaForConditionalGeneration` - - Aria - - T + I+ - - `rhymes-ai/Aria` - - - - ✅︎ - - ✅︎ -* - `Blip2ForConditionalGeneration` - - BLIP-2 - - T + IE - - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. - - - - ✅︎ - - ✅︎ -* - `ChameleonForConditionalGeneration` - - Chameleon - - T + I - - `facebook/chameleon-7b` etc. - - - - ✅︎ - - ✅︎ -* - `DeepseekVLV2ForCausalLM` - - DeepSeek-VL2 - - T + I+ - - `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) - - - - ✅︎ - - ✅︎ -* - `FuyuForCausalLM` - - Fuyu - - T + I - - `adept/fuyu-8b` etc. - - - - ✅︎ - - ✅︎ -* - `ChatGLMModel` - - GLM-4V - - T + I - - `THUDM/glm-4v-9b` etc. - - ✅︎ - - ✅︎ - - -* - `H2OVLChatModel` - - H2OVL - - T + IE+ - - `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. - - - - ✅︎ - - -* - `Idefics3ForConditionalGeneration` - - Idefics3 - - T + I - - `HuggingFaceM4/Idefics3-8B-Llama3` etc. - - ✅︎ - - - - -* - `InternVLChatModel` - - InternVL 2.5, Mono-InternVL, InternVL 2.0 - - T + IE+ - - `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. - - - - ✅︎ - - ✅︎ -* - `LlavaForConditionalGeneration` - - LLaVA-1.5 - - T + IE+ - - `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. - - - - ✅︎ - - ✅︎ -* - `LlavaNextForConditionalGeneration` - - LLaVA-NeXT - - T + IE+ - - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - - - ✅︎ - - ✅︎ -* - `LlavaNextVideoForConditionalGeneration` - - LLaVA-NeXT-Video - - T + V - - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - - - ✅︎ - - ✅︎ -* - `LlavaOnevisionForConditionalGeneration` - - LLaVA-Onevision - - T + I+ + V+ - - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - - - ✅︎ - - ✅︎ -* - `MiniCPMV` - - MiniCPM-V - - T + IE+ - - `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. - - ✅︎ - - ✅︎ - - -* - `MllamaForConditionalGeneration` - - Llama 3.2 - - T + I+ - - `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. - - - - - - -* - `MolmoForCausalLM` - - Molmo - - T + I - - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. - - ✅︎ - - ✅︎ - - ✅︎ -* - `NVLM_D_Model` - - NVLM-D 1.0 - - T + IE+ - - `nvidia/NVLM-D-72B`, etc. - - - - ✅︎ - - ✅︎ -* - `PaliGemmaForConditionalGeneration` - - PaliGemma, PaliGemma 2 - - T + IE - - `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. 
- - - - ✅︎ - - -* - `Phi3VForCausalLM` - - Phi-3-Vision, Phi-3.5-Vision - - T + IE+ - - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. - - - - ✅︎ - - ✅︎ -* - `PixtralForConditionalGeneration` - - Pixtral - - T + I+ - - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. - - - - ✅︎ - - ✅︎ -* - `QWenLMHeadModel` - - Qwen-VL - - T + IE+ - - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. - - ✅︎ - - ✅︎ - - -* - `Qwen2AudioForConditionalGeneration` - - Qwen2-Audio - - T + A+ - - `Qwen/Qwen2-Audio-7B-Instruct` - - - - ✅︎ - - ✅︎ -* - `Qwen2VLForConditionalGeneration` - - QVQ, Qwen2-VL - - T + IE+ + VE+ - - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. - - ✅︎ - - ✅︎ - - -* - `UltravoxModel` - - Ultravox - - T + AE+ - - `fixie-ai/ultravox-v0_3` - - - - ✅︎ - - ✅︎ -``` +- * Architecture + * Models + * Inputs + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) + * [V1](gh-issue:8779) +- * `AriaForConditionalGeneration` + * Aria + * T + I+ + * `rhymes-ai/Aria` + * + * ✅︎ + * ✅︎ +- * `Blip2ForConditionalGeneration` + * BLIP-2 + * T + IE + * `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. + * + * ✅︎ + * ✅︎ +- * `ChameleonForConditionalGeneration` + * Chameleon + * T + I + * `facebook/chameleon-7b` etc. + * + * ✅︎ + * ✅︎ +- * `DeepseekVLV2ForCausalLM` + * DeepSeek-VL2 + * T + I+ + * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc. (see note) + * + * ✅︎ + * ✅︎ +- * `FuyuForCausalLM` + * Fuyu + * T + I + * `adept/fuyu-8b` etc. + * + * ✅︎ + * ✅︎ +- * `ChatGLMModel` + * GLM-4V + * T + I + * `THUDM/glm-4v-9b` etc. + * ✅︎ + * ✅︎ + * +- * `H2OVLChatModel` + * H2OVL + * T + IE+ + * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. + * + * ✅︎ + * +- * `Idefics3ForConditionalGeneration` + * Idefics3 + * T + I + * `HuggingFaceM4/Idefics3-8B-Llama3` etc. + * ✅︎ + * + * +- * `InternVLChatModel` + * InternVL 2.5, Mono-InternVL, InternVL 2.0 + * T + IE+ + * `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. + * + * ✅︎ + * ✅︎ +- * `LlavaForConditionalGeneration` + * LLaVA-1.5 + * T + IE+ + * `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. + * + * ✅︎ + * ✅︎ +- * `LlavaNextForConditionalGeneration` + * LLaVA-NeXT + * T + IE+ + * `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. + * + * ✅︎ + * ✅︎ +- * `LlavaNextVideoForConditionalGeneration` + * LLaVA-NeXT-Video + * T + V + * `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. + * + * ✅︎ + * ✅︎ +- * `LlavaOnevisionForConditionalGeneration` + * LLaVA-Onevision + * T + I+ + V+ + * `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. + * + * ✅︎ + * ✅︎ +- * `MiniCPMO` + * MiniCPM-O + * T + IE+ + VE+ + AE+ + * `openbmb/MiniCPM-o-2_6`, etc. + * ✅︎ + * ✅︎ + * +- * `MiniCPMV` + * MiniCPM-V + * T + IE+ + VE+ + * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. + * ✅︎ + * ✅︎ + * +- * `MllamaForConditionalGeneration` + * Llama 3.2 + * T + I+ + * `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. + * + * + * +- * `MolmoForCausalLM` + * Molmo + * T + I + * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. + * ✅︎ + * ✅︎ + * ✅︎ +- * `NVLM_D_Model` + * NVLM-D 1.0 + * T + IE+ + * `nvidia/NVLM-D-72B`, etc. 
+ * + * ✅︎ + * ✅︎ +- * `PaliGemmaForConditionalGeneration` + * PaliGemma, PaliGemma 2 + * T + IE + * `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. + * + * ✅︎ + * +- * `Phi3VForCausalLM` + * Phi-3-Vision, Phi-3.5-Vision + * T + IE+ + * `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. + * + * ✅︎ + * ✅︎ +- * `PixtralForConditionalGeneration` + * Pixtral + * T + I+ + * `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` (see note), etc. + * + * ✅︎ + * ✅︎ +- * `QWenLMHeadModel` + * Qwen-VL + * T + IE+ + * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. + * ✅︎ + * ✅︎ + * ✅︎ +- * `Qwen2AudioForConditionalGeneration` + * Qwen2-Audio + * T + A+ + * `Qwen/Qwen2-Audio-7B-Instruct` + * + * ✅︎ + * ✅︎ +- * `Qwen2VLForConditionalGeneration` + * QVQ, Qwen2-VL + * T + IE+ + VE+ + * `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. + * ✅︎ + * ✅︎ + * ✅︎ +- * `UltravoxModel` + * Ultravox + * T + AE+ + * `fixie-ai/ultravox-v0_3` + * + * ✅︎ + * ✅︎ +::: E Pre-computed embeddings can be inputted for this modality. + Multiple items can be inputted per text prompt for this modality. -````{note} -To use `DeepSeek-VL2` series models, you need to install a fork version `deepseek_vl2` package: - -```shell -pip install git+https://github.com/Isotr0py/DeepSeek-VL2.git -``` - -Besides, to run `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. -```` +:::{note} +To use `DeepSeek-VL2` series models, you have to pass `--hf_overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'` when running vLLM. +::: -```{note} +:::{note} To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. -``` +::: -```{note} +:::{note} The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. For more details, please see: -``` +::: -```{note} +:::{note} The chat template for Pixtral-HF is incorrect (see [discussion](https://huggingface.co/mistral-community/pixtral-12b/discussions/22)). A corrected version is available at . -``` +::: ### Pooling Models See [this page](pooling-models) for more information on how to use pooling models. -```{important} +:::{important} Since some model architectures support both generative and pooling tasks, you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode. -``` +::: #### Text Embedding (`--task embed`) Any text generation model can be converted into an embedding model by passing `--task embed`. -```{note} +:::{note} To get the best results, you should use pooling models that are specifically trained as such. -``` +::: The following table lists those that are tested in vLLM. 
-```{list-table} +:::{list-table} :widths: 25 25 15 25 5 5 :header-rows: 1 -* - Architecture - - Models - - Inputs - - Example HF Models - - [LoRA](#lora-adapter) - - [PP](#distributed-serving) -* - `LlavaNextForConditionalGeneration` - - LLaVA-NeXT-based - - T / I - - `royokong/e5-v` - - - - ✅︎ -* - `Phi3VForCausalLM` - - Phi-3-Vision-based - - T + I - - `TIGER-Lab/VLM2Vec-Full` - - 🚧 - - ✅︎ -* - `Qwen2VLForConditionalGeneration` - - Qwen2-VL-based - - T + I - - `MrLight/dse-qwen2-2b-mrl-v1` - - - - ✅︎ -``` +- * Architecture + * Models + * Inputs + * Example HF Models + * [LoRA](#lora-adapter) + * [PP](#distributed-serving) +- * `LlavaNextForConditionalGeneration` + * LLaVA-NeXT-based + * T / I + * `royokong/e5-v` + * + * ✅︎ +- * `Phi3VForCausalLM` + * Phi-3-Vision-based + * T + I + * `TIGER-Lab/VLM2Vec-Full` + * 🚧 + * ✅︎ +- * `Qwen2VLForConditionalGeneration` + * Qwen2-VL-based + * T + I + * `MrLight/dse-qwen2-2b-mrl-v1` + * + * ✅︎ +::: _________________ @@ -850,9 +859,9 @@ At vLLM, we are committed to facilitating the integration and support of third-p 2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results. - ```{tip} + :::{tip} When comparing the output of `model.generate` from HuggingFace Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs. - ``` + ::: 3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback. diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index daf6e2f250416..3f9ca27eb438e 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -14,9 +14,9 @@ In short, you should increase the number of GPUs and the number of nodes until y After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough. 
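As a concrete illustration of that back-of-the-envelope calculation (using the example log line above and the default block size of 16):

```python
# Estimate KV-cache capacity from the startup log line "# GPU blocks: 790".
num_gpu_blocks = 790   # value printed by vLLM for this configuration
block_size = 16        # default KV-cache block size (tokens per block)

max_cached_tokens = num_gpu_blocks * block_size
print(max_cached_tokens)  # 12640 -> roughly the maximum number of tokens that can be cached
```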
-```{note} +:::{note} There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs. -``` +::: ## Running vLLM on a single node @@ -94,12 +94,12 @@ vllm serve /path/to/the/model/in/the/container \ To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient. -```{warning} +:::{warning} After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. -``` +::: -```{warning} +:::{warning} Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes. When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model. -``` +::: diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md index cd3c6a430b7fa..827c25b50522f 100644 --- a/docs/source/serving/engine_args.md +++ b/docs/source/serving/engine_args.md @@ -4,6 +4,7 @@ Below, you can find an explanation of every engine argument for vLLM: + ```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils @@ -16,6 +17,7 @@ Below, you can find an explanation of every engine argument for vLLM: Below are the additional arguments related to the asynchronous engine: + ```{eval-rst} .. argparse:: :module: vllm.engine.arg_utils diff --git a/docs/source/serving/env_vars.md b/docs/source/serving/env_vars.md index f9b08077a03b4..9845241930a40 100644 --- a/docs/source/serving/env_vars.md +++ b/docs/source/serving/env_vars.md @@ -2,14 +2,14 @@ vLLM uses the following environment variables to configure the system: -```{warning} +:::{warning} Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. 
It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work. All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables). -``` +::: -```{literalinclude} ../../../vllm/envs.py +:::{literalinclude} ../../../vllm/envs.py :end-before: end-env-vars-definition :language: python :start-after: begin-env-vars-definition -``` +::: diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md index 371c284981ce9..e2b4c0814605b 100644 --- a/docs/source/serving/integrations/index.md +++ b/docs/source/serving/integrations/index.md @@ -1,8 +1,8 @@ # External Integrations -```{toctree} +:::{toctree} :maxdepth: 1 langchain llamaindex -``` +::: diff --git a/docs/source/serving/metrics.md b/docs/source/serving/metrics.md index 6c84f6d1350a6..6c0dc8880a90d 100644 --- a/docs/source/serving/metrics.md +++ b/docs/source/serving/metrics.md @@ -31,8 +31,8 @@ vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-I The following metrics are exposed: -```{literalinclude} ../../../vllm/engine/metrics.py +:::{literalinclude} ../../../vllm/engine/metrics.py :end-before: end-metrics-definitions :language: python :start-after: begin-metrics-definitions -``` +::: diff --git a/docs/source/serving/multimodal_inputs.md b/docs/source/serving/multimodal_inputs.md index 0213b0a3388ea..217b531e83788 100644 --- a/docs/source/serving/multimodal_inputs.md +++ b/docs/source/serving/multimodal_inputs.md @@ -4,10 +4,10 @@ This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM. -```{note} +:::{note} We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes, and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests. -``` +::: ## Offline Inference @@ -203,13 +203,13 @@ for o in outputs: Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). -```{important} +:::{important} A chat template is **required** to use Chat Completions API. Although most models come with a chat template, for others you have to define one yourself. The chat template can be inferred based on the documentation on the model's HuggingFace repo. For example, LLaVA-1.5 (`llava-hf/llava-1.5-7b-hf`) requires a chat template that can be found here: -``` +::: ### Image @@ -273,24 +273,25 @@ print("Chat completion output:", chat_response.choices[0].message.content) Full example: -```{tip} +:::{tip} Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine, and pass the file path as `url` in the API request. -``` +::: -```{tip} +:::{tip} There is no need to place image placeholders in the text content of the API request - they are already represented by the image content. 
In fact, you can place image placeholders in the middle of the text by interleaving text and image content. -``` +::: -````{note} +:::{note} By default, the timeout for fetching images through HTTP URL is `5` seconds. You can override this by setting the environment variable: ```console -$ export VLLM_IMAGE_FETCH_TIMEOUT= +export VLLM_IMAGE_FETCH_TIMEOUT= ``` -```` + +::: ### Video @@ -345,14 +346,15 @@ print("Chat completion output from image url:", result) Full example: -````{note} +:::{note} By default, the timeout for fetching videos through HTTP URL is `30` seconds. You can override this by setting the environment variable: ```console -$ export VLLM_VIDEO_FETCH_TIMEOUT= +export VLLM_VIDEO_FETCH_TIMEOUT= ``` -```` + +::: ### Audio @@ -448,24 +450,25 @@ print("Chat completion output from audio url:", result) Full example: -````{note} +:::{note} By default, the timeout for fetching audios through HTTP URL is `10` seconds. You can override this by setting the environment variable: ```console -$ export VLLM_AUDIO_FETCH_TIMEOUT= +export VLLM_AUDIO_FETCH_TIMEOUT= ``` -```` + +::: ### Embedding vLLM's Embeddings API is a superset of OpenAI's [Embeddings API](https://platform.openai.com/docs/api-reference/embeddings), where a list of chat `messages` can be passed instead of batched `inputs`. This enables multi-modal inputs to be passed to embedding models. -```{tip} +:::{tip} The schema of `messages` is exactly the same as in Chat Completions API. You can refer to the above tutorials for more details on how to pass each type of multi-modal data. -``` +::: Usually, embedding models do not expect chat-based input, so we need to use a custom chat template to format the text and images. Refer to the examples below for illustration. @@ -477,13 +480,13 @@ vllm serve TIGER-Lab/VLM2Vec-Full --task embed \ --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja ``` -```{important} +:::{important} Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed` to run this model in embedding mode instead of text generation mode. The custom chat template is completely different from the original one for this model, and can be found here: -``` +::: Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: @@ -518,16 +521,16 @@ vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \ --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja ``` -```{important} +:::{important} Like with VLM2Vec, we have to explicitly pass `--task embed`. Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled by a custom chat template: -``` +::: -```{important} +:::{important} Also important, `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code example below for details. -``` +::: Full example: diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md index 1f5a54f755f13..ded57500c5d0d 100644 --- a/docs/source/serving/offline_inference.md +++ b/docs/source/serving/offline_inference.md @@ -22,15 +22,17 @@ The available APIs depend on the type of model that is being run: Please refer to the above pages for more details about each API. 
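As a rough sketch of the two most common entry points (the model names and the `task="embed"` argument here are only illustrative; generative models use `generate`, while pooling models run in embedding mode use `embed`):

```python
from vllm import LLM

# Generative model: returns text completions via `generate`.
llm = LLM(model="facebook/opt-125m")
print(llm.generate("Hello, my name is")[0].outputs[0].text)

# Pooling model: returns embeddings via `embed` (shown in the same script only for brevity).
embedder = LLM(model="intfloat/e5-mistral-7b-instruct", task="embed")
print(len(embedder.embed("Hello, my name is")[0].outputs.embedding))
```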
-```{seealso} +:::{seealso} [API Reference](/api/offline_inference/index) -``` +::: ## Configuration Options This section lists the most common options for running the vLLM engine. For a full list, refer to the [Engine Arguments](#engine-args) page. +(model-resolution)= + ### Model resolution vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository @@ -41,37 +43,6 @@ Nevertheless, our model resolution may fail for the following reasons: - Unofficial repositories refer to a model using alternative names which are not recorded in vLLM. - The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded. -In those cases, vLLM may throw an error like: - -```text -Traceback (most recent call last): -... - File "vllm/model_executor/models/registry.py", line xxx, in inspect_model_cls - for arch in architectures: -TypeError: 'NoneType' object is not iterable -``` - -or: - -```text - File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported - raise ValueError( -ValueError: Model architectures [''] are not supported for now. Supported architectures: [...] -``` - -:::{note} -The above error is distinct from the following similar but different error: - -```text - File "vllm/model_executor/models/registry.py", line xxx, in _raise_for_unsupported - raise ValueError( -ValueError: Model architectures [''] failed to be inspected. Please check the logs for more details. -``` - -This error means that vLLM failed to import the model file. Usually, it is related to missing dependencies or outdated -binaries in the vLLM build. Please read the logs carefully to determine the real cause of the error. -::: - To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option. For example: @@ -99,12 +70,12 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ``` -```{important} +:::{important} To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`) before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. -``` +::: #### Quantization diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index e49bbb06695f8..82ef54c16dafb 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -50,6 +50,11 @@ In addition, we have the following custom APIs: - Applicable to all [pooling models](../models/pooling_models.md). - [Score API](#score-api) (`/score`) - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). +- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`) + - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/) + - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank) + - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response. + - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`). (chat-template)= @@ -156,11 +161,11 @@ print(completion._request_id) The `vllm serve` command is used to launch the OpenAI-compatible server. 
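Once the server is up, any OpenAI-compatible client can talk to it; a minimal sketch with the official Python client, assuming the default host/port and a dummy API key:

```python
from openai import OpenAI

# Assumes a server started with `vllm serve <model>` on the default port 8000.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

model = client.models.list().data[0].id
completion = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message.content)
```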
-```{argparse} +:::{argparse} :module: vllm.entrypoints.openai.cli_args :func: create_parser_for_docs :prog: vllm serve -``` +::: #### Configuration file @@ -183,10 +188,10 @@ To use the above config file: vllm serve SOME_MODEL --config config.yaml ``` -```{note} +:::{note} In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence. The order of priorities is `command line > config file values > defaults`. -``` +::: ## API Reference @@ -203,19 +208,19 @@ Code example: The following [sampling parameters](#sampling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-completion-sampling-params :end-before: end-completion-sampling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-completion-extra-params :end-before: end-completion-extra-params -``` +::: (chat-api)= @@ -235,19 +240,19 @@ Code example: The following [sampling parameters](#sampling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-chat-completion-sampling-params :end-before: end-chat-completion-sampling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-chat-completion-extra-params :end-before: end-chat-completion-extra-params -``` +::: (embeddings-api)= @@ -259,9 +264,9 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api)) which will be treated as a single prompt to the model. -```{tip} +:::{tip} This enables multi-modal inputs to be passed to embedding models, see [this page](#multimodal-inputs) for details. -``` +::: Code example: @@ -269,27 +274,27 @@ Code example: The following [pooling parameters](#pooling-params) are supported. -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-embedding-pooling-params :end-before: end-embedding-pooling-params -``` +::: The following extra parameters are supported by default: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-embedding-extra-params :end-before: end-embedding-extra-params -``` +::: For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-chat-embedding-extra-params :end-before: end-chat-embedding-extra-params -``` +::: (tokenizer-api)= @@ -460,16 +465,103 @@ Response: The following [pooling parameters](#pooling-params) are supported. 
-```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-score-pooling-params :end-before: end-score-pooling-params -``` +::: The following extra parameters are supported: -```{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py :language: python :start-after: begin-score-extra-params :end-before: end-score-extra-params +::: + +(rerank-api)= + +### Re-rank API + +Our Re-rank API applies a cross-encoder model to predict relevant scores between a single query, and +each of a list of documents. Usually, the score for a sentence pair refers to the similarity between two sentences, on +a scale of 0 to 1. + +You can find the documentation for these kind of models at [sbert.net](https://www.sbert.net/docs/package_reference/cross_encoder/cross_encoder.html). + +The rerank endpoints support popular re-rank models such as `BAAI/bge-reranker-base` and other models supporting the +`score` task. Additionally, `/rerank`, `/v1/rerank`, and `/v2/rerank` +endpoints are compatible with both [Jina AI's re-rank API interface](https://jina.ai/reranker/) and +[Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with +popular open-source tools. + +Code example: + +#### Example Request + +Note that the `top_n` request parameter is optional and will default to the length of the `documents` field. +Result documents will be sorted by relevance, and the `index` property can be used to determine original order. + +Request: + +```bash +curl -X 'POST' \ + 'http://127.0.0.1:8000/v1/rerank' \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -d '{ + "model": "BAAI/bge-reranker-base", + "query": "What is the capital of France?", + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", + "Horses and cows are both animals" + ] +}' +``` + +Response: + +```bash +{ + "id": "rerank-fae51b2b664d4ed38f5969b612edff77", + "model": "BAAI/bge-reranker-base", + "usage": { + "total_tokens": 56 + }, + "results": [ + { + "index": 1, + "document": { + "text": "The capital of France is Paris." + }, + "relevance_score": 0.99853515625 + }, + { + "index": 0, + "document": { + "text": "The capital of Brazil is Brasilia." + }, + "relevance_score": 0.0005860328674316406 + } + ] +} ``` + +#### Extra parameters + +The following [pooling parameters](#pooling-params) are supported. 
+ +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-rerank-pooling-params +:end-before: end-rerank-pooling-params +::: + +The following extra parameters are supported: + +:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py +:language: python +:start-after: begin-rerank-extra-params +:end-before: end-rerank-extra-params +::: diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index 6fd74782a9aae..5952ec13ec3cb 100644 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -67,7 +67,37 @@ def run_qwen2_audio(question: str, audio_count: int): return llm, prompt, stop_token_ids -model_example_map = {"ultravox": run_ultravox, "qwen2_audio": run_qwen2_audio} +def run_minicpmo(question: str, audio_count: int): + model_name = "openbmb/MiniCPM-o-2_6" + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + llm = LLM(model=model_name, + trust_remote_code=True, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}) + + stop_tokens = ['<|im_end|>', '<|endoftext|>'] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + audio_placeholder = "()" * audio_count + audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}" # noqa: E501 + messages = [{ + 'role': 'user', + 'content': f'{audio_placeholder}\n{question}' + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True, + chat_template=audio_chat_template) + return llm, prompt, stop_token_ids + + +model_example_map = { + "ultravox": run_ultravox, + "qwen2_audio": run_qwen2_audio, + "minicpmo": run_minicpmo +} def main(args): diff --git a/examples/offline_inference/openai/openai_batch.md b/examples/offline_inference/openai/openai_batch.md index a4774e57cd9a5..953e6ef130f18 100644 --- a/examples/offline_inference/openai/openai_batch.md +++ b/examples/offline_inference/openai/openai_batch.md @@ -13,7 +13,7 @@ The OpenAI batch file format consists of a series of json objects on new lines. Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. ```{note} -We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon). +We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon). ``` ## Pre-requisites @@ -203,3 +203,34 @@ $ cat results.jsonl {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} ... ``` + +## Example 5: Using score endpoint + +### Additional prerequisites + +* Ensure you are using `vllm >= 0.7.0`. 
+ +### Step 1: Create your batch file + +Add score requests to your batch file. The following is an example: + +``` +{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +``` + +You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model). + +### Step 2: Run the batch + +You can run the batch using the same command as in earlier examples. + +### Step 3: Check your results + +You can check your results by running `cat results.jsonl` + +``` +$ cat results.jsonl +{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} +{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} +``` diff --git a/examples/offline_inference/profiling_tpu/README.md b/examples/offline_inference/profiling_tpu/README.md new file mode 100644 index 0000000000000..08efa63dc1021 --- /dev/null +++ b/examples/offline_inference/profiling_tpu/README.md @@ -0,0 +1,67 @@ +# vLLM TPU Profiling + +This script is used to profile the TPU performance of vLLM for specific prefill or decode token shapes. + +Note: an actual running server is a mix of both prefill of many shapes and decode of many shapes. + +We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/ai_accelerator/index.html). + +> In all examples below, we run several warmups before (so `--enforce-eager` is okay) + +## Profile Examples + +### Generate Prefill Trace + +This example runs Qwen/Qwen2.5-7B-Instruct with a single request of 1024 input tokens. This is set up in attempt to profile just the prefill time and operations. + +```bash +export XLA_HLO_DEBUG=1 +export MODEL=Qwen/Qwen2.5-7B-Instruct +export VLLM_TPU_PROFILE_DURATION_MS=3000 +export VLLM_TPU_PROFILE_DELAY_MS=0 + +python3 profiling.py \ + --model $MODEL \ + --input-len 1024 --output-len 1 \ + --batch-size 1 --enforce-eager \ + --max-model-len 2048 \ + --tensor-parallel-size 1 \ + --profile-result-dir profiles +``` + + +### Generate Decode Trace + +This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. 
This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill). + +```bash +export XLA_HLO_DEBUG=1 +export MODEL=meta-llama/Llama-3.1-70B-Instruct +export VLLM_TPU_PROFILE_DURATION_MS=2000 +export VLLM_TPU_PROFILE_DELAY_MS=1000 + +rm -rf ~/.cache/vllm/xla_cache +python3 profiling.py \ + --model $MODEL \ + --input-len 1 \ + --output-len 128 \ + --batch-size 32 \ + --enforce-eager \ + --profile-result-dir profiles \ + --max-model-len 2048 --tensor-parallel-size 8 +``` + + +## Visualizing the profiles + +Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm). + +Here are most likely the dependencies you need to install: +```bash +pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources +``` + +Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser: +```bash +tensorboard --logdir profiles/ --port 6006 +``` \ No newline at end of file diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py new file mode 100644 index 0000000000000..d7423e6c6da93 --- /dev/null +++ b/examples/offline_inference/profiling_tpu/profiling.py @@ -0,0 +1,101 @@ +import argparse +import dataclasses +import os +import time +from typing import List + +import numpy as np +import torch_xla.debug.profiler as xp +from tqdm import tqdm + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.utils import FlexibleArgumentParser + +DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000)) +DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0)) + + +def main(args: argparse.Namespace): + print(args) + + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + _ = xp.start_server(9012) + + sampling_params = SamplingParams( + temperature=0.0, + ignore_eos=True, + max_tokens=args.output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint(10000, + size=(args.batch_size, + args.input_len)) + dummy_prompts: List[PromptType] = [{ + "prompt_token_ids": batch + } for batch in dummy_prompt_token_ids.tolist()] + + def run_to_completion(): + start_time = time.perf_counter() + llm.generate(dummy_prompts, + sampling_params=sampling_params, + use_tqdm=False) + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + # Warmup + print("Warming up...") + warmup_latencies = [] + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + warmup_latencies.append(run_to_completion()) + print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s") + + # Profile + profile_dir = args.profile_result_dir + print(f"Profiling (results will be saved to '{profile_dir}')...") + # Enable tracing on server + xp.trace_detached("localhost:9012", + profile_dir, + delay_ms=DELAY_MS, + duration_ms=DURATION_MS) + if DELAY_MS == 0: + time.sleep(1.0) + profile_latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profile iterations"): + profile_latencies.append(run_to_completion()) + print(f"Average profile latency: {np.mean(profile_latencies):.4f}s") + + return + + +if __name__ == '__main__': + parser = 
FlexibleArgumentParser( + description='Benchmark the latency of processing a single batch of ' + 'requests till completion.') + parser.add_argument('--input-len', type=int, default=32) + parser.add_argument('--output-len', type=int, default=128) + parser.add_argument('--batch-size', type=int, default=8) + parser.add_argument('--num-iters-warmup', + type=int, + default=5, + help='Number of iterations to run for warmup.') + parser.add_argument('--num-iters', + type=int, + default=1, + help='Number of iterations to run for profiling.') + parser.add_argument( + '--profile-result-dir', + type=str, + default="profiles", + help= + ('path to save the pytorch profiler output. Can be visualized ' + 'with ui.perfetto.dev or Tensorboard ' + '(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).' + )) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py index 3bc303dad277f..5c4918008dcb3 100644 --- a/examples/offline_inference/rlhf.py +++ b/examples/offline_inference/rlhf.py @@ -19,7 +19,7 @@ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from transformers import AutoModelForCausalLM -from vllm import LLM, SamplingParams, configure_as_vllm_process +from vllm import LLM, SamplingParams from vllm.utils import get_ip, get_open_port from vllm.worker.worker import Worker @@ -98,12 +98,7 @@ def __init__(self, *args, **kwargs): """ Start the training process, here we use huggingface transformers as an example to hold a model on GPU 0. - -It is important for all the processes outside of vLLM to call -`configure_as_vllm_process` to set some common environment variables -the same as vLLM workers. """ -configure_as_vllm_process() train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") train_model.to("cuda:0") diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 69228bbf22949..38c2b13d3f2c7 100644 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -26,14 +26,12 @@ def run_aria(question: str, modality: str): # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM(model=model_name, - tokenizer_mode="slow", - dtype="bfloat16", max_model_len=4096, max_num_seqs=2, - trust_remote_code=True, + dtype="bfloat16", disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) - prompt = (f"<|im_start|>user\n<|img|>\n{question}" + prompt = (f"<|im_start|>user\n<|img|>{question}" "<|im_end|>\n<|im_start|>assistant\n") stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] @@ -267,8 +265,9 @@ def run_mantis(question: str, modality: str): # MiniCPM-V -def run_minicpmv(question: str, modality: str): - assert modality == "image" +def run_minicpmv_base(question: str, modality: str, model_name): + assert modality in ["image", "video"] + # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa # 2.0 # The official repo doesn't work yet, so we need to use a fork for now @@ -279,7 +278,15 @@ def run_minicpmv(question: str, modality: str): # model_name = "openbmb/MiniCPM-Llama3-V-2_5" # 2.6 - model_name = "openbmb/MiniCPM-V-2_6" + # model_name = "openbmb/MiniCPM-V-2_6" + # o2.6 + + # modality supports + # 2.0: image + # 2.5: image + # 2.6: image, video + # o2.6: image, video, audio + # model_name = "openbmb/MiniCPM-o-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, 
trust_remote_code=True) llm = LLM( @@ -296,13 +303,18 @@ def run_minicpmv(question: str, modality: str): # 2.5 # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - # 2.6 + # 2.6 / o2.6 stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + modality_placeholder = { + "image": "(./)", + "video": "()", + } + messages = [{ 'role': 'user', - 'content': f'(./)\n{question}' + 'content': f'{modality_placeholder[modality]}\n{question}' }] prompt = tokenizer.apply_chat_template(messages, tokenize=False, @@ -310,6 +322,14 @@ def run_minicpmv(question: str, modality: str): return llm, prompt, stop_token_ids +def run_minicpmo(question: str, modality: str): + return run_minicpmv_base(question, modality, "openbmb/MiniCPM-o-2_6") + + +def run_minicpmv(question: str, modality: str): + return run_minicpmv_base(question, modality, "openbmb/MiniCPM-V-2_6") + + # LLama 3.2 def run_mllama(question: str, modality: str): assert modality == "image" @@ -525,6 +545,7 @@ def run_qwen2_vl(question: str, modality: str): "llava-next-video": run_llava_next_video, "llava-onevision": run_llava_onevision, "mantis": run_mantis, + "minicpmo": run_minicpmo, "minicpmv": run_minicpmv, "mllama": run_mllama, "molmo": run_molmo, diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index cf3c5dd4e0a2c..43c44fa867e0a 100644 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -393,7 +393,7 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: model_example_map = { "aria": load_aria, - "deepseek_vl2": load_deepseek_vl2, + "deepseek_vl_v2": load_deepseek_vl2, "h2ovl_chat": load_h2onvl, "idefics3": load_idefics3, "internvl_chat": load_internvl, diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py new file mode 100644 index 0000000000000..a07affe3351ce --- /dev/null +++ b/examples/online_serving/cohere_rerank_client.py @@ -0,0 +1,32 @@ +""" +Example of using the OpenAI entrypoint's rerank API which is compatible with +the Cohere SDK: https://github.com/cohere-ai/cohere-python + +run: vllm serve BAAI/bge-reranker-base +""" +import cohere + +# cohere v1 client +co = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key") +rerank_v1_result = co.rerank( + model="BAAI/bge-reranker-base", + query="What is the capital of France?", + documents=[ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" + ]) + +print(rerank_v1_result) + +# or the v2 +co2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000") + +v2_rerank_result = co2.rerank( + model="BAAI/bge-reranker-base", + query="What is the capital of France?", + documents=[ + "The capital of France is Paris", "Reranking is fun!", + "vLLM is an open-source framework for fast AI serving" + ]) + +print(v2_rerank_result) diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py new file mode 100644 index 0000000000000..bf4de76ddf362 --- /dev/null +++ b/examples/online_serving/jinaai_rerank_client.py @@ -0,0 +1,33 @@ +""" +Example of using the OpenAI entrypoint's rerank API which is compatible with +Jina and Cohere https://jina.ai/reranker + +run: vllm serve BAAI/bge-reranker-base +""" +import json + +import requests + +url = "http://127.0.0.1:8000/rerank" + 
+headers = {"accept": "application/json", "Content-Type": "application/json"} + +data = { + "model": + "BAAI/bge-reranker-base", + "query": + "What is the capital of France?", + "documents": [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", "Horses and cows are both animals" + ] +} +response = requests.post(url, headers=headers, json=data) + +# Check the response +if response.status_code == 200: + print("Request successful!") + print(json.dumps(response.json(), indent=2)) +else: + print(f"Request failed with status code: {response.status_code}") + print(response.text) diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py new file mode 100644 index 0000000000000..83e51a48bcc6b --- /dev/null +++ b/examples/online_serving/openai_chat_completion_with_reasoning.py @@ -0,0 +1,53 @@ +""" +An example shows how to generate chat completions from reasoning models +like DeepSeekR1. + +To run this example, you need to start the vLLM server with the reasoning +parser: + +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +This example demonstrates how to generate chat completions from reasoning models +using the OpenAI Python client library. +""" + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +# Round 1 +messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] +response = client.chat.completions.create(model=model, messages=messages) + +reasoning_content = response.choices[0].message.reasoning_content +content = response.choices[0].message.content + +print("reasoning_content:", reasoning_content) +print("content:", content) + +# Round 2 +messages.append({"role": "assistant", "content": content}) +messages.append({ + "role": "user", + "content": "How many Rs are there in the word 'strawberry'?", +}) +response = client.chat.completions.create(model=model, messages=messages) + +reasoning_content = response.choices[0].message.reasoning_content +content = response.choices[0].message.content + +print("reasoning_content:", reasoning_content) +print("content:", content) diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py new file mode 100644 index 0000000000000..8c14aac6b4ecb --- /dev/null +++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py @@ -0,0 +1,90 @@ +""" +An example shows how to generate chat completions from reasoning models +like DeepSeekR1. + +To run this example, you need to start the vLLM server with the reasoning +parser: + +```bash +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \ + --enable-reasoning --reasoning-parser deepseek_r1 +``` + +Unlike openai_chat_completion_with_reasoning.py, this example demonstrates the +streaming chat completions feature. + +The streaming chat completions feature allows you to receive chat completions +in real-time as they are generated by the model. This is useful for scenarios +where you want to display chat completions to the user as they are generated +by the model. 
+ +Here we do not use the OpenAI Python client library, because it does not support +`reasoning_content` fields in the response. +""" + +import json + +import requests + +# Modify OpenAI's API key and API base to use vLLM's API server. +openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +models = requests.get( + f"{openai_api_base}/models", + headers={ + "Authorization": f"Bearer {openai_api_key}" + }, +).json() +model = models["data"][0]["id"] + +# Streaming chat completions +messages = [{"role": "user", "content": "9.11 and 9.8, which is greater?"}] + +response = requests.post( + f"{openai_api_base}/chat/completions", + headers={"Authorization": f"Bearer {openai_api_key}"}, + json={ + "model": model, + "messages": messages, + "stream": True + }, +) + +print("client: Start streaming chat completions...") +printed_reasoning_content = False +printed_content = False +# Make the streaming request +if response.status_code == 200: + # Process the streaming response + for line in response.iter_lines(): + if line: # Filter out keep-alive new lines + # Decode the line and parse the JSON + decoded_line = line.decode("utf-8") + if decoded_line.startswith("data:"): + data = decoded_line[5:].strip() # Remove "data:" prefix + if data == "[DONE]": # End of stream + print("\nclient: Stream completed.") + break + try: + # Parse the JSON data + chunk = json.loads(data) + reasoning_content = chunk["choices"][0]["delta"].get( + "reasoning_content", "") + content = chunk["choices"][0]["delta"].get("content", "") + + if reasoning_content: + if not printed_reasoning_content: + printed_reasoning_content = True + print("reasoning_content:", end="", flush=True) + print(reasoning_content, end="", flush=True) + elif content: + if not printed_content: + printed_content = True + print("\ncontent:", end="", flush=True) + # Extract and print the content + print(content, end="", flush=True) + except json.JSONDecodeError: + print("Error decoding JSON:", decoded_line) +else: + print(f"Error: {response.status_code} - {response.text}") diff --git a/examples/online_serving/prometheus_grafana/README.md b/examples/online_serving/prometheus_grafana/README.md index c49e5306a1cb4..4a85f953b0b4c 100644 --- a/examples/online_serving/prometheus_grafana/README.md +++ b/examples/online_serving/prometheus_grafana/README.md @@ -24,7 +24,7 @@ Submit some sample requests to the server: ```bash wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python3 ../../benchmarks/benchmark_serving.py \ +python3 ../../../benchmarks/benchmark_serving.py \ --model mistralai/Mistral-7B-v0.1 \ --tokenizer mistralai/Mistral-7B-v0.1 \ --endpoint /v1/completions \ diff --git a/examples/other/fp8/README.md b/examples/other/fp8/README.md deleted file mode 100644 index ee09f09dfdcd2..0000000000000 --- a/examples/other/fp8/README.md +++ /dev/null @@ -1,99 +0,0 @@ -> [!NOTE] ->The examples in this folder are **NOT** Intel Gaudi specific and come from the original vllm-project repository from where this fork was created. For FP8 examples on Intel Gaudi please refer to Intel® Gaudi® README. - -# FP8 KV Cache - -This utility extracts the KV cache scaling factors from a quantized HF (Hugging Face) model. The extracted scaling factors are saved to a JSON file, which can later be used by vLLM (variable-length language model) during runtime. This tool is particularly useful when the KV cache data type is FP8 and is intended for use on ROCm (AMD GPU) platforms. 
- -## Prerequisites - -- Python 3.x -- PyTorch -- NumPy -- Hugging Face Transformers -- Hugging Face Hub -- AMMO - -Before incorporating the FP8 datatype for inference workloads, you must adhere to the following steps: -1. Install all necessary prerequisites and dependencies. -2. Convert HF model into a quantized HF model. -3. Extract KV Cache Scaling Factors from quantized HF model. -4. Load KV Cache Scaling Factors into VLLM. - -### 2. Convert HF model into a quantized HF model. -Note: The following steps are adapted from the [TensorRT-LLM repository](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/README.md). - -`quantize.py` (examples/other/fp8/quantizer/quantize.py) uses the quantization toolkit (AMMO) to calibrate the PyTorch models and export TensorRT-LLM checkpoints. Each TensorRT-LLM checkpoint contains a config file (in .json format) and one or several rank weight files (in .safetensors format). - -The detailed quantization toolkit (AMMO) conversion guide for FP8 can be found at `examples/other/fp8/quantizer/README.md`. - -### 3. Extract KV Cache Scaling Factors from quantized HF model. -`extract_scales.py` (examples/other/fp8/extract_scales.py) can be utilized to extract the KV cache scaling factors from your quantized HF model, however at the moment, this tool exclusively supports Llama 2 models. It is also important to note the following: -1. **File Structure**: The utility operates under the assumption that all parameters, including KV cache scaling factors, corresponding to a particular Tensor Parallelism (TP) rank are stored in a single file. These files must adhere to a specific naming convention where the TP rank is immediately identified after a specific keyword (e.g., "rank") in the filename. - -2. **TP Decomposition**: The utility assumes consistency between the TP decomposition employed by the quantizer tool and that used by vLLM. - -3. **AMMO Compatibility**: Currently, the generated KV cache scaling factors for AMMO remain uniform across all TP ranks. - -```python -# prerequisites: -# - Quantized HF LLaMa 2 model -python3 examples/other/fp8/extract_scales.py --help -Usage: extract_scales.py [-h] --quantized_model QUANTIZED_MODEL [--load_format {auto,safetensors,npz,pt}] [--output_dir OUTPUT_DIR] [--output_name OUTPUT_NAME] [--tp_size TP_SIZE] - -KV Scale Extraction Example - -optional arguments: ---quantized_model: Specify either the local path to, or name of, a quantized HF model. It is expected that the quantization format is FP8_E4M3, for use on ROCm (AMD GPU). -Optional arguments: ---cache_dir: Specify a cache directory to use in the event of a HF model download. (Default: None) ---load_format: Specify the format of the model's tensor files containing the KV cache scaling factors. (Choices: auto, safetensors, npz, pt; Default: auto) ---revision: Specify the model's revision number. (Default: None) ---output_dir: Specify the output directory. By default the KV cache scaling factors will be saved in the model directory. (Default: None) ---output_name: Specify the output filename. (Default: kv_cache_scales.json) ---tp_size: Specify the tensor-parallel (TP) size that the quantized model should correspond to. If specified, during KV cache scaling factor extraction the observed TP size will be checked against this and an error will be raised if there is a mismatch. (Default: None) -``` -```python -Example: -python3 examples/other/fp8/extract_scales.py --quantized_model --tp_size --output_dir -``` -### 4. Load KV Cache Scaling Factors into VLLM. 
-This script evaluates the inference throughput of language models using various backends such as vLLM. It measures the time taken to process a given number of prompts and generate sequences for each prompt. The KV cache scaling factors generated in the previous step are now integrated into the benchmarking process, so they can be applied when benchmarking with an FP8 KV cache.
-```
-# prerequisites:
-# - LLaMa 2 kv_cache_scales.json file
-
-python3 benchmarks/benchmark_throughput.py --help
-usage: benchmark_throughput.py [-h] [--backend {vllm,hf,mii}] [--dataset DATASET] [--input-len INPUT_LEN] [--output-len OUTPUT_LEN] [--model MODEL]
-                               [--tokenizer TOKENIZER] [--quantization {awq,gptq,None}] [--tensor-parallel-size TENSOR_PARALLEL_SIZE] [--n N]
-                               [--use-beam-search] [--num-prompts NUM_PROMPTS] [--seed SEED] [--hf-max-batch-size HF_MAX_BATCH_SIZE] [--trust-remote-code]
-                               [--max-model-len MAX_MODEL_LEN] [--dtype {auto,half,float16,bfloat16,float,float32}] [--enforce-eager] [--kv-cache-dtype {auto,fp8}]
-                               [--quantization-param-path QUANT_PARAM_JSON]
-
-Benchmark Throughput Example
-optional arguments:
-  -h, --help            show this help message and exit
-  --backend {vllm,hf,mii}
-  --dataset DATASET     Path to the dataset.
-  --input-len INPUT_LEN Input prompt length for each request.
-  --output-len OUTPUT_LEN Output length for each request. Overrides the output length from the dataset.
-  --model MODEL
-  --tokenizer TOKENIZER
-  --quantization {awq,gptq,None}, -q {awq,gptq,None}
-  --tensor-parallel-size TENSOR_PARALLEL_SIZE, -tp TENSOR_PARALLEL_SIZE
-  --n N                 Number of generated sequences per prompt.
-  --use-beam-search
-  --num-prompts NUM_PROMPTS Number of prompts to process.
-  --seed SEED
-  --hf-max-batch-size HF_MAX_BATCH_SIZE Maximum batch size for the HF backend.
-  --trust-remote-code   Trust remote code from Hugging Face.
-  --max-model-len MAX_MODEL_LEN Maximum length of a sequence (including prompt and output). If None, it will be derived from the model.
-  --dtype {auto,half,float16,bfloat16,float,float32} Data type for model weights and activations. The "auto" option uses FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
-  --enforce-eager       Enforce eager execution.
-  --kv-cache-dtype {auto,fp8} Data type for KV cache storage. If "auto", the model data type is used. FP8_E5M2 (without scaling) is only supported on CUDA versions greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is supported instead for common inference criteria.
-  --quantization-param-path QUANT_PARAM_JSON Path to the JSON file containing the KV cache scaling factors. This should generally be supplied when the KV cache dtype is FP8; otherwise, KV cache scaling factors default to 1.0, which may cause accuracy issues. FP8_E5M2 (without scaling) is only supported on CUDA versions greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is supported instead for common inference criteria.
-``` -Example: -```console -python3 benchmarks/benchmark_throughput.py --input-len --output-len -tp --kv-cache-dtype fp8 --quantization-param-path --model -``` diff --git a/examples/other/fp8/extract_scales.py b/examples/other/fp8/extract_scales.py deleted file mode 100644 index 1dce9d7e993a0..0000000000000 --- a/examples/other/fp8/extract_scales.py +++ /dev/null @@ -1,367 +0,0 @@ -import argparse -import glob -import json -import os -from typing import Any, Callable, Dict, List, Optional, Tuple - -import numpy as np -import torch -from safetensors.torch import safe_open - -from vllm.model_executor.layers.quantization.schema import QuantParamSchema - - -# Adapted from vllm/model_executor/model_loader/weight_utils.py -# The main differences are that we add the NPZ format and simplify -# its functionality drastically for our purposes (e.g. we assume that -# the quantized model exists locally and there is no need to download it) -def _prepare_hf_weights( - quantized_model_dir: str, - load_format: str = "auto", - fall_back_to_pt: bool = True, -) -> Tuple[List[str], bool]: - if not os.path.isdir(quantized_model_dir): - raise FileNotFoundError( - f"The quantized model directory `{quantized_model_dir}` " - "does not exist.") - use_safetensors = False - # Some quantized models use .pt files for storing the weights. - if load_format == "auto": - allow_patterns = ["*.safetensors", "*.bin"] - elif load_format == "safetensors": - use_safetensors = True - allow_patterns = ["*.safetensors"] - elif load_format == "pt": - allow_patterns = ["*.pt"] - elif load_format == "npz": - allow_patterns = ["*.npz"] - else: - raise ValueError(f"Unknown load_format: {load_format}") - if fall_back_to_pt: - allow_patterns += ["*.pt"] - - hf_weights_files: List[str] = [] - for pattern in allow_patterns: - hf_weights_files += glob.glob( - os.path.join(quantized_model_dir, pattern)) - if len(hf_weights_files) > 0: - if pattern == "*.safetensors": - use_safetensors = True - break - - if not use_safetensors: - # Exclude files that are not needed for inference. - # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233 - blacklist = [ - "training_args.bin", - "optimizer.bin", - "optimizer.pt", - "scheduler.pt", - "scaler.pt", - ] - hf_weights_files = [ - f for f in hf_weights_files - if not any(f.endswith(x) for x in blacklist) - ] - - if len(hf_weights_files) == 0: - raise RuntimeError( - f"Cannot find any model weights with `{quantized_model_dir}`") - - return hf_weights_files, use_safetensors - - -# Adapted from vllm/model_executor/model_loader/weight_utils.py -def _hf_tensorfile_iterator(filename: str, load_format: str, - use_safetensors: bool): - if load_format == "npz": - assert not use_safetensors - with np.load(filename) as data: - for name in data.files: - param = torch.from_numpy(data[name]) - yield name, param - elif use_safetensors: - with safe_open(filename, framework="pt") as f: - for name in f.keys(): # NOQA: SIM118 - param = f.get_tensor(name) - yield name, param - else: - state = torch.load(filename, map_location="cpu") - for name, param in state.items(): - yield name, param - del state - torch.cuda.empty_cache() - - -def _kv_scales_extractor( - hf_tensor_files: List[str], - use_safetensors: bool, - rank_keyword: str = "rank", - expected_tp_size: Optional[int] = None) -> Dict[int, Dict[int, float]]: - """ - Given a list of files containing tensor data, attempt to extract KV cache - scales from these files. 
Intended as a helper function taking in the output - from _prepare_hf_weights. - Args: - rank_keyword Matches the number immediately after this keyword in the - tensor filename to determine the TP rank corresponding - to said tensor file - expected_tp_size If specified, the TP size of the tensor files is checked - against this and an error is raised if they don't match. - Returns a dictionary mapping TP ranks to their relevant KV cache scales. - The per-rank scales are themselves represented as a dictionary of layer - indices to the respective per-layer scale. - """ - for char in rank_keyword: - assert not char.isdecimal( - ), f"Rank keyword {rank_keyword} contains a numeric character!" - rank_scales_map: Dict[int, Dict[int, float]] = {} - for tensor_file in hf_tensor_files: - try: - rank_idx = tensor_file.find(rank_keyword) - if rank_idx != -1: - start_idx = rank_idx + len(rank_keyword) - stop_idx = start_idx - while stop_idx < len( - tensor_file) and tensor_file[stop_idx].isdecimal(): - stop_idx += 1 - if stop_idx == start_idx: - raise RuntimeError("Did not find rank # in filename.") - rank = int(tensor_file[start_idx:stop_idx]) - elif len(hf_tensor_files) == 1: - # Since there is only one tensor file, we can assume - # that it's intended for TP rank 0 - rank = 0 - else: - raise RuntimeError( - f"Filename does not contain '{rank_keyword}'.") - except RuntimeError: - print("Unable to determine TP rank " - f"corresponding to file '{tensor_file}'") - raise - - if rank not in rank_scales_map: - layer_scales_map: Dict[int, float] = {} - rank_scales_map[rank] = layer_scales_map - else: - raise RuntimeError( - f"Tensor file '{tensor_file}' shares TP rank {rank} " - "with another tensor file.") - - module_delimiter = ":" if args.load_format == "npz" else "." - for name, param in _hf_tensorfile_iterator(tensor_file, - args.load_format, - use_safetensors): - if "kv_cache_scaling_factor" in name: - nums = [ - int(s) for s in name.split(module_delimiter) - if s.isdecimal() - ] - assert len( - nums) == 1, f"Could not determine layer idx for {name}" - layer_idx = nums[0] - assert layer_idx not in layer_scales_map, f"Duplicate scaling"\ - f" factor corresponding to layer {layer_idx}" - try: - layer_scales_map[layer_idx] = param.item() - except RuntimeError: - print( - "This utility supports only per-tensor scalar scales " - f"for now. The tensor\n {name} = {param} \nis an " - "invalid scale factor.") - raise - - if all( - len(layer_scales_map) == 0 - for layer_scales_map in rank_scales_map.values()): - # Note: this is true even if the rank_scales_map is empty - print("WARNING: No KV cache scale factors found. No output saved.") - return None - empirical_tp_world_size = max(rank_scales_map.keys()) + 1 - if expected_tp_size is not None: - assert expected_tp_size == empirical_tp_world_size, \ - f"User expected TP world size = {expected_tp_size} " \ - "from model but tool is expecting TP world size = " \ - f"{empirical_tp_world_size} from model instead." 
- for i in range(empirical_tp_world_size): - assert i in rank_scales_map, "Expected TP world size = "\ - f"{empirical_tp_world_size} but did not find KV " \ - f"cache scaling factors for TP rank {i}" - print(f"Found TP world size = {empirical_tp_world_size} " - "when extracting KV cache scales!") - return rank_scales_map - - -def _metadata_extractor(quantized_model_dir: str, - metadata_extract_fns: \ - Dict[str, Callable[[Dict[str, Any]], Any]]) \ - -> Dict[str, Any]: - """ - Given a directory containing quantized model files, this function - aims to extract metadata from the JSON files within this directory. - Each JSON file is expected to represent a dictionary in JSON - format (referred to as a "JSON-dictionary"). Metadata extraction is - defined by a dictionary called metadata_extract_fns, where each - metadata field name is mapped to an extraction function. - - These extraction functions are designed to take a JSON-dictionary - as their only argument and return the corresponding metadata. - While extraction functions are permitted to raise exceptions, they - should only raise a KeyError or ValueError if the metadata field - cannot be extracted from the current JSON-dictionary, yet there's - a possibility of finding it in another JSON-dictionary. - - The function returns a dictionary that maps metadata fields to - their extracted data. The keys of this dictionary correspond exactly - to those in metadata_extract_fns. If any fields fail to be extracted, - their corresponding values are set to None, and a warning is printed. - """ - if not os.path.isdir(quantized_model_dir): - raise FileNotFoundError( - f"The quantized model directory `{quantized_model_dir}` " - "does not exist.") - metadata_files = glob.glob(os.path.join(quantized_model_dir, "*.json")) - - result: Dict[str, Any] = {} - for file in metadata_files: - with open(file) as f: - try: - metadata = json.load(f) - except json.JSONDecodeError: - print(f"Could not parse `{file}` as a valid metadata file," - " skipping it.") - continue - if not isinstance(metadata, dict): - print(f"The file `{file}` does not correspond to a " - "JSON-serialized dictionary, skipping it.") - continue - for metadata_name, extract_fn in metadata_extract_fns.items(): - try: - metadata_info = extract_fn(metadata) - if metadata_name not in result: - result[metadata_name] = metadata_info - elif metadata_info != result[metadata_name]: - raise RuntimeError( - "Metadata mismatch! Originally found " - f"{metadata_name} = {result[metadata_name]} but " - f"now found {metadata_name} = {metadata_info} in " - f"`{file}`") - except KeyError: - # It is possible that a given file does not contain some - # of our selected metadata as it could be located in some - # other metadata file. - # 'EFINAE': extract_fn failure is not an error. - pass - except ValueError: - # See above. 
- pass - - # Warn if we cannot find any of the requested metadata - for metadata_name in metadata_extract_fns: - if metadata_name not in result: - print("WARNING: Unable to find requested metadata field " - f"`{metadata_name}`, setting it to None.") - result[metadata_name] = None - - return result - - -def main(args): - metadata_extract_fns = { - "model_type": lambda json_dict: json_dict["layers"][0]["decoder_type"], - "tp_size": lambda json_dict: int(json_dict["tensor_parallel"]), - "model_dtype": lambda json_dict: json_dict["dtype"] - } - recovered_metadata = _metadata_extractor(args.quantized_model, - metadata_extract_fns) - if args.tp_size is not None: - metadata_tp_size = recovered_metadata["tp_size"] - if metadata_tp_size is not None: - assert args.tp_size == metadata_tp_size, \ - f"User expected TP world size = {args.tp_size} " \ - f"but found TP world size = {metadata_tp_size} from metadata!" - expected_tp_size = args.tp_size or recovered_metadata["tp_size"] - rank_keyword = "rank" - hf_tensor_files, use_safetensors = _prepare_hf_weights( - args.quantized_model, args.load_format) - rank_scales_map = _kv_scales_extractor(hf_tensor_files, use_safetensors, - rank_keyword, expected_tp_size) - # Postprocess: formatting to the current schema. Consider pulling it - # out into a dedicated function should it ever become more complicated. - rank_scales_map = { - rank: {k: scale[k] - for k in sorted(scale.keys())} - for rank, scale in rank_scales_map.items() - } - # TODO: Expand this with activation and weights scaling factors when - # they are used in the future - schema = QuantParamSchema( - model_type=recovered_metadata["model_type"], - kv_cache={ - "dtype": ("float8_e4m3fn" if len(rank_scales_map) > 0 else - recovered_metadata["model_dtype"]), - "scaling_factor": - rank_scales_map - }, - ) - - if args.output_dir is None: - output_file = os.path.join(args.quantized_model, args.output_name) - else: - if not os.path.isdir(args.output_dir): - os.makedirs(args.output_dir, exist_ok=True) - output_file = os.path.join(args.output_dir, args.output_name) - - with open(output_file, 'w') as f: - f.write(schema.model_dump_json(indent=4)) - print(f"Completed! KV cache scaling factors saved to {output_file}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="This simple utility extracts the " - "KV cache scaling factors from a quantized HF model " - "and saves them to a JSON file compatible with later " - "use by vLLM (pass this file to the appropriate " - "runtime typically using the argument " - "--quantization-param-path ). This is only used " - "if the KV cache dtype is FP8 and on ROCm (AMD GPU).") - parser.add_argument( - "--quantized-model", - help="Specify the directory containing a single quantized HF model. " - "It is expected that the quantization format is FP8_E4M3, for use " - "on ROCm (AMD GPU).", - required=True) - parser.add_argument( - "--load_format", - help="Optionally specify the format of the model's tensor files " - "containing the KV cache scaling factors.", - choices=["auto", "safetensors", "npz", "pt"], - default="auto") - parser.add_argument( - "--output-dir", - help="Optionally specify the output directory. 
By default the " - "KV cache scaling factors will be saved in the model directory, " - "however you can override this behavior here.", - default=None) - parser.add_argument( - "--output-name", - help="Optionally specify the output filename.", - # TODO: Change this once additional scaling factors are enabled - default="kv_cache_scales.json") - parser.add_argument( - "--tp-size", - help="Optionally specify the tensor-parallel (TP) size that the " - "quantized model should correspond to. If specified, during KV " - "cache scaling factor extraction the observed TP size will be " - "checked against this and an error will be raised if there is " - "a mismatch. If not specified, the quantized model's expected " - "TP size is instead inferred from the largest TP rank observed. " - "The expected TP size is cross-checked against the TP ranks " - "observed in the quantized model and an error is raised if any " - "discrepancies are found.", - default=None, - type=int) - args = parser.parse_args() - - main(args) diff --git a/examples/other/fp8/quantizer/README.md b/examples/other/fp8/quantizer/README.md deleted file mode 100644 index d0895e97dc341..0000000000000 --- a/examples/other/fp8/quantizer/README.md +++ /dev/null @@ -1,32 +0,0 @@ -### Quantizer Utilities -`quantize.py`: NVIDIA Quantization utilities using TensorRT-Model-Optimizer, ported -from TensorRT-LLM: [`examples/quantization/quantize.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/quantization/quantize.py) - -### Prerequisite - -#### AMMO (AlgorithMic Model Optimization) Installation: nvidia-ammo 0.7.1 or later -`pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com nvidia-ammo` - -#### AMMO Download (code and docs) -`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.5.0.tar.gz` -`https://developer.nvidia.com/downloads/assets/cuda/files/nvidia-ammo/nvidia_ammo-0.7.1.tar.gz` - -### Usage - -#### Run on H100 system for speed if FP8; number of GPUs depends on the model size - -#### Example: quantize Llama2-7b model from HF to FP8 with FP8 KV Cache: -`python quantize.py --model-dir ./ll2-7b --dtype float16 --qformat fp8 --kv-cache-dtype fp8 --output-dir ./ll2_7b_fp8 --calib-size 512 --tp-size 1` - -Outputs: model structure, quantized model & parameters (with scaling factors) are in JSON and Safetensors (npz is generated only for the reference) -``` -# ll ./ll2_7b_fp8/ -total 19998244 -drwxr-xr-x 2 root root 4096 Feb 7 01:08 ./ -drwxrwxr-x 8 1060 1061 4096 Feb 7 01:08 ../ --rw-r--r-- 1 root root 176411 Feb 7 01:08 llama_tp1.json --rw-r--r-- 1 root root 13477087480 Feb 7 01:09 llama_tp1_rank0.npz --rw-r--r-- 1 root root 7000893272 Feb 7 01:08 rank0.safetensors -# -``` - diff --git a/examples/other/fp8/quantizer/quantize.py b/examples/other/fp8/quantizer/quantize.py deleted file mode 100644 index d75cc8b3d1cf7..0000000000000 --- a/examples/other/fp8/quantizer/quantize.py +++ /dev/null @@ -1,367 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # noqa: E501 -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Adapted from examples/quantization/hf_ptq.py -""" - -import argparse -import copy -import json -import random -import time - -import ammo.torch.quantization as atq -import numpy as np -import torch -from ammo.torch.export import export_model_config -from datasets import load_dataset -from torch.utils.data import DataLoader -from transformers import AutoModelForCausalLM, AutoTokenizer - -RAND_SEED = 1234 -MAX_SEQ_LEN = 2048 - -EMPTY_CFG = { - "quant_cfg": { - "*weight_quantizer": { - "enable": False, - }, - "*input_quantizer": { - "enable": False - }, - "*lm_head*": { - "enable": False - }, - "*output_layer*": { - "enable": False - }, - "default": { - "enable": False - }, - }, - "algorithm": "max", -} - -KV_CACHE_CFG = { - "*.query_key_value.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.Wqkv.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.W_pack.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.c_attn.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.k_proj.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, - "*.v_proj.output_quantizer": { - "num_bits": 8, - "axis": None, - "enable": True - }, -} - -QUANT_CFG_CHOICES = { - "int8_sq": atq.INT8_SMOOTHQUANT_CFG, - "fp8": atq.FP8_DEFAULT_CFG, - "int4_awq": atq.INT4_AWQ_CFG, - "w4a8_awq": atq.W4A8_AWQ_BETA_CFG, - "int8_wo": EMPTY_CFG, - "int4_wo": EMPTY_CFG, - "full_prec": EMPTY_CFG, -} - -MODEL_NAME_PATTERN_MAP = { - "GPT2": "gpt2", - "Xverse": "llama", - "Llama": "llama", - "Mistral": "llama", - "GPTJ": "gptj", - "FalconForCausalLM": "falcon", - "RWForCausalLM": "falcon", - "baichuan": "baichuan", - "MPT": "mpt", - "Bloom": "bloom", - "ChatGLM": "chatglm", - "QWen": "qwen", -} - - -def get_tokenizer(ckpt_path, max_seq_len=MAX_SEQ_LEN, model_type=None): - print(f"Initializing tokenizer from {ckpt_path}") - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, - model_max_length=max_seq_len, - padding_side="left", - trust_remote_code=True, - ) - if model_type and model_type == "qwen": - # qwen use token id 151643 as pad and eos tokens - tokenizer.pad_token = tokenizer.convert_ids_to_tokens(151643) - tokenizer.eos_token = tokenizer.convert_ids_to_tokens(151643) - - # can't set attribute 'pad_token' for "" - if tokenizer.pad_token != "": - tokenizer.pad_token = tokenizer.eos_token - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - assert (tokenizer.pad_token - is not None), f"Pad token for {model_type} cannot be set!" 
- - return tokenizer - - -def get_model(ckpt_path, dtype="fp16", device="cuda"): - print(f"Initializing model from {ckpt_path}") - if dtype == "bf16" or dtype == "bfloat16": - dtype = torch.bfloat16 - elif dtype == "fp16" or dtype == "float16": - dtype = torch.float16 - elif dtype == "fp32" or dtype == "float32": - dtype = torch.float32 - else: - raise NotImplementedError(f"Unknown dtype {dtype}") - - # model_kwargs = {"torch_dtype": dtype} - model_kwargs = {"torch_dtype": "auto"} - - model = AutoModelForCausalLM.from_pretrained(ckpt_path, - device_map="auto", - **model_kwargs, - trust_remote_code=True) - model.eval() - - model_dtype = next(model.parameters()).dtype - if dtype != model_dtype: - print("[TensorRT-LLM][WARNING] The manually set model data type is " - f"{dtype}, but the data type of the HuggingFace model is " - f"{model_dtype}.") - - return model - - -def get_model_type(model): - for k, v in MODEL_NAME_PATTERN_MAP.items(): - if k.lower() in type(model).__name__.lower(): - return v - return None - - -def get_calib_dataloader(data="cnn_dailymail", - tokenizer=None, - batch_size=1, - calib_size=512, - block_size=512, - device=None): - print("Loading calibration dataset") - if data == "pileval": - dataset = load_dataset( - "json", - data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", - split="train") - dataset = dataset["text"][:calib_size] - elif data == "cnn_dailymail": - dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") - dataset = dataset["article"][:calib_size] - else: - raise NotImplementedError - - batch_encoded = tokenizer.batch_encode_plus(dataset, - return_tensors="pt", - padding="max_length", - truncation=True, - max_length=block_size) - if device: - batch_encoded = batch_encoded.to(device) - batch_encoded = batch_encoded["input_ids"] - - calib_dataloader = DataLoader(batch_encoded, - batch_size=batch_size, - shuffle=False) - - return calib_dataloader - - -def quantize_model(model, quant_cfg, calib_dataloader=None): - - def calibrate_loop(): - if calib_dataloader is None: - return - """Adjusts weights and scaling factors based on selected algorithms.""" - for idx, data in enumerate(calib_dataloader): - print(f"Calibrating batch {idx}") - model(data) - - print("Starting quantization...") - start_time = time.time() - atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) - end_time = time.time() - print("Quantization done. Total time used: {:.2f} s.".format(end_time - - start_time)) - - return model - - -def main(args): - if not torch.cuda.is_available(): - raise OSError("GPU is required for inference.") - - random.seed(RAND_SEED) - np.random.seed(RAND_SEED) - - model = get_model(args.model_dir, args.dtype, args.device) - model_type = get_model_type(model) - tokenizer = get_tokenizer(args.model_dir, model_type=model_type) - - if args.qformat in ["full_prec", "int8_wo", "int4_wo" - ] and args.kv_cache_dtype is None: - print(f"No quantization applied, export {args.dtype} model") - else: - if "awq" in args.qformat: - if args.calib_size > 32: - print("AWQ calibration could take longer with calib_size = " - f"{args.calib_size}, Using calib_size=32 instead") - args.calib_size = 32 - print("\nAWQ calibration could take longer than other calibration " - "methods. Please increase the batch size to speed up the " - "calibration process. 
Batch size can be set by adding the " - "argument --batch_size to the command line.\n") - - calib_dataloader = get_calib_dataloader( - tokenizer=tokenizer, - batch_size=args.batch_size, - calib_size=args.calib_size, - device=args.device, - ) - - if args.qformat in QUANT_CFG_CHOICES: - quant_cfg = QUANT_CFG_CHOICES[args.qformat] - else: - raise ValueError( - f"Unsupported quantization format: {args.qformat}") - - if "awq" in args.qformat: - quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[args.qformat]) - weight_quantizer = quant_cfg["quant_cfg"][ - "*weight_quantizer"] # type: ignore - if isinstance(weight_quantizer, list): - weight_quantizer = weight_quantizer[0] - weight_quantizer["block_sizes"][-1] = args.awq_block_size - - if args.kv_cache_dtype is not None: - if args.kv_cache_dtype == "fp8": - for value in KV_CACHE_CFG.values(): - value.update({"num_bits": (4, 3)}) # type: ignore - quant_cfg["quant_cfg"].update(KV_CACHE_CFG) # type: ignore - - print(quant_cfg) - - model = quantize_model(model, quant_cfg, calib_dataloader) - - with torch.inference_mode(): - if model_type is None: - print(f"Unknown model type {type(model).__name__}. Continue " - "exporting...") - model_type = f"unknown:{type(model).__name__}" - - export_path = args.output_dir - start_time = time.time() - - if args.qformat == "int4_awq" and model_type == "qwen": - torch.save(model.state_dict(), export_path) - else: - export_npz = (model_type not in [ - 'gptj', 'falcon', 'chatglm', 'mpt', 'llama', 'baichuan' - ]) - - # export safetensors - export_model_config( - model, - model_type, - getattr(torch, args.dtype), - export_dir=export_path, - inference_tensor_parallel=args.tp_size, - inference_pipeline_parallel=args.pp_size, - # export_tensorrt_llm_config=(not export_npz), - export_tensorrt_llm_config=False, - export_npz=export_npz) - - # Workaround for wo quantization - if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: - with open(f"{export_path}/config.json") as f: - tensorrt_llm_config = json.load(f) - if args.qformat == "int8_wo": - tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16' - elif args.qformat == "int4_wo": - tensorrt_llm_config["quantization"]["quant_algo"] = 'W4A16' - else: - tensorrt_llm_config["quantization"]["quant_algo"] = None - with open(f"{export_path}/config.json", "w") as f: - json.dump(tensorrt_llm_config, f, indent=4) - - end_time = time.time() - print("Quantized model exported to {} \nTotal time used {:.2f} s.". 
- format(export_path, end_time - start_time)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--model-dir", - help="Specify where the HuggingFace model is", - required=True) - parser.add_argument("--device", default="cuda") - parser.add_argument("--dtype", help="Model data type.", default="float16") - parser.add_argument( - "--qformat", - help="Quantization format.", - default="full_prec", - choices=[ - "fp8", "int8_sq", "int4_awq", "w4a8_awq", "int8_wo", "int4_wo", - "full_prec" - ], - ) - parser.add_argument("--batch-size", - help="Batch size for calibration.", - type=int, - default=1) - parser.add_argument("--calib-size", - help="Number of samples for calibration.", - type=int, - default=512) - parser.add_argument("--output-dir", default="exported_model") - parser.add_argument("--tp-size", type=int, default=1) - parser.add_argument("--pp-size", type=int, default=1) - parser.add_argument("--awq-block-size", type=int, default=128) - parser.add_argument("--kv-cache-dtype", - help="KV Cache dtype.", - default=None, - choices=["int8", "fp8", None]) - args = parser.parse_args() - - main(args) diff --git a/format.sh b/format.sh index 2277eef93c745..4bcd0be0c96e5 100755 --- a/format.sh +++ b/format.sh @@ -1,321 +1,5 @@ -#!/usr/bin/env bash -# YAPF formatter, adapted from ray and skypilot. -# -# Usage: -# # Do work and commit your work. +#!/bin/bash -# # Format files that differ from origin/main. -# bash format.sh - -# # Commit changed files with message 'Run yapf and ruff' -# -# -# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. -# You are encouraged to run this locally before pushing changes for review. - -# Cause the script to exit if a single command fails -set -eo pipefail - -# this stops git rev-parse from failing if we run this from the .git directory -builtin cd "$(dirname "${BASH_SOURCE:-$0}")" -ROOT="$(git rev-parse --show-toplevel)" -builtin cd "$ROOT" || exit 1 - -check_command() { - if ! command -v "$1" &> /dev/null; then - echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" - exit 1 - fi -} - -check_command yapf -check_command ruff -check_command mypy -check_command codespell -check_command isort -check_command clang-format - -YAPF_VERSION=$(yapf --version | awk '{print $2}') -RUFF_VERSION=$(ruff --version | awk '{print $2}') -MYPY_VERSION=$(mypy --version | awk '{print $2}') -CODESPELL_VERSION=$(codespell --version) -ISORT_VERSION=$(isort --vn) -CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') -PYMARKDOWNLNT_VERSION=$(pymarkdownlnt version | awk '{print $1}') - -# # params: tool name, tool version, required version -tool_version_check() { - expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) - if [[ "$2" != "$expected" ]]; then - echo "❓❓Wrong $1 version installed: $expected is required, not $2." - exit 1 - fi -} - -tool_version_check "yapf" "$YAPF_VERSION" -tool_version_check "ruff" "$RUFF_VERSION" -tool_version_check "mypy" "$MYPY_VERSION" -tool_version_check "isort" "$ISORT_VERSION" -tool_version_check "codespell" "$CODESPELL_VERSION" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" -tool_version_check "pymarkdownlnt" "$PYMARKDOWNLNT_VERSION" - -YAPF_FLAGS=( - '--recursive' - '--parallel' -) - -YAPF_EXCLUDES=( - '--exclude' 'build/**' -) - -# Format specified files -format() { - yapf --in-place "${YAPF_FLAGS[@]}" "$@" -} - -# Format files that differ from main branch. 
Ignores dirs that are not slated -# for autoformat yet. -format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause yapf to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" - fi - -} - -# Format all files -format_all() { - yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . -} - -## This flag formats individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - format "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is formatted. -elif [[ "$1" == '--all' ]]; then - format_all -else - # Format only the files that changed in last commit. - format_changed -fi -echo 'vLLM yapf: Done' - -# Run mypy -echo 'vLLM mypy:' -tools/mypy.sh -echo 'vLLM mypy: Done' - - -# If git diff returns a file that is in the skip list, the file may be checked anyway: -# https://github.com/codespell-project/codespell/issues/1915 -# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem -CODESPELL_EXCLUDES=( - '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' -) - -# check spelling of specified files -spell_check() { - codespell "$@" -} - -spell_check_all(){ - codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" -} - -# Spelling check of files that differ from main branch. -spell_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - codespell "${CODESPELL_EXCLUDES[@]}" - fi -} - -# Run Codespell -## This flag runs spell check of individual files. --files *must* be the first command line -## arg to use this option. -if [[ "$1" == '--files' ]]; then - spell_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - spell_check_all -else - # Check spelling only of the files that changed in last commit. - spell_check_changed -fi -echo 'vLLM codespell: Done' - - -# Lint specified files -lint() { - ruff check "$@" -} - -# Lint files that differ from main branch. Ignores dirs that are not slated -# for autolint yet. -lint_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - ruff check - fi - -} - -# Run Ruff -### This flag lints individual files. --files *must* be the first command line -### arg to use this option. -if [[ "$1" == '--files' ]]; then - lint "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - lint vllm tests -else - # Format only the files that changed in last commit. - lint_changed -fi -echo 'vLLM ruff: Done' - -# check spelling of specified files -isort_check() { - isort "$@" -} - -isort_check_all(){ - isort . -} - -# Spelling check of files that differ from main branch. -isort_check_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause ruff to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that - # exist on both branches. - MERGEBASE="$(git merge-base origin/main HEAD)" - - if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ - isort - fi -} - -# Run Isort -# This flag runs spell check of individual files. --files *must* be the first command line -# arg to use this option. -if [[ "$1" == '--files' ]]; then - isort_check "${@:2}" - # If `--all` is passed, then any further arguments are ignored and the - # entire python directory is linted. -elif [[ "$1" == '--all' ]]; then - isort_check_all -else - # Check spelling only of the files that changed in last commit. - isort_check_changed -fi -echo 'vLLM isort: Done' - -# Clang-format section -# Exclude some files for formatting because they are vendored -# NOTE: Keep up to date with .github/workflows/clang-format.yml -CLANG_FORMAT_EXCLUDES=( - 'csrc/moe/topk_softmax_kernels.cu' - 'csrc/quantization/gguf/ggml-common.h' - 'csrc/quantization/gguf/dequantize.cuh' - 'csrc/quantization/gguf/vecdotq.cuh' - 'csrc/quantization/gguf/mmq.cuh' - 'csrc/quantization/gguf/mmvq.cuh' -) - -# Format specified files with clang-format -clang_format() { - clang-format -i "$@" -} - -# Format files that differ from main branch with clang-format. -clang_format_changed() { - # The `if` guard ensures that the list of filenames is not empty, which - # could cause clang-format to receive 0 positional arguments, making it hang - # waiting for STDIN. - # - # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that - # exist on both branches. 
- MERGEBASE="$(git merge-base origin/main HEAD)" - - # Get the list of changed files, excluding the specified ones - changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e)) - if [ -n "$changed_files" ]; then - echo "$changed_files" | xargs -P 5 clang-format -i - fi -} - -# Format all files with clang-format -clang_format_all() { - find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ - | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \ - | xargs clang-format -i -} - -# Run clang-format -if [[ "$1" == '--files' ]]; then - clang_format "${@:2}" -elif [[ "$1" == '--all' ]]; then - clang_format_all -else - clang_format_changed -fi -echo 'vLLM clang-format: Done' - -echo 'vLLM actionlint:' -tools/actionlint.sh -color -echo 'vLLM actionlint: Done' - -echo 'vLLM shellcheck:' -tools/shellcheck.sh -echo 'vLLM shellcheck: Done' - -echo 'excalidraw png check:' -tools/png-lint.sh -echo 'excalidraw png check: Done' - -if ! git diff --quiet &>/dev/null; then - echo - echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" - git --no-pager diff --name-only - echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." - - exit 1 -else - echo "✨🎉 Format check passed! Congratulations! 🎉✨" -fi - -echo 'vLLM doc-lint:' -tools/doc-lint.sh -echo 'vLLM doc-lint: Done' +echo "vLLM linting system has been moved from format.sh to pre-commit hook." +echo "Please run 'pip install -r requirements-lint.txt' and 'pre-commit install' to install the pre-commit hook." +echo "Then linters will run automatically before each commit." diff --git a/pyproject.toml b/pyproject.toml index 82275ccafb572..9892967b82d79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,11 @@ build-backend = "setuptools.build_meta" [tool.setuptools_scm] # version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` +[tool.yapfignore] +ignore_patterns = [ + "build/**", +] + [tool.ruff] # Allow lines to be as long as 80. line-length = 80 @@ -52,6 +57,9 @@ ignore = [ "B007", # f-string format "UP032", + # Python 3.8 typing + "UP006", "UP035", + ] [tool.mypy] @@ -103,6 +111,7 @@ markers = [ ] [tool.pymarkdown] +plugins.md004.style = "sublist" # ul-style plugins.md013.enabled = false # line-length plugins.md041.enabled = false # first-line-h1 plugins.md033.enabled = false # inline-html diff --git a/requirements-common.txt b/requirements-common.txt index 6c390bcfd18e6..7051ca8cb50cd 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -19,7 +19,7 @@ pillow # Required for image processing prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer lm-format-enforcer >= 0.10.9, < 0.11 -outlines == 0.1.11 # Requires pytorch +outlines == 0.1.11 lark == 1.2.2 xgrammar >= 0.1.6; platform_machine == "x86_64" typing_extensions >= 4.10 @@ -34,6 +34,6 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch +compressed-tensors == 0.9.0 # required for compressed-tensors depyf==0.18.0 # required for profiling and debugging with compilation config cloudpickle # allows pickling lambda functions in model_executor/models/registry.py diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 056fbf5a7adec..ed0d2c9fae0b6 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -4,5 +4,6 @@ # Dependencies for CPUs torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin" torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin" +torchaudio; platform_machine != "ppc64le" # required for the image processor of minicpm-o-2_6, this must be updated alongside torch torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch datasets # for benchmark scripts diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 8002fbd8ee5b9..78fa360f2dc96 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -5,6 +5,7 @@ ray[default] >= 2.9 nvidia-ml-py >= 12.560.30 # for pynvml package torch == 2.5.1 +torchaudio==2.5.1 # These must be updated alongside torch torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 38113981cd663..95ee976ef955b 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -8,4 +8,4 @@ pandas tabulate setuptools>=61 setuptools-scm>=8 -vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@db80a48 +vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@bac2a62 diff --git a/requirements-lint.txt b/requirements-lint.txt index ffc73f90a0d48..62446f94048df 100644 --- a/requirements-lint.txt +++ b/requirements-lint.txt @@ -1,15 +1,2 @@ # formatting -yapf==0.32.0 -toml==0.10.2 -tomli==2.0.2 -ruff==0.6.5 -codespell==2.3.0 -isort==5.13.2 -clang-format==18.1.5 -pymarkdownlnt==0.9.26 - -# type checking -mypy==1.11.1 -types-PyYAML -types-requests -types-setuptools +pre-commit==4.0.1 diff --git a/requirements-test.in b/requirements-test.in index bc76a91ad5356..13ad17b256734 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -12,6 +12,8 @@ decord # required for video tests einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests +vector_quantize_pytorch # required for minicpmo_26 test +vocos # required for minicpmo_26 test peft pqdm ray[adag]==2.40.0 @@ -19,6 +21,7 @@ sentence-transformers # required for embedding tests soundfile # required for audio tests timm # required for internvl test torch==2.5.1 +torchaudio==2.5.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[opencv] >= 1.5.0 # required for pixtral test diff --git a/requirements-test.txt b/requirements-test.txt index 09e009c2e21f4..df7e904bb0d34 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -106,9 +106,17 @@ dnspython==2.7.0 docutils==0.16 # via awscli einops==0.8.0 - # via -r requirements-test.in + # via + # -r requirements-test.in + # encodec + # vector-quantize-pytorch + # vocos +einx==0.3.0 + # via vector-quantize-pytorch email-validator==2.2.0 # via pydantic 
+encodec==0.1.1 + # via vocos evaluate==0.4.3 # via lm-eval fastparquet==2024.11.0 @@ -125,6 +133,8 @@ filelock==3.16.1 # triton fonttools==4.54.1 # via matplotlib +frozendict==2.4.6 + # via einx frozenlist==1.5.0 # via # aiohttp @@ -159,6 +169,7 @@ huggingface-hub==0.26.2 # timm # tokenizers # transformers + # vocos idna==3.10 # via # anyio @@ -261,6 +272,8 @@ numpy==1.26.4 # cupy-cuda12x # datasets # decord + # einx + # encodec # evaluate # fastparquet # genai-perf @@ -283,6 +296,7 @@ numpy==1.26.4 # torchvision # transformers # tritonclient + # vocos nvidia-cublas-cu12==12.4.5.8 # via # nvidia-cudnn-cu12 @@ -455,6 +469,7 @@ pyyaml==6.0.2 # responses # timm # transformers + # vocos ray[adag]==2.40.0 # via -r requirements-test.in redis==5.2.0 @@ -517,6 +532,7 @@ scipy==1.13.1 # scikit-learn # sentence-transformers # statsmodels + # vocos sentence-transformers==3.2.1 # via -r requirements-test.in sentencepiece==0.2.0 @@ -540,7 +556,9 @@ sqlitedict==2.1.0 statsmodels==0.14.4 # via genai-perf sympy==1.13.1 - # via torch + # via + # einx + # torch tabledata==1.3.3 # via pytablewriter tabulate==0.9.0 @@ -568,12 +586,21 @@ torch==2.5.1 # -r requirements-test.in # accelerate # bitsandbytes + # encodec # lm-eval # peft # sentence-transformers # tensorizer # timm + # torchaudio # torchvision + # vector-quantize-pytorch + # vocos +torchaudio==2.5.1 + # via + # -r requirements-test.in + # encodec + # vocos torchvision==0.20.1 # via timm tqdm==4.66.6 @@ -584,6 +611,7 @@ tqdm==4.66.6 # lm-eval # nltk # peft + # pqdm # sentence-transformers # tqdm-multiprocess # transformers @@ -615,6 +643,7 @@ typing-extensions==4.12.2 # huggingface-hub # librosa # mistral-common + # pqdm # pydantic # pydantic-core # torch @@ -626,6 +655,10 @@ urllib3==2.2.3 # requests # responses # tritonclient +vector-quantize-pytorch==1.21.2 + # via -r requirements-test.in +vocos==0.1.0 + # via -r requirements-test.in word2number==1.1 # via lm-eval xxhash==3.5.0 diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 8ab18b3770ae8..1abde714af7c9 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -13,13 +13,11 @@ ray[default] # Install torch_xla --pre --extra-index-url https://download.pytorch.org/whl/nightly/cpu +--find-links https://storage.googleapis.com/libtpu-wheels/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html -torch==2.6.0.dev20241126+cpu -torchvision==0.20.0.dev20241126+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" -jaxlib==0.4.36.dev20241122 -jax==0.4.36.dev20241122 +torch==2.6.0.dev20241216+cpu +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu, 
pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index 978625a069778..59ece870b5585 --- a/setup.py +++ b/setup.py @@ -228,8 +228,11 @@ def target_name(s: str) -> str: # CMake appends the extension prefix to the install path, # and outdir already contains that prefix, so we need to remove it. + # We assume only the final component of extension prefix is added by + # CMake, this is currently true for current extensions but may not + # always be the case. prefix = outdir - for i in range(ext.name.count('.')): + if '.' in ext.name: prefix = prefix.parent # prefix here should actually be the same for all components @@ -298,9 +301,11 @@ def run(self) -> None: files_to_copy = [ "vllm/_C.abi3.so", "vllm/_moe_C.abi3.so", - "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/vllm_flash_attn/flash_attn_interface.py", "vllm/vllm_flash_attn/__init__.py", + "vllm/cumem_allocator.abi3.so", # "vllm/_version.py", # not available in nightly wheels yet ] file_members = filter(lambda x: x.filename in files_to_copy, @@ -412,7 +417,7 @@ def get_rocm_version(): if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), ctypes.byref(patch)) == 0): - return "%d.%d.%d" % (major.value, minor.value, patch.value) + return f"{major.value}.{minor.value}.{patch.value}" return None except Exception: return None @@ -549,7 +554,7 @@ def _read_requirements(filename: str) -> List[str]: return resolved_requirements if _no_device(): - requirements = _read_requirements("requirements-cuda.txt") + requirements = _read_requirements("requirements-cpu.txt") elif _is_cuda(): requirements = _read_requirements("requirements-cuda.txt") cuda_major, cuda_minor = torch.version.cuda.split(".") @@ -592,8 +597,12 @@ def _read_requirements(filename: str) -> List[str]: ext_modules.append(CMakeExtension(name="vllm._rocm_C")) if _is_cuda(): - ext_modules.append( - CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c")) + ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) + if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"): + # FA3 requires CUDA 12.0 or later + ext_modules.append( + CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C")) + ext_modules.append(CMakeExtension(name="vllm.cumem_allocator")) if _build_custom_ops(): ext_modules.append(CMakeExtension(name="vllm._C")) diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py index 83c71b5cf6eb7..91ac35dd67bbf 100644 --- a/tests/async_engine/test_api_server.py +++ b/tests/async_engine/test_api_server.py @@ -25,27 +25,32 @@ def _query_server_long(prompt: str) -> dict: @pytest.fixture -def api_server(tokenizer_pool_size: int, worker_use_ray: bool): +def api_server(tokenizer_pool_size: int, distributed_executor_backend: str): script_path = Path(__file__).parent.joinpath( "api_server_async_engine.py").absolute() commands = [ - sys.executable, "-u", - str(script_path), "--model", "facebook/opt-125m", "--host", - "127.0.0.1", "--tokenizer-pool-size", - str(tokenizer_pool_size) + sys.executable, + "-u", + str(script_path), + "--model", + "facebook/opt-125m", + "--host", + "127.0.0.1", + "--tokenizer-pool-size", + str(tokenizer_pool_size), + "--distributed-executor-backend", + distributed_executor_backend, ] - if worker_use_ray: - 
commands.append("--worker-use-ray") uvicorn_process = subprocess.Popen(commands) yield uvicorn_process.terminate() @pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) -@pytest.mark.parametrize("worker_use_ray", [False, True]) +@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"]) def test_api_server(api_server, tokenizer_pool_size: int, - worker_use_ray: bool): + distributed_executor_backend: str): """ Run the API server and test it. diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 31a101e48e026..23285040642a8 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -61,9 +61,10 @@ def test_models( if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") - if backend == "XFORMERS" and model == "google/gemma-2-2b-it": + if backend in ("XFORMERS", + "FLASHINFER") and model == "google/gemma-2-2b-it": pytest.skip( - "XFORMERS does not support gemma2 with full context length.") + f"{backend} does not support gemma2 with full context length.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py new file mode 100644 index 0000000000000..53f4ef08f36a2 --- /dev/null +++ b/tests/basic_correctness/test_cumem.py @@ -0,0 +1,112 @@ +import torch + +from vllm import LLM, SamplingParams +from vllm.device_allocator.cumem import CuMemAllocator +from vllm.utils import GiB_bytes + +from ..utils import fork_new_process_for_each_test + + +@fork_new_process_for_each_test +def test_basic_cumem(): + # some tensors from default memory pool + shape = (1024, 1024) + x = torch.empty(shape, device='cuda') + x.zero_() + + # some tensors from custom memory pool + allocator = CuMemAllocator.get_instance() + with allocator.use_memory_pool(): + # custom memory pool + y = torch.empty(shape, device='cuda') + y.zero_() + y += 1 + z = torch.empty(shape, device='cuda') + z.zero_() + z += 2 + + # they can be used together + output = x + y + z + assert torch.allclose(output, torch.ones_like(output) * 3) + + free_bytes = torch.cuda.mem_get_info()[0] + allocator.sleep() + free_bytes_after_sleep = torch.cuda.mem_get_info()[0] + assert free_bytes_after_sleep > free_bytes + allocator.wake_up() + + # they can be used together + output = x + y + z + assert torch.allclose(output, torch.ones_like(output) * 3) + + +@fork_new_process_for_each_test +def test_cumem_with_cudagraph(): + allocator = CuMemAllocator.get_instance() + with allocator.use_memory_pool(): + weight = torch.eye(1024, device='cuda') + with allocator.use_memory_pool(tag="discard"): + cache = torch.empty(1024, 1024, device='cuda') + + def model(x): + out = x @ weight + cache[:out.size(0)].copy_(out) + return out + 1 + + x = torch.empty(128, 1024, device='cuda') + + # warmup + model(x) + + # capture cudagraph + model_graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(model_graph): + y = model(x) + + free_bytes = torch.cuda.mem_get_info()[0] + allocator.sleep() + free_bytes_after_sleep = torch.cuda.mem_get_info()[0] + assert free_bytes_after_sleep > free_bytes + allocator.wake_up() + + # after waking up, the content in the weight tensor + # should be restored, but the content in the cache tensor + # should be discarded + + # this operation is also compatible with cudagraph + + x.random_() + model_graph.replay() + + # cache content is as expected + assert torch.allclose(x, 
cache[:x.size(0)]) + + # output content is as expected + assert torch.allclose(y, x + 1) + + +@fork_new_process_for_each_test +def test_end_to_end(): + free, total = torch.cuda.mem_get_info() + used_bytes_baseline = total - free # in case other process is running + llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True) + prompt = "How are you?" + sampling_params = SamplingParams(temperature=0, max_tokens=10) + output = llm.generate(prompt, sampling_params) + + # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage, + # which is difficult to measure in the test. therefore, we only + # test sleep level 1 here. + llm.sleep(level=1) + + free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info() + used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline + # now the memory usage is mostly cudagraph memory pool, + # and it should be less than the model weights (1B model, 2GiB weights) + assert used_bytes < 2 * GiB_bytes + + llm.wake_up() + output2 = llm.generate(prompt, sampling_params) + + # cmp output + assert output[0].outputs[0].text == output2[0].outputs[0].text diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 4e502cfb5f4f8..4b27dcbc8609f 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -29,10 +29,10 @@ def check_settings(): @pytest.fixture -def worker_use_ray() -> bool: - # When SPMD worker is used, use ray_use_worker=True +def distributed_executor_backend() -> str: + # When SPMD worker is used, use distributed_executor_backend="ray" # to test delta input optimization works with preemption. - return envs.VLLM_USE_RAY_SPMD_WORKER + return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp" @pytest.mark.parametrize("model", MODELS) @@ -47,7 +47,7 @@ def test_chunked_prefill_recompute( dtype: str, max_tokens: int, chunked_prefill_token_size: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Ensure that chunked prefill works with preemption.""" max_num_seqs = min(chunked_prefill_token_size, 256) @@ -66,7 +66,7 @@ def test_chunked_prefill_recompute( max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, max_num_seqs=max_num_seqs, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) @@ -93,7 +93,7 @@ def test_preemption( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """By default, recompute preemption is enabled""" @@ -104,7 +104,7 @@ def test_preemption( model, dtype=dtype, disable_log_stats=False, - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt @@ -144,7 +144,7 @@ def test_preemption_infeasible( model: str, dtype: str, max_tokens: int, - worker_use_ray: bool, + distributed_executor_backend: str, ) -> None: """Verify infeasible preemption request will be ignored.""" BLOCK_SIZE = 16 @@ -159,7 +159,7 @@ def test_preemption_infeasible( # ignored instead of hanging forever. 
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), - worker_use_ray=worker_use_ray, + distributed_executor_backend=distributed_executor_backend, ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 87d5aefea6cb4..1945479fc3031 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -58,7 +58,7 @@ class TestSetting: model_args=["--task", "embed"], pp_size=1, tp_size=1, - attn_backend="FLASHINFER", + attn_backend="FLASH_ATTN", method="encode", fullgraph=True, ), diff --git a/tests/conftest.py b/tests/conftest.py index 55cae78e4d721..cd1f8f1a3b191 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -245,6 +245,7 @@ def video_assets() -> _VideoAssets: _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) +_R = TypeVar("_R") class HfRunner: @@ -1005,6 +1006,10 @@ def score( req_outputs = self.model.score(text_1, text_2) return [req_output.outputs.score for req_output in req_outputs] + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + executor = self.model.llm_engine.model_executor + return executor.apply_model(func) + def __enter__(self): return self diff --git a/tests/core/block/test_prefix_caching_block.py b/tests/core/block/test_prefix_caching_block.py index 29ac3a3c86cb4..6642174c17d8b 100644 --- a/tests/core/block/test_prefix_caching_block.py +++ b/tests/core/block/test_prefix_caching_block.py @@ -796,6 +796,44 @@ def test_find_cached_blocks_prefix(): block_hashes=block_hashes_seq1) assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks + # Test reset prefix cache + @staticmethod + @pytest.mark.parametrize("num_blocks", [10]) + @pytest.mark.parametrize("block_size", [16]) + def test_reset_prefix_cache(num_blocks: int, block_size: int): + """This test case simulates the case of resetting the prefix cache.""" + + allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks, + block_size=block_size) + token_ids = list(range(3 * block_size)) + + first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=token_ids, + allocator=allocator, + ) + second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain( + block_size=block_size, + token_ids=token_ids, + allocator=allocator, + ) + + # Free each block in the first chain. + for block in first_chain: + allocator.free(block) + + # Failed to reset prefix cache because some blocks are not freed yet. + assert not allocator.reset_prefix_cache() + assert allocator.get_prefix_cache_hit_rate() > 0.0 + + # Free each block in the second chain. + for block in second_chain: + allocator.free(block) + + # Reset prefix cache. 
+ assert allocator.reset_prefix_cache() + assert allocator.get_prefix_cache_hit_rate() == 0.0 + @staticmethod def create_immutable_chain( block_size: int, diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index fdfcd4f4c9d50..0e33f3662da82 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -51,7 +51,9 @@ def test_custom_executor(model, tmp_path): assert not os.path.exists(".marker") engine_args = EngineArgs( - model=model, distributed_executor_backend=CustomUniExecutor) + model=model, + distributed_executor_backend=CustomUniExecutor, + ) engine = LLMEngine.from_engine_args(engine_args) sampling_params = SamplingParams(max_tokens=1) diff --git a/tests/entrypoints/openai/reasoning_parsers/__init__.py b/tests/entrypoints/openai/reasoning_parsers/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py new file mode 100644 index 0000000000000..4607e4dfe4d0b --- /dev/null +++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py @@ -0,0 +1,120 @@ +from typing import List + +import pytest +from transformers import AutoTokenizer + +from tests.entrypoints.openai.reasoning_parsers.utils import ( + run_reasoning_extraction) +from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser, + ReasoningParserManager) + +parser_name = "deepseek_r1" +start_token = "<think>" +end_token = "</think>" + +SIMPLE_REASONING = { + "output": "<think>This is a reasoning section</think>This is the rest", + "reasoning_content": "This is a reasoning section", + "content": "This is the rest", +} +COMPLETE_REASONING = { + "output": "<think>This is a reasoning section</think>", + "reasoning_content": "This is a reasoning section", + "content": None, +} +NO_REASONING = { + "output": "This is a reasoning section", + "reasoning_content": None, + "content": "This is a reasoning section", +} +MULTIPLE_LINES = { + "output": "<think>This\nThat</think>This is the rest\nThat", + "reasoning_content": "This\nThat", + "content": "This is the rest\nThat", +} +SHORTEST_REASONING_NO_STREAMING = { + "output": "<think></think>This is the rest", + "reasoning_content": "", + "content": "This is the rest", +} +SHORTEST_REASONING = { + "output": "<think></think>This is the rest", + "reasoning_content": None, + "content": "This is the rest", +} + +TEST_CASES = [ + pytest.param( + False, + SIMPLE_REASONING, + id="simple_streaming", + ), + pytest.param( + True, + SIMPLE_REASONING, + id="simple_streaming", + ), + pytest.param( + False, + COMPLETE_REASONING, + id="complete_streaming", + ), + pytest.param( + True, + COMPLETE_REASONING, + id="complete_streaming", + ), + pytest.param( + False, + NO_REASONING, + id="no_streaming", + ), + pytest.param( + True, + NO_REASONING, + id="no_streaming", + ), + pytest.param( + False, + MULTIPLE_LINES, + id="multiple_lines_streaming", + ), + pytest.param( + True, + MULTIPLE_LINES, + id="multiple_lines_streaming", + ), + pytest.param( + True, + SHORTEST_REASONING, + id="shortest_streaming", + ), + pytest.param( + False, + SHORTEST_REASONING_NO_STREAMING, + id="shortest_streaming", + ), +] + + +@pytest.mark.parametrize("streaming, param_dict", TEST_CASES) +def test_reasoning( + streaming: bool, + param_dict: dict, +): + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") + tokenizer.add_tokens([start_token, end_token]) + output = tokenizer.tokenize(param_dict["output"]) + # decode everything 
to tokens + output_tokens: List[str] = [ + tokenizer.convert_tokens_to_string([token]) for token in output + ] + parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser( + parser_name)(tokenizer) + + reasoning, content = run_reasoning_extraction(parser, + output_tokens, + streaming=streaming) + + assert reasoning == param_dict["reasoning_content"] + assert content == param_dict["content"] diff --git a/tests/entrypoints/openai/reasoning_parsers/utils.py b/tests/entrypoints/openai/reasoning_parsers/utils.py new file mode 100644 index 0000000000000..ac73ad50a7395 --- /dev/null +++ b/tests/entrypoints/openai/reasoning_parsers/utils.py @@ -0,0 +1,93 @@ +from typing import List, Optional, Tuple, Union + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) +from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser + + +class StreamingReasoningReconstructor: + + def __init__(self): + self.reasoning_content = None + self.other_content = None + + def append_delta(self, delta: DeltaMessage): + # content and the reasoning content should not be present + # at the same time + assert delta.content is None or delta.reasoning_content is None, ( + "Both content and reasoning content are present in the " + "delta message") + if delta.content is not None: + if self.other_content is None: + self.other_content = delta.content + else: + self.other_content += delta.content + else: + if self.reasoning_content is None: + self.reasoning_content = delta.reasoning_content + else: + self.reasoning_content += delta.reasoning_content + + +def run_reasoning_extraction( + reasoning_parser: ReasoningParser, + model_output: List[str], + request: Union[ChatCompletionRequest, None] = None, + streaming: bool = False, +) -> Tuple[Optional[str], Optional[str]]: + if streaming: + reconstructor = run_reasoning_extraction_streaming( + reasoning_parser, + model_output, + request, + ) + return ( + reconstructor.reasoning_content, + reconstructor.other_content or None, + ) + else: + reasoning, content = run_reasoning_extraction_nonstreaming( + reasoning_parser, model_output, request) + return reasoning, content + + +def run_reasoning_extraction_nonstreaming( + reasoning_parser: ReasoningParser, + model_output: List[str], + request: Union[ChatCompletionRequest, None] = None, +) -> Tuple[Optional[str], Optional[str]]: + request = request or ChatCompletionRequest(messages=[], model="test-model") + return reasoning_parser.extract_reasoning_content( + model_output=''.join(model_output), request=request) + + +def run_reasoning_extraction_streaming( + reasoning_parser: ReasoningParser, + model_deltas: List[str], + request: Union[ChatCompletionRequest, None] = None, +) -> StreamingReasoningReconstructor: + request = request or ChatCompletionRequest(messages=[], model="test-model") + reconstructor = StreamingReasoningReconstructor() + previous_text = "" + previous_tokens: List[int] = [] + for delta in model_deltas: + token_delta = [ + reasoning_parser.vocab.get(token) + for token in reasoning_parser.model_tokenizer.tokenize(delta) + if token in reasoning_parser.vocab + ] + current_text = previous_text + delta + current_tokens = previous_tokens + token_delta + delta_message = reasoning_parser.extract_reasoning_content_streaming( + previous_text, + current_text, + delta, + previous_tokens, + current_tokens, + token_delta, + ) + if delta_message is not None: + reconstructor.append_delta(delta_message) + previous_text = current_text + previous_tokens = current_tokens + return reconstructor 
diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index e49562ad6a21f..01bcd78aa91a8 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -116,6 +116,35 @@ def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser): validate_parsed_serve_args(args) +def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser): + """Ensure validation fails if reasoning is enabled with auto tool choice""" + args = serve_parser.parse_args(args=[ + "--enable-auto-tool-choice", + "--enable-reasoning", + ]) + with pytest.raises(TypeError): + validate_parsed_serve_args(args) + + +def test_enable_reasoning_passes_with_reasoning_parser(serve_parser): + """Ensure validation passes if reasoning is enabled + with a reasoning parser""" + args = serve_parser.parse_args(args=[ + "--enable-reasoning", + "--reasoning-parser", + "deepseek_r1", + ]) + validate_parsed_serve_args(args) + + +def test_enable_reasoning_fails_without_reasoning_parser(serve_parser): + """Ensure validation fails if reasoning is enabled + without a reasoning parser""" + args = serve_parser.parse_args(args=["--enable-reasoning"]) + with pytest.raises(TypeError): + validate_parsed_serve_args(args) + + def test_chat_template_validation_for_happy_paths(serve_parser): """Ensure validation passes if the chat template exists""" args = serve_parser.parse_args( diff --git a/tests/entrypoints/openai/test_metrics.py b/tests/entrypoints/openai/test_metrics.py index 6523c8b6297c6..901ba8e8e5ef3 100644 --- a/tests/entrypoints/openai/test_metrics.py +++ b/tests/entrypoints/openai/test_metrics.py @@ -16,6 +16,24 @@ MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +@pytest.fixture(scope="module", params=[True, False]) +def use_v1(request): + # Module-scoped variant of run_with_both_engines + # + # Use this fixture to run a test with both v0 and v1, and + # also to conditionalize the test logic e.g. + # + # def test_metrics_exist(use_v1, server, client): + # ... + # expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS + # for metric in expected: + # assert metric in response.text + # + # @skip_v1 wouldn't work here because this is a module-level + # fixture - per-function decorators would have no effect + yield request.param + + @pytest.fixture(scope="module") def default_server_args(): return [ @@ -36,10 +54,12 @@ def default_server_args(): "--enable-chunked-prefill", "--disable-frontend-multiprocessing", ]) -def server(default_server_args, request): +def server(use_v1, default_server_args, request): if request.param: default_server_args.append(request.param) - with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: + env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0') + with RemoteOpenAIServer(MODEL_NAME, default_server_args, + env_dict=env_dict) as remote_server: yield remote_server @@ -84,7 +104,7 @@ async def client(server): @pytest.mark.asyncio async def test_metrics_counts(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): for _ in range(_NUM_REQUESTS): # sending a request triggers the metrics to be logged. 
await client.completions.create( @@ -98,6 +118,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer, # Loop over all expected metric_families for metric_family, suffix_values_list in EXPECTED_VALUES.items(): + if use_v1 and metric_family not in EXPECTED_METRICS_V1: + continue + found_metric = False # Check to see if the metric_family is found in the prom endpoint. @@ -174,10 +197,29 @@ async def test_metrics_counts(server: RemoteOpenAIServer, "swap_space_bytes", ] +EXPECTED_METRICS_V1 = [ + "vllm:num_requests_running", + "vllm:num_requests_waiting", + "vllm:prompt_tokens_total", + "vllm:generation_tokens_total", + "vllm:request_prompt_tokens_sum", + "vllm:request_prompt_tokens_bucket", + "vllm:request_prompt_tokens_count", + "vllm:request_generation_tokens_sum", + "vllm:request_generation_tokens_bucket", + "vllm:request_generation_tokens_count", + "vllm:time_to_first_token_seconds_sum", + "vllm:time_to_first_token_seconds_bucket", + "vllm:time_to_first_token_seconds_count", + "vllm:time_per_output_token_seconds_sum", + "vllm:time_per_output_token_seconds_bucket", + "vllm:time_per_output_token_seconds_count", +] + @pytest.mark.asyncio async def test_metrics_exist(server: RemoteOpenAIServer, - client: openai.AsyncClient): + client: openai.AsyncClient, use_v1: bool): # sending a request triggers the metrics to be logged. await client.completions.create(model=MODEL_NAME, prompt="Hello, my name is", @@ -187,11 +229,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer, response = requests.get(server.url_for("metrics")) assert response.status_code == HTTPStatus.OK - for metric in EXPECTED_METRICS: + for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): assert metric in response.text -def test_metrics_exist_run_batch(): +def test_metrics_exist_run_batch(use_v1: bool): + if use_v1: + pytest.skip("Skipping test on vllm V1") input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501 base_url = "0.0.0.0" diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py new file mode 100644 index 0000000000000..cfd8f33133960 --- /dev/null +++ b/tests/entrypoints/openai/test_rerank.py @@ -0,0 +1,87 @@ +import pytest +import requests + +from vllm.entrypoints.openai.protocol import RerankResponse + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "BAAI/bge-reranker-base" + + +@pytest.fixture(scope="module") +def server(): + args = ["--enforce-eager", "--max-model-len", "100"] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank_texts(server: RemoteOpenAIServer, model_name: str): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." 
+ ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + }) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_top_n(server: RemoteOpenAIServer, model_name: str): + query = "What is the capital of France?" + documents = [ + "The capital of Brazil is Brasilia.", + "The capital of France is Paris.", "Cross-encoder models are neat" + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents, + "top_n": 2 + }) + rerank_response.raise_for_status() + rerank = RerankResponse.model_validate(rerank_response.json()) + + assert rerank.id is not None + assert rerank.results is not None + assert len(rerank.results) == 2 + assert rerank.results[0].relevance_score >= 0.9 + assert rerank.results[1].relevance_score <= 0.01 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str): + + query = "What is the capital of France?" * 100 + documents = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + rerank_response = requests.post(server.url_for("rerank"), + json={ + "model": model_name, + "query": query, + "documents": documents + }) + assert rerank_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input." in \ + rerank_response.text \ No newline at end of file diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 097d6b1a32349..1f8a56bb43ac6 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -1,3 +1,4 @@ +import json import subprocess import sys import tempfile @@ -21,6 +22,9 @@ {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}} {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}""" +INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} +{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}""" + def test_empty_file(): with tempfile.NamedTemporaryFile( @@ -102,3 +106,36 @@ def test_embeddings(): # Ensure that the output format conforms to the openai api. # Validation should throw if the schema is wrong. 
BatchRequestOutput.model_validate_json(line) + + +def test_score(): + with tempfile.NamedTemporaryFile( + "w") as input_file, tempfile.NamedTemporaryFile( + "r") as output_file: + input_file.write(INPUT_SCORE_BATCH) + input_file.flush() + proc = subprocess.Popen([ + sys.executable, + "-m", + "vllm.entrypoints.openai.run_batch", + "-i", + input_file.name, + "-o", + output_file.name, + "--model", + "BAAI/bge-reranker-v2-m3", + ], ) + proc.communicate() + proc.wait() + assert proc.returncode == 0, f"{proc=}" + + contents = output_file.read() + for line in contents.strip().split("\n"): + # Ensure that the output format conforms to the openai api. + # Validation should throw if the schema is wrong. + BatchRequestOutput.model_validate_json(line) + + # Ensure that there is no error in the response. + line_dict = json.loads(line) + assert isinstance(line_dict, dict) + assert line_dict["error"] is None diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index a803ea4a8d6ad..0d19615bc0d99 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -10,9 +10,7 @@ @pytest.fixture(scope="module") def server(): - args = [ - "--enforce-eager", - ] + args = ["--enforce-eager", "--max-model-len", "100"] with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: yield remote_server @@ -20,8 +18,7 @@ def server(): @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = [ "The capital of Brazil is Brasilia.", "The capital of France is Paris." @@ -45,8 +42,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str): text_1 = [ "What is the capital of the United States?", "What is the capital of France?" @@ -73,8 +69,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, - model_name: str): +def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str): text_1 = "What is the capital of France?" text_2 = "The capital of France is Paris." @@ -91,3 +86,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, assert score.data is not None assert len(score.data) == 1 assert score.data[0].score >= 0.9 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str): + + text_1 = "What is the capital of France?" * 20 + text_2 = [ + "The capital of Brazil is Brasilia.", "The capital of France is Paris." + ] + + score_response = requests.post(server.url_for("score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + }) + assert score_response.status_code == 400 + # Assert just a small fragments of the response + assert "Please reduce the length of the input." 
in \ + score_response.text + + # Test truncation + score_response = requests.post(server.url_for("score"), + json={ + "model": model_name, + "text_1": text_1, + "text_2": text_2, + "truncate_prompt_tokens": 101 + }) + assert score_response.status_code == 400 + assert "Please, select a smaller truncation size." in \ + score_response.text diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 85f485364a411..e88d6c3c67829 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -103,6 +103,116 @@ def test_serving_chat_should_set_correct_max_tokens(): assert mock_engine.generate.call_args.args[1].max_tokens == 10 + # Setting server's max_tokens in the generation_config.json + # lower than context_window - prompt_tokens + mock_model_config = MockModelConfig() + mock_model_config.diff_sampling_param = { + "max_tokens": 10 # Setting server-side max_tokens limit + } + + # Reinitialize the engine with new settings + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + # Initialize the serving chat + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + serving_chat = OpenAIServingChat(mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + + # Test Case 1: No max_tokens specified in request + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?" + }], + guided_decoding_backend="outlines", + ) + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 10 + + # Test Case 2: Request's max_tokens set higher than server accepts + req.max_tokens = 15 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 10 + + # Test Case 3: Request's max_tokens set lower than server accepts + req.max_tokens = 5 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 5 + + # Setting server's max_tokens in the generation_config.json + # higher than context_window - prompt_tokens + mock_model_config = MockModelConfig() + mock_model_config.diff_sampling_param = { + "max_tokens": 200 # Setting server-side max_tokens limit + } + + # Reinitialize the engine with new settings + mock_engine = MagicMock(spec=MQLLMEngineClient) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + + # Initialize the serving chat + models = OpenAIServingModels(engine_client=mock_engine, + base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) + serving_chat = OpenAIServingChat(mock_engine, + mock_model_config, + models, + response_role="assistant", + chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", + request_logger=None) + + # Test case 1: No max_tokens specified, defaults to context_window + req = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{ + "role": "user", + "content": "what is 1+1?" 
+ }], + guided_decoding_backend="outlines", + ) + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 93 + + # Test Case 2: Request's max_tokens set higher than server accepts + req.max_tokens = 100 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 93 + + # Test Case 3: Request's max_tokens set lower than server accepts + req.max_tokens = 5 + + with suppress(Exception): + asyncio.run(serving_chat.create_chat_completion(req)) + + assert mock_engine.generate.call_args.args[1].max_tokens == 5 + def test_serving_chat_could_load_correct_generation_config(): diff --git a/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json deleted file mode 100644 index a548f0a9611f6..0000000000000 --- a/tests/fp8_kv/llama2-70b-fp8-kv/kv_cache_scales.json +++ /dev/null @@ -1,90 +0,0 @@ -{ - "model_type": "llama", - "kv_cache": { - "dtype": "float8_e4m3fn", - "scaling_factor": { - "0": { - "0": 0.0230364128947258, - "1": 0.01979283057153225, - "2": 0.0241350457072258, - "3": 0.0308314748108387, - "4": 0.0430733822286129, - "5": 0.0370396226644516, - "6": 0.0306222103536129, - "7": 0.0357491634786129, - "8": 0.0358189195394516, - "9": 0.0443289652466774, - "10": 0.0433175228536129, - "11": 0.0416782945394516, - "12": 0.0366908498108387, - "13": 0.0432477705180645, - "14": 0.0410505048930645, - "15": 0.0457589291036129, - "16": 0.0418526791036129, - "17": 0.0432477705180645, - "18": 0.0469447560608387, - "19": 0.0514787957072258, - "20": 0.0541294664144516, - "21": 0.0587681382894516, - "22": 0.0625, - "23": 0.0585588738322258, - "24": 0.0600237175822258, - "25": 0.0588030144572258, - "26": 0.0531180277466774, - "27": 0.06396484375, - "28": 0.0603027381002903, - "29": 0.0582101047039032, - "30": 0.0625348836183548, - "31": 0.0585588738322258, - "32": 0.0582798570394516, - "33": 0.0575125589966774, - "34": 0.0590820349752903, - "35": 0.0614188089966774, - "36": 0.0631975457072258, - "37": 0.0615931935608387, - "38": 0.0601283498108387, - "39": 0.0571986623108387, - "40": 0.0670340433716774, - "41": 0.0523507259786129, - "42": 0.0547223798930645, - "43": 0.0631975457072258, - "44": 0.0663713738322258, - "45": 0.0603376142680645, - "46": 0.0652204304933548, - "47": 0.0734514519572258, - "48": 0.0693708211183548, - "49": 0.0725446492433548, - "50": 0.0627790242433548, - "51": 0.0691266804933548, - "52": 0.0688825398683548, - "53": 0.068429134786129, - "54": 0.0605119988322258, - "55": 0.0799386203289032, - "56": 0.0853097140789032, - "57": 0.0661969929933548, - "58": 0.0689871683716774, - "59": 0.0724051371216774, - "60": 0.0541643425822258, - "61": 0.0626743882894516, - "62": 0.0628487765789032, - "63": 0.0607212632894516, - "64": 0.0589076466858387, - "65": 0.0451660193502903, - "66": 0.0453055277466774, - "67": 0.0414341539144516, - "68": 0.0385044664144516, - "69": 0.0414341539144516, - "70": 0.0466308631002903, - "71": 0.0399693101644516, - "72": 0.0437011756002903, - "73": 0.0434221550822258, - "74": 0.0428989976644516, - "75": 0.0401785746216774, - "76": 0.0431082621216774, - "77": 0.0484444759786129, - "78": 0.0417829267680645, - "79": 0.0418178029358387 - } - } - } -} \ No newline at end of file diff --git a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json b/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json deleted file mode 100644 index 
bb734039e982b..0000000000000 --- a/tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "model_type": "llama", - "kv_cache": { - "dtype": "float8_e4m3fn", - "scaling_factor": { - "0": { - "0": 0.0152239128947258, - "1": 0.0188860222697258, - "2": 0.0354178324341774, - "3": 0.0376674123108387, - "4": 0.0418526791036129, - "5": 0.0433175228536129, - "6": 0.0397600457072258, - "7": 0.0424455925822258, - "8": 0.0415387861430645, - "9": 0.0408412404358387, - "10": 0.0395856611430645, - "11": 0.0377371683716774, - "12": 0.0400739423930645, - "13": 0.040771484375, - "14": 0.0393415205180645, - "15": 0.0369001142680645, - "16": 0.03857421875, - "17": 0.0387486070394516, - "18": 0.0403180830180645, - "19": 0.0396205373108387, - "20": 0.0375627800822258, - "21": 0.0407366082072258, - "22": 0.0432477705180645, - "23": 0.0377022884786129, - "24": 0.0399693101644516, - "25": 0.0374581478536129, - "26": 0.0413295216858387, - "27": 0.0442243330180645, - "28": 0.0424804724752903, - "29": 0.0456891767680645, - "30": 0.0409109964966774, - "31": 0.0482352152466774 - } - } - } -} diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 124d5d297a574..574a0f223ef0d 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -182,7 +182,7 @@ def test_paged_attention( key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Call the paged attention kernel. output = torch.empty_like(query) diff --git a/tests/kernels/test_block_fp8.py b/tests/kernels/test_block_fp8.py index a16cc4582a180..f28fdf3feedbc 100644 --- a/tests/kernels/test_block_fp8.py +++ b/tests/kernels/test_block_fp8.py @@ -92,8 +92,10 @@ def native_w8a8_block_fp8_matmul(A, A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles) ] B_tiles = [[ - B[j * block_n:min((j + 1) * block_n, N), - i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles) + B[ + j * block_n:min((j + 1) * block_n, N), + i * block_k:min((i + 1) * block_k, K), + ] for i in range(k_tiles) ] for j in range(n_tiles)] C_tiles = [ C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles) @@ -157,9 +159,9 @@ def setup_cuda(): torch.set_default_device("cuda") -@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed", - itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, - SEEDS)) +@pytest.mark.parametrize( + "num_tokens,d,dtype,group_size,seed", + itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS)) @torch.inference_mode() def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): torch.manual_seed(seed) @@ -174,9 +176,9 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): assert torch.allclose(scale, ref_scale) -@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed", - itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, - SEEDS)) +@pytest.mark.parametrize( + "M,N,K,block_size,out_dtype,seed", + itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) @@ -207,9 +209,10 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): assert rel_diff < 0.001 -@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed", - itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, - BLOCK_SIZE, DTYPES, SEEDS)) +@pytest.mark.parametrize( + 
"M,N,K,E,topk,block_size,dtype,seed", + itertools.product(M_moe, N_moe, K_moe, E, TOP_KS, BLOCK_SIZE, DTYPES, + SEEDS)) @torch.inference_mode() def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): torch.manual_seed(seed) diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index fad342d1b5923..08f31219e3574 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -210,7 +210,7 @@ def test_paged_attention( key_cache, value_cache = key_caches[0], value_caches[0] # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) tp_rank = 0 # Call the paged attention kernel. diff --git a/tests/kernels/test_cache.py b/tests/kernels/test_cache.py index 40550ed51e2c7..c848be4f9d807 100644 --- a/tests/kernels/test_cache.py +++ b/tests/kernels/test_cache.py @@ -160,7 +160,7 @@ def test_reshape_and_cache( cloned_value_cache = value_cache.clone() # Using default kv_scale - k_scale = v_scale = 1.0 + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Call the reshape_and_cache kernel. opcheck(torch.ops._C_cache_ops.reshape_and_cache, @@ -258,8 +258,8 @@ def test_reshape_and_cache_flash( del key_caches del value_caches - k_scale = key.amax().item() / 256 - v_scale = value.amax().item() / 256 + k_scale = (key.amax() / 256.0).to(torch.float32) + v_scale = (value.amax() / 256.0).to(torch.float32) # Clone the KV caches. if kv_cache_dtype == "fp8": @@ -284,12 +284,12 @@ def test_reshape_and_cache_flash( result_key_cache = torch.empty_like(key_cache, dtype=torch.float16) ops.convert_fp8(result_key_cache, key_cache, - k_scale, + k_scale.item(), kv_dtype=kv_cache_dtype) result_value_cache = torch.empty_like(value_cache, dtype=torch.float16) ops.convert_fp8(result_value_cache, value_cache, - v_scale, + v_scale.item(), kv_dtype=kv_cache_dtype) # Run the reference implementation. 
diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py old mode 100644 new mode 100755 index 45ec6df4e711e..8edfde42ede74 --- a/tests/kernels/test_cascade_flash_attn.py +++ b/tests/kernels/test_cascade_flash_attn.py @@ -6,7 +6,9 @@ from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import (cascade_attention, merge_attn_states) -from vllm.vllm_flash_attn import flash_attn_varlen_func +from vllm.vllm_flash_attn import (fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported) NUM_HEADS = [(4, 4), (8, 2), (16, 2)] HEAD_SIZES = [128, 192, 256] @@ -78,6 +80,7 @@ def test_merge_kernel( @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("soft_cap", [None, 50]) @pytest.mark.parametrize("num_blocks", [2048]) +@pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_cascade( seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int], @@ -87,8 +90,13 @@ def test_cascade( block_size: int, soft_cap: Optional[float], num_blocks: int, + fa_version: int, ) -> None: torch.set_default_device("cuda") + if not is_fa_version_supported(fa_version): + pytest.skip(f"Flash attention version {fa_version} not supported due " + f"to: \"{fa_version_unsupported_reason(fa_version)}\"") + current_platform.seed_everything(0) window_size = (-1, -1) @@ -118,9 +126,7 @@ def test_cascade( cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) - cu_kv_lens = torch.tensor([0] + kv_lens, - dtype=torch.int32).cumsum(dim=0, - dtype=torch.int32) + kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size block_tables = torch.randint(0, num_blocks, @@ -140,7 +146,7 @@ def test_cascade( k=key_cache, v=value_cache, cu_seqlens_q=cu_query_lens, - cu_seqlens_k=cu_kv_lens, + seqused_k=kv_lens_tensor, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len, softmax_scale=scale, @@ -154,10 +160,8 @@ def test_cascade( assert all(common_prefix_len < kv_len for kv_len in kv_lens) cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], dtype=torch.int32) - cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], dtype=torch.int32) - cu_suffix_kv_lens = ( - cu_kv_lens - - torch.arange(num_seqs + 1, dtype=torch.int32) * common_prefix_len) + prefix_kv_lens = torch.tensor([common_prefix_len], dtype=torch.int32) + suffix_kv_lens = kv_lens_tensor - common_prefix_len output = torch.empty_like(query) cascade_attention( output=output, @@ -167,8 +171,8 @@ def test_cascade( cu_query_lens=cu_query_lens, max_query_len=max_query_len, cu_prefix_query_lens=cu_prefix_query_lens, - cu_prefix_kv_lens=cu_prefix_kv_lens, - cu_suffix_kv_lens=cu_suffix_kv_lens, + prefix_kv_lens=prefix_kv_lens, + suffix_kv_lens=suffix_kv_lens, max_kv_len=max_kv_len, softmax_scale=scale, alibi_slopes=None, @@ -176,6 +180,7 @@ def test_cascade( logits_soft_cap=soft_cap if soft_cap is not None else 0, block_table=block_tables, common_prefix_len=common_prefix_len, + fa_version=fa_version, ) # Compare the results. diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py index afe53797322f9..c3eddacec2727 100644 --- a/tests/kernels/test_cutlass.py +++ b/tests/kernels/test_cutlass.py @@ -2,7 +2,7 @@ Run `pytest tests/kernels/test_cutlass.py`. 
""" -from typing import Optional, Type +from typing import Type import pytest import torch @@ -11,6 +11,8 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +from .utils import baseline_scaled_mm, to_fp8, to_int8 + MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), @@ -41,34 +43,10 @@ capability = capability[0] * 10 + capability[1] -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) - - -def to_int8(tensor: torch.Tensor): - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - def rand_int8(shape: tuple, device: str = "cuda"): return to_int8(torch.rand(shape, device=device) * 255 - 128) -def baseline_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = (scale_a * (scale_b * (torch.mm( - a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) - if bias is not None: - output = output + bias - - return output - - def cutlass_fp8_gemm_helper(m: int, n: int, k: int, diff --git a/tests/kernels/test_cutlass_2of4_sparse.py b/tests/kernels/test_cutlass_2of4_sparse.py new file mode 100644 index 0000000000000..56495df34aa6c --- /dev/null +++ b/tests/kernels/test_cutlass_2of4_sparse.py @@ -0,0 +1,214 @@ +"""Tests for sparse cutlass kernels + +Run `pytest tests/kernels/test_semi_structured.py`. +""" +from typing import Tuple, Type + +import pytest +import torch +import torch.nn.functional as F + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + sparse_cutlass_supported) +from vllm.platforms import current_platform + +from .utils import baseline_scaled_mm, to_fp8, to_int8 + +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] + +capability = current_platform.get_device_capability() +capability = capability[0] * 10 + capability[1] + + +def to_bf16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.bfloat16) + + +def to_fp16(tensor: torch.Tensor) -> torch.Tensor: + return tensor.to(dtype=torch.float16) + + +def prune_to_2_4(tensor): + # Reshape tensor to [N, 4] where N is number of groups of 4 + original_shape = tensor.shape + reshaped = tensor.reshape(-1, 4) + + # Get indices of top 2 absolute values in each group of 4 + _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) + + # Create binary mask + mask = torch.zeros_like(reshaped) + mask.scatter_(dim=1, + index=indices, + src=torch.ones_like(indices, dtype=mask.dtype)) + + # Apply mask and reshape back + pruned = reshaped * mask + + # Turn all -0.0 to 0.0 + pruned[pruned == -0.0] = 0.0 + + return pruned.reshape(original_shape) + + +def make_rand_sparse_tensors( + dtype: torch.dtype, m: int, n: int, k: int +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + a = torch.randn((m, k), device='cuda') * 5 + b = torch.randn((n, k), device='cuda').t() * 5 + + b = prune_to_2_4(b.t()).t() + + if dtype == torch.int8: + a, b = to_int8(a), to_int8(b) + elif dtype == torch.float8_e4m3fn: + a, b = to_fp8(a), to_fp8(b) + elif dtype == torch.float16: + a, b = to_fp16(a), to_fp16(b) + elif dtype == torch.bfloat16: + a, b = to_bf16(a), to_bf16(b) + else: + raise ValueError("unsupported dtype") + + b_compressed, e = ops.cutlass_sparse_compress(b.t()) + + # Compressed B, Metadata, Original A, B + return 
b_compressed, e, a, b + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +# Test working with a subset of A and B for sparse matmul +def test_cutlass_sparse_subset(): + + big_m = 1024 + m, n, k = 512, 512, 512 + + # Create tensors + b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, + big_m, n, k) + a = whole_a[0:m, 0:k] + scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 + scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0) + + +MNK_FACTORS = [ + (1, 256, 128), + (1, 16384, 1024), + (1, 24576, 512), + (16, 256, 512), + (16, 16384, 128), + (16, 24576, 4096), + (32, 8192, 4096), + (32, 16384, 4096), + (33, 1024, 1024), + (33, 8192, 128), + (64, 2048, 512), + (64, 16384, 1024), + (100, 8192, 512), + (128, 32768, 4096), + (256, 4096, 4096), + (512, 256, 1024), + (512, 8192, 4096), + (512, 16384, 128), + (512, 24576, 128), +] + + +# Test working with a subset of A and B for sparse matmul +@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m, k, n", MNK_FACTORS) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) +def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype]): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k) + scale_a = torch.ones((1, 1), device="cuda", dtype=torch.float32) + scale_b = torch.ones((1, 1), device="cuda", dtype=torch.float32) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=dtype) + baseline = F.linear(a, b.T) + + torch.testing.assert_close(out, baseline, rtol=1e-2, atol=1e-2) + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m, k, n", MNK_FACTORS) +@pytest.mark.skipif(not current_platform.has_device_capability(89), + reason="FP8 is not supported on this GPU type.") +def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k) + scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0) + + +@pytest.mark.skipif(not sparse_cutlass_supported(), + reason="Sparse CUTLASS is not supported on this GPU type.") +@pytest.mark.parametrize("m,k,n", MNK_FACTORS) +@pytest.mark.parametrize("per_act_token", [True, False]) +@pytest.mark.parametrize("per_out_ch", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool, + per_out_ch: bool, use_bias: bool): + + # Create tensors + b_comp, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k) + scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + 
scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32)) + + out = ops.cutlass_scaled_sparse_mm(a, + b_comp, + e, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + baseline = baseline_scaled_mm(a, + b, + scale_a, + scale_b, + out_dtype=torch.bfloat16) + + torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0) diff --git a/tests/kernels/test_flash_attn.py b/tests/kernels/test_flash_attn.py index 1ae78d7b46c5b..0ee0bf6c6a374 100644 --- a/tests/kernels/test_flash_attn.py +++ b/tests/kernels/test_flash_attn.py @@ -4,8 +4,10 @@ import torch from vllm.platforms import current_platform -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache) +from vllm.vllm_flash_attn import (fa_version_unsupported_reason, + flash_attn_varlen_func, + flash_attn_with_kvcache, + is_fa_version_supported) NUM_HEADS = [(4, 4), (8, 2), (16, 2)] HEAD_SIZES = [128, 256] @@ -80,6 +82,7 @@ def ref_paged_attn( @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("sliding_window", [None, 256]) +@pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_flash_attn_with_paged_kv( use_out: bool, @@ -91,8 +94,13 @@ def test_flash_attn_with_paged_kv( soft_cap: Optional[float], num_blocks: int, sliding_window: Optional[int], + fa_version: int, ) -> None: torch.set_default_device("cuda") + if not is_fa_version_supported(fa_version): + pytest.skip(f"Flash attention version {fa_version} not supported due " + f"to: \"{fa_version_unsupported_reason(fa_version)}\"") + current_platform.seed_everything(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] @@ -131,6 +139,7 @@ def test_flash_attn_with_paged_kv( cache_seqlens=kv_lens_tensor, softcap=soft_cap if soft_cap is not None else 0, window_size=window_size, + fa_version=fa_version, ) output = output if not use_out else out output = output.squeeze(1) @@ -159,6 +168,7 @@ def test_flash_attn_with_paged_kv( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0]) @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("fa_version", [2, 3]) @torch.inference_mode() def test_varlen_with_paged_kv( use_out: bool, @@ -170,8 +180,12 @@ def test_varlen_with_paged_kv( block_size: int, soft_cap: Optional[float], num_blocks: int, + fa_version: int, ) -> None: torch.set_default_device("cuda") + if not is_fa_version_supported(fa_version): + pytest.skip(f"Flash attention version {fa_version} not supported due " + f"to: \"{fa_version_unsupported_reason(fa_version)}\"") current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] @@ -198,9 +212,7 @@ def test_varlen_with_paged_kv( cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum(dim=0, dtype=torch.int32) - cu_kv_lens = torch.tensor([0] + kv_lens, - dtype=torch.int32).cumsum(dim=0, - dtype=torch.int32) + kv_lens = torch.tensor(kv_lens, dtype=torch.int32) max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size block_tables = torch.randint(0, @@ -215,7 +227,7 @@ def test_varlen_with_paged_kv( v=value_cache, out=out, cu_seqlens_q=cu_query_lens, - cu_seqlens_k=cu_kv_lens, + seqused_k=kv_lens, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len, softmax_scale=scale, @@ -223,6 +235,7 @@ def test_varlen_with_paged_kv( window_size=window_size, block_table=block_tables, softcap=soft_cap if soft_cap is not None else 0, + fa_version=fa_version, ) output = output if not use_out 
else out diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index a2c8f71665737..1645ef911d697 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -133,17 +133,19 @@ def test_flashinfer_decode_with_paged_kv( use_tensor_cores=( (num_query_heads//num_kv_heads) > 4) ) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype) - - output = wrapper.forward(query, key_value_cache, logits_soft_cap=soft_cap) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap) + + output = wrapper.run(query, key_value_cache) ref_output = ref_paged_attn(query=query, key_cache=key_cache, @@ -228,7 +230,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -237,12 +239,14 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward( + output = wrapper.run( query, key_value_cache, - logits_soft_cap=soft_cap, ) ref_output = ref_paged_attn(query=query, @@ -253,7 +257,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: List[Tuple[int, int]], block_tables=block_tables, scale=scale, soft_cap=soft_cap) - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -332,7 +336,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8) wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper( workspace_buffer, "NHD") - wrapper.begin_forward( + wrapper.plan( qo_indptr, kv_indptr, kv_indices, @@ -341,13 +345,12 @@ def test_flashinfer_prefill_with_paged_fp8_kv( num_kv_heads, head_size, block_size, + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap, ) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) ref_output = ref_paged_attn(query=query, key_cache=key_cache.squeeze(1), @@ -360,7 +363,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( del query del block_tables # verify prefill fp8 - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ + torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -439,21 +442,18 @@ def test_flashinfer_decode_with_paged_fp8_kv( wrapper = flashinfer.\ BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores) - wrapper.begin_forward(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - data_type=dtype, - q_data_type=dtype) - output = wrapper.forward(query, - kv_cache_fp8, - logits_soft_cap=soft_cap, - k_scale=k_scale, - v_scale=v_scale) + wrapper.plan(kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + 
head_size, + block_size, + "NONE", + q_data_type=dtype, + kv_data_type=kv_cache_dtype, + logits_soft_cap=soft_cap) + output = wrapper.run(query, kv_cache_fp8, k_scale=k_scale, v_scale=v_scale) key_cache = key_value_cache[:, 0, :, :, :].squeeze(1) value_cache = key_value_cache[:, 1, :, :, :].squeeze(1) diff --git a/tests/kernels/test_mha_attn.py b/tests/kernels/test_mha_attn.py new file mode 100644 index 0000000000000..eab874e9e02bb --- /dev/null +++ b/tests/kernels/test_mha_attn.py @@ -0,0 +1,126 @@ +""" +Test: + +* Tests for MultiHeadAttention layer +""" +from unittest.mock import patch + +import pytest +import torch + +from vllm.attention.layer import MultiHeadAttention +from vllm.attention.selector import _Backend, _cached_get_attn_backend +from vllm.platforms import current_platform +from vllm.platforms.cpu import CpuPlatform +from vllm.platforms.cuda import CudaPlatform +from vllm.platforms.rocm import RocmPlatform + + +@pytest.fixture(autouse=True) +def clear_cache(): + """Clear lru cache to ensure each test case runs without caching. + """ + _cached_get_attn_backend.cache_clear() + + +@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) +def test_mha_attn_platform(device: str): + """ + Test the attention selector between different platform and device. + """ + torch.set_default_dtype(torch.float16) + + if device == "cpu": + with patch("vllm.attention.selector.current_platform", CpuPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.TORCH_SDPA + elif device == "hip": + with patch("vllm.attention.selector.current_platform", RocmPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.TORCH_SDPA + else: + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + attn = MultiHeadAttention(16, 64, scale=1) + assert attn.attn_backend == _Backend.XFORMERS + + with patch("vllm.attention.selector.current_platform", CudaPlatform()): + attn = MultiHeadAttention(16, 72, scale=1) + assert attn.attn_backend == _Backend.XFORMERS + + +def ref_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, +) -> torch.Tensor: + """ + Native implementation of scaled dot product attention without mask: + - query, key, value: [batch_size, seq_len, num_heads, head_size] + - attn_mask: [batch_size, seq_len, seq_len] + """ + query, key, value = (x.transpose(1, 2) for x in (query, key, value)) + attn_weights = scale * torch.matmul(query, key.transpose(2, 3)) + attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) + out = torch.matmul(attn_weights, value).transpose(1, 2) + return out + + +BATCH_SIZES = [1, 16] +SEQ_LENS = [1] +NUM_HEADS = [1, 16] +NUM_KV_HEADS = [1] +HEAD_SIZES = [64, 80] +# flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} +DTYPES = [ + torch.half, torch.bfloat16, torch.float +] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] +CUDA_DEVICES = ["cuda"] + + +@pytest.mark.parametrize("batch_size", BATCH_SIZES) +@pytest.mark.parametrize("seq_len", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("num_kv_heads", NUM_KV_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_mha_attn_forward( + batch_size: int, + seq_len: int, + num_heads: int, + num_kv_heads: int, + head_size: int, + dtype: torch.dtype, + device: str, +): + current_platform.seed_everything(0) + 
torch.set_default_device(device) + torch.set_default_dtype(dtype) + + q = torch.randn(batch_size, seq_len, num_heads * head_size) + k = torch.randn(batch_size, seq_len, num_kv_heads * head_size) + v = torch.randn(batch_size, seq_len, num_kv_heads * head_size) + scale = 1.0 / head_size**0.5 + attn = MultiHeadAttention(num_heads, + head_size, + scale=scale, + num_kv_heads=num_kv_heads) + output = attn(q, k, v) + + assert num_heads % num_kv_heads == 0 + num_queries_per_kv = num_heads // num_kv_heads + q = q.reshape(batch_size, seq_len, num_heads, head_size) + k = k.reshape(batch_size, seq_len, num_kv_heads, head_size) + v = v.reshape(batch_size, seq_len, num_kv_heads, head_size) + if num_queries_per_kv > 1: + k = torch.repeat_interleave(k, num_queries_per_kv, dim=2) + v = torch.repeat_interleave(v, num_queries_per_kv, dim=2) + + ref_output = ref_attention( + q, + k, + v, + scale=scale, + ).reshape(batch_size, seq_len, num_heads * head_size) + torch.testing.assert_close(output, ref_output) diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py index 3fdb7996ba4e0..10e73ab950b0e 100644 --- a/tests/kernels/test_prefix_prefill.py +++ b/tests/kernels/test_prefix_prefill.py @@ -138,6 +138,7 @@ def test_contexted_kv_attention( # to V_cache[num_blocks, num_kv_heads, head_size, block_size] v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Warm up the Triton kernel by calling it once before actually measuring # generation time @@ -153,6 +154,8 @@ def test_contexted_kv_attention( b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, sliding_window=sliding_window) torch.cuda.synchronize() start_time = time.time() @@ -168,6 +171,8 @@ def test_contexted_kv_attention( b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, sliding_window=sliding_window) torch.cuda.synchronize() end_time = time.time() @@ -366,6 +371,7 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: # to V_cache[num_blocks, num_kv_heads, head_size, block_size] v_cache = v_cache.view(-1, block_size, num_kv_heads, head_size).permute(0, 2, 3, 1).contiguous() + k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) # Warm up the Triton kernel by calling it once before actually measuring # generation time @@ -381,6 +387,8 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, alibi_slopes=alibi_slopes) torch.cuda.synchronize() start_time = time.time() @@ -396,6 +404,8 @@ def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: b_seq_len, b_ctx_len, max_input_len, + k_scale, + v_scale, alibi_slopes=alibi_slopes) torch.cuda.synchronize() end_time = time.time() diff --git a/tests/kernels/test_semi_structured.py b/tests/kernels/test_semi_structured.py deleted file mode 100644 index 4316d6ab30e33..0000000000000 --- a/tests/kernels/test_semi_structured.py +++ /dev/null @@ -1,134 +0,0 @@ -"""Tests for sparse cutlass kernels - -Run `pytest tests/kernels/test_semi_structured.py`. 
-""" -from typing import Optional, Tuple, Type - -import pytest -import torch - -from vllm import _custom_ops as ops -from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - sparse_cutlass_supported) -from vllm.platforms import current_platform - -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - -capability = current_platform.get_device_capability() -capability = capability[0] * 10 + capability[1] - - -def to_fp8(tensor: torch.Tensor): - finfo = torch.finfo(torch.float8_e4m3fn) - return torch.round(tensor.clamp( - min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) - - -def to_int8(tensor: torch.Tensor): - return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) - - -def rand_int8(shape: tuple, device: str = "cuda"): - return to_int8(torch.rand(shape, device=device) * 255 - 128) - - -def to_bf16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.bfloat16) - - -def to_fp16(tensor: torch.Tensor) -> torch.Tensor: - return tensor.to(dtype=torch.float16) - - -def prune_to_2_4(tensor): - # Reshape tensor to [N, 4] where N is number of groups of 4 - original_shape = tensor.shape - reshaped = tensor.reshape(-1, 4) - - # Get indices of top 2 absolute values in each group of 4 - _, indices = torch.topk(torch.abs(reshaped), k=2, dim=1) - - # Create binary mask - mask = torch.zeros_like(reshaped) - mask.scatter_(dim=1, - index=indices, - src=torch.ones_like(indices, dtype=mask.dtype)) - - # Apply mask and reshape back - pruned = reshaped * mask - - # Turn all -0.0 to 0.0 - pruned[pruned == -0.0] = 0.0 - - return pruned.reshape(original_shape) - - -def make_rand_sparse_tensors( - dtype: torch.dtype, m: int, n: int, k: int -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - a = torch.randn((m, k), device='cuda') * 5 - b = torch.randn((n, k), device='cuda').t() * 5 - - b = prune_to_2_4(b.t()).t() - - if dtype == torch.int8: - a, b = to_int8(a), to_int8(b) - elif dtype == torch.float8_e4m3fn: - a, b = to_fp8(a), to_fp8(b) - elif dtype == torch.float16: - a, b = to_fp16(a), to_fp16(b) - elif dtype == torch.bfloat16: - a, b = to_bf16(a), to_bf16(b) - else: - raise ValueError("unsupported dtype") - - b_compressed, e = ops.cutlass_sparse_compress(b.t()) - - # Compressed B, Metadata, Original A, B - return b_compressed, e, a, b - - -def baseline_scaled_mm(a: torch.Tensor, - b: torch.Tensor, - scale_a: torch.Tensor, - scale_b: torch.Tensor, - out_dtype: Type[torch.dtype], - bias: Optional[torch.Tensor] = None) -> torch.Tensor: - output = (scale_a * (scale_b * (torch.mm( - a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) - if bias is not None: - output = output + bias - - return output - - -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") -# Test working with a subset of A and B for sparse matmul -def test_cutlass_sparse_subset(): - - big_m = 1024 - m, n, k = 512, 512, 512 - - # Create tensors - b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, - big_m, n, k) - a = whole_a[0:m, 0:k] - scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10 - - out = ops.cutlass_scaled_sparse_mm(a, - b_comp, - e, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - baseline = baseline_scaled_mm(a, - b, - scale_a, - scale_b, - out_dtype=torch.bfloat16) - - torch.testing.assert_close(out, baseline, rtol=1e-1, 
atol=1e0) diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py index 8e96a2f70d751..a5aab3c2ea4b0 100644 --- a/tests/kernels/test_triton_scaled_mm.py +++ b/tests/kernels/test_triton_scaled_mm.py @@ -39,6 +39,23 @@ def get_8bit_types(): return types +# This test is to check regressions for int8 support on ROCm. +@pytest.mark.parametrize("model_path", [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8", +]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [10]) +@pytest.mark.skipif(not current_platform.is_rocm(), + reason="Should only run on ROCm") +def test_rocm_compressed_tensors_w8a8(vllm_runner, example_prompts, model_path, + max_tokens, num_logprobs): + dtype = "bfloat16" + + with vllm_runner(model_path, dtype=dtype) as vllm_model: + vllm_model.generate_greedy_logprobs(example_prompts, max_tokens, + num_logprobs) + + @pytest.mark.parametrize("M", [1, 33, 64, 512]) @pytest.mark.parametrize("N", [256, 971, 20486]) @pytest.mark.parametrize("K", [128, 496, 1024]) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 848eea7f54cab..fb2c9f5d30583 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -5,7 +5,7 @@ import unittest from numbers import Number from typing import (Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, - Union) + Type, Union) import pytest import torch @@ -909,6 +909,7 @@ def make_test_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, @@ -958,6 +959,7 @@ def make_test_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, num_prefill_tokens=num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, @@ -1098,3 +1100,28 @@ def opcheck(op: Union[torch._ops.OpOverload, torch._ops.OpOverloadPacket, kwargs, test_utils=test_utils, raise_exception=raise_exception) if cond else {} + + +# For testing quantized linear kernels +def to_fp8(tensor: torch.Tensor): + finfo = torch.finfo(torch.float8_e4m3fn) + return torch.round(tensor.clamp( + min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn) + + +def to_int8(tensor: torch.Tensor): + return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8) + + +def baseline_scaled_mm(a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype], + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + output = (scale_a * (scale_b * (torch.mm( + a.to(dtype=torch.float32), b.to(dtype=torch.float32))))).to(out_dtype) + if bias is not None: + output = output + bias + + return output diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 718730bb8cbbe..4d6890305af73 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, device: %s" % (my_rank, device)) + print(f"My rank: {my_rank}, device: {device}") # insert tokens = torch.tensor([1, 2, 3]).to(device) @@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - print("My rank: %d, Test run passed!" 
% (my_rank)) + print(f"My rank: {my_rank}, Test run passed!") def stress_test(my_rank, buf, device): @@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device): assert torch.allclose(k, k_) assert torch.allclose(v, v_) assert torch.allclose(h, h_) - print('Rank %d done' % my_rank) + print(f"Rank {my_rank} done") torch.distributed.barrier() if my_rank == 0: @@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): else: torch.distributed.send(torch.tensor([n]), 0) - print("My rank: %d, Passed stress test!" % (my_rank)) + print(f"My rank: {my_rank}, Passed stress test!") if __name__ == "__main__": @@ -122,7 +122,7 @@ def stress_test(my_rank, buf, device): rank=my_rank, ) - print("initialized! My rank is %d" % my_rank) + print(f"initialized! My rank is {my_rank}") config = KVTransferConfig( kv_connector='PyNcclConnector', diff --git a/tests/lora/test_lora_manager_hpu.py b/tests/lora/test_lora_manager_hpu.py index ede3b11e431f5..1771f8762ae64 100644 --- a/tests/lora/test_lora_manager_hpu.py +++ b/tests/lora/test_lora_manager_hpu.py @@ -14,10 +14,12 @@ from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.models import (LoRAMapping, LoRAModel, LoRAModelManager, LRUCacheLoRAModelManager) +from vllm.lora.peft_helper import PEFTHelper from vllm.lora.request import LoRARequest from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, WorkerLoRAManager) from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.platforms import current_platform EMBEDDING_MODULES = { "embed_tokens": "input_embeddings", @@ -26,18 +28,26 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] +DEVICES = ([ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] if current_platform.is_cuda_alike() else + ["hpu:0"] if current_platform.is_hpu() else ["cpu"]) -def test_from_lora_tensors(sql_lora_files): + +@pytest.mark.parametrize("device", DEVICES) +def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( os.path.join(sql_lora_files, "new_embeddings.safetensors")) + + peft_helper = PEFTHelper.from_local_dir(sql_lora_files, + max_position_embeddings=4096) lora_model = LoRAModel.from_lora_tensors( 1, - 8, - 16, tensors, - torch.device("hpu"), + peft_helper=peft_helper, + device=device, embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES) @@ -47,6 +57,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.lora_alpha == 16 assert lora.lora_a is not None assert lora.lora_b is not None + assert lora.lora_a.device == torch.device(device) + assert lora.lora_b.device == torch.device(device) assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" assert lora.lora_a.shape[1] == 8 @@ -61,8 +73,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.embeddings_tensor is None -def create_lora(lora_id: int, model: nn.Module, - sub_modules: List[str]) -> LoRAModel: +def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str], + device: torch.device) -> LoRAModel: loras: Dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight @@ -70,8 +82,8 @@ def create_lora(lora_id: int, model: nn.Module, name, 8, 16, - torch.rand([w.shape[1], 8], device="hpu"), - torch.rand([8, w.shape[0]], device="hpu"), + torch.rand([w.shape[1], 8], device=device), + torch.rand([8, w.shape[0]], device=device), ) return LoRAModel(lora_id, 8, 
loras) @@ -81,6 +93,7 @@ def create_packed_lora( model: nn.Module, module_name, replaced_module_names, + device: torch.device, empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight @@ -92,9 +105,9 @@ def create_packed_lora( replaced_module_name, 8, 16, - torch.rand([w.shape[1], 8], device="hpu"), + torch.rand([w.shape[1], 8], device=device), torch.rand([8, w.shape[0] // len(replaced_module_names)], - device="hpu"), + device=device), ) return LoRAModel(lora_id, 8, loras) @@ -106,7 +119,7 @@ def test_replace_submodules(dist_init, dummy_model): manager = LoRAModelManager( model, 1, 1, 1, LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), - torch.device("hpu")) + torch.device(DEVICES[0])) model = manager.model assert isinstance(model.get_submodule("dense1"), @@ -118,17 +131,28 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -def test_lora_model_manager(dist_init, dummy_model): +@pytest.mark.parametrize("device", DEVICES) +def test_lora_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2), - torch.device("hpu")) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -164,18 +188,32 @@ def test_lora_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[0] == 3 assert manager.lora_index_to_id[1] == 2 + assert manager.device == device + assert manager.punica_wrapper.device == device -def test_lora_lru_cache_model_manager(dist_init, dummy_model): + +@pytest.mark.parametrize("device", DEVICES) +def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2), - torch.device("hpu")) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -242,21 +280,37 @@ 
def test_lora_lru_cache_model_manager(dist_init, dummy_model): with pytest.raises(ValueError): assert manager.pin_adapter(3) + assert manager.punica_wrapper.device == device + assert manager.device == device + -def test_lru_lora_model_manager(dist_init, dummy_model): +@pytest.mark.parametrize("device", DEVICES) +def test_lru_lora_model_manager(dist_init, dummy_model, device): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2), - torch.device("hpu")) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora4 = create_lora(4, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) @@ -356,14 +410,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert manager.remove_oldest_adapter() assert set(manager.list_adapters()) == {1} + assert manager.punica_wrapper.device == device + assert manager.device == device +@pytest.mark.parametrize("device", DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = LRUCacheWorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings.model) @@ -431,14 +488,19 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + +@pytest.mark.parametrize("device", DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): # Should remove every LoRA not specified in the request. 
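+    # DEVICES (defined at module level) resolves to CUDA, HPU, or CPU
+    # depending on the detected platform, so this check runs against
+    # whichever backend is available.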
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = WorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.model.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("hpu"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings.model) @@ -502,8 +564,13 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + -def test_packed_loras(dist_init, dummy_model_gate_up): +@pytest.mark.parametrize("device", DEVICES) +def test_packed_loras(dist_init, dummy_model_gate_up, device): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] model.packed_modules_mapping = { @@ -516,19 +583,25 @@ def test_packed_loras(dist_init, dummy_model_gate_up): 1, model, module_name="gate_up_proj", - replaced_module_names=["gate_proj", "up_proj"]) + replaced_module_names=["gate_proj", "up_proj"], + device=device) model_lora1 = create_packed_lora( 2, model, module_name="gate_up_proj", replaced_module_names=["gate_proj", "up_proj"], + device=device, empty_replaced_module_name="gate_proj", ) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2), - torch.device("hpu")) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) model = manager.model assert isinstance(model.get_submodule("gate_up_proj"), @@ -539,21 +612,21 @@ def test_packed_loras(dist_init, dummy_model_gate_up): packed_lora = model_lora.get_lora("gate_up_proj") assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights) - assert torch.allclose(packed_lora.lora_a[0], - model_lora.get_lora("gate_proj").lora_a) - assert torch.allclose(packed_lora.lora_b[0], - model_lora.get_lora("gate_proj").lora_b) - assert torch.allclose(packed_lora.lora_a[1], - model_lora.get_lora("up_proj").lora_a) - assert torch.allclose(packed_lora.lora_b[1], - model_lora.get_lora("up_proj").lora_b) + torch.testing.assert_close(packed_lora.lora_a[0], + model_lora.get_lora("gate_proj").lora_a) + torch.testing.assert_close(packed_lora.lora_b[0], + model_lora.get_lora("gate_proj").lora_b) + torch.testing.assert_close(packed_lora.lora_a[1], + model_lora.get_lora("up_proj").lora_a) + torch.testing.assert_close(packed_lora.lora_b[1], + model_lora.get_lora("up_proj").lora_b) packed_lora1 = model_lora1.get_lora("gate_up_proj") assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights) assert packed_lora1.lora_a[0] is None assert packed_lora1.lora_b[0] is None - assert torch.allclose(packed_lora1.lora_a[1], - model_lora1.get_lora("up_proj").lora_a) - assert torch.allclose(packed_lora1.lora_b[1], - model_lora1.get_lora("up_proj").lora_b) + torch.testing.assert_close(packed_lora1.lora_a[1], + model_lora1.get_lora("up_proj").lora_a) + torch.testing.assert_close(packed_lora1.lora_b[1], + model_lora1.get_lora("up_proj").lora_b) diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index ebdd129db5f6a..570aa3861d0be 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -55,9 +55,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts 
-@pytest.mark.xfail(current_platform.is_rocm(), - reason="Qwen2-VL dependency xformers incompatible with ROCm" - ) +@pytest.mark.xfail( + current_platform.is_rocm(), + reason="Qwen2-VL dependency xformers incompatible with ROCm") def test_qwen2vl_lora(qwen2vl_lora_files): llm = vllm.LLM( MODEL_PATH, diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 0609fd96825e3..9c1f784c1c93b 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -25,13 +25,12 @@ def test_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME, revision=REVISION, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -46,11 +45,13 @@ def test_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer.model_max_length == 512 - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, BertEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.CLS - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + + vllm_model.apply_model(check_model) + # assert output assert output @@ -64,13 +65,12 @@ def test_roberta_model_loading_with_params(vllm_runner): with vllm_runner(model_name=MODEL_NAME_ROBERTA, revision=REVISION_ROBERTA, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_config = model.model.llm_engine.model_config - - model_tokenizer = model.model.llm_engine.tokenizer + model_config = vllm_model.model.llm_engine.model_config + model_tokenizer = vllm_model.model.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -84,11 +84,12 @@ def test_roberta_model_loading_with_params(vllm_runner): assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" assert not model_tokenizer.tokenizer_config["do_lower_case"] - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.MEAN - assert model._pooler.normalize + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + vllm_model.apply_model(check_model) # assert output assert output @@ -103,17 +104,18 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): model_name = 
"FacebookAI/roberta-base" with vllm_runner(model_name=model_name, dtype="float16", - max_model_len=MAX_MODEL_LEN) as model: - output = model.encode("Write a short story about a robot that" - " dreams for the first time.\n") + max_model_len=MAX_MODEL_LEN) as vllm_model: + output = vllm_model.encode("Write a short story about a robot that" + " dreams for the first time.\n") - model_tokenizer = model.model.llm_engine.tokenizer + model_tokenizer = vllm_model.model.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name - model = model.model.llm_engine.model_executor\ - .driver_worker.model_runner.model - assert not hasattr(model, "lm_head") - assert isinstance(model, RobertaEmbeddingModel) - assert isinstance(model._pooler, CLSPool) + def check_model(model): + assert isinstance(model, RobertaEmbeddingModel) + assert not hasattr(model, "lm_head") + assert isinstance(model._pooler, CLSPool) + + vllm_model.apply_model(check_model) assert output diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index 53f23e24511b3..5f06f1e3a2fe9 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -19,18 +19,17 @@ @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize( - "kv_cache_dtype,base_model,test_model,scale_path", + "kv_cache_dtype,base_model,test_model", [ # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct", - "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None), + "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"), # Test FP16 checkpoint w. fp8_e5m2 kv-cache. ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct", - "meta-llama/Llama-3.2-1B-Instruct", None), + "meta-llama/Llama-3.2-1B-Instruct"), # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", - "meta-llama/Llama-2-7b-chat-hf", - "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") + "meta-llama/Llama-2-7b-chat-hf") ]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) @@ -48,7 +47,6 @@ def test_models( kv_cache_dtype: str, base_model: str, test_model: str, - scale_path: Optional[str], max_tokens: int, enforce_eager: bool, backend: str, @@ -76,10 +74,6 @@ def test_models( baseline_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) - extra_kwargs = {} - if scale_path is not None: - extra_kwargs["quantization_param_path"] = scale_path - with vllm_runner( test_model, max_model_len=MAX_MODEL_LEN, @@ -87,7 +81,6 @@ def test_models( enforce_eager=enforce_eager, kv_cache_dtype=kv_cache_dtype, disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, ) as vllm_model: test_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 81b93ebdf0fc0..ad8f8a0c320e9 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -66,12 +66,16 @@ def gguf_model(self): gguf_filename="starcoder2-3b.Q6_K.gguf", ) +DOLPHIN_CONFIG = GGUFTestConfig( + # Test VocabParallelEmbedding sharding issue. 
+ original_model="cognitivecomputations/TinyDolphin-2.8-1.1b", + gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF", + gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", +) + MODELS = [ - LLAMA_CONFIG, - QWEN2_CONFIG, - PHI3_CONFIG, - GPT2_CONFIG, - STABLELM_CONFIG, + LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, + DOLPHIN_CONFIG # STARCODER_CONFIG, # broken ] @@ -106,15 +110,18 @@ def test_models( messages, tokenize=False, add_generation_prompt=True) # Run unquantized model. - with vllm_runner(model_name=model.original_model, - dtype=dtype, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=tp_size) as original_model: + with vllm_runner( + model_name=model.original_model, + enforce_eager=True, # faster tests + dtype=dtype, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tp_size) as original_model: original_outputs = original_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) # Run gguf model. with vllm_runner(model_name=model.gguf_model, + enforce_eager=True, tokenizer_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 057b04349e8b7..2e06b10fbb827 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,10 +33,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 06739e8f02253..1ad4f5aae8f5b 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,10 +51,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 4e110366a09f3..c7efa4edbbc0a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -73,10 +73,13 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_logprobs_close( outputs_0_lst=hf_outputs, diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 5710303548c34..62c644f73d62d 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -10,7 +10,6 @@ import pytest from transformers import AutoModelForVision2Seq from transformers import __version__ as TRANSFORMERS_VERSION -from transformers.utils import is_flash_attn_2_available from vllm.platforms import current_platform from vllm.utils import identity @@ -140,9 +139,7 @@ #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], - tokenizer_mode="slow", test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<|img|>\n", max_model_len=4096, @@ -158,8 +155,8 @@ max_tokens=64, marks=[ pytest.mark.skipif( - not is_flash_attn_2_available(), - reason="Model needs flash-attn for numeric convergence.", + TRANSFORMERS_VERSION < "4.48.0", + reason="HF model requires transformers>=4.48.0", ), large_gpu_mark(min_gb=64), ], @@ -190,7 +187,7 @@ dtype="bfloat16", ), "deepseek_vl_v2": VLMTestInfo( - models=["deepseek-ai/deepseek-vl2-tiny"], + models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 max_model_len=4096, @@ -353,6 +350,20 @@ postprocess_inputs=model_utils.wrap_inputs_post_processor, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, ), + "minicpmo_26": VLMTestInfo( + models=["openbmb/MiniCPM-o-2_6"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "(./)\n", + max_model_len=4096, + max_num_seqs=2, + get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 + postprocess_inputs=model_utils.ignore_inputs_post_processor( + "image_sizes" + ), + hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, + patch_hf_runner=model_utils.minicpmo_patch_hf_runner + ), "minicpmv_26": VLMTestInfo( models=["openbmb/MiniCPM-V-2_6"], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), @@ -524,12 +535,13 @@ def _mark_splits( # - image embeddings # - video # - custom inputs -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=False, + )) def test_single_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -546,12 +558,13 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=False, - )) 
+@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=False, + )) def test_multi_image_models(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -568,12 +581,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=False, + )) def test_image_embedding_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], @@ -589,12 +603,13 @@ def test_image_embedding_models(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=False, + )) def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], video_assets: _VideoAssets): @@ -608,12 +623,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=False, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=False, + )) def test_custom_inputs_models( model_type: str, test_case: ExpandableVLMTestArgs, @@ -630,12 +646,13 @@ def test_custom_inputs_models( #### Tests filtering for things running each test as a new process -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -653,12 +670,13 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.MULTI_IMAGE, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.MULTI_IMAGE, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, test_case: ExpandableVLMTestArgs, @@ -676,12 +694,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.EMBEDDING, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + 
get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.EMBEDDING, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_image_embedding_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, @@ -698,12 +717,13 @@ def test_image_embedding_models_heavy(model_type: str, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.VIDEO, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.VIDEO, + fork_new_process_for_each_test=True, + )) def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], @@ -718,12 +738,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ) -@pytest.mark.parametrize("model_type,test_case", - get_parametrized_options( - VLM_TEST_SETTINGS, - test_type=VLMTestType.CUSTOM_INPUTS, - fork_new_process_for_each_test=True, - )) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.CUSTOM_INPUTS, + fork_new_process_for_each_test=True, + )) @fork_new_process_for_each_test def test_custom_inputs_models_heavy( model_type: str, diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index 90c0fab99054c..8103e5305b91b 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -135,10 +135,10 @@ def _dump_outputs_w_logprobs( outputs: OutputsLogprobs, filename: "StrPath", ) -> None: - json_data = [(tokens, text, - [{k: asdict(v) - for k, v in token_logprobs.items()} - for token_logprobs in (logprobs or [])]) + json_data = [(tokens, text, [{ + k: asdict(v) + for k, v in token_logprobs.items() + } for token_logprobs in (logprobs or [])]) for tokens, text, logprobs in outputs] with open(filename, "w") as f: @@ -149,11 +149,10 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs: with open(filename, "rb") as f: json_data = json.load(f) - return [(tokens, text, - [{int(k): Logprob(**v) - for k, v in token_logprobs.items()} - for token_logprobs in logprobs]) - for tokens, text, logprobs in json_data] + return [(tokens, text, [{ + int(k): Logprob(**v) + for k, v in token_logprobs.items() + } for token_logprobs in logprobs]) for tokens, text, logprobs in json_data] @large_gpu_test(min_gb=80) diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 16e256e040a74..5a485f3d81747 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -5,7 +5,6 @@ import torch from PIL import Image -from vllm.entrypoints.llm import LLM from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import rescale_video_size, sample_frames_from_video @@ -69,7 +68,7 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict): def batch_make_image_embeddings( image_batches: List[Union[Image.Image, List[Image.Image]]], processor, - llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptImageEmbeddingInput]: """batched image embeddings for Qwen2-VL This will infer all images' embeddings in a single batch, @@ -105,17 +104,19 @@ 
def batch_make_image_embeddings( pixel_values = preprocess_result["pixel_values"] image_grid_thw = preprocess_result["image_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker. \ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - image_grid_thw_on_device = image_grid_thw.to(visual.device, - dtype=torch.int64) - image_embeds = visual(pixel_values_on_device, - grid_thw=image_grid_thw_on_device) + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + image_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptImageEmbeddingInput] = [] @@ -124,11 +125,10 @@ def batch_make_image_embeddings( for image_batch in image_batches_: cur_batch_image_count = len(image_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in image_grid_thw[image_counter:image_counter + - cur_batch_image_count] - ]) + cur_batch_image_count]) result.append({ "image_embeds": @@ -151,7 +151,7 @@ def batch_make_image_embeddings( def batch_make_video_embeddings( video_batches: PromptVideoInput, processor, - llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + llm: VllmRunner) -> List[Qwen2VLPromptVideoEmbeddingInput]: """batched video embeddings for Qwen2-VL A NDArray represents a single video's all frames. 
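+    Note: llm.apply_model(fn) runs fn on the model held by each worker and
+    returns the per-worker results, which is why the embedding helpers
+    concatenate its output with torch.concat.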
@@ -187,17 +187,19 @@ def batch_make_video_embeddings( pixel_values = preprocess_result["pixel_values_videos"] video_grid_thw = preprocess_result["video_grid_thw"] - # pixel values to embeddinds & grid_thws - with torch.no_grad(): - visual = llm.llm_engine.model_executor.driver_worker.\ - model_runner.model.visual + # pixel values to embeddings & grid_thws + def get_image_embeds(model): + with torch.no_grad(): + visual = model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + return visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) - pixel_values_on_device = pixel_values.to(visual.device, - dtype=visual.dtype) - video_grid_thw_on_device = video_grid_thw.to(visual.device, - dtype=torch.int64) - video_embeds = visual(pixel_values_on_device, - grid_thw=video_grid_thw_on_device) + video_embeds = torch.concat(llm.apply_model(get_image_embeds)) # split into original batches result: List[Qwen2VLPromptVideoEmbeddingInput] = [] @@ -206,11 +208,10 @@ def batch_make_video_embeddings( for video_batch in video_batches_: cur_batch_video_count = len(video_batch) merge_size = image_processor.merge_size - cur_batch_embed_len = sum([ - grid_thw.prod() // merge_size // merge_size + cur_batch_embed_len = sum( + grid_thw.prod(-1) // merge_size // merge_size for grid_thw in video_grid_thw[video_counter:video_counter + - cur_batch_video_count] - ]) + cur_batch_video_count]) result.append({ "video_embeds": @@ -280,9 +281,9 @@ def run_embedding_input_test( max_tokens, num_logprobs=num_logprobs, images=batch_make_image_embeddings( - images, processor, vllm_model.model) if images else None, + images, processor, vllm_model) if images else None, videos=batch_make_video_embeddings( - videos, processor, vllm_model.model) if videos else None) + videos, processor, vllm_model) if videos else None) for prompts, images, videos in inputs ] diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 1ca85c7bb2056..07bdb2cee44d2 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -497,6 +497,17 @@ def _generate(self, *args, **kwargs): return hf_model +def minicpmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + orig_generate = hf_model.model.generate + + def _generate(self, *args, **kwargs): + return orig_generate(*args, decode_text=False, **kwargs) + + hf_model.model.generate = types.MethodType(_generate, hf_model.model) + + return hf_model + + def _generate_greedy_logprobs_limit( self, prompts: List[str], diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 6673a9fc22f69..0cbe4afe96c0a 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -24,10 +24,13 @@ def test_classification_models( ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. 
- model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) with hf_runner(model, dtype=dtype, diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 04ab4dd7371a3..e17198e385475 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -17,14 +17,15 @@ marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-large"), - # [Encoder-decoder] - pytest.param("intfloat/e5-mistral-7b-instruct", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + # [Decoder-only] pytest.param("BAAI/bge-multilingual-gemma2", marks=[pytest.mark.core_model]), - pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + # [Encoder-decoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], ) @@ -61,10 +62,13 @@ def test_models( max_model_len=None, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index be6e3842821e2..3db27d942ac8c 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -5,12 +5,18 @@ import math import pytest +import torch +import torch.nn.functional as F MODELS = [ "cross-encoder/ms-marco-MiniLM-L-6-v2", # Bert "BAAI/bge-reranker-v2-m3", # Roberta ] +EMBEDDING_MODELS = [ + "sentence-transformers/all-MiniLM-L12-v2", +] + TEXTS_1 = [ "What is the capital of France?", "What is the capital of Germany?", @@ -87,3 +93,97 @@ def test_llm_N_to_N(vllm_runner, hf_runner, model_name, dtype: str): assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.fixture(scope="module", params=EMBEDDING_MODELS) +def emb_model_name(request): + yield request.param + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_1_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pair = [TEXTS_1[0], TEXTS_2[0]] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = hf_model.encode(text_pair) + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, hf_embeddings), dim=0) + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(text_pair[0], text_pair[1]) + + assert len(vllm_outputs) == 1 + assert len(hf_outputs) == 1 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_1_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[0], TEXTS_2[1]], + ] + + with 
hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1[0], TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) + + +@pytest.mark.parametrize("dtype", ["half"]) +def test_llm_N_to_N_embedding(vllm_runner, hf_runner, emb_model_name, + dtype: str): + + text_pairs = [ + [TEXTS_1[0], TEXTS_2[0]], + [TEXTS_1[1], TEXTS_2[1]], + ] + + with hf_runner(emb_model_name, dtype=dtype, + is_sentence_transformer=True) as hf_model: + hf_embeddings = [ + hf_model.encode(text_pair) for text_pair in text_pairs + ] + hf_outputs = [ + F.cosine_similarity(*map(torch.tensor, pair), dim=0) + for pair in hf_embeddings + ] + + with vllm_runner(emb_model_name, + task="embed", + dtype=dtype, + max_model_len=None) as vllm_model: + vllm_outputs = vllm_model.score(TEXTS_1, TEXTS_2) + + assert len(vllm_outputs) == 2 + assert len(hf_outputs) == 2 + + assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) + assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 84ed415c136cc..61fd5d8b57274 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -1,11 +1,15 @@ from typing import List, Optional, Tuple, Type, overload import pytest +import torch from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) +from vllm.attention.backends.flash_attn import FlashAttentionMetadata from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) +from vllm.model_executor.models.mllama import (MLLAMA_IMAGE_TOKEN_ID, + MllamaForConditionalGeneration) from vllm.multimodal.image import rescale_image_size from vllm.sequence import SampleLogprobs @@ -33,6 +37,29 @@ "meta-llama/Llama-3.2-11B-Vision-Instruct", ] +# Indices for inputs +TEXT_ONLY = '0' +IMAGE_AT_BEG = '1' +IMAGE_AT_MIDDLE = '2' +TWO_IMAGES = '3' + +# Input tokenized +prompt_data = { + # Tell me a story + TEXT_ONLY: [41551, 757, 264, 3446], + # <|image|> What's the content of this image + IMAGE_AT_BEG: + [MLLAMA_IMAGE_TOKEN_ID, 3639, 596, 279, 2262, 315, 420, 2217, 220], + # Hello <|image|>What' the content of this image + IMAGE_AT_MIDDLE: + [9906, 220, MLLAMA_IMAGE_TOKEN_ID, 3923, 6, 279, 2262, 315, 420, 2217], + #<|image|>Is there a duck in this image?<|image|>What's the animal in this image? 
# noqa: E501 + TWO_IMAGES: [ + MLLAMA_IMAGE_TOKEN_ID, 3957, 1070, 264, 37085, 304, 420, 2217, 30, + MLLAMA_IMAGE_TOKEN_ID, 3923, 596, 279, 10065, 304, 420, 2217, 30 + ] +} + def vllm_to_hf_output(vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -387,3 +414,184 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, num_logprobs=num_logprobs, tensor_parallel_size=1, ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) +def test_regression(vllm_runner, image_assets, model, dtype, max_tokens, + num_logprobs, attn_backend: _Backend) -> None: + + stop_sign = image_assets[0].pil_image + + with global_force_attn_backend_context_manager(attn_backend), vllm_runner( + model, + dtype=dtype, + max_model_len=4096, + max_num_seqs=2, + tensor_parallel_size=1, + enforce_eager=True, + limit_mm_per_prompt={"image": + _LIMIT_IMAGE_PER_PROMPT}) as vllm_model: + + # Regression tests for https://github.com/vllm-project/vllm/issues/10648 + + # Number of image tags is greater than the number of images provided + prompt = "<|begin_of_text|><|image|><|image|> Compare the two images" # noqa: E501 + image = stop_sign + with pytest.raises(ValueError): + vllm_model.generate_greedy_logprobs([prompt], + max_tokens, + num_logprobs, + images=[image]) + + # Batch of a text-only and image request that requires cross-attention + prompts = [ + "What is the capital of spain?", + "Text before the image...<|image|>What is in the image?", # noqa: E501 + ] + images = [ + None, + [stop_sign], + ] + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs, + images=images) + + # Test the reverse order too for good measure + prompts = [ + "<|begin_of_text|>Text before the image...<|image|>What is in the image?", # noqa: E501 + "<|begin_of_text|>Hello!", + ] + images = [ + [stop_sign], + None, + ] + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs, + images=images) + + +@pytest.mark.core_model +@pytest.mark.parametrize( + "input_indices_and_output", + # inputs, (cross_attention_mask, kv_range_for_decode) + [([TEXT_ONLY], (None, None)), ([IMAGE_AT_BEG], (None, None)), + ([TEXT_ONLY, IMAGE_AT_BEG], (None, None)), + ([IMAGE_AT_MIDDLE], ((10, 12), [[0, 6]])), + ([TEXT_ONLY, IMAGE_AT_MIDDLE], ((14, 12), [[0, 6]])), + ([TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], + ((23, 24), [[0, 6], [6, 12]])), + ([IMAGE_AT_MIDDLE, TEXT_ONLY], ((14, 12), [[0, 6]])), + ([TWO_IMAGES], ((18, 12), [[6, 12]])), + ([TEXT_ONLY, TWO_IMAGES], ((22, 12), [[6, 12]]))]) +def test_get_cross_attention_mask(input_indices_and_output) -> None: + + input_indices, expected_output = input_indices_and_output + + sequences = [torch.tensor(prompt_data[i]) for i in input_indices] + num_tiles = [[2, 2] if i != TEXT_ONLY else [] for i in input_indices + if i != TEXT_ONLY] + input = torch.cat(sequences) + + seq_lens = [len(s) for s in sequences] + + attn_data = FlashAttentionMetadata( + seq_lens=seq_lens, + # Dummy values + enable_kv_scales_calculation=False, + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=0, + slot_mapping=0, + multi_modal_placeholder_index_maps=None, + seq_lens_tensor=0, + max_prefill_seq_len=0, + max_decode_seq_len=0, + context_lens_tensor=None, + block_tables=None, + use_cuda_graph=False, + ) + + dummy: 
dict[str, str] = {} + + cross_attention_mask, kv_range_for_decode = MllamaForConditionalGeneration\ + .get_cross_attention_mask(dummy, + input, + attn_data, + num_tiles=num_tiles, + num_tokens_per_tile=3, + dtype=torch.bfloat16) + + expected_cross_attention_mask, expected_kv_range_for_decode = \ + expected_output + + assert kv_range_for_decode == expected_kv_range_for_decode + if expected_cross_attention_mask is not None: + assert cross_attention_mask is not None + assert cross_attention_mask.shape == expected_cross_attention_mask + else: + assert cross_attention_mask is None + + +@pytest.mark.core_model +@pytest.mark.parametrize( + "input_indices", + [[TEXT_ONLY], [IMAGE_AT_BEG], [TEXT_ONLY, IMAGE_AT_BEG], [IMAGE_AT_MIDDLE], + [TEXT_ONLY, IMAGE_AT_MIDDLE], [TEXT_ONLY, IMAGE_AT_BEG, IMAGE_AT_MIDDLE], + [IMAGE_AT_MIDDLE, TEXT_ONLY], [TWO_IMAGES], [TEXT_ONLY, TWO_IMAGES]]) +def test_get_full_text_row_masked_out_mask(input_indices) -> None: + + sequences = [torch.tensor(prompt_data[i]) for i in input_indices] + + seq_lens = [len(s) for s in sequences] + + num_prefill_tokens = sum(seq_lens) + + # TEXT_ONLY is zero, so it will be masked out, + # other instances should not be. + encoder_seq_lens = [int(i) for i in input_indices] + + attn_data = FlashAttentionMetadata( + seq_lens=seq_lens, + encoder_seq_lens=encoder_seq_lens, + num_prefill_tokens=num_prefill_tokens, + # Dummy values + enable_kv_scales_calculation=False, + num_prefills=0, + num_decode_tokens=0, + slot_mapping=0, + multi_modal_placeholder_index_maps=None, + seq_lens_tensor=0, + max_prefill_seq_len=0, + max_decode_seq_len=0, + context_lens_tensor=None, + block_tables=None, + use_cuda_graph=False, + ) + + dummy: dict[str, str] = {} + + full_text_row_masked_out_mask = MllamaForConditionalGeneration\ + .get_full_text_row_masked_out_mask(dummy, + attn_data, + torch.get_default_device()) + + full_text_row_masked_out_mask = full_text_row_masked_out_mask.squeeze() + full_text_row_masked_out_mask = full_text_row_masked_out_mask.tolist() + + idx = 0 + assert len(full_text_row_masked_out_mask) == num_prefill_tokens + for i, seq_len in enumerate(seq_lens): + must_be_masked = input_indices[i] != TEXT_ONLY + for _ in range(seq_len): + assert full_text_row_masked_out_mask[idx] == must_be_masked, \ + f"full_text_row_masked_out_mask[{idx}] must be " \ + f"'{must_be_masked}' " + idx += 1 diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0a38779e0e4f0..ca28da268fa05 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -11,47 +11,52 @@ from vllm.multimodal.utils import cached_get_tokenizer from ....multimodal.utils import random_audio, random_image, random_video +from ...registry import HF_EXAMPLE_MODELS def _test_processing_correctness( model_id: str, - modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, ): - if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": - hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} - else: - hf_overrides = {} - - limit_mm_per_prompt = { - modality: 3 if supports_multi else 1 - for modality, supports_multi in modalities.items() - } + model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id) + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") model_config = ModelConfig( model_id, task="auto", tokenizer=model_id, tokenizer_mode="auto", - trust_remote_code=True, + 
trust_remote_code=model_info.trust_remote_code, seed=0, dtype="float16", revision=None, - hf_overrides=hf_overrides, - limit_mm_per_prompt=limit_mm_per_prompt, + hf_overrides=model_info.hf_overrides, ) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) factories = MULTIMODAL_REGISTRY._processor_factories[model_cls] ctx = InputProcessingContext( model_config, - tokenizer=cached_get_tokenizer(model_config.tokenizer), + tokenizer=cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_info.trust_remote_code, + ), ) # Ensure that it can fit all of the data cache = ProcessingCache(capacity=1 << 30) + processing_info = factories.info(ctx) + supported_mm_limits = processing_info.get_supported_mm_limits() + limit_mm_per_prompt = { + modality: 3 if limit is None else limit + for modality, limit in supported_mm_limits.items() + } + + model_config.get_multimodal_config().limit_per_prompt = limit_mm_per_prompt + baseline_processor = factories.build_processor(ctx, cache=None) cached_processor = factories.build_processor(ctx, cache=cache) dummy_inputs = baseline_processor.dummy_inputs @@ -82,8 +87,8 @@ def _test_processing_correctness( mm_data = { k: [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(rng.randint(limit_mm_per_prompt[k]))] - for k in modalities + for _ in range(rng.randint(limit))] + for k, limit in limit_mm_per_prompt.items() } mm_counts = {k: len(vs) for k, vs in mm_data.items()} @@ -135,20 +140,24 @@ def _test_processing_correctness( # yapf: disable # True if the model supports multiple data items of the modality per request -@pytest.mark.parametrize(("model_id", "modalities"), [ - ("rhymes-ai/Aria", {"image": True}), - ("Salesforce/blip2-opt-2.7b", {"image": False}), - ("facebook/chameleon-7b", {"image": False}), - ("adept/fuyu-8b", {"image": False}), - ("llava-hf/llava-1.5-7b-hf", {"image": True}), - ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), - ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), - ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), - ("mistral-community/pixtral-12b", {"image": True}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), - ("fixie-ai/ultravox-v0_3", {"audio": True}), +@pytest.mark.parametrize("model_id", [ + "rhymes-ai/Aria", + "Salesforce/blip2-opt-2.7b", + "facebook/chameleon-7b", + "deepseek-ai/deepseek-vl2-tiny", + "adept/fuyu-8b", + "llava-hf/llava-1.5-7b-hf", + "llava-hf/llava-v1.6-mistral-7b-hf", + "llava-hf/LLaVA-NeXT-Video-7B-hf", + "llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + "TIGER-Lab/Mantis-8B-siglip-llama3", + "mistral-community/pixtral-12b", + "openbmb/MiniCPM-o-2_6", + "openbmb/MiniCPM-V-2_6", + "Qwen/Qwen-VL-Chat", + "Qwen/Qwen2-VL-2B-Instruct", + "Qwen/Qwen2-Audio-7B-Instruct", + "fixie-ai/ultravox-v0_3", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -156,14 +165,12 @@ def _test_processing_correctness( # yapf: enable def test_processing_correctness( model_id: str, - modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, ): _test_processing_correctness( model_id, - modalities, hit_rate=hit_rate, num_batches=num_batches, simplify_rate=simplify_rate, @@ -171,16 +178,13 @@ def test_processing_correctness( # yapf: disable -@pytest.mark.parametrize(("model_id", "modalities"), [ - 
("microsoft/Phi-3-vision-128k-instruct", {"image": True}), -]) +@pytest.mark.parametrize("model_id", ["microsoft/Phi-3-vision-128k-instruct"]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @pytest.mark.parametrize("simplify_rate", [1.0]) # yapf: enable def test_processing_correctness_phi3v( model_id: str, - modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -194,7 +198,6 @@ def test_processing_correctness_phi3v( _test_processing_correctness( model_id, - modalities, hit_rate=hit_rate, num_batches=num_batches, simplify_rate=simplify_rate, diff --git a/tests/models/multimodal/processing/test_qwen.py b/tests/models/multimodal/processing/test_qwen.py deleted file mode 100644 index af0ace711ba3e..0000000000000 --- a/tests/models/multimodal/processing/test_qwen.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Tests for Qwen's multimodal preprocessing kwargs.""" -from typing import Dict, List, Union - -import pytest -import torch -from PIL.Image import Image - -from vllm.inputs import InputContext, token_inputs -from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.utils import cached_get_tokenizer - -from ....conftest import IMAGE_ASSETS -from ...utils import build_model_context - -### Multimodal preprocessing tests -SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image -# These values are specific to Qwen-VL/Chat; we can get these from the model -# config also, but they are hardcoded here to keep the parameterize/fixtures -# easy to read. -IMG_START_ID = 151857 -IMG_END_ID = 151858 -IMG_PAD_ID = 151859 -TOKS_PER_IMG = 256 -VIS_ENC_DIM = 4096 -IMG_SIZE = 448 - - -@pytest.fixture() -def input_mapper_for_qwen(): - # Lazy import to avoid initializing CUDA during test collection - from vllm.model_executor.models.qwen import input_mapper_for_qwen - return input_mapper_for_qwen - - -@pytest.fixture() -def input_processor_for_qwen(): - # Lazy import to avoid initializing CUDA during test collection - from vllm.model_executor.models.qwen import input_processor_for_qwen - return input_processor_for_qwen - - -@pytest.fixture() -def qwen_vl_context() -> InputContext: - """Get an InputContext for Qwen-VL.""" - return build_model_context(model_name="Qwen/Qwen-VL", - trust_remote_code=True) - - -# Happy path tests for single/multi-image scenarios for the multimodal -# input processor and mapper, respectively -@pytest.mark.parametrize("num_images", [1, 2]) -def test_input_processor_valid_mm_data(input_processor_for_qwen, - qwen_vl_context: InputContext, - num_images: int): - """Happy cases for image inputs to Qwen's multimodal input processor.""" - prompt = "".join( - [f"Picture {num}: \n" for num in range(1, num_images + 1)]) - inputs = token_inputs( - prompt=prompt, - # When processing multimodal data for a multimodal model, the qwen - # input processor will overwrite the provided prompt_token_ids with - # the image prompts - prompt_token_ids=[], - multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)}, - ) - proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs) - assert isinstance(proc_inputs, dict) - - # Each image should have one start / stop and a fixed context of 256 - proc_tokens = proc_inputs["prompt_token_ids"] - assert proc_tokens.count(IMG_START_ID) == num_images - assert proc_tokens.count(IMG_END_ID) == num_images - assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG - - -@pytest.mark.parametrize( - "img_data,expected_shape", - [ - # single / multi-image - (SAMPLE_IMAGE, (1, 3, 
IMG_SIZE, IMG_SIZE)), - (2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)), - # single / multi-image embeddings - (torch.rand( - (TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), - (torch.rand( - (1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)), - (torch.rand( - (2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)), - ]) -def test_input_mapper_valid_mm_data(input_mapper_for_qwen, - qwen_vl_context: InputContext, - img_data: Union[torch.Tensor, List[Image], - Image], - expected_shape: List[int]): - """Happy cases for image inputs to Qwen's multimodal input mapper.""" - mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) - # Ensure that we get the appropriately shaped pixel_values - # for images and image embeddings, respectively. - assert isinstance(mapped_img_data, MultiModalKwargs) - assert "pixel_values" in mapped_img_data - assert mapped_img_data["pixel_values"].shape == expected_shape - - -# Sad path tests for the multimodal input processor and mapper, respectively -@pytest.mark.parametrize("mm_data", [ - { - "image": torch.rand(5) - }, - { - "image": torch.rand((5, 5, 5, 5, 5)) - }, -]) -def test_input_processor_invalid_mm_data(input_processor_for_qwen, - qwen_vl_context: InputContext, - mm_data: Dict[str, torch.Tensor]): - """Test sad cases validated in Qwen's multimodal input processor.""" - tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer, - trust_remote_code=True) - prompt = "Picture 1: \n" - prompt_token_ids = tokenizer.encode(prompt) - inputs = token_inputs(prompt=prompt, - prompt_token_ids=prompt_token_ids, - multi_modal_data=mm_data) - # Should fail since we have too many or too few dimensions for embeddings - with pytest.raises(ValueError): - input_processor_for_qwen(qwen_vl_context, inputs) - - -@pytest.mark.parametrize( - "img_data", - [ - # Wrong context length - torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)), - # Wrong visual encoder output size - torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)), - ]) -def test_input_mapper_invalid_mm_data( - input_mapper_for_qwen, - qwen_vl_context: InputContext, - img_data: Union[torch.Tensor, List[Image], Image], -): - """Sad cases validated in Qwen VL's multimodal input mapper.""" - with pytest.raises(ValueError): - input_mapper_for_qwen(qwen_vl_context, img_data) diff --git a/tests/models/registry.py b/tests/models/registry.py index 938c838617e8b..7952e65aa76a5 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -1,5 +1,9 @@ from dataclasses import dataclass, field -from typing import AbstractSet, Mapping, Optional +from typing import AbstractSet, Any, Literal, Mapping, Optional + +import pytest +from packaging.version import Version +from transformers import __version__ as TRANSFORMERS_VERSION @dataclass(frozen=True) @@ -38,6 +42,50 @@ class _HfExamplesInfo: trust_remote_code: bool = False """The ``trust_remote_code`` level required to load the model.""" + hf_overrides: dict[str, Any] = field(default_factory=dict) + """The ``hf_overrides`` required to load the model.""" + + def check_transformers_version( + self, + *, + on_fail: Literal["error", "skip"], + ) -> None: + """ + If the installed transformers version does not meet the requirements, + perform the given action. 
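+
+        A sketch of the intended call pattern (it mirrors the call sites
+        added elsewhere in this change, e.g. tests/models/test_initialization.py):
+
+            model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+            model_info.check_transformers_version(on_fail="skip")
+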
+ """ + if self.min_transformers_version is None: + return + + current_version = TRANSFORMERS_VERSION + required_version = self.min_transformers_version + if Version(current_version) < Version(required_version): + msg = ( + f"You have `transformers=={current_version}` installed, but " + f"`transformers>={required_version}` is required to run this " + "model") + + if on_fail == "error": + raise RuntimeError(msg) + else: + pytest.skip(msg) + + def check_available_online( + self, + *, + on_fail: Literal["error", "skip"], + ) -> None: + """ + If the model is not available online, perform the given action. + """ + if not self.is_available_online: + msg = "Model is not available online" + + if on_fail == "error": + raise RuntimeError(msg) + else: + pytest.skip(msg) + # yapf: disable _TEXT_GENERATION_EXAMPLE_MODELS = { @@ -48,8 +96,6 @@ class _HfExamplesInfo: trust_remote_code=True), "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", trust_remote_code=True), - "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", - trust_remote_code=True), "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", trust_remote_code=True), "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", @@ -69,6 +115,7 @@ class _HfExamplesInfo: "DeepseekV3ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V3", # noqa: E501 trust_remote_code=True), "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), @@ -154,6 +201,7 @@ class _HfExamplesInfo: "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), + "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 @@ -174,6 +222,8 @@ class _HfExamplesInfo: _MULTIMODAL_EXAMPLE_MODELS = { # [Decoder-only] + "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria", + min_transformers_version="4.48"), "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", @@ -181,7 +231,8 @@ class _HfExamplesInfo: trust_remote_code=True), "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", is_available_online=False), - "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny"), # noqa: E501 + "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501 + hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501 "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", @@ -192,8 +243,11 @@ class _HfExamplesInfo: "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: 
E501 "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 - "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3"), # noqa: E501 - "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", + "MantisForConditionalGeneration": _HfExamplesInfo("TIGER-Lab/Mantis-8B-siglip-llama3", # noqa: E501 + hf_overrides={"architectures": ["MantisForConditionalGeneration"]}), # noqa: E501 + "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6", + trust_remote_code=True), + "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-V-2_6", trust_remote_code=True), "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", trust_remote_code=True), @@ -209,7 +263,8 @@ class _HfExamplesInfo: trust_remote_code=True), "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 - "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), + "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3", + trust_remote_code=True), # [Encoder-decoder] "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 @@ -245,5 +300,17 @@ def get_supported_archs(self) -> AbstractSet[str]: def get_hf_info(self, model_arch: str) -> _HfExamplesInfo: return self.hf_models[model_arch] + def find_hf_info(self, model_id: str) -> _HfExamplesInfo: + for info in self.hf_models.values(): + if info.default == model_id: + return info + + # Fallback to extras + for info in self.hf_models.values(): + if any(extra == model_id for extra in info.extras.values()): + return info + + raise ValueError(f"No example model defined for {model_id}") + HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index daece7c93c0ef..d3a3aaf670c23 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,9 +1,7 @@ from unittest.mock import patch import pytest -from packaging.version import Version from transformers import PretrainedConfig -from transformers import __version__ as TRANSFORMERS_VERSION from vllm import LLM @@ -13,16 +11,8 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) - if not model_info.is_available_online: - pytest.skip("Model is not available online") - if model_info.min_transformers_version is not None: - current_version = TRANSFORMERS_VERSION - required_version = model_info.min_transformers_version - if Version(current_version) < Version(required_version): - pytest.skip( - f"You have `transformers=={current_version}` installed, but " - f"`transformers>={required_version}` is required to run this " - "model") + model_info.check_available_online(on_fail="skip") + model_info.check_transformers_version(on_fail="skip") # Avoid OOM def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 73b70d65e8e0b..ac0366847e334 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -21,6 +21,9 @@ @pytest.mark.parametrize("model_arch", 
ModelRegistry.get_supported_archs()) def test_registry_imports(model_arch): + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + model_info.check_transformers_version(on_fail="skip") + # Ensure all model classes can be imported successfully model_cls, _ = ModelRegistry.resolve_model_cls(model_arch) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 8456a463adeeb..b8524ed83026b 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -16,7 +16,8 @@ NUM_PROMPTS = [10] DEFAULT_SERVER_ARGS: List[str] = [ - "--worker-use-ray", + "--distributed-executor-backend", + "ray", "--gpu-memory-utilization", "0.85", "--swap-space", diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 9e58ed4cfde93..13f820d013e2a 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -7,12 +7,16 @@ from vllm.config import ModelConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.processing import (PlaceholderInfo, PromptReplacement, +# yapf conflicts with isort for this block +# yapf: disable +from vllm.multimodal.processing import (PlaceholderFeaturesInfo, + PromptReplacement, find_mm_placeholders, find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) +# yapf: enable from vllm.multimodal.profiling import MultiModalProfiler from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer @@ -433,19 +437,19 @@ def test_find_replace_tokens( [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], { "pattern_1": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=0, start_idx=6, - replacement=[32000, 32000], + tokens=[32000, 32000], ), ], "pattern_4": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_4", item_idx=0, start_idx=3, - replacement=[32000], + tokens=[32000], ), ], } @@ -455,25 +459,25 @@ def test_find_replace_tokens( [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=0, start_idx=1, - replacement=[32000, 32000], + tokens=[32000, 32000], ), - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=1, start_idx=5, - replacement=[32000, 32000], + tokens=[32000, 32000], ), ], "pattern_3": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_3", item_idx=0, start_idx=7, - replacement=[1550, 918, 1550], + tokens=[1550, 918, 1550], ), ], # No match for pattern_4 as it has lower priority than pattern_1 @@ -483,33 +487,33 @@ def test_find_replace_tokens( [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], { "pattern_1": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=0, start_idx=1, - replacement=[32000, 32000], + tokens=[32000, 32000], ), - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_1", item_idx=1, start_idx=3, - replacement=[32000, 32000], + tokens=[32000, 32000], ), ], "pattern_4": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_4", item_idx=0, start_idx=5, - replacement=[32000], + tokens=[32000], ), ], "pattern_3": [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality="pattern_3", item_idx=0, start_idx=6, - replacement=[1550, 918, 1550], + tokens=[1550, 918, 1550], ), ], } diff --git a/tests/neuron/test_prefix_prefill.py 
b/tests/neuron/test_prefix_prefill.py new file mode 100644 index 0000000000000..77b707a737118 --- /dev/null +++ b/tests/neuron/test_prefix_prefill.py @@ -0,0 +1,456 @@ +import random +from typing import Optional + +import pytest +import torch +import torch.nn.functional as F + + +class BlockDiagonalCausalFromBottomRightMask: + + @staticmethod + def _from_seqlens(query_lens, seq_lens, block_size=None): + from torch import logical_and, logical_or + + contexted = block_size is None + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + n_queries = sum(query_lens) + num_seqs = len(query_lens) + if contexted: + key_lens_blockaligned = seq_lens + else: + n_blocks_per_seq = (context_lens + block_size - 1) // block_size + offset_per_seq = n_blocks_per_seq * block_size + key_lens_blockaligned = offset_per_seq[:num_seqs].tolist() + n_keys = sum(key_lens_blockaligned) + + a = (torch.arange(n_queries).reshape(n_queries, + 1).expand(n_queries, n_keys)) + b = torch.arange(n_keys).reshape(1, n_keys).expand(n_queries, n_keys) + q_cumsum = torch.tensor([0] + query_lens).cumsum(dim=0) + k_cumsum = torch.tensor([0] + key_lens_blockaligned).cumsum(dim=0) + + prior_mask = torch.zeros(n_queries, n_keys) + new_masks: list[torch.Tensor] = [] + for seq_id in range(num_seqs): + ri = q_cumsum[seq_id] + ci = k_cumsum[seq_id] + nr = query_lens[seq_id] + + if contexted: + nc = seq_lens[seq_id] + a_offset = ci + nc - ri - nr + new_mask = (a + a_offset) >= b + else: + nc = context_lens[seq_id] + a_offset = ci + nc - 1 + new_mask = a_offset >= b + + left_mask = b >= ci + top_mask = a >= ri + bottom_mask = a < (ri + nr) + + new_mask = logical_and( + logical_and(logical_and(new_mask, left_mask), top_mask), + bottom_mask, + ) + prior_mask = logical_or(prior_mask, new_mask) + new_masks = new_masks + [new_mask] + return prior_mask + + @staticmethod + def from_seqlens(query_lens, seq_lens, block_size=None): + contexted = block_size is None + if contexted: + prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, seq_lens) + active_mask = None + else: + prior_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, seq_lens, block_size) + active_mask = BlockDiagonalCausalFromBottomRightMask._from_seqlens( + query_lens, query_lens) + return prior_mask, active_mask + + +def ref_softmax(x: torch.Tensor, + dim: int, + mixed_precision=False, + return_max_reduce=False): + max_value = torch.amax(x, dim=dim, keepdims=True) + exp = torch.exp(x - max_value) + if mixed_precision: + sum_value = torch.sum(exp.astype(torch.float32), + dim=dim, + keepdims=True).astype(x.dtype) + else: + sum_value = torch.sum(exp, dim=dim, keepdims=True) + if return_max_reduce: + return exp / sum_value, max_value, torch.reciprocal(sum_value) + return exp / sum_value + + +def ref_masked_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + scale: float, + attn_mask: Optional[torch.Tensor] = None, + return_max_reduce: Optional[bool] = False, +) -> torch.Tensor: + scaled_qk = scale * torch.einsum("qhd,khd->hqk", query, key).float() + if attn_mask is not None: + masked_score = scaled_qk + attn_mask.float() + if return_max_reduce: + norm_score, cached_max, cached_sum_reciprocal = ref_softmax( + masked_score, dim=-1, return_max_reduce=True) + else: + norm_score = ref_softmax(masked_score, dim=-1) + out = torch.einsum("hqk,khd->qhd", norm_score, value) + if return_max_reduce: + return ( + out, + cached_max, + cached_sum_reciprocal, + norm_score, + masked_score, + scaled_qk, + ) + else: 
+ return out + + +def ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_kv_heads, + num_heads, + num_queries_per_kv, + return_max_reduce=False, +): + scale = float(1.0 / (head_size**0.5)) + if num_queries_per_kv > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) + value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) + + attn_mask, _ = BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens) + + # convert binary mask to -inf values + attn_mask = torch.logical_not(attn_mask) + attn_mask = attn_mask.float() * -30000 + + output, cached_max, cached_sum_reciprocal, lse, masked_score, scaled_qk = ( + ref_masked_attention( + query, + key, + value, + scale, + attn_mask, + return_max_reduce=return_max_reduce, + )) + + output = output.unsqueeze(1) + if return_max_reduce: + return ( + output, + cached_max, + cached_sum_reciprocal, + lse, + masked_score, + scaled_qk, + ) + else: + return output + + +@pytest.mark.parametrize( + "num_heads,num_queries_per_kv,head_size,mixed_precision", + [ + (4, 2, 8, False), + (4, 2, 8, True), + (32, 8, 64, True), + ], +) +@torch.inference_mode() +def test_contexted_kv_attention( + num_heads: int, + num_queries_per_kv: int, + head_size: int, + mixed_precision: bool, +) -> None: + import os + + import torch_xla.core.xla_model as xm + + from vllm.attention.ops.nki_flash_attn import flash_attn_varlen_nkifunc + + device = xm.xla_device() + + os.environ["NEURON_CC_FLAGS"] = ( + " --model-type=transformer -O1 " + " --internal-hlo2tensorizer-options='--verify-hlo' ") + + random.seed(0) + torch.manual_seed(0) + torch.set_printoptions(sci_mode=False) + + min_ctx_len = 2 + max_ctx_len = 64 + min_query_len = 2 + max_query_len = 64 + prefill_batch_size = 2 + decode_batch_size = 6 + batch_size = prefill_batch_size + decode_batch_size + block_size = 32 + max_model_len = (max_query_len + max_ctx_len) * 4 + + max_block_per_request = max_model_len // block_size + dtype = torch.float32 + cache_size = (batch_size * max_block_per_request) + 2 + ctx_lens = [ + random.randint(min_ctx_len, max_ctx_len) + for _ in range(prefill_batch_size) + ] + [ + random.randint(min_ctx_len, max_ctx_len) + for _ in range(decode_batch_size) + ] + query_lens = [ + random.randint(min_query_len, max_query_len) + for _ in range(prefill_batch_size) + ] + [1 for _ in range(decode_batch_size)] + seq_lens = [a + b for a, b in zip(query_lens, ctx_lens)] + num_kv_heads = num_heads // num_queries_per_kv + + num_tokens = sum(query_lens) + query = torch.empty(num_tokens, num_heads, head_size, dtype=dtype) + query.uniform_(-1, 1) + torch.empty(num_tokens, num_heads, head_size, dtype=dtype) + + kv = torch.empty(sum(seq_lens), 2, num_kv_heads, head_size, dtype=dtype) + kv.uniform_(-1, 1) + key, value = kv.unbind(dim=1) + + k_cache = torch.zeros(cache_size, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + v_cache = torch.zeros(cache_size, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + k = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) + v = torch.zeros(sum(query_lens), num_kv_heads, head_size, dtype=dtype) + values = torch.arange(0, cache_size, dtype=torch.long) + values = values[torch.randperm(cache_size)] + block_table = values[:batch_size * max_block_per_request].view( + batch_size, max_block_per_request) + torch.tensor(seq_lens, dtype=torch.long) + b_ctx_len = torch.tensor(ctx_lens, dtype=torch.long) + b_start_loc = torch.cumsum(torch.tensor([0] + query_lens[:-1], 
+ dtype=torch.long), + dim=0) + # copy kv to cache + b_seq_start_loc = torch.cumsum(torch.tensor([0] + seq_lens[:-1], + dtype=torch.long), + dim=0) + for i in range(batch_size): + for j in range(query_lens[i]): + k[b_start_loc[i] + j].copy_(key[b_seq_start_loc[i] + b_ctx_len[i] + + j]) + v[b_start_loc[i] + j].copy_(value[b_seq_start_loc[i] + + b_ctx_len[i] + j]) + cur_ctx = 0 + block_id = 0 + while cur_ctx < b_ctx_len[i]: + start_loc = b_seq_start_loc[i] + cur_ctx + if cur_ctx + block_size > b_ctx_len[i]: + end_loc = b_seq_start_loc[i] + b_ctx_len[i] + else: + end_loc = start_loc + block_size + start_slot = block_table[i, block_id] * block_size + end_slot = start_slot + end_loc - start_loc + k_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + key[start_loc:end_loc]) + v_cache.view(-1, num_kv_heads, + head_size)[start_slot:end_slot].copy_( + value[start_loc:end_loc]) + cur_ctx += block_size + block_id += 1 + + ( + output_ref, + cached_max, + cached_sum_reciprocal, + lse, + masked_score, + scaled_qk, + ) = ref_context_attention( + query, + key, + value, + query_lens, + seq_lens, + head_size, + num_kv_heads, + num_heads, + num_queries_per_kv, + return_max_reduce=True, + ) + + # build neuron program + return_debug_tensors = False + B_P_SIZE = 128 + LARGE_TILE_SZ = 2048 + max_num_queries = ( + (sum(query_lens) + block_size - 1) // block_size) * block_size + + def get_active_block_tables(block_tables, query_lens, seq_lens, block_size, + num_blocks): + context_lens = seq_lens - query_lens + blocks_per_seq = (context_lens + block_size - 1) // block_size + num_seqs = len(seq_lens) + active_blocks: list[int] = [] + for seq_id in range(num_seqs): + active_blocks = ( + active_blocks + + block_tables[seq_id, :blocks_per_seq[seq_id]].tolist()) + return F.pad( + torch.tensor(active_blocks), + (0, num_blocks - len(active_blocks)), + "constant", + 0, + ) + + def shift_bit_length(x): + return 1 << (x - 1).bit_length() + + # calculate input shapes + max_num_queries_shifted = shift_bit_length(max_num_queries) + max_num_queries_factor = B_P_SIZE // max_num_queries_shifted + max_num_queries_padded = max_num_queries_shifted * max_num_queries_factor + assert (max_num_queries_padded == B_P_SIZE + ), "invalid {max_num_queries_padded=}" + head_size_padded = B_P_SIZE + context_lens = torch.tensor(seq_lens) - torch.tensor(query_lens) + num_active_blocks_shifted = shift_bit_length( + ((context_lens + block_size - 1) // block_size).sum().item()) + num_active_blocks_factor = (LARGE_TILE_SZ // block_size // + num_active_blocks_shifted) + num_active_blocks = num_active_blocks_shifted * num_active_blocks_factor + assert (num_active_blocks * + block_size) == LARGE_TILE_SZ, "invalid {num_active_blocks=}" + context_kv_len = num_active_blocks * block_size + assert context_kv_len == LARGE_TILE_SZ, f"invalid {context_kv_len=}" + + # pad QKV tensors + pad_dims = ( + 0, + head_size_padded - query.shape[2], + 0, + 0, + 0, + max_num_queries_padded - query.shape[0], + ) + query = F.pad(query, pad_dims, "constant", 0) + k = F.pad(k, pad_dims, "constant", 0) + v = F.pad(v, pad_dims, "constant", 0) + k_cache = F.pad(k_cache, (0, head_size_padded - head_size), "constant", 0) + v_cache = F.pad(v_cache, (0, head_size_padded - head_size), "constant", 0) + + # permute QKV tensors + # query: (1, n_heads, d, seq_q) + # key: (1, n_kv_heads, d, seq_k) + # value: (1, n_kv_heads, seq_v, d) + query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous() + v = 
v.unsqueeze(0).permute(0, 2, 1, 3).contiguous() + + # transform block table + active_block_table = get_active_block_tables( + block_table, + torch.tensor(query_lens), + torch.tensor(seq_lens), + block_size, + num_active_blocks, + ) + + # Build attention masks + prior_mask, active_mask = ( + BlockDiagonalCausalFromBottomRightMask.from_seqlens( + query_lens, seq_lens, block_size=block_size)) + attn_mask = torch.concat( + [ + F.pad( + prior_mask, + ( + 0, + context_kv_len - prior_mask.shape[1], + 0, + B_P_SIZE - prior_mask.shape[0], + ), + "constant", + 0, + ).bool(), + F.pad( + active_mask, + ( + 0, + B_P_SIZE - active_mask.shape[1], + 0, + B_P_SIZE - active_mask.shape[0], + ), + "constant", + 0, + ).bool(), + ], + dim=1, + ) + + input_args = ( + query.to(device=device), + k.to(device=device), + v.to(device=device), + k_cache.to(device=device), + v_cache.to(device=device), + active_block_table.to(torch.int32).to(device=device), + attn_mask.to(device=device), + ) + input_kwargs = dict( + n_kv_head=num_kv_heads, + head_size=head_size, + mixed_precision=mixed_precision, + ) + + if return_debug_tensors: + output_nki, *debug_tensors = flash_attn_varlen_nkifunc( + *input_args, **input_kwargs) + else: + output_nki = flash_attn_varlen_nkifunc(*input_args, **input_kwargs) + debug_tensors = [] + + output_nki = torch.tensor(output_nki).cpu() + debug_tensors = [torch.tensor(dt).cpu() for dt in debug_tensors] + + num_actual_tokens = sum(query_lens) + print(f"{num_actual_tokens=}") + # - o: shape (bs, n_heads, seq_q, d) -> (bs, seq_q, n_heads, d) + output_nki = output_nki.permute( + 0, 2, 1, 3)[:, :, :, :head_size].cpu()[0, :num_actual_tokens, :, :] + output_ref_padded = F.pad( + output_ref, + (0, 0, 0, 0, 0, 0, 0, max_num_queries_padded - output_ref.shape[0]), + "constant", + 0, + ) + output_ref = output_ref_padded.transpose(0, 1)[0, :num_actual_tokens, :, :] + + torch.testing.assert_close(output_nki, output_ref, atol=1e-2, rtol=0) diff --git a/tests/quantization/test_awq.py b/tests/quantization/test_awq.py new file mode 100644 index 0000000000000..dfa0a924bf90a --- /dev/null +++ b/tests/quantization/test_awq.py @@ -0,0 +1,28 @@ +"""Test model set-up and inference for quantized HF models supported + on the HPU backend using AutoAWQ. + + Validating the configuration and printing results for manual checking. + + Run `pytest tests/quantization/test_awq.py`. 
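+
+ Roughly equivalent direct usage (an illustrative sketch only; the test
+ below drives the same path through the `vllm_runner` fixture):
+
+     from vllm import LLM, SamplingParams
+     llm = LLM(model="TheBloke/Llama-2-7B-Chat-AWQ",
+               dtype="bfloat16",
+               quantization="awq_hpu")
+     outputs = llm.generate(["The capital of France is"],
+                            SamplingParams(temperature=0.0, max_tokens=32))
+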
+""" + +import pytest + +from vllm.platforms import current_platform + +MODELS = [ + "TheBloke/Llama-2-7B-Chat-AWQ", +] +DTYPE = ["bfloat16"] + + +@pytest.mark.skipif(not current_platform.is_hpu(), + reason="only supports Intel HPU backend.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", DTYPE) +def test_awq(vllm_runner, model, dtype): + with vllm_runner(model, dtype=dtype, quantization='awq_hpu') as llm: + output = llm.generate_greedy(["The capital of France is"], + max_tokens=32) + assert output + print(output) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 92436889ecffe..1072697ecf5cc 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -30,50 +30,55 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - o_proj = layer.self_attn.o_proj - gate_up_proj = layer.mlp.gate_up_proj - down_proj = layer.mlp.down_proj - - # assert zp for symmetric and asymmetric cases - def zp_valid(zp: Optional[torch.Tensor]): - if is_symmetric: - return zp is None - - return zp is not None and zp.dtype is torch.int32 - - assert zp_valid(qkv_proj.input_zero_point) - assert zp_valid(o_proj.input_zero_point) - assert zp_valid(gate_up_proj.input_zero_point) - assert zp_valid(down_proj.input_zero_point) - - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(gate_up_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(down_proj.quant_method, - CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.is_static_input_scheme - expected_type = torch.int8 - - assert qkv_proj.weight.dtype is expected_type - assert o_proj.weight.dtype is expected_type - assert gate_up_proj.weight.dtype is expected_type - - if qkv_proj.scheme.strategy == "tensor": - # Make sure it is a channelwise buffer - # After running process_weights_after_loading - assert len(qkv_proj.weight_scale.shape) == 2 - assert qkv_proj.weight_scale.shape[0] == shape_0 - assert qkv_proj.weight_scale.shape[1] == 1 - assert qkv_proj.weight_scale.dtype is torch.float32 - assert qkv_proj.input_scale.dtype is torch.float32 + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + # assert zp for symmetric and asymmetric cases + def zp_valid(zp: Optional[torch.Tensor]): + if is_symmetric: + return zp is None + + return zp is not None and zp.dtype is torch.int32 + + assert zp_valid(qkv_proj.input_zero_point) + assert zp_valid(o_proj.input_zero_point) + assert zp_valid(gate_up_proj.input_zero_point) + assert zp_valid(down_proj.input_zero_point) + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(o_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(gate_up_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(down_proj.quant_method, + 
CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.is_static_input_scheme + expected_type = torch.int8 + + assert qkv_proj.weight.dtype is expected_type + assert o_proj.weight.dtype is expected_type + assert gate_up_proj.weight.dtype is expected_type + + if qkv_proj.scheme.strategy == "tensor": + # Make sure it is a channelwise buffer + # After running process_weights_after_loading + assert len(qkv_proj.weight_scale.shape) == 2 + assert qkv_proj.weight_scale.shape[0] == shape_0 + assert qkv_proj.weight_scale.shape[1] == 1 + assert qkv_proj.weight_scale.dtype is torch.float32 + assert qkv_proj.input_scale.dtype is torch.float32 + + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -129,16 +134,20 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) - assert not qkv_proj.scheme.is_static_input_scheme - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.weight.dtype is torch.int8 + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8Int8) + assert not qkv_proj.scheme.is_static_input_scheme + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.weight.dtype is torch.int8 + + llm.apply_model(check_model) output = llm.generate_greedy(["Hello my name is"], max_tokens=20) assert output @@ -152,19 +161,24 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): model, strategy, group, pack_factor = wNa16_args with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16) - assert qkv_proj.scheme.strategy == strategy - assert qkv_proj.scheme.group_size == (-1 if group is None else group) + assert qkv_proj.scheme.strategy == strategy + assert qkv_proj.scheme.group_size == (-1 + if group is None else group) - assert qkv_proj.weight_packed.dtype is torch.int32 - assert qkv_proj.weight_scale.dtype is torch.float16 - assert qkv_proj.scheme.pack_factor == pack_factor + assert qkv_proj.weight_packed.dtype is torch.int32 + assert qkv_proj.weight_scale.dtype is torch.float16 + assert qkv_proj.scheme.pack_factor == pack_factor + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -173,14 +187,18 @@ def 
test_compressed_tensors_wNa16(vllm_runner, wNa16_args): def test_compressed_tensors_w4a16_marlin24(vllm_runner): model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) + assert qkv_proj.weight_packed.dtype is torch.int32 - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Sparse24) - assert qkv_proj.weight_packed.dtype is torch.int32 + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -189,23 +207,27 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner): def test_compressed_tensors_fp8(vllm_runner): model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance( - qkv_proj.scheme, - (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) + qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.input_scale.dtype is torch.float32 + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance( + qkv_proj.scheme, + (CompressedTensorsW8A8Fp8, CompressedTensorsW8A16Fp8)) - if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - assert qkv_proj.weight_scale.dtype is torch.float32 - assert len(qkv_proj.weight_scale.shape) == 0 + assert qkv_proj.input_scale.dtype is torch.float32 + + if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + assert qkv_proj.weight_scale.dtype is torch.float32 + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output @@ -248,12 +270,15 @@ def _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy): def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.float8_e4m3fn + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) @@ -273,40 +298,49 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4): def 
test_compressed_tensors_2of4_quant_int8(vllm_runner, args_2of4): model, weight_strategy, input_strategy = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj - assert qkv_proj.scheme.weights_dtype == torch.int8 - _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert qkv_proj.scheme.weights_dtype == torch.int8 + _test_2of4_quant_models(qkv_proj, weight_strategy, input_strategy) + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) assert output -@pytest.mark.skipif(not sparse_cutlass_supported(), - reason="Sparse FP8 is not yet supported on this GPU type.") +@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.") +@pytest.mark.skipif( + not sparse_cutlass_supported(), + reason="2of4 Sparse is not yet supported on this GPU type.") @pytest.mark.parametrize( "args_2of4", [("nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor")]) def test_compressed_tensors_2of4_sparse(vllm_runner, args_2of4): model = args_2of4 with vllm_runner(model) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - - qkv_proj = layer.self_attn.qkv_proj - assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) - assert isinstance(qkv_proj.scheme, CompressedTensors24) - - assert qkv_proj.scheme.weight_quant is None - assert qkv_proj.scheme.input_quant is None - assert not qkv_proj.scheme.quantized - assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map - sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 - assert sparsity_map.get("Linear").format == "dense" - assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + assert isinstance(qkv_proj.quant_method, + CompressedTensorsLinearMethod) + assert isinstance(qkv_proj.scheme, CompressedTensors24) + + assert qkv_proj.scheme.weight_quant is None + assert qkv_proj.scheme.input_quant is None + assert not qkv_proj.scheme.quantized + assert qkv_proj.quant_method.quantization_config.sparsity_scheme_map + sparsity_map = qkv_proj.quant_method.quantization_config.sparsity_scheme_map # noqa: E501 + assert sparsity_map.get("Linear").format == "dense" + assert sparsity_map.get("Linear").sparsity_structure == "2:4" + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) print(output) diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index a0c1d7e24c503..4bff734746297 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -49,13 +49,17 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): with vllm_runner(model_id, kv_cache_dtype="fp8") as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - attn = model.model.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - # NOTE: it is valid for scales to be 1.0 (default value), but we know - # these checkpoints have scales < 1.0 - assert 0.0 < attn._k_scale < 1.0 - assert 0.0 < 
attn._v_scale < 1.0 + def check_model(model): + attn = model.model.layers[0].self_attn.attn + + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + + # NOTE: it is valid for scales to be 1.0 (default value), but + # we know these checkpoints have scales < 1.0 + assert 0.0 < attn._k_scale < 1.0 + assert 0.0 < attn._v_scale < 1.0 + + llm.apply_model(check_model) # note: this does not test accuracy, just that we can run through # see lm-eval tests for accuracy @@ -77,22 +81,24 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, quantization="fp8", kv_cache_dtype=kv_cache_dtype) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - fc1 = model.model.decoder.layers[0].fc1 - assert isinstance(fc1.quant_method, Fp8LinearMethod) - if kv_cache_dtype == "fp8": - attn = model.model.decoder.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - assert attn._k_scale == 1.0 - assert attn._v_scale == 1.0 - - if current_platform.has_device_capability(89) and not force_marlin: - # For GPUs with hardware support, we keep weights in fp8 - assert fc1.weight.dtype == torch.float8_e4m3fn - else: - # For GPUs without hardware support, we pack the fp8 weights - # for weight-only quantization using Marlin kernels - assert fc1.weight.dtype == torch.int32 + def check_model(model): + fc1 = model.model.decoder.layers[0].fc1 + assert isinstance(fc1.quant_method, Fp8LinearMethod) + if kv_cache_dtype == "fp8": + attn = model.model.decoder.layers[0].self_attn.attn + assert isinstance(attn.quant_method, Fp8KVCacheMethod) + assert attn._k_scale == 1.0 + assert attn._v_scale == 1.0 + + if current_platform.has_device_capability(89) and not force_marlin: + # For GPUs with hardware support, we keep weights in fp8 + assert fc1.weight.dtype == torch.float8_e4m3fn + else: + # For GPUs without hardware support, we pack the fp8 weights + # for weight-only quantization using Marlin kernels + assert fc1.weight.dtype == torch.int32 + + llm.apply_model(check_model) @pytest.mark.skipif(not is_quant_method_supported("fp8"), diff --git a/tests/quantization/test_gptq.py b/tests/quantization/test_gptq.py new file mode 100644 index 0000000000000..6a92bfe46a5b8 --- /dev/null +++ b/tests/quantization/test_gptq.py @@ -0,0 +1,28 @@ +"""Test model set-up and inference for quantized HF models supported + on the HPU backend using AutoGPTQ. + + Validating the configuration and printing results for manual checking. + + Run `pytest tests/quantization/test_gptq.py`. 
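+
+ To sanity-check that the method name resolves in a given build before
+ running the full test, a minimal sketch (assumes `gptq_hpu` is registered
+ with the quantization registry):
+
+     from vllm.model_executor.layers.quantization import (
+         get_quantization_config)
+     config_cls = get_quantization_config("gptq_hpu")
+     print(config_cls, config_cls.get_min_capability())
+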
+""" + +import pytest + +from vllm.platforms import current_platform + +MODELS = [ + "TheBloke/Llama-2-7B-Chat-GPTQ", +] +DTYPE = ["bfloat16"] + + +@pytest.mark.skipif(not current_platform.is_hpu(), + reason="only supports Intel HPU backend.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", DTYPE) +def test_gptq(vllm_runner, model, dtype): + with vllm_runner(model, dtype=dtype, quantization='gptq_hpu') as llm: + output = llm.generate_greedy(["The capital of France is"], + max_tokens=32) + assert output + print(output) \ No newline at end of file diff --git a/tests/quantization/test_lm_head.py b/tests/quantization/test_lm_head.py index ad526a4065101..fa2d9645ea47f 100644 --- a/tests/quantization/test_lm_head.py +++ b/tests/quantization/test_lm_head.py @@ -28,20 +28,23 @@ def test_lm_head( model_lm_head_quant: Tuple[str, bool], ) -> None: model, lm_head_quantized = model_lm_head_quant - vllm_model = vllm_runner(model, dtype=torch.float16, max_model_len=2048) - - lm_head_layer = (vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model.lm_head) - - if lm_head_quantized: - assert isinstance( - lm_head_layer.linear_method, - (GPTQLinearMethod, GPTQMarlinLinearMethod, MarlinLinearMethod)) - else: - assert isinstance(lm_head_layer.linear_method, - UnquantizedEmbeddingMethod) - - print( - vllm_model.generate_greedy(prompts=["Hello my name is"], - max_tokens=10)[0][1]) - del vllm_model + + with vllm_runner(model, dtype=torch.float16, + max_model_len=2048) as vllm_model: + + def check_model(model): + lm_head_layer = model.lm_head + + if lm_head_quantized: + assert isinstance(lm_head_layer.linear_method, + (GPTQLinearMethod, GPTQMarlinLinearMethod, + MarlinLinearMethod)) + else: + assert isinstance(lm_head_layer.linear_method, + UnquantizedEmbeddingMethod) + + vllm_model.apply_model(check_model) + + print( + vllm_model.generate_greedy(prompts=["Hello my name is"], + max_tokens=10)[0][1]) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 27493a682b746..11382ad708faa 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -12,19 +12,22 @@ def test_quark_fp8(vllm_runner): model_path = "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test" with vllm_runner(model_path) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 - layer = model.model.layers[0] - qkv_proj = layer.self_attn.qkv_proj + def check_model(model): + layer = model.model.layers[0] - assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) - assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + qkv_proj = layer.self_attn.qkv_proj - if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): - assert len(qkv_proj.input_scale.shape) == 0 - assert qkv_proj.weight.dtype is torch.float8_e4m3fn - #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz - assert len(qkv_proj.weight_scale.shape) == 0 + assert isinstance(qkv_proj.quant_method, QuarkLinearMethod) + assert isinstance(qkv_proj.scheme, QuarkW8A8Fp8) + + if isinstance(qkv_proj.scheme, QuarkW8A8Fp8): + assert len(qkv_proj.input_scale.shape) == 0 + assert qkv_proj.weight.dtype is torch.float8_e4m3fn + #assert qkv_proj.weight.dtype is torch.float8_e4m3fnuz + assert len(qkv_proj.weight_scale.shape) == 0 + + llm.apply_model(check_model) output = llm.generate_greedy("Hello my name is", max_tokens=20) assert output diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py new file mode 
100644 index 0000000000000..8e7f44a399ddf --- /dev/null +++ b/tests/quantization/test_register_quantization_config.py @@ -0,0 +1,117 @@ +"""Tests register custom quantization config. + +See https://github.com/vllm-project/vllm/issues/11926 for more details. + +Run `pytest tests/quantization/test_register_quantization_config.py`. +""" +from typing import Any, Dict, List, Optional + +import pytest +import torch +import torch.nn.functional as F + +from vllm.model_executor.layers.linear import LinearBase # noqa: E501 +from vllm.model_executor.layers.linear import UnquantizedLinearMethod +from vllm.model_executor.layers.quantization import ( + get_quantization_config, register_quantization_config) +from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 + QuantizationConfig) + + +class FakeQuantLinearMethod(UnquantizedLinearMethod): + """Fake quantization linear method for per-token dynamic quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization method.""" + super().__init__() + self.num_bits = num_bits + + def apply(self, + layer: "torch.nn.Module", + x: "torch.Tensor", + bias: Optional["torch.Tensor"] = None) -> "torch.Tensor": + """Perform fake quantization before the linear layer.""" + + # Calculate the scales dynamically + max_val = torch.amax(x, dim=(0, -1), keepdims=True) + min_val = torch.amin(x, dim=(0, -1), keepdims=True) + scales = (max_val - min_val) / (2**self.num_bits - 1) + + # Fake quantize the input + quant_x = torch.clamp(torch.round(x / scales), -2**(self.num_bits - 1), + 2**(self.num_bits - 1) - 1) + dequant_x = quant_x * scales + + return F.linear(dequant_x, layer.weight, bias) + + +@register_quantization_config("custom_quant") +class CustomQuantConfig(QuantizationConfig): + """Custom quantization config for per-token dynamic fake quantization.""" + + def __init__(self, num_bits: int = 8) -> None: + """Initialize the quantization config.""" + self.num_bits = num_bits + + def get_name(self) -> str: + """Name of the quantization method.""" + return "custom_quant" + + def get_supported_act_dtypes(self) -> List["torch.dtype"]: + """List of supported activation dtypes.""" + return [torch.float16, torch.bfloat16] + + @classmethod + def get_min_capability(cls) -> int: + """Minimum GPU capability to support the quantization method.""" + return -1 + + @staticmethod + def get_config_filenames() -> List[str]: + """List of filenames to search for in the model directory.""" + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "CustomQuantConfig": + """Create a config class from the model's quantization config.""" + return CustomQuantConfig(num_bits=config.get("num_bits", 8)) + + def get_quant_method(self, layer: "torch.nn.Module", + prefix: str) -> Optional["FakeQuantLinearMethod"]: + """Get the quantize method to use for the quantized layer.""" + if isinstance(layer, LinearBase): + return FakeQuantLinearMethod(num_bits=self.num_bits) + return None + + +def test_register_quantization_config(): + """Test register custom quantization config.""" + + # The quantization method `custom_quant` should be registered. + assert get_quantization_config("custom_quant") == CustomQuantConfig + + # The quantization method `custom_quant` is already exists, + # should raise an error. 
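+    # For contrast, registering under a fresh, unused name is expected to
+    # succeed (illustrative sketch only, not exercised by this test):
+    #
+    #     @register_quantization_config("another_custom_quant")
+    #     class AnotherQuantConfig(CustomQuantConfig):
+    #         pass
+    #
+    # Re-using the existing name, as below, must raise: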
+ with pytest.raises(ValueError): + register_quantization_config("custom_quant")(CustomQuantConfig) + + +@pytest.mark.parametrize(argnames="model", + argvalues=[ + "meta-llama/Meta-Llama-3-8B-Instruct", + ]) +def test_custom_quant(vllm_runner, model): + """Test infer with the custom quantization method.""" + with vllm_runner(model_name=model, + quantization="custom_quant", + enforce_eager=True) as llm: + + model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + layer = model.model.layers[0] + qkv_proj = layer.self_attn.qkv_proj + + # Check the quantization method is FakeQuantLinearMethod + assert isinstance(qkv_proj.quant_method, FakeQuantLinearMethod) + + output = llm.generate_greedy("Hello my name is", max_tokens=20) + assert output diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py index 34cfb9c5bbd71..3a91698ecd2c6 100644 --- a/tests/samplers/test_rejection_sampler.py +++ b/tests/samplers/test_rejection_sampler.py @@ -23,16 +23,17 @@ def mock_causal_accepted_tensor( """ batch_size = last_accepted_indices.shape[0] - accepted = (torch.arange(k).expand(batch_size, k) <= - last_accepted_indices.unsqueeze(-1).broadcast_to( + accepted = (torch.arange(k).expand(batch_size, k) + <= last_accepted_indices.unsqueeze(-1).broadcast_to( batch_size, k)) # Sprinkle accepted values after the contiguous initial accepted values. # This replicates the behavior of rejection sampling, which may "accept" # a token that cannot be accepted because of causality. - sprinkle_candidates = ( - torch.arange(k).expand(batch_size, k) > - last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + 1) + sprinkle_candidates = (torch.arange(k).expand( + batch_size, + k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + + 1) sprinkle = torch.rand(batch_size, k) > 0.5 accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] return accepted @@ -445,8 +446,8 @@ def test_rejection_sampling_approximates_target_distribution( distance_wrt_reference) expected_improvement_multiplier = 20 - assert (relative_change_in_distance_wrt_target > - relative_change_in_distance_wrt_reference * + assert (relative_change_in_distance_wrt_target + > relative_change_in_distance_wrt_reference * expected_improvement_multiplier) diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index 88067f19c8f07..bf1ee6c397838 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -31,7 +31,7 @@ def test_random_sample_with_seed( sampling_params = SamplingParams( # Parameters to ensure sufficient randomness - temperature=2.0, + temperature=3.0, top_p=min(random.random() + 0.3, 1), top_k=random.randint(5, 20), n=random.randint(1, 10), @@ -75,3 +75,8 @@ def test_random_sample_with_seed( # verify requests with the same seed match assert outputs[1] == outputs[4] assert outputs[2] == outputs[5] + + # verify generations within the same parallel sampling group differ + for output in outputs: + for sub_output_a, sub_output_b in combinations(output, 2): + assert sub_output_a != sub_output_b diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index b9cb3858c0068..5cb982a0811c7 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -2,6 +2,7 @@ from typing import List, Optional, Sequence, Tuple, Union import pytest +import torch from vllm import LLM, SamplingParams from vllm.distributed import 
cleanup_dist_env_and_memory @@ -154,6 +155,8 @@ def _check_logprobs_when_output_disabled( spec_pos_logprob) = next(iter(spec_pos_logprobs.items())) assert spec_pos_logprob.rank == -1 assert spec_pos_logprob.logprob == 0.0 + if isinstance(spec_pos_logprob_token_id, torch.Tensor): + spec_pos_logprob_token_id = spec_pos_logprob_token_id.item() assert spec_pos_logprob_token_id in baseline_pos_logprobs @@ -244,7 +247,8 @@ def run_equality_correctness_test_tp(model, batch_size: int, max_output_len: int, seed: int = 0, - temperature: float = 0.0): + temperature: float = 0.0, + logprobs: Optional[int] = None): """Helper method that compares the outputs of both the baseline LLM and the test LLM. It asserts greedy equality, e.g. that the outputs are exactly the same when temperature is zero. @@ -257,7 +261,6 @@ def run_equality_correctness_test_tp(model, results = [] prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))] - for args, env in ((arg1, env1), (arg2, env2)): with RemoteOpenAIServer(model, args, @@ -269,12 +272,14 @@ def run_equality_correctness_test_tp(model, prompt=prompts, max_tokens=max_output_len, seed=seed, - temperature=temperature) + temperature=temperature, + logprobs=logprobs) results.append({ "test": "seeded_sampling", "text": [choice.text for choice in completion.choices], + "logprobs": [choice.logprobs for choice in completion.choices], "finish_reason": [choice.finish_reason for choice in completion.choices], "usage": @@ -284,7 +289,15 @@ def run_equality_correctness_test_tp(model, n = len(results) // 2 arg1_results = results[:n] arg2_results = results[n:] + # Separate logprobs to avoid asserting exact equality. + arg1_logprobs = [r.pop("logprobs") for r in arg1_results] + arg2_logprobs = [r.pop("logprobs") for r in arg2_results] + for arg1_result, arg2_result in zip(arg1_results, arg2_results): assert arg1_result == arg2_result, ( f"Results for {model=} are not the same with {arg1=} and {arg2=}. " f"{arg1_result=} != {arg2_result=}") + if logprobs: + for logs1, logs2 in zip(arg1_logprobs, arg2_logprobs): + for l1, l2 in zip(logs1, logs2): + assert l1.tokens == l2.tokens diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index 02cba92795142..7001ee4c007fe 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -2,6 +2,8 @@ tensor parallelism. """ +from typing import Optional + import pytest import torch @@ -154,15 +156,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, "--speculative-draft-tensor-parallel-size", "1", ])]) +@pytest.mark.parametrize("logprobs", [None, 2]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize("seed", [1]) def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, + logprobs: Optional[int], batch_size: int, seed: int): """Verify spec decode works well with same and different TP size for the draft model with chunked prefill. 
""" + if logprobs: + test_llm_kwargs.extend( + ["--disable_logprobs_during_spec_decoding", "False"]) run_equality_correctness_test_tp(model, common_llm_kwargs, per_test_common_llm_kwargs, @@ -171,4 +178,5 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, batch_size, max_output_len=32, seed=seed, - temperature=0.0) + temperature=0.0, + logprobs=logprobs) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py index 4cfca8b78e79b..1a543606cb3f3 100644 --- a/tests/spec_decode/e2e/test_logprobs.py +++ b/tests/spec_decode/e2e/test_logprobs.py @@ -4,26 +4,27 @@ from vllm import SamplingParams +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @pytest.mark.parametrize( "common_llm_kwargs", [{ - "model_name": "JackFram/llama-68m", + "model_name": "JackFram/llama-160m", # Skip cuda graph recording for fast test. - "enforce_eager": True, + "enforce_eager": True }]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": False, }, { - "speculative_model": "JackFram/llama-160m", + "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 3, "disable_logprobs_during_spec_decoding": True, }]) @@ -36,12 +37,15 @@ ]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 12]) def test_logprobs_equality(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): - """Verify output logprobs are equal with and without speculative decoding. + seed: int, logprobs: int, prefill_chunk_size: int): + """Verify output logprobs are equal with and without speculative decoding, + as well as with and without chunked prefill. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py index b8965606b3d0e..dbcbc0db10881 100644 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ b/tests/spec_decode/e2e/test_medusa_correctness.py @@ -21,6 +21,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -67,12 +68,14 @@ ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -119,12 +122,15 @@ def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): + seed: int, logprobs: int, + prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -167,12 +173,14 @@ def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, ]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_cuda_graph( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with cuda graph enabled and different batch sizes.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -217,13 +225,15 @@ def test_medusa_e2e_greedy_correctness_cuda_graph( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality, even when some sequences are preempted mid- generation. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -267,13 +277,15 @@ def test_medusa_e2e_greedy_correctness_with_preemption( 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -313,14 +325,17 @@ def test_medusa_different_k(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, + prefill_chunk_size: int): """Verify that medusa speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -361,12 +376,14 @@ def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, 32, ]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, seed: int, prefill_chunk_size: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py index 183ff2f5db274..1fa1104f5d3a8 100644 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ b/tests/spec_decode/e2e/test_mlp_correctness.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test # main model @@ -66,14 +67,16 @@ @pytest.mark.parametrize("output_len", [ 128, ]) -@pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("batch_size", [4, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + seed: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -116,12 +119,19 @@ def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("batch_size", [8]) @pytest.mark.parametrize("seed", [1]) @pytest.mark.parametrize("logprobs", [1, 6]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, seed: int, - logprobs: int): + logprobs: int, prefill_chunk_size: int): """Verify greedy equality with different batch size.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + # NOTE Test is sensitive enough st if we don't enable chunked prefill + # scheduling on baseline too, we get slightly different logprobs, ending + # up sampling different tokens at the tail (ie top tokens don't change). + # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected? 
+ maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -162,12 +172,15 @@ def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [2048]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): + batch_size: int, output_len: int, + prefill_chunk_size: int, seed: int): """Verify acceptance rate with different batch size and large output length.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -204,13 +217,17 @@ def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, @pytest.mark.parametrize("output_len", [64]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize("temperature", [1.0]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - temperature: float, seed: int): + temperature: float, + prefill_chunk_size: int, seed: int): """Verify seeded runs produce the same output.""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) + maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -266,14 +283,16 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, 128, ]) @pytest.mark.parametrize("batch_size", [4]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_e2e_greedy_correctness_with_preemption( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality, even when some sequences are preempted mid- generation. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -317,12 +336,14 @@ def test_mlp_e2e_greedy_correctness_with_preemption( ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) def test_mlp_e2e_greedy_correctness_with_padding( vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality when the vocab dimension is padded """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) # Default pad_to is 64, test model has vocab_size of 32000 def patched_pad_vocab_size(vocab_size, pad_to=None): @@ -373,14 +394,16 @@ def patched_pad_vocab_size(vocab_size, pad_to=None): # Use smaller output len for fast test. 
32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_different_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, - output_len: int): + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode with different values of num_speculative_tokens. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -418,15 +441,21 @@ def test_mlp_different_k(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +# Speculative decoding is disabled when sequences reach decoding and the batch +# consists of single-token requests. Hence we set `max_num_seqs` +# >= `speculative_disable_by_batch_size` to test feature interaction. +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, seed: int, + test_llm_kwargs, batch_size: int, + prefill_chunk_size: int, seed: int, output_len: int): """Verify that mlp speculative decoding produces exact equality to without spec decode when speculation is disabled for large batch sizes. """ + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -460,13 +489,15 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, # Use smaller output len for fast test. 32, ]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): + output_len: int, prefill_chunk_size: int, seed: int): """Verify that speculative decoding generates the same output with batch expansion scorer and mqa scorer. 
""" + maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index a13cca41f99e5..05ad468dd8bc5 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -147,20 +147,20 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, }, ]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": False, - }, - { - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - }, -]) +@pytest.mark.parametrize("test_llm_kwargs", + [{ + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + "disable_logprobs_during_spec_decoding": False + }, { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 3, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, + "disable_logprobs_during_spec_decoding": False + }]) @pytest.mark.parametrize( "output_len", [ @@ -192,6 +192,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( batch_size, max_output_len=output_len, seed=seed, + prompt_logprobs=2, + logprobs=2, + disable_logprobs=False, temperature=0.0, ensure_all_accepted=ensure_all_accepted) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index e53d169a8fcc3..77f8b8998c8d3 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -26,6 +26,7 @@ import pytest +from ..utils import maybe_enable_chunked_prefill from .conftest import run_equality_correctness_test @@ -49,11 +50,13 @@ "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": False, }, { "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "speculative_disable_mqa_scorer": True, }, ]) @pytest.mark.parametrize("output_len", [ @@ -68,15 +71,7 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, batch_size: int, output_len: int, prefill_chunk_size: int, seed: int): """Verify greedy equality on a tiny model with different batch size.""" - if prefill_chunk_size > 0: - common_llm_kwargs.update( - **{ - "enable_chunked_prefill": True, - "max_num_batched_tokens": prefill_chunk_size, - "max_num_seqs": prefill_chunk_size - }) - else: - common_llm_kwargs["enable_chunked_prefill"] = False + maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index 0b1509d8b7785..5a093dea16d40 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -60,6 +60,7 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int, num_gpu_blocks = 2048 // block_size scorer_worker = create_worker(Worker, model_name, block_size, num_gpu_blocks, seed) + scorer_worker.model_runner.disable_logprobs = True # accessed by mqa_scorer 
scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor = True scorer_worker.model_runner.model.sampler.\ should_modify_greedy_probs_inplace = True diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index caf7a7e625b46..d8c3af4c1cd1e 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -754,6 +754,7 @@ def test_populate_seq_ids_with_bonus_tokens(): seq_group_metadata_list=seq_group_metadata_list, accepted_token_ids=accepted_token_ids, target_logprobs=target_token_logprobs, + prompt_logprobs=None, k=k, stage_times=(0, 0, 0)) # Verify that _seq_with_bonus_token_in_last_step contains the following: diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index a4bfa6b2f384b..1e1024dd75727 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -9,6 +9,7 @@ from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.utils import set_random_seed +from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, SequenceData, SequenceGroupMetadata, SequenceOutput) @@ -274,3 +275,18 @@ def create_batch(batch_size, prompts, num_gpu_blocks, block_size, final_prompt_lens, prev_output_tokens, seq_ids) return seq_group_metadata_list, prompts, prev_output_tokens + + +def maybe_enable_chunked_prefill(prefill_chunk_size, llm_kwargs): + if prefill_chunk_size > 0 and current_platform.is_hpu(): + import pytest + pytest.skip('Chunked prefill is not supported on HPU') + elif prefill_chunk_size > 0: + llm_kwargs.update( + **{ + "enable_chunked_prefill": True, + "max_num_batched_tokens": prefill_chunk_size, + "max_num_seqs": prefill_chunk_size + }) + else: + llm_kwargs["enable_chunked_prefill"] = False diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index bf409d2d97aa1..6e7eec1c6ab34 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -3,6 +3,7 @@ import os import pathlib import subprocess +from functools import partial from unittest.mock import MagicMock, patch import openai @@ -24,7 +25,6 @@ # yapf: enable from vllm.utils import PlaceholderModule, import_from_path -from ..conftest import VllmRunner from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip @@ -58,16 +58,6 @@ def is_curl_installed(): return False -def get_torch_model(vllm_runner: VllmRunner): - return vllm_runner \ - .model \ - .llm_engine \ - .model_executor \ - .driver_worker \ - .model_runner \ - .model - - def write_keyfile(keyfile_path: str): encryption_params = EncryptionParams.random() pathlib.Path(keyfile_path).parent.mkdir(parents=True, exist_ok=True) @@ -121,8 +111,10 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - serialize_vllm_model(get_torch_model(vllm_model), - config_for_serializing) + + vllm_model.apply_model( + partial(serialize_vllm_model, + tensorizer_config=config_for_serializing)) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) @@ -175,8 +167,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - 
serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) with vllm_runner( model_ref, @@ -215,8 +209,10 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): with vllm_runner(model_ref, ) as vllm_model: model_path = tmp_path / (model_ref + ".tensors") - serialize_vllm_model(get_torch_model(vllm_model), - TensorizerConfig(tensorizer_uri=model_path)) + vllm_model.apply_model( + partial( + serialize_vllm_model, + tensorizer_config=TensorizerConfig(tensorizer_uri=model_path))) model_loader_extra_config = { "tensorizer_uri": str(model_path), @@ -337,7 +333,9 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref) as vllm_model: outputs = vllm_model.generate(prompts, sampling_params) - serialize_vllm_model(get_torch_model(vllm_model), config) + + vllm_model.apply_model( + partial(serialize_vllm_model, tensorizer_config=config)) assert is_vllm_tensorized(config) diff --git a/tests/test_config.py b/tests/test_config.py index 4518adfc31bfc..ec366b93d6a37 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -281,3 +281,73 @@ def test_uses_mrope(model_id, uses_mrope): ) assert config.uses_mrope == uses_mrope + + +def test_generation_config_loading(): + model_id = "Qwen/Qwen2.5-1.5B-Instruct" + + # When set generation_config to None, the default generation config + # will not be loaded. + model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config=None) + assert model_config.get_diff_sampling_param() == {} + + # When set generation_config to "auto", the default generation config + # should be loaded. + model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config="auto") + + correct_generation_config = { + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_p": 0.8, + "top_k": 20, + } + + assert model_config.get_diff_sampling_param() == correct_generation_config + + # The generation config could be overridden by the user. + override_generation_config = {"temperature": 0.5, "top_k": 5} + + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config="auto", + override_generation_config=override_generation_config) + + override_result = correct_generation_config.copy() + override_result.update(override_generation_config) + + assert model_config.get_diff_sampling_param() == override_result + + # When generation_config is set to None and override_generation_config + # is set, the override_generation_config should be used directly. 
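The test_generation_config_loading cases in this hunk pin down the precedence rules: `generation_config="auto"` pulls the model's own defaults, `override_generation_config` is applied on top as a plain dict update, and with `generation_config=None` only the override survives. A standalone recap of that dict arithmetic, illustrative only (not part of the patch), using the values from the test:

model_defaults = {"repetition_penalty": 1.1, "temperature": 0.7,
                  "top_p": 0.8, "top_k": 20}
override = {"temperature": 0.5, "top_k": 5}

# generation_config="auto" + override_generation_config: defaults then update.
merged = {**model_defaults, **override}
assert merged == {"repetition_penalty": 1.1, "temperature": 0.5,
                  "top_p": 0.8, "top_k": 5}

# generation_config=None + override_generation_config: only the override.
assert {**override} == override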
+ model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + generation_config=None, + override_generation_config=override_generation_config) + + assert model_config.get_diff_sampling_param() == override_generation_config diff --git a/tests/test_utils.py b/tests/test_utils.py index c68d730af7f8a..d5dc4464e634d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -9,10 +9,10 @@ from vllm_test_utils import monitor from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config -from vllm.utils import (FlexibleArgumentParser, PlaceholderModule, - StoreBoolean, bind_kv_cache, deprecate_kwargs, - get_open_port, memory_profiling, merge_async_iterators, - supports_kw) +from vllm.utils import (FlexibleArgumentParser, MemorySnapshot, + PlaceholderModule, StoreBoolean, bind_kv_cache, + deprecate_kwargs, get_open_port, memory_profiling, + merge_async_iterators, supports_kw) from .utils import error_on_warning, fork_new_process_for_each_test @@ -284,14 +284,13 @@ def test_memory_profiling(): # 512 MiB allocation outside of this instance handle1 = lib.cudaMalloc(512 * 1024 * 1024) - baseline_memory_in_bytes = \ - torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0] + baseline_snapshot = MemorySnapshot() # load weights weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32) - weights_memory_in_bytes = 128 * 1024 * 1024 * 4 # 512 MiB + weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB def measure_current_non_torch(): free, total = torch.cuda.mem_get_info() @@ -300,8 +299,8 @@ def measure_current_non_torch(): current_non_torch = current_used - current_torch return current_non_torch - with memory_profiling(baseline_memory_in_bytes=baseline_memory_in_bytes, - weights_memory_in_bytes=weights_memory_in_bytes) as result, \ + with memory_profiling(baseline_snapshot=baseline_snapshot, + weights_memory=weights_memory) as result, \ monitor(measure_current_non_torch) as monitored_values: # make a memory spike, 1 GiB spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32) @@ -316,13 +315,12 @@ def measure_current_non_torch(): assert measured_diff == 256 * 1024 * 1024 # Check that the memory usage is within 5% of the expected values - # 5% tolerance is caused by PyTorch caching allocator, - # we cannot control PyTorch's behavior of its internal buffers, + # 5% tolerance is caused by cuda runtime. 
+ # we cannot control cuda runtime in the granularity of bytes, # which causes a small error (<10 MiB in practice) - non_torch_ratio = result.non_torch_increase_in_bytes / (256 * 1024 * 1024) # noqa - torch_peak_ratio = result.torch_peak_increase_in_bytes / (1024 * 1024 * 1024) # noqa + non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa assert abs(non_torch_ratio - 1) <= 0.05 - assert abs(torch_peak_ratio - 1) <= 0.05 + assert result.torch_peak_increase == 1024 * 1024 * 1024 del weights lib.cudaFree(handle1) lib.cudaFree(handle2) diff --git a/tests/tracing/test_tracing.py b/tests/tracing/test_tracing.py index fe5fc979c66a3..49a16d16eb840 100644 --- a/tests/tracing/test_tracing.py +++ b/tests/tracing/test_tracing.py @@ -100,32 +100,32 @@ def test_traces(trace_service): attributes = decode_attributes( request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get( - SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature assert attributes.get( - SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( - SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) assert attributes.get( - SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens metrics = outputs[0].metrics assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time assert metrics.scheduler_time > 0 - assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time # Model forward and model execute should be none, since detailed traces is # not enabled. 
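test_memory_profiling above now builds a MemorySnapshot baseline and passes a plain `weights_memory` byte count instead of the old `*_in_bytes` keywords, reading `torch_peak_increase` and `non_torch_increase` off the result. A hedged sketch of the call pattern (illustrative, not part of the patch; requires a CUDA device; the names are taken from this diff):

import torch
from vllm.utils import MemorySnapshot, memory_profiling

baseline = MemorySnapshot()                                  # taken before loading weights
weights = torch.randn(32, 1024, 1024, device="cuda")         # ~128 MiB of float32
with memory_profiling(baseline_snapshot=baseline,
                      weights_memory=weights.numel() * 4) as result:
    spike = torch.randn(64, 1024, 1024, device="cuda")       # transient 256 MiB spike
    del spike
print(result.torch_peak_increase, result.non_torch_increase)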
assert metrics.model_forward_time is None @@ -166,37 +166,37 @@ def test_traces_with_detailed_steps(trace_service): attributes = decode_attributes( request.resource_spans[0].scope_spans[0].spans[0].attributes) - assert attributes.get(SpanAttributes.LLM_RESPONSE_MODEL) == model + assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model assert attributes.get( - SpanAttributes.LLM_REQUEST_ID) == outputs[0].request_id + SpanAttributes.GEN_AI_REQUEST_ID) == outputs[0].request_id + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE + ) == sampling_params.temperature assert attributes.get( - SpanAttributes.LLM_REQUEST_TEMPERATURE) == sampling_params.temperature + SpanAttributes.GEN_AI_REQUEST_TOP_P) == sampling_params.top_p assert attributes.get( - SpanAttributes.LLM_REQUEST_TOP_P) == sampling_params.top_p - assert attributes.get( - SpanAttributes.LLM_REQUEST_MAX_TOKENS) == sampling_params.max_tokens - assert attributes.get(SpanAttributes.LLM_REQUEST_N) == sampling_params.n - assert attributes.get(SpanAttributes.LLM_USAGE_PROMPT_TOKENS) == len( + SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS) == sampling_params.max_tokens + assert attributes.get(SpanAttributes.GEN_AI_REQUEST_N) == sampling_params.n + assert attributes.get(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS) == len( outputs[0].prompt_token_ids) completion_tokens = sum(len(o.token_ids) for o in outputs[0].outputs) assert attributes.get( - SpanAttributes.LLM_USAGE_COMPLETION_TOKENS) == completion_tokens + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS) == completion_tokens metrics = outputs[0].metrics assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue + SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE) == metrics.time_in_queue ttft = metrics.first_token_time - metrics.arrival_time assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN) == ttft + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN) == ttft e2e_time = metrics.finished_time - metrics.arrival_time - assert attributes.get(SpanAttributes.LLM_LATENCY_E2E) == e2e_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_E2E) == e2e_time assert metrics.scheduler_time > 0 - assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER) == metrics.scheduler_time + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER + ) == metrics.scheduler_time assert metrics.model_forward_time > 0 assert attributes.get( - SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD) == pytest.approx( metrics.model_forward_time / 1000) assert metrics.model_execute_time > 0 - assert attributes.get(SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE + assert attributes.get(SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE ) == metrics.model_execute_time assert metrics.model_forward_time < 1000 * metrics.model_execute_time diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index fafd9d0ce4455..f434fa8c61a80 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -587,3 +587,72 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks(): assert {block.ref_cnt for block in block_part1[:3]} == {1} # Block 3-5 are free. 
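The block ids asserted in the new test_reset_prefix_cache below follow from simple block arithmetic. A standalone recap, illustrative only (not part of the patch):

block_size = 16
shared_tokens = 3 * block_size       # three full, cacheable blocks
unique_tokens = 7

def blocks_needed(num_tokens: int) -> int:
    return -(-num_tokens // block_size)          # ceiling division

# Request 0 writes 55 tokens -> 4 blocks (ids 0-3 in the test).
assert blocks_needed(shared_tokens + unique_tokens) == 4
# Request 1 hits the 3 cached full blocks and only needs 1 new block (id 4).
assert blocks_needed(unique_tokens) == 1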
assert {block.ref_cnt for block in block_part1[3:]} == {0} + + +def test_reset_prefix_cache(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + max_model_len=8192, + sliding_window=None, + enable_caching=True, + num_preallocate_tokens=0, + ) + + full_block_token_ids = [i for i in range(3) for _ in range(16)] + unique_token_ids = [3] * 7 + all_token_ids = full_block_token_ids + unique_token_ids + req0 = make_request("0", all_token_ids) + blocks = manager.allocate_slots(req0, 55, []) + assert [b.block_id for b in blocks] == [0, 1, 2, 3] + + unique_token_ids = [4] * 7 + all_token_ids = full_block_token_ids + unique_token_ids + req1 = make_request("1", all_token_ids) + computed_blocks, _ = manager.get_computed_blocks(req1) + assert len(req1.kv_block_hashes) == 3 + assert len(computed_blocks) == 3 + blocks = manager.allocate_slots(req1, 7, computed_blocks) + assert [b.block_id for b in blocks] == [4] + + # Failed to reset prefix cache because some blocks are not freed yet. + assert not manager.reset_prefix_cache() + assert manager.cached_block_hash_to_block + + # Free the blocks. + manager.free(req0) + manager.free(req1) + + assert manager.reset_prefix_cache() + assert not manager.cached_block_hash_to_block + assert all([blk.block_hash is None for blk in manager.block_pool]) + + +def test_uncache_blocks(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + max_model_len=8192, + sliding_window=None, + enable_caching=True, + num_preallocate_tokens=0, + ) + + req0 = make_request("0", list(range(30))) + blocks = manager.allocate_slots(req0, 30, []) + assert [b.block_id for b in blocks] == [0, 1] + assert len(manager.cached_block_hash_to_block) == 1 + + req0.num_computed_tokens = 30 + + # Simulate speculative tokens. + for _ in range(5): + req0.append_output_token_ids(8) + manager.append_slots(req0, 5) + assert len(manager.cached_block_hash_to_block) == 2 + + # After sampling, assuming only 1 token is accepted. + req0.num_computed_tokens = 31 + num_uncached_blocks = manager.uncache_blocks(req0) + assert num_uncached_blocks == 1 + assert len(manager.cached_block_hash_to_block) == 1 diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index 2c805e18eebae..10f783b21a9ec 100644 --- a/tests/v1/engine/test_async_llm.py +++ b/tests/v1/engine/test_async_llm.py @@ -1,4 +1,5 @@ import asyncio +from contextlib import ExitStack from typing import List, Tuple import pytest @@ -6,6 +7,7 @@ from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.platforms import current_platform +from vllm.sampling_params import RequestOutputKind from vllm.v1.engine.async_llm import AsyncLLM if not current_platform.is_cuda(): @@ -18,28 +20,39 @@ async def generate(engine: AsyncLLM, request_id: str, + output_kind: RequestOutputKind, max_tokens: int) -> Tuple[int, str]: count = 0 - async for _ in engine.generate(request_id=request_id, - prompt="Hello my name is Robert and", - sampling_params=SamplingParams( - max_tokens=max_tokens, temperature=0)): + sampling_params = SamplingParams(max_tokens=max_tokens, + output_kind=output_kind, + temperature=0) + async for out in engine.generate(request_id=request_id, + prompt="Hello my name is Robert and", + sampling_params=sampling_params): + + num_tokens = len(out.outputs[0].token_ids) + if output_kind == RequestOutputKind.DELTA: + count += num_tokens + else: + count = num_tokens - count += 1 await asyncio.sleep(0.) 
return count, request_id +@pytest.mark.parametrize( + "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) @pytest.mark.asyncio -async def test_load(monkeypatch): +async def test_load(monkeypatch, output_kind: RequestOutputKind): # TODO(rickyx): Remove monkeypatch once we have a better way to test V1 # so that in the future when we switch, we don't have to change all the # tests. - with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + after.callback(engine.shutdown) NUM_REQUESTS = 10000 NUM_EXPECTED_TOKENS = 10 @@ -51,26 +64,33 @@ async def test_load(monkeypatch): for request_id in request_ids: tasks.append( asyncio.create_task( - generate(engine, request_id, NUM_EXPECTED_TOKENS))) + generate(engine, request_id, output_kind, + NUM_EXPECTED_TOKENS))) # Confirm that we got all the EXPECTED tokens from the requests. - for task in tasks: + done, pending = await asyncio.wait(tasks, + return_when=asyncio.FIRST_EXCEPTION) + for task in pending: + task.cancel() + for task in done: num_generated_tokens, request_id = await task assert num_generated_tokens == NUM_EXPECTED_TOKENS, ( f"{request_id} generated {num_generated_tokens} but " f"expected {NUM_EXPECTED_TOKENS}") assert not engine.output_processor.has_unfinished_requests() - engine.shutdown() +@pytest.mark.parametrize( + "output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) @pytest.mark.asyncio -async def test_abort(monkeypatch): +async def test_abort(monkeypatch, output_kind: RequestOutputKind): - with monkeypatch.context() as m: + with monkeypatch.context() as m, ExitStack() as after: m.setenv("VLLM_USE_V1", "1") engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + after.callback(engine.shutdown) NUM_REQUESTS = 100 NUM_EXPECTED_TOKENS = 100 @@ -83,7 +103,8 @@ async def test_abort(monkeypatch): for request_id in request_ids: tasks.append( asyncio.create_task( - generate(engine, request_id, NUM_EXPECTED_TOKENS))) + generate(engine, request_id, output_kind, + NUM_EXPECTED_TOKENS))) # API server cancels requests when they disconnect. for idx in REQUEST_IDS_TO_ABORT: @@ -108,9 +129,7 @@ async def test_abort(monkeypatch): # Confirm we can do another generation. request_id = f"request-{REQUEST_IDS_TO_ABORT[0]}" task = asyncio.create_task( - generate(engine, request_id, NUM_EXPECTED_TOKENS)) + generate(engine, request_id, output_kind, NUM_EXPECTED_TOKENS)) num_generated_tokens, request_id = await task assert num_generated_tokens == NUM_EXPECTED_TOKENS assert not engine.output_processor.has_unfinished_requests() - - engine.shutdown() diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index cccfd305ac604..033bbcfce564e 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -144,7 +144,7 @@ def test_engine_core(monkeypatch): def test_engine_core_advanced_sampling(monkeypatch): """ A basic end-to-end test to verify that the engine functions correctly - when additional sampling parameters, such as min_tokens and + when additional sampling parameters, such as top_p, min_tokens, and presence_penalty, are set. """ with monkeypatch.context() as m: @@ -167,11 +167,23 @@ def test_engine_core_advanced_sampling(monkeypatch): stop_token_ids=[1001, 1002], ) engine_core.add_request(request) - assert len(engine_core.scheduler.waiting) == 1 - assert len(engine_core.scheduler.running) == 0 - # Loop through until they are all done. 
- while len(engine_core.step().outputs) > 0: - pass - assert len(engine_core.scheduler.waiting) == 0 - assert len(engine_core.scheduler.running) == 0 + def _check_engine_state(): + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + # Loop through until they are all done. + while len(engine_core.step().outputs) > 0: + pass + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + _check_engine_state() + + # Second request. + request2 = make_request() + request2.sampling_params = SamplingParams( + top_p=0.99, + top_k=50, + ) + engine_core.add_request(request2) + _check_engine_state() diff --git a/tests/v1/test_stats.py b/tests/v1/test_stats.py new file mode 100644 index 0000000000000..580392ac5f446 --- /dev/null +++ b/tests/v1/test_stats.py @@ -0,0 +1,300 @@ +import pytest + +from vllm.sampling_params import SamplingParams +from vllm.v1.stats.common import RequestStats, RequestStatsUpdate + + +def make_update( + request_id: str, + update_type: RequestStatsUpdate.Type, + monotonic_ts_s: float, + **kwargs, +): + if update_type == RequestStatsUpdate.Type.INPUT_PROCESSED: + kwargs.setdefault("sampling_params", SamplingParams(n=1)) + kwargs.setdefault("num_prompt_tokens", 10) + elif update_type == RequestStatsUpdate.Type.PREFILLING: + kwargs.setdefault("num_computed_tokens", 10) + kwargs.setdefault("num_cached_tokens", 10) + elif update_type == RequestStatsUpdate.Type.DETOKENIZED: + kwargs.setdefault("num_new_tokens", 10) + elif update_type == RequestStatsUpdate.Type.FINISHED: + kwargs.setdefault("finish_reason", "test_reason") + + return RequestStatsUpdate( + request_id=request_id, + type=update_type, + monotonic_ts_s=monotonic_ts_s, + **kwargs, + ) + + +def test_invalid_request_update(): + request_id = "test_request" + update_specific_required_fields = { + RequestStatsUpdate.Type.INPUT_PROCESSED: [ + "sampling_params", + "num_prompt_tokens", + ], + RequestStatsUpdate.Type.PREFILLING: [ + "num_computed_tokens", + "num_cached_tokens", + ], + RequestStatsUpdate.Type.DETOKENIZED: ["num_new_tokens"], + RequestStatsUpdate.Type.FINISHED: ["finish_reason"], + } + + # Missing a required field should raise an assertion error. + for update_type in RequestStatsUpdate.Type: + required_fields = update_specific_required_fields.get(update_type, []) + + # Try to miss one of the required fields. + kwargs = {field: object() for field in required_fields} + for field in required_fields: + copy_kwargs = kwargs.copy() + copy_kwargs.pop(field) + with pytest.raises(ValueError): + RequestStatsUpdate( + request_id=request_id, + type=update_type, + **copy_kwargs, + ) + + +def test_invalid_request_update_transition(): + # Test invalid transition type. + for src in RequestStatsUpdate.Type: + for dst in RequestStatsUpdate.Type: + if dst not in RequestStatsUpdate._VALID_TRANSITIONS[src]: + with pytest.raises(AssertionError): + RequestStatsUpdate.check_valid_update( + make_update( + update_type=dst, + request_id="test_request", + monotonic_ts_s=1, + ), + last_update_type=src, + last_updated_ts_s=0, + ) + else: + RequestStatsUpdate.check_valid_update( + make_update( + request_id="test_request", + update_type=dst, + monotonic_ts_s=1, + ), + last_update_type=src, + last_updated_ts_s=0, + ) + + # Test invalid timestamp. 
+ with pytest.raises(AssertionError): + RequestStatsUpdate.check_valid_update( + make_update( + request_id="test_request", + update_type=RequestStatsUpdate.Type.ARRIVED, + monotonic_ts_s=1, + ), + last_update_type=None, + last_updated_ts_s=2, + ) + + +def test_lifecycle_updates(): + request_id = "test_request" + stats = RequestStats(request_id=request_id) + + # Test the below scenario: + arrived_ts = 0 + input_processed_ts = 1 + queued_ts = 2 + prefilling_ts = 3 + decoded_ts = 5 + detokenized_ts = 6 + decoded_2_ts = 7 + detokenized_2_ts = 8 + preempted_ts = 9 + resumed_ts = 10 + decoded_3_ts = 11 + detokenized_3_ts = 12 + finished_ts = 13 + + # Test ARRIVED + arrived_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.ARRIVED, + monotonic_ts_s=arrived_ts, + ) + stats.update_from(arrived_update) + assert stats.arrival_ts_s == arrived_ts + assert stats.last_updated_ts_s == arrived_ts + + # Test INPUT_PROCESSED + sampling_params = SamplingParams(n=1) + input_processed_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.INPUT_PROCESSED, + monotonic_ts_s=input_processed_ts, + sampling_params=sampling_params, + num_prompt_tokens=6, + ) + stats.update_from(input_processed_update) + assert stats.input_processor_end_ts_s == input_processed_ts + assert stats.last_updated_ts_s == input_processed_ts + assert stats.num_prompt_tokens == 6 + assert stats.sampling_params == sampling_params + + assert stats.first_token_ts_s is None + assert stats.prefill_ts_s is None + + # Test QUEUED + queued_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.QUEUED, + monotonic_ts_s=queued_ts, + ) + stats.update_from(queued_update) + assert stats.queued_ts_s == queued_ts + assert stats.last_updated_ts_s == queued_ts + + # Test PREFILLING + prefilling_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.PREFILLING, + monotonic_ts_s=prefilling_ts, + num_computed_tokens=3, + num_cached_tokens=1, + ) + stats.update_from(prefilling_update) + assert stats.prefill_ts_s == prefilling_ts + assert stats.num_computed_tokens == 3 + assert stats.num_cached_tokens == 1 + assert stats.queue_duration_s == prefilling_ts - queued_ts + + # Test DECODING + decoded_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DECODING, + monotonic_ts_s=decoded_ts, + ) + stats.update_from(decoded_update) + assert stats.last_updated_ts_s == decoded_ts + + # Test DETOKENIZED + detokenized_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DETOKENIZED, + monotonic_ts_s=detokenized_ts, + num_new_tokens=1, + ) + stats.update_from(detokenized_update) + assert stats.last_updated_ts_s == detokenized_ts + assert stats.num_output_tokens == 1 + # Since arrival + assert stats.first_token_latency_s == detokenized_ts - arrived_ts + # Since first scheduled + assert stats.prefill_latency_s == detokenized_ts - prefilling_ts + + # Test another DECODING and DETOKENIZED should + # yield correct inter token latency + decoded_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DECODING, + monotonic_ts_s=decoded_2_ts, + ) + stats.update_from(decoded_update) + + detokenized_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DETOKENIZED, + monotonic_ts_s=detokenized_2_ts, + num_new_tokens=1, + ) + stats.update_from(detokenized_update) + assert stats.output_token_latency_s_lst == [ + detokenized_2_ts - 
detokenized_ts, + ] + assert stats.num_output_tokens == 2 + + # Test PREEMPTED + preempted_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.PREEMPTED, + monotonic_ts_s=preempted_ts, + ) + stats.update_from(preempted_update) + assert stats.last_updated_ts_s == preempted_ts + assert stats.preempted_ts_s_lst == [preempted_ts] + # States should be reset + assert stats.num_computed_tokens == 0 + assert stats.num_cached_tokens == 0 + # These states should not be reset + assert stats.num_output_tokens == 2 + assert stats.output_token_latency_s_lst == [ + detokenized_2_ts - detokenized_ts, + ] + assert stats.prefill_latency_s == prefilling_ts - arrived_ts + assert stats.num_prompt_tokens == 6 + assert stats.prefill_start_ts_s_lst == [prefilling_ts] + + # Test resumed + resumed_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.PREFILLING, + monotonic_ts_s=resumed_ts, + num_computed_tokens=6, + num_cached_tokens=2, + ) + stats.update_from(resumed_update) + # prefill timestamp should not be updated since it's a resumed prefill + assert stats.prefill_ts_s == prefilling_ts + assert stats.num_computed_tokens == 6 + assert stats.num_cached_tokens == 2 + assert stats.prefill_start_ts_s_lst == [ + prefilling_ts, + resumed_ts, + ] + assert stats.last_updated_ts_s == resumed_ts + + # Test another DECODED/DETOKENIZED should yield correct first token latency. + decoded_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DECODING, + monotonic_ts_s=decoded_3_ts, + ) + detokenized_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.DETOKENIZED, + monotonic_ts_s=detokenized_3_ts, + num_new_tokens=1, + ) + stats.update_from(decoded_update) + stats.update_from(detokenized_update) + assert stats.first_token_ts_s == detokenized_ts - arrived_ts + assert stats.num_output_tokens == 3 + assert stats.output_token_latency_s_lst == [ + detokenized_2_ts - detokenized_ts, + detokenized_3_ts - detokenized_2_ts, + ] + + # Test FINISHED + finished_update = RequestStatsUpdate( + request_id=request_id, + type=RequestStatsUpdate.Type.FINISHED, + monotonic_ts_s=finished_ts, + finish_reason="test_reason", + ) + stats.update_from(finished_update) + assert stats.last_updated_ts_s == finished_ts + assert stats.e2e_latency_s == finished_ts - arrived_ts + assert stats.inference_latency_s == finished_ts - prefilling_ts + assert stats.prefill_latency_s == detokenized_ts - prefilling_ts + assert stats.decode_latency_s == finished_ts - detokenized_ts + assert stats.first_token_latency_s == detokenized_ts - arrived_ts + assert stats.queue_duration_s == prefilling_ts - queued_ts + assert stats.is_finished + assert stats.finish_reason == "test_reason" + + # TODO(rickyx): Add model forward/execute time. 
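The latency fields asserted in test_lifecycle_updates (this hunk) are plain differences of the update timestamps. Recapping with the same constants, illustrative only (not part of the patch):

arrived_ts, queued_ts, prefilling_ts = 0, 2, 3
first_detokenized_ts, finished_ts = 6, 13

assert first_detokenized_ts - arrived_ts == 6      # first_token_latency_s
assert prefilling_ts - queued_ts == 1              # queue_duration_s
assert first_detokenized_ts - prefilling_ts == 3   # prefill_latency_s
assert finished_ts - prefilling_ts == 10           # inference_latency_s
assert finished_ts - first_detokenized_ts == 7     # decode_latency_s
assert finished_ts - arrived_ts == 13              # e2e_latency_s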
+ assert stats.model_forward_duration_s == 0.0 + assert stats.model_execute_duration_s == 0.0 diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index a06956ce18a93..272206d4502e9 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -30,4 +30,5 @@ marlin, nm-testing/zephyr-beta-7b-marlin-g128, main marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main qqq, HandH1998/QQQ-Llama-3-8b-g128, main qqq, HandH1998/QQQ-Llama-3-8b, main -hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main \ No newline at end of file +hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main +None, mgleize/fairseq2-dummy-Llama-3.2-1B, main \ No newline at end of file diff --git a/tests/weight_loading/test_weight_loading.py b/tests/weight_loading/test_weight_loading.py index 199731bdc21fe..7a3786456d0d6 100644 --- a/tests/weight_loading/test_weight_loading.py +++ b/tests/weight_loading/test_weight_loading.py @@ -20,12 +20,13 @@ def test_weight_loading(vllm_runner): """ Test parameter weight loading with tp>1. """ - with vllm_runner(model_name=MODEL_NAME, - revision=REVISION, - dtype=torch.half if QUANTIZATION == "gptq" else "auto", - quantization=QUANTIZATION, - max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=2) as model: + with vllm_runner( + model_name=MODEL_NAME, + revision=REVISION, + dtype=torch.half if QUANTIZATION == "gptq" else "auto", + quantization=None if QUANTIZATION == "None" else QUANTIZATION, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=2) as model: output = model.generate_greedy("Hello world!", max_tokens=20) print(output) diff --git a/tests/worker/test_model_input.py b/tests/worker/test_model_input.py index 309854e6babf3..57f1fd47a600f 100644 --- a/tests/worker/test_model_input.py +++ b/tests/worker/test_model_input.py @@ -74,6 +74,7 @@ def test_model_runner_input(): num_decode_tokens=3, slot_mapping=torch.zeros(1), multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, ) model_input = ModelInputForGPUWithSamplingMetadata( input_tokens=torch.ones(10), @@ -126,6 +127,7 @@ def test_embedding_model_runner_input(): num_decode_tokens=3, slot_mapping=torch.zeros(1), multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, ) model_input = ModelInputForGPUWithPoolingMetadata( input_tokens=torch.ones(10), @@ -177,6 +179,7 @@ def test_multi_step_model_runner_input(): num_decode_tokens=3, slot_mapping=torch.zeros(1), multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, ) frozen_model_input = ModelInputForGPUWithSamplingMetadata( input_tokens=torch.ones(10), diff --git a/tools/actionlint.sh b/tools/actionlint.sh deleted file mode 100755 index f6a8b5e83a2de..0000000000000 --- a/tools/actionlint.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -if command -v actionlint &> /dev/null; then - actionlint "$@" - exit 0 -elif [ -x ./actionlint ]; then - ./actionlint "$@" - exit 0 -fi - -# download a binary to the current directory - v1.7.3 -bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) -./actionlint "$@" diff --git a/tools/doc-lint.sh b/tools/doc-lint.sh deleted file mode 100755 index 19a55ddfa91c4..0000000000000 --- a/tools/doc-lint.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -pymarkdownlnt scan docs -r diff --git a/tools/mypy.sh b/tools/mypy.sh index bf95e4c526fd1..77d342da1ec82 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -1,12 +1,16 @@ #!/bin/bash CI=${1:-0} -PYTHON_VERSION=${2:-3.9} 
+PYTHON_VERSION=${2:-local} if [ "$CI" -eq 1 ]; then set -e fi +if [ $PYTHON_VERSION == "local" ]; then + PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') +fi + run_mypy() { echo "Running mypy on $1" if [ "$CI" -eq 1 ] && [ -z "$1" ]; then diff --git a/tools/report_build_time_ninja.py b/tools/report_build_time_ninja.py index 51ad2adc74fe1..9dc19f5fd4cdd 100644 --- a/tools/report_build_time_ninja.py +++ b/tools/report_build_time_ninja.py @@ -274,8 +274,9 @@ def SummarizeEntries(entries, extra_step_types): print(' {:.1f} s weighted time ({:.1f} s elapsed time sum, {:1.1f}x ' 'parallelism)'.format(length, total_cpu_time, total_cpu_time * 1.0 / length)) - print(' %d build steps completed, average of %1.2f/s' % - (len(entries), len(entries) / (length))) + print(' {} build steps completed, average of {:1.2f}/s'.format( + len(entries), + len(entries) / (length))) def main(): diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh index d99fa77b96351..7efb3cabc64fe 100755 --- a/tools/shellcheck.sh +++ b/tools/shellcheck.sh @@ -19,4 +19,4 @@ if ! [ -x "$(command -v shellcheck)" ]; then fi # TODO - fix warnings in .buildkite/run-amd-test.sh -find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' +find . -name "*.sh" ".git" -prune -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck -s bash "{}"' diff --git a/vllm/__init__.py b/vllm/__init__.py index 0d38a96ed8337..1131e10601984 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -3,6 +3,9 @@ if is_fake_hpu(): migrate_to_cpu() +import os + +import torch from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine @@ -21,43 +24,18 @@ from .version import __version__, __version_tuple__ +# set some common config/environment variables that should be set +# for all processes created by vllm and all processes +# that interact with vllm workers. +# they are executed whenever `import vllm` is called. -def configure_as_vllm_process(): - """ - set some common config/environment variables that should be set - for all processes created by vllm and all processes - that interact with vllm workers. 
- """ - import os - - import torch - - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' - - # see https://github.com/vllm-project/vllm/issues/10480 - os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' - # see https://github.com/vllm-project/vllm/issues/10619 - torch._inductor.config.compile_threads = 1 - - from vllm.platforms import current_platform - - if current_platform.is_xpu(): - # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 - torch._dynamo.config.disable = True - elif current_platform.is_hpu(): - # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) - # does not support torch.compile - # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for - # torch.compile support - is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' - if is_lazy: - torch._dynamo.config.disable = True - # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) - # requires enabling lazy collectives - # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 - os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' +# see https://github.com/NVIDIA/nccl/issues/1234 +os.environ['NCCL_CUMEM_ENABLE'] = '0' +# see https://github.com/vllm-project/vllm/issues/10480 +os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' +# see https://github.com/vllm-project/vllm/issues/10619 +torch._inductor.config.compile_threads = 1 __all__ = [ "__version__", @@ -84,5 +62,4 @@ def configure_as_vllm_process(): "AsyncEngineArgs", "initialize_ray_cluster", "PoolingParams", - "configure_as_vllm_process", ] diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index d04cbbc0a9eed..f138eb446f551 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -48,8 +48,8 @@ def paged_attention_v1( max_seq_len: int, alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -80,8 +80,8 @@ def paged_attention_v2( max_seq_len: int, alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -112,8 +112,8 @@ def paged_attention_rocm( max_seq_len: int, alibi_slopes: Optional[torch.Tensor], kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, ) -> None: torch.ops._rocm_C.paged_attention(out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache, num_kv_heads, @@ -230,7 +230,6 @@ def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor, return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters) -# gptq def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor, b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor, b_g_idx: torch.Tensor, use_exllama: bool, @@ -820,8 +819,8 @@ def scaled_int8_quant( if scale is not None: # static-per-tensor quantization. assert symmetric == ( - azp is - None), "azp must only be provided for asymmetric quantization." + azp + is None), "azp must only be provided for asymmetric quantization." 
torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) return output, scale, azp @@ -956,8 +955,8 @@ def reshape_and_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, ) -> None: torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping, @@ -971,8 +970,8 @@ def reshape_and_cache_flash( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, ) -> None: torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache, slot_mapping, diff --git a/vllm/assets/image.py b/vllm/assets/image.py index cb831cb0b5bb4..0a55506f88255 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -26,4 +26,4 @@ def image_embeds(self) -> torch.Tensor: """ image_path = get_vllm_public_assets(filename=f"{self.name}.pt", s3_prefix=VLM_IMAGES_DIR) - return torch.load(image_path, map_location="cpu") + return torch.load(image_path, map_location="cpu", weights_only=True) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 737559bfe70ca..8027a52b82ffc 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -1,8 +1,8 @@ from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass, fields -from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Set, - Tuple, Type, TypeVar) +from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, + Protocol, Set, Tuple, Type, TypeVar) import torch @@ -65,11 +65,6 @@ def make_metadata(cls, *args, **kwargs) -> "AttentionMetadata": def get_builder_cls() -> Type["AttentionMetadataBuilder"]: raise NotImplementedError - @classmethod - def make_metadata_builder(cls, *args, - **kwargs) -> "AttentionMetadataBuilder": - return cls.get_builder_cls()(*args, **kwargs) - @staticmethod @abstractmethod def get_kv_cache_shape( @@ -128,6 +123,10 @@ class AttentionMetadata: multi_modal_placeholder_index_maps: Optional[Dict[ str, MultiModalPlaceholderMap.IndexMap]] + # Enable/disable KV scales calculation. This is so that we can disable the + # calculation until after prefill and cuda graph capture. + enable_kv_scales_calculation: bool + @property @abstractmethod def prefill_metadata(self) -> Optional["AttentionMetadata"]: @@ -214,6 +213,12 @@ class AttentionMetadataBuilder(ABC, Generic[T]): @abstractmethod def __init__(self, input_builder: "ModelRunnerInputBuilderBase") -> None: + """Create the builder, remember some configuration and parameters.""" + raise NotImplementedError + + @abstractmethod + def prepare(self) -> None: + """Prepare for one batch.""" raise NotImplementedError @abstractmethod @@ -223,6 +228,24 @@ def build(self, seq_lens: List[int], query_lens: List[int], raise NotImplementedError +class AttentionLayer(Protocol): + + _k_scale: torch.Tensor + _v_scale: torch.Tensor + _k_scale_float: float + _v_scale_float: float + + def forward( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + ... 
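# The AttentionLayer protocol above exists so that backend implementations can
# take the owning layer as their first argument and read per-layer state (the
# k/v scale tensors and their float mirrors) from it, instead of threading
# k_scale/v_scale floats through every forward call.  A minimal structural
# sketch of a conforming object and a consumer -- DummyLayer is purely
# illustrative and is not the real vllm.attention.layer.Attention class:

import torch


class DummyLayer:
    """Satisfies the AttentionLayer protocol structurally, for illustration."""

    def __init__(self) -> None:
        self._k_scale = torch.tensor(1.0, dtype=torch.float32)
        self._v_scale = torch.tensor(1.0, dtype=torch.float32)
        # Plain-float mirrors for backends that cannot consume tensors.
        self._k_scale_float = 1.0
        self._v_scale_float = 1.0

    def forward(self, query, key, value, kv_cache, attn_metadata):
        ...


def quantize_kv_for_cache(layer, key: torch.Tensor, value: torch.Tensor):
    # A backend would divide by the per-layer scales before writing an FP8
    # KV cache; this only shows where the scales now come from.
    return key / layer._k_scale, value / layer._v_scale


_layer = DummyLayer()
_k, _v = torch.ones(2, 4, 8), torch.ones(2, 4, 8)
_kq, _vq = quantize_kv_for_cache(_layer, _k, _v)
assert torch.equal(_kq, _k) and torch.equal(_vq, _v)  # scales default to 1.0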
+ + class AttentionImpl(ABC, Generic[T]): @abstractmethod @@ -244,13 +267,12 @@ def __init__( @abstractmethod def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: T, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 77cfa8490172b..20e9a3f139de2 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -4,6 +4,7 @@ import torch from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import (CommonAttentionState, CommonMetadataBuilder) @@ -221,6 +222,7 @@ def prefill_metadata( slot_mapping=self.slot_mapping[:self.num_prefill_tokens], multi_modal_placeholder_index_maps=self. multi_modal_placeholder_index_maps, + enable_kv_scales_calculation=self.enable_kv_scales_calculation, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_query_len=self.max_query_len, @@ -250,6 +252,7 @@ def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]: num_decode_tokens=self.num_decode_tokens, slot_mapping=self.slot_mapping[self.num_prefill_tokens:], multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_query_len=None, @@ -358,13 +361,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: BlocksparseFlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. 
@@ -401,8 +403,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) if prefill_meta := attn_metadata.prefill_metadata: @@ -439,8 +441,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, tp_rank=self.tp_rank, blocksparse_local_blocks=self.local_blocks, blocksparse_vert_stride=self.vert_stride, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py old mode 100644 new mode 100755 index 48b3e8d177ec9..4a9aa1e217365 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -8,6 +8,7 @@ from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, AttentionType) @@ -16,15 +17,21 @@ compute_slot_mapping_start_idx, get_num_prefill_decode_query_kv_tokens, get_seq_len_block_table_args, is_all_cross_attn_metadata_set, is_all_encoder_attn_metadata_set, is_block_tables_empty) +from vllm.envs import VLLM_FLASH_ATTN_VERSION +from vllm.logger import init_logger from vllm.multimodal import MultiModalPlaceholderMap +from vllm.platforms import current_platform from vllm.utils import async_tensor_h2d, make_tensor_with_pad +from vllm.vllm_flash_attn import (fa_version_unsupported_reason, + flash_attn_varlen_func, + flash_attn_with_kvcache, + is_fa_version_supported) if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata) -from vllm.vllm_flash_attn import (flash_attn_varlen_func, - flash_attn_with_kvcache) +logger = init_logger(__name__) class FlashAttentionBackend(AttentionBackend): @@ -226,6 +233,7 @@ def prefill_metadata(self) -> Optional["FlashAttentionMetadata"]: slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=self. 
multi_modal_placeholder_index_maps, + enable_kv_scales_calculation=self.enable_kv_scales_calculation, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_query_len=self.max_query_len, @@ -270,6 +278,7 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: num_decode_tokens=self.num_decode_tokens, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, seq_lens=None, seq_lens_tensor=seq_lens_tensor, max_decode_query_len=self.max_decode_query_len, @@ -374,6 +383,12 @@ class FlashAttentionMetadataBuilder( AttentionMetadataBuilder[FlashAttentionMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.input_builder = input_builder + self.runner = input_builder.runner + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -387,11 +402,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool, prefix_cache_hit: bool): @@ -552,6 +562,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, multi_modal_placeholder_index_maps=placeholder_index_maps, + enable_kv_scales_calculation=True, seq_lens_tensor=seq_lens_tensor, max_query_len=max_query_len, max_decode_query_len=max_decode_query_len, @@ -632,15 +643,33 @@ def __init__( f"Supported head sizes are: {support_head_sizes}.") self.attn_type = attn_type + # if hopper default to FA3, otherwise stick to FA2 for now + # TODO(lucas): profile FA3 on ampere to see if it makes sense to + # use FA3 as default for both + if current_platform.get_device_capability()[0] >= 9: + self.fa_version = 3 if is_fa_version_supported(3) else 2 + else: + self.fa_version = 2 + + if VLLM_FLASH_ATTN_VERSION is not None: + assert VLLM_FLASH_ATTN_VERSION in [2, 3] + self.fa_version = VLLM_FLASH_ATTN_VERSION + + if not is_fa_version_supported(self.fa_version): + logger.error("Cannot use FA version %d is not supported due to %s", + self.fa_version, + fa_version_unsupported_reason(self.fa_version)) + + assert is_fa_version_supported(self.fa_version) + def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -657,7 +686,7 @@ def forward( NOTE: It in-place updates the output tensor. """ # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert k_scale == 1.0 and v_scale == 1.0, ( + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0, ( "key/v_scale is not supported in FlashAttention.") assert output is not None, "Output tensor must be provided." 
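# The constructor logic above defaults to FA3 on devices with compute
# capability 9+ (Hopper) when the installed vllm_flash_attn build supports it,
# falls back to FA2 otherwise, and lets VLLM_FLASH_ATTN_VERSION force a
# choice.  A standalone sketch of that decision; the `supported` callable
# stands in for is_fa_version_supported and is an assumption of this example,
# not a real import:

from typing import Callable, Optional


def pick_fa_version(device_capability_major: int,
                    supported: Callable[[int], bool],
                    override: Optional[int] = None) -> int:
    if device_capability_major >= 9 and supported(3):
        version = 3
    else:
        version = 2
    if override is not None:
        assert override in (2, 3)
        version = override
    if not supported(version):
        raise RuntimeError(
            f"Cannot use FA version {version}: it is not supported "
            "by this build/device.")
    return version


# Hopper with an FA3-capable build picks 3; Ampere stays on FA2; an explicit
# override (e.g. VLLM_FLASH_ATTN_VERSION=2) wins.
assert pick_fa_version(9, lambda v: True) == 3
assert pick_fa_version(8, lambda v: True) == 2
assert pick_fa_version(9, lambda v: True, override=2) == 2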
@@ -709,8 +738,8 @@ def forward( kv_cache[1], updated_slot_mapping.flatten(), # type: ignore[union-attr] kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) (num_prefill_query_tokens, num_prefill_kv_tokens, @@ -751,6 +780,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, out=prefill_output, + fa_version=self.fa_version, ) else: # prefix-enabled attention @@ -764,7 +794,7 @@ def forward( v=value_cache, cu_seqlens_q=prefill_meta.query_start_loc, max_seqlen_q=prefill_meta.max_query_len, - cu_seqlens_k=prefill_meta.seq_start_loc, + seqused_k=prefill_meta.seq_lens_tensor, max_seqlen_k=max_seq_len, softmax_scale=softmax_scale, causal=True, @@ -773,6 +803,7 @@ def forward( block_table=prefill_meta.block_tables, softcap=logits_soft_cap, out=prefill_output, + fa_version=self.fa_version, ) if decode_meta := attn_metadata.decode_metadata: @@ -792,7 +823,7 @@ def forward( v=value_cache, cu_seqlens_q=decode_meta.query_start_loc, max_seqlen_q=decode_meta.max_decode_query_len, - cu_seqlens_k=decode_meta.seq_start_loc, + seqused_k=decode_meta.seq_lens_tensor, max_seqlen_k=decode_meta.max_decode_seq_len, softmax_scale=softmax_scale, causal=True, @@ -801,6 +832,7 @@ def forward( softcap=logits_soft_cap, block_table=decode_meta.block_tables, out=decode_output, + fa_version=self.fa_version, ) else: # Use flash_attn_with_kvcache for normal decoding. @@ -821,6 +853,7 @@ def forward( alibi_slopes=alibi_slopes, softcap=logits_soft_cap, out=decode_output.unsqueeze(1), + fa_version=self.fa_version, ) return output diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 6ca75fabdfc38..7cccef9608218 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -1,3 +1,4 @@ +import dataclasses from collections import defaultdict from contextlib import contextmanager from dataclasses import dataclass @@ -13,9 +14,11 @@ from vllm.vllm_flash_attn import flash_attn_varlen_func FLASHINFER_WORKSPACE_BUFFER_SIZE = 256 * 1024 * 1024 except ImportError: - BatchDecodeWithPagedKVCacheWrapper = None - CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None - BatchPrefillWithPagedKVCacheWrapper = None + # Avoid turning these types into variables during type checking + if not TYPE_CHECKING: + BatchDecodeWithPagedKVCacheWrapper = None + CUDAGraphBatchDecodeWithPagedKVCacheWrapper = None + BatchPrefillWithPagedKVCacheWrapper = None FLASHINFER_WORKSPACE_BUFFER_SIZE = 0 import torch @@ -23,13 +26,16 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, AttentionState, AttentionType) from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) +from vllm.attention.layer import Attention from vllm.attention.ops.paged_attn import PagedAttention +from vllm.config import VllmConfig, get_current_vllm_config from vllm.utils import (async_tensor_h2d, get_kv_cache_torch_dtype, make_tensor_with_pad) @@ -98,6 +104,72 @@ def get_fp8_dtype_for_flashinfer(kv_cache_dtype: str) -> torch.dtype: raise ValueError(f"Unrecognized FP8 dtype: {kv_cache_dtype}") +@dataclass +class PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters. 
+ """ + + window_left: int + logits_soft_cap: Optional[float] + sm_scale: float + + +def get_per_layer_parameters( + vllm_config: VllmConfig) -> Dict[str, PerLayerParameters]: + """ + Scan all attention layers and determine some hyperparameters + to use during `plan`. + """ + + layers = vllm_config.compilation_config.static_forward_context + per_layer_params: Dict[str, PerLayerParameters] = {} + + for key, layer in layers.items(): + assert isinstance(layer, Attention) + + impl = layer.impl + assert isinstance(impl, FlashInferImpl) + + # Infer hyperparameters from the attention layer + window_size = impl.sliding_window + window_left = window_size[0] if window_size is not None else -1 + logits_soft_cap = impl.logits_soft_cap + sm_scale = impl.scale + + per_layer_params[key] = PerLayerParameters(window_left, + logits_soft_cap, sm_scale) + + return per_layer_params + + +def infer_global_hyperparameters( + per_layer_params: Dict[str, PerLayerParameters]) -> PerLayerParameters: + """ + Currently, FlashInfer backend only support models in which all layers share + the same values for the following hyperparameters: + - `window_left` + - `logits_soft_cap` + - `sm_scale` + + So this function asserts that all layers share the same values for these + hyperparameters and returns the global values. + """ + + assert len(per_layer_params) > 0, "No attention layers found in the model." + + param_sets = list(per_layer_params.values()) + global_params = param_sets[0] + for params in param_sets: + assert params == global_params, ( + "FlashInfer backend currently only supports models in which all " + "layers share the same values for the following hyperparameters: " + "`window_left`, `logits_soft_cap`, `sm_scale`.") + + return global_params + + class FlashInferState(AttentionState): def __init__(self, runner): @@ -107,6 +179,11 @@ def __init__(self, runner): self._decode_wrapper = None self._prefill_wrapper = None + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + def _get_workspace_buffer(self): if self._workspace_buffer is None: self._workspace_buffer = torch.empty( @@ -214,10 +291,14 @@ def graph_capture_get_metadata_for_batch( batch_size + 1, dtype=torch.int32) + global_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + attn_metadata = self.runner.attn_backend.make_metadata( num_prefills=0, slot_mapping=self._graph_slot_mapping[:batch_size], multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, num_prefill_tokens=0, num_decode_tokens=batch_size, max_prefill_seq_len=0, @@ -236,7 +317,9 @@ def graph_capture_get_metadata_for_batch( q_data_type=self.runner.model_config.dtype, use_cuda_graph=True, decode_wrapper=self._graph_decode_wrapper, - prefill_wrapper=None) + prefill_wrapper=None, + **dataclasses.asdict(global_params), + ) attn_metadata.begin_forward() return attn_metadata @@ -323,9 +406,28 @@ class FlashInferMetadata(AttentionMetadata): data_type: torch.dtype = None # The data type of the query q_data_type: torch.dtype = None - device: torch.device = torch.device("cuda") + # FlashInfer 0.2 encourages passing host tensors + device: torch.device = torch.device("cpu") is_profile_run: bool = False + # The FlashInfer backend currently supports only models in which all layers + # share the same following hyperparameters: + + # The left (inclusive) window size for the attention window, when + # set to `-1`, the window size will 
be set to the full length of + # the sequence. Defaults to `-1`. + window_left: int = -1 + # The attention logits soft capping value (used in Gemini, Grok and + # Gemma-2, etc.), if not provided, will be set to `0`. If greater + # than 0, the logits will be capped according to formula: + # $$\texttt{logits\_soft\_cap} \times + # \mathrm{tanh}(x / \texttt{logits\_soft\_cap})$$, + # where $x$ is the input logits. + logits_soft_cap: Optional[float] = None + # The scale used in softmax, if not provided, will be set to + # `1.0 / sqrt(head_dim)`. + sm_scale: Optional[float] = None + def __post_init__(self): # Refer to # https://github.com/flashinfer-ai/flashinfer/blob/3d55c71a62052c590c130897d3a3db49b14fcc34/include/flashinfer/utils.cuh#L157 @@ -361,14 +463,21 @@ def begin_forward(self): self.block_table_bound = self.block_table_bound.to(self.device) self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) self.paged_kv_indices = self.paged_kv_indices.to(self.device) - self.prefill_wrapper.end_forward() - self.prefill_wrapper.begin_forward( + self.prefill_wrapper.plan( self.query_start_loc, self.paged_kv_indptr[:self.num_prefills + 1], self.paged_kv_indices, self.paged_kv_last_page_len[:self.num_prefills], - self.num_qo_heads, self.num_kv_heads, self.head_dim, - self.page_size) + self.num_qo_heads, + self.num_kv_heads, + self.head_dim, + self.page_size, + causal=True, + sm_scale=self.sm_scale, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + q_data_type=self.q_data_type, + kv_data_type=self.data_type) if self.num_decode_tokens > 0: assert self.paged_kv_indices is not None assert self.paged_kv_indptr is not None @@ -384,8 +493,7 @@ def begin_forward(self): self.seq_lens_tensor = self.seq_lens_tensor.to(self.device) assert self.decode_wrapper is not None - self.decode_wrapper.end_forward() - self.decode_wrapper.begin_forward( + self.decode_wrapper.plan( self.paged_kv_indptr[self.num_prefills:], self.paged_kv_indices, self.paged_kv_last_page_len[self.num_prefills:], @@ -395,8 +503,11 @@ def begin_forward(self): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, # kv-cache data type. - data_type=self.data_type, + kv_data_type=self.data_type, # query data type. q_data_type=self.q_data_type) @@ -487,6 +598,19 @@ def advance_step(self, class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + + self.input_builder = input_builder + self.runner = input_builder.runner + + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + # Global hyperparameters shared by all attention layers + self.global_hyperparameters: Optional[PerLayerParameters] = None + + self.vllm_config = get_current_vllm_config() + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -499,12 +623,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - self.input_builder = input_builder - self.runner = input_builder.runner - - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - # Please follow https://docs.flashinfer.ai/tutorials/kv_layout.html#page-layout # for the precise definition of the following fields. 
# An example: @@ -524,6 +642,20 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.total_blocks = 0 self.is_profile_run: bool = False + if self.global_hyperparameters is None: + # Infer global hyperparameters, since currently we only support + # models in which all layers share the same values for the + # following hyperparameters: + # - `window_left` + # - `logits_soft_cap` + # - `sm_scale` + inferred_params = infer_global_hyperparameters( + get_per_layer_parameters(self.vllm_config)) + self.global_hyperparameters = inferred_params + self.window_left = inferred_params.window_left + self.logits_soft_cap = inferred_params.logits_soft_cap + self.sm_scale = inferred_params.sm_scale + def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): @@ -730,6 +862,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, multi_modal_placeholder_index_maps=placeholder_index_maps, + enable_kv_scales_calculation=False, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, max_prefill_seq_len=max_prefill_seq_len, @@ -751,7 +884,11 @@ def build(self, seq_lens: List[int], query_lens: List[int], data_type=kv_cache_dtype, q_data_type=self.runner.model_config.dtype, use_cuda_graph=use_captured_graph, - is_profile_run=self.is_profile_run) + is_profile_run=self.is_profile_run, + window_left=self.window_left, + logits_soft_cap=self.logits_soft_cap, + sm_scale=self.sm_scale, + ) class FlashInferImpl(AttentionImpl): @@ -792,13 +929,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashInferMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -826,8 +962,8 @@ def forward( kv_cache[:, 1], attn_metadata.slot_mapping.flatten(), kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # The FlashInfer api requires data to be in fp8_e4m3 or fp8_e5m2 # to process the cache when the kv_cache_dtype is fp8 @@ -881,25 +1017,34 @@ def forward( else: assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None - prefill_output = prefill_meta.prefill_wrapper.forward( + + assert prefill_meta.prefill_wrapper._causal + assert prefill_meta.prefill_wrapper._window_left == window_left + assert prefill_meta.prefill_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert prefill_meta.prefill_wrapper._sm_scale == softmax_scale + + prefill_output = prefill_meta.prefill_wrapper.run( query, kv_cache, - logits_soft_cap=logits_soft_cap, - causal=True, - k_scale=k_scale, - v_scale=v_scale, - window_left=window_left) + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + ) if decode_meta := attn_metadata.decode_metadata: assert decode_meta is not None assert decode_meta.decode_wrapper is not None - decode_output = decode_meta.decode_wrapper.forward( + + assert decode_meta.decode_wrapper._window_left == window_left + assert decode_meta.decode_wrapper._logits_soft_cap == ( + logits_soft_cap or 0.0) + assert decode_meta.decode_wrapper._sm_scale == softmax_scale + + decode_output = decode_meta.decode_wrapper.run( decode_query, kv_cache, - sm_scale=softmax_scale, - logits_soft_cap=logits_soft_cap, - k_scale=k_scale, - v_scale=v_scale, - window_left=window_left) + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, + ) if 
prefill_output is None and decode_output is not None: # Decode only batch. diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index 1893f98d8af77..e50bf56674e03 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -13,6 +13,7 @@ VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention, @@ -164,13 +165,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: HPUAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -184,6 +184,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 if self.attn_type == AttentionType.ENCODER_DECODER: return self.forward_encoder_decoder( query=query, @@ -191,8 +192,8 @@ def forward( value=value, kv_cache=kv_cache, attn_metadata=attn_metadata, - k_scale=k_scale, - v_scale=v_scale, + k_scale=layer._k_scale_float, + v_scale=layer._k_scale_float, ) batch_size, seq_len, hidden_size = query.shape diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index da1d307daa517..57916a3c6a34c 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -7,6 +7,7 @@ from vllm._ipex_ops import ipex_ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState from vllm.attention.ops.paged_attn import (PagedAttention, @@ -171,13 +172,12 @@ def split_kv_cache( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: IpexAttnMetadata, # type: ignore - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. @@ -193,7 +193,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert k_scale == 1.0 and v_scale == 1.0 + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) @@ -210,8 +210,8 @@ def forward( value_cache, attn_metadata.slot_mapping.flatten(), self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) if attn_metadata.is_prompt: @@ -296,8 +296,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) else: # Run PagedAttention V2. @@ -329,8 +329,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 2ac492dd8ae54..209a623ba441c 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -5,6 +5,7 @@ import torch_xla.experimental.custom_kernel # Required to register custom ops. 
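# Referring back to the FlashInfer changes above: `plan()` is now issued once
# per batch with hyperparameters that must be identical across all attention
# layers, so the backend scans every layer and checks that they agree.  A
# self-contained sketch of that check, mirroring PerLayerParameters with a
# local dataclass rather than importing the real vLLM classes:

from dataclasses import dataclass
from typing import Dict, Optional


@dataclass
class LayerParams:
    window_left: int
    logits_soft_cap: Optional[float]
    sm_scale: float


def infer_global_params(per_layer: Dict[str, LayerParams]) -> LayerParams:
    assert per_layer, "No attention layers found in the model."
    values = list(per_layer.values())
    for params in values[1:]:
        assert params == values[0], (
            "FlashInfer requires all layers to share window_left, "
            "logits_soft_cap and sm_scale.")
    return values[0]


_layers = {
    "model.layers.0.attn": LayerParams(-1, None, 0.125),
    "model.layers.1.attn": LayerParams(-1, None, 0.125),
}
assert infer_global_params(_layers) == LayerParams(-1, None, 0.125)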
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import CommonAttentionState @@ -109,6 +110,7 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads + self.logits_soft_cap = logits_soft_cap if head_size % 128 != 0: raise NotImplementedError("Head size must be a multiple of 128.") if alibi_slopes is not None: @@ -119,9 +121,6 @@ def __init__( raise NotImplementedError("FP8 KV cache dtype is not supported.") if blocksparse_params is not None: raise NotImplementedError("Blocksparse is not supported.") - if logits_soft_cap is not None: - raise NotImplementedError( - "Attention logits soft-capping is not supported.") if torch_xla.tpu.version() < 4: raise NotImplementedError("TPU version must be 4 or higher.") @@ -150,13 +149,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: Tuple[torch.Tensor, torch.Tensor], attn_metadata: PallasMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with Pallas attention. @@ -173,7 +171,7 @@ def forward( Returns: shape = [batch_size, seq_len, num_heads * head_size] """ - assert k_scale == 1.0 and v_scale == 1.0 + assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 batch_size, seq_len, hidden_size = query.shape query = query.view(batch_size, seq_len, self.num_heads, self.head_size) key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size) @@ -230,6 +228,7 @@ def forward( num_kv_pages_per_compute_block, num_queries_per_compute_block, use_kernel=True, + attn_logits_soft_cap=self.logits_soft_cap, ) else: # Decoding run. @@ -257,6 +256,7 @@ def forward( attn_metadata.block_tables, pages_per_compute_block, self.megacore_mode, + attn_logits_soft_cap=self.logits_soft_cap, ) else: chunk_size = max_num_seq @@ -280,6 +280,7 @@ def forward( attn_metadata.block_tables[chunk_start:chunk_end], pages_per_compute_block, self.megacore_mode, + attn_logits_soft_cap=self.logits_soft_cap, ) output[chunk_start:chunk_end] = chunk_output @@ -313,6 +314,8 @@ def paged_attention( block_tables: torch.Tensor, pages_per_compute_block: int, megacore_mode: Optional[str], + *, + attn_logits_soft_cap: Optional[float], ) -> torch.Tensor: batch_size = query.shape[0] if megacore_mode == "batch" and batch_size % 2 != 0: @@ -320,26 +323,13 @@ def paged_attention( else: megacore_mode = megacore_mode - # NOTE(woosuk): A temporary workaround to avoid the error: - # "xla::paged_attention() Expected a value of type 'str' for - # argument 'megacore_mode' but instead found type 'NoneType'." 
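# Both the FlashInfer metadata and the Pallas backend in this change carry an
# attention logits soft cap.  The cap is just a scaled tanh, as the formula in
# the FlashInfer comment above states; this reference sketch shows its effect
# and is for illustration only, not the kernel either backend actually runs:

from typing import Optional

import torch


def soft_cap_logits(logits: torch.Tensor,
                    cap: Optional[float]) -> torch.Tensor:
    # With no cap (None or 0) the logits pass through unchanged; otherwise
    # they are smoothly bounded to the open interval (-cap, cap).
    if not cap:
        return logits
    return cap * torch.tanh(logits / cap)


_x = torch.tensor([-100.0, 0.0, 100.0])
assert torch.all(soft_cap_logits(_x, 30.0).abs() < 30.0)
assert torch.equal(soft_cap_logits(_x, None), _x)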
- if megacore_mode is not None: - output = torch.ops.xla.paged_attention( - query, - key_cache, - value_cache, - context_lens, - block_tables, - pages_per_compute_block, - megacore_mode=megacore_mode, - ) - else: - output = torch.ops.xla.paged_attention( - query, - key_cache, - value_cache, - context_lens, - block_tables, - pages_per_compute_block, - ) - return output + return torch.ops.xla.paged_attention( + query, + key_cache, + value_cache, + context_lens, + block_tables, + pages_per_compute_block, + megacore_mode=megacore_mode, + attn_logits_soft_cap=attn_logits_soft_cap, + ) diff --git a/vllm/attention/backends/placeholder_attn.py b/vllm/attention/backends/placeholder_attn.py index 534f79b3a60bf..826311896d1d2 100644 --- a/vllm/attention/backends/placeholder_attn.py +++ b/vllm/attention/backends/placeholder_attn.py @@ -140,6 +140,7 @@ def prefill_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=self. multi_modal_placeholder_index_maps, + enable_kv_scales_calculation=self.enable_kv_scales_calculation, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_decode_query_len=0, @@ -173,6 +174,7 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]: num_decode_tokens=self.num_decode_tokens, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_decode_query_len=self.max_decode_query_len, @@ -253,6 +255,11 @@ class PlaceholderAttentionMetadataBuilder( AttentionMetadataBuilder[PlaceholderAttentionMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + + self.input_builder = input_builder + self.runner = input_builder.runner + + def prepare(self): self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] self.curr_seq_lens: List[int] = [] @@ -263,9 +270,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - self.input_builder = input_builder - self.runner = input_builder.runner - def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): @@ -378,6 +382,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], num_prefills=self.num_prefills, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=placeholder_index_maps, + enable_kv_scales_calculation=True, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index a91a5af5c3d58..ca6fa9ca61b30 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -7,6 +7,7 @@ import vllm.envs as envs from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import (CommonAttentionState, CommonMetadataBuilder) @@ -152,6 +153,7 @@ def prefill_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: slot_mapping=self.slot_mapping[:self.num_prefill_tokens], multi_modal_placeholder_index_maps=self. 
multi_modal_placeholder_index_maps, + enable_kv_scales_calculation=self.enable_kv_scales_calculation, seq_lens=self.seq_lens[:self.num_prefills], seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], max_query_len=self.max_query_len, @@ -181,6 +183,7 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: num_decode_tokens=self.num_decode_tokens, slot_mapping=self.slot_mapping[self.num_prefill_tokens:], multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, seq_lens=None, seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], max_query_len=None, @@ -414,13 +417,12 @@ def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: ROCmFlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -458,8 +460,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) num_prefill_tokens = attn_metadata.num_prefill_tokens @@ -567,8 +569,8 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, self.sliding_window[0], - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) if decode_meta := attn_metadata.decode_metadata: @@ -613,8 +615,8 @@ def forward( max_seq_len, self.alibi_slopes, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) else: output[num_prefill_tokens:] = PagedAttention.forward_decode( @@ -628,8 +630,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index ca1c4618615de..c3b2398b4e632 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -7,6 +7,7 @@ from torch.nn.functional import scaled_dot_product_attention from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionMetadataBuilder, AttentionType) @@ -281,7 +282,10 @@ class TorchSDPAMetadataBuilder(AttentionMetadataBuilder[TorchSDPAMetadata]): def __init__(self, input_builder: ModelInputForCPUBuilder) -> None: self.chunked_prefill = input_builder.chunked_prefill - self.input_data = input_builder.input_data + self.input_builder = input_builder + + def prepare(self): + self.input_data = self.input_builder.input_data def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int) -> TorchSDPAMetadata: @@ -375,6 +379,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], prefill_block_tables=prefill_block_tables, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=placeholder_index_maps, + enable_kv_scales_calculation=False, ) return attn_metadata @@ -429,13 +434,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: TorchSDPAMetadata, # type: ignore - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. 
@@ -451,7 +455,6 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - assert k_scale == 1.0 and v_scale == 1.0 attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): @@ -493,11 +496,9 @@ def forward( # Update self-attention KV cache (prefill/decode) updated_slot_mapping = attn_metadata.slot_mapping - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - updated_slot_mapping, - self.kv_cache_dtype, - k_scale, v_scale) + PagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, updated_slot_mapping, + self.kv_cache_dtype, layer._k_scale, layer._v_scale) if attn_type != AttentionType.ENCODER: # Decoder self-attention supports chunked prefill. @@ -571,8 +572,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 56cc43430301f..84fe89b7df360 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -122,6 +122,13 @@ class CommonMetadataBuilder(AttentionMetadataBuilder[TAttentionMetadata]): _metadata_cls: Type[TAttentionMetadata] def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.input_builder = input_builder + self.runner = input_builder.runner + + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -134,12 +141,6 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 - self.input_builder = input_builder - self.runner = input_builder.runner - - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size - def _add_seq_group( self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", chunked_prefill_enabled: bool): @@ -264,6 +265,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, multi_modal_placeholder_index_maps=placeholder_index_maps, + enable_kv_scales_calculation=True, num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, seq_lens=seq_lens, @@ -316,6 +318,7 @@ def graph_capture_get_metadata_for_batch( num_decode_tokens=batch_size, slot_mapping=self._graph_slot_mapping[:batch_size], multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, seq_lens=None, seq_lens_tensor=self._graph_seq_lens[:batch_size], max_query_len=1, diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 8c8ca8520a9db..49f47f9c8ded3 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -10,6 +10,7 @@ LowerTriangularMaskWithTensorBias) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import ( CommonAttentionState, CommonMetadataBuilder, @@ -198,6 +199,8 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: # Compute some attn_metadata fields which default to None query_start_loc = (None if self.query_start_loc is None else self.query_start_loc[:self.num_prefills + 1]) + seq_start_loc = (None if self.seq_start_loc is None else + 
self.seq_start_loc[:self.num_prefills + 1]) slot_mapping = (None if self.slot_mapping is None else self.slot_mapping[:self.num_prefill_tokens]) seq_lens = (None if self.seq_lens is None else @@ -217,12 +220,14 @@ def prefill_metadata(self) -> Optional["XFormersMetadata"]: slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=self. multi_modal_placeholder_index_maps, + enable_kv_scales_calculation=self.enable_kv_scales_calculation, seq_lens=seq_lens, seq_lens_tensor=seq_lens_tensor, max_query_len=self.max_query_len, max_prefill_seq_len=self.max_prefill_seq_len, max_decode_seq_len=0, query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, context_lens_tensor=context_lens_tensor, block_tables=block_tables, use_cuda_graph=False, @@ -261,6 +266,7 @@ def decode_metadata(self) -> Optional["XFormersMetadata"]: num_decode_tokens=self.num_decode_tokens, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=True, seq_lens_tensor=seq_lens_tensor, max_prefill_seq_len=0, max_decode_seq_len=self.max_decode_seq_len, @@ -412,13 +418,12 @@ def __init__( def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: Optional[torch.Tensor], value: Optional[torch.Tensor], kv_cache: torch.Tensor, attn_metadata: "XFormersMetadata", - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -524,11 +529,9 @@ def forward( # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory # profiling run. - PagedAttention.write_to_paged_cache(key, value, key_cache, - value_cache, - updated_slot_mapping, - self.kv_cache_dtype, - k_scale, v_scale) + PagedAttention.write_to_paged_cache( + key, value, key_cache, value_cache, updated_slot_mapping, + self.kv_cache_dtype, layer._k_scale, layer._v_scale) (num_prefill_query_tokens, num_prefill_kv_tokens, num_decode_query_tokens) = \ get_num_prefill_decode_query_kv_tokens(attn_metadata, attn_type) @@ -580,8 +583,8 @@ def forward( prefill_meta.max_query_len, self.alibi_slopes, self.sliding_window, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) assert output[:num_prefill_query_tokens].shape == out.shape output[:num_prefill_query_tokens] = out @@ -607,8 +610,8 @@ def forward( self.num_kv_heads, self.scale, self.alibi_slopes, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Reshape the output tensor. diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index e2403306950a3..a4f45fc338164 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -5,6 +5,7 @@ import torch.nn as nn import torch.nn.functional as F +import vllm.envs as envs from vllm.attention import AttentionMetadata, AttentionType from vllm.attention.selector import backend_name_to_enum, get_attn_backend from vllm.config import CacheConfig, get_current_vllm_config @@ -57,10 +58,12 @@ def __init__( kv_cache_dtype = cache_config.cache_dtype block_size = cache_config.block_size is_attention_free = cache_config.is_attention_free + calculate_kv_scales = cache_config.calculate_kv_scales else: kv_cache_dtype = "auto" block_size = 16 is_attention_free = False + calculate_kv_scales = False if num_kv_heads is None: num_kv_heads = num_heads @@ -70,8 +73,15 @@ def __init__( # expect the pre-quantized k/v_scale to be loaded along # with the model weights. 
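# The Attention layer changes in this area replace the float k/v scales with
# float32 tensors (plus float mirrors for backends that need plain numbers)
# and, when calculate_kv_scales is enabled, derive them from the observed
# activations.  A minimal sketch of that derivation, with the range constants
# hard-coded here as assumptions of the example rather than read from
# vllm.envs (K_SCALE_CONSTANT / V_SCALE_CONSTANT):

import torch


def calc_kv_scales_sketch(key: torch.Tensor,
                          value: torch.Tensor,
                          k_range: float = 200.0,
                          v_range: float = 100.0):
    # Scale = max(|x|) / range, so the scaled tensor fits the representable
    # range of the FP8 KV cache; computed once per layer.
    k_scale = torch.abs(key).max() / k_range
    v_scale = torch.abs(value).max() / v_range
    return k_scale, v_scale


_k = torch.full((2, 4, 8), 4.0)
_v = torch.full((2, 4, 8), 2.0)
_ks, _vs = calc_kv_scales_sketch(_k, _v)
assert torch.isclose(_ks, torch.tensor(4.0 / 200.0))
assert torch.isclose(_vs, torch.tensor(2.0 / 100.0))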
self.kv_cache_dtype = kv_cache_dtype - self._k_scale = 1.0 - self._v_scale = 1.0 + self.calculate_kv_scales = calculate_kv_scales + self._k_scale = torch.tensor(1.0, dtype=torch.float32) + self._v_scale = torch.tensor(1.0, dtype=torch.float32) + + # We also keep the float32 versions of k/v_scale for attention + # backends that don't support tensors (Flashinfer) + self._k_scale_float = 1.0 + self._v_scale_float = 1.0 + quant_method = quant_config.get_quant_method( self, prefix=prefix) if quant_config else None if quant_method is not None: @@ -127,6 +137,9 @@ def __init__( ).parallel_config.pipeline_parallel_size) ] + self.k_range = torch.tensor(envs.K_SCALE_CONSTANT, dtype=torch.float32) + self.v_range = torch.tensor(envs.V_SCALE_CONSTANT, dtype=torch.float32) + def forward( self, query: torch.Tensor, @@ -135,6 +148,13 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, ) -> torch.Tensor: + # NOTE: please avoid accessing `kv_cache` and `attn_metadata` arguments + # directly, use `self.kv_cache` and + # `get_forward_context().attn_metadata` instead. + if self.calculate_kv_scales: + ctx_attn_metadata = get_forward_context().attn_metadata + if ctx_attn_metadata.enable_kv_scales_calculation: + self.calc_kv_scales(key, value) if self.use_output: output = torch.empty_like(query) hidden_size = query.size(-1) @@ -148,19 +168,39 @@ def forward( if value is not None: value = value.view(-1, self.num_kv_heads, self.head_size) if self.use_direct_call: - unified_attention_with_output(query, key, value, output, - self.layer_name) + forward_context: ForwardContext = get_forward_context() + ctx_attn_metadata = forward_context.attn_metadata + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + self.impl.forward(self, + query, + key, + value, + self_kv_cache, + ctx_attn_metadata, + output=output) else: torch.ops.vllm.unified_attention_with_output( query, key, value, output, self.layer_name) return output.view(-1, hidden_size) else: if self.use_direct_call: - return unified_attention(query, key, value, self.layer_name) + forward_context = get_forward_context() + ctx_attn_metadata = forward_context.attn_metadata + self_kv_cache = self.kv_cache[forward_context.virtual_engine] + return self.impl.forward(self, query, key, value, + self_kv_cache, ctx_attn_metadata) else: return torch.ops.vllm.unified_attention( query, key, value, self.layer_name) + def calc_kv_scales(self, key, value): + self._k_scale.copy_(torch.abs(key).max() / self.k_range) + self._v_scale.copy_(torch.abs(value).max() / self.v_range) + self._k_scale_float = self._k_scale.item() + self._v_scale_float = self._v_scale.item() + # We only calculate the scales once + self.calculate_kv_scales = False + def extra_repr(self) -> str: s = f"head_size={self.impl.head_size}" # type: ignore s += f", num_heads={self.impl.num_heads}" # type: ignore @@ -186,6 +226,9 @@ def __init__( self.scale = scale self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + dtype = torch.get_default_dtype() attn_backend = get_attn_backend(head_size, dtype, @@ -197,7 +240,7 @@ def __init__( backend = _Backend.XFORMERS self.attn_backend = backend if backend in { - _Backend.TORCH_SDPA, _Backend.XFORMERS + _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.HPU_ATTN } else _Backend.TORCH_SDPA def forward( @@ -207,7 +250,7 @@ def forward( value: torch.Tensor, ) -> torch.Tensor: """Input shape: batch_size x seq_len x 
hidden_size""" - # TODO(Isotr0py): Use existing backend implementations and support FA2 + # TODO(Isotr0py): Use existing backend implementations and support FA3 bsz, q_len, _ = query.size() kv_len = key.size(1) @@ -215,6 +258,11 @@ def forward( key = key.view(bsz, kv_len, self.num_kv_heads, self.head_size) value = value.view(bsz, kv_len, self.num_kv_heads, self.head_size) + if (num_repeat := self.num_queries_per_kv) > 1: + # Handle MQA and GQA + key = torch.repeat_interleave(key, num_repeat, dim=2) + value = torch.repeat_interleave(value, num_repeat, dim=2) + if self.attn_backend == _Backend.XFORMERS: from xformers import ops as xops @@ -230,6 +278,36 @@ def forward( value, scale=self.scale) out = out.transpose(1, 2) + elif self.attn_backend == _Backend.HPU_ATTN: + query, key, value = (x.transpose(1, 2) + for x in (query, key, value)) + + from vllm_hpu_extension.flags import enabled_flags + + if "fsdpa" in enabled_flags(): + from habana_frameworks.torch.hpex.kernels import FusedSDPA + from vllm_hpu_extension.utils import ModuleFusedSDPA + + fsdpa_op = ModuleFusedSDPA(FusedSDPA) + + out = fsdpa_op(query, + key, + value, + None, + dropout_p=0.0, + is_causal=False, + scale=self.scale, + softmax_mode="fast", + recompute_mode=True, + valid_sequence_lengths=None) + else: + out = F.scaled_dot_product_attention(query, + key, + value, + scale=self.scale) + + out = out.transpose(1, 2) + return out.reshape(bsz, q_len, -1) @@ -243,8 +321,7 @@ def unified_attention( attn_metadata = forward_context.attn_metadata self = forward_context.attn_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - return self.impl.forward(query, key, value, kv_cache, attn_metadata, - self._k_scale, self._v_scale) + return self.impl.forward(self, query, key, value, kv_cache, attn_metadata) def unified_attention_fake( @@ -276,13 +353,12 @@ def unified_attention_with_output( attn_metadata = forward_context.attn_metadata self = forward_context.attn_layers[layer_name] kv_cache = self.kv_cache[forward_context.virtual_engine] - self.impl.forward(query, + self.impl.forward(self, + query, key, value, kv_cache, attn_metadata, - self._k_scale, - self._v_scale, output=output) diff --git a/vllm/attention/ops/hpu_paged_attn.py b/vllm/attention/ops/hpu_paged_attn.py index e55a4de11fd6c..826e432976945 100644 --- a/vllm/attention/ops/hpu_paged_attn.py +++ b/vllm/attention/ops/hpu_paged_attn.py @@ -28,7 +28,7 @@ class HPUPagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 256] + return list(range(1, 257)) @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index cbc6c74acf09a..3a07184ed31f0 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -52,8 +52,8 @@ def write_to_paged_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, *args, ) -> None: ops.reshape_and_cache( @@ -80,8 +80,8 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, *args, ) -> None: tp_rank: int = 0 @@ -149,8 +149,8 @@ def write_to_paged_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, *args, ) -> None: 
ipex_modules.PagedAttention.reshape_and_cache( @@ -170,8 +170,8 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, *args, ) -> None: block_size = value_cache.shape[2] diff --git a/vllm/attention/ops/nki_flash_attn.py b/vllm/attention/ops/nki_flash_attn.py new file mode 100644 index 0000000000000..9de4ef7f5a140 --- /dev/null +++ b/vllm/attention/ops/nki_flash_attn.py @@ -0,0 +1,682 @@ +from dataclasses import dataclass + +import neuronxcc.nki.isa as nisa +import neuronxcc.nki.language as nl +import numpy as np +from neuronxcc import nki +from neuronxcc.nki.language import par_dim + + +@dataclass(frozen=True) +class FlashConfig: + """ + Config class for flash attention with default values + """ + + seq_tile_size: int = 2048 + should_transpose_v: bool = False + + __annotations__ = { + "seq_tile_size": int, + "should_transpose_v": bool, + } + + +@nki.jit +def transpose_p_local(p_local_transposed, + p_local, + LARGE_TILE_SZ, + forward_mask, + B_F_SIZE=512): + for i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): + if nisa.get_nc_version() == nisa.nc_version.gen3: + p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), + buffer=nl.sbuf, + dtype=p_local.dtype) + else: + p_local_t_tmp = nl.ndarray((par_dim(128), B_F_SIZE), + buffer=nl.psum, + dtype=np.float32) + + for j in nl.affine_range(B_F_SIZE // 128): + j_128_slice = nl.ds(j * 128, 128) + i_j_128_slice = nl.ds(i * B_F_SIZE + j * 128, 128) + + if nisa.get_nc_version() == nisa.nc_version.gen3: + p_local_t_tmp[:, j_128_slice] = nisa.dma_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) + else: + p_local_t_tmp[:, j_128_slice] = nisa.nc_transpose( + p_local[:, i_j_128_slice], mask=forward_mask) + + p_local_transposed[:, nl.ds(i * B_F_SIZE, B_F_SIZE)] = nl.copy( + p_local_t_tmp, dtype=p_local_transposed.dtype, mask=forward_mask) + + +@nki.jit +def _flash_attention_core( + q_local_tile, + k, + v, + q_h_per_k_h, + seqlen_q, + nheads, + o_buffer, + l_buffer, + m_buffer, + batch_id, + head_id, + gqa_head_idx, + q_tile_idx, + local_k_large_tile_idx, + kernel_dtype, + acc_type, + flash_config: FlashConfig, + use_causal_mask=False, + continuous_batching_mask=None, + initialize=False, + B_P_SIZE=128, + B_F_SIZE=512, + B_D_SIZE=128, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + qk_res_buffer=None, +): + """ + The flash attention core function to calculate self attention between a tile + of q and a block of K and V. + The q_local_tile has (B_P_SIZE, B_F_SIZE), which is loaded into the SBUF + already. The block size of K and V + is defined in the seq_tile_size of the flash_config. The results are stored + in the following three buffers + o_buffer: (B_P_SIZE, d) + l_buffer: (B_P_SIZE, 1) + m_buffer: (B_P_SIZE, 1) + """ + LARGE_TILE_SZ = flash_config.seq_tile_size + num_k_tile_per_large_tile = LARGE_TILE_SZ // B_F_SIZE + seqlen_k = k.shape[-1] + seqlen_q // B_P_SIZE + seqlen_k // B_F_SIZE + + # TODO : support logit_bias with continuous_batching_mask + assert not use_causal_mask, "causal mask is not supported." + assert (continuous_batching_mask + is not None), "continuous_batching_mask input is required." + if continuous_batching_mask is not None: + assert ( + logit_bias_tile + is None), "continuous_batching_mask does not support logit_bias!" 
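The o_buffer/l_buffer/m_buffer accumulators described in the docstring above implement the usual streaming-softmax recurrence. As a plain-PyTorch reference of that recurrence (a sketch only; the tile size, names, and the final o / s normalization are illustrative, while the kernel itself keeps l = m + log(s)):

import torch

def online_attention_reference(q, k, v, tile=512):
    # q: [Lq, d], k/v: [Lk, d]; computes softmax(q @ k.T / sqrt(d)) @ v one K/V
    # tile at a time with a running max (m), a running sum (s) and an
    # unnormalized output accumulator (o).
    Lq, d = q.shape
    m = torch.full((Lq, 1), float("-inf"))
    s = torch.zeros(Lq, 1)
    o = torch.zeros(Lq, d)
    for start in range(0, k.shape[0], tile):
        scores = q @ k[start:start + tile].T / d ** 0.5
        m_new = torch.maximum(m, scores.max(dim=-1, keepdim=True).values)
        alpha = torch.exp(m - m_new)          # rescale previously accumulated results
        p = torch.exp(scores - m_new)
        s = s * alpha + p.sum(dim=-1, keepdim=True)
        o = o * alpha + p @ v[start:start + tile]
        m = m_new
    return o / s

q, k, v = torch.randn(128, 64), torch.randn(2048, 64), torch.randn(2048, 64)
expected = torch.softmax(q @ k.T / 64 ** 0.5, dim=-1) @ v
assert torch.allclose(online_attention_reference(q, k, v), expected, atol=1e-4)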
+ + # mask are used to only apply computation to the lower half of the matrix, + # which reduce the arithmetic intensity by half + forward_mask = (q_tile_idx * B_P_SIZE >= local_k_large_tile_idx * + LARGE_TILE_SZ if use_causal_mask else None) + + qk_res_buf = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + buffer=nl.sbuf, + dtype=acc_type) + max_local = nl.ndarray((par_dim(B_P_SIZE), num_k_tile_per_large_tile), + dtype=acc_type) + for k_i in nl.affine_range(num_k_tile_per_large_tile): + k_i_b_f_slice = nl.ds(k_i * B_F_SIZE, B_F_SIZE) + + qk_psum = nl.zeros((par_dim(B_P_SIZE), B_F_SIZE), + dtype=np.float32, + buffer=nl.psum) # (128, 512) + qk_psum[:, :] = nl.matmul(q_local_tile, + k[:, k_i_b_f_slice], + transpose_x=True, + mask=None) # (p(128), 512) + + qk_res_buf[:, k_i_b_f_slice] = nl.where( + continuous_batching_mask[:, k_i_b_f_slice], + qk_psum[:, nl.ds(0, B_F_SIZE)], + -9984.0, + dtype=acc_type, + ) + + # Calculate max of the current tile + max_local[:, k_i] = nisa.tensor_reduce( + np.max, + qk_res_buf[:, k_i_b_f_slice], + axis=(1, ), + dtype=acc_type, + negate=False, + mask=forward_mask, + ) + + if qk_res_buffer is not None: + qk_res_buffer[:, :] = nl.copy(qk_res_buf[:, :]) + + max_ = nisa.tensor_reduce( + np.max, + max_local[:, :], + axis=(1, ), + dtype=acc_type, + negate=False, + mask=forward_mask, + ) + + o_previous_scaled = nl.ndarray((par_dim(B_P_SIZE), B_D_SIZE), + dtype=o_buffer.dtype) + + if initialize: + m_buffer[:, 0] = nl.copy(max_) + m_current = max_ + else: + m_previous = nl.copy(m_buffer[:, 0]) + m_buffer[:, 0] = nl.maximum(m_previous, max_, + mask=forward_mask) # (128,1) + + m_current = m_buffer[:, 0] + # Compute scaling factor + alpha = nisa.activation( + np.exp, + m_previous, + bias=-1 * m_current, + scale=1.0, + mask=forward_mask, + ) + o_previous_scaled[...] = nl.multiply(o_buffer[:, :], + alpha, + mask=forward_mask) + + p_local = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + REDUCTION_TILE = min(2048, LARGE_TILE_SZ // 2) + + p_partial_sum = nl.ndarray( + (par_dim(B_P_SIZE), LARGE_TILE_SZ // REDUCTION_TILE), dtype=acc_type) + + for k_r_i in nl.affine_range(LARGE_TILE_SZ // REDUCTION_TILE): + k_r_i_reduce_slice = nl.ds(k_r_i * REDUCTION_TILE, REDUCTION_TILE) + + # compute exp(qk - max) + # Compute partial row - tile sum of exp(qk - max)) + # FIXME : Use activation accumulate to accumulate over k_r_i loop ? 
+ p_local[:, k_r_i_reduce_slice] = nisa.activation_reduce( + np.exp, + qk_res_buf[:, k_r_i_reduce_slice], + bias=-1 * m_current, + scale=1.0, + reduce_op=nl.add, + reduce_res=p_partial_sum[:, k_r_i], + dtype=kernel_dtype, + mask=forward_mask, + ) + + ps = nl.sum(p_partial_sum, axis=1, dtype=acc_type, mask=forward_mask) + + p_local_transposed = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + transpose_p_local( + p_local_transposed=p_local_transposed, + p_local=p_local, + LARGE_TILE_SZ=LARGE_TILE_SZ, + forward_mask=forward_mask, + B_F_SIZE=B_F_SIZE, + ) + + pv_psum = nl.zeros((par_dim(B_P_SIZE), B_D_SIZE), + dtype=np.float32, + buffer=nl.psum) + for k_i in nl.affine_range(LARGE_TILE_SZ // B_P_SIZE): + pv_psum[:, :] += nl.matmul( + p_local_transposed[:, nl.ds(k_i * B_P_SIZE, B_P_SIZE)], + v[k_i, :, :], + transpose_x=True, + mask=forward_mask, + ) # (128, 128) (p(Br), d) + + if initialize: + o_buffer[:, :] = nl.copy(pv_psum[:, :]) + l_buffer[:, 0] = nl.add(nl.log(ps), max_) + else: + o_buffer[:, :] = nl.add(o_previous_scaled, pv_psum, mask=forward_mask) + + l_prev = l_buffer[:, 0] + l_exp = nl.add( + nl.exp( + nl.subtract(l_prev, m_current, mask=forward_mask), + mask=forward_mask, + ), + ps, + mask=forward_mask, + ) + l_buffer[:, 0] = nl.add(m_current, + nl.log(l_exp, mask=forward_mask), + mask=forward_mask) + + +@nki.jit +def load_v_tile(v_hbm_tile, cur_v_tile, j, v_i, config): + LARGE_TILE_SZ = config.seq_tile_size + B_P_SIZE = 128 + + if not config.should_transpose_v: + cur_v_tile[v_i, :, :] = nl.load( + v_hbm_tile[nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE), :], + dtype=cur_v_tile.dtype, + ) + return + + if nisa.get_nc_version() == nisa.nc_version.gen3: + cur_v_tile_transposed = nisa.dma_transpose( + v_hbm_tile[:, + nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)]) + cur_v_tile[v_i, :, :] = nisa.tensor_copy(cur_v_tile_transposed, + dtype=cur_v_tile.dtype) + return + + cur_v_tile[v_i, :, :] = nl.load_transpose2d( + v_hbm_tile[:, nl.ds(j * LARGE_TILE_SZ + B_P_SIZE * v_i, B_P_SIZE)], + dtype=cur_v_tile.dtype, + ) + + +@nki.jit +def flash_paged_attention( + query, + key, + value, + key_cache, + value_cache, + block_tables, + mask, + softmax_scale=None, + mixed_precision=True, + config=None, + return_debug_tensors=False, +): + """ + Flash PagedAttention Forward Kernel. + - PagedAttention Paper: https://arxiv.org/abs/2309.06180 + - Chunked Prefill Paper: https://arxiv.org/abs/2403.02310 + + IO tensor layouts: + - query: shape (1, n_heads, d, seq_q) + - key: shape (1, n_kv_heads, d, seq_k) + - value: shape (1, n_kv_heads, seq_v, d) + - key_cache: (num_blocks, block_size, n_kv_heads, d) + - value_cache: (num_blocks, block_size, n_kv_heads, d) + - block_tables: (num_active_blocks, ) + - mask: (seq_q, num_active_blocks * block_size) + - o: shape (1, n_heads, seq_q, d) + - l_m: shape (1, n_heads, seq_q, 2) + + - This kernel requires seq_k == seq_v + - We use continuous batching by default, so the batch dimension is + always 1, and different requests are concatenated along sequence + dimension. + - We use paged cache blocks (key_cache, value_cache) to store KV cache. + + IO tensor dtypes: + - This kernel assumes all IO tensors have the same dtype except for + block_tables (int32) and mask (int32) + - If mixed_percision is True, then all Tensor Engine operation will be + performed in bfloat16 and accumulation will be performed in float32. + Otherwise the intermediates will be in the same type as the inputs. 
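To make the tensor layouts listed in the docstring above concrete, here is a toy set of shapes (all sizes are assumed, chosen only for illustration) that lines up with the 128-token query tiles and the default 2048-token KV tile size used below:

import numpy as np

n_heads, n_kv_heads, d, seq_q = 8, 2, 128, 128
num_blocks, block_size, num_active_blocks = 128, 32, 64

query = np.zeros((1, n_heads, d, seq_q), dtype=np.float16)            # (1, n_heads, d, seq_q)
key_cache = np.zeros((num_blocks, block_size, n_kv_heads, d), dtype=np.float16)
value_cache = np.zeros_like(key_cache)
block_tables = np.arange(num_active_blocks, dtype=np.int32)           # one entry per active block

context_kv_len = num_active_blocks * block_size                       # 2048 paged context tokens
assert context_kv_len % 2048 == 0 and seq_q % 128 == 0 and d <= 128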
+ + Compile-time Constants: + - softmax_scale: scaling for softmax, is None, default is `1.0/(d**0.5)` + - mixed_precision: flag to set non-matmul ops in fp32 precision, default + is set to `true`, if false, we use same precision as input types + - config: Instance of dataclass :class:`nki.kernels.attention.FlashConfig` + with Performance config parameters for flash attention with default + values + seq_tile_size: `default=2048`, size of the kv tile size for attention + computation reduction + + GQA support Notes: + the spmd kernel for launching kernel should be on kv_heads instead of + nheads + + Example usage: + MHA: q: [b, h, d, s], k: [b, h, d, s], v: [b, h, s, d] + usage: `flash_fwd[b, h](q, k, v, ...)` + GQA: q: [b, h, d, s], k: [b, kv_h, d, s], v: [b, kv_h, s, d] + usage: `flash_fwd[b, kv_h](q, k, v, ...)` + """ + config = config or FlashConfig() + B_F_SIZE = 512 + B_P_SIZE = 128 + b, h, d, seqlen_q = query.shape + B_D_SIZE = d + LARGE_TILE_SZ = config.seq_tile_size + n_tile_q = seqlen_q // B_P_SIZE # since q will be loaded on tensor engine + num_blocks, block_size, k_h, _ = key_cache.shape + q_h_per_k_h = h // k_h + assert tuple(key_cache.shape) == ( + num_blocks, + block_size, + k_h, + d, + ), "Input shape mismatch!" + assert tuple(value_cache.shape) == ( + num_blocks, + block_size, + k_h, + d, + ), "Input shape mismatch!" + assert b == 1, f"invalid batch size {b=}" + assert d <= 128, f" we do not support head_dim > 128, got head dim {d}" + kernel_dtype = nl.bfloat16 if mixed_precision else query.dtype + acc_type = np.dtype(np.float32) if mixed_precision else kernel_dtype + + o = nl.ndarray((b, h, seqlen_q, d), + dtype=query.dtype, + buffer=nl.shared_hbm) + hbm_l_buffer, hbm_m_buffer, hbm_qk_res, qk_res_buffer = ( + None, + None, + None, + None, + ) + if return_debug_tensors: + hbm_l_buffer = nl.ndarray((b, h, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + hbm_m_buffer = nl.ndarray((b, h, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + hbm_qk_res = nl.ndarray((b, h, B_P_SIZE, seqlen_q), + dtype=acc_type, + buffer=nl.shared_hbm) + qk_res_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), seqlen_q), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + + assert ( + nl.program_ndim() == 2 + ), f"Expect spmd grid with 2 dimensions, got {nl.program_ndim()} instead!" 
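The GQA note above says the SPMD grid iterates over KV heads, with each program instance covering its group of query heads. A small standalone helper (illustrative, not part of the kernel) showing the head indexing used in the loops below:

def query_heads_for_kv_head(head_id: int, n_heads: int, n_kv_heads: int):
    # Query head index used below: head_id * q_h_per_k_h + i_q_h.
    assert n_heads % n_kv_heads == 0
    q_h_per_k_h = n_heads // n_kv_heads
    return [head_id * q_h_per_k_h + i_q_h for i_q_h in range(q_h_per_k_h)]

# With 32 query heads grouped onto 8 KV heads, the instance for KV head 2
# covers query heads 8..11; the kernel is launched with one instance per KV
# head, e.g. flash_paged_attention[1, 8](...).
assert query_heads_for_kv_head(2, 32, 8) == [8, 9, 10, 11]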
+ batch_id = nl.program_id(axis=0) + head_id = nl.program_id(axis=1) + + softmax_scale = softmax_scale or (1.0 / (d**0.5)) + + (num_active_blocks, ) = block_tables.shape + context_kv_len = num_active_blocks * block_size + assert (config.seq_tile_size >= 512 + ), f" seq tile_size {config.seq_tile_size} cannot be less than 512" + assert (context_kv_len % LARGE_TILE_SZ == 0 + ), f"Need {context_kv_len=} to be divisible by {LARGE_TILE_SZ=}" + assert ( + LARGE_TILE_SZ % B_P_SIZE == 0 + ), f"Need LARGE_TILE_SZ ({LARGE_TILE_SZ}) to be divisible by {B_P_SIZE=}" + assert (B_P_SIZE % block_size == 0 + ), f"Need B_P_SIZE ({B_P_SIZE}) to be divisible by {block_size=}" + num_large_k_tile = context_kv_len // LARGE_TILE_SZ + num_blocks_per_large_tile = LARGE_TILE_SZ // block_size + assert (num_blocks_per_large_tile <= B_P_SIZE + ), f"The number of blocks in each large tile " \ + f"({num_blocks_per_large_tile}) shouldn't exceed partition size {B_P_SIZE}" + + block_tables_sbuf = nl.full((par_dim(B_P_SIZE), num_large_k_tile), + 0, + dtype=np.int32, + buffer=nl.sbuf) + for j in nl.affine_range(num_large_k_tile): + i_p = nl.arange(num_blocks_per_large_tile)[:, None] + block_tables_sbuf[i_p, j] = nl.load( + block_tables[j * num_blocks_per_large_tile + i_p], dtype=np.int32) + + # Global Flash Attention accumulators + o_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), d), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + l_buffer = nl.zeros( + (par_dim(B_P_SIZE), n_tile_q, q_h_per_k_h), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + m_buffer = nl.zeros( + (n_tile_q, q_h_per_k_h, par_dim(B_P_SIZE), 1), + dtype=acc_type, + buffer=nl.sbuf, + lazy_initialization=True, + ) + + for j in nl.sequential_range(0, num_large_k_tile): + cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + cur_v_tile = nl.ndarray( + (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE), + dtype=kernel_dtype, + ) + + for k_i in nl.affine_range(num_blocks_per_large_tile): + loaded = nl.load(key_cache[block_tables_sbuf[k_i, j], :, + head_id, :]) + cur_k_tile[:, nl.ds(k_i * + block_size, block_size)] = nl.transpose(loaded) + + load_tile_size = B_P_SIZE + num_blocks_per_partition = load_tile_size // block_size + for partition_idx in nl.affine_range(LARGE_TILE_SZ // load_tile_size): + for block_in_partition in nl.affine_range( + num_blocks_per_partition): + v_i = (partition_idx * num_blocks_per_partition + + block_in_partition) + loaded_v = nl.load(value_cache[block_tables_sbuf[v_i, j], :, + head_id, :]) + cur_v_tile[ + partition_idx, + nl.ds(block_in_partition * block_size, block_size), + :, + ] = loaded_v + + cur_mask = nl.ndarray((par_dim(B_P_SIZE), LARGE_TILE_SZ), + dtype=mask.dtype) + for m_i in nl.affine_range(LARGE_TILE_SZ // B_F_SIZE): + cur_mask[:, nl.ds(m_i * B_F_SIZE, B_F_SIZE)] = nl.load( + mask[:, nl.ds(j * LARGE_TILE_SZ + m_i * B_F_SIZE, B_F_SIZE)]) + + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) + q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] + q_sbuf_tile = nl.load( + q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], + dtype=kernel_dtype, + ) # load (d, 128) tile in SBUF + q_tile[:, :] = q_sbuf_tile * softmax_scale + + _flash_attention_core( + q_local_tile=q_tile, + k=cur_k_tile, + v=cur_v_tile, + q_h_per_k_h=q_h_per_k_h, + seqlen_q=seqlen_q, + nheads=h, + o_buffer=o_buffer[i, i_q_h], + l_buffer=l_buffer[:, i, i_q_h], + 
m_buffer=m_buffer[i, i_q_h], + batch_id=batch_id, + head_id=head_id, + gqa_head_idx=i_q_h, + q_tile_idx=i, + local_k_large_tile_idx=j, + kernel_dtype=kernel_dtype, + acc_type=acc_type, + flash_config=config, + use_causal_mask=False, + continuous_batching_mask=cur_mask, + initialize=j == 0, + B_P_SIZE=B_P_SIZE, + B_F_SIZE=B_F_SIZE, + B_D_SIZE=B_D_SIZE, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + ) + + # compute attention between input query, key and value + if key is not None and value is not None: + B_F_SIZE = seqlen_q + LARGE_TILE_SZ = seqlen_q + active_config = FlashConfig( + seq_tile_size=LARGE_TILE_SZ, + should_transpose_v=config.should_transpose_v, + ) + + cur_k_tile = nl.ndarray((par_dim(B_D_SIZE), LARGE_TILE_SZ), + dtype=kernel_dtype) + cur_v_tile = nl.ndarray( + (LARGE_TILE_SZ // B_P_SIZE, par_dim(B_P_SIZE), B_D_SIZE), + dtype=kernel_dtype, + ) + + cur_k_tile[:, :] = nl.load(key[batch_id, head_id, :, :]) + + load_tile_size = B_P_SIZE + v_hbm_tile = value[batch_id, head_id] + for v_i in nl.affine_range(LARGE_TILE_SZ // load_tile_size): + load_v_tile( + v_hbm_tile=v_hbm_tile, + cur_v_tile=cur_v_tile, + j=0, + v_i=v_i, + config=active_config, + ) + + cur_mask = nl.ndarray((par_dim(B_P_SIZE), B_F_SIZE), dtype=mask.dtype) + cur_mask[:, :] = nl.load(mask[:, nl.ds(context_kv_len, B_F_SIZE)]) + + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + q_tile = nl.ndarray((B_D_SIZE, B_P_SIZE), dtype=kernel_dtype) + q_hbm_tile = query[batch_id, head_id * q_h_per_k_h + i_q_h] + q_sbuf_tile = nl.load( + q_hbm_tile[:, nl.ds(i * B_P_SIZE, B_P_SIZE)], + dtype=kernel_dtype, + ) # load (d, 128) tile in SBUF + q_tile[:, :] = q_sbuf_tile * softmax_scale + _flash_attention_core( + q_local_tile=q_tile, + k=cur_k_tile, + v=cur_v_tile, + q_h_per_k_h=q_h_per_k_h, + seqlen_q=seqlen_q, + nheads=h, + o_buffer=o_buffer[i, i_q_h], + l_buffer=l_buffer[:, i, i_q_h], + m_buffer=m_buffer[i, i_q_h], + batch_id=batch_id, + head_id=head_id, + gqa_head_idx=i_q_h, + q_tile_idx=i, + local_k_large_tile_idx=0, + kernel_dtype=kernel_dtype, + acc_type=acc_type, + flash_config=active_config, + use_causal_mask=False, + continuous_batching_mask=cur_mask, + initialize=False, + B_P_SIZE=B_P_SIZE, + B_F_SIZE=B_F_SIZE, + B_D_SIZE=B_D_SIZE, + dropout_p=0.0, + dropout_p_tensor=None, + seed_tensor=None, + logit_bias_tile=None, + qk_res_buffer=qk_res_buffer[i, i_q_h] + if qk_res_buffer is not None else None, + ) + + # -- -- -- -- write output to buffer on HBM -- -- -- -- -- -- # + for i_q_h in nl.affine_range(q_h_per_k_h): + for i in nl.affine_range(n_tile_q): + out = nl.multiply( + o_buffer[i, i_q_h, :, :], + nl.exp(m_buffer[i, i_q_h, :, :] - l_buffer[:, i, i_q_h]), + dtype=kernel_dtype, + ) + + nl.store( + o[ + batch_id, + head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), + :, + ], + out, + ) + # maximum and summation statistics + if return_debug_tensors: + nl.store( + hbm_m_buffer[ + batch_id, + head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), + ], + m_buffer[i, i_q_h, :, :], + ) + nl.store( + hbm_l_buffer[ + batch_id, + head_id * q_h_per_k_h + i_q_h, + nl.ds(i * B_P_SIZE, B_P_SIZE), + ], + l_buffer[:, i, i_q_h], + ) + nl.store( + hbm_qk_res[batch_id, head_id * q_h_per_k_h + i_q_h, :, :], + qk_res_buffer[batch_id, i_q_h, :, :], + ) + + if return_debug_tensors: + return o, hbm_m_buffer, hbm_l_buffer, hbm_qk_res + return o + + +def flash_attn_varlen_nkifunc( + query, + key, + value, + key_cache, + value_cache, + block_table, + 
attn_mask, + n_kv_head=None, + head_size=None, + B_P_SIZE=128, + LARGE_TILE_SZ=2048, + return_debug_tensors=False, + mixed_precision=True, +): + config = FlashConfig( + seq_tile_size=LARGE_TILE_SZ, + should_transpose_v=False, + ) + kwargs = dict( + query=query, + key=key, + value=value, + key_cache=key_cache, + value_cache=value_cache, + block_tables=block_table, + mask=attn_mask, + softmax_scale=1.0 / (head_size**0.5), + config=config, + mixed_precision=mixed_precision, + return_debug_tensors=return_debug_tensors, + ) + _, n_kv_head, _, _ = key.shape + + if return_debug_tensors: + o, *debug_tensors = flash_paged_attention[1, n_kv_head](**kwargs) + return o, *debug_tensors + else: + o = flash_paged_attention[1, n_kv_head](**kwargs) + return o diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 076f151ffcb61..fd62329141f6f 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -69,8 +69,8 @@ def write_to_paged_cache( value_cache: torch.Tensor, slot_mapping: torch.Tensor, kv_cache_dtype: str, - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, ) -> None: ops.reshape_and_cache( key, @@ -95,8 +95,8 @@ def forward_decode( num_kv_heads: int, scale: float, alibi_slopes: Optional[torch.Tensor], - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, tp_rank: int = 0, blocksparse_local_blocks: int = 0, blocksparse_vert_stride: int = 0, @@ -204,8 +204,8 @@ def forward_prefix( max_query_len: int, alibi_slopes: Optional[torch.Tensor], sliding_window: Optional[int], - k_scale: float, - v_scale: float, + k_scale: torch.Tensor, + v_scale: torch.Tensor, ) -> torch.Tensor: output = torch.empty_like(query) context_attention_fwd( diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 9c11a8df55278..ec3c8459c43ef 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -133,7 +133,7 @@ def _fwd_kernel( other=0.0) # [D,N] if k_load.dtype.is_fp8(): - k = (k_load.to(tl.float32) * k_scale).to(q.dtype) + k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype) else: k = k_load @@ -181,7 +181,7 @@ def _fwd_kernel( ((start_n + offs_n[:, None]) < cur_batch_ctx_len), other=0.0) # [N,D] if v_load.dtype.is_fp8(): - v = (v_load.to(tl.float32) * v_scale).to(q.dtype) + v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype) else: v = v_load p = p.to(v.dtype) @@ -219,8 +219,8 @@ def _fwd_kernel( float("-inf")) if SLIDING_WINDOW > 0: qk = tl.where( - offs_m[:, None] - - (start_n + offs_n[None, :]) < SLIDING_WINDOW, qk, -10000) + offs_m[:, None] - (start_n + offs_n[None, :]) + < SLIDING_WINDOW, qk, -10000) # -- compute m_ij, p, l_ij m_ij = tl.max(qk, 1) @@ -324,10 +324,10 @@ def _fwd_kernel_flash_attn_v2( (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + cur_head * stride_qh + offs_d[None, :] * stride_qd) - q = tl.load( - Q + off_q, - mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len, - other=0.0) + q = tl.load(Q + off_q, + mask=offs_m[:, None] + < cur_batch_seq_len - cur_batch_ctx_len, + other=0.0) # # initialize pointer to m and l m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") @@ -402,8 +402,8 @@ def _fwd_kernel_flash_attn_v2( # -- compute qk ---- k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, - mask=(start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len, 
other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -430,8 +430,8 @@ def _fwd_kernel_flash_attn_v2( # update acc v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, - mask=(start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len, + mask=(start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len, other=0.0) p = p.to(v.dtype) @@ -564,7 +564,7 @@ def _fwd_kernel_alibi( other=0.0) # [D,N] if k_load.dtype.is_fp8(): - k = (k_load.to(tl.float32) * k_scale).to(q.dtype) + k = (k_load.to(tl.float32) * tl.load(k_scale)).to(q.dtype) else: k = k_load @@ -604,7 +604,7 @@ def _fwd_kernel_alibi( ((start_n + offs_n[:, None]) < cur_batch_ctx_len), other=0.0) if v_load.dtype.is_fp8(): - v = (v_load.to(tl.float32) * v_scale).to(q.dtype) + v = (v_load.to(tl.float32) * tl.load(v_scale)).to(q.dtype) else: v = v_load p = p.to(v.dtype) @@ -639,8 +639,8 @@ def _fwd_kernel_alibi( k = tl.load(k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, mask=dim_mask[:, None] & - ((start_n + offs_n[None, :]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[None, :]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) @@ -677,8 +677,8 @@ def _fwd_kernel_alibi( v = tl.load(v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, mask=dim_mask[None, :] & - ((start_n + offs_n[:, None]) < - cur_batch_seq_len - cur_batch_ctx_len), + ((start_n + offs_n[:, None]) + < cur_batch_seq_len - cur_batch_ctx_len), other=0.0) p = p.to(v.dtype) @@ -713,8 +713,8 @@ def context_attention_fwd(q, b_seq_len, b_ctx_len, max_input_len, - k_scale: float = 1.0, - v_scale: float = 1.0, + k_scale: torch.Tensor, + v_scale: torch.Tensor, alibi_slopes=None, sliding_window=None): diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py index f94211116a746..ef04603f22b6e 100644 --- a/vllm/attention/ops/triton_flash_attention.py +++ b/vllm/attention/ops/triton_flash_attention.py @@ -627,8 +627,8 @@ def attn_fwd( causal_start_idx, dtype=tl.int32) mask_m_offsets = start_m_idx + tl.arange(0, BLOCK_M) - out_ptrs_mask = (mask_m_offsets[:, None] >= - out_mask_boundary[None, :]) + out_ptrs_mask = (mask_m_offsets[:, None] + >= out_mask_boundary[None, :]) z = 0.0 acc = tl.where(out_ptrs_mask, acc, z.to(acc.type.element_ty)) # write back LSE diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 81ea6eefb5410..1376274d57777 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,6 +1,6 @@ import os from contextlib import contextmanager -from functools import lru_cache +from functools import cache from typing import Generator, Optional, Type import torch @@ -100,7 +100,7 @@ def get_attn_backend( ) -@lru_cache(maxsize=None) +@cache def _cached_get_attn_backend( head_size: int, dtype: torch.dtype, diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 87655530cead4..7f4f97466d503 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -25,23 +25,30 @@ logger = init_logger(__name__) +@dataclasses.dataclass +class InductorArtifact: + hash_str: str = "" + file_path: str = "" + + class InductorHashCache: """ Disk format: a Python list of tuples, each tuple is - (runtime_shape, graph_index, hash_str) + (runtime_shape, graph_index, hash_str, file_path) We use list of tuple for readability. 
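Since the disk format described above is a plain Python literal, pprint plus ast.literal_eval is all the serialization machinery needed. A toy round-trip, independent of the real class (hash strings and file paths here are made up):

import ast
import pprint
from collections import defaultdict

# Toy in-memory cache: runtime_shape -> graph_index -> (hash_str, file_path).
cache = defaultdict(dict)
cache[None][0] = ("fx_hash_abc", "/tmp/fx_graph_0.py")
cache[8][1] = ("fx_hash_def", "/tmp/fx_graph_1.py")

rows = [(shape, idx, hash_str, path)
        for shape, graphs in cache.items()
        for idx, (hash_str, path) in graphs.items()]
text = pprint.pformat(rows)         # human-readable Python literal on disk
restored = ast.literal_eval(text)   # safe to parse back; no eval() required
assert set(restored) == set(rows)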
In-memory format: a defaultdict of dict, where the key is runtime_shape, and the value is a dict of graph_index to hash_str. - The data is essentially `Dict[Optional[int], Dict[int, str]]`, + The data is essentially `Dict[Optional[int], Dict[int, InductorArtifact]]`, we don't use json here because json doesn't support int as key. TODO: better off-the-shelf solution to serialize the data? """ def __init__(self, cache_dir: str, disabled: bool = False): - self.cache: defaultdict = defaultdict(dict) + self.cache: Dict[Optional[int], + Dict[int, InductorArtifact]] = defaultdict(dict) self.disabled = disabled self.cache_dir = cache_dir self.cache_file_path = os.path.join(cache_dir, @@ -66,14 +73,25 @@ def deserialize(self, data: str): # because it is a safe way to parse Python literals. # do not use eval(), it is unsafe. list_data = ast.literal_eval(data) - for runtime_shape, graph_index, hash_str in list_data: - self.cache[runtime_shape][graph_index] = hash_str + for item in list_data: + runtime_shape = item[0] + graph_index = item[1] + hash_str = item[2] + # for compatibility of old version, + # where we don't have file_path. + # NOTE: after running the new code, the file_path + # will be updated. + file_path = "" if len(item) == 3 else item[3] + self.cache[runtime_shape][graph_index] = InductorArtifact( + hash_str=hash_str, file_path=file_path) def serialize(self) -> str: data = [] - for runtime_shape, graph_index_to_hash_str in self.cache.items(): - for graph_index, hash_str in graph_index_to_hash_str.items(): - data.append((runtime_shape, graph_index, hash_str)) + for runtime_shape, value in self.cache.items(): + for graph_index, inductor_artifact in value.items(): + data.append( + (runtime_shape, graph_index, inductor_artifact.hash_str, + inductor_artifact.file_path)) printer = pprint.PrettyPrinter(indent=4) return printer.pformat(data) @@ -90,13 +108,14 @@ def __contains__(self, key: Tuple[Optional[int], int]) -> bool: return runtime_shape in self.cache and graph_index in self.cache[ runtime_shape] - def __getitem__(self, key: Tuple[Optional[int], int]) -> str: + def __getitem__(self, key: Tuple[Optional[int], int]) -> InductorArtifact: if self.disabled: raise KeyError("cannot read from disabled cache") runtime_shape, graph_index = key return self.cache[runtime_shape][graph_index] - def __setitem__(self, key: Tuple[Optional[int], int], value: str): + def __setitem__(self, key: Tuple[Optional[int], int], + value: InductorArtifact): # setitem for disabled cache is fine, because we # don't actually write to the disk runtime_shape, graph_index = key @@ -181,7 +200,8 @@ def wrap_inductor(graph: fx.GraphModule, if (runtime_shape, graph_index) in cache_data: # we compiled this graph before # so we can directly lookup the compiled graph via hash - hash_str = cache_data[(runtime_shape, graph_index)] + inductor_artifact = cache_data[(runtime_shape, graph_index)] + hash_str = inductor_artifact.hash_str if graph_index == 0: # adds some info logging for the first graph logger.info( @@ -199,6 +219,7 @@ def wrap_inductor(graph: fx.GraphModule, "Inductor cache lookup failed. Please remove" f"the cache file {cache_data.cache_file_path} and try again." # noqa ) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa # Inductor calling convention (function signature): # f(list) -> tuple @@ -224,19 +245,20 @@ def compiled_graph(*args): # the assumption is that we don't have nested Inductor compilation. 
# compiled_fx_graph_hash will only be called once, and we can hook # it to get the hash of the compiled graph directly. - from torch._inductor.codecache import compiled_fx_graph_hash + + inductor_artifact = InductorArtifact() + from torch._inductor.codecache import (FxGraphCache, + compiled_fx_graph_hash) + original_load = FxGraphCache.load + + def hijack_load(*args, **kwargs): + inductor_compiled_graph = original_load(*args, **kwargs) + inductor_artifact.file_path = inductor_compiled_graph.current_callable.__code__.co_filename # noqa + return inductor_compiled_graph def hijack_compiled_fx_graph_hash(*args, **kwargs): out = compiled_fx_graph_hash(*args, **kwargs) - # store the hash in the cache - nonlocal cache_data - cache_data[(runtime_shape, graph_index)] = out[0] - if graph_index == 0: - # adds some info logging for the first graph - logger.info("Cache the graph of shape %s for later use", - str(runtime_shape)) - logger.debug("store the %s-th graph for shape %s via hash %s", - graph_index, str(runtime_shape), out[0]) + inductor_artifact.hash_str = out[0] return out def _check_can_cache(*args, **kwargs): @@ -251,19 +273,45 @@ def _check_can_cache(*args, **kwargs): def _get_shape_env() -> AlwaysHitShapeEnv: return AlwaysHitShapeEnv() - with patch(# for hijacking the hash of the compiled graph - "torch._inductor.codecache.compiled_fx_graph_hash", - hijack_compiled_fx_graph_hash), \ - patch(# for providing a dummy shape environment - "torch._inductor.codecache.FxGraphCache._get_shape_env", - _get_shape_env), \ - patch(# for forcing the graph to be cached - "torch._inductor.codecache.FxGraphCache._check_can_cache", - _check_can_cache): + with ExitStack() as stack: + if not cache_data.disabled: + # compilation cache is enabled, patch several functions + + # hijack to get the compiled graph itself + stack.enter_context( + patch("torch._inductor.codecache.FxGraphCache.load", + hijack_load)) + + # for hijacking the hash of the compiled graph + stack.enter_context( + patch("torch._inductor.codecache.compiled_fx_graph_hash", + hijack_compiled_fx_graph_hash)) + + # for providing a dummy shape environment + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._get_shape_env", + _get_shape_env)) + + # for forcing the graph to be cached + stack.enter_context( + patch( + "torch._inductor.codecache.FxGraphCache._check_can_cache", + _check_can_cache)) + compiled_graph = compile_fx(graph, example_inputs, config_patches=current_config) - + # store the inductor_artifact in the cache + cache_data[(runtime_shape, graph_index)] = inductor_artifact + if graph_index == 0: + # adds some info logging for the first graph + logger.info("Cache the graph of shape %s for later use", + str(runtime_shape)) + logger.debug( + "store the %s-th graph for shape %s via hash %s from file %s", + graph_index, str(runtime_shape), inductor_artifact.hash_str, + inductor_artifact.file_path) # after compiling the last graph, record the end time if graph_index == num_graphs - 1: now = time.time() @@ -476,6 +524,7 @@ def configure_post_pass(self): def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: + vllm_config = self.vllm_config if not self.compilation_config.cache_dir: # no provided cache dir, generate one based on the known factors # that affects the compilation. if none of the factors change, @@ -484,7 +533,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # 1. 
factors come from the vllm_config (it mainly summarizes how the # model is created) - vllm_config = self.vllm_config config_hash = vllm_config.compute_hash() # 2. factors come from the code files that are traced by Dynamo ( @@ -508,20 +556,26 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: hash_key = hashlib.md5( f"{config_hash}_{code_hash}".encode()).hexdigest()[:10] cache_dir = os.path.join( - envs.VLLM_CACHE_ROOT, "torch_compile_cache", hash_key, - f"rank_{vllm_config.parallel_config.rank}") - else: - cache_dir = self.compilation_config.cache_dir + envs.VLLM_CACHE_ROOT, + "torch_compile_cache", + hash_key, + ) + self.compilation_config.cache_dir = cache_dir + + cache_dir = self.compilation_config.cache_dir os.makedirs(cache_dir, exist_ok=True) + local_cache_dir = os.path.join( + cache_dir, f"rank_{vllm_config.parallel_config.rank}") + self.compilation_config.local_cache_dir = local_cache_dir disabled = envs.VLLM_DISABLE_COMPILE_CACHE self.inductor_hash_cache: InductorHashCache = InductorHashCache( - cache_dir, disabled=disabled) + local_cache_dir, disabled=disabled) if disabled: logger.info("vLLM's torch.compile cache is disabled.") else: logger.info("Using cache directory: %s for vLLM's torch.compile", - cache_dir) + local_cache_dir) # when dynamo calls the backend, it means the bytecode # transform and analysis are done @@ -561,6 +615,18 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self.vllm_config, self.graph_pool, self).run(*example_inputs) + graph_path = os.path.join(local_cache_dir, "computation_graph.py") + if not os.path.exists(graph_path): + # code adapted from https://github.com/thuml/depyf/blob/dab831108a752d1facc00acdd6d4243891845c37/depyf/explain/patched_lazy_format_graph_code.py#L30 # noqa + # use `print_readable` because it can include submodules + src = "from __future__ import annotations\nimport torch\n" + \ + self.split_gm.print_readable(print_output=False) + src = src.replace("", "GraphModule") + with open(graph_path, "w") as f: + f.write(src) + + logger.debug("Computation graph saved to %s", graph_path) + self._called = True if not self.compilation_config.use_cudagraph or \ @@ -576,9 +642,13 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: ] # index of tensors that have symbolic shapes (batch size) + # for weights and static buffers, they will have concrete shapes. + # symbolic shape only happens for input tensors. 
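The cache-directory handling above reduces to: hash everything that affects compilation into a short key, share that directory across ranks, and give each rank its own subdirectory for inductor artifacts and the dumped computation_graph.py. A compact sketch of that layout (the cache root and hash inputs here are placeholders):

import hashlib
import os

def compile_cache_dirs(cache_root: str, config_hash: str, code_hash: str, rank: int):
    # Shared directory keyed by a 10-character digest of the config and code
    # factors; per-rank artifacts live one level below it.
    hash_key = hashlib.md5(f"{config_hash}_{code_hash}".encode()).hexdigest()[:10]
    cache_dir = os.path.join(cache_root, "torch_compile_cache", hash_key)
    local_cache_dir = os.path.join(cache_dir, f"rank_{rank}")
    return cache_dir, local_cache_dir

shared, local = compile_cache_dirs(os.path.expanduser("~/.cache/vllm"),
                                   "cfg0123", "code4567", rank=0)
# e.g. ~/.cache/vllm/torch_compile_cache/<hash_key>/rank_0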
+ from torch.fx.experimental.symbolic_shapes import is_symbolic self.sym_tensor_indices = [ i for i, x in enumerate(fake_args) - if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) + if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) and \ + any(is_symbolic(d) for d in x.size()) ] # compiler managed cudagraph input buffers @@ -610,7 +680,7 @@ def copy_and_call(*args): class ConcreteSizeEntry: runtime_shape: int need_to_compile: bool # the size is in compile_sizes - use_cudagraph: bool # the size is in capture_sizes + use_cudagraph: bool # the size is in cudagraph_capture_sizes compiled: bool = False runnable: Callable = None # type: ignore @@ -657,8 +727,8 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, self.compile_sizes: Set[int] = set( self.compilation_config.compile_sizes) - self.capture_sizes: Set[int] = set( - self.compilation_config.capture_sizes + self.cudagraph_capture_sizes: Set[int] = set( + self.compilation_config.cudagraph_capture_sizes ) if self.compilation_config.use_cudagraph else set() self.first_run_finished = False @@ -676,11 +746,11 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, # to_be_compiled_sizes tracks the remaining sizes to compile, # and updates during the compilation process, so we need to copy it self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy() - for shape in self.compile_sizes.union(self.capture_sizes): + for shape in self.compile_sizes.union(self.cudagraph_capture_sizes): self.concrete_size_entries[shape] = ConcreteSizeEntry( runtime_shape=shape, need_to_compile=shape in self.compile_sizes, - use_cudagraph=shape in self.capture_sizes, + use_cudagraph=shape in self.cudagraph_capture_sizes, ) def check_for_ending_compilation(self): diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 10513111ea7f1..17eb0592ced6d 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -76,8 +76,8 @@ def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): During runtime, when we actually mark dimensions of tensors, it depends on the value of arguments: - - if it is a single integer, the corresponding dimension of the argument - will be marked as dynamic. + - if it is a single integer (can be negative), the corresponding dimension + of the argument will be marked as dynamic. - if it is `None`, ignored. - if it is `IntermediateTensors`, all the tensors in the intermediate tensors will be marked as dynamic. 
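Negative dimension indices follow ordinary Python semantics: they are normalized against the tensor's rank before being passed to torch._dynamo.mark_dynamic. A standalone sketch of that normalization (the helper name is invented; only mark_dynamic is the real API):

import torch

def normalize_dims(t: torch.Tensor, dims):
    # -1 means "last dimension", exactly like list indexing.
    dims = [dims] if isinstance(dims, int) else dims
    return [t.ndim + d if d < 0 else d for d in dims]

x = torch.randn(4, 8, 16)
assert normalize_dims(x, -1) == [2]
assert normalize_dims(x, [0, -2]) == [0, 1]
torch._dynamo.mark_dynamic(x, normalize_dims(x, 0))  # mark the batch dimension dynamic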
@@ -177,10 +177,20 @@ def __call__(self, *args, **kwargs): for k, dims in dynamic_arg_dims.items(): arg = bound_args.arguments.get(k) if arg is not None: + dims = [dims] if isinstance(dims, int) else dims if isinstance(arg, torch.Tensor): + # In case dims is specified with negative indexing + dims = [ + arg.ndim + dim if dim < 0 else dim for dim in dims + ] torch._dynamo.mark_dynamic(arg, dims) elif isinstance(arg, IntermediateTensors): for tensor in arg.tensors.values(): + # In case dims is specified with negative indexing + dims = [ + tensor.ndim + dim if dim < 0 else dim + for dim in dims + ] torch._dynamo.mark_dynamic(tensor, dims) else: raise ValueError( @@ -188,6 +198,8 @@ def __call__(self, *args, **kwargs): f" {dims} for argument {k} with type {type(arg)}.") # here, it is the starting point of the `torch.compile` process start_monitoring_torch_compile(self.vllm_config) + logger.debug("Start compiling function %s", + self.original_code_object) # if we don't use custom dispatcher, we can directly call the # compiled function and let torch.compile handle the dispatching, diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index e3260a10c02ae..58a8fa76f6ce2 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -9,6 +9,9 @@ import vllm.envs as envs from vllm.config import CompilationLevel, get_current_vllm_config +from vllm.logger import init_logger + +logger = init_logger(__name__) class TorchCompileWrapperWithCustomDispatcher: @@ -82,6 +85,25 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): return self.compiled_codes.append(new_code) + local_cache_dir = self.vllm_config.compilation_config.local_cache_dir + if isinstance(local_cache_dir, str): + decompiled_file = os.path.join(local_cache_dir, + "transformed_code.py") + if not os.path.exists(decompiled_file): + try: + # usually the decompilation will succeed for most models, + # as we guarantee a full-graph compilation in Dynamo. + # but there's no 100% guarantee, since decompliation is + # not a reversible process. + import depyf + src = depyf.decompile(new_code) + with open(decompiled_file, "w") as f: + f.write(src) + + logger.debug("Dynamo transformed code saved to %s", + decompiled_file) + except Exception: + pass if self.vllm_config.compilation_config.use_cudagraph and \ "update" in new_code.co_names: diff --git a/vllm/config.py b/vllm/config.py index e2ae96dbbab64..4880535793a4d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -67,7 +67,8 @@ _TASK_RUNNER: Dict[_ResolvedTask, RunnerType] = { task: runner - for runner, tasks in _RUNNER_TASKS.items() for task in tasks + for runner, tasks in _RUNNER_TASKS.items() + for task in tasks } HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], @@ -120,11 +121,6 @@ class ModelConfig: decoding draft models. quantization: Quantization method that was used to quantize the model weights. If None, we assume the model weights are not quantized. - quantization_param_path: Path to JSON file containing scaling factors. - Used to load KV cache scaling factors into the model when KV cache - type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also - be used to load activation and weight scaling factors when the - model dtype is FP8_E4M3 on ROCm. enforce_eager: Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid. 
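The wrapper.py hunk above saves the Dynamo-transformed bytecode as readable source on a best-effort basis. A condensed sketch of that idea (function name and error handling are illustrative; depyf.decompile is the helper the patch itself relies on):

import os

def dump_transformed_code(code_obj, cache_dir: str) -> None:
    # Decompilation is not guaranteed to succeed, so failures are swallowed
    # and no file is written in that case.
    path = os.path.join(cache_dir, "transformed_code.py")
    if os.path.exists(path):
        return
    try:
        import depyf
        src = depyf.decompile(code_obj)
    except Exception:
        return
    with open(path, "w") as f:
        f.write(src)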
@@ -169,6 +165,8 @@ class ModelConfig: `logits_processors` extra completion argument. Defaults to None, which allows no processors. generation_config: Configuration parameter file for generation. + override_generation_config: Override the generation config with the + given config. """ def compute_hash(self) -> str: @@ -187,7 +185,6 @@ def compute_hash(self) -> str: factors.append(self.model) factors.append(self.dtype) factors.append(self.quantization) - factors.append(self.quantization_param_path) factors.append(self.revision) factors.append(self.code_revision) factors.append(self.trust_remote_code) @@ -195,40 +192,43 @@ def compute_hash(self) -> str: factors.append(self.rope_theta) return hashlib.sha256(str(factors).encode()).hexdigest() - def __init__(self, - model: str, - task: Union[TaskOption, Literal["draft"]], - tokenizer: str, - tokenizer_mode: str, - trust_remote_code: bool, - dtype: Union[str, torch.dtype], - seed: int, - allowed_local_media_path: str = "", - revision: Optional[str] = None, - code_revision: Optional[str] = None, - rope_scaling: Optional[Dict[str, Any]] = None, - rope_theta: Optional[float] = None, - tokenizer_revision: Optional[str] = None, - max_model_len: Optional[int] = None, - spec_target_max_model_len: Optional[int] = None, - quantization: Optional[str] = None, - quantization_param_path: Optional[str] = None, - enforce_eager: Optional[bool] = None, - max_seq_len_to_capture: Optional[int] = None, - max_logprobs: int = 20, - disable_sliding_window: bool = False, - skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, List[str]]] = None, - limit_mm_per_prompt: Optional[Mapping[str, int]] = None, - use_async_output_proc: bool = True, - config_format: ConfigFormat = ConfigFormat.AUTO, - hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[Dict[str, Any]] = None, - disable_mm_preprocessor_cache: bool = False, - override_neuron_config: Optional[Dict[str, Any]] = None, - override_pooler_config: Optional["PoolerConfig"] = None, - logits_processor_pattern: Optional[str] = None, - generation_config: Optional[str] = None) -> None: + def __init__( + self, + model: str, + task: Union[TaskOption, Literal["draft"]], + tokenizer: str, + tokenizer_mode: str, + trust_remote_code: bool, + dtype: Union[str, torch.dtype], + seed: int, + allowed_local_media_path: str = "", + revision: Optional[str] = None, + code_revision: Optional[str] = None, + rope_scaling: Optional[Dict[str, Any]] = None, + rope_theta: Optional[float] = None, + tokenizer_revision: Optional[str] = None, + max_model_len: Optional[int] = None, + spec_target_max_model_len: Optional[int] = None, + quantization: Optional[str] = None, + enforce_eager: Optional[bool] = None, + max_seq_len_to_capture: Optional[int] = None, + max_logprobs: int = 20, + disable_sliding_window: bool = False, + skip_tokenizer_init: bool = False, + served_model_name: Optional[Union[str, List[str]]] = None, + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, + use_async_output_proc: bool = True, + config_format: ConfigFormat = ConfigFormat.AUTO, + hf_overrides: Optional[HfOverrides] = None, + mm_processor_kwargs: Optional[Dict[str, Any]] = None, + disable_mm_preprocessor_cache: bool = False, + override_neuron_config: Optional[Dict[str, Any]] = None, + override_pooler_config: Optional["PoolerConfig"] = None, + logits_processor_pattern: Optional[str] = None, + generation_config: Optional[str] = None, + enable_sleep_mode: bool = False, + override_generation_config: Optional[Dict[str, Any]] = 
None, + ) -> None: self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode @@ -271,12 +271,17 @@ def __init__(self, else: self.tokenizer_revision = tokenizer_revision self.quantization = quantization - self.quantization_param_path = quantization_param_path self.enforce_eager = enforce_eager self.max_seq_len_to_capture = max_seq_len_to_capture self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init + self.enable_sleep_mode = enable_sleep_mode + + from vllm.platforms import current_platform + + if self.enable_sleep_mode and not current_platform.is_cuda(): + raise ValueError("Sleep mode is only supported on CUDA devices.") hf_config = get_config(self.model, trust_remote_code, revision, code_revision, config_format) @@ -309,14 +314,15 @@ def __init__(self, (self.hf_text_config.model_type in ["gemma2", "cohere2"])) if (not self.disable_sliding_window and has_interleaved_attention): - if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": + if (backend := + envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"): sliding_window_len_min = get_min_sliding_window( self.hf_text_config.sliding_window) logger.warning_once( f"{self.hf_text_config.model_type} has interleaved " "attention, which is currently not supported by the " - "XFORMERS backend. Disabling sliding window and capping " + f"{backend} backend. Disabling sliding window and capping " "the max length to the sliding window size " f"({sliding_window_len_min}).") self.disable_sliding_window = True @@ -348,7 +354,6 @@ def __init__(self, self.is_hybrid = self._init_is_hybrid() self.has_inner_state = self._init_has_inner_state() - from vllm.platforms import current_platform if current_platform.is_neuron(): self.override_neuron_config = override_neuron_config else: @@ -366,6 +371,7 @@ def __init__(self, self.logits_processor_pattern = logits_processor_pattern self.generation_config = generation_config + self.override_generation_config = override_generation_config or {} self._verify_quantization() self._verify_cuda_graph() @@ -903,20 +909,31 @@ def get_diff_sampling_param(self) -> Dict[str, Any]: """ if self.generation_config is None: # When generation_config is not set - return {} - config = self.try_get_generation_config() + config = {} + else: + config = self.try_get_generation_config() + + # Overriding with given generation config + config.update(self.override_generation_config) + available_params = [ "repetition_penalty", "temperature", "top_k", "top_p", "min_p", + "max_new_tokens", ] if any(p in config for p in available_params): diff_sampling_param = { p: config.get(p) for p in available_params if config.get(p) is not None } + # Huggingface definition of max_new_tokens is equivalent + # to vLLM's max_tokens + if "max_new_tokens" in diff_sampling_param: + diff_sampling_param["max_tokens"] = diff_sampling_param.pop( + "max_new_tokens") else: diff_sampling_param = {} return diff_sampling_param @@ -995,6 +1012,7 @@ def __init__( sliding_window: Optional[int] = None, enable_prefix_caching: bool = False, cpu_offload_gb: float = 0, + calculate_kv_scales: Optional[bool] = None, ) -> None: self.block_size = block_size self.gpu_memory_utilization = gpu_memory_utilization @@ -1005,7 +1023,7 @@ def __init__( self.sliding_window = sliding_window self.enable_prefix_caching = enable_prefix_caching self.cpu_offload_gb = cpu_offload_gb - + self.calculate_kv_scales = calculate_kv_scales self._verify_args() self._verify_cache_dtype() 
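The get_diff_sampling_param change earlier in this chunk merges user overrides into the model's generation_config and renames Hugging Face's max_new_tokens to vLLM's max_tokens. A condensed standalone sketch of that merge (function name invented for illustration):

def diff_sampling_params(generation_config: dict, overrides: dict) -> dict:
    config = dict(generation_config)
    config.update(overrides)                 # user-supplied overrides win
    keys = ["repetition_penalty", "temperature", "top_k", "top_p", "min_p",
            "max_new_tokens"]
    out = {k: config[k] for k in keys if config.get(k) is not None}
    # Hugging Face's max_new_tokens is what vLLM calls max_tokens.
    if "max_new_tokens" in out:
        out["max_tokens"] = out.pop("max_new_tokens")
    return out

assert diff_sampling_params({"temperature": 0.7, "max_new_tokens": 128},
                            {"top_p": 0.9}) == {"temperature": 0.7,
                                                "top_p": 0.9,
                                                "max_tokens": 128}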
self._verify_prefix_caching() @@ -1014,6 +1032,10 @@ def __init__( self.num_gpu_blocks: Optional[int] = None self.num_cpu_blocks: Optional[int] = None + # Set calculate_kv_scales to False if the value is unset. + if self.calculate_kv_scales is None: + self.calculate_kv_scales = False + def metrics_info(self): # convert cache_config to dict(key: str, value: str) for prometheus # metrics info @@ -1227,9 +1249,6 @@ class ParallelConfig: pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. tensor_parallel_size: int = 1 # Number of tensor parallel groups. - # Deprecated, use distributed_executor_backend instead. - worker_use_ray: Optional[bool] = None - # Maximum number of multiple batches # when load model sequentially. To avoid RAM OOM when using tensor # parallel and large models. @@ -1283,13 +1302,6 @@ def __post_init__(self) -> None: self.world_size = self.pipeline_parallel_size * \ self.tensor_parallel_size - if self.worker_use_ray: - if self.distributed_executor_backend is None: - self.distributed_executor_backend = "ray" - elif not self.use_ray: - raise ValueError(f"worker-use-ray can't be used with " - f"distributed executor backend " - f"'{self.distributed_executor_backend}'.") ray_only_devices = ["tpu"] from vllm.platforms import current_platform if (current_platform.device_type in ray_only_devices @@ -1710,7 +1722,8 @@ def maybe_create_spec_config( raise ValueError("Expect the batch size threshold of disabling " "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - + if (enable_chunked_prefill and speculative_model == "eagle"): + raise ValueError("Chunked prefill and EAGLE are not compatible.") # TODO: The user should be able to specify revision/max model len # for the draft model. It is not currently supported. draft_revision = None @@ -1777,12 +1790,6 @@ def maybe_create_spec_config( f"num_speculative_tokens={n_predict}, but " f"{num_speculative_tokens=} was provided.") - if enable_chunked_prefill and draft_hf_config.model_type in ( - "medusa", "mlp_speculator", "eagle"): - raise ValueError( - "Chunked prefill and hidden-state based draft models are " - "not compatible.") - speculative_draft_tensor_parallel_size = \ SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size( target_parallel_config, @@ -2006,8 +2013,8 @@ def _verify_args(self) -> None: "typical_acceptance_sampler.") if (self.draft_token_acceptance_method != 'rejection_sampler' - and self.draft_token_acceptance_method != - 'typical_acceptance_sampler'): + and self.draft_token_acceptance_method + != 'typical_acceptance_sampler'): raise ValueError( "Expected draft_token_acceptance_method to be either " "rejection_sampler or typical_acceptance_sampler. Instead it " @@ -2730,10 +2737,11 @@ class CompilationConfig(BaseModel): - use_inductor: whether to use inductor compilation. - False: inductor compilation is not used. graph runs in eager. - True: inductor compilation is used. one graph for symbolic shape - is compiled. In addition, compile for cudagraph sizes that are - in candidate_compile_sizes, using configurations - in inductor_compile_config. - - candidate_compile_sizes: sizes to compile for inductor. + is compiled. In addition, compile for compile_sizes, + using configurations in inductor_compile_config. + - compile_sizes: sizes to compile for inductor. In addition + to integers, it also supports "cudagraph_capture_sizes" to + specify the sizes for cudagraph capture. - inductor_compile_config: additional configurations for inductor. 
- None: use default configurations. - inductor_passes: additional passes for inductor. It is a dictionary @@ -2761,7 +2769,7 @@ class CompilationConfig(BaseModel): splitting_ops: List[str] = Field(default=None) # type: ignore use_inductor: bool = True - candidate_compile_sizes: Optional[List[int]] = Field(default=None) + compile_sizes: Optional[List[Union[int, str]]] = Field(default=None) inductor_compile_config: Dict = Field(default_factory=dict) inductor_passes: Dict[str, str] = Field(default_factory=dict) @@ -2809,9 +2817,8 @@ def model_post_init(self, __context: Any) -> None: pass_config: PassConfig = Field(default_factory=PassConfig) # not configurable, computed after init - compile_sizes: List[int] = PrivateAttr - capture_sizes: List[int] = PrivateAttr max_capture_size: int = PrivateAttr + local_cache_dir: str = PrivateAttr # local cache dir for each rank # optimization: # Intuitively, bs_to_padded_graph_size should be Dict[int, int]. # since we know all keys are in a range [0, max_capture_size], @@ -2889,17 +2896,8 @@ def model_post_init(self, __context: Any) -> None: "vllm.unified_attention_with_output", ] else: - # v0 can use full graph compilation without splitting, - # splitting is optional. - # right now we still need it. kv cache shape - # will be included in the graph if we don't split - # the graph. - # TODO: hide kv cache in static forward context - # so that inductor does not see it. - self.splitting_ops = [ - "vllm.unified_attention", - "vllm.unified_attention_with_output", - ] + # v0 uses full graph compilation + self.splitting_ops = [] for k, v in self.inductor_passes.items(): if not isinstance(v, str): @@ -2945,43 +2943,47 @@ def init_backend(self, vllm_config: "VllmConfig") -> Union[str, Callable]: from vllm.compilation.backends import VllmBackend return VllmBackend(vllm_config) - def init_with_cudagraph_sizes(self, sizes_to_specialize: List[int]): + def init_with_cudagraph_sizes(self, + cudagraph_capture_sizes: List[int]) -> None: """To complete the initialization of config, we need to know the cudagraph sizes.""" if self.cudagraph_capture_sizes is None: - self.capture_sizes = sizes_to_specialize + self.cudagraph_capture_sizes = cudagraph_capture_sizes else: - self.capture_sizes = self.cudagraph_capture_sizes + # de-duplicate the sizes provided by the config + self.cudagraph_capture_sizes = list( + set(self.cudagraph_capture_sizes)) logger.info(("cudagraph sizes specified by model runner" " %s is overridden by config %s"), - sizes_to_specialize, self.cudagraph_capture_sizes) - - if self.candidate_compile_sizes is None: - self.candidate_compile_sizes = [] - self.compile_sizes = [ - x for x in self.candidate_compile_sizes if x in self.capture_sizes - ] - ignored_sizes = [ - x for x in self.candidate_compile_sizes - if x not in self.capture_sizes - ] - if ignored_sizes: - logger.warning(("candidate_compile_sizes %s are ignored " - "because they are not cudagraph capture sizes."), - ignored_sizes) + cudagraph_capture_sizes, self.cudagraph_capture_sizes) + + computed_compile_sizes = [] + if self.compile_sizes is not None: + # de-duplicate the sizes provided by the config + self.compile_sizes = list(set(self.compile_sizes)) + for x in self.compile_sizes: + if isinstance(x, str): + assert x == "cudagraph_capture_sizes", \ + "Unrecognized size type in compile_sizes, " \ + f"expect 'cudagraph_capture_sizes', got {x}" + computed_compile_sizes.extend(self.cudagraph_capture_sizes) + else: + assert isinstance(x, int) + computed_compile_sizes.append(x) + self.compile_sizes = 
computed_compile_sizes # type: ignore # sort to make sure cudagraph capture sizes are in descending order - self.capture_sizes.sort(reverse=True) - self.max_capture_size = self.capture_sizes[ - 0] if self.capture_sizes else 0 + self.cudagraph_capture_sizes.sort(reverse=True) + self.max_capture_size = self.cudagraph_capture_sizes[ + 0] if self.cudagraph_capture_sizes else 0 # pre-compute the mapping from batch size to padded graph size self.bs_to_padded_graph_size = [ 0 for i in range(self.max_capture_size + 1) ] - for end, start in zip(self.capture_sizes, - self.capture_sizes[1:] + [0]): + for end, start in zip(self.cudagraph_capture_sizes, + self.cudagraph_capture_sizes[1:] + [0]): for bs in range(start, end): if bs == start: self.bs_to_padded_graph_size[bs] = start @@ -3252,14 +3254,14 @@ def _set_cudagraph_sizes(self): However, if users specify the cudagraph capture sizes through compilation config, we will use the specified sizes instead. - In the end, `vllm_config.compilation_config.capture_sizes` will be the - final sizes to capture cudagraph (in descending order). + In the end, `vllm_config.compilation_config.cudagraph_capture_sizes` + will be the final sizes to capture cudagraph (in descending order). During runtime, if batchsize is larger than - `vllm_config.compilation_config.capture_sizes`, + `vllm_config.compilation_config.cudagraph_capture_sizes`, no cudagraph will be used. If the batch size is no larger than - `vllm_config.compilation_config.capture_sizes`, + `vllm_config.compilation_config.cudagraph_capture_sizes`, we can quickly find the padded graph size for a given batch size by looking up `vllm_config.compilation_config.bs_to_padded_graph_size`. """ @@ -3321,7 +3323,6 @@ def __str__(self): f"quantization={self.model_config.quantization}, " f"enforce_eager={self.model_config.enforce_eager}, " f"kv_cache_dtype={self.cache_config.cache_dtype}, " - f"quantization_param_path={self.model_config.quantization_param_path}," f" device_config={self.device_config.device}, " f"decoding_config={self.decoding_config!r}, " f"observability_config={self.observability_config!r}, " @@ -3342,7 +3343,7 @@ def __str__(self): @contextmanager -def set_current_vllm_config(vllm_config: VllmConfig): +def set_current_vllm_config(vllm_config: VllmConfig, check_compile=False): """ Temporarily set the current VLLM config. Used during model initialization. 
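The `compile_sizes` handling in the init_with_cudagraph_sizes hunk above accepts plain integers as well as the sentinel string "cudagraph_capture_sizes", which expands to all cudagraph capture sizes. A minimal standalone sketch of that expansion logic, with simplified names (this is not the actual config class, just an illustration of the technique):

from typing import List, Union

def expand_compile_sizes(compile_sizes: List[Union[int, str]],
                         cudagraph_capture_sizes: List[int]) -> List[int]:
    """Sketch of how compile_sizes entries are resolved to concrete sizes."""
    expanded: List[int] = []
    for size in set(compile_sizes):  # de-duplicate, as the config does
        if isinstance(size, str):
            # Only the sentinel string is accepted.
            assert size == "cudagraph_capture_sizes", (
                f"expect 'cudagraph_capture_sizes', got {size}")
            expanded.extend(cudagraph_capture_sizes)
        else:
            expanded.append(size)
    return expanded

# Example: expand_compile_sizes([1, "cudagraph_capture_sizes"], [8, 16, 32])
# yields 1, 8, 16 and 32 (iteration order of the set may vary).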
@@ -3362,7 +3363,8 @@ def set_current_vllm_config(vllm_config: VllmConfig): vllm_config.compilation_config.enabled_custom_ops) logger.debug("disabled custom ops: %s", vllm_config.compilation_config.disabled_custom_ops) - if vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ + if check_compile and \ + vllm_config.compilation_config.level == CompilationLevel.PIECEWISE \ and compilation_counter.num_models_seen == num_models_seen: # If the model supports compilation, # compilation_counter.num_models_seen should be increased diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index c03b5932eafb6..115f663e4ad34 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -34,9 +34,10 @@ class RefCounter(RefCounterProtocol): def __init__(self, all_block_indices: Iterable[BlockId]): deduped = set(all_block_indices) - self._refcounts: Dict[BlockId, - RefCount] = {index: 0 - for index in deduped} + self._refcounts: Dict[BlockId, RefCount] = { + index: 0 + for index in deduped + } def incr(self, block_id: BlockId) -> RefCount: assert block_id in self._refcounts diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 3a57487a6cd8a..c3e1665b4464e 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -339,6 +339,13 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float: assert device in self._allocators return self._allocators[device].get_prefix_cache_hit_rate() + def reset_prefix_cache(self) -> bool: + """Reset prefix cache for all devices.""" + success = True + for allocator in self._allocators.values(): + success = success and allocator.reset_prefix_cache() + return success + def get_and_reset_swaps(self) -> List[Tuple[int, int]]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 985a1098b6cd1..cb432db919c73 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -192,6 +192,11 @@ def get_prefix_cache_hit_rate(self) -> float: """Prefix cache hit rate. -1 means not supported or disabled.""" pass + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset prefix cache.""" + pass + class NoFreeBlocksError(ValueError): pass @@ -297,6 +302,11 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float: """Prefix cache hit rate. 
-1 means not supported or disabled.""" pass + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset prefix cache.""" + pass + @abstractmethod def find_cached_blocks_prefix( self, diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 695870742da50..73acf377c546c 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,5 +1,5 @@ import heapq -from typing import FrozenSet, Iterable, List, Optional, Tuple +from typing import FrozenSet, Iterable, List, Optional, Tuple, Union from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter, get_all_blocks_recursively) @@ -138,16 +138,18 @@ def _allocate_block_id(self) -> BlockId: self._refcounter.incr(block_id) return block_id - def _free_block_id(self, block: Block) -> None: - block_id = block.block_id + def _free_block_id(self, block: Union[Block, BlockId]) -> None: + if isinstance(block, Block): + block_id = block.block_id + block.block_id = None + else: + block_id = block assert block_id is not None refcount = self._refcounter.decr(block_id) if refcount == 0: heapq.heappush(self._free_block_indices, block_id) - block.block_id = None - def free(self, block: Block, keep_block_object: bool = False) -> None: # Release the physical block id self._free_block_id(block) @@ -156,6 +158,9 @@ def free(self, block: Block, keep_block_object: bool = False) -> None: if not keep_block_object: self._block_pool.free_block(block) + def free_block_id(self, block_id: BlockId) -> None: + self._free_block_id(block_id) + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. @@ -327,6 +332,10 @@ def swap_in(self, blocks: List[Block]) -> None: def get_prefix_cache_hit_rate(self) -> float: return -1 + def reset_prefix_cache(self) -> bool: + """No prefix cache for naive block allocator.""" + return True + def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]: # Not applicable for naive block allocator. return [] diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 1238303234deb..ccdc5daa9595c 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -12,6 +12,7 @@ from vllm.core.block.naive_block import (BlockPool, NaiveBlock, NaiveBlockAllocator) from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor +from vllm.logger import init_logger from vllm.sequence import Sequence PrefixHash = int @@ -21,6 +22,8 @@ # then we know this block hasn't been accessed yet. _DEFAULT_LAST_ACCESSED_TIME = -1 +logger = init_logger(__name__) + class BlockTracker: """Used to track the status of a block inside the prefix caching allocator @@ -105,7 +108,8 @@ def __init__( # Evitor used to maintain how we want to handle those computed blocks # if we find memory pressure is high. - self.evictor: Evictor = make_evictor(eviction_policy) + self.eviction_policy = eviction_policy + self.evictor: Evictor = make_evictor(self.eviction_policy) # We share the refcounter between allocators. This allows us to promote # blocks originally allocated in the hashless allocator to immutable @@ -428,6 +432,44 @@ def all_block_ids(self) -> FrozenSet[int]: def get_prefix_cache_hit_rate(self) -> float: return self.metric_data.get_hit_rate() + def reset_prefix_cache(self) -> bool: + """Reset prefix cache. 
This function may be used in RLHF + flows to invalidate prefix caching after the weights are updated, + or used for resetting prefix caching status for benchmarking. + + Returns: + bool: True if the prefix cache is successfully reset, + False otherwise. + """ + num_used_blocks = (self.get_num_total_blocks() - + self.get_num_free_blocks()) + if num_used_blocks > 0: + logger.warning( + "Failed to reset prefix cache because some " + "blocks (%d) are not freed yet", num_used_blocks) + return False + + # Free all blocks in the evictor. + while (block_id := + self._maybe_allocate_evicted_block_id()) is not None: + self._hashless_allocator.free_block_id(block_id) + + # Should not have any cached blocks because all blocks are evicted. + assert not self._cached_blocks + + # Reset the evictor. + self.evictor = make_evictor(self.eviction_policy) + + # Reset the block tracker. + for block_id in self._block_tracker: + self._block_tracker[block_id] = BlockTracker() + + # Reset the metrics. + self.metric_data = CacheMetricData() + + logger.info("Successfully reset prefix cache") + return True + def is_block_cached(self, block: Block) -> bool: assert block.content_hash is not None return block.content_hash in self._cached_blocks diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index b41e848221882..2d6a132ed555b 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -136,8 +136,8 @@ def can_allocate(self, device=Device.GPU) # Use watermark to avoid frequent cache eviction. - if (self.num_total_gpu_blocks - num_required_blocks < - self.watermark_blocks): + if (self.num_total_gpu_blocks - num_required_blocks + < self.watermark_blocks): return AllocStatus.NEVER if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks: return AllocStatus.OK @@ -455,6 +455,9 @@ def get_num_free_cpu_blocks(self) -> int: def get_prefix_cache_hit_rate(self, device: Device) -> float: return self.block_allocator.get_prefix_cache_hit_rate(device) + def reset_prefix_cache(self) -> bool: + return self.block_allocator.reset_prefix_cache() + def _can_swap(self, seq_group: SequenceGroup, device: Device, diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index b10b8d3f4a5bf..9c7e246e3c4ed 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -122,6 +122,11 @@ def get_prefix_cache_hit_rate(self, device: Device) -> float: """Prefix cache hit rate. 
-1 means not supported or disabled.""" pass + @abstractmethod + def reset_prefix_cache(self) -> bool: + """Reset prefix cache for all devices.""" + pass + @abstractmethod def get_num_cached_tokens(self, seq: Sequence) -> int: pass diff --git a/vllm/core/placeholder_block_space_manager.py b/vllm/core/placeholder_block_space_manager.py index a47e594518534..f9924be4a3835 100644 --- a/vllm/core/placeholder_block_space_manager.py +++ b/vllm/core/placeholder_block_space_manager.py @@ -90,5 +90,8 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup, def get_prefix_cache_hit_rate(self, device: Device) -> float: return -1 + def reset_prefix_cache(self) -> bool: + return True + def get_num_cached_tokens(self, seq: Sequence) -> int: return 0 diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 200098e3828da..4facdaf0f894b 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -593,6 +593,9 @@ def has_unfinished_seqs(self) -> bool: def get_prefix_cache_hit_rate(self, device: Device) -> float: return self.block_manager.get_prefix_cache_hit_rate(device) + def reset_prefix_cache(self) -> bool: + return self.block_manager.reset_prefix_cache() + def get_num_unfinished_seq_groups(self) -> int: return len(self.waiting) + len(self.running) + len(self.swapped) @@ -1074,8 +1077,8 @@ def _schedule_prefills( waiting_queue.popleft() continue - if (budget.num_batched_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (budget.num_batched_tokens + >= self.scheduler_config.max_num_batched_tokens): # We've reached the budget limit - since there might be # continuous prefills in the running queue, we should break # to avoid scheduling any new prefills. @@ -1202,8 +1205,8 @@ def _schedule_default(self) -> SchedulerOutputs: running_scheduled.swapped_out) == 0: swapped_in = self._schedule_swapped(budget, curr_loras) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1295,8 +1298,8 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: curr_loras, enable_chunking=True) - assert (budget.num_batched_tokens <= - self.scheduler_config.max_num_batched_tokens) + assert (budget.num_batched_tokens + <= self.scheduler_config.max_num_batched_tokens) assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs # Update waiting requests. @@ -1464,8 +1467,8 @@ def schedule( # NOTE: We use get_len instead of get_prompt_len because when # a sequence is preempted, prefill includes previous generated # output tokens. 
- if (token_chunk_size + num_computed_tokens < - seqs[0].data.get_len()): + if (token_chunk_size + num_computed_tokens + < seqs[0].data.get_len()): do_sample = False # It assumes the scheduled_seq_groups is ordered by @@ -1731,10 +1734,9 @@ def _passed_delay(self, now: float) -> bool: if self.scheduler_config.delay_factor > 0 and self.waiting: earliest_arrival_time = min( [e.metrics.arrival_time for e in self.waiting]) - passed_delay = ( - (now - earliest_arrival_time) > - (self.scheduler_config.delay_factor * self.last_prompt_latency) - or not self.running) + passed_delay = ((now - earliest_arrival_time) + > (self.scheduler_config.delay_factor * + self.last_prompt_latency) or not self.running) else: passed_delay = True return passed_delay diff --git a/vllm/device_allocator/__init__.py b/vllm/device_allocator/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py new file mode 100644 index 0000000000000..a43418dbb3b46 --- /dev/null +++ b/vllm/device_allocator/cumem.py @@ -0,0 +1,254 @@ +# cumem-based pytorch pluggable allocator to implement sleep mode. +# other approaches tried but failed: +# - cuda-python package binding +# - custom libcuda driver ctypes wrapper +# both of them failed because of cuda context mismatch. +# not sure why, they are created from a different context. +# the only successful approach is to call cuda driver API in C. +import dataclasses +from contextlib import contextmanager +from typing import Callable, Dict, Optional, Tuple, Union + +import torch + +from vllm.utils import is_pin_memory_available + + +def find_loaded_library(lib_name) -> Optional[str]: + """ + According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html, + the file `/proc/self/maps` contains the memory maps of the process, which includes the + shared libraries loaded by the process. We can use this file to find the path of + a loaded library. 
+ """ # noqa + found_line = None + with open("/proc/self/maps") as f: + for line in f: + if lib_name in line: + found_line = line + break + if found_line is None: + # the library is not loaded in the current process + return None + # if lib_name is libcudart, we need to match a line with: + # address /path/to/libcudart-hash.so.11.0 + start = found_line.index("/") + path = found_line[start:].strip() + filename = path.split("/")[-1] + assert filename.rpartition(".so")[0].startswith(lib_name), \ + f"Unexpected filename: {filename} for library {lib_name}" + return path + + +cumem_available = False +try: + from vllm.cumem_allocator import (init_module, python_create_and_map, + python_unmap_and_release) + from vllm.distributed.device_communicators.cuda_wrapper import ( + CudaRTLibrary) + lib_name = find_loaded_library("cumem_allocator") + libcudart = CudaRTLibrary() + cumem_available = True +except ModuleNotFoundError: + # rocm platform does not support cumem allocator + init_module = None + python_create_and_map = None + python_unmap_and_release = None + CudaRTLibrary = None + lib_name = None + libcudart = None + +# py_device, py_alignedSize, py_d_mem, py_p_memHandle +HandleType = Tuple[int, int, int, int] + + +@dataclasses.dataclass +class AllocationData: + handle: HandleType + tag: str + cpu_backup_tensor: Optional[torch.Tensor] = None + + +def create_and_map(allocation_handle: HandleType) -> None: + python_create_and_map(*allocation_handle) + + +def unmap_and_release(allocation_handle: HandleType) -> None: + python_unmap_and_release(*allocation_handle) + + +def get_pluggable_allocator( + python_malloc_fn: Callable[[int], + int], python_free_func: Callable[[int, int], + None] +) -> torch.cuda.memory.CUDAPluggableAllocator: + init_module(python_malloc_fn, python_free_func) + new_alloc = torch.cuda.memory.CUDAPluggableAllocator( + lib_name, 'my_malloc', 'my_free') + return new_alloc + + +@contextmanager +def use_memory_pool_with_allocator( + python_malloc_fn: Callable[[int], int], + python_free_func: Callable[[int, int], None]) -> None: + new_alloc = get_pluggable_allocator(python_malloc_fn, python_free_func) + mem_pool = torch.cuda.memory.MemPool(new_alloc._allocator) + with torch.cuda.memory.use_mem_pool(mem_pool): + yield mem_pool + + +class CuMemAllocator: + """ + A singleton class that manages a memory pool for CUDA tensors. + The memory in this pool can be offloaded or discarded when the + allocator sleeps. + + Inside the `use_memory_pool(tag)` context, all tensors created will + be allocated in the memory pool, and has the same tag as the + tag passed to the context. + + When we call `sleep`, all tensors with the specified tag will be + offloaded to CPU memory, and the rest of the tensors will be discarded. + When we call `wake_up`, all tensors that are previously offloaded + will be loaded back to GPU memory, and the rest of the tensors will + have empty memory. + + Why it needs to be a singleton? + When allocated tensors are garbage collected, PyTorch will call + the free callback, which will call the `python_free_callback` method. + The C-extension uses a global variable to store the function of an + instance of this class. If we create multiple instances of this class, + the global variable will be overwritten and the free callback will + not work as expected. + """ + instance: "CuMemAllocator" = None + default_tag: str = "default" + + @staticmethod + def get_instance() -> "CuMemAllocator": + """ + CuMemAllocator is a singleton class. + We cannot call the constructor directly. 
+ Call this method to get the instance. + """ + assert cumem_available, "cumem allocator is not available" + if CuMemAllocator.instance is None: + CuMemAllocator.instance = CuMemAllocator() + return CuMemAllocator.instance + + def __init__(self): + self.pointer_to_data: Dict[int, AllocationData] = {} + self.current_tag: str = CuMemAllocator.default_tag + + def python_malloc_callback(self, allocation_handle: HandleType) -> None: + """ + Internal method to store the allocation data + when memory is allocated in the memory pool.""" + py_d_mem = allocation_handle[2] + self.pointer_to_data[py_d_mem] = AllocationData( + allocation_handle, self.current_tag) + return + + def python_free_callback(self, ptr: int) -> HandleType: + """ + Internal method to look up the allocation data + when memory is freed in the memory pool.""" + data = self.pointer_to_data.pop(ptr) + if data.cpu_backup_tensor is not None: + data.cpu_backup_tensor = None + return data.handle + + def sleep( + self, + offload_tags: Optional[Union[Tuple[str, ...], + str]] = None) -> None: + """ + Put the allocator in sleep mode. + All data in the memory allocation with the specified tag will be + offloaded to CPU memory, and others will be discarded. + + :param offload_tags: The tags of the memory allocation that will be + offloaded. The rest of the memory allocation will be discarded. + """ + if offload_tags is None: + # by default, allocated tensors are offloaded + # when the allocator sleeps + offload_tags = (CuMemAllocator.default_tag, ) + elif isinstance(offload_tags, str): + offload_tags = (offload_tags, ) + + assert isinstance(offload_tags, tuple) + + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + if data.tag in offload_tags: + size_in_bytes = handle[1] + cpu_backup_tensor = torch.empty( + size_in_bytes, + dtype=torch.uint8, + device='cpu', + pin_memory=is_pin_memory_available()) + cpu_ptr = cpu_backup_tensor.data_ptr() + libcudart.cudaMemcpy(cpu_ptr, ptr, size_in_bytes) + data.cpu_backup_tensor = cpu_backup_tensor + unmap_and_release(handle) + + def wake_up(self): + """ + Wake up the allocator from sleep mode. + All data that is previously offloaded will be loaded back to GPU + memory, and the rest of the data will have empty memory.""" + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + create_and_map(handle) + if data.cpu_backup_tensor is not None: + cpu_backup_tensor = data.cpu_backup_tensor + if cpu_backup_tensor is not None: + size_in_bytes = cpu_backup_tensor.numel( + ) * cpu_backup_tensor.element_size() + cpu_ptr = cpu_backup_tensor.data_ptr() + libcudart.cudaMemcpy(ptr, cpu_ptr, size_in_bytes) + data.cpu_backup_tensor = None + + @contextmanager + def use_memory_pool(self, tag: Optional[str] = None): + """ + A context manager to use the memory pool. + All memory allocation created inside the context will be allocated + in the memory pool, and has the specified tag. + + :param tag: The tag of the memory allocation. If None, the default tag + will be used. + """ + if tag is None: + tag = CuMemAllocator.default_tag + + assert isinstance(tag, str) + + old_tag = self.current_tag + self.current_tag = tag + with use_memory_pool_with_allocator(self.python_malloc_callback, + self.python_free_callback): + yield + # PyTorch's bug, calling torch.cuda.empty_cache() will error + # when using pluggable allocator, see + # https://github.com/pytorch/pytorch/issues/145168 . + # if we have some memory allocated and then freed, + # the memory will not be released. 
+ # right now it is fine, because we only use this allocator + # during weight loading and kv cache creation, where we only + # allocate memory. + # TODO: we need to find a way to release the memory, + # i.e. calling torch.cuda.empty_cache() + self.current_tag = old_tag + + def get_current_usage(self) -> int: + """ + Get the total number of bytes allocated in the memory pool. + """ + sum_bytes: int = 0 + for ptr, data in self.pointer_to_data.items(): + handle = data.handle + sum_bytes += handle[1] + return sum_bytes diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 4ced991f62f66..268edc0925fe8 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -352,8 +352,8 @@ def acquire_write(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. ", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 @@ -410,8 +410,8 @@ def acquire_read(self, timeout: Optional[float] = None): sched_yield() # if we wait for a long time, log a message - if (time.monotonic() - start_time > - VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): + if (time.monotonic() - start_time + > VLLM_RINGBUFFER_WARNING_INTERVAL * n_warning): logger.debug("No available block found in %s second. ", VLLM_RINGBUFFER_WARNING_INTERVAL) n_warning += 1 diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py index 4ace03ff1184e..7780e2dfa317d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py @@ -35,6 +35,7 @@ def __init__( ): self.config = config.kv_transfer_config + self.tp_size = config.parallel_config.tensor_parallel_size if self.config.kv_connector == "PyNcclConnector": from vllm.distributed.kv_transfer.kv_pipe.pynccl_pipe import ( @@ -161,7 +162,7 @@ def send_kv_caches_and_hidden_states( end_layer = model_executable.model.end_layer model_config = model_executable.model.config - num_heads = model_config.num_key_value_heads + num_heads = int(model_config.num_key_value_heads / self.tp_size) hidden_size = model_config.hidden_size num_attention_heads = model_config.num_attention_heads head_size = int(hidden_size / num_attention_heads) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index bf8b30cccd5f6..7fe9b68d4b9e8 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1014,8 +1014,8 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) - if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size): + if (world_size + != tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " @@ -1069,8 +1069,8 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: return if all([ - vllm_config.kv_transfer_config.need_kv_parallel_group, - _KV_TRANSFER is None + vllm_config.kv_transfer_config.need_kv_parallel_group, _KV_TRANSFER + is None ]): _KV_TRANSFER = 
kv_transfer.KVTransferAgent( rank=get_world_group().rank, @@ -1183,6 +1183,11 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): from vllm.platforms import current_platform if not current_platform.is_cpu(): torch.cuda.empty_cache() + try: + torch._C._host_emptyCache() + except AttributeError: + logger.warning( + "torch._C._host_emptyCache() only available in Pytorch >=2.5") def in_the_same_node_as(pg: Union[ProcessGroup, StatelessProcessGroup], diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 58a8fdc07af19..59ba6cfd925e3 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -100,10 +100,8 @@ class EngineArgs: config_format: ConfigFormat = ConfigFormat.AUTO dtype: str = 'auto' kv_cache_dtype: str = 'auto' - quantization_param_path: Optional[str] = None seed: int = 0 max_model_len: Optional[int] = None - worker_use_ray: bool = False # Note: Specifying a custom executor backend by passing a class # is intended for expert use only. The API may change without # notice. @@ -202,6 +200,10 @@ class EngineArgs: kv_transfer_config: Optional[KVTransferConfig] = None generation_config: Optional[str] = None + override_generation_config: Optional[Dict[str, Any]] = None + enable_sleep_mode: bool = False + + calculate_kv_scales: Optional[bool] = None def __post_init__(self): if not self.tokenizer: @@ -408,12 +410,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'or equal to the number of GPUs available, "mp" will be used to ' 'keep processing on a single host. Otherwise, this will default ' 'to "ray" if Ray is installed and fail otherwise. Note that tpu ' - 'only support Ray for distributed inference.') + 'only supports Ray for distributed inference.') - parser.add_argument( - '--worker-use-ray', - action='store_true', - help='Deprecated, use ``--distributed-executor-backend=ray``.') parser.add_argument('--pipeline-parallel-size', '-pp', type=int, @@ -468,7 +466,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--use-padding-aware-scheduling', default=EngineArgs.use_padding_aware_scheduling, - action='store_true', + action=StoreBoolean, + nargs="?", + const="True", help=('Use padding-aware scheduling. If True, the scheduler ' 'will consider padded tokens in prefill. ' 'By default this is set to False on non-HPU devices. ')) @@ -984,10 +984,38 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=nullable_str, default=None, help="The folder path to the generation config. " - "Defaults to None, will use the default generation config in vLLM. " - "If set to 'auto', the generation config will be automatically " - "loaded from model. If set to a folder path, the generation config " - "will be loaded from the specified folder path.") + "Defaults to None, no generation config is loaded, vLLM defaults " + "will be used. If set to 'auto', the generation config will be " + "loaded from model path. If set to a folder path, the generation " + "config will be loaded from the specified folder path. If " + "`max_new_tokens` is specified in generation config, then " + "it sets a server-wide limit on the number of output tokens " + "for all requests.") + + parser.add_argument( + "--override-generation-config", + type=json.loads, + default=None, + help="Overrides or sets generation config in JSON format. " + "e.g. ``{\"temperature\": 0.5}``. 
If used with " + "--generation-config=auto, the override parameters will be merged " + "with the default config from the model. If generation-config is " + "None, only the override parameters are used.") + + parser.add_argument("--enable-sleep-mode", + action="store_true", + default=False, + help="Enable sleep mode for the engine. " + "(only cuda platform is supported)") + + parser.add_argument( + '--calculate-kv-scales', + action='store_true', + help='This enables dynamic calculation of ' + 'k_scale and v_scale when kv-cache-dtype is fp8. ' + 'If calculate-kv-scales is false, the scales will ' + 'be loaded from the model checkpoint if available. ' + 'Otherwise, the scales will default to 1.0.') return parser @@ -1018,7 +1046,6 @@ def create_model_config(self) -> ModelConfig: tokenizer_revision=self.tokenizer_revision, max_model_len=self.max_model_len, quantization=self.quantization, - quantization_param_path=self.quantization_param_path, enforce_eager=self.enforce_eager, max_seq_len_to_capture=self.max_seq_len_to_capture, max_logprobs=self.max_logprobs, @@ -1033,7 +1060,10 @@ def create_model_config(self) -> ModelConfig: override_neuron_config=self.override_neuron_config, override_pooler_config=self.override_pooler_config, logits_processor_pattern=self.logits_processor_pattern, - generation_config=self.generation_config) + generation_config=self.generation_config, + override_generation_config=self.override_generation_config, + enable_sleep_mode=self.enable_sleep_mode, + ) def create_load_config(self) -> LoadConfig: return LoadConfig( @@ -1094,11 +1124,11 @@ def create_engine_config(self, sliding_window=model_config.get_sliding_window(), enable_prefix_caching=self.enable_prefix_caching, cpu_offload_gb=self.cpu_offload_gb, + calculate_kv_scales=self.calculate_kv_scales, ) parallel_config = ParallelConfig( pipeline_parallel_size=self.pipeline_parallel_size, tensor_parallel_size=self.tensor_parallel_size, - worker_use_ray=self.worker_use_ray, max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, tokenizer_pool_config=TokenizerPoolConfig.create_config( @@ -1309,11 +1339,22 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None: self.enable_chunked_prefill = True # When no user override, set the default values based on the usage # context. - # TODO(woosuk): Tune the default values for different hardware. - default_max_num_batched_tokens = { - UsageContext.LLM_CLASS: 8192, - UsageContext.OPENAI_API_SERVER: 2048, - } + # Use different default values for different hardware. + from vllm.platforms import current_platform + device_name = current_platform.get_device_name().lower() + if "h100" in device_name or "h200" in device_name: + # For H100 and H200, we use larger default values. + default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 16384, + UsageContext.OPENAI_API_SERVER: 8192, + } + else: + # TODO(woosuk): Tune the default values for other hardware. 
+ default_max_num_batched_tokens = { + UsageContext.LLM_CLASS: 8192, + UsageContext.OPENAI_API_SERVER: 2048, + } + if (self.max_num_batched_tokens is None and usage_context in default_max_num_batched_tokens): self.max_num_batched_tokens = default_max_num_batched_tokens[ diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 08fef8250d483..739ea06ae3818 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -1182,6 +1182,9 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: self.engine.stop_profile() + async def reset_prefix_cache(self) -> None: + self.engine.reset_prefix_cache() + async def add_lora(self, lora_request: LoRARequest) -> None: self.engine.add_lora(lora_request) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 88c21f9a6d31b..dd677300fc66a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -5,10 +5,10 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, - Iterable, List, Mapping, NamedTuple, Optional) +from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable, + List, Mapping, NamedTuple, Optional) from typing import Sequence as GenericSequence -from typing import Set, Tuple, Type, Union, cast, overload +from typing import Set, Type, Union, cast, overload import torch from typing_extensions import TypeVar, deprecated @@ -230,7 +230,7 @@ def __init__( ) logger.info( - "Initializing an LLM engine (v%s) with config: %s, " + "Initializing a V0 LLM engine (v%s) with config: %s, " "use_cached_outputs=%s, ", VLLM_VERSION, vllm_config, @@ -689,7 +689,9 @@ def add_request( :class:`~vllm.PoolingParams` for pooling. arrival_time: The arrival time of the request. If None, we use the current monotonic time. + lora_request: The LoRA request to add. trace_headers: OpenTelemetry trace headers. + prompt_adapter_request: The prompt adapter request to add. priority: The priority of the request. Only applicable with priority scheduling. @@ -912,6 +914,14 @@ def has_unfinished_requests_for_virtual_engine( """ return self.scheduler[virtual_engine].has_unfinished_seqs() + def reset_prefix_cache(self) -> bool: + """Reset prefix cache for all devices.""" + + success = True + for scheduler in self.scheduler: + success = success and scheduler.reset_prefix_cache() + return success + @staticmethod def _process_sequence_group_outputs( seq_group: SequenceGroup, @@ -1000,8 +1010,23 @@ def _process_model_outputs(self, self.speculative_config # Organize outputs by [step][sequence group] instead of # [sequence group][step]. - outputs_by_sequence_group = create_output_by_sequence_group( - outputs, num_seq_groups=len(seq_group_metadata_list)) + if self.scheduler_config.is_multi_step: + outputs_by_sequence_group = create_output_by_sequence_group( + outputs, len(seq_group_metadata_list)) + elif self.speculative_config: + # Decodes are multi-steps while prefills are not, outputting at + # most 1 token. Separate them so that we can trigger chunk + # processing without having to pad or copy over prompts K times + # to match decodes structure (costly with prompt_logprobs). 
+ num_prefills = sum(sg.is_prompt + for sg in seq_group_metadata_list) + prefills, decodes = outputs[:num_prefills], outputs[ + num_prefills:] + outputs_by_sequence_group = create_output_by_sequence_group( + decodes, + num_seq_groups=len(seq_group_metadata_list) - num_prefills) + outputs_by_sequence_group = [p.outputs for p in prefills + ] + outputs_by_sequence_group # We have outputs for multiple steps submitted in a single burst, # so invalidate is_first_step_output. is_first_step_output = None @@ -1816,16 +1841,15 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.model_executor.stop_profile() - def collective_rpc(self, - method: Union[str, Callable], - timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: - """ - See LLM.collective_rpc for more details. - """ - return self.model_executor.collective_rpc(method, timeout, args, - kwargs) + def sleep(self, level: int = 1) -> None: + assert self.vllm_config.model_config.enable_sleep_mode, ( + "Sleep mode is not enabled in the model config") + self.model_executor.sleep(level=level) + + def wake_up(self) -> None: + assert self.vllm_config.model_config.enable_sleep_mode, ( + "Sleep mode is not enabled in the model config") + self.model_executor.wake_up() def check_health(self) -> None: if self.tokenizer: @@ -1866,46 +1890,44 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: metrics = seq_group.metrics ttft = metrics.first_token_time - metrics.arrival_time e2e_time = metrics.finished_time - metrics.arrival_time - # attribute names are based on - # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/llm-spans.md - seq_span.set_attribute(SpanAttributes.LLM_RESPONSE_MODEL, + seq_span.set_attribute(SpanAttributes.GEN_AI_RESPONSE_MODEL, self.model_config.model) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_ID, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, seq_group.request_id) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TEMPERATURE, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TEMPERATURE, seq_group.sampling_params.temperature) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_TOP_P, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_TOP_P, seq_group.sampling_params.top_p) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_MAX_TOKENS, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS, seq_group.sampling_params.max_tokens) - seq_span.set_attribute(SpanAttributes.LLM_REQUEST_N, + seq_span.set_attribute(SpanAttributes.GEN_AI_REQUEST_N, seq_group.sampling_params.n) - seq_span.set_attribute(SpanAttributes.LLM_USAGE_NUM_SEQUENCES, + seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_NUM_SEQUENCES, seq_group.num_seqs()) - seq_span.set_attribute(SpanAttributes.LLM_USAGE_PROMPT_TOKENS, + seq_span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, len(seq_group.prompt_token_ids)) seq_span.set_attribute( - SpanAttributes.LLM_USAGE_COMPLETION_TOKENS, + SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS, sum([ seq.get_output_len() for seq in seq_group.get_finished_seqs() ])) - seq_span.set_attribute(SpanAttributes.LLM_LATENCY_TIME_IN_QUEUE, + seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE, metrics.time_in_queue) seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_TO_FIRST_TOKEN, ttft) - seq_span.set_attribute(SpanAttributes.LLM_LATENCY_E2E, e2e_time) + SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN, ttft) + seq_span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, 
e2e_time) if metrics.scheduler_time is not None: seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_IN_SCHEDULER, + SpanAttributes.GEN_AI_LATENCY_TIME_IN_SCHEDULER, metrics.scheduler_time) if metrics.model_forward_time is not None: seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_FORWARD, + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD, metrics.model_forward_time / 1000.0) if metrics.model_execute_time is not None: seq_span.set_attribute( - SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE, + SpanAttributes.GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE, metrics.model_execute_time) def _validate_model_inputs(self, inputs: ProcessorInputs, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index c8aec8dd3afa3..b771c190dd82a 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -120,7 +120,8 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): labelnames=labelnames) buckets = [1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096] if not vllm_config.model_config.enforce_eager: - buckets = vllm_config.compilation_config.capture_sizes.copy() + buckets = vllm_config.compilation_config.\ + cudagraph_capture_sizes.copy() buckets.sort() self.histogram_iteration_tokens = self._histogram_cls( name="vllm:iteration_tokens_total", @@ -258,21 +259,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): documentation="Number of emitted tokens.", labelnames=labelnames)) - # Deprecated in favor of vllm:prompt_tokens_total - self.gauge_avg_prompt_throughput = self._gauge_cls( - name="vllm:avg_prompt_throughput_toks_per_s", - documentation="Average prefill throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # Deprecated in favor of vllm:generation_tokens_total - self.gauge_avg_generation_throughput = self._gauge_cls( - name="vllm:avg_generation_throughput_toks_per_s", - documentation="Average generation throughput in tokens/s.", - labelnames=labelnames, - multiprocess_mode="sum", - ) - # end-metrics-definitions @@ -634,20 +620,6 @@ def _log_prometheus(self, stats: Stats) -> None: self._log_histogram(self.metrics.histogram_max_tokens_request, stats.max_tokens_requests) - def _log_prometheus_interval(self, prompt_throughput: float, - generation_throughput: float) -> None: - # Logs metrics to prometheus that are computed every logging_interval. - # Support legacy gauge metrics that make throughput calculations on - # the vLLM side. Moving forward, we should use counters like - # counter_prompt_tokens, counter_generation_tokens - # Which log raw data and calculate summaries using rate() on the - # grafana/prometheus side. See - # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666 - self.metrics.gauge_avg_prompt_throughput.labels( - **self.labels).set(prompt_throughput) - self.metrics.gauge_avg_generation_throughput.labels( - **self.labels).set(generation_throughput) - def log(self, stats: Stats): """Logs to prometheus and tracked stats every iteration.""" # Log to prometheus. @@ -663,20 +635,6 @@ def log(self, stats: Stats): # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): - # Compute summary metrics for tracked stats (and log them - # to promethus if applicable). 
- prompt_throughput = get_throughput(self.num_prompt_tokens, - now=stats.now, - last_log=self.last_local_log) - generation_throughput = get_throughput( - self.num_generation_tokens, - now=stats.now, - last_log=self.last_local_log) - - self._log_prometheus_interval( - prompt_throughput=prompt_throughput, - generation_throughput=generation_throughput) - if self.spec_decode_metrics is not None: self._log_gauge( self.metrics.gauge_spec_decode_draft_acceptance_rate, diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 7132f9840001a..d9703b820a779 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -121,6 +121,10 @@ class RPCUProfileRequest(Enum): STOP_PROFILE = 2 +class RPCResetPrefixCacheRequest(Enum): + RESET_PREFIX_CACHE = 1 + + @dataclass class RPCLoadAdapterRequest: lora_request: LoRARequest @@ -134,7 +138,8 @@ class RPCAdapterLoadedResponse: RPC_REQUEST_T = Union[RPCProcessRequest, RPCAbortRequest, RPCStartupRequest, - RPCUProfileRequest, RPCLoadAdapterRequest] + RPCUProfileRequest, RPCLoadAdapterRequest, + RPCResetPrefixCacheRequest] REQUEST_OUTPUTS_T = Union[List[RequestOutput], RPCAdapterLoadedResponse, RPCError] diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index a9ab899535180..5237f63c34c01 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -27,8 +27,9 @@ VLLM_RPC_SUCCESS_STR, RPCAbortRequest, RPCAdapterLoadedResponse, RPCError, RPCLoadAdapterRequest, - RPCProcessRequest, RPCStartupRequest, - RPCStartupResponse, + RPCProcessRequest, + RPCResetPrefixCacheRequest, + RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) from vllm.engine.protocol import EngineClient # yapf: enable @@ -262,7 +263,14 @@ async def setup(self): """Setup the client before it starts sending server requests.""" # Start output_loop - self.output_loop = asyncio.create_task(self.run_output_handler_loop()) + if self.output_loop is None: + # only generate once to avoid multiple concurrent output_loops + # this will lead to race conditions and wrong orders of tokens + # returned by the engine + # setup will be called multiple times during the startup of + # the engine + self.output_loop = asyncio.create_task( + self.run_output_handler_loop()) with self.get_data_socket() as socket: # Wait until server is ready. @@ -271,8 +279,9 @@ async def setup(self): self.tracing_flag = response.tracing_enabled # Start health_loop. 
- self.health_loop = asyncio.create_task( - self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT)) + if self.health_loop is None: + self.health_loop = asyncio.create_task( + self.run_heartbeat_loop(timeout=VLLM_RPC_TIMEOUT)) def close(self): """Destroy the ZeroMQ Context.""" @@ -667,6 +676,13 @@ async def stop_profile(self) -> None: await self._send_one_way_rpc_request( request=RPCUProfileRequest.STOP_PROFILE, socket=self.input_socket) + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache""" + + await self._send_one_way_rpc_request( + request=RPCResetPrefixCacheRequest.RESET_PREFIX_CACHE, + socket=self.input_socket) + async def add_lora(self, lora_request: LoRARequest) -> None: """Load a new LoRA adapter into the engine for future requests.""" # Uses the same I/O as generate requests diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 3aa9d30549f36..166f89743b3cd 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -16,8 +16,9 @@ VLLM_RPC_SUCCESS_STR, RPCAbortRequest, RPCAdapterLoadedResponse, RPCError, RPCLoadAdapterRequest, - RPCProcessRequest, RPCStartupRequest, - RPCStartupResponse, + RPCProcessRequest, + RPCResetPrefixCacheRequest, + RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) # yapf: enable from vllm.logger import init_logger @@ -237,6 +238,8 @@ def handle_new_input(self): self.stop_profile() elif isinstance(request, RPCLoadAdapterRequest): self._handle_load_adapter_request(request) + elif isinstance(request, RPCResetPrefixCacheRequest): + self.reset_prefix_cache() else: raise ValueError("Unknown RPCRequest Type: " f"{type(request)}") @@ -361,6 +364,9 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.engine.stop_profile() + def reset_prefix_cache(self) -> bool: + return self.engine.reset_prefix_cache() + def signal_handler(*_) -> None: raise KeyboardInterrupt("MQLLMEngine terminated") diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index c8b282b1a7676..99c2baf3f4df4 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -144,7 +144,7 @@ def process_outputs(self, def _process_decode_and_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None: new_char_count = 0 - if sampling_params.detokenize: + if sampling_params.detokenize and self.detokenizer: new_char_count = self.detokenizer.decode_sequence_inplace( seq, sampling_params) diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py index 9e4011896075d..fbc60958df8a9 100644 --- a/vllm/engine/output_processor/single_step.py +++ b/vllm/engine/output_processor/single_step.py @@ -102,9 +102,9 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, Args: seq_group: the output is associated with this :class:`SequenceGroup` - output: the :class:`SequenceGroupOutput` for a single scheduler step + outputs: the :class:`SequenceGroupOutput` for a single scheduler step """ - assert len(outputs) == 1, ("Single step should only has 1 output.") + assert len(outputs) == 1, "Single step should only have 1 output." 
output = outputs[0] assert isinstance(output, CompletionSequenceGroupOutput) single_step_process_prompt_logprob(self, seq_group, output) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index f05ff62c4766b..de7b2c1b91f50 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -271,6 +271,11 @@ async def stop_profile(self) -> None: """Start profiling the engine""" ... + @abstractmethod + async def reset_prefix_cache(self) -> None: + """Reset the prefix cache""" + ... + @abstractmethod async def add_lora(self, lora_request: LoRARequest) -> None: """Load a new LoRA adapter into the engine for future requests.""" diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index beedf5d16ab86..97d2561df602a 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -3,7 +3,7 @@ import json from abc import ABC, abstractmethod from collections import defaultdict, deque -from functools import lru_cache, partial +from functools import cache, lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, Literal, Optional, Tuple, TypeVar, Union, cast) @@ -377,7 +377,7 @@ def allowed_local_media_path(self): return self._model_config.allowed_local_media_path @staticmethod - @lru_cache(maxsize=None) + @cache def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: return tokenizer.decode(token_index) @@ -392,7 +392,7 @@ def _placeholder_str(self, modality: ModalityStr, if model_type == "phi3_v": # Workaround since this token is not defined in the tokenizer return f"<|image_{current_count}|>" - if model_type == "minicpmv": + if model_type in ("minicpmo", "minicpmv"): return "(./)" if model_type in ("blip-2", "chatglm", "fuyu", "paligemma", "pixtral"): @@ -424,10 +424,14 @@ def _placeholder_str(self, modality: ModalityStr, if model_type == "qwen2_audio": return (f"Audio {current_count}: " f"<|audio_bos|><|AUDIO|><|audio_eos|>") + if model_type == "minicpmo": + return "()" raise TypeError(f"Unknown model type: {model_type}") elif modality == "video": if model_type == "qwen2_vl": return "<|vision_start|><|video_pad|><|vision_end|>" + if model_type in ("minicpmo", "minicpmv"): + return "()" if model_type.startswith("llava"): return self._cached_token_str(self._tokenizer, hf_config.video_token_index) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0cfe6be9ac767..b2c51706ee2b3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -5,8 +5,10 @@ Tuple, Type, Union, cast, overload) import cloudpickle +import torch +import torch.nn as nn from tqdm import tqdm -from typing_extensions import deprecated +from typing_extensions import TypeVar, deprecated from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, @@ -42,6 +44,8 @@ logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class LLM: """An LLM for generating texts from given prompts and sampling parameters. @@ -464,25 +468,42 @@ def generate( return self.engine_class.validate_outputs(outputs, RequestOutput) def collective_rpc(self, - method: Union[str, Callable], + method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: + """ + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. 
+ + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. """ - Run a method on all workers, with homogeneous arguments. - The main extension point for the LLM entrypoint. - Users can provide custom worker class through `worker_cls` - argument, and implement new methods in the worker class. - Then, users can call the new methods through this API. - It is recommended to use this API to only pass control messages, - and set up data-plane communication to pass data. - The method can also be a callable, which will be serialized - and sent to all workers to execute. - If the method is a callable, it should accept an additional - `self` argument, in addition to the arguments passed in `args` - and `kwargs`. The `self` argument will be the worker object. + executor = self.llm_engine.model_executor + return executor.collective_rpc(method, timeout, args, kwargs) + + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. """ - return self.llm_engine.collective_rpc(method, timeout, args, kwargs) + executor = self.llm_engine.model_executor + return executor.apply_model(func) def beam_search( self, @@ -976,6 +997,107 @@ def classify( return [ClassificationRequestOutput.from_base(item) for item in items] + def _embedding_score( + self, + tokenizer: AnyTokenizer, + text_1: List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + encoded_output = self.encode( + text_1 + text_2, + use_tqdm=use_tqdm, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + encoded_output_1 = encoded_output[0:len(text_1)] + encoded_output_2 = encoded_output[len(text_1):] + + if len(encoded_output_1) == 1: + encoded_output_1 = encoded_output_1 * len(encoded_output_2) + + output_pairs = [(t1, t2) + for t1, t2 in zip(encoded_output_1, encoded_output_2)] + + scores = [] + scorer = torch.nn.CosineSimilarity(0) + + for embed_1, embed_2 in output_pairs: + pair_score = scorer(embed_1.outputs.data, embed_2.outputs.data) + + if (pad_token_id := getattr(tokenizer, "pad_token_id", + None)) is not None: + tokens = embed_1.prompt_token_ids + [ + pad_token_id + ] + embed_2.prompt_token_ids + else: + tokens = embed_1.prompt_token_ids + embed_2.prompt_token_ids + + scores.append( + PoolingRequestOutput( + request_id=f"{embed_1.request_id}_{embed_2.request_id}", + outputs=pair_score, + prompt_token_ids=tokens, + finished=True)) + + items = self.engine_class.validate_outputs(scores, + PoolingRequestOutput) + return [ScoringRequestOutput.from_base(item) for item in items] + + def _cross_encoding_score( + self, + tokenizer: Union[AnyTokenizer], + text_1: 
List[Union[str, TextPrompt, TokensPrompt]], + text_2: List[Union[str, TextPrompt, TokensPrompt]], + truncate_prompt_tokens: Optional[int] = None, + use_tqdm: bool = True, + lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> List[ScoringRequestOutput]: + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "Score API is only enabled for `--task embed or score`") + + if len(text_1) == 1: + text_1 = text_1 * len(text_2) + + input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] + + pooling_params = PoolingParams() + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + parsed_prompts = [] + + for q, t in input_pairs: + prompt_inputs = tokenizer(text=q, + text_pair=t, + **tokenization_kwargs) + engine_prompt = TokensPrompt( + prompt_token_ids=prompt_inputs["input_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + parsed_prompts.append(engine_prompt) + + self._validate_and_add_requests( + prompts=parsed_prompts, + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + + outputs = self._run_engine(use_tqdm=use_tqdm) + items = self.engine_class.validate_outputs(outputs, + PoolingRequestOutput) + + return [ScoringRequestOutput.from_base(item) for item in items] + def score( self, text_1: Union[SingletonPrompt, Sequence[SingletonPrompt]], @@ -1027,25 +1149,20 @@ def score( raise ValueError(" ".join(messages)) - if not self.llm_engine.model_config.is_cross_encoder: - raise ValueError("Your model does not support cross encoding") - if self.llm_engine.model_config.task != "score": - raise ValueError("Score API is only enabled for `--task score`") - - tokenizer = self.llm_engine.get_tokenizer() - - if isinstance(tokenizer, MistralTokenizer): + if self.llm_engine.model_config.task not in ("embed", "score"): raise ValueError( - "MistralTokenizer not supported for cross-encoding") + "Score API is only enabled for `--task embed or --task score`") # the tokenizer for models such as # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # lists of tokens to the `text` and `text_pair` kwargs + tokenizer = self.llm_engine.get_tokenizer() + def ensure_str(prompt: SingletonPrompt): if isinstance(prompt, dict): if "multi_modal_data" in prompt: raise ValueError("Multi-modal prompt is not " - "supported for cross encoding") + "supported for scoring") elif "prompt_token_ids" in prompt: prompt = tokenizer.decode( cast(TokensPrompt, prompt)["prompt_token_ids"]) @@ -1071,40 +1188,15 @@ def ensure_str(prompt: SingletonPrompt): if len(text_2) == 0: raise ValueError("At least one text_pair element must be given") - if len(text_1) == 1: - text_1 = text_1 * len(text_2) - - input_pairs = [(t1, t2) for t1, t2 in zip(text_1, text_2)] - pooling_params = PoolingParams() - - tokenization_kwargs: Dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - - parsed_prompts = [] - - for q, t in input_pairs: - prompt_inputs = tokenizer(text=q, - text_pair=t, - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - parsed_prompts.append(engine_prompt) - - self._validate_and_add_requests( - prompts=parsed_prompts, 
- params=pooling_params, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - ) - - outputs = self._run_engine(use_tqdm=use_tqdm) - items = self.engine_class.validate_outputs(outputs, - PoolingRequestOutput) - - return [ScoringRequestOutput.from_base(item) for item in items] + if self.llm_engine.model_config.is_cross_encoder: + return self._cross_encoding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, + prompt_adapter_request) + else: + return self._embedding_score(tokenizer, text_1, text_2, + truncate_prompt_tokens, use_tqdm, + lora_request, prompt_adapter_request) def start_profile(self) -> None: self.llm_engine.start_profile() @@ -1112,6 +1204,36 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.llm_engine.stop_profile() + def reset_prefix_cache(self) -> bool: + return self.llm_engine.reset_prefix_cache() + + def sleep(self, level: int = 1): + """ + Put the engine to sleep. The engine should not process any requests. + The caller should guarantee that no requests are being processed + during the sleep period, before `wake_up` is called. + + :param level: The sleep level. Level 1 sleep will offload the model + weights and discard the kv cache. The content of kv cache is + forgotten. Level 1 sleep is good for sleeping and waking up the + engine to run the same model again. The model weights are backed + up in CPU memory. Please make sure there's enough CPU memory to + store the model weights. Level 2 sleep will discard both the model + weights and the kv cache. The content of both the model weights + and kv cache is forgotten. Level 2 sleep is good for sleeping and + waking up the engine to run a different model or update the model, + where previous model weights are not needed. It reduces CPU memory + pressure. + """ + self.reset_prefix_cache() + self.llm_engine.sleep(level=level) + + def wake_up(self): + """ + Wake up the engine from sleep mode. See the :meth:`sleep` method + for more details.""" + self.llm_engine.wake_up() + # LEGACY def _convert_v1_inputs( self, @@ -1284,6 +1406,10 @@ def _run_engine( if use_tqdm: pbar.close() + + # Make sure that all workers are finished. + self.llm_engine.stop_remote_worker_execution_loop() + # Sort the outputs by request ID. # This is necessary because some requests may be finished earlier than # its previous requests. 
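For readers skimming the patch, the hunk above also gives `LLM` a `reset_prefix_cache()` passthrough plus the new `sleep()`/`wake_up()` pair. A minimal usage sketch (the model name and the `enable_sleep_mode` engine flag are assumptions for illustration, not shown in this diff):

```python
# Sketch of the new sleep/wake_up API on vllm.LLM. Level-1 sleep backs the
# weights up to CPU RAM and discards the KV cache, so the host needs enough
# memory to hold them.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enable_sleep_mode=True)  # flag assumed
params = SamplingParams(temperature=0.0, max_tokens=32)

print(llm.generate(["Hello, my name is"], params)[0].outputs[0].text)

# Free GPU memory while keeping the engine ready to serve the same model.
llm.sleep(level=1)
# ... the GPU can be used for something else here ...
llm.wake_up()

print(llm.generate(["The capital of France is"], params)[0].outputs[0].text)
```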
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 1aeefe86cd05e..9e5cf4ba2e490 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1,5 +1,6 @@ import asyncio import atexit +import gc import importlib import inspect import multiprocessing @@ -55,10 +56,12 @@ PoolingChatRequest, PoolingCompletionRequest, PoolingRequest, PoolingResponse, + RerankRequest, RerankResponse, ScoreRequest, ScoreResponse, TokenizeRequest, TokenizeResponse, UnloadLoraAdapterRequest) +from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion @@ -67,6 +70,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling +from vllm.entrypoints.openai.serving_rerank import JinaAIServingRerank from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( OpenAIServingTokenization) @@ -104,6 +108,11 @@ async def _force_log(): task.add_done_callback(_running_tasks.remove) else: task = None + + # Mark the startup heap as static so that it's ignored by GC. + # Reduces pause times of oldest generation collections. + gc.collect() + gc.freeze() try: yield finally: @@ -300,6 +309,10 @@ def score(request: Request) -> Optional[OpenAIServingScores]: return request.app.state.openai_serving_scores +def rerank(request: Request) -> Optional[JinaAIServingRerank]: + return request.app.state.jinaai_serving_reranking + + def tokenization(request: Request) -> OpenAIServingTokenization: return request.app.state.openai_serving_tokenization @@ -496,6 +509,40 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) +@router.post("/rerank") +@with_cancellation +async def do_rerank(request: RerankRequest, raw_request: Request): + handler = rerank(raw_request) + if handler is None: + return base(raw_request).create_error_response( + message="The model does not support Rerank (Score) API") + generator = await handler.do_rerank(request, raw_request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + elif isinstance(generator, RerankResponse): + return JSONResponse(content=generator.model_dump()) + + assert_never(generator) + + +@router.post("/v1/rerank") +@with_cancellation +async def do_rerank_v1(request: RerankRequest, raw_request: Request): + logger.warning_once( + "To indicate that the rerank API is not part of the standard OpenAI" + " API, we have located it at `/rerank`. Please update your client" + "accordingly. 
(Note: Conforms to JinaAI rerank API)") + + return await do_rerank(request, raw_request) + + +@router.post("/v2/rerank") +@with_cancellation +async def do_rerank_v2(request: RerankRequest, raw_request: Request): + return await do_rerank(request, raw_request) + + TASK_HANDLERS: Dict[str, Dict[str, tuple]] = { "generate": { "messages": (ChatCompletionRequest, create_chat_completion), @@ -506,7 +553,10 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): "default": (EmbeddingCompletionRequest, create_embedding), }, "score": { - "default": (ScoreRequest, create_score), + "default": (RerankRequest, do_rerank) + }, + "rerank": { + "default": (RerankRequest, do_rerank) }, "reward": { "messages": (PoolingChatRequest, create_pooling), @@ -518,6 +568,18 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): }, } +if envs.VLLM_SERVER_DEV_MODE: + + @router.post("/reset_prefix_cache") + async def reset_prefix_cache(raw_request: Request): + """ + Reset the prefix cache. Note that we currently do not check if the + prefix cache is successfully reset in the API server. + """ + logger.info("Resetting prefix cache...") + await engine_client(raw_request).reset_prefix_cache() + return Response(status_code=200) + @router.post("/invocations") async def invocations(raw_request: Request): @@ -710,6 +772,8 @@ async def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, + enable_reasoning=args.enable_reasoning, + reasoning_parser=args.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.runner_type == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( @@ -741,6 +805,12 @@ async def init_app_state( state.openai_serving_models, request_logger=request_logger ) if model_config.task == "score" else None + state.jinaai_serving_reranking = JinaAIServingRerank( + engine_client, + model_config, + state.openai_serving_models, + request_logger=request_logger + ) if model_config.task == "score" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, @@ -777,6 +847,13 @@ async def run_server(args, **uvicorn_kwargs) -> None: raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " f"(chose from {{ {','.join(valid_tool_parses)} }})") + valid_reasoning_parses = ReasoningParserManager.reasoning_parsers.keys() + if args.enable_reasoning \ + and args.reasoning_parser not in valid_reasoning_parses: + raise KeyError( + f"invalid reasoning parser: {args.reasoning_parser} " + f"(chose from {{ {','.join(valid_reasoning_parses)} }})") + # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. 
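The `/rerank`, `/v1/rerank` and `/v2/rerank` routes added above follow the JinaAI rerank contract. A hedged client sketch (host, port and model name are placeholders; the payload and response fields mirror `RerankRequest`/`RerankResponse` from this patch):

```python
# Hypothetical client for the new JinaAI-style rerank endpoint.
import requests

payload = {
    "model": "BAAI/bge-reranker-base",  # any cross-encoder served with --task score
    "query": "What is the capital of France?",
    "documents": [
        "Paris is the capital of France.",
        "The Eiffel Tower is in Paris.",
        "Berlin is the capital of Germany.",
    ],
    "top_n": 2,
}

resp = requests.post("http://localhost:8000/rerank", json=payload)
resp.raise_for_status()

# Results come back sorted by relevance_score and truncated to top_n.
for result in resp.json()["results"]:
    print(result["relevance_score"], result["document"]["text"])
```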
# see https://github.com/vllm-project/vllm/issues/8204 diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 35445449463e9..9cfe07c65d55e 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -12,6 +12,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) +from vllm.entrypoints.openai.reasoning_parsers import ReasoningParserManager from vllm.entrypoints.openai.serving_models import (LoRAModulePath, PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -117,7 +118,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "or JSON format. " "Example (old format): ``'name=path'`` " "Example (new format): " - "``{\"name\": \"name\", \"local_path\": \"path\", " + "``{\"name\": \"name\", \"path\": \"lora_path\", " "\"base_model_name\": \"id\"}``") parser.add_argument( "--prompt-adapters", @@ -208,6 +209,23 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help="Enable auto tool choice for supported models. Use " "``--tool-call-parser`` to specify which parser to use.") + parser.add_argument( + "--enable-reasoning", + action="store_true", + default=False, + help="Whether to enable reasoning_content for the model. " + "If enabled, the model will be able to generate reasoning content.") + + valid_reasoning_parsers = ReasoningParserManager.reasoning_parsers.keys() + parser.add_argument( + "--reasoning-parser", + type=str, + metavar="{" + ",".join(valid_reasoning_parsers) + "}", + default=None, + help= + "Select the reasoning parser depending on the model that you're using." + " This is used to parse the reasoning content into OpenAI API " + "format. Required for ``--enable-reasoning``.") valid_tool_parsers = ToolParserManager.tool_parsers.keys() parser.add_argument( @@ -267,6 +285,18 @@ def validate_parsed_serve_args(args: argparse.Namespace): raise TypeError("Error: --enable-auto-tool-choice requires " "--tool-call-parser") + # Enable reasoning needs a reasoning parser to be valid + if args.enable_reasoning and not args.reasoning_parser: + raise TypeError("Error: --enable-reasoning requires " + "--reasoning-parser") + + # Ref https://api-docs.deepseek.com/guides/reasoning_model + # tool call and reasoning cannot be enabled at the same time. 
+ if args.enable_auto_tool_choice and args.enable_reasoning: + raise TypeError( + "Error: --enable-auto-tool-choice and " + "--enable-reasoning cannot be enabled at the same time") + def create_parser_for_docs() -> FlexibleArgumentParser: parser_for_docs = FlexibleArgumentParser( diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 14e41346df775..29d071ce50c8e 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -3,10 +3,11 @@ import re import time from argparse import Namespace -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, ClassVar, Dict, List, Literal, Optional, Set, Union import torch -from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter, + ValidationInfo, field_validator, model_validator) from typing_extensions import Annotated from vllm.entrypoints.chat_utils import ChatCompletionMessageParam @@ -42,24 +43,32 @@ class OpenAIBaseModel(BaseModel): # OpenAI API does allow extra fields model_config = ConfigDict(extra="allow") - @model_validator(mode="before") + # Cache class field names + field_names: ClassVar[Optional[Set[str]]] = None + + @model_validator(mode="wrap") @classmethod - def __log_extra_fields__(cls, data): - if isinstance(data, dict): + def __log_extra_fields__(cls, data, handler): + result = handler(data) + if not isinstance(data, dict): + return result + field_names = cls.field_names + if field_names is None: # Get all class field names and their potential aliases field_names = set() for field_name, field in cls.model_fields.items(): field_names.add(field_name) - if hasattr(field, 'alias') and field.alias: - field_names.add(field.alias) - - # Compare against both field names and aliases - extra_fields = data.keys() - field_names - if extra_fields: - logger.warning( - "The following fields were present in the request " - "but ignored: %s", extra_fields) - return data + if alias := getattr(field, 'alias', None): + field_names.add(alias) + cls.field_names = field_names + + # Compare against both field names and aliases + if any(k not in field_names for k in data): + logger.warning( + "The following fields were present in the request " + "but ignored: %s", + data.keys() - field_names) + return result class ErrorResponse(OpenAIBaseModel): @@ -372,13 +381,17 @@ def to_beam_search_params( ) -> BeamSearchParams: # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} n = self.n if self.n is not None else 1 + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + if (temperature := self.temperature) is None: temperature = default_sampling_params.get( "temperature", self._DEFAULT_SAMPLING_PARAMS["temperature"]) @@ -398,11 +411,16 @@ def to_sampling_params( default_sampling_params: Optional[dict] = None) -> SamplingParams: # TODO(#9845): remove max_tokens when field is removed from OpenAI API max_tokens = self.max_completion_tokens or self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} + + # Use minimum of context window, user request & server limit. 
+ max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + # Default parameters if (repetition_penalty := self.repetition_penalty) is None: repetition_penalty = default_sampling_params.get( @@ -732,13 +750,17 @@ def to_beam_search_params( default_sampling_params: Optional[dict] = None ) -> BeamSearchParams: max_tokens = self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} n = self.n if self.n is not None else 1 + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + if (temperature := self.temperature) is None: temperature = default_sampling_params.get("temperature", 1.0) @@ -756,11 +778,16 @@ def to_sampling_params( logits_processor_pattern: Optional[str], default_sampling_params: Optional[dict] = None) -> SamplingParams: max_tokens = self.max_tokens - if max_tokens is None: - max_tokens = default_max_tokens if default_sampling_params is None: default_sampling_params = {} + + # Use minimum of context window, user request & server limit. + max_tokens = min( + val for val in (default_max_tokens, max_tokens, + default_sampling_params.get("max_tokens", None)) + if val is not None) + # Default parameters if (repetition_penalty := self.repetition_penalty) is None: repetition_penalty = default_sampling_params.get( @@ -992,6 +1019,52 @@ def to_pooling_params(self): return PoolingParams(additional_data=self.additional_data) +class RerankRequest(OpenAIBaseModel): + model: str + query: str + documents: List[str] + top_n: int = Field(default_factory=lambda: 0) + truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None + + # doc: begin-rerank-pooling-params + additional_data: Optional[Any] = None + # doc: end-rerank-pooling-params + + # doc: begin-rerank-extra-params + priority: int = Field( + default=0, + description=( + "The priority of the request (lower means earlier handling; " + "default: 0). Any priority other than 0 will raise an error " + "if the served model does not use priority scheduling.")) + + # doc: end-rerank-extra-params + + def to_pooling_params(self): + return PoolingParams(additional_data=self.additional_data) + + +class RerankDocument(BaseModel): + text: str + + +class RerankResult(BaseModel): + index: int + document: RerankDocument + relevance_score: float + + +class RerankUsage(BaseModel): + total_tokens: int + + +class RerankResponse(OpenAIBaseModel): + id: str + model: str + usage: RerankUsage + results: List[RerankResult] + + class CompletionLogProbs(OpenAIBaseModel): text_offset: List[int] = Field(default_factory=list) token_logprobs: List[Optional[float]] = Field(default_factory=list) @@ -1130,6 +1203,7 @@ class ExtractedToolCallInformation(BaseModel): class ChatMessage(OpenAIBaseModel): role: str + reasoning_content: Optional[str] = None content: Optional[str] = None tool_calls: List[ToolCall] = Field(default_factory=list) @@ -1171,6 +1245,7 @@ class ChatCompletionResponse(OpenAIBaseModel): class DeltaMessage(OpenAIBaseModel): role: Optional[str] = None content: Optional[str] = None + reasoning_content: Optional[str] = None tool_calls: List[DeltaToolCall] = Field(default_factory=list) @@ -1211,7 +1286,21 @@ class BatchRequestInput(OpenAIBaseModel): url: str # The parameters of the request. 
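Because `ChatMessage` and `DeltaMessage` now carry `reasoning_content`, clients can read the model's thinking separately from the final answer. A sketch assuming the server was started with `--enable-reasoning --reasoning-parser deepseek_r1` (base URL and model name are placeholders):

```python
# Hypothetical client reading the new reasoning_content field from a
# chat completion response.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    messages=[{"role": "user", "content": "What is 9 * 7?"}],
)

msg = resp.choices[0].message
# reasoning_content is a vLLM extension, so fetch it defensively.
print("reasoning:", getattr(msg, "reasoning_content", None))
print("answer:", msg.content)
```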
- body: Union[ChatCompletionRequest, EmbeddingRequest] + body: Union[ChatCompletionRequest, EmbeddingRequest, ScoreRequest] + + @field_validator('body', mode='plain') + @classmethod + def check_type_for_url(cls, value: Any, info: ValidationInfo): + # Use url to disambiguate models + url = info.data['url'] + if url == "/v1/chat/completions": + return ChatCompletionRequest.model_validate(value) + if url == "/v1/embeddings": + return TypeAdapter(EmbeddingRequest).validate_python(value) + if url == "/v1/score": + return ScoreRequest.model_validate(value) + return TypeAdapter(Union[ChatCompletionRequest, EmbeddingRequest, + ScoreRequest]).validate_python(value) class BatchResponseData(OpenAIBaseModel): @@ -1222,7 +1311,8 @@ class BatchResponseData(OpenAIBaseModel): request_id: str # The body of the response. - body: Optional[Union[ChatCompletionResponse, EmbeddingResponse]] = None + body: Optional[Union[ChatCompletionResponse, EmbeddingResponse, + ScoreResponse]] = None class BatchRequestOutput(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/reasoning_parsers/__init__.py b/vllm/entrypoints/openai/reasoning_parsers/__init__.py new file mode 100644 index 0000000000000..a21bff52f61fa --- /dev/null +++ b/vllm/entrypoints/openai/reasoning_parsers/__init__.py @@ -0,0 +1,6 @@ +from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager +from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser + +__all__ = [ + "ReasoningParser", "ReasoningParserManager", "DeepSeekR1ReasoningParser" +] diff --git a/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py new file mode 100644 index 0000000000000..e5d10ee0bc3a8 --- /dev/null +++ b/vllm/entrypoints/openai/reasoning_parsers/abs_reasoning_parsers.py @@ -0,0 +1,158 @@ +import os +from functools import cached_property +from typing import Callable, Dict, List, Optional, Sequence, Tuple, Type, Union + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import import_from_path, is_list_of + +logger = init_logger(__name__) + + +class ReasoningParser: + """ + Abstract reasoning parser class that should not be used directly. + Provided and methods should be used in derived classes. + + It is used to extract reasoning content from the model output. + """ + + def __init__(self, tokenizer: AnyTokenizer): + self.model_tokenizer = tokenizer + + @cached_property + def vocab(self) -> Dict[str, int]: + # NOTE: Only PreTrainedTokenizerFast is guaranteed to have .vocab + # whereas all tokenizers have .get_vocab() + return self.model_tokenizer.get_vocab() + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> Tuple[Optional[str], Optional[str]]: + """ + Extract reasoning content from a complete model-generated string. + + Used for non-streaming responses where we have the entire model response + available before sending to the client. + + Parameters: + model_output: str + The model-generated string to extract reasoning content from. + + request: ChatCompletionRequest + The request object that was used to generate the model_output. + + Returns: + Tuple[Optional[str], Optional[str]] + A tuple containing the reasoning content and the content. 
+ """ + + raise NotImplementedError( + "AbstractReasoningParser.extract_reasoning_calls " + "has not been implemented!") + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Instance method that should be implemented for extracting reasoning + from an incomplete response; for use when handling reasoning calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + raise NotImplementedError( + "AbstractReasoningParser.extract_reasoning_content_streaming " + "has not been implemented!") + + +class ReasoningParserManager: + reasoning_parsers: Dict[str, Type] = {} + + @classmethod + def get_reasoning_parser(cls, name) -> Type: + """ + Get reasoning parser by name which is registered by `register_module`. + + Raise a KeyError exception if the name is not registered. + """ + if name in cls.reasoning_parsers: + return cls.reasoning_parsers[name] + + raise KeyError(f"reasoning helper: '{name}' not found in " + "reasoning_parsers") + + @classmethod + def _register_module(cls, + module: Type, + module_name: Optional[Union[str, List[str]]] = None, + force: bool = True) -> None: + if not issubclass(module, ReasoningParser): + raise TypeError("module must be subclass of ReasoningParser, " + f"but got {type(module)}") + if module_name is None: + module_name = module.__name__ + if isinstance(module_name, str): + module_name = [module_name] + for name in module_name: + if not force and name in cls.reasoning_parsers: + existed_module = cls.reasoning_parsers[name] + raise KeyError(f"{name} is already registered " + f"at {existed_module.__module__}") + cls.reasoning_parsers[name] = module + + @classmethod + def register_module( + cls, + name: Optional[Union[str, List[str]]] = None, + force: bool = True, + module: Union[Type, None] = None) -> Union[type, Callable]: + """ + Register module with the given name or name list. it can be used as a + decoder(with module as None) or normal function(with module as not + None). + """ + if not isinstance(force, bool): + raise TypeError(f"force must be a boolean, but got {type(force)}") + + # raise the error ahead of time + if not (name is None or isinstance(name, str) + or is_list_of(name, str)): + raise TypeError( + "name must be None, an instance of str, or a sequence of str, " + f"but got {type(name)}") + + # use it as a normal method: x.register_module(module=SomeClass) + if module is not None: + cls._register_module(module=module, module_name=name, force=force) + return module + + # use it as a decorator: @x.register_module() + def _register(module): + cls._register_module(module=module, module_name=name, force=force) + return module + + return _register + + @classmethod + def import_reasoning_parser(cls, plugin_path: str) -> None: + """ + Import a user-defined reasoning parser by the path + of the reasoning parser define file. 
+ """ + module_name = os.path.splitext(os.path.basename(plugin_path))[0] + + try: + import_from_path(module_name, plugin_path) + except Exception: + logger.exception("Failed to load module '%s' from %s.", + module_name, plugin_path) + return diff --git a/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py new file mode 100644 index 0000000000000..a440ddc8d3b5d --- /dev/null +++ b/vllm/entrypoints/openai/reasoning_parsers/deepseek_r1_reasoning_parser.py @@ -0,0 +1,133 @@ +import re +from typing import Optional, Sequence, Tuple, Union + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage) +from vllm.entrypoints.openai.reasoning_parsers.abs_reasoning_parsers import ( + ReasoningParser, ReasoningParserManager) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@ReasoningParserManager.register_module("deepseek_r1") +class DeepSeekR1ReasoningParser(ReasoningParser): + """ + Reasoning parser for DeepSeek R1 model. + + The DeepSeek R1 model uses ... tokens to denote reasoning + text. This parser extracts the reasoning content from the model output. + """ + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + self.think_start_token = "" + self.think_end_token = "" + + self.reasoning_regex = re.compile( + rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ReasoningParser " + "constructor during construction.") + + self.think_start_token_id = self.vocab.get(self.think_start_token) + self.think_end_token_id = self.vocab.get(self.think_end_token) + if (self.think_start_token_id is None + or self.think_end_token_id is None): + raise RuntimeError( + "DeepSeek R1 reasoning parser could not locate think start/end " + "tokens in the tokenizer!") + + def extract_reasoning_content_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Extract reasoning content from a delta message. + Handles streaming output where previous + delta = current. + Uses token IDs for faster processing. 
+ For text <think>abc</think>xyz: + - 'abc' goes to reasoning_content + - 'xyz' goes to content + """ + # Skip single special tokens + if len(delta_token_ids) == 1 and (delta_token_ids[0] in [ + self.think_start_token_id, self.think_end_token_id + ]): + return None + + if self.think_start_token_id in previous_token_ids: + if self.think_end_token_id in delta_token_ids: + # <think> in previous, </think> in delta, + # extract reasoning content + end_index = delta_text.find(self.think_end_token) + reasoning_content = delta_text[:end_index] + content = delta_text[end_index + len(self.think_end_token):] + return DeltaMessage(reasoning_content=reasoning_content, + content=content if content else None) + elif self.think_end_token_id in previous_token_ids: + # <think> in previous, </think> in previous, + # reasoning content continues + return DeltaMessage(content=delta_text) + else: + # <think> in previous, no </think> in previous or delta, + # reasoning content continues + return DeltaMessage(reasoning_content=delta_text) + elif self.think_start_token_id in delta_token_ids: + logger.info(delta_text) + if self.think_end_token_id in delta_token_ids: + # <think> in delta, </think> in delta, extract reasoning content + start_index = delta_text.find(self.think_start_token) + end_index = delta_text.find(self.think_end_token) + reasoning_content = delta_text[start_index + + len(self.think_start_token + ):end_index] + content = delta_text[end_index + len(self.think_end_token):] + return DeltaMessage(reasoning_content=reasoning_content, + content=content if content else None) + else: + # <think> in delta, no </think> in delta, + # reasoning content continues + return DeltaMessage(reasoning_content=delta_text) + else: + # No <think> in previous or delta, reasoning content continues. + return DeltaMessage(content=delta_text) + + def extract_reasoning_content( + self, model_output: str, request: ChatCompletionRequest + ) -> Tuple[Optional[str], Optional[str]]: + + # Check if the model output contains the <think> tokens. + if (self.think_start_token not in model_output + or self.think_end_token not in model_output): + return None, model_output + else: + # Use a regex to find the reasoning content + reasoning_content = self.reasoning_regex.findall(model_output)[0] + + # Remove the reasoning content from the model output + # Although deepseek's <think> token is always at the + # beginning of the line, we cannot guarantee that the + # other models will follow this convention. + # Therefore, we need to add :start_index. 
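A small non-streaming sketch of the extraction path in this file (the tokenizer must actually contain the `<think>`/`</think>` tokens, as the DeepSeek-R1 tokenizers do; the model name is a placeholder):

```python
# Exercising DeepSeekR1ReasoningParser.extract_reasoning_content directly.
from transformers import AutoTokenizer

from vllm.entrypoints.openai.reasoning_parsers import DeepSeekR1ReasoningParser

tokenizer = AutoTokenizer.from_pretrained(
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
parser = DeepSeekR1ReasoningParser(tokenizer)

output = "<think>9 * 7 = 63</think>The answer is 63."
# The request argument is unused by this parser, so None is passed here.
reasoning, content = parser.extract_reasoning_content(output, request=None)
print(reasoning)  # 9 * 7 = 63
print(content)    # The answer is 63.
```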
+ start_index = model_output.find(self.think_start_token) + if start_index != -1: + end_index = start_index + len( + f"{self.think_start_token}{reasoning_content}{self.think_end_token}" + ) + model_output = model_output[:start_index] + \ + model_output[end_index:] + + if len(model_output) == 0: + return reasoning_content, None + + return reasoning_content, model_output diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index f8f136f9d5024..37ae23506acea 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -16,12 +16,14 @@ BatchRequestOutput, BatchResponseData, ChatCompletionResponse, - EmbeddingResponse, ErrorResponse) + EmbeddingResponse, ErrorResponse, + ScoreResponse) # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding from vllm.entrypoints.openai.serving_models import (BaseModelPath, OpenAIServingModels) +from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -167,7 +169,8 @@ async def run_request(serving_engine_func: Callable, tracker: BatchProgressTracker) -> BatchRequestOutput: response = await serving_engine_func(request.body) - if isinstance(response, (ChatCompletionResponse, EmbeddingResponse)): + if isinstance(response, + (ChatCompletionResponse, EmbeddingResponse, ScoreResponse)): batch_output = BatchRequestOutput( id=f"vllm-{random_uuid()}", custom_id=request.custom_id, @@ -239,6 +242,12 @@ async def main(args): chat_template=None, chat_template_content_format="auto", ) if model_config.task == "embed" else None + openai_serving_scores = (OpenAIServingScores( + engine, + model_config, + openai_serving_models, + request_logger=request_logger, + ) if model_config.task == "score" else None) tracker = BatchProgressTracker() logger.info("Reading batch from %s...", args.input_file) @@ -279,14 +288,28 @@ async def main(args): )) continue + response_futures.append(run_request(handler_fn, request, tracker)) + tracker.submitted() + elif request.url == "/v1/score": + handler_fn = (None if openai_serving_scores is None else + openai_serving_scores.create_score) + if handler_fn is None: + response_futures.append( + make_async_error_request_output( + request, + error_msg="The model does not support Scores API", + )) + continue + response_futures.append(run_request(handler_fn, request, tracker)) tracker.submitted() else: response_futures.append( make_async_error_request_output( request, - error_msg="Only /v1/chat/completions and " - "/v1/embeddings are supported in the batch endpoint.", + error_msg= + "Only /v1/chat/completions, /v1/embeddings, and /v1/score " + "are supported in the batch endpoint.", )) with tracker.pbar(): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 89a119ac65695..dc97f0eb059d7 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -21,6 +21,8 @@ ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, RequestResponseMetadata, ToolCall, UsageInfo) +from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser, + ReasoningParserManager) from vllm.entrypoints.openai.serving_engine import OpenAIServing from 
vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager @@ -47,6 +49,8 @@ def __init__( chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, return_tokens_as_token_ids: bool = False, + enable_reasoning: bool = False, + reasoning_parser: Optional[str] = None, enable_auto_tools: bool = False, tool_parser: Optional[str] = None, enable_prompt_tokens_details: bool = False, @@ -69,6 +73,18 @@ def __init__( " the parallel_tool_calls client option is preset for " "compatibility reasons, it will be ignored.") + self.enable_reasoning: bool = enable_reasoning + self.reasoning_parser: Optional[Callable[[AnyTokenizer], + ReasoningParser]] = None + if self.enable_reasoning: + try: + self.reasoning_parser = ( + ReasoningParserManager.get_reasoning_parser( + reasoning_parser)) + except Exception as e: + raise TypeError("Error: --enable-reasoning requires " + f"reasoning_parser:'{reasoning_parser}' " + "which has not been registered") from e self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None if self.enable_auto_tools: try: @@ -285,14 +301,35 @@ async def chat_completion_stream_generator( not tool_choice_function_name and self._should_stream_with_auto_tool_parsing(request)) + should_stream_with_reasoning_parsing = ( + self._should_stream_with_reasoning_parsing(request)) + all_previous_token_ids: Optional[List[List[int]]] - if tool_choice_auto: + + # Only one of these will be used, thus previous_texts and + # all_previous_token_ids will not be used twice in the same iteration. + if tool_choice_auto or should_stream_with_reasoning_parsing: # These are only required in "auto" tool choice case previous_texts = [""] * num_choices all_previous_token_ids = [[]] * num_choices else: previous_texts, all_previous_token_ids = None, None + try: + # There is no need to check if the reasoning_parser is None + # because the should_stream_with_reasoning_parsing check + # already ensures that the reasoning_parser is not None. + # but the pre-commit hook requires it. + if should_stream_with_reasoning_parsing and \ + self.reasoning_parser is not None: + reasoning_parser = self.reasoning_parser(tokenizer) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + data = self.create_streaming_error_response(str(e)) + yield f"data: {data}\n\n" + yield "data: [DONE]\n\n" + return + # Prepare the tool parser if it's needed try: if tool_choice_auto and self.tool_parser: @@ -456,6 +493,32 @@ async def chat_completion_stream_generator( # update the previous values for the next iteration previous_texts[i] = current_text all_previous_token_ids[i] = current_token_ids + # reasoning_content cannot be enabled with tool_choice. + # If it is, the tool_choice will be used instead. + elif self.enable_reasoning: + # handle reasoning_content delta + assert reasoning_parser is not None + assert previous_texts is not None + assert all_previous_token_ids is not None + previous_text = previous_texts[i] + previous_token_ids = all_previous_token_ids[i] + current_text = previous_text + delta_text + current_token_ids = previous_token_ids + list( + output.token_ids) + + delta_message = (reasoning_parser. 
+ extract_reasoning_content_streaming( + previous_text, + current_text, + delta_text, + previous_token_ids, + current_token_ids, + output.token_ids, + )) + + # update the previous values for the next iteration + previous_texts[i] = current_text + all_previous_token_ids[i] = current_token_ids # handle streaming just a content delta else: @@ -642,17 +705,38 @@ async def chat_completion_full_generator( else: logprobs = None + should_stream_with_reasoning_parsing = ( + self._should_stream_with_reasoning_parsing(request)) + # In the OpenAI API the finish_reason is "tools_called" # if the tool choice is auto and the model produced a tool # call. The same is not true for named function calls auto_tools_called = False + if should_stream_with_reasoning_parsing and \ + self.reasoning_parser is not None: + try: + reasoning_parser = self.reasoning_parser(tokenizer) + except RuntimeError as e: + logger.exception("Error in reasoning parser creation.") + return self.create_error_response(str(e)) + + reasoning_content, content = ( + reasoning_parser.extract_reasoning_content( + output.text, request=request)) + + if reasoning_content: + message = ChatMessage(role=role, + content=content, + reasoning_content=reasoning_content) + else: + message = ChatMessage(role=role, content=output.text) + # if auto tools are not enabled, and a named tool choice using # outlines is not being used - if (not self.enable_auto_tools - or not self.tool_parser) and not isinstance( - request.tool_choice, - ChatCompletionNamedToolChoiceParam): + elif (not self.enable_auto_tools + or not self.tool_parser) and not isinstance( + request.tool_choice, ChatCompletionNamedToolChoiceParam): message = ChatMessage(role=role, content=output.text) # if the request uses tools and specified a tool choice @@ -835,6 +919,17 @@ def _should_stream_with_auto_tool_parsing(self, return (request.tools and self.tool_parser and self.enable_auto_tools and request.tool_choice in ['auto', None]) + def _should_stream_with_reasoning_parsing(self, + request: ChatCompletionRequest): + """ + Utility function to check if streamed tokens should go through the + reasoning parser that was configured. + + We only want to do this IF reasoning is enabled and a reasoning + parser is configured. + """ + return self.enable_reasoning and self.reasoning_parser is not None + def _should_check_for_unstreamed_tool_arg_tokens( self, delta_message: Optional[DeltaMessage], diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2c9c20caf8119..13c3926368890 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -58,7 +58,7 @@ def __init__( async def create_completion( self, request: CompletionRequest, - raw_request: Request, + raw_request: Optional[Request] = None, ) -> Union[AsyncGenerator[str, None], CompletionResponse, ErrorResponse]: """Completion API similar to OpenAI's API. 
@@ -137,7 +137,7 @@ async def create_completion( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request) - trace_headers = (await + trace_headers = (None if raw_request is None else await self._get_trace_headers(raw_request.headers)) if isinstance(sampling_params, BeamSearchParams): @@ -522,11 +522,10 @@ def _create_completion_logprobs( out_top_logprobs.append({ # Convert float("-inf") to the # JSON-serializable float that OpenAI uses - self._get_decoded_token( - top_lp[1], - top_lp[0], - tokenizer, - return_as_token_id=self.return_tokens_as_token_ids): + self._get_decoded_token(top_lp[1], + top_lp[0], + tokenizer, + return_as_token_id=self.return_tokens_as_token_ids): max(top_lp[1].logprob, -9999.0) for i, top_lp in enumerate(step_top_logprobs.items()) if num_output_top_logprobs >= i diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 88859255f202a..8d54164e500eb 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -26,7 +26,8 @@ DetokenizeRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, - ErrorResponse, ScoreRequest, + ErrorResponse, RerankRequest, + ScoreRequest, TokenizeChatRequest, TokenizeCompletionRequest) from vllm.entrypoints.openai.serving_models import OpenAIServingModels @@ -203,15 +204,19 @@ def _validate_input( ) -> TextTokensPrompt: token_num = len(input_ids) - # Note: EmbeddingRequest doesn't have max_tokens + # Note: EmbeddingRequest and ScoreRequest doesn't have max_tokens if isinstance(request, - (EmbeddingChatRequest, EmbeddingCompletionRequest)): + (EmbeddingChatRequest, EmbeddingCompletionRequest, + ScoreRequest, RerankRequest)): + + operation = "score" if isinstance(request, ScoreRequest) \ + else "embedding generation" if token_num > self.max_model_len: raise ValueError( f"This model's maximum context length is " f"{self.max_model_len} tokens. However, you requested " - f"{token_num} tokens in the input for embedding " - f"generation. Please reduce the length of the input.") + f"{token_num} tokens in the input for {operation}. 
" + f"Please reduce the length of the input.") return TextTokensPrompt(prompt=input_text, prompt_token_ids=input_ids) diff --git a/vllm/entrypoints/openai/serving_rerank.py b/vllm/entrypoints/openai/serving_rerank.py new file mode 100644 index 0000000000000..be4420261afe3 --- /dev/null +++ b/vllm/entrypoints/openai/serving_rerank.py @@ -0,0 +1,206 @@ +import asyncio +from typing import Any, AsyncGenerator, Dict, List, Optional, Union, cast + +from fastapi import Request + +from vllm.config import ModelConfig +from vllm.engine.protocol import EngineClient +from vllm.entrypoints.logger import RequestLogger +from vllm.entrypoints.openai.protocol import (ErrorResponse, RerankDocument, + RerankRequest, RerankResponse, + RerankResult, RerankUsage) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels +from vllm.inputs.data import TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput +from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from vllm.utils import make_async, merge_async_iterators + +logger = init_logger(__name__) + + +class JinaAIServingRerank(OpenAIServing): + + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + models: OpenAIServingModels, + *, + request_logger: Optional[RequestLogger], + ) -> None: + super().__init__(engine_client=engine_client, + model_config=model_config, + models=models, + request_logger=request_logger) + + async def do_rerank( + self, + request: RerankRequest, + raw_request: Optional[Request] = None + ) -> Union[RerankResponse, ErrorResponse]: + """ + Rerank API based on JinaAI's rerank API; implements the same + API interface. Designed for compatibility with off-the-shelf + tooling, since this is a common standard for reranking APIs + + See example client implementations at + https://github.com/infiniflow/ragflow/blob/main/rag/llm/rerank_model.py + numerous clients use this standard. + """ + error_check_ret = await self._check_model(request) + if error_check_ret is not None: + return error_check_ret + + model_name = request.model + request_id = f"rerank-{self._base_request_id(raw_request)}" + truncate_prompt_tokens = request.truncate_prompt_tokens + query = request.query + documents = request.documents + request_prompts = [] + engine_prompts = [] + top_n = request.top_n if request.top_n > 0 else len(documents) + + try: + ( + lora_request, + prompt_adapter_request, + ) = self._maybe_get_adapters(request) + + tokenizer = await self.engine_client.get_tokenizer(lora_request) + + if prompt_adapter_request is not None: + raise NotImplementedError("Prompt adapter is not supported " + "for scoring models") + + if isinstance(tokenizer, MistralTokenizer): + raise ValueError( + "MistralTokenizer not supported for cross-encoding") + + if not self.model_config.is_cross_encoder: + raise ValueError("Model is not cross encoder.") + + if truncate_prompt_tokens is not None and \ + truncate_prompt_tokens > self.max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({self.max_model_len})." 
+ f" Please, select a smaller truncation size.") + for doc in documents: + request_prompt = f"{query}{tokenizer.sep_token}{doc}" + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=query, + text_pair=doc, + **tokenization_kwargs) + + input_ids = prompt_inputs["input_ids"] + text_token_prompt = \ + self._validate_input(request, input_ids, request_prompt) + engine_prompt = TokensPrompt( + prompt_token_ids=text_token_prompt["prompt_token_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) + + except ValueError as e: + logger.exception("Error in preprocessing prompt inputs") + return self.create_error_response(str(e)) + + # Schedule the request and get the result generator. + generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] + + try: + pooling_params = request.to_pooling_params() + + for i, engine_prompt in enumerate(engine_prompts): + request_id_item = f"{request_id}-{i}" + + self._log_inputs(request_id_item, + request_prompts[i], + params=pooling_params, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request) + + trace_headers = (None if raw_request is None else await + self._get_trace_headers(raw_request.headers)) + + generator = self.engine_client.encode( + engine_prompt, + pooling_params, + request_id_item, + lora_request=lora_request, + trace_headers=trace_headers, + priority=request.priority, + ) + + generators.append(generator) + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + result_generator = merge_async_iterators(*generators) + + num_prompts = len(engine_prompts) + + # Non-streaming response + final_res_batch: List[Optional[PoolingRequestOutput]] + final_res_batch = [None] * num_prompts + + try: + async for i, res in result_generator: + final_res_batch[i] = res + + assert all(final_res is not None for final_res in final_res_batch) + + final_res_batch_checked = cast(List[PoolingRequestOutput], + final_res_batch) + + response = self.request_output_to_rerank_response( + final_res_batch_checked, request_id, model_name, documents, + top_n) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") + except ValueError as e: + # TODO: Use a vllm-specific Validation Error + return self.create_error_response(str(e)) + + return response + + def request_output_to_rerank_response( + self, final_res_batch: List[PoolingRequestOutput], request_id: str, + model_name: str, documents: List[str], + top_n: int) -> RerankResponse: + """ + Convert the output of do_rank to a RerankResponse + """ + results: List[RerankResult] = [] + num_prompt_tokens = 0 + for idx, final_res in enumerate(final_res_batch): + classify_res = ScoringRequestOutput.from_base(final_res) + + result = RerankResult( + index=idx, + document=RerankDocument(text=documents[idx]), + relevance_score=classify_res.outputs.score, + ) + results.append(result) + prompt_token_ids = final_res.prompt_token_ids + num_prompt_tokens += len(prompt_token_ids) + + # sort by relevance, then return the top n if set + results.sort(key=lambda x: x.relevance_score, reverse=True) + if top_n < len(documents): + results = results[:top_n] + + return RerankResponse( + 
id=request_id, + model=model_name, + results=results, + usage=RerankUsage(total_tokens=num_prompt_tokens)) diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index 5d3e7139d7a17..381edf8fac49e 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -101,6 +101,38 @@ async def create_score( if not self.model_config.is_cross_encoder: raise ValueError("Model is not cross encoder.") + if truncate_prompt_tokens is not None and \ + truncate_prompt_tokens > self.max_model_len: + raise ValueError( + f"truncate_prompt_tokens value ({truncate_prompt_tokens}) " + f"is greater than max_model_len ({self.max_model_len})." + f" Please, select a smaller truncation size.") + + input_pairs = make_pairs(request.text_1, request.text_2) + for q, t in input_pairs: + request_prompt = f"{q}{tokenizer.sep_token}{t}" + + tokenization_kwargs: Dict[str, Any] = {} + if truncate_prompt_tokens is not None: + tokenization_kwargs["truncation"] = True + tokenization_kwargs["max_length"] = truncate_prompt_tokens + + tokenize_async = make_async(tokenizer.__call__, + executor=self._tokenizer_executor) + prompt_inputs = await tokenize_async(text=q, + text_pair=t, + **tokenization_kwargs) + + input_ids = prompt_inputs["input_ids"] + text_token_prompt = \ + self._validate_input(request, input_ids, request_prompt) + engine_prompt = TokensPrompt( + prompt_token_ids=text_token_prompt["prompt_token_ids"], + token_type_ids=prompt_inputs.get("token_type_ids")) + + request_prompts.append(request_prompt) + engine_prompts.append(engine_prompt) + except ValueError as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(str(e)) @@ -108,28 +140,6 @@ async def create_score( # Schedule the request and get the result generator. 
generators: List[AsyncGenerator[PoolingRequestOutput, None]] = [] - input_pairs = make_pairs(request.text_1, request.text_2) - - for q, t in input_pairs: - request_prompt = f"{q}{tokenizer.sep_token}{t}" - - tokenization_kwargs: Dict[str, Any] = {} - if truncate_prompt_tokens is not None: - tokenization_kwargs["truncation"] = True - tokenization_kwargs["max_length"] = truncate_prompt_tokens - - tokenize_async = make_async(tokenizer.__call__, - executor=self._tokenizer_executor) - prompt_inputs = await tokenize_async(text=q, - text_pair=t, - **tokenization_kwargs) - engine_prompt = TokensPrompt( - prompt_token_ids=prompt_inputs["input_ids"], - token_type_ids=prompt_inputs.get("token_type_ids")) - - request_prompts.append(request_prompt) - engine_prompts.append(engine_prompt) - try: pooling_params = request.to_pooling_params() diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 94db8f379e33a..93e357e8b9f21 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -62,8 +62,8 @@ def extract_tool_calls( start_of_json = match.end() # end_index == the start of the next function call # (if exists) - next_function_call_start = (matches[i + 1].start() - if i + 1 < len(matches) else None) + next_function_call_start = (matches[i + 1].start() if i + + 1 < len(matches) else None) raw_function_calls.append( dec.raw_decode( diff --git a/vllm/envs.py b/vllm/envs.py index b7b597ea15af3..8627caec7790d 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -11,6 +11,7 @@ VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None VLLM_USE_TRITON_FLASH_ATTN: bool = False + VLLM_FLASH_ATTN_VERSION: Optional[int] = None LOCAL_RANK: int = 0 CUDA_VISIBLE_DEVICES: Optional[str] = None VLLM_ENGINE_ITERATION_TIMEOUT_S: int = 60 @@ -72,6 +73,10 @@ VLLM_ENABLE_V1_MULTIPROCESSING: bool = True VLLM_LOG_BATCHSIZE_INTERVAL: float = -1 VLLM_DISABLE_COMPILE_CACHE: bool = False + K_SCALE_CONSTANT: int = 200 + V_SCALE_CONSTANT: int = 100 + VLLM_SERVER_DEV_MODE: bool = False + VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 def get_default_cache_root(): @@ -88,6 +93,12 @@ def get_default_config_root(): ) +def maybe_convert_int(value: Optional[str]) -> Optional[int]: + if value is None: + return None + return int(value) + + # The begin-* and end* here are used by the documentation generator # to extract the used env vars. @@ -201,6 +212,11 @@ def get_default_config_root(): lambda: (os.environ.get("VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")), + # Force vllm to use a specific flash-attention version (2 or 3), only valid + # when using the flash-attention backend. + "VLLM_FLASH_ATTN_VERSION": + lambda: maybe_convert_int(os.environ.get("VLLM_FLASH_ATTN_VERSION", None)), + # Internal flag to enable Dynamo fullgraph capture "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE": lambda: bool( @@ -460,6 +476,13 @@ def get_default_config_root(): "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), + # Divisor for dynamic key scale factor calculation for FP8 KV Cache + "K_SCALE_CONSTANT": + lambda: int(os.getenv("K_SCALE_CONSTANT", "200")), + + # Divisor for dynamic value scale factor calculation for FP8 KV Cache + "V_SCALE_CONSTANT": + lambda: int(os.getenv("V_SCALE_CONSTANT", "100")), # If set, enable multiprocessing in LLM for the V1 code path. 
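One of the switches defined in this file, `VLLM_SERVER_DEV_MODE` (just below), gates the `/reset_prefix_cache` endpoint added to `api_server.py` earlier in this patch. A hedged smoke-test sketch (host and port are placeholders):

```python
# With the server started under VLLM_SERVER_DEV_MODE=1, the development-only
# endpoint can be hit directly; it returns an empty 200 response on success.
import requests

resp = requests.post("http://localhost:8000/reset_prefix_cache")
resp.raise_for_status()
print(resp.status_code)  # 200
```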
"VLLM_ENABLE_V1_MULTIPROCESSING": lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1"))), @@ -467,6 +490,22 @@ def get_default_config_root(): lambda: float(os.getenv("VLLM_LOG_BATCHSIZE_INTERVAL", "-1")), "VLLM_DISABLE_COMPILE_CACHE": lambda: bool(int(os.getenv("VLLM_DISABLE_COMPILE_CACHE", "0"))), + + # If set, vllm will run in development mode, which will enable + # some additional endpoints for developing and debugging, + # e.g. `/reset_prefix_cache` + "VLLM_SERVER_DEV_MODE": + lambda: bool(int(os.getenv("VLLM_SERVER_DEV_MODE", "0"))), + + # Controls the maximum number of requests to handle in a + # single asyncio task when processing per-token outputs in the + # V1 AsyncLLM interface. It is applicable when handling a high + # concurrency of streaming requests. + # Setting this too high can result in a higher variance of + # inter-message latencies. Setting it too low can negatively impact + # TTFT and overall throughput. + "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE": + lambda: int(os.getenv("VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", "128")), } # end-env-vars-definition diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index d8457cb693cdb..471d1bfac3119 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -3,6 +3,9 @@ from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple, Union) +import torch.nn as nn +from typing_extensions import TypeVar + from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -11,9 +14,12 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest, PoolerOutput from vllm.utils import make_async +from vllm.worker.worker_base import WorkerBase logger = init_logger(__name__) +_R = TypeVar("_R", default=Any) + class ExecutorBase(ABC): """Base class for all executors. @@ -41,25 +47,41 @@ def __init__( self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config self._init_executor() + self.is_sleeping = False @abstractmethod def _init_executor(self) -> None: - pass + raise NotImplementedError @abstractmethod def collective_rpc(self, - method: Union[str, Callable], + method: Union[str, Callable[..., _R]], timeout: Optional[float] = None, args: Tuple = (), - kwargs: Optional[Dict] = None) -> List[Any]: + kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: """ - The main interface of the executor to run a method on all workers, - with homogeneous arguments. - If the args are heterogeneous, then we can pack them into a list, - and unpack them in the method of every worker, because every worker - knows their own rank. + Execute an RPC call on all workers. + + Args: + method: Name of the worker method to execute, or a callable that + is serialized and sent to all workers to execute. + + If the method is a callable, it should accept an additional + `self` argument, in addition to the arguments passed in `args` + and `kwargs`. The `self` argument will be the worker object. + timeout: Maximum time in seconds to wait for execution. Raises a + :exc:`TimeoutError` on timeout. `None` means wait indefinitely. + args: Positional arguments to pass to the worker method. + kwargs: Keyword arguments to pass to the worker method. + + Returns: + A list containing the results from each worker. + + Note: + It is recommended to use this API to only pass control messages, + and set up data-plane communication to pass data. 
""" - pass + raise NotImplementedError def determine_num_available_blocks(self) -> Tuple[int, int]: """Determine the number of available blocks for the GPU KV cache and @@ -79,16 +101,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: b = min([r[1] for r in results]) return a, b - def initialize(self, num_gpu_blocks: int) -> None: - """ - Initialize the KV caches and begin the model execution loop of the - underlying workers. - For V1 compatibility. - """ - logger.info("# GPU blocks: %d", num_gpu_blocks) - self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) - self.collective_rpc("compile_or_warm_up_model") - def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: """Initialize the KV cache by invoking the underlying worker. """ @@ -107,6 +119,17 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None: self.collective_rpc("initialize_cache", args=(num_gpu_blocks, num_cpu_blocks)) + def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: + """ + Run a function directly on the model inside each worker, + returning the result for each of them. + """ + + def rpc_func(worker: WorkerBase) -> _R: + return func(worker.get_model()) + + return self.collective_rpc(rpc_func) + def execute_model( self, execute_model_req: ExecuteModelRequest ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]: @@ -171,6 +194,20 @@ def start_profile(self) -> None: def stop_profile(self) -> None: self.collective_rpc("stop_profile") + def sleep(self, level: int = 1): + if self.is_sleeping: + logger.warning("Executor is already sleeping.") + return + self.collective_rpc("sleep", kwargs=dict(level=level)) + self.is_sleeping = True + + def wake_up(self): + if not self.is_sleeping: + logger.warning("Executor is not sleeping.") + return + self.collective_rpc("wake_up") + self.is_sleeping = False + def save_sharded_state( self, path: str, diff --git a/vllm/executor/mp_distributed_executor.py b/vllm/executor/mp_distributed_executor.py index 8da97df13190c..c2e65d0343a78 100644 --- a/vllm/executor/mp_distributed_executor.py +++ b/vllm/executor/mp_distributed_executor.py @@ -1,4 +1,5 @@ import asyncio +import os from typing import Any, Callable, List, Optional, Union import cloudpickle @@ -10,8 +11,9 @@ from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest -from vllm.utils import (_run_task_with_lock, get_distributed_init_method, - get_ip, get_open_port, make_async, run_method) +from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, + get_distributed_init_method, get_ip, get_open_port, + make_async, run_method, update_environment_variables) from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -22,7 +24,39 @@ class MultiprocessingDistributedExecutor(DistributedExecutorBase): uses_ray: bool = False + def _check_cuda(self) -> None: + """Check that the number of GPUs is sufficient for the parallel + configuration. Separate from _init_executor to reduce the number of + indented blocks. + """ + parallel_config = self.parallel_config + world_size = parallel_config.world_size + tensor_parallel_size = parallel_config.tensor_parallel_size + + cuda_device_count = cuda_device_count_stateless() + # Use confusing message for more common TP-only case. 
+ if tensor_parallel_size > cuda_device_count: + raise RuntimeError( + f"please set tensor_parallel_size ({tensor_parallel_size}) " + f"to less than max local gpu count ({cuda_device_count})") + + if world_size > cuda_device_count: + raise RuntimeError( + f"please ensure that world_size ({world_size}) " + f"is less than max local gpu count ({cuda_device_count})") + + # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers + if "CUDA_VISIBLE_DEVICES" not in os.environ: + update_environment_variables({ + "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) + }) + def _init_executor(self) -> None: + + from vllm.platforms import current_platform + if current_platform.is_cuda_alike(): + self._check_cuda() + # Create the parallel GPU workers. world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size @@ -118,7 +152,7 @@ def _run_workers( async_run_tensor_parallel_workers_only: bool = False, max_concurrent_workers: Optional[int] = None, **kwargs, - ) -> Any: + ) -> List[Any]: """Runs the given method on all workers. Args: diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index b8163a7acde1d..57e85779dd587 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from vllm.multimodal import (MultiModalDataDict, MultiModalKwargs, MultiModalPlaceholderDict) - from vllm.multimodal.inputs import MultiModalInputsV2 + from vllm.multimodal.inputs import MultiModalInputs class TextPrompt(TypedDict): @@ -207,7 +207,7 @@ def token_inputs( return inputs -DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"] +DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputs"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -222,14 +222,14 @@ class EncoderDecoderInputs(TypedDict): This specifies the required data for encoder-decoder models. """ - encoder: Union[TokenInputs, "MultiModalInputsV2"] + encoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the encoder portion.""" - decoder: Union[TokenInputs, "MultiModalInputsV2"] + decoder: Union[TokenInputs, "MultiModalInputs"] """The inputs for the decoder portion.""" -SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"] +SingletonInputs = Union[TokenInputs, "MultiModalInputs"] """ A processed :class:`SingletonPrompt` which can be passed to :class:`vllm.sequence.Sequence`.
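The reworked executor surface above (the typed `collective_rpc`, the new `apply_model` helper, and the `sleep`/`wake_up` pair) is easiest to follow through a toy stand-in. The sketch below is illustrative only and is not part of the patch: `ToyWorker` and `ToyExecutor` are hypothetical classes that mimic the documented contract with plain in-process calls instead of real RPCs.

# Minimal sketch (assumed names, not from the patch): a toy executor that
# follows the documented collective_rpc / apply_model / sleep contract.
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch.nn as nn


class ToyWorker:
    """Hypothetical worker exposing get_model(), sleep() and wake_up()."""

    def __init__(self) -> None:
        self.model = nn.Linear(4, 4)
        self.asleep = False

    def get_model(self) -> nn.Module:
        return self.model

    def sleep(self, level: int = 1) -> None:
        self.asleep = True

    def wake_up(self) -> None:
        self.asleep = False


class ToyExecutor:
    """Runs RPCs on local workers, mirroring the ExecutorBase interface."""

    def __init__(self, num_workers: int = 2) -> None:
        self.workers = [ToyWorker() for _ in range(num_workers)]
        self.is_sleeping = False

    def collective_rpc(self,
                       method: Union[str, Callable[..., Any]],
                       timeout: Optional[float] = None,
                       args: Tuple = (),
                       kwargs: Optional[Dict[str, Any]] = None) -> List[Any]:
        # The sketch ignores `timeout`; workers run in-process.
        kwargs = kwargs or {}
        results = []
        for worker in self.workers:
            if callable(method):
                # A callable receives the worker object as its first argument.
                results.append(method(worker, *args, **kwargs))
            else:
                results.append(getattr(worker, method)(*args, **kwargs))
        return results

    def apply_model(self, func: Callable[[nn.Module], Any]) -> List[Any]:
        # Run `func` directly on each worker's model, as in the patch.
        return self.collective_rpc(lambda w: func(w.get_model()))


executor = ToyExecutor()
# Count parameters on every (toy) worker via apply_model.
print(executor.apply_model(lambda m: sum(p.numel() for p in m.parameters())))
# A control-plane message via a named worker method, as the docstring recommends.
executor.collective_rpc("sleep", kwargs=dict(level=1))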
@@ -311,7 +311,7 @@ def multi_modal_hashes(self) -> List[str]: return inputs.get("multi_modal_hashes", []) if inputs["type"] == "multimodal": - # only the case when we use MultiModalInputsV2 + # only the case when we use MultiModalInputs return inputs.get("mm_hashes", []) # type: ignore[return-value] assert_never(inputs) # type: ignore[arg-type] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 0890883cc984f..70372e0cad22d 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -7,7 +7,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputsV2 +from vllm.multimodal.inputs import MultiModalDataDict, MultiModalInputs from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup @@ -247,7 +247,7 @@ def _process_multimodal( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """ Apply the model's multi-modal processor to a multi-modal prompt, returning the corresponding token IDs and metadata. @@ -271,7 +271,7 @@ async def _process_multimodal_async( mm_data: MultiModalDataDict, mm_processor_kwargs: Optional[Mapping[str, object]], lora_request: Optional[LoRARequest], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """Async version of :meth:`_process_multimodal`.""" tokenizer_group = self.get_tokenizer_group() tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index c39926110c375..84ebcb9587de8 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -220,8 +220,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ].copy_(embeddings_tensor, non_blocking=True) + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ].copy_(embeddings_tensor, non_blocking=True) if self.embeddings_slice is not None: # TODO(yard1): Optimize this copy, we don't need to copy # everything, just the modified part @@ -1024,8 +1026,10 @@ def set_lora( lora_b.T, non_blocking=True) if embeddings_tensor is not None: self.embeddings_tensors[ - index, :embeddings_tensor.shape[0], :embeddings_tensor. - shape[1], ] = embeddings_tensor + index, + :embeddings_tensor.shape[0], + :embeddings_tensor.shape[1], + ] = embeddings_tensor def _get_logits( self, diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 9809405ca9a61..2e04cb902d009 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -75,8 +75,9 @@ def __init__( # Scaling factor for long context lora model. None if it is not # fine tuned for the long context. 
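The reformatted `set_lora` slices above all express the same pattern: a LoRA embeddings tensor of variable size is copied into one slot of a larger preallocated buffer, touching only the top-left `shape[0] x shape[1]` region of that slot. A standalone illustration of the indexing follows; sizes and names are made up for the example and are not taken from the patch.

# Illustration of the slot-wise partial copy used by set_lora above.
import torch

max_loras, max_rows, max_cols = 4, 8, 16
embeddings_tensors = torch.zeros(max_loras, max_rows, max_cols)

index = 2                               # slot assigned to this LoRA
embeddings_tensor = torch.randn(5, 10)  # smaller than the slot

# Write only the region the tensor actually occupies; the rest of the
# slot keeps its previous contents (zeros here).
embeddings_tensors[
    index,
    :embeddings_tensor.shape[0],
    :embeddings_tensor.shape[1],
].copy_(embeddings_tensor, non_blocking=True)

assert torch.equal(embeddings_tensors[index, :5, :10], embeddings_tensor)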
self.scaling_factor = scaling_factor - assert (lora_model_id > - 0), f"a valid lora id should be greater than 0, got {self.id}" + assert ( + lora_model_id + > 0), f"a valid lora id should be greater than 0, got {self.id}" self.rank = rank self.loras: Dict[str, LoRALayerWeights] = loras @@ -273,7 +274,8 @@ def from_local_checkpoint( new_embeddings_tensor_path) elif os.path.isfile(new_embeddings_bin_file_path): embeddings = torch.load(new_embeddings_bin_file_path, - map_location=device) + map_location=device, + weights_only=True) return cls.from_lora_tensors( lora_model_id=get_lora_id() diff --git a/vllm/lora/ops/triton_ops/sgmv_expand.py b/vllm/lora/ops/triton_ops/sgmv_expand.py index 8af44b703810b..48fa5cd63741f 100644 --- a/vllm/lora/ops/triton_ops/sgmv_expand.py +++ b/vllm/lora/ops/triton_ops/sgmv_expand.py @@ -136,9 +136,8 @@ def _sgmv_expand_kernel( c_ptr = (out_ptr + offset_cm[:, None] * output_d0_stride + offset_cn[None, :] * output_d1_stride) M = tl.load(seq_lens + cur_batch) - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < - (cur_slice_start + curr_N)) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & ( + offset_cn[None, :] < (cur_slice_start + curr_N)) if ADD_INPUTS: tiled_out = tl.load(c_ptr, mask=c_mask) tiled_c += tiled_out diff --git a/vllm/lora/ops/triton_ops/sgmv_shrink.py b/vllm/lora/ops/triton_ops/sgmv_shrink.py index 3d2ebe8286f56..9bb35e8ffd323 100644 --- a/vllm/lora/ops/triton_ops/sgmv_shrink.py +++ b/vllm/lora/ops/triton_ops/sgmv_shrink.py @@ -114,8 +114,8 @@ def _sgmv_shrink_kernel( slice_id * output_d0_stride) c_ptr = cur_out_ptr + offset_cm[:, None] * output_d1_stride + offset_cn[ None, :] * output_d2_stride - c_mask = (offset_cm[:, None] < - (cur_seq_start + M)) & (offset_cn[None, :] < N) + c_mask = (offset_cm[:, None] < (cur_seq_start + M)) & (offset_cn[None, :] + < N) accumulator *= scaling # handles write-back with reduction-splitting if SPLIT_K == 1: diff --git a/vllm/lora/punica_wrapper/punica_hpu.py b/vllm/lora/punica_wrapper/punica_hpu.py index d9c4f44a1c282..3661a7214648a 100644 --- a/vllm/lora/punica_wrapper/punica_hpu.py +++ b/vllm/lora/punica_wrapper/punica_hpu.py @@ -1,10 +1,18 @@ -from typing import Optional, Tuple, Union, final +# SPDX-License-Identifier: Apache-2.0 + +from typing import TYPE_CHECKING, List, Optional, Tuple, Union, final import torch from vllm_hpu_extension.ops import (dispatch_bgmv_embedding, dispatch_bgmv_linear) from .punica_base import PunicaWrapperBase +from .utils import convert_mapping + +if TYPE_CHECKING: + # avoid circular import + from vllm.lora.layers import LoRAMapping + from vllm.lora.models import LongContextLoRAContext @final @@ -17,6 +25,55 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens, max_batches, device) + def _update_base_metadata( + self, + mapping: "LoRAMapping", + lora_index_to_id: List[Optional[int]], + max_loras: int, + vocab_size: int, + extra_vocab_size: int, + long_lora_context: Optional["LongContextLoRAContext"] = None, + ): + ( + base_indices, + sampler_indices, + sampler_indices_padded, + embeddings_indices, + long_lora_offsets_tensor, + indices_len, + ) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size, + extra_vocab_size, self.device, None) + # Updating each element in `long_lora_offsets` with `lora_offset` slows + # down perf in HPU due to a series of `strided_insert` ops during lazy + # graph accumulation.
Hence HPU appends `lora_offset` to a list and + # converts it to a tensor only after it is ready. + if long_lora_context: + index_mapping_indices: List[int] = list( + mapping.index_mapping).copy() + long_lora_offsets: List[int] = [] + for i in range(len(index_mapping_indices)): + lora_offset: int = long_lora_context.offsets_by_lora_id.get( + index_mapping_indices[i], 0) + long_lora_offsets.append(lora_offset) + long_lora_offsets_tensor = torch.tensor(long_lora_offsets, + device=self.device, + dtype=torch.long) + indices_len[-1] = long_lora_offsets_tensor.shape[-1] + + self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) + self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) + self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_( + sampler_indices_padded) + self._embeddings_indices[:embeddings_indices. + shape[0], :embeddings_indices.shape[1]].copy_( + embeddings_indices) + if long_lora_offsets_tensor is not None: + self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( + long_lora_offsets_tensor) + else: + self._long_lora_indices.zero_() + self.indices_len[:] = indices_len + def add_lora_embedding(self, y: torch.Tensor, x: torch.Tensor, diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index 4504e19b20816..dbc2d27c597f2 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -1,9 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 + from typing import TYPE_CHECKING, List, Optional, Tuple, Union import torch -from vllm.platforms import current_platform - if TYPE_CHECKING: # avoid circuit import from vllm.lora.layers import LoRAMapping @@ -88,14 +88,10 @@ def convert_mapping( embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() long_lora_offsets: Optional[torch.Tensor] = None - if long_lora_context: - if current_platform.is_hpu(): - long_lora_offsets_list: List[int] = [] - else: - long_lora_offsets = torch.zeros(len(index_mapping_indices), - device=device, - dtype=torch.long) + long_lora_offsets = torch.zeros(len(index_mapping_indices), + device=device, + dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping @@ -108,18 +104,10 @@ def convert_mapping( embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 lora_indices[i] = lora_idx if long_lora_context: + assert long_lora_offsets is not None lora_offset: int = long_lora_context.offsets_by_lora_id.get( index_mapping_indices[i], 0) - if current_platform.is_hpu(): - long_lora_offsets_list.append(lora_offset) - else: - assert long_lora_offsets is not None - long_lora_offsets[i] = lora_offset - - if long_lora_context and current_platform.is_hpu(): - long_lora_offsets = torch.tensor(long_lora_offsets_list, - device=device, - dtype=torch.long) + long_lora_offsets[i] = lora_offset indices_list: List[Union[List[int], torch.Tensor]] = [ index_mapping_indices, diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py index 20abaefbacc51..90dfa62ec4670 100644 --- a/vllm/model_executor/guided_decoding/utils.py +++ b/vllm/model_executor/guided_decoding/utils.py @@ -20,6 +20,13 @@ def check_object(obj: dict) -> bool: ]): return True + # Check for array unsupported keywords + if obj.get("type") == "array" and any(key in obj for key in [ + "uniqueItems", "contains", "minContains", "maxContains", + "minItems", "maxItems" + ]): + return True + # Recursively check all 
nested objects and arrays for value in obj.values(): if isinstance(value, dict): diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..b6f1d01f88652 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json 
b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json index 6a976788f9b10..66f9106bd1be3 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json @@ -1,44 +1,44 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 0, + "num_warps": 1, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 1, - "num_stages": 0, + "num_warps": 2, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -48,76 +48,76 @@ "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 0, + "num_warps": 2, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 1, - "num_stages": 0, + "num_warps": 2, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 2, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "64": { "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 8, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, - "num_stages": 0, + "num_warps": 8, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 8, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 16, + "matrix_instr_nonkdim": 32, "kpack": 2 }, "256": { @@ -126,10 +126,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 128, @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, 
"matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,9 +148,9 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, - "matrix_instr_nonkdim": 32, + "matrix_instr_nonkdim": 16, "kpack": 2 }, "1536": { @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,10 +181,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4096": { "BLOCK_SIZE_M": 128, @@ -192,9 +192,9 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..0e5fd1eec77d7 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..d6ad63509f157 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=16384,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 
8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 16, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..8323f512db015 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + 
"BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json index 0a46390b2e31b..1b46cb5716514 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json @@ -1,11 +1,11 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -16,95 +16,95 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 0, + "num_warps": 8, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "16": { - "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "24": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, - "num_warps": 2, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 0, + "BLOCK_SIZE_N": 64, + 
"BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "64": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 4, - "num_warps": 4, - "num_stages": 0, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -112,24 +112,24 @@ "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 64, @@ -137,7 +137,7 @@ "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -148,10 +148,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -159,10 +159,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2048": { "BLOCK_SIZE_M": 128, @@ -170,7 +170,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -181,10 +181,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4096": { "BLOCK_SIZE_M": 128, @@ -192,9 +192,9 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..81bb765d30031 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 
4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..811c77ab41093 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 32, + 
"BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..379ca107a9469 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + 
"num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json index 91011e64c7de4..ed5b655d89937 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json @@ -1,44 +1,44 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 0, + 
"num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 0, + "num_warps": 4, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -49,43 +49,43 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "24": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 0, + "num_warps": 2, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "32": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "48": { - "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "64": { "BLOCK_SIZE_M": 32, @@ -93,7 +93,7 @@ "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -101,43 +101,43 @@ "96": { "BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, - "num_stages": 0, + "num_warps": 2, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "128": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "512": { "BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -148,10 +148,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,10 +170,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "3072": { "BLOCK_SIZE_M": 128, @@ -181,7 +181,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + 
"num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -192,9 +192,9 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..48bb5f2ccb8e3 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + 
"num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..a64d06c6d1724 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 32, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + 
"BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..bd2c6fbc1b941 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + 
"num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json index f807d4a5abaed..822f04e33e879 100644 --- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json @@ -1,66 +1,66 @@ { "1": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "2": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 0, + "num_warps": 2, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "4": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 0, + "num_warps": 2, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "8": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 0, + "num_warps": 1, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "16": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 0, + "num_warps": 2, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "24": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 1, - "num_warps": 8, - "num_stages": 0, + "num_warps": 1, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 1 @@ -68,43 +68,43 @@ "32": { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "48": { "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, "num_warps": 2, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "64": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 4, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 2 + "kpack": 1 }, "96": { "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, "GROUP_SIZE_M": 4, - "num_warps": 4, - "num_stages": 0, + "num_warps": 8, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -112,21 +112,21 @@ "128": { 
"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "256": { "BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 4, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 32, "kpack": 2 @@ -137,10 +137,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1024": { "BLOCK_SIZE_M": 128, @@ -148,10 +148,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "1536": { "BLOCK_SIZE_M": 128, @@ -159,7 +159,7 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 @@ -170,10 +170,10 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 }, "3072": { "BLOCK_SIZE_M": 128, @@ -181,20 +181,20 @@ "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, "kpack": 2 }, "4096": { - "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_M": 256, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 1, "num_warps": 8, - "num_stages": 0, + "num_stages": 2, "waves_per_eu": 0, "matrix_instr_nonkdim": 16, - "kpack": 1 + "kpack": 2 } } diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json new file mode 100644 index 0000000000000..cd4fb8f11b935 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 1, + 
"num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4096": { + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json new file mode 100644 index 0000000000000..cf66868e9d57a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X.json @@ -0,0 +1,200 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "2": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 1, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 1 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "48": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "64": { + "BLOCK_SIZE_M": 
32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "96": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "128": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "256": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "512": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1024": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "1536": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "2048": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "3072": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + }, + "4096": { + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 64, + "GROUP_SIZE_M": 1, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0, + "matrix_instr_nonkdim": 16, + "kpack": 2 + } +} diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 3b2354b394d9d..31959cbe9ce3b 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -40,7 +40,7 @@ class FusedMoEMethodBase(QuantizeMethodBase): @abstractmethod def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): raise NotImplementedError @@ -67,22 +67,24 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): # Fused gate_up_proj (column parallel) - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=params_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) # down_proj (row parallel) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=params_dtype), + w2_weight = 
torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) @@ -314,13 +316,20 @@ def __init__( self.quant_method = quant_config.get_quant_method(self, prefix) assert self.quant_method is not None - self.quant_method.create_weights( - layer=self, - num_experts=num_experts, - hidden_size=hidden_size, - intermediate_size=self.intermediate_size_per_partition, - params_dtype=params_dtype, - weight_loader=self.weight_loader) + moe_quant_params = { + "num_experts": num_experts, + "hidden_size": hidden_size, + "intermediate_size_per_partition": + self.intermediate_size_per_partition, + "params_dtype": params_dtype, + "weight_loader": self.weight_loader, + } + # need full intermediate size pre-sharding for WNA16 act order + if (self.quant_method.__class__.__name__ == + "CompressedTensorsWNA16MoEMethod"): + moe_quant_params["intermediate_size_full"] = intermediate_size + + self.quant_method.create_weights(layer=self, **moe_quant_params) def _load_per_tensor_weight_scale(self, shard_id: str, param: torch.nn.Parameter, @@ -337,20 +346,32 @@ def _load_per_tensor_weight_scale(self, shard_id: str, elif shard_id == "w2": param_data[expert_id] = loaded_weight - def _load_model_weight_or_group_weight_scale(self, shard_dim: int, + def _load_model_weight_or_group_weight_scale(self, + shard_dim: int, expert_data: torch.Tensor, shard_id: str, loaded_weight: torch.tensor, - tp_rank: int, expert_id: int): - # Load grouped weight scales for group quantization - # or model weights + tp_rank: int, + expert_id: int, + load_full_w2: bool = False): + """ + Load grouped weight scales for group quantization or model weights + :param shard_dim: dimension to shard + :param expert_data: parameter for a particular expert + :param shard_id: either w1, w2, or w3 + :param loaded_weight: checkpoint weight to load into the param + :param tp_rank: tensor parallel rank + :param load_full_w2: whether or not the w2 loaded should be sharded. + """ if shard_id == "w2": - self._load_w2(shard_id=shard_id, - shard_dim=shard_dim, + # In the case where we have actorder/g_idx, we do not partition the + # w2 scales, as indicated by `load_full` argument, for all tp cases + self._load_w2(shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, tp_rank=tp_rank, - expert_id=expert_id) + expert_id=expert_id, + load_full=load_full_w2) elif shard_id in ("w1", "w3"): self._load_w13(shard_id=shard_id, shard_dim=shard_dim, @@ -404,17 +425,19 @@ def _load_w13(self, def _load_w2(self, expert_data: torch.Tensor, shard_dim: int, - shard_id: str, loaded_weight: torch.tensor, tp_rank: int, + load_full: bool = False, expert_id: Optional[int] = None): # Index the loaded weight for tp sharding. # down_proj: "RowParallel" so tp sharding on input_dim # Narrow parameter and load. shard_size = expert_data.shape[shard_dim] - loaded_weight = loaded_weight.narrow(shard_dim, shard_size * tp_rank, - shard_size) + if not load_full: + loaded_weight = loaded_weight.narrow(shard_dim, + shard_size * tp_rank, + shard_size) # w2, down_proj: Load into only logical weight of w2. 
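A minimal editorial sketch (not part of the patch) of what the new `load_full` flag in `_load_w2` changes: with `load_full=False` each tensor-parallel rank narrows the checkpoint tensor to its own shard, while `load_full=True` (the act-order/g_idx case, where the w2 scales are not partitioned) copies the full tensor. The helper name below is hypothetical.

import torch

def select_w2_slice(loaded_weight: torch.Tensor, shard_dim: int,
                    shard_size: int, tp_rank: int,
                    load_full: bool = False) -> torch.Tensor:
    # load_full=True: keep the whole tensor; nothing is partitioned.
    if load_full:
        return loaded_weight
    # Otherwise take this rank's contiguous slice along the sharded dim.
    return loaded_weight.narrow(shard_dim, shard_size * tp_rank, shard_size)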
expert_data.copy_(loaded_weight) if is_hpu: @@ -431,8 +454,7 @@ def _load_g_idx(self, shard_id: str, expert_data: torch.Tensor, shard_dim: int, loaded_weight: torch.Tensor, tp_rank: int): if shard_id == "w2": - self._load_w2(shard_id=shard_id, - shard_dim=shard_dim, + self._load_w2(shard_dim=shard_dim, loaded_weight=loaded_weight, expert_data=expert_data, tp_rank=tp_rank) @@ -460,7 +482,7 @@ def weight_loader(self, param: torch.nn.Parameter, ] # Fetch the dim to shard the parameter/loaded weight # based on the shard id. This will be whatever - # dimension intermediate_size is used. + # dimension intermediate_size_per_partition is used. SHARD_ID_TO_SHARDED_DIM = {"w1": 0, "w2": 1, "w3": 0} expert_data = param.data[expert_id] @@ -468,11 +490,11 @@ def weight_loader(self, param: torch.nn.Parameter, # is_transposed: if the dim to shard the weight # should be flipped. Required by GPTQ, compressed-tensors - # should be whatever dimension intermediate_size is + # should be whatever dimension intermediate_size_per_partition is is_transposed = getattr(param, "is_transposed", False) shard_dim = SHARD_ID_TO_SHARDED_DIM[shard_id] if is_transposed: - shard_dim = ~shard_dim + shard_dim = int(not shard_dim) # Case input scale: input_scale loading is only supported for fp8 if "input_scale" in weight_name: @@ -525,7 +547,8 @@ def weight_loader(self, param: torch.nn.Parameter, loaded_weight=loaded_weight, expert_data=expert_data, tp_rank=tp_rank, - expert_id=expert_id) + expert_id=expert_id, + load_full_w2=getattr(param, "load_full_w2", False)) elif quant_method == FusedMoeWeightScaleSupported.TENSOR.value: self._load_per_tensor_weight_scale(shard_id=shard_id, param=param, diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 616a53df2f020..0a20fac8381d4 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -32,7 +32,8 @@ "MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod", "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", - "HQQMarlinMethod", "QuarkLinearMethod" + "HQQMarlinMethod", "QuarkLinearMethod", "AWQHPULinearMethod", + "GPTQHPULinearMethod" ] @@ -345,11 +346,13 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if output_dim is not None and not use_bitsandbytes_4bit: + if output_dim is not None and not is_sharded_weight: shard_size = param_data.shape[output_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(output_dim, start_idx, @@ -547,6 +550,11 @@ def weight_loader(self, use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: shard_size = loaded_weight.shape[output_dim] shard_offset = loaded_weight.shape[output_dim] * \ @@ -555,9 +563,7 @@ def 
weight_loader(self, param_data = param_data.narrow(output_dim, shard_offset, shard_size) start_idx = tp_rank * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) # Special case for AQLM codebooks. @@ -942,6 +948,11 @@ def weight_loader(self, use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit + if use_bitsandbytes_4bit: orig_qkv_offsets = { "q": (0, self.num_heads * self.head_size), @@ -965,9 +976,7 @@ def weight_loader(self, shard_id = tp_rank // self.num_kv_head_replicas start_idx = shard_id * shard_size - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if not use_bitsandbytes_4bit: + if not is_sharded_weight: loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) @@ -1072,6 +1081,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): tp_size = get_tensor_model_parallel_world_size() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit # Special case for GGUF is_gguf_weight = getattr(param, "is_gguf_weight", False) @@ -1087,9 +1100,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype) param_data = param.data - # bitsandbytes loads the weights of the specific portion - # no need to narrow here - if input_dim is not None and not use_bitsandbytes_4bit: + if input_dim is not None and not is_sharded_weight: shard_size = param_data.shape[input_dim] start_idx = tp_rank * shard_size loaded_weight = loaded_weight.narrow(input_dim, start_idx, diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index c2387638e360d..a79c956112723 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -6,6 +6,7 @@ QUANTIZATION_METHODS: List[str] = [ "aqlm", "awq", + "awq_hpu", "deepspeedfp", "tpu_int8", "fp8", @@ -19,6 +20,7 @@ "gptq_marlin", "awq_marlin", "gptq", + "gptq_hpu", "compressed-tensors", "bitsandbytes", "qqq", @@ -30,12 +32,54 @@ "quark" ] +# The customized quantization methods which will be added to this dict. +_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {} + + +def register_quantization_config(quantization: str): + """Register a customized vllm quantization config. + + When a quantization method is not supported by vllm, you can register a customized + quantization config to support it. + + Args: + quantization (str): The quantization method name. + + Examples: + >>> from vllm.model_executor.layers.quantization import register_quantization_config + >>> from vllm.model_executor.layers.quantization import get_quantization_config + >>> from vllm.model_executor.layers.quantization.base_config import QuantizationConfig + >>> + >>> @register_quantization_config("my_quant") + ... class MyQuantConfig(QuantizationConfig): + ... 
pass + >>> + >>> get_quantization_config("my_quant") + + """ # noqa: E501 + + def _wrapper(quant_config_cls): + if quantization in QUANTIZATION_METHODS: + raise ValueError( + f"The quantization method `{quantization}` already exists.") + if not issubclass(quant_config_cls, QuantizationConfig): + raise ValueError("The quantization config must be a subclass of " + "`QuantizationConfig`.") + _CUSTOMIZED_METHOD_TO_QUANT_CONFIG[quantization] = quant_config_cls + QUANTIZATION_METHODS.append(quantization) + return quant_config_cls + + return _wrapper + def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization not in QUANTIZATION_METHODS: raise ValueError(f"Invalid quantization method: {quantization}") # lazy import to avoid triggering `torch.compile` too early + from vllm_hpu_extension.awq_hpu import AWQHPUConfig + from vllm_hpu_extension.gptq_hpu import GPTQHPUConfig + + from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig from .aqlm import AQLMConfig @@ -64,6 +108,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: method_to_config: Dict[str, Type[QuantizationConfig]] = { "aqlm": AQLMConfig, "awq": AWQConfig, + "awq_hpu": AWQHPUConfig, "deepspeedfp": DeepSpeedFPConfig, "tpu_int8": Int8TpuConfig, "fp8": Fp8Config, @@ -77,6 +122,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: "gptq_marlin": GPTQMarlinConfig, "awq_marlin": AWQMarlinConfig, "gptq": GPTQConfig, + "gptq_hpu": GPTQHPUConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, "qqq": QQQConfig, @@ -87,6 +133,8 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: "inc": INCConfig, "quark": QuarkConfig } + # Update the `method_to_config` with customized quantization methods. 
+ method_to_config.update(_CUSTOMIZED_METHOD_TO_QUANT_CONFIG) return method_to_config[quantization] diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index c28fd0c6737e0..0c3c9816878e9 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -303,7 +303,7 @@ def __init__(self, quant_config: AWQMarlinConfig): self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): extra_weight_attrs.update({ "is_transposed": @@ -312,17 +312,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, FusedMoeWeightScaleSupported.GROUP.value, }) - w13_qweight = Parameter(torch.empty(num_experts, - hidden_size, - 2 * intermediate_size // - self.quant_config.pack_factor, - dtype=torch.int32), - requires_grad=False) + w13_qweight = Parameter( + torch.empty(num_experts, + hidden_size, + 2 * intermediate_size_per_partition // + self.quant_config.pack_factor, + dtype=torch.int32), + requires_grad=False) layer.register_parameter("w13_qweight", w13_qweight) set_weight_attrs(w13_qweight, extra_weight_attrs) w2_qweight = Parameter(torch.empty(num_experts, - intermediate_size, + intermediate_size_per_partition, hidden_size // self.quant_config.pack_factor, dtype=torch.int32), @@ -331,13 +332,14 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, set_weight_attrs(w2_qweight, extra_weight_attrs) num_groups_w13 = hidden_size // self.quant_config.group_size - num_groups_w2 = intermediate_size // self.quant_config.group_size + num_groups_w2 = (intermediate_size_per_partition // + self.quant_config.group_size) # WEIGHT_SCALES # Allocate 2 scales for w1 and w3 respectively. w13_scales = Parameter(torch.empty(num_experts, num_groups_w13, - intermediate_size * 2, + intermediate_size_per_partition * 2, dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_scales", w13_scales) @@ -353,12 +355,13 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, # WEIGHT_ZERO_POINT # Allocate 2 zero points for w1 and w3 respectively. 
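A short editorial sketch (not part of the patch) of the tensor shapes the packed-weight arithmetic above produces, assuming 4-bit AWQ so that pack_factor = 32 // 4 = 8, and illustrative sizes group_size = 128, num_experts = 8, hidden_size = 4096, intermediate_size_per_partition = 2048.

num_experts, hidden_size, inter_pp = 8, 4096, 2048
num_bits, group_size = 4, 128
pack_factor = 32 // num_bits  # eight 4-bit values packed per int32

w13_qweight_shape = (num_experts, hidden_size, 2 * inter_pp // pack_factor)  # (8, 4096, 512)
w2_qweight_shape = (num_experts, inter_pp, hidden_size // pack_factor)       # (8, 2048, 512)
num_groups_w13 = hidden_size // group_size                                    # 32
num_groups_w2 = inter_pp // group_size                                        # 16
w13_scales_shape = (num_experts, num_groups_w13, 2 * inter_pp)                # (8, 32, 4096)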
- w13_qzeros = Parameter(torch.empty(num_experts, - num_groups_w13, - 2 * intermediate_size // - self.quant_config.pack_factor, - dtype=torch.int32), - requires_grad=False) + w13_qzeros = Parameter( + torch.empty(num_experts, + num_groups_w13, + 2 * intermediate_size_per_partition // + self.quant_config.pack_factor, + dtype=torch.int32), + requires_grad=False) layer.register_parameter("w13_qzeros", w13_qzeros) set_weight_attrs(w13_qzeros, extra_weight_attrs) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index a9351147a6c30..063bdeefadd41 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -9,6 +9,7 @@ QuantizationType) from pydantic import BaseModel +from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, UnquantizedLinearMethod) @@ -27,6 +28,8 @@ from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod from vllm.platforms import current_platform +logger = init_logger(__name__) + __all__ = ["CompressedTensorsLinearMethod"] SPARSITY_CONFIG_NAME: Literal["sparsity_config"] = "sparsity_config" @@ -79,6 +82,8 @@ def get_quant_method( return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) + if scheme is None: + return UnquantizedLinearMethod() layer.scheme = scheme return CompressedTensorsLinearMethod(self) if isinstance(layer, Attention): @@ -342,10 +347,10 @@ def _get_scheme_from_parts( raise NotImplementedError( "No compressed-tensors compatible scheme was found.") - def get_scheme( - self, - layer: torch.nn.Module, - layer_name: Optional[str] = None) -> "CompressedTensorsScheme": + def get_scheme(self, + layer: torch.nn.Module, + layer_name: Optional[str] = None + ) -> Optional["CompressedTensorsScheme"]: """ compressed-tensors supports non uniform in the following way: @@ -355,10 +360,7 @@ def get_scheme( which can be a full layer_name, a regex for a layer_name, or an nn.Module name. - We first check whether a layer is in the ignore group and use - CompressedTensorsUnquantized (i.e. fp16/bf16) scheme for the layer - - We then detect whether a layer_name is found in any target and + Detect whether a layer_name is found in any target and use the quantization scheme corresponding to the matched target to select the CompressedTensorsScheme used for infernece. """ @@ -396,6 +398,13 @@ def get_scheme( if self.supports_cutlass_24(weight_quant=weight_quant, input_quant=input_quant, sparsity_scheme=sparsity_scheme): + # FIXME(tlrmchlsmth): layers using W16A16 CUTLASS 2:4 sparse kernels + # currently produce bad output in some cases + if weight_quant is None: + logger.warning_once( + "CompressedTensors24 scheme is disabled for the w16a16 " + "case. 
Falling back to UnquantizedLinearMethod") + return None # Have a valid sparsity scheme # Validate layer is supported by Cutlass 2:4 Kernel scheme = CompressedTensors24(quantized=weight_quant is not None diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 4fb8fd84e92d4..e1c45f4e42e41 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -13,6 +13,7 @@ FusedMoeWeightScaleSupported) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( WNA16_SUPPORTED_BITS) +from vllm.model_executor.layers.quantization.utils import replace_parameter from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs @@ -75,24 +76,26 @@ def __init__( self.static_input_scales = not self.input_quant.dynamic def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): params_dtype = torch.float8_e4m3fn # WEIGHTS - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=params_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=params_dtype), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) @@ -254,6 +257,7 @@ def __init__( self.packed_factor = 32 // config.num_bits self.strategy = config.strategy self.group_size = config.group_size + self.actorder = config.actorder assert config.symmetric, ( "Only symmetric quantization is supported for MoE") @@ -266,9 +270,16 @@ def __init__( f"{WNA16_SUPPORTED_BITS}") def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): + assert params_dtype == torch.float16, ( + "float16 is required for MoE compressed models. Set dtype=torch.float16" # noqa: E501 + ) + + intermediate_size_full = extra_weight_attrs.pop( + "intermediate_size_full") + # Will transpose the loaded weight along the # intermediate and hidden dim sizes. 
Will # shard for TP along the transposed dims @@ -276,35 +287,45 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, "is_transposed": True, "quant_method": self.strategy }) - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size // - self.packed_factor, - 2 * intermediate_size, - dtype=torch.int32), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size // self.packed_factor, + 2 * intermediate_size_per_partition, + dtype=torch.int32), requires_grad=False) layer.register_parameter("w13_weight_packed", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - intermediate_size // - self.packed_factor, - hidden_size, - dtype=torch.int32), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + intermediate_size_per_partition // self.packed_factor, + hidden_size, + dtype=torch.int32), requires_grad=False) layer.register_parameter("w2_weight_packed", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) + # In the case where we have actorder/g_idx, + # we do not partition the w2 scales + load_full_w2 = self.actorder and self.group_size != -1 + w2_scales_size = (intermediate_size_full + if load_full_w2 else intermediate_size_per_partition) + + self.is_k_full = (not self.actorder) or ( + intermediate_size_per_partition == intermediate_size_full) + if self.strategy == "channel": num_groups_w2 = num_groups_w13 = 1 self.group_size = -1 else: - num_groups_w2 = intermediate_size // self.group_size + num_groups_w2 = w2_scales_size // self.group_size num_groups_w13 = hidden_size // self.group_size - w13_scale = torch.nn.Parameter(torch.ones(num_experts, - num_groups_w13, - 2 * intermediate_size, - dtype=params_dtype), + w13_scale = torch.nn.Parameter(torch.ones( + num_experts, + num_groups_w13, + 2 * intermediate_size_per_partition, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight_scale", w13_scale) set_weight_attrs(w13_scale, extra_weight_attrs) @@ -316,6 +337,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, requires_grad=False) layer.register_parameter("w2_weight_scale", w2_scale) set_weight_attrs(w2_scale, extra_weight_attrs) + set_weight_attrs(w2_scale, {"load_full_w2": load_full_w2}) w2_weight_shape = torch.nn.Parameter(torch.empty(num_experts, 2), requires_grad=False) @@ -335,18 +357,18 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, ), requires_grad=False, ) - layer.register_parameter("w13_g_idx", w13_g_idx) + layer.register_parameter("w13_weight_g_idx", w13_g_idx) set_weight_attrs(w13_g_idx, extra_weight_attrs) w2_g_idx = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size, + intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, ) - layer.register_parameter("w2_g_idx", w2_g_idx) + layer.register_parameter("w2_weight_g_idx", w2_g_idx) set_weight_attrs(w2_g_idx, extra_weight_attrs) w13_g_idx_sort_indices = torch.nn.Parameter( @@ -364,7 +386,7 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, w2_g_idx_sort_indices = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size, + intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, @@ -422,24 +444,55 @@ def marlin_moe_permute_scales(s: torch.Tensor, size_k: int, size_k2 = layer.w2_weight_packed.shape[2] size_k13 = layer.w13_weight_packed.shape[2] - num_experts = layer.w13_g_idx.shape[0] - device = layer.w13_g_idx.device - layer.w13_g_idx = torch.nn.Parameter( - 
torch.empty((num_experts, 0), dtype=torch.int32, device=device), - requires_grad=False, - ) - layer.w2_g_idx = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), - requires_grad=False, - ) - layer.w13_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), - requires_grad=False, - ) - layer.w2_g_idx_sort_indices = torch.nn.Parameter( - torch.empty((num_experts, 0), dtype=torch.int32, device=device), - requires_grad=False, - ) + num_experts = layer.w13_weight_g_idx.shape[0] + device = layer.w13_weight_g_idx.device + + # when running models with grouped act order, + # resort to g_idx values provided in checkpoint + if self.actorder == "group": + w13_g_idx_sort_indices = torch.empty_like(layer.w13_weight_g_idx) + w2_g_idx_sort_indices = torch.empty_like(layer.w2_weight_g_idx) + w13_sorted_g_idx = torch.empty_like(layer.w13_weight_g_idx) + w2_sorted_g_idx = torch.empty_like(layer.w2_weight_g_idx) + + for e in range(num_experts): + w13_g_idx_sort_indices[e] = torch.argsort( + layer.w13_weight_g_idx[e]).to(torch.int32) + w2_g_idx_sort_indices[e] = torch.argsort( + layer.w2_weight_g_idx[e]).to(torch.int32) + w13_sorted_g_idx[e] = layer.w13_weight_g_idx[e][ + w13_g_idx_sort_indices[e]] + w2_sorted_g_idx[e] = layer.w2_weight_g_idx[e][ + w2_g_idx_sort_indices[e]] + + replace_parameter(layer, "w13_weight_g_idx", w13_sorted_g_idx) + replace_parameter(layer, "w2_weight_g_idx", w2_sorted_g_idx) + replace_parameter(layer, "w13_g_idx_sort_indices", + w13_g_idx_sort_indices) + replace_parameter(layer, "w2_g_idx_sort_indices", + w2_g_idx_sort_indices) + + else: + layer.w13_weight_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_weight_g_idx = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w13_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) + layer.w2_g_idx_sort_indices = torch.nn.Parameter( + torch.empty((num_experts, 0), dtype=torch.int32, + device=device), + requires_grad=False, + ) marlin_w13_qweight = ops.gptq_marlin_moe_repack( layer.w13_weight_packed, @@ -511,9 +564,9 @@ def apply( router_logits, topk_weights, topk_ids, - g_idx1=layer.w13_g_idx, - g_idx2=layer.w2_g_idx, + g_idx1=layer.w13_weight_g_idx, + g_idx2=layer.w2_weight_g_idx, sort_indices1=layer.w13_g_idx_sort_indices, sort_indices2=layer.w2_g_idx_sort_indices, num_bits=self.num_bits, - ) + is_k_full=self.is_k_full) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py index 61d1c911cd1ad..2e1b5e3c2d3b1 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py @@ -62,7 +62,7 @@ def create_weights(self, layer: torch.nn.Module, input_size: int, **kwargs): assert params_dtype == torch.float16, ( - "float16 is required for marlin24 compressd models. Set dtype=torch.float16" # noqa: E501 + "float16 is required for marlin24 compressed models. 
Set dtype=torch.float16" # noqa: E501 ) pack_factor = 32 // self.quant_type.size_bits diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 209f12c6dfec9..100cbfa4c9598 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -52,7 +52,7 @@ def __init__(self, quant_config: ExpertsInt8Config): self.quant_config = quant_config def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): int8_dtype = torch.int8 @@ -64,26 +64,29 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, extra_weight_attrs['weight_loader'] = wrapped_weight_loader # Fused gate_up_proj (column parallel) - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=int8_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=int8_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) # down_proj (row parallel) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=int8_dtype), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=int8_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) - w13_scale = torch.nn.Parameter(torch.zeros(num_experts, - 2 * intermediate_size, - dtype=torch.float32), + w13_scale = torch.nn.Parameter(torch.zeros( + num_experts, + 2 * intermediate_size_per_partition, + dtype=torch.float32), requires_grad=False) layer.register_parameter("w13_scale", w13_scale) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 09e542d848950..b509a070a77f4 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -255,6 +255,15 @@ def create_weights( def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + weight, weight_scale, _ = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=layer.weight, + weight_scale=layer.weight_scale_inv, + input_scale=layer.input_scale) + layer.weight = Parameter(weight, requires_grad=False) + layer.weight_scale_inv = Parameter(weight_scale, + requires_grad=False) return layer.weight = torch.nn.Parameter(layer.weight.data, requires_grad=False) @@ -385,8 +394,8 @@ def __init__(self, quant_config: Fp8Config): self.block_quant = self.quant_config.weight_block_size is not None def create_weights(self, layer: Module, num_experts: int, hidden_size: int, - intermediate_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): + intermediate_size_per_partition: int, + params_dtype: torch.dtype, **extra_weight_attrs): if self.quant_config.is_checkpoint_fp8_serialized: params_dtype = torch.float8_e4m3fn @@ -401,30 +410,34 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, # scales, the output_size of the weights for both the gate and up # layers must be divisible by block_n. 
# Required by column parallel or enabling merged weights - if intermediate_size % block_n != 0: + if intermediate_size_per_partition % block_n != 0: raise ValueError( f"The output_size of gate's and up's weight = " - f"{intermediate_size} is not divisible by " + f"{intermediate_size_per_partition} is not divisible by " f"weight quantization block_n = {block_n}.") - if (tp_size > 1 and intermediate_size % block_k != 0): + if (tp_size > 1 + and intermediate_size_per_partition % block_k != 0): # Required by row parallel - raise ValueError(f"The input_size of down's weight = " - f"{intermediate_size} is not divisible by " - f"weight quantization block_k = {block_k}.") + raise ValueError( + f"The input_size of down's weight = " + f"{intermediate_size_per_partition} is not divisible by " + f"weight quantization block_k = {block_k}.") # WEIGHTS - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=params_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=params_dtype), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) @@ -445,7 +458,8 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, w13_weight_scale = torch.nn.Parameter( torch.ones( num_experts, - 2 * ((intermediate_size + block_n - 1) // block_n), + 2 * ((intermediate_size_per_partition + block_n - 1) // + block_n), (hidden_size + block_k - 1) // block_k, dtype=torch.float32, ), @@ -455,7 +469,7 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, torch.ones( num_experts, (hidden_size + block_n - 1) // block_n, - (intermediate_size + block_k - 1) // block_k, + (intermediate_size_per_partition + block_k - 1) // block_k, dtype=torch.float32, ), requires_grad=False, @@ -503,6 +517,30 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int, def process_weights_after_loading(self, layer: Module) -> None: # Block quant doesn't need to process weights after loading if self.block_quant: + if current_platform.is_rocm(): + w13_weight, w13_weight_scale_inv, w13_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w13_weight, layer.w13_weight_scale_inv, + layer.w13_input_scale) + w2_weight, w2_weight_scale_inv, w2_input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + layer.w2_weight, layer.w2_weight_scale_inv, + layer.w2_input_scale) + # Reset the parameter + layer.w13_weight = torch.nn.Parameter(w13_weight, + requires_grad=False) + layer.w13_weight_scale_inv = torch.nn.Parameter( + w13_weight_scale_inv, requires_grad=False) + if w13_input_scale is not None: + layer.w13_input_scale = torch.nn.Parameter( + w13_input_scale, requires_grad=False) + layer.w2_weight = torch.nn.Parameter(w2_weight, + requires_grad=False) + layer.w2_weight_scale_inv = torch.nn.Parameter( + w2_weight_scale_inv, requires_grad=False) + if w2_input_scale is not None: + layer.w2_input_scale = torch.nn.Parameter( + w2_input_scale, requires_grad=False) return # If checkpoint is fp16, quantize in place. 
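An editorial sketch (not part of the patch) of the block-wise FP8 scale shapes implied by the ceil-division arithmetic above, with illustrative sizes and block_n = block_k = 128; the checks above additionally require intermediate_size_per_partition to be divisible by block_n (and by block_k when TP > 1).

def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

num_experts, hidden_size, inter_pp = 8, 4096, 2048
block_n, block_k = 128, 128  # illustrative quantization block sizes

w13_scale_shape = (num_experts, 2 * ceil_div(inter_pp, block_n),
                   ceil_div(hidden_size, block_k))   # (8, 32, 32)
w2_scale_shape = (num_experts, ceil_div(hidden_size, block_n),
                  ceil_div(inter_pp, block_k))       # (8, 32, 16)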
if not self.quant_config.is_checkpoint_fp8_serialized: diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 2dbfca9b07690..4dc4b052b0410 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -317,7 +317,7 @@ def create_weights( layer: torch.nn.Module, num_experts: int, hidden_size: int, - intermediate_size: int, + intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs, ): @@ -326,7 +326,8 @@ def create_weights( # Supports only sym for now (no zp) if self.quant_config.group_size != -1: scales_size13 = hidden_size // self.quant_config.group_size - scales_size2 = intermediate_size // self.quant_config.group_size + scales_size2 = (intermediate_size_per_partition // + self.quant_config.group_size) strategy = FusedMoeWeightScaleSupported.GROUP.value else: scales_size13 = 1 @@ -342,7 +343,7 @@ def create_weights( torch.empty( num_experts, hidden_size // self.quant_config.pack_factor, - 2 * intermediate_size, + 2 * intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, @@ -353,7 +354,8 @@ def create_weights( w2_qweight = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size // self.quant_config.pack_factor, + intermediate_size_per_partition // + self.quant_config.pack_factor, hidden_size, dtype=torch.int32, ), @@ -365,7 +367,7 @@ def create_weights( w13_scales = torch.nn.Parameter( torch.empty(num_experts, scales_size13, - 2 * intermediate_size, + 2 * intermediate_size_per_partition, dtype=torch.half), requires_grad=False, ) @@ -385,7 +387,8 @@ def create_weights( w13_qzeros = torch.nn.Parameter( torch.empty(num_experts, scales_size13, - 2 * intermediate_size // self.quant_config.pack_factor, + 2 * intermediate_size_per_partition // + self.quant_config.pack_factor, dtype=params_dtype), requires_grad=False, ) @@ -414,7 +417,7 @@ def create_weights( w2_g_idx = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size, + intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, @@ -435,7 +438,7 @@ def create_weights( w2_g_idx_sort_indices = torch.nn.Parameter( torch.empty( num_experts, - intermediate_size, + intermediate_size_per_partition, dtype=torch.int32, ), requires_grad=False, diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py index b04612a9b00d9..915bdc4778929 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/MPLinearKernel.py @@ -73,12 +73,12 @@ def _transform_param(self, layer: torch.nn.Module, name: Optional[str], torch.nn.Parameter(new_param.data, requires_grad=False)) def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # w_q - torch.Tensor, # w_s - Optional[torch.Tensor], # w_zp, - Optional[torch.Tensor] # w_gidx - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # w_q + torch.Tensor, # w_s + Optional[torch.Tensor], # w_zp, + Optional[torch.Tensor] # w_gidx + ]: return ( getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py index 75cf91f191136..c4a83b4faafe6 100644 --- 
a/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/ScaledMMLinearKernel.py @@ -48,13 +48,13 @@ def apply_weights(self, raise NotImplementedError def _get_weight_params( - self, layer: torch.nn.Module - ) -> Tuple[torch.Tensor, # weight - torch.Tensor, # weight_scale - Optional[torch.Tensor], # input_scale, - Optional[torch.Tensor], # input_zp - Optional[torch.Tensor], # azp_adj - ]: + self, layer: torch.nn.Module) -> Tuple[ + torch.Tensor, # weight + torch.Tensor, # weight_scale + Optional[torch.Tensor], # input_scale, + Optional[torch.Tensor], # input_zp + Optional[torch.Tensor], # azp_adj + ]: return ( getattr(layer, self.w_q_name), getattr(layer, self.w_s_name), diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py index 586752d3d34e3..4824a11804163 100644 --- a/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/__init__.py @@ -5,8 +5,8 @@ CutlassScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.ScaledMMLinearKernel import ( # noqa: E501 ScaledMMLinearKernel, ScaledMMLinearLayerConfig) -# from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import ( -# TritonScaledMMLinear) +from vllm.model_executor.layers.quantization.kernels.scaled_mm.triton import ( + TritonScaledMMLinearKernel) from vllm.model_executor.layers.quantization.kernels.scaled_mm.xla import ( XLAScaledMMLinearKernel) from vllm.platforms import PlatformEnum, current_platform @@ -15,9 +15,7 @@ _POSSIBLE_KERNELS: Dict[PlatformEnum, List[Type[ScaledMMLinearKernel]]] = { PlatformEnum.CPU: [CutlassScaledMMLinearKernel], PlatformEnum.CUDA: [CutlassScaledMMLinearKernel], - # TODO(rob): Create TritonScaledMMLinear kernel. ROCM will - # incorrectly attempt to run AZP models if prompted to. 
- PlatformEnum.ROCM: [CutlassScaledMMLinearKernel], + PlatformEnum.ROCM: [TritonScaledMMLinearKernel], PlatformEnum.TPU: [XLAScaledMMLinearKernel], } diff --git a/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py new file mode 100644 index 0000000000000..97ec8cb0500d7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/kernels/scaled_mm/triton.py @@ -0,0 +1,38 @@ +from typing import Optional, Tuple + +import torch + +from vllm.platforms import current_platform + +from .cutlass import CutlassScaledMMLinearKernel +from .ScaledMMLinearKernel import ScaledMMLinearLayerConfig + + +class TritonScaledMMLinearKernel(CutlassScaledMMLinearKernel): + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def can_implement( + cls, c: ScaledMMLinearLayerConfig) -> Tuple[bool, Optional[str]]: + if current_platform.is_cpu(): + return ( + False, + "TritonScaledMMLinearKernel requires Triton which is not " + + "currently supported on CPU.") + if not c.input_symmetric: + return (False, + "TritonScaledMMLinearKernel only supports symmetric " + + "quantization.") + return True, None + + def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + super().process_weights_after_loading(layer) + + def apply_weights(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + return super().apply_weights(layer, x, bias) diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index a74f5415c8a51..e1870c73cc932 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -3,6 +3,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -40,11 +41,16 @@ def apply(self, layer: torch.nn.Module) -> torch.Tensor: def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0 # regardless whether the kv-scale is available in the checkpoint. - if layer.kv_cache_dtype != "auto": + # No need to process kv scales after loading if we are going to + # calculate them on the fly. 
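For illustration, selection over such a per-platform kernel list could look like the first-fit sketch below; the actual dispatcher is not part of this diff, so every name here other than can_implement is hypothetical.

from typing import List, Optional, Tuple, Type

class DummyKernel:
    """Stand-in for a ScaledMMLinearKernel subclass (illustrative only)."""

    @classmethod
    def can_implement(cls, config: object) -> Tuple[bool, Optional[str]]:
        # Real kernels reject configs they cannot handle, e.g. the Triton
        # kernel above refuses asymmetric quantization.
        return True, None

def choose_first_fit(candidates: List[Type[DummyKernel]],
                     config: object) -> Type[DummyKernel]:
    reasons = []
    for kernel_cls in candidates:
        ok, reason = kernel_cls.can_implement(config)
        if ok:
            return kernel_cls
        reasons.append(f"{kernel_cls.__name__}: {reason}")
    raise ValueError("No kernel supports this layer: " + "; ".join(reasons))

print(choose_first_fit([DummyKernel], config=None).__name__)  # DummyKernel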
+ if layer.kv_cache_dtype != "auto" and not layer.calculate_kv_scales: if layer.k_scale > 0.0 and layer.v_scale > 0.0: # We prefer to use separate k_scale and v_scale if present k_scale = layer.k_scale.to("cpu").tolist() v_scale = layer.v_scale.to("cpu").tolist() + if current_platform.is_rocm(): + k_scale *= 2 + v_scale *= 2 elif layer.k_scale < 0.0 and layer.v_scale < 0.0: # If no scales were loaded (both scales are invalid negative # values), use the default value of 1.0 @@ -58,6 +64,9 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: scale_to_duplicate = max(layer.k_scale, layer.v_scale) k_scale = scale_to_duplicate.to("cpu").tolist() v_scale = scale_to_duplicate.to("cpu").tolist() + if current_platform.is_rocm(): + k_scale *= 2 + v_scale *= 2 if not isinstance(k_scale, float) or not isinstance( v_scale, float): @@ -65,9 +74,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: "for fp8 KV cache") # These are used in the final Attention.forward() - layer._k_scale = k_scale - layer._v_scale = v_scale - if (layer._k_scale == 1.0 and layer._v_scale == 1.0 + layer._k_scale.copy_(k_scale) + layer._v_scale.copy_(v_scale) + layer._k_scale_float = k_scale + layer._v_scale_float = v_scale + if (k_scale == 1.0 and v_scale == 1.0 and "e5m2" not in layer.kv_cache_dtype): logger.warning_once( "Using KV cache scaling factor 1.0 for fp8_e4m3. This " diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 3e19247300808..68a3954540763 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -60,24 +60,26 @@ def __init__(self, weight_config: Dict[str, Any], input_config: Dict[str, self.static_input_scales = not self.input_quant.get("is_dynamic") def create_weights(self, layer: torch.nn.Module, num_experts: int, - hidden_size: int, intermediate_size: int, + hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): params_dtype = torch.float8_e4m3fn # WEIGHTS - w13_weight = torch.nn.Parameter(torch.empty(num_experts, - 2 * intermediate_size, - hidden_size, - dtype=params_dtype), + w13_weight = torch.nn.Parameter(torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w13_weight", w13_weight) set_weight_attrs(w13_weight, extra_weight_attrs) - w2_weight = torch.nn.Parameter(torch.empty(num_experts, - hidden_size, - intermediate_size, - dtype=params_dtype), + w2_weight = torch.nn.Parameter(torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition, + dtype=params_dtype), requires_grad=False) layer.register_parameter("w2_weight", w2_weight) set_weight_attrs(w2_weight, extra_weight_attrs) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index f3c3e130e4161..43b1997019107 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -5,6 +5,8 @@ import triton import triton.language as tl +from vllm.platforms import current_platform + def apply_w8a8_block_fp8_linear( input: torch.Tensor, @@ -33,11 +35,14 @@ def apply_w8a8_block_fp8_linear( def input_to_float8( - x: torch.Tensor, - dtype: torch.dtype = torch.float8_e4m3fn + x: torch.Tensor, + dtype: Optional[torch.dtype] = None ) -> 
Tuple[torch.Tensor, torch.Tensor]: """This function quantizes input values to float8 values " "with tensor-wise quantization.""" + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) finfo = torch.finfo(dtype) min_val, max_val = x.aminmax() amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12) @@ -67,9 +72,10 @@ def block_quant_to_tensor_quant( x_dq_block = x_q_block.to(torch.float32) x_dq_block_tiles = [[ - x_dq_block[j * block_n:min((j + 1) * block_n, n), - i * block_k:min((i + 1) * block_k, k), ] - for i in range(k_tiles) + x_dq_block[ + j * block_n:min((j + 1) * block_n, n), + i * block_k:min((i + 1) * block_k, k), + ] for i in range(k_tiles) ] for j in range(n_tiles)] for i in range(k_tiles): @@ -125,7 +131,7 @@ def per_token_group_quant_fp8( x: torch.Tensor, group_size: int, eps: float = 1e-10, - dtype: torch.dtype = torch.float8_e4m3fn, + dtype: Optional[torch.dtype] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: """Function to perform per-token-group quantization on an input tensor `x`. It converts the tensor values into signed float8 values and returns the @@ -140,6 +146,9 @@ def per_token_group_quant_fp8( Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization. """ + if dtype is None: + dtype = (torch.float8_e4m3fnuz + if current_platform.is_rocm() else torch.float8_e4m3fn) assert (x.shape[-1] % group_size == 0), ( f"the last dimension of `x` {x.shape[-1]} must be divisible " f"by `group_size` {group_size}") diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index e4a86544b6c20..a537b87c70581 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -89,8 +89,8 @@ def requantize_with_max_scale( # from disk in this case. Skip requantization in this case (since) # we already are quantized with the single scale. # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8 - unfused_module_in_checkpoint = (weight_scale[-1] > torch.finfo( - torch.float8_e4m3fn).min) + unfused_module_in_checkpoint = (weight_scale[-1] + > torch.finfo(torch.float8_e4m3fn).min) # If unfused checkpoint, need requanize with the single scale. 
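As a rough, standalone sketch of the tensor-wise fp8 quantization that input_to_float8 performs in fp8_utils.py above (scale derived from the tensor's absolute maximum; the exact clamping and return convention of the vLLM helper may differ):

import torch

def toy_tensor_wise_fp8(x: torch.Tensor, dtype=torch.float8_e4m3fn):
    finfo = torch.finfo(dtype)
    amax = x.abs().max().clamp(min=1e-12)     # avoid division by zero
    scale = finfo.max / amax                  # map amax onto the fp8 max
    x_q = (x * scale).clamp(finfo.min, finfo.max).to(dtype)
    # Dequantization convention: x ~= x_q.float() * (1 / scale)
    return x_q, scale.float().reciprocal()

x = torch.randn(4, 8)
x_q, inv_scale = toy_tensor_wise_fp8(x)
print((x_q.float() * inv_scale - x).abs().max())  # small quantization error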
if unfused_module_in_checkpoint: diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 614906b13f58b..e17d0bc8ca7f3 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -867,6 +867,37 @@ def get_input_positions( ) -> Tuple[List[List[int]], int]: """Get mrope input positions and delta value.""" + llm_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + input_tokens, + image_grid_thw, + video_grid_thw, + image_token_id, + video_token_id, + vision_start_token_id, + vision_end_token_id, + spatial_merge_size, + context_len, + seq_len, + ) + + return llm_positions.tolist(), mrope_position_delta + + @staticmethod + def get_input_positions_tensor( + input_tokens: List[int], + image_grid_thw: Union[List[List[int]], torch.Tensor], + video_grid_thw: Union[List[List[int]], torch.Tensor], + image_token_id: int, + video_token_id: int, + vision_start_token_id: int, + vision_end_token_id: int, + spatial_merge_size: int, + context_len: int = 0, + seq_len: Optional[int] = None, + ) -> Tuple[torch.Tensor, int]: + """Get mrope input positions and delta value.""" + if isinstance(image_grid_thw, torch.Tensor): image_grid_thw = image_grid_thw.tolist() if isinstance(video_grid_thw, torch.Tensor): @@ -942,7 +973,7 @@ def get_input_positions( len(input_tokens)).item() llm_positions = llm_positions[:, context_len:seq_len] - return llm_positions.tolist(), mrope_position_delta + return llm_positions, mrope_position_delta @staticmethod def get_next_input_positions( @@ -956,6 +987,17 @@ def get_next_input_positions( seq_len + mrope_position_delta)) for _ in range(3) ] + @staticmethod + def get_next_input_positions_tensor( + mrope_position_delta: int, + context_len: int, + seq_len: int, + ) -> torch.Tensor: + return torch.arange( + mrope_position_delta + context_len, + mrope_position_delta + seq_len, + ).expand(3, -1) + _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {} diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index cafff6a9134dc..e75d69f3034fc 100755 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -889,9 +889,10 @@ def _sample_with_torch( tensors required for Pythonization ''' - categorized_seq_group_ids: Dict[SamplingType, - List[int]] = {t: [] - for t in SamplingType} + categorized_seq_group_ids: Dict[SamplingType, List[int]] = { + t: [] + for t in SamplingType + } categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): sampling_params = seq_group.sampling_params diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 65920aa61ba15..f230efacacdbb 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -115,17 +115,17 @@ def num_elements_padded(self) -> int: def __post_init__(self): # sanity checks - assert (self.padded_org_vocab_start_index <= - self.padded_org_vocab_end_index) - assert (self.padded_added_vocab_start_index <= - self.padded_added_vocab_end_index) + assert (self.padded_org_vocab_start_index + <= self.padded_org_vocab_end_index) + assert (self.padded_added_vocab_start_index + <= self.padded_added_vocab_end_index) assert self.org_vocab_start_index <= self.org_vocab_end_index assert self.added_vocab_start_index <= self.added_vocab_end_index 
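A toy call of the new tensor-based M-RoPE helper, using the same arithmetic as get_next_input_positions_tensor above (one row per rotary position component):

import torch

def next_positions(mrope_position_delta: int, context_len: int,
                   seq_len: int) -> torch.Tensor:
    # Same expression as the helper added in the diff above.
    return torch.arange(mrope_position_delta + context_len,
                        mrope_position_delta + seq_len).expand(3, -1)

print(next_positions(mrope_position_delta=2, context_len=5, seq_len=8))
# tensor([[7, 8, 9],
#         [7, 8, 9],
#         [7, 8, 9]])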
assert self.org_vocab_start_index <= self.padded_org_vocab_start_index - assert (self.added_vocab_start_index <= - self.padded_added_vocab_start_index) + assert (self.added_vocab_start_index + <= self.padded_added_vocab_start_index) assert self.org_vocab_end_index <= self.padded_org_vocab_end_index assert self.added_vocab_end_index <= self.padded_added_vocab_end_index @@ -141,8 +141,8 @@ def get_masked_input_and_mask( added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]: # torch.compile will fuse all of the pointwise ops below # into a single kernel, making it very fast - org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < - org_vocab_end_index) + org_vocab_mask = (input_ >= org_vocab_start_index) & ( + input_ < org_vocab_end_index) added_vocab_mask = (input_ >= added_vocab_start_index) & ( input_ < added_vocab_end_index) added_offset = added_vocab_start_index - ( @@ -355,7 +355,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): elif isinstance(param, UninitializedParameter): shape = list(loaded_weight.shape) if output_dim is not None: - shape[output_dim] = shape[output_dim] // self.tp_size + shape[output_dim] = self.num_embeddings_per_partition param.materialize(tuple(shape), dtype=loaded_weight.dtype) # If parameter does not have output dim, then it should @@ -381,7 +381,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): else: assert loaded_weight.shape[output_dim] == self.org_vocab_size - # Copy the data. + # Copy the data. Select chunk corresponding to current shard. loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) if current_platform.is_hpu(): diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 96f042df49d69..caab4ec17552f 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -114,7 +114,7 @@ def _initialize_model( all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(vllm_config=vllm_config, prefix=prefix) msg = ("vLLM model class should accept `vllm_config` and `prefix` as " @@ -142,7 +142,7 @@ def _initialize_model( kwargs["lora_config"] = vllm_config.lora_config if "scheduler_config" in all_params: kwargs["scheduler_config"] = vllm_config.scheduler_config - with set_current_vllm_config(vllm_config): + with set_current_vllm_config(vllm_config, check_compile=True): return model_class(**kwargs) @@ -182,6 +182,9 @@ class Source: fall_back_to_pt: bool = True """Whether .pt weights can be used.""" + allow_patterns_overrides: Optional[list[str]] = None + """If defined, weights will load exclusively using these patterns.""" + def __init__(self, load_config: LoadConfig): super().__init__(load_config) if load_config.model_loader_extra_config: @@ -218,6 +221,7 @@ def _prepare_weights( model_name_or_path: str, revision: Optional[str], fall_back_to_pt: bool, + allow_patterns_overrides: Optional[list[str]], ) -> Tuple[str, List[str], bool]: """Prepare weights for the model. 
@@ -249,6 +253,9 @@ def _prepare_weights( if fall_back_to_pt: allow_patterns += ["*.pt"] + if allow_patterns_overrides is not None: + allow_patterns = allow_patterns_overrides + if not is_local: hf_folder = download_weights_from_hf( model_name_or_path, @@ -298,7 +305,8 @@ def _get_weights_iterator( ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( - source.model_or_path, source.revision, source.fall_back_to_pt) + source.model_or_path, source.revision, source.fall_back_to_pt, + source.allow_patterns_overrides) if self.load_config.load_format == LoadFormat.NPCACHE: # Currently np_cache only support *.bin checkpoints assert use_safetensors is False @@ -351,6 +359,8 @@ def _get_all_weights( prefix="", fall_back_to_pt=getattr(model, "fall_back_to_pt_during_load", True), + allow_patterns_overrides=getattr(model, "allow_patterns_overrides", + None), ) yield from self._get_weights_iterator(primary_weights) @@ -364,7 +374,8 @@ def _get_all_weights( def download_model(self, model_config: ModelConfig) -> None: self._prepare_weights(model_config.model, model_config.revision, - fall_back_to_pt=True) + fall_back_to_pt=True, + allow_patterns_overrides=None) def load_model(self, vllm_config: VllmConfig) -> nn.Module: device_config = vllm_config.device_config @@ -1081,8 +1092,8 @@ def _load_weights(self, model_config: ModelConfig, # weight tensor. So TP does not work with pre_quantized bnb models. if pre_quant and get_tensor_model_parallel_world_size() > 1: raise ValueError( - "Prequant BitsAndBytes models with TP is not supported." - "Please try with PP.") + "Prequant BitsAndBytes models with tensor parallelism is not " + "supported. Please try with pipeline parallelism.") load_8bit = False if pre_quant: @@ -1121,15 +1132,23 @@ def _load_weights(self, model_config: ModelConfig, weight_name, index, ) in self.modules_mapping.inverse_packed_mapping.items(): - shard_pos = quant_param_name.find(shard_name) # Some models, such as MiniCPM V2.5/2.6, contain both # module names 'kv_proj' and 'qkv_proj'. To prevent 'kv_proj' # from being incorrectly identified as being present in # 'vpm.encoder.layers.0.self_attn.qkv_proj.weight - if shard_pos > 0 and quant_param_name[shard_pos - 1] == ".": + shard_pos = quant_param_name.find(shard_name) + can_correct_rename = (shard_pos + > 0) and (quant_param_name[shard_pos - 1] + == ".") + # If the quant_param_name is packed, it won't occur in the + # param_dict before renaming. + new_quant_param_name = quant_param_name.replace( + shard_name, weight_name) + need_rename = (quant_param_name not in param_dict) \ + and (new_quant_param_name in param_dict) + if can_correct_rename and need_rename: shard_index = index - quant_param_name = quant_param_name.replace( - shard_name, weight_name) + quant_param_name = new_quant_param_name break # Models like Clip/Siglip may skip some layers in initialization, diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index fbd4937112e11..9266ca75ddaac 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -288,7 +288,8 @@ def _init_model(self): model_args.torch_dtype = self.tensorizer_config.dtype assert self.tensorizer_config.model_class is not None # TODO: Do we need to consider old-style model class? 
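A model can opt into the new allow_patterns_overrides hook above simply by exposing that attribute, as the fairseq2 model added later in this diff does; a minimal sketch under that assumption (class name and file patterns are illustrative):

import torch.nn as nn

class ToyShardedModel(nn.Module):
    """Illustrative only: restricts which checkpoint files the loader globs."""

    def __init__(self, tp_rank: int):
        super().__init__()
        # Read via getattr(model, "allow_patterns_overrides", None) in
        # the loader's _get_all_weights above.
        self.allow_patterns_overrides = [
            "model.pt",              # full checkpoint
            f"model.{tp_rank}.pt",   # or the shard for this rank
        ]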
- with no_init_or_tensor(), set_current_vllm_config(self.vllm_config): + with no_init_or_tensor(), set_current_vllm_config(self.vllm_config, + check_compile=True): return self.tensorizer_config.model_class( vllm_config=self.vllm_config, ) @@ -297,8 +298,8 @@ def _resize_lora_embeddings(self): to allow for adapter added tokens.""" for child in self.model.modules(): if (isinstance(child, VocabParallelEmbedding) - and child.weight.shape[0] < - child.num_embeddings_per_partition): + and child.weight.shape[0] + < child.num_embeddings_per_partition): new_weight = torch.empty(child.num_embeddings_per_partition, child.embedding_dim, dtype=child.weight.dtype, @@ -459,16 +460,7 @@ def tensorize_vllm_model(engine_args: EngineArgs, stream.write(encryption_params.key) engine = LLMEngine.from_engine_args(engine_args) - if tensorizer_config._is_sharded: - # if the engine is a distributed engine (for tensor parallel) then each - # worker shard needs to serialize its part of the model. - engine.model_executor._run_workers( - "save_tensorized_model", - tensorizer_config=tensorizer_config, - ) - else: - # with a single worker, we can get to the underlying model directly - serialize_vllm_model( - engine.model_executor.driver_worker.model_runner.model, - tensorizer_config, - ) + engine.model_executor.collective_rpc( + "save_tensorized_model", + kwargs=dict(tensorizer_config=tensorizer_config), + ) diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8a3a29765c5fa..fecfdc06723f7 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -6,8 +6,7 @@ import os import tempfile from collections import defaultdict -from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional, - Tuple, Union) +from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union import filelock import gguf @@ -23,7 +22,6 @@ from vllm.logger import init_logger from vllm.model_executor.layers.quantization import (QuantizationConfig, get_quantization_config) -from vllm.model_executor.layers.quantization.schema import QuantParamSchema from vllm.platforms import current_platform from vllm.utils import PlaceholderModule @@ -95,7 +93,7 @@ def convert_bin_to_safetensor_file( pt_filename: str, sf_filename: str, ) -> None: - loaded = torch.load(pt_filename, map_location="cpu") + loaded = torch.load(pt_filename, map_location="cpu", weights_only=True) if "state_dict" in loaded: loaded = loaded["state_dict"] shared = _shared_pointers(loaded) @@ -383,7 +381,9 @@ def np_cache_weights_iterator( disable=not enable_tqdm, bar_format=_BAR_FORMAT, ): - state = torch.load(bin_file, map_location="cpu") + state = torch.load(bin_file, + map_location="cpu", + weights_only=True) for name, param in state.items(): param_path = os.path.join(np_folder, name) with open(param_path, "wb") as f: @@ -449,7 +449,7 @@ def pt_weights_iterator( disable=not enable_tqdm, bar_format=_BAR_FORMAT, ): - state = torch.load(bin_file, map_location="cpu") + state = torch.load(bin_file, map_location="cpu", weights_only=True) yield from state.items() del state torch.cuda.empty_cache() @@ -496,47 +496,6 @@ def gguf_quant_weights_iterator( yield name, param -def kv_cache_scales_loader( - filename: str, tp_rank: int, tp_size: int, num_hidden_layers: int, - model_type: Optional[str]) -> Iterable[Tuple[int, float]]: - """ - A simple utility to read in KV cache scaling factors that have been - previously serialized to disk. 
Used by the model to populate the appropriate - KV cache scaling factors. The serialization should represent a dictionary - whose keys are the TP ranks and values are another dictionary mapping layers - to their KV cache scaling factors. - Keep this function in sync with the output of - examples/other/fp8/extract_scales.py - """ - try: - with open(filename) as f: - context = { - "model_type": model_type, - "num_hidden_layers": num_hidden_layers, - "tp_rank": tp_rank, - "tp_size": tp_size, - } - schema_dct = json.load(f) - schema = QuantParamSchema.model_validate(schema_dct, - context=context) - layer_scales_map = schema.kv_cache.scaling_factor[tp_rank] - return layer_scales_map.items() - - except FileNotFoundError: - logger.error("File or directory '%s' not found.", filename) - except json.JSONDecodeError: - logger.error("Error decoding JSON in file '%s'.", filename) - except Exception: - logger.exception("An error occurred while reading '%s'.", filename) - # This section is reached if and only if any of the excepts are hit - # Return an empty iterable (list) => no KV cache scales are loaded - # which ultimately defaults to 1.0 scales - logger.warning( - "Defaulting to KV cache scaling factors = 1.0 for all " - "layers in TP rank %d as an error occurred during loading.", tp_rank) - return [] - - def convert_pyslice_to_tensor(x: Any) -> torch.Tensor: """convert PySafeSlice object from safetensors to torch.Tensor @@ -644,7 +603,12 @@ def initialize_dummy_weights( param.uniform_(low, high) continue - generator = torch.Generator(device=param.data.device) + if current_platform.is_hpu(): + import habana_frameworks.torch.hpu.random as htrandom + generator = htrandom.default_generators[0] + else: + generator = torch.Generator(device=param.data.device) + generator.manual_seed(seed) if torch.finfo(param.data.dtype).bits < 16: # uniform_ doesn't support < 16-bit datatypes (FP8) @@ -693,9 +657,18 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]: return remapped_name possible_scale_names = [".k_scale", ".v_scale"] + modelopt_scale_names = [ + ".self_attn.k_proj.k_scale", ".self_attn.v_proj.v_scale" + ] for scale_name in possible_scale_names: if name.endswith(scale_name): - remapped_name = name.replace(scale_name, f".attn{scale_name}") + if any(mo_scale_name in name + for mo_scale_name in modelopt_scale_names): + remapped_name = name.replace( + f".self_attn.{scale_name[1]}_proj{scale_name}", + f".self_attn.attn{scale_name}") + else: + remapped_name = name.replace(scale_name, f".attn{scale_name}") if remapped_name not in params_dict: logger.warning_once( f"Found {scale_name} in the checkpoint (e.g. 
{name}), " diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 5b97eced62df0..8c6873de13627 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,9 +1,11 @@ -from typing import (Callable, Iterable, List, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch import torch.nn as nn -from transformers import BatchFeature, PretrainedConfig +from transformers import AriaConfig, AriaTextConfig, BatchFeature +from transformers.models.aria.modeling_aria import AriaCrossAttention +from transformers.models.aria.processing_aria import AriaProcessor from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig @@ -26,10 +28,12 @@ BaseProcessingInfo, PromptReplacement) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, - AriaVisionConfig) -from .idefics2_vision_model import Idefics2VisionTransformer +# yapf: disable +from .idefics2_vision_model import Idefics2VisionConfig +from .idefics2_vision_model import ( + Idefics2VisionTransformer as Idefics3VisionTransformer) +# yapf: enable from .interfaces import SupportsMultiModal from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, @@ -47,87 +51,69 @@ class AriaImagePixelInputs(TypedDict): """ -class AriaVisionTransformer(Idefics2VisionTransformer): - """ - AriaVisionTransformer is a modified version of Idefics2VisionTransformer - that replaces the post-layernorm with an identity layer. 
- """ +class AriaVisionTransformer(Idefics3VisionTransformer): def __init__( self, - config: AriaVisionConfig, + config: Idefics2VisionConfig, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: super().__init__(config, quant_config, prefix) + # Unlike Idefics3VisionTransformer which uses LayerNorm after the + # final layer, Aria omits this normalization, so we replace it with an + # Identity layer self.post_layernorm = nn.Identity() + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + + # NOTE: post_layernorm is not used in Aria + if "post_layernorm" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + -class AriaVisionModel(nn.Module): - config_class = AriaVisionConfig +class AriaProjectorMLP(nn.Module): def __init__( self, - config: AriaVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - *, - prefix: str = "", + in_features: int, + hidden_features: int, + output_dim: int, ) -> None: super().__init__() - self.vision_model = AriaVisionTransformer( - config, - quant_config, - prefix=f"{prefix}.vision_model", - ) - - def forward( - self, - pixel_values: torch.Tensor, - pixel_mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - patch_attention_mask = self._create_patch_attention_mask(pixel_mask) - - vit_oup = self.vision_model( - pixel_values=pixel_values, - patch_attention_mask=patch_attention_mask, - ) - - image_atts = self._create_image_attention_mask(patch_attention_mask) - - return vit_oup, image_atts - - def _create_patch_attention_mask( - self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor: - if pixel_mask is None: - return None - - patches_subgrid = pixel_mask.unfold( - dimension=1, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ).unfold( - dimension=2, - size=self.vision_model.config.patch_size, - step=self.vision_model.config.patch_size, - ) - return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - - def _create_image_attention_mask( - self, patch_attention_mask: torch.Tensor) -> torch.Tensor: - if patch_attention_mask is None: - return None - - flattened_mask = patch_attention_mask.flatten(1) - return torch.logical_not(flattened_mask) - - -class FFN(nn.Module): - - def __init__(self, embed_dim: int, ff_dim: int, output_dim: int) -> None: - super().__init__() - self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, bias=False) - self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False) + self.linear_in = ColumnParallelLinear(in_features, + hidden_features, + bias=False) + self.linear_out = RowParallelLinear(hidden_features, + output_dim, + bias=False) self.act = get_act_fn("gelu_new") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -137,46 +123,6 @@ def forward(self, 
hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states -class CrossAttention(nn.Module): - - def __init__(self, kv_dim: int, embed_dim: int, num_heads: int) -> None: - super().__init__() - self.num_heads = num_heads - self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) - self.k_proj = nn.Linear(kv_dim, embed_dim, bias=False) - self.v_proj = nn.Linear(kv_dim, embed_dim, bias=False) - - self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) - self.linear = nn.Linear(embed_dim, embed_dim) - - self.layer_norm = nn.LayerNorm(embed_dim) - self.ln_kv = nn.LayerNorm(kv_dim) - - def forward( - self, - x: torch.Tensor, - hidden_states: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - normed_hidden_states = self.layer_norm(hidden_states) - query = self.q_proj(normed_hidden_states).permute(1, 0, 2) - - x = self.ln_kv(x) - key = self.k_proj(x).permute(1, 0, 2) - value = self.v_proj(x).permute(1, 0, 2) - - attn_output, _ = self.multihead_attn(query, - key, - value, - attn_mask=attn_mask) - - attn_output = attn_output.permute(1, 0, 2) - - attn_output = self.linear(attn_output) - - return attn_output - - class AriaProjector(nn.Module): """ A projection module with one cross attention layer and one FFN layer, which @@ -198,42 +144,42 @@ class AriaProjector(nn.Module): A tensor with the shape of (batch_size, query_number, output_dim) """ - def __init__( - self, - patch_to_query_dict: dict[int, int], - embed_dim: int, - num_heads: int, - kv_dim: int, - ff_dim: int, - output_dim: int, - norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, - ) -> None: + def __init__(self, config: AriaConfig) -> None: super().__init__() - self.patch_to_query_dict = patch_to_query_dict - self.embed_dim = embed_dim - self.num_heads = num_heads + + self.patch_to_query_dict = config.projector_patch_to_query_dict + self.in_features = config.vision_config.hidden_size + self.num_heads = config.vision_config.num_attention_heads + self.kv_dim = config.vision_config.hidden_size + self.hidden_features = config.text_config.hidden_size + self.output_dim = config.text_config.hidden_size self.query = nn.Parameter( - torch.empty(max(patch_to_query_dict.values()), self.embed_dim)) + torch.empty(config.max_value_projector_patch_to_query_dict, + self.in_features)) - self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) + self.cross_attn = AriaCrossAttention(config) - self.ln_ffn = norm_layer(embed_dim) - self.ffn = FFN(embed_dim, ff_dim, output_dim) + self.layer_norm = nn.LayerNorm(self.in_features) + self.feed_forward = AriaProjectorMLP(self.in_features, + self.hidden_features, + self.output_dim) def forward( self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: - bs = x.shape[0] - queries = self.query.unsqueeze(0).repeat(bs, 1, 1) + batch_size, num_patches = x.shape[0], x.shape[1] + + if num_patches not in self.patch_to_query_dict: + raise KeyError(f"Number of patches {num_patches} not found in " + "patch_to_query_dict amongst possible values " + f"{self.patch_to_query_dict.keys()}.") - query_num = self.patch_to_query_dict.get(x.shape[1], None) - assert (query_num is not None - ), f"Query number for {x.shape[1]} patches is not provided" + query_num = self.patch_to_query_dict[num_patches] - queries = queries[:, :query_num, :] + queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1) if attn_mask is not None: attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) @@ -241,7 +187,7 @@ def forward( attention_out = 
self.cross_attn(x, queries, attn_mask=attn_mask) - out = self.ffn(self.ln_ffn(attention_out)) + out = self.feed_forward(self.layer_norm(attention_out)) return out @@ -278,7 +224,7 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, param.data.copy_(loaded_weight.transpose(1, 2)) -class MoELayer(nn.Module): +class AriaTextMoELayer(nn.Module): """ Mixture of Experts (MoE) Layer for the AriaMoE model. @@ -289,7 +235,7 @@ class MoELayer(nn.Module): def __init__( self, - config: AriaMoELMConfig, + config: AriaTextConfig, quant_config: Optional[QuantizationConfig], ) -> None: super().__init__() @@ -303,15 +249,16 @@ def __init__( num_experts=config.moe_num_experts, top_k=config.moe_topk, hidden_size=config.hidden_size, - intermediate_size=config.moe_intermediate_size, + intermediate_size=config.intermediate_size, quant_config=quant_config, reduce_results=True, ) self.shared_experts = LlamaMLP( config.hidden_size, - config.moe_intermediate_size * config.moe_num_shared_experts, + config.intermediate_size * config.moe_num_shared_experts, "silu", quant_config=quant_config, + bias=config.mlp_bias, ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -329,13 +276,15 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_output = torch.nn.functional.linear(hidden_states, self.router_weight) - shared_expert_output = self.shared_experts(hidden_states) + hidden_states_copy = hidden_states.clone() + # NOTE: hidden_states will be modified inplace by `FusedMoE` sparse_expert_output = self.experts(hidden_states, router_output) + shared_expert_output = self.shared_experts(hidden_states_copy) return sparse_expert_output + shared_expert_output -class MoEDecoderLayer(LlamaDecoderLayer): +class AriaTextDecoderLayer(LlamaDecoderLayer): """ Custom Decoder Layer for the AriaMoE model which modifies the standard `LlamaDecoderLayer` by replacing the traditional MLP with a Mixture of @@ -344,16 +293,16 @@ class MoEDecoderLayer(LlamaDecoderLayer): def __init__( self, - config: AriaMoELMConfig, + config: AriaTextConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: super().__init__(config, cache_config, quant_config, prefix) - self.mlp = MoELayer(config, quant_config=quant_config) + self.mlp = AriaTextMoELayer(config, quant_config=quant_config) -class AriaMoELMModel(LlamaModel): +class AriaTextModel(LlamaModel): """ Custom LlamaModel for the AriaMoE model which modifies the standard LlamaModel by replacing the `LlamaDecoderLayer` with `MoEDecoderLayer`. 
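The reordering in AriaTextMoELayer.forward above exists because FusedMoE mutates its input activations in place, so the shared experts must consume a clone taken beforehand. A toy illustration of the hazard being avoided (fused_moe_like is a stand-in, not the real kernel):

import torch

def fused_moe_like(x: torch.Tensor) -> torch.Tensor:
    x.mul_(0.0)          # stand-in for an op that clobbers its input in place
    return x + 1.0

h = torch.ones(2, 4)
shared_in = h.clone()                 # snapshot before the in-place op
sparse_out = fused_moe_like(h)        # h is now zeroed
shared_out = shared_in * 3.0          # still sees the original activations
out = sparse_out + shared_out         # mirrors sparse + shared expert outputs
assert torch.equal(shared_in, torch.ones(2, 4))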
@@ -362,7 +311,7 @@ class AriaMoELMModel(LlamaModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix, - layer_type=MoEDecoderLayer) + layer_type=AriaTextDecoderLayer) # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` @@ -434,25 +383,17 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -def build_mm_projector(config: PretrainedConfig): - return AriaProjector( - patch_to_query_dict=config.projector_patch_to_query_dict, - embed_dim=config.vision_config.hidden_size, - num_heads=config.vision_config.num_attention_heads, - kv_dim=config.vision_config.hidden_size, - ff_dim=config.text_config.hidden_size, - output_dim=config.text_config.hidden_size, - ) - - class AriaProcessingInfo(BaseProcessingInfo): def get_hf_config(self): - return self.ctx.get_hf_config() + return self.ctx.get_hf_config(AriaConfig) - def get_vision_config(self) -> AriaVisionConfig: + def get_vision_config(self): return self.get_hf_config().vision_config + def get_hf_processor(self): + return self.ctx.get_hf_processor(AriaProcessor) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -484,7 +425,7 @@ def get_dummy_processor_inputs( } hf_processor = self.info.get_hf_processor() - image_token: str = hf_processor.image_token # type: ignore + image_token: str = hf_processor.tokenizer.image_token # type: ignore return ProcessorInputs( prompt_text=image_token * num_images, @@ -554,10 +495,14 @@ def __init__( quant_config = vllm_config.quant_config self.config = config - self.vision_tower = AriaVisionModel(config.vision_config) - self.multi_modal_projector = build_mm_projector(config) + self.vision_tower = AriaVisionTransformer( + config.vision_config, + quant_config, + prefix=f"{prefix}.vision_tower", + ) + self.multi_modal_projector = AriaProjector(config) self.vocab_size = config.text_config.vocab_size - self.language_model = AriaMoELMModel( + self.language_model = AriaTextModel( vllm_config=vllm_config.with_hf_config(config.text_config), prefix=maybe_prefix(prefix, "language_model.model"), ) @@ -608,6 +553,22 @@ def _parse_and_validate_image_input( pixel_mask=pixel_mask, ) + def _create_patch_attention_mask( + self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor: + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ).unfold( + dimension=2, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + def _process_image_input( self, image_input: AriaImagePixelInputs ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -616,9 +577,18 @@ def _process_image_input( pixel_values = image_input['pixel_values'] pixel_mask = image_input['pixel_mask'] - image_feature, image_attn_mask = self.vision_tower( - pixel_values, pixel_mask=pixel_mask) - return self.multi_modal_projector(image_feature, image_attn_mask) + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + + image_outputs = self.vision_tower( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + image_attn_mask = None + if patch_attention_mask is not None: + flattened_mask = patch_attention_mask.flatten(1) + image_attn_mask = torch.logical_not(flattened_mask) + + return self.multi_modal_projector(image_outputs, 
image_attn_mask) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) @@ -683,6 +653,5 @@ def sample( return next_tokens def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - loader = AutoWeightsLoader(self) loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 917b88e802071..b559ac677a740 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -14,12 +14,12 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -475,36 +475,27 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + bos_token_id = tokenizer.bos_token_id + assert isinstance(bos_token_id, int) + + image_token_id = vocab[""] num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens return [ PromptReplacement( modality="image", - target="", - replacement="" * num_image_tokens + "", + target=[bos_token_id], + replacement=PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ), ) ] - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only tokens should be considered as placeholders, - # so we ignore the trailing bos_token - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor, info=Blip2ProcessingInfo, diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a6634204699c9..e834c9004f140 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -28,12 +28,12 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import MultiModalDataItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, 
PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -122,8 +122,9 @@ def _apply_hf_processor_tokens_only( ) -> list[int]: # HF processor adds sep token for chat mode tokenizer = self.info.get_tokenizer() - sep_token_id: int = \ - tokenizer.vocab[tokenizer.sep_token] # type: ignore + vocab = tokenizer.get_vocab() + + sep_token_id = vocab[tokenizer.sep_token] # type: ignore return prompt_tokens + [sep_token_id] @@ -141,39 +142,27 @@ def _get_prompt_replacements( out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + image_start_id = vocab[processor.image_start_token] + image_token_id = vocab[processor.image_token] + image_end_id = vocab[processor.image_end_token] + + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [image_token_id] * num_image_tokens return [ PromptReplacement( modality="image", - target="", - replacement="".join([ - processor.image_start_token, - processor.image_token * self.info.get_num_image_tokens(), - processor.image_end_token, - ]), + target=[image_token_id], + replacement=PromptReplacementDetails( + full=([image_start_id] + image_tokens + [image_end_id]), + features=image_tokens, + ), ) ] - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only tokens should be considered as placeholders, - # so we ignore the image_start_token and image_end_token - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"] + 1, - length=p["length"] - 2) for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - class ChameleonLayerNorm(nn.LayerNorm): diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 7e37ce3086e6b..d5f9b4d19e5ca 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -41,7 +41,7 @@ from vllm.transformers_utils.configs import ChatGLMConfig from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import (is_pp_missing_parameter, +from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -605,9 +605,50 @@ def forward( return IntermediateTensors({"hidden_states": hidden_states}) return hidden_states + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), + ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. 
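The blip2 and chameleon processors above now use PromptReplacementDetails to distinguish the full token span spliced into the prompt from the sub-span that actually carries image features, instead of trimming mm_placeholders after the fact. The pattern, mirroring those hunks with placeholder token ids:

from vllm.multimodal.processing import (PromptReplacement,
                                        PromptReplacementDetails)

IMAGE_TOKEN_ID, BOS_TOKEN_ID = 32000, 1   # illustrative ids
image_tokens = [IMAGE_TOKEN_ID] * 32      # illustrative feature length

replacement = PromptReplacement(
    modality="image",
    target=[BOS_TOKEN_ID],
    replacement=PromptReplacementDetails(
        full=image_tokens + [BOS_TOKEN_ID],  # everything inserted into the prompt
        features=image_tokens,               # only these positions hold image features
    ),
)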
+ if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "rotary_pos_emb.inv_freq" in name: + continue + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + class ChatGLMBaseModel(nn.Module, SupportsLoRA, SupportsPP): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_substr={".word_embeddings": ""}, ) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -660,52 +701,9 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: - # Merge two ColumnParallelLinear into one MergedColumnParallelLinear - merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { - "transformer.vision.linear_proj.merged_proj.weight": { - "transformer.vision.linear_proj.gate_proj.weight": None, - "transformer.vision.linear_proj.dense_h_to_4h.weight": None, - } - } - - params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() - for name, loaded_weight in weights: - is_weight_to_be_merge = False - for _, merged_weight_dict in merged_weights_dict.items(): - if name in merged_weight_dict: - assert merged_weight_dict[name] is None - merged_weight_dict[name] = loaded_weight - is_weight_to_be_merge = True - if is_weight_to_be_merge: - continue - if "rotary_pos_emb.inv_freq" in name: - continue - if "word_embeddings" in name: - name = name.replace(".word_embeddings", "") - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - - for combined_name, merged_weight_dict in merged_weights_dict.items(): - if combined_name in params_dict: - param = params_dict[combined_name] - combined_weight = torch.cat(list(merged_weight_dict.values()), - dim=0) - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, combined_weight) - loaded_params.add(combined_name) - return loaded_params + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) class ChatGLM(ChatGLMBaseModel): @@ -726,6 +724,7 @@ class ChatGLM(ChatGLMBaseModel): class ChatGLMV(ChatGLMBaseModel, SupportsMultiModal): + packed_modules_mapping = { "query_key_value": ["query_key_value"], "dense_h_to_4h": ["dense_h_to_4h"], @@ -777,7 +776,7 @@ def __new__( ) -> None: config = vllm_config.model_config.hf_config # Initialize VL - if hasattr(config, "visual"): + if hasattr(config, "vision_config"): return ChatGLMV(vllm_config=vllm_config, prefix=prefix) # Initialize LLM else: diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 4553695022169..344832d8b33e6 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -1,7 +1,7 @@ # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math -from functools import cached_property, partial +from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -9,7 +9,7 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from transformers import AutoProcessor, BatchFeature, ProcessorMixin +from transformers import BatchFeature from vllm.attention import AttentionMetadata from vllm.config import VllmConfig @@ -31,6 +31,8 @@ from vllm.transformers_utils.configs.deepseek_vl2 import (DeepseekVLV2Config, MlpProjectorConfig, VisionEncoderConfig) +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP @@ -129,25 +131,8 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo): def get_hf_config(self): return self.ctx.get_hf_config(DeepseekVLV2Config) - def get_hf_processor(self) -> ProcessorMixin: - # TODO(Isotr0py): we should get rid of dependency on deepseek_vl2 - # in the future, because it's flasky and lack of maintenance. 
- try: - from deepseek_vl2.models.processing_deepseek_vl_v2 import ( - DeepseekVLV2Processor, select_best_resolution) - AutoProcessor.register("DeepseekVLV2Processor", - DeepseekVLV2Processor) - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "You need to `pip install " - "git+https://github.com/deepseek-ai/DeepSeek-VL2.git` " - "to use this model") from exc - - processor = self.ctx.get_hf_processor(DeepseekVLV2Processor) - processor.select_best_resolution = partial( - select_best_resolution, - candidate_resolutions=processor.candidate_resolutions) - return processor + def get_hf_processor(self) -> DeepseekVLV2Processor: + return self.ctx.get_hf_processor(DeepseekVLV2Processor) def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} @@ -224,31 +209,21 @@ def _call_hf_processor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: if mm_data: - outputs = self.info.ctx.call_hf_processor( + processed_outputs = self.info.ctx.call_hf_processor( self.info.get_hf_processor(**mm_kwargs), dict(prompt=prompt, **mm_data), mm_kwargs, ) - - # Deepseek-vl2 processor don't return BatchFeature, - # we need to manually create it - processed_outputs = dict(input_ids=outputs["input_ids"]) - processed_outputs = BatchFeature(data=dict(processed_outputs), - tensor_type="pt") - - # Remove batch dimension from processor outputs, - # because we will try batch to create NestedTensors target_dtype = self.info.ctx.model_config.dtype - pixel_values = outputs["images"].to(target_dtype).squeeze(0) - images_spatial_crop = outputs["images_spatial_crop"].squeeze(0) + pixel_values = processed_outputs.pop("pixel_values").to( + target_dtype) + # split pixel values into patches corresponding to each image + images_spatial_crop = processed_outputs["images_spatial_crop"] patches_per_image = [ x.prod().item() + 1 for x in images_spatial_crop ] - - # Rename `images` -> `pixel_values` to avoid confusion - processed_outputs["pixel_values"] = list( - pixel_values.split(patches_per_image)) - processed_outputs["images_spatial_crop"] = images_spatial_crop + pixel_values = pixel_values.split(patches_per_image) + processed_outputs["pixel_values"] = pixel_values else: tokenizer = self.info.get_tokenizer() processed_outputs = tokenizer(prompt, @@ -274,8 +249,10 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self.info.get_hf_processor() - image_token_id: int = hf_processor.image_token_id + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + image_token_id = hf_processor.image_token_id + assert isinstance(image_token_id, int) def get_replacement_deepseek_vl2(item_idx: int): images = mm_items.get_items( diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index eab3bf0756fca..bc3295da7b60a 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -30,8 +30,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import 
(MergedColumnParallelLinear, @@ -44,9 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.exaone import ExaoneConfig @@ -576,32 +574,3 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params - - # If this function is called, it should always initialize KV cache scale - # factors (or else raise an exception). Thus, handled exceptions should - # make sure to leave KV cache scale factors in a known good (dummy) state - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, - tp_rank, - tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type, - ): - if not isinstance(self.transformer.h[layer_idx], nn.Identity): - layer_self_attn = self.transformer.h[layer_idx].attn - - if current_platform.is_rocm(): - # The scaling factor convention we are assuming is - # quantized_value * scaling_factor ~= true_value - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 - if hasattr(layer_self_attn.attn, "_k_scale"): - layer_self_attn.attn._k_scale = scaling_factor - layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py new file mode 100644 index 0000000000000..b93a68680375d --- /dev/null +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -0,0 +1,151 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Meta Platforms, Inc. and affiliates. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
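The removed exaone load_kv_cache_scales path documented the convention quantized_value * scaling_factor ~= true_value, i.e. scaling_factor = tensor_amax / fp8_max, and kv_cache.py earlier in this diff now doubles checkpoint k/v scales on ROCm. A worked example of that convention; attributing the factor of two to the narrower e4m3fnuz range on ROCm is an assumption, not something the diff states:

import torch

amax = 8.0                                              # example per-tensor |max| of K values
fp8_max = float(torch.finfo(torch.float8_e4m3fn).max)   # 448.0
k_scale = amax / fp8_max                                # quantized * k_scale ~= true value
print(k_scale)                                          # ~0.01786

# ROCm path in kv_cache.py above: the checkpoint scale is doubled.
k_scale_rocm = 2 * k_scale
print(float(torch.finfo(torch.float8_e4m3fnuz).max))    # 240.0 (vs 448.0 for e4m3fn)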
+"""Llama model for fairseq2 weights.""" + +from typing import Iterable, Set, Tuple + +import torch +from torch.nn import Parameter + +from vllm.config import VllmConfig +from vllm.distributed import (get_tensor_model_parallel_rank, + get_tensor_model_parallel_world_size) +from vllm.model_executor.layers.linear import set_weight_attrs +from vllm.model_executor.models.llama import LlamaForCausalLM + +from .utils import AutoWeightsLoader, WeightsMapper + + +class Fairseq2LlamaForCausalLM(LlamaForCausalLM): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() + # For the model loader to read only the relevant checkpoint files + self.allow_patterns_overrides = [ + # either the full checkpoint + "model.pt", + # or the tp-sharded checkpoint of the current rank + f"model.{self.tp_rank}.pt", + ] + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + # fairseq2's serialization adds a wrapper to usual .pt state_dict's: + # { "model_key": my_model_name, "my_model_name": state_dict } + # which we first need to unpack + weights_wrapped = dict(weights) + weights = weights_wrapped[ + weights_wrapped["model_key"]].items() # type: ignore + + # remap keys + fs2_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder_frontend.embed.": "model.embed_tokens.", + "decoder.": "model.", + "final_proj.": "lm_head.", + }, + orig_to_new_substr={ + ".self_attn_layer_norm.": ".input_layernorm.", + ".ffn_layer_norm.": ".post_attention_layernorm.", + ".self_attn.output_proj.": ".self_attn.o_proj.", + ".ffn.gate_proj.": ".mlp.gate_proj.", + ".ffn.inner_proj.": ".mlp.up_proj.", + ".ffn.output_proj.": ".mlp.down_proj.", + ".layer_norm.": ".norm.", + }, + ) + weights = fs2_to_vllm_mapper.apply(weights) + + params = dict(self.named_parameters()) + + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights( + (self.reshape_fairseq2_weights(name, loaded_weight, params) + for name, loaded_weight in weights)) + + def flag_sharded_weights(self, params: dict[str, Parameter]): + """Sets the `is_sharded_weight` flag to True for all sharded weights""" + for name, param in params.items(): + modules = name.split(".") + if "norm" in name and len(param.size()) < 2: + # layer norms are not sharded + continue + elif any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # for now we repeat embedding layers for compatibility + continue + else: + # all other layers are sharded + set_weight_attrs(param, {"is_sharded_weight": True}) + + def reshape_fairseq2_weights( + self, + name: str, + loaded_weight: torch.Tensor, + params: dict[str, Parameter], + ) -> Tuple[str, torch.Tensor]: + """Reshape fairseq2's weights.""" + + def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: + attn_in = self.config.head_dim * n_heads + # check for a sharded weight on dim 0 + if attn_in // self.tp_size == w.size()[0]: + attn_in //= self.tp_size + n_heads //= self.tp_size + attn_out = self.config.hidden_size + return (w.view(n_heads, attn_in // n_heads // 2, 2, + attn_out).transpose(1, + 2).reshape(attn_in, attn_out)) + + modules = name.split(".") + + # rotary embeds should be sliced + if "k_proj" in modules: + loaded_weight = permute(loaded_weight, + self.config.num_key_value_heads) + + elif "q_proj" in modules: + loaded_weight = 
permute(loaded_weight, + self.config.num_attention_heads) + + # We make the loaded weights compatible with both + # full checkpoints and tp sharded checkpoints. + # Embeddings are repeated to fit the vocab size. + # Other weights are flagged for the weight_loader calls. + if any(emb in modules for emb in ["embed_tokens", "lm_head"]): + # Embeddings are sharded on dim 0 + dim = 0 + # In fairseq2, vocab size has to be divisible by tp_size + # so we don't worry about padding + if self.tp_size > 1 and loaded_weight.shape[ + dim] < self.config.vocab_size: + assert loaded_weight.shape[ + dim] * self.tp_size == self.config.vocab_size, \ + "vocab_size should be divisible by tp_size." + repeats = [1] * len(loaded_weight.size()) + repeats[dim] = self.tp_size + # repeat to match vocab size and to be easily 'narrow'able + loaded_weight = loaded_weight.repeat(repeats) + set_weight_attrs(params[name], {"is_sharded_weight": False}) + # if embeddings are sharded, the rest is too + if "embed_tokens" in modules: + self.flag_sharded_weights(params) + + return name, loaded_weight diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 63e7147f84e03..dbf9da50cc9de 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,7 +16,7 @@ """ PyTorch Fuyu model.""" import math from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) + TypedDict) import torch import torch.nn as nn @@ -30,13 +30,13 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import (ImageProcessorItems, ImageSize, MultiModalDataItems) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -183,7 +183,9 @@ def _apply_hf_processor_tokens_only( ) -> list[int]: # HF processor adds boa_token_id tokenizer = self.info.get_tokenizer() - boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + vocab = tokenizer.get_vocab() + + boa_token_id = vocab["<0x04>"] return prompt_tokens + [boa_token_id] @@ -202,6 +204,7 @@ def _get_prompt_replacements( ) -> list[PromptReplacement]: hf_config = self.info.get_hf_config() bos_token_id = hf_config.bos_token_id + assert isinstance(bos_token_id, int) tokenizer = self.info.get_tokenizer() eot_token_id = tokenizer.bos_token_id @@ -215,9 +218,13 @@ def get_replacement_fuyu(item_idx: int): image_width=image_size.width, image_height=image_size.height, ) + image_tokens = ([_IMAGE_TOKEN_ID] * ncols + + [_NEWLINE_TOKEN_ID]) * nrows - return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + - [bos_token_id]) + return PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ) return [ PromptReplacement( @@ -227,26 +234,6 @@ def get_replacement_fuyu(item_idx: int): ) ] - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> 
MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only |SPEAKER| (image) tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor, info=FuyuProcessingInfo, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 6de0c866bc2f0..b23aba829c549 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" -from functools import lru_cache +from functools import cache from typing import Iterable, List, Optional, Set, Tuple, Union import torch @@ -48,7 +48,7 @@ logger = init_logger(__name__) -@lru_cache(maxsize=None) +@cache def _get_gemma_act_fn( hidden_act: Optional[str], hidden_activation: Optional[str], diff --git a/vllm/model_executor/models/glm4_vision_encoder.py b/vllm/model_executor/models/glm4_vision_encoder.py index 39a5736eb199b..51922e6f2d03d 100644 --- a/vllm/model_executor/models/glm4_vision_encoder.py +++ b/vllm/model_executor/models/glm4_vision_encoder.py @@ -42,7 +42,8 @@ def forward(self, images: torch.Tensor) -> torch.Tensor: torch.Tensor Transformed tensor with shape (B, L, D) """ - images = images.to(self.proj.weight.device) + images = images.to(device=self.proj.weight.device, + dtype=self.proj.weight.dtype) x = self.proj(images) x = x.flatten(2).transpose(1, 2) cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 1656a3cc9e46d..2f1aa2d68653c 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -258,13 +258,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.transformer = GPT2Model(vllm_config=vllm_config, prefix=maybe_prefix( prefix, "transformer")) + self.lm_head = ParallelLMHead(self.config.vocab_size, + self.config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.lm_head") if self.config.tie_word_embeddings: - self.lm_head = self.transformer.wte - else: - self.lm_head = ParallelLMHead(self.config.vocab_size, - self.config.hidden_size, - quant_config=quant_config, - prefix=f"{prefix}.lm_head") + self.lm_head = self.lm_head.tie_weights(self.transformer.wte) + self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( @@ -309,15 +309,12 @@ def load_weights(self, weights: Iterable[Tuple[str, params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: Set[str] = set() for name, loaded_weight in weights: - if name.startswith("lm_head"): - # GPT-2 ties the weights of the embedding layer and the final - # linear layer. - continue if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. # NOTE: "c_attn.bias" should not be skipped. continue - if not name.startswith("transformer."): + if not name.startswith("transformer.") and not name.startswith( + "lm_head"): name = "transformer." 
+ name if is_pp_missing_parameter(name, self): diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index ddd2d7a16b242..543b4e2f5e286 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -29,8 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -44,9 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -518,29 +516,3 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params - - # If this function is called, it should always initialize KV cache scale - # factors (or else raise an exception). Thus, handled exceptions should - # make sure to leave KV cache scale factors in a known good (dummy) state - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, tp_rank, tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type): - if not isinstance(self.model.layers[layer_idx], nn.Identity): - layer_self_attn = self.model.layers[layer_idx].self_attn - - if current_platform.is_rocm(): - # The scaling factor convention we are assuming is - # quantized_value * scaling_factor ~= true_value - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 - if hasattr(layer_self_attn.attn, "_k_scale"): - layer_self_attn.attn._k_scale = scaling_factor - layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 51296ef0cc08e..cdf9414d5949c 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -348,6 +348,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.lora_config = lora_config + self.quant_config = quant_config # Required by MixtralForCausalLM self.model = GraniteMoeModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) @@ -428,10 +429,10 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w1_name = n.replace( '.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w1.weight" % e) + f".block_sparse_moe.experts.{e}.w1.weight") w3_name = n.replace( 
'.block_sparse_moe.input_linear.weight', - ".block_sparse_moe.experts.%d.w3.weight" % e) + f".block_sparse_moe.experts.{e}.w3.weight") w1_param, w3_param = p[e].chunk(2, dim=0) assert w1_name not in new_weights assert w3_name not in new_weights @@ -441,7 +442,7 @@ def load_weights(self, weights: Iterable[Tuple[str, for e in range(p.size(0)): w2_name = n.replace( '.block_sparse_moe.output_linear.weight', - ".block_sparse_moe.experts.%d.w2.weight" % e) + f".block_sparse_moe.experts.{e}.w2.weight") w2_param = p[e] assert w2_name not in new_weights new_weights[w2_name] = w2_param diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 4c353ae6ffc13..37b91a803d71e 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn -from transformers import PretrainedConfig from typing_extensions import TypeIs, TypeVar from vllm.logger import init_logger @@ -19,9 +18,6 @@ logger = init_logger(__name__) -# The type of HF config -C_co = TypeVar("C_co", bound=PretrainedConfig, covariant=True) - # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa # which has T = List[torch.Tensor] @@ -34,7 +30,7 @@ @runtime_checkable -class VllmModel(Protocol[C_co, T_co]): +class VllmModel(Protocol[T_co]): """The interface required for all models in vLLM.""" def __init__( @@ -97,7 +93,7 @@ def is_vllm_model( @runtime_checkable -class VllmModelForTextGeneration(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForTextGeneration(VllmModel[T], Protocol[T]): """The interface required for all generative models in vLLM.""" def compute_logits( @@ -143,7 +139,7 @@ def is_text_generation_model( @runtime_checkable -class VllmModelForPooling(VllmModel[C_co, T], Protocol[C_co, T]): +class VllmModelForPooling(VllmModel[T], Protocol[T]): """The interface required for all pooling models in vLLM.""" def pooler( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 00ff2351efe34..c36c55f69d7d3 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -30,8 +30,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -44,7 +43,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -412,6 +411,11 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight) loaded_params.add(scale_name) continue + if "scale" in name: + # Remapping the name of FP8 kv-scale. 
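The remapping call that follows this comment exists because FP8 checkpoints often store KV-cache scales under names that differ from vLLM's registered parameters; the name is translated before lookup and the weight is skipped when no matching parameter exists. A minimal sketch of that idea, with illustrative name patterns that are not necessarily the ones maybe_remap_kv_scale_name actually handles:

from typing import Optional

def remap_scale_name(name: str, params: dict) -> Optional[str]:
    # Illustrative only: assume the checkpoint stores scales on the attention
    # module (e.g. "...self_attn.k_scale") while the vLLM parameter lives one
    # level deeper (e.g. "...self_attn.attn.k_scale").
    for suffix in (".k_scale", ".v_scale"):
        if name.endswith(suffix) and not name.endswith(".attn" + suffix):
            candidate = name[:-len(suffix)] + ".attn" + suffix
            return candidate if candidate in params else None
    return name if name in params else None

params = {"model.layers.0.self_attn.attn.k_scale": None}
assert (remap_scale_name("model.layers.0.self_attn.k_scale", params)
        == "model.layers.0.self_attn.attn.k_scale")
assert remap_scale_name("model.layers.0.self_attn.q_scale", params) is None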
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -431,10 +435,6 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Remapping the name of FP8 kv-scale. - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue if is_pp_missing_parameter(name, self): continue @@ -446,32 +446,6 @@ def load_weights(self, weights: Iterable[Tuple[str, loaded_params.add(name) return loaded_params - # If this function is called, it should always initialize KV cache scale - # factors (or else raise an exception). Thus, handled exceptions should - # make sure to leave KV cache scale factors in a known good (dummy) state - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, tp_rank, tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type): - if not isinstance(self.layers[layer_idx], nn.Identity): - layer_self_attn = self.layers[layer_idx].self_attn - - if current_platform.is_rocm(): - # The scaling factor convention we are assuming is - # quantized_value * scaling_factor ~= true_value - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 - if hasattr(layer_self_attn.attn, "_k_scale"): - layer_self_attn.attn._k_scale = scaling_factor - layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") - class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { @@ -599,9 +573,6 @@ def load_weights(self, weights: Iterable[Tuple[str, self.maybe_remap_mistral(name, loaded_weight) for name, loaded_weight in weights) - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - self.model.load_kv_cache_scales(quantization_param_path) - # This function is used to remap the mistral format as # used by Mistral and Llama <=2 def maybe_remap_mistral( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 722fff98d5c19..296af2aac5660 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -5,9 +5,11 @@ import torch import torch.nn as nn +from packaging.version import Version from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, SiglipVisionConfig) +from transformers import __version__ as TRANSFORMERS_VERSION from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -22,7 +24,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, + MultiModalInputs, MultiModalKwargs, NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) @@ -313,13 +315,14 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: + processor = 
self.info.get_hf_processor(**hf_processor_mm_kwargs) hf_config = self.info.get_hf_config() - image_token_id = hf_config.image_token_index + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() - processor = self.info.get_hf_processor() - image_token = processor.image_token - image_break_token = processor.image_break_token - image_end_token = processor.image_end_token + image_break_id = vocab[processor.image_break_token] + image_token_id = hf_config.image_token_index + image_end_id = vocab[processor.image_end_token] vision_config = hf_config.vision_config assert isinstance(vision_config, PixtralVisionConfig) @@ -334,10 +337,10 @@ def get_replacement(item_idx: int): image_height=image_size.height, ) - tokens = ([image_token] * ncols + [image_break_token]) * nrows - tokens[-1] = image_end_token + tokens = ([image_token_id] * ncols + [image_break_id]) * nrows + tokens[-1] = image_end_id - return "".join(tokens) + return tokens return [ PromptReplacement( @@ -716,6 +719,27 @@ def load_weights(self, weights: Iterable[Tuple[str, return loader.load_weights(weights) +class MantisProcessingInfo(LlavaProcessingInfo): + + def get_hf_processor(self): + hf_config = self.get_hf_config() + vision_info = self.get_vision_encoder_info() + + if Version(TRANSFORMERS_VERSION) < Version("4.48"): + # BUG: num_additional_image_tokens = 0 but treated as 1, + # so we set vision_feature_select_strategy to None to offset this + vision_feature_select_strategy = None + else: + # FIXED: https://github.com/huggingface/transformers/pull/33424/files#diff-6a37acc21efcadaae622b079b2712a131131448ff64262bd219aa346aeec38faL150 + vision_feature_select_strategy = hf_config.vision_feature_select_strategy # noqa: E501 + + return self.ctx.get_hf_processor( + LlavaProcessor, + patch_size=vision_info.get_patch_size(), + vision_feature_select_strategy=vision_feature_select_strategy, + ) + + class MantisMultiModalProcessor(LlavaMultiModalProcessor): def apply( @@ -723,7 +747,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: hf_config = self.info.get_hf_config() image_token_id = hf_config.image_token_index @@ -782,7 +806,7 @@ def get_replacement_mantis(item_idx: int): for modality, placeholders in mm_placeholders.items() } - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, @@ -794,7 +818,7 @@ def get_replacement_mantis(item_idx: int): # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` @MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor, - info=LlavaProcessingInfo, + info=MantisProcessingInfo, dummy_inputs=LlavaDummyInputsBuilder) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index c9283e0c5ba20..5b0f35b08646b 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -554,10 +554,12 @@ def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: # Preserve the order of modalities if there are multiple of them # from the order of kwargs. 
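The loop below now routes embedding inputs ("image_embeds", "video_embeds") to the same modality buckets as their pixel counterparts, and a plain dict preserves the order in which modalities first appear in the kwargs. A toy, self-contained illustration of that routing (key names come from the hunk; values are placeholders):

IMAGE_KEYS = {"pixel_values", "image_embeds"}
VIDEO_KEYS = {"pixel_values_videos", "video_embeds"}

def group_modalities(**kwargs):
    # Plain dicts preserve insertion order, so the first key seen for a
    # modality decides where that modality lands in the result.
    modalities = {}
    for key, value in kwargs.items():
        if key in IMAGE_KEYS and "images" not in modalities:
            modalities["images"] = value
        if key in VIDEO_KEYS and "videos" not in modalities:
            modalities["videos"] = value
    return modalities

# Video embeddings passed before image pixels -> videos are processed first.
assert list(group_modalities(video_embeds=..., pixel_values=...)) == \
    ["videos", "images"]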
for input_key in kwargs: - if input_key == "pixel_values" and "images" not in modalities: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: modalities["images"] = self._parse_and_validate_image_input( **kwargs) - if input_key == "pixel_values_videos" and "videos" not in modalities: # noqa E501 + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: modalities["videos"] = self._parse_and_validate_video_input( **kwargs) @@ -814,7 +816,7 @@ def apply_pooling(self, image_features, stride=2): return image_feature def get_multimodal_embeddings( - self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: modalities = self._parse_and_validate_multimodal_inputs(**kwargs) if not modalities: return None @@ -840,8 +842,7 @@ def get_multimodal_embeddings( def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[List[Tuple[NestedTensors, - str]]] = None, + multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: @@ -850,6 +851,34 @@ def get_input_embeddings( [self.config.image_token_index, self.config.video_token_index]) return inputs_embeds + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[NestedTensors] = None, + video_input: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_index, + ) + + if video_input is not None: + video_embeds = self._process_video_pixels(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_index, + ) + + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -869,13 +898,21 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) - input_ids = None + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py new file mode 100644 index 0000000000000..eb4282d62005a --- /dev/null +++ b/vllm/model_executor/models/minicpmo.py @@ -0,0 +1,811 @@ +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. 
team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" +from functools import partial +from itertools import accumulate +from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) + +import torch +import torch.types +from torch import nn +from transformers.modeling_outputs import BaseModelOutputWithPast +from transformers.models.whisper.modeling_whisper import ( + ACT2FN, WHISPER_ATTENTION_CLASSES, WhisperConfig, WhisperEncoder) + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import MultiModalFieldConfig +from vllm.multimodal.parse import (ModalityData, ModalityDataItems, + MultiModalDataItems, MultiModalDataParser, + VideoItem) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + PromptReplacement) +from vllm.multimodal.profiling import ProcessorInputs +from vllm.sequence import IntermediateTensors + +from .minicpmv import (MiniCPMV2_6, MiniCPMVDummyInputsBuilder, + MiniCPMVEmbeddingItems, MiniCPMVMultiModalDataParser, + MiniCPMVMultiModalProcessor, MiniCPMVProcessingInfo) +from .utils import AutoWeightsLoader, maybe_prefix + +CPU_DEVICE = torch.device("cpu") + +MiniCPMOEmbeddingItems = MiniCPMVEmbeddingItems + + +class MiniCPMOAudioFeatureInputs(TypedDict): + type: Literal["audio_features"] + data: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices, num_channels, length)` + Slice here means chunk. Audio that is too long will be split into slices, + which is the same as image. + Padding is used therefore `data` is `torch.Tensor`. + """ + + audio_feature_lens: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices)` + + This should be feature length of each audio slice, + which equals to `data.shape[-1]` + """ + + audio_bounds: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices, 2)` + + This should be in `(start, stop)` format. + """ + + +class MiniCPMOAudioEmbeddingInputs(TypedDict): + type: Literal["audio_embeds"] + data: List[torch.Tensor] + """ + Shape: `(batch_size * num_images * num_slices, hidden_size)` + + `hidden_size` must match the hidden size of language model backbone. + instead of a batched tensor. + Length of each slice may vary, so pass it as a list. + """ + audio_bounds: torch.Tensor + """ + Shape: `(batch_size * num_audios * num_slices, 2)` + + This should be in `(start, stop)` format. 
+ """ + + +MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs, + MiniCPMOAudioEmbeddingInputs] + + +class MiniCPMOAudioEmbeddingItems(MiniCPMOEmbeddingItems): + + def __init__(self, data: Dict) -> None: + super().__init__(data, "audio") + audio_embeds = self.data.get("audio_embeds", None) + if audio_embeds is None: + raise ValueError("Incorrect type of video_embeds", + "Got type: None") + self.data["audio_embeds"] = audio_embeds + + def get(self, index: int) -> object: + return self.data["audio_embeds"][index] + + +class MiniCPMOMultiModalDataParser(MiniCPMVMultiModalDataParser): + + def _parse_audio_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return MiniCPMOAudioEmbeddingItems(data) + return super()._parse_audio_data(data) + + +class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo): + audio_pattern = "()" + + def get_supported_mm_modalities(self) -> List[str]: + return ["image", "video", "audio"] + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None, "audio": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self.get_max_image_tokens(), + "audio": self.get_max_audio_tokens(), + "video": self.get_max_video_tokens(seq_len) + } + + def get_default_audio_pool_step(self) -> int: + return 2 + + def get_default_audio_sampling_rate(self) -> int: + return 16000 + + def get_chunk_length(self) -> int: + return self.get_hf_config().audio_chunk_length + + def get_max_audio_tokens_per_chunk(self) -> int: + pool_step = self.get_default_audio_pool_step() + fbank_feat_in_chunk = 100 + cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1 + num_audio_tokens = (cnn_feat_in_chunk - pool_step) // pool_step + 1 + return num_audio_tokens + 2 # + + def get_max_audio_chunks_with_most_features(self) -> int: + return 30 + + def get_audio_len_by_num_chunks(self, num_chunks: int) -> int: + sampling_rate = self.get_default_audio_sampling_rate() + # exclude + num_tokens_per_chunk = self.get_max_audio_tokens_per_chunk() - 2 + return int(num_chunks * sampling_rate / num_tokens_per_chunk) + 1 + + def get_num_frames_with_most_features(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) + max_audios = mm_config.limit_per_prompt.get("audio", 1) + + # count tokens + # which are not in get_max_image_tokens + max_image_tokens = self.get_max_image_tokens( + ) * max_images + 4 * max_images + max_audio_tokens = self.get_max_audio_tokens( + ) * max_audios + 2 * max_audios + max_total_frames = self.get_max_video_frames(seq_len - + max_image_tokens - + max_audio_tokens) + + num_frames = max(max_total_frames // max(max_videos, 1), 1) + + return num_frames + + +class MiniCPMODummyInputsBuilder(MiniCPMVDummyInputsBuilder): + + def get_dummy_processor_inputs( + self, seq_len: int, mm_counts: Mapping[str, + int]) -> ProcessorInputs: + num_audios = mm_counts.get("audio", 0) + audio_len = self.info.get_max_audio_chunks_with_most_features() * \ + self.info.get_default_audio_sampling_rate() + + processor_inputs = super().get_dummy_processor_inputs( + seq_len, mm_counts) + mm_data = { + "image": + processor_inputs.mm_data["image"], + "video": + processor_inputs.mm_data["video"], + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + audio_prompt_texts = 
self.info.audio_pattern * num_audios + + return ProcessorInputs(prompt_text=processor_inputs.prompt_text + \ + audio_prompt_texts, + mm_data=mm_data) + + +class MiniCPMOMultiModalProcessor( + MiniCPMVMultiModalProcessor, + BaseMultiModalProcessor[MiniCPMOProcessingInfo]): + + def _get_data_parser(self) -> MultiModalDataParser: + return MiniCPMOMultiModalDataParser( + target_sr=self.info.get_default_audio_sampling_rate()) + + def get_audio_prompt_texts(self, + audio_lens: int, + chunk_input: bool = True, + chunk_length: int = 1) -> str: + return self.info.get_hf_processor().get_audio_placeholder( + audio_lens, chunk_input, chunk_length) + + def get_special_tokens(self) -> Dict[str, torch.Tensor]: + tokenizer = self.info.get_tokenizer() + special_tokens = super().get_special_tokens() + if hasattr(tokenizer, "audio_start_id"): + special_tokens["audio_start_id"] = torch.tensor( + tokenizer.audio_start_id) + special_tokens["audio_end_id"] = torch.tensor( + tokenizer.audio_end_id) + return special_tokens + + def process_audios(self, mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + audios = mm_data.pop("audios", []) + audio_embeds = mm_data.pop("audio_embeds", []) + if isinstance(audios, (list, torch.Tensor)) and len(audios) > 0: + audio_outputs = { + "audio_lens": [], + "audio_features": [], + "audio_feature_lens": [], + "audio_num_segments": [] + } + for audio in audios: + single_audio_outputs = super().call_base_hf_processor( + prompt=self.info.audio_pattern, + mm_data={ + "audios": audio, + "chunk_input": True + }, + mm_kwargs=mm_kwargs) + audio_outputs["audio_lens"].append(len(audio)) + audio_outputs["audio_features"].append( + single_audio_outputs["audio_features"]) + audio_outputs["audio_num_segments"].append( + len(single_audio_outputs["audio_feature_lens"][0])) + audio_outputs["audio_feature_lens"] += \ + single_audio_outputs["audio_feature_lens"] + audio_outputs["audio_features"] = [ + audio_feature for single_audio_features in \ + audio_outputs["audio_features"] + for audio_feature in single_audio_features + ] + audio_outputs["audio_feature_lens"] = torch.cat( + audio_outputs["audio_feature_lens"]) + elif len(audio_embeds): + audio_outputs = { + "audio_lens": [ + self.info.get_audio_len_by_num_chunks( + sum(chunk_embeds.shape[0] + for chunk_embeds in single_audio_embeds)) + for single_audio_embeds in audio_embeds + ], + "audio_embeds": [ + chunk_embeds for single_audio_embeds in audio_embeds + for chunk_embeds in single_audio_embeds + ], + "audio_num_segments": [ + len(single_audio_embeds) + for single_audio_embeds in audio_embeds + ] + } + else: + audio_outputs = {} + return audio_outputs + + def get_placeholder_match_pattern(self) -> str: + return r"\(<(image|video|audio)>./\)" + + def get_placeholder_split_pattern(self) -> str: + return r"\(<(?:image|video|audio)>./\)" + + def process_mm_inputs(self, mm_data, mm_kwargs) -> object: + return { + "image": self.process_images(mm_data, mm_kwargs), + "video": self.process_videos(mm_data, mm_kwargs), + "audio": self.process_audios(mm_data, mm_kwargs) + } + + def get_modality_num_counter(self, modality: str) -> str: + if modality == "audio": + return "audio_lens" + return super().get_modality_num_counter(modality) + + def get_num_slices_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> int: + if modality == "audio": + return inputs["audio"]["audio_num_segments"][index] + return super().get_num_slices_by_modality(inputs, modality, index) + + def 
get_prompt_texts_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> str: + if modality == "audio": + return self.get_audio_prompt_texts( + inputs["audio"]["audio_lens"][index]) + return super().get_prompt_texts_by_modality(inputs, modality, index) + + def _get_prompt_replacements( + self, mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: + placeholder = { + "image": self.info.image_pattern, + "video": self.info.video_pattern, + "audio": self.info.audio_pattern + } + + def get_replacement_minicpmv(item_idx: int, modality: str): + if modality == "image": + return self.get_image_prompt_texts( + mm_items["image"].get_image_size(item_idx), item_idx) + elif modality == "video": + return self.get_video_prompt_texts( + mm_items["video"].get_frame_size(item_idx), + mm_items["video"].get_num_frames(item_idx)) + else: # audio + if isinstance(mm_items["audio"], MiniCPMOAudioEmbeddingItems): + single_audio_embeds = mm_items["audio"].get(item_idx) + audio_len = self.info.get_audio_len_by_num_chunks( + sum(chunk_embeds.shape[0] + for chunk_embeds in single_audio_embeds)) + return self.get_audio_prompt_texts(audio_len) + return self.get_audio_prompt_texts( + len(mm_items["audio"].get(item_idx))) + + return [ + PromptReplacement(modality=modality, + target=placeholder[modality], + replacement=partial(get_replacement_minicpmv, + modality=modality)) + for modality in ("image", "video", "audio") + ] + + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + def get_slices(num_slices: List[int]) -> List[int]: + slice_indices = [0] + list(accumulate(num_slices)) + slices = [(slice_indices[i], slice_indices[i + 1]) + for i in range(len(num_slices))] + return [slice(*slice_item) for slice_item in slices] + + audio_slices = get_slices( + hf_inputs.get("audio_num_slices", torch.empty(0))) + return dict( + **super()._get_mm_fields_config(hf_inputs, hf_processor_mm_kwargs), + audio_features=MultiModalFieldConfig.flat("audio", audio_slices), + audio_feature_lens=MultiModalFieldConfig.flat( + "audio", audio_slices), + audio_num_slices=MultiModalFieldConfig.batched("audio"), + audio_orders_in_mm_data=MultiModalFieldConfig.batched("audio"), + audio_embeds=MultiModalFieldConfig.flat("audio", audio_slices)) + + +class MultiModalProjector(nn.Module): + + def __init__(self, in_dim: int, out_dim: int): + super().__init__() + self.linear1 = nn.Linear(in_features=in_dim, + out_features=out_dim, + bias=True) + self.relu = nn.ReLU() + self.linear2 = nn.Linear(in_features=out_dim, + out_features=out_dim, + bias=True) + + def forward(self, audio_features: torch.Tensor) -> torch.Tensor: + hidden_states = self.relu(self.linear1(audio_features)) + hidden_states = self.linear2(hidden_states) + return hidden_states + + +class MiniCPMWhisperEncoderLayer(nn.Module): + + def __init__(self, config: WhisperConfig, layer_idx: int = None): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = WHISPER_ATTENTION_CLASSES[ + config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + layer_idx=layer_idx, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = 
nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + residual = hidden_states + past_key_values = None + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, past_key_values = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_value=past_key_values, + ) + hidden_states = nn.functional.dropout(hidden_states, + p=self.dropout, + training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, + p=self.activation_dropout, + training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, + p=self.dropout, + training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() + or torch.isnan(hidden_states).any()): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, + min=-clamp_value, + max=clamp_value) + + outputs = (hidden_states, ) + + return outputs + + +class MiniCPMWhisperEncoder(WhisperEncoder): + + def __init__(self, config: WhisperConfig): + super().__init__(config) + self.layers = nn.ModuleList([ + MiniCPMWhisperEncoderLayer(config, layer_idx=i) + for i in range(config.encoder_layers) + ]) + + def forward( + self, + input_features: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + ) -> BaseModelOutputWithPast: + # Ignore copy + input_features = input_features.to(dtype=self.conv1.weight.dtype, + device=self.conv1.weight.device) + + inputs_embeds = nn.functional.gelu(self.conv1(input_features)) + inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds)) + + inputs_embeds = inputs_embeds.permute(0, 2, 1) + + embed_pos = self.embed_positions.weight + + embed_pos = embed_pos[:inputs_embeds.shape[1], :] + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, + p=self.dropout, + training=self.training) + + encoder_states = () + + for idx, encoder_layer in enumerate(self.layers): + encoder_states = encoder_states + (hidden_states, ) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + # Ignore copy + if to_drop: + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + ) + + hidden_states = layer_outputs[0] + + hidden_states = self.layer_norm(hidden_states) + encoder_states = encoder_states + (hidden_states, ) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + ) + + +@MULTIMODAL_REGISTRY.register_processor( + MiniCPMOMultiModalProcessor, + info=MiniCPMOProcessingInfo, + dummy_inputs=MiniCPMODummyInputsBuilder) +class MiniCPMO(MiniCPMV2_6): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + self.apm = 
self.init_audio_module(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "apm")) + + def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""): + # Do not use parameters temporarily + audio_config = self.config.audio_config + model = MiniCPMWhisperEncoder(audio_config) + audio_output_dim = int(audio_config.encoder_ffn_dim // 4) + self.audio_avg_pooler = \ + nn.AvgPool1d(self.config.audio_pool_step, + stride=self.config.audio_pool_step) + self.audio_projection_layer = \ + MultiModalProjector(in_dim=audio_output_dim,out_dim=self.embed_dim) + self.audio_encoder_layer = -1 + return model + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + loader = AutoWeightsLoader(self, skip_prefixes=["tts"]) + return loader.load_weights(weights) + + def subsequent_chunk_mask( + self, + size: int, + chunk_size: int, + num_left_chunks: int = -1, + device: torch.device = CPU_DEVICE, + num_lookhead: int = 0, + ) -> torch.Tensor: + ret = torch.zeros(size, size, device=device, dtype=torch.bool) + for i in range(size): + if num_left_chunks < 0: + start = 0 + else: + start = max((i // chunk_size - num_left_chunks) * chunk_size, + 0) + ending = min((i // chunk_size + 1) * chunk_size + num_lookhead, + size) + ret[i, start:ending] = True + return ret + + def _get_feat_extract_output_lengths(self, + input_lengths: torch.LongTensor): + input_lengths_after_cnn = (input_lengths - 1) // 2 + 1 + input_lengths_after_pooling = ( + input_lengths_after_cnn - + self.config.audio_pool_step) // self.config.audio_pool_step + 1 + input_lengths_after_pooling = input_lengths_after_pooling.to( + dtype=torch.int32) + + return input_lengths_after_cnn, input_lengths_after_pooling + + # Copied from HF repo of MiniCPM-o-2_6, + # designed for batched inputs and outputs + def get_audio_hidden_states(self, data: MiniCPMOAudioInputs, + chunk_length: int) -> torch.Tensor: + wavforms = data.get( + "data", + []) # (bs, 80, frames) or [], multi audios need filled in advance + audio_feature_lens_raw = [data.get("audio_feature_lens", + [])] # list, [[x1, x2], [y1], [z1]] + + # exist audio + if len(wavforms) > 0: + audio_feature_lens = torch.hstack(audio_feature_lens_raw) + batch_size, _, max_mel_seq_len = wavforms.shape + max_seq_len = (max_mel_seq_len - 1) // 2 + 1 + + # Create a sequence tensor of shape (batch_size, max_seq_len) + seq_range = (torch.arange( + 0, + max_seq_len, + dtype=audio_feature_lens.dtype, + device=audio_feature_lens.device).unsqueeze(0).expand( + batch_size, max_seq_len)) + lengths_expand = audio_feature_lens.unsqueeze(1).expand( + batch_size, max_seq_len) + # Create mask + padding_mask = seq_range >= lengths_expand # 1 for padded values + + audio_attention_mask_ = padding_mask.view( + batch_size, 1, 1, max_seq_len).expand(batch_size, 1, + max_seq_len, max_seq_len) + audio_attention_mask = audio_attention_mask_.to( + dtype=self.apm.conv1.weight.dtype, + device=self.apm.conv1.weight.device) + + if chunk_length > 0: + chunk_num_frame = int(chunk_length * 50) + chunk_mask = self.subsequent_chunk_mask( + size=max_seq_len, + chunk_size=chunk_num_frame, + num_left_chunks=-1, + device=audio_attention_mask_.device, + ) + audio_attention_mask_ = torch.logical_or( + audio_attention_mask_, torch.logical_not(chunk_mask)) + + audio_attention_mask[audio_attention_mask_] = float("-inf") + audio_states = self.apm( + wavforms, attention_mask=audio_attention_mask).hidden_states[ + self.audio_encoder_layer] + audio_embeds = self.audio_projection_layer(audio_states) + + audio_embeds = 
audio_embeds.transpose(1, 2) + audio_embeds = self.audio_avg_pooler(audio_embeds) + audio_embeds = audio_embeds.transpose(1, 2) + + _, feature_lens_after_pooling = \ + self._get_feat_extract_output_lengths(audio_feature_lens) + + num_audio_tokens = feature_lens_after_pooling + + final_audio_embeds = [] + idx = 0 + for i in range(len(audio_feature_lens_raw)): + target_audio_embeds = [] + for _ in range(len(audio_feature_lens_raw[i])): + target_audio_embeds.append( + audio_embeds[idx, :num_audio_tokens[idx], :]) + idx += 1 + final_audio_embeds.append(target_audio_embeds) + return final_audio_embeds + else: + return [] + + def get_embedding_with_audios(self, vlm_embedding: torch.Tensor, + audio_inputs: Optional[MiniCPMOAudioInputs], + chunk_length: int) -> torch.Tensor: + device, dtype = vlm_embedding.device, vlm_embedding.dtype + if audio_inputs["type"] == "audio_embeds": + audio_embeddings = audio_inputs["data"] + audio_embeddings = [ + audio_embeddings[i].to(device=device, dtype=dtype) + for i in range(len(audio_embeddings)) + ] + else: + audio_embeddings = self.get_audio_hidden_states( + audio_inputs, chunk_length)[0] + if audio_embeddings is None or len(audio_embeddings) == 0: + return vlm_embedding + audio_bounds = audio_inputs["audio_bounds"] + if self.config.chunk_input: + audio_embs = torch.cat(audio_embeddings, dim=0).to(device=device, + dtype=dtype) + audio_start_pos = 0 + for bound in audio_bounds: + audio_len = bound[1] - bound[0] + vlm_embedding[bound[0]:bound[1]] = audio_embs[ + audio_start_pos:audio_start_pos + audio_len, :] + audio_start_pos += audio_len + else: + for embs, bound in zip(audio_embeddings, audio_bounds): + audio_indices = torch.arange(bound[0], + bound[1], + dtype=torch.long).to(device) + + if embs.shape[0] != len(audio_indices): + raise ValueError( + "Shape mismatch: Trying to assign embeddings " + f"of shape {embs.shape} " + f"to input indices of length {len(audio_indices)}") + vlm_embedding[audio_indices] = embs.to(dtype) + return vlm_embedding + + def _get_audio_bounds(self, input_ids: torch.Tensor, + audio_start_id: torch.Tensor, + audio_end_id: torch.Tensor) -> torch.Tensor: + audio_start_tokens, = torch.where(input_ids == audio_start_id[0]) + audio_start_tokens += 1 + audio_end_tokens, = torch.where(input_ids == audio_end_id[0]) + valid_audio_nums = max(len(audio_start_tokens), len(audio_end_tokens)) + return torch.hstack([ + audio_start_tokens[:valid_audio_nums].unsqueeze(-1), + audio_end_tokens[:valid_audio_nums].unsqueeze(-1) + ]) + + def _parse_and_validate_audio_inputs( + self, input_ids: torch.Tensor, + **kwargs: object) -> Tuple[MiniCPMOAudioInputs]: + audio_features = kwargs.pop("audio_features", []) + audio_feature_lens = kwargs.pop("audio_feature_lens", []) + audio_embeds = kwargs.pop("audio_embeds", None) + audio_start_id = kwargs.pop("audio_start_id", None) + audio_end_id = kwargs.pop("audio_end_id", None) + if audio_embeds is not None: + audio_embeds = [ + audio_embeds[i][j] for i in range(len(audio_embeds)) + for j in range(len(audio_embeds[i])) + ] + return MiniCPMOAudioEmbeddingInputs( + audio_bounds=self._get_audio_bounds(input_ids, audio_start_id, + audio_end_id), + data=audio_embeds, + type="audio_embeds") + if len(audio_features) > 0: + audio_features_all = [ + i.permute(1, 0) for audio_feature in audio_features + for i in audio_feature + ] + audio_features = torch.nn.utils.rnn.pad_sequence( + audio_features_all, batch_first=True, + padding_value=0.0).permute(0, 2, 1) + audio_feature_lens = torch.cat( + [item for item in 
audio_feature_lens]) + + return MiniCPMOAudioFeatureInputs( + audio_bounds=self._get_audio_bounds(input_ids, audio_start_id, + audio_end_id), + data=audio_features, + audio_feature_lens=audio_feature_lens, + type="audio_features") + return None + + def _parse_and_validate_inputs(self, input_ids: torch.Tensor, + **kwargs: object): + image_inputs = self._parse_and_validate_image_inputs( + input_ids, **kwargs) + if not any("audio" in key for key in kwargs): + return image_inputs, None + audio_inputs = self._parse_and_validate_audio_inputs( + input_ids, **kwargs) + return image_inputs, audio_inputs + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: Any, + ) -> torch.Tensor: + if intermediate_tensors is not None: + vlm_embeddings = None + else: + image_inputs, audio_inputs = \ + self._parse_and_validate_inputs(input_ids, **kwargs) + vlm_embeddings, _ = self.get_embedding_with_vision( + input_ids, image_inputs) + + if audio_inputs is not None: + vlm_embeddings = self.get_embedding_with_audios( + vlm_embeddings, audio_inputs, + self.config.audio_chunk_length) + + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + # for `torch.compile` integration + input_ids = None + + output = self.llm.model( + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=vlm_embeddings, + ) + return output diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 1aa529056893b..bf967d33a3176 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -22,21 +22,21 @@ """Inference-only MiniCPM-V model compatible with HuggingFace weights.""" import math import re +from collections import Counter from functools import cached_property, partial -from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, - Set, Tuple, TypedDict, Union) +from itertools import accumulate +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Set, Tuple, TypedDict, Union) +import numpy as np import torch import torch.types from PIL import Image from torch import nn -from transformers import PretrainedConfig -from typing_extensions import NotRequired +from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) @@ -48,33 +48,30 @@ from vllm.model_executor.models.qwen2 import Qwen2ForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputs, PlaceholderRange) +from vllm.multimodal.parse import (ImageItem, ImageSize, ModalityData, + ModalityDataItems, MultiModalDataItems, + MultiModalDataParser, 
VideoItem) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import AutoWeightsLoader, maybe_prefix -RawImageType = Union[Image.Image, torch.Tensor] - - -class MiniCPMVRawImageInput(TypedDict): - """Input mapper input with auxiliary data for computing image bounds.""" - image: RawImageType +CPU_DEVICE = torch.device("cpu") - # Image bounds token ids in 0-dim scaler tensor. - im_start_id: torch.Tensor - im_end_id: torch.Tensor - slice_start_id: NotRequired[torch.Tensor] - slice_end_id: NotRequired[torch.Tensor] +RawImageType = Union[Image.Image, torch.Tensor] class MiniCPMVImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: List[torch.Tensor] """ - Shape: `(batch_size * num_images, num_channels, height, width)` + Shape: `(batch_size * num_images * num_slices, num_channels, height, width)` Note that the image size may vary, so we pass it as a list instead of a batched tensor. @@ -82,14 +79,14 @@ class MiniCPMVImagePixelInputs(TypedDict): image_bounds: torch.Tensor """ - Shape: `(batch_size * num_images, 2)` + Shape: `(batch_size * num_images * num_slices, 2)` This should be in `(start, stop)` format. """ tgt_sizes: torch.Tensor """ - Shape: `(batch_size * num_images, 2)` + Shape: `(batch_size * num_images * num_slices, 2)` This should be in `(height, width)` format. """ @@ -99,7 +96,8 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] data: torch.Tensor """ - Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + Shape: `(batch_size * num_images * num_slices, + image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. instead of a batched tensor. @@ -107,7 +105,7 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): image_bounds: torch.Tensor """ - Shape: `(batch_size * num_images, 2)` + Shape: `(batch_size * num_images * num_slices, 2)` This should be in `(start, stop)` format. """ @@ -116,6 +114,93 @@ class MiniCPMVImageEmbeddingInputs(TypedDict): MiniCPMVImageInputs = Union[MiniCPMVImagePixelInputs, MiniCPMVImageEmbeddingInputs] + +class MiniCPMVEmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], + dict[str, torch.Tensor]]): + + def __init__(self, data: Dict, modality: str) -> None: + super().__init__(data, modality) + + def get_processor_data(self) -> Mapping[str, object]: + return self.data + + def get_passthrough_data(self) -> Mapping[str, object]: + return {} + + def get_count(self) -> int: + return len(self.data[f"{self.modality}_embeds"]) + + def get(self, index: int) -> Dict[str, torch.Tensor]: + out = {} + for k, v in self.data.items(): + out[k] = v[index] + return out + + +class MiniCPMVImageEmbeddingItems(MiniCPMVEmbeddingItems): + + def __init__(self, data: Dict) -> None: + super().__init__(data, "image") + image_embeds = self.data.get("image_embeds", None) + image_sizes = self.data.get("image_sizes", None) + if image_embeds is None: + raise ValueError("In correct type of image_embeds", + "Got type: None") + if not isinstance(image_embeds[0], torch.Tensor): + raise ValueError("In correct type of image_embeds", + f"Got type: {type(image_embeds[0])}") + if image_sizes is None: + raise ValueError( + "In correct type of image_sizes", "Got type: None." 
+ "If you're using `image_size_list`, " + "please rename it to `image_sizes`") + if len(image_embeds[0].shape) == 2: + image_embeds = [image_embeds] + image_sizes = [image_sizes] + self.data["image_embeds"] = image_embeds + self.data["image_sizes"] = image_sizes + + def get_image_size(self, index: int) -> ImageSize: + image_size = self.data["image_sizes"][index] + return ImageSize(width=image_size[0], height=image_size[1]) + + +class MiniCPMVVideoEmbeddingItems(MiniCPMVEmbeddingItems): + + def __init__(self, data: Dict) -> None: + super().__init__(data, "video") + video_embeds = self.data.get("video_embeds", None) + image_sizes = self.data.get("image_sizes", None) + num_frames = self.data.get("num_frames", None) + if video_embeds is None: + raise ValueError("In correct type of video_embeds", + "Got type: None") + if not isinstance(video_embeds[0], torch.Tensor): + raise ValueError("In correct type of video_embeds", + f"Got type: {type(video_embeds[0])}") + if image_sizes is None: + raise ValueError( + "In correct type of image_sizes", "Got type: None." + "If you're using `image_size_list`, " + "please rename it to `image_sizes`") + if num_frames is None: + raise ValueError("In correct type of numframes", "Got type: None") + if len(video_embeds[0].shape) == 2: + video_embeds = [video_embeds] + image_sizes = [image_sizes] + num_frames = [num_frames] + self.data["video_embeds"] = video_embeds + self.data["image_sizes"] = image_sizes + self.data["num_frames"] = num_frames + + def get_frame_size(self, index: int) -> ImageSize: + frame_size = self.data["image_sizes"][index] + return ImageSize(width=frame_size[0], height=frame_size[1]) + + def get_num_frames(self, index: int) -> int: + return self.data["num_frames"][index] + + DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6) @@ -212,25 +297,6 @@ def forward(self, x: torch.Tensor, return x -def _build_image_input(ctx: InputContext, - image: RawImageType) -> MiniCPMVRawImageInput: - tokenizer = cached_get_tokenizer( - ctx.model_config.tokenizer, - trust_remote_code=ctx.model_config.trust_remote_code) - if hasattr(tokenizer, "slice_start_id"): - return MiniCPMVRawImageInput( - image=image, - im_start_id=torch.tensor(tokenizer.im_start_id), - im_end_id=torch.tensor(tokenizer.im_end_id), - slice_start_id=torch.tensor(tokenizer.slice_start_id), - slice_end_id=torch.tensor(tokenizer.slice_end_id)) - else: - return MiniCPMVRawImageInput( - image=image, - im_start_id=torch.tensor(tokenizer.im_start_id), - im_end_id=torch.tensor(tokenizer.im_end_id)) - - def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: version_float = getattr(config, "version", None) @@ -240,129 +306,512 @@ def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: if config.hidden_size == 2304 and config.query_num == 64: return (2, 0) return (2, 5) - version_str = str(version_float) return tuple(int(x) for x in version_str.split(".")) -def get_max_minicpmv_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config() - return getattr(hf_config, "query_num", 64) +class MiniCPMVMultiModalDataParser(MultiModalDataParser): + + def _parse_image_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return MiniCPMVImageEmbeddingItems(data) + return super()._parse_image_data(data) + + def _parse_video_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return 
MiniCPMVVideoEmbeddingItems(data) + return super()._parse_video_data(data) + + +class MiniCPMVProcessingInfo(BaseProcessingInfo): + image_pattern = "(./)" + video_pattern = "()" + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor( + self, + **kwargs: object, + ): + hf_processor = self.ctx.get_hf_processor() + return hf_processor + + def get_image_processor(self): + hf_processor = self.get_hf_processor() + image_processor = hf_processor.image_processor # type: ignore + return image_processor + + def get_model_version(self): + return get_version_by_config(self.get_hf_config()) + + def get_supported_mm_modalities(self) -> List[str]: + if self.get_model_version() == (2, 6): + return ["image", "video"] + else: + return ["image"] + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + if self.get_model_version() == (2, 6): + return {"image": None, "video": None} + else: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + mm_max_tokens = {"image": self.get_max_image_tokens()} + if self.get_model_version() == (2, 6): + mm_max_tokens["video"] = self.get_max_video_tokens(seq_len) + return mm_max_tokens + + def get_max_video_frame_tokens(self) -> int: + frame_size = self.get_video_frame_size_with_most_features() + return self.get_num_image_tokens(frame_size, + self.get_video_max_slice_num()) + + def get_max_video_tokens(self, seq_len: int) -> int: + return self.get_max_video_frame_tokens( + ) * self.get_num_frames_with_most_features(seq_len) + + def get_max_audio_tokens(self) -> int: + return self.get_max_audio_tokens_per_chunk( + ) * self.get_max_audio_chunks_with_most_features() + + def get_slice_query_num(self) -> int: + hf_config = self.get_hf_config() + query_num = getattr(hf_config, "query_num", 64) + return query_num + + def get_max_slice_num(self) -> int: + hf_config = self.get_hf_config() + max_slice_num = getattr(hf_config, "max_slice_num", 9) + return max_slice_num + + def get_sliced_grid(self, image_size: ImageSize, + max_slice_num: int) -> Tuple[int, int]: + if self.get_model_version() == (2, 6): + slice_grid = self.get_image_processor().get_sliced_grid( + image_size, max_slice_num) + else: + slice_grid = self.get_image_processor().get_sliced_grid(image_size) + return slice_grid + + def get_num_image_tokens(self, image_size: ImageSize, + max_slice_num: int) -> int: + slice_grid = self.get_sliced_grid(image_size, max_slice_num) + num_tokens = self.get_slice_query_num( + ) + 2 # ( * query_num) + if slice_grid is not None: + if self.get_model_version() == (2, 6): + num_additional_tokens = 0 + else: + # ( * query_num) + num_additional_tokens = 2 + num_tokens += ((self.get_slice_query_num() + 2) \ + * slice_grid[0] * slice_grid[1]) \ + + slice_grid[1] - 1 + num_additional_tokens + return num_tokens + def get_image_slice_nums(self, image_size: torch.Tensor, + max_slice_nums: int) -> int: + grid = self.get_sliced_grid(image_size, max_slice_nums) + return 1 if grid is None else grid[0] * grid[1] + 1 -def dummy_seq_data_for_minicpmv(seq_len: int, num_images: int): - return SequenceData.from_prompt_token_counts((0, seq_len)) + def get_max_image_tokens(self) -> int: + image_size = self.get_image_size_with_most_features() + return self.get_num_image_tokens(image_size, self.get_max_slice_num()) + def get_image_size_with_most_features(self) -> ImageSize: + # Result in the max possible feature size (h:w = 9:1) + return self.get_default_image_sizes(self.get_max_slice_num()) -def 
dummy_image_for_minicpmv(ctx: InputContext, hf_config: PretrainedConfig, - num_images: int): - width = height = hf_config.image_size - image = _build_image_input(ctx, - image=Image.new("RGB", (width, height), - color=0)) - return {"image": [image] if num_images == 1 else [image] * num_images} + def get_video_max_slice_num(self) -> int: + return 1 + def get_video_frame_size_with_most_features(self) -> ImageSize: + return self.get_default_image_sizes(self.get_video_max_slice_num()) -def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config() - num_images = mm_counts["image"] + def get_max_video_frames(self, max_tokens: int) -> int: + num_frame_tokens = self.get_max_video_frame_tokens() + num_frames = max_tokens // num_frame_tokens + return num_frames - seq_data = dummy_seq_data_for_minicpmv(seq_len, num_images) - mm_data = dummy_image_for_minicpmv(ctx, hf_config, num_images) + def get_num_frames_with_most_features(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) - return DummyData(seq_data, mm_data) + # count tokens + # which are not in get_max_image_tokens + max_image_tokens = self.get_max_image_tokens( + ) * max_images + 4 * max_images + max_total_frames = self.get_max_video_frames(seq_len - + max_image_tokens) + num_frames = max(max_total_frames // max(max_videos, 1), 1) -def input_processor_for_minicpmv(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - model_config = ctx.model_config - version = get_version_by_config(model_config.hf_config) - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_processor = cached_get_image_processor(model_config.tokenizer) + return num_frames - def get_placeholder(image_size: Tuple[int, int], num_image: int): + def get_default_image_sizes(self, num_slices: int) -> ImageSize: + image_size = getattr(self.get_hf_config(), "image_size", 448) + return ImageSize(width=image_size, height=image_size * num_slices) + + +class MiniCPMVDummyInputsBuilder(BaseDummyInputsBuilder[MiniCPMVProcessingInfo] + ): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + image_width, image_height = \ + self.info.get_image_size_with_most_features() + video_width, video_height = \ + self.info.get_video_frame_size_with_most_features() + num_video_frames = \ + self.info.get_num_frames_with_most_features(seq_len) + + mm_data = { + "image": + self._get_dummy_images(width=image_width, + height=image_height, + num_images=num_images), + "video": [ + self._get_dummy_images(width=video_width, + height=video_height, + num_images=num_video_frames) + ] * num_videos, + } + + image_prompt_texts = self.info.image_pattern * num_images + video_prompt_texts = self.info.video_pattern * num_videos + + return ProcessorInputs(prompt_text=image_prompt_texts + + video_prompt_texts, + mm_data=mm_data) + + +class MiniCPMVMultiModalProcessor( + BaseMultiModalProcessor[MiniCPMVProcessingInfo]): + + def _get_data_parser(self) -> MultiModalDataParser: + return MiniCPMVMultiModalDataParser() + + def get_slice_image_placeholder(self, image_size: ImageSize, + **kwargs) -> 
str: + image_processor = self.info.get_image_processor() + version = self.info.get_model_version() if version == (2, 0) or version == (2, 5): return image_processor.get_slice_image_placeholder(image_size) return image_processor.get_slice_image_placeholder( - image_size, num_image) - - prompt = inputs.get("prompt") - token_ids = inputs.get("prompt_token_ids") - if prompt is None: - prompt = tokenizer.decode(token_ids) - - pattern = "(./)" - images = multi_modal_data["image"] - image_tags = re.findall(pattern, prompt) - if len(image_tags) == 0: - new_token_ids = token_ids - new_prompt = prompt - else: - if isinstance(images, dict): - image_size_list = images.get("image_size_list") - images = [images.get("image_embeds")] + image_size, **kwargs) + + def get_image_prompt_texts(self, + image_size: ImageSize, + image_idx: int = 0) -> str: + prompt_texts = self.get_slice_image_placeholder(image_size, + image_idx=image_idx) + return prompt_texts + + def get_video_prompt_texts(self, image_size: ImageSize, + num_frames: int) -> str: + prompt_texts = "".join( + self.get_slice_image_placeholder( + image_size=image_size, + image_idx=0, + max_slice_nums=self.info.get_video_max_slice_num(), + use_image_id=False) for image_idx in range(num_frames)) + return prompt_texts + + def get_special_tokens(self) -> Dict[str, torch.Tensor]: + tokenizer = self.info.get_tokenizer() + special_tokens = { + "im_start_id": torch.tensor(tokenizer.im_start_id), + "im_end_id": torch.tensor(tokenizer.im_end_id) + } + if hasattr(tokenizer, "slice_start_id"): + special_tokens["slice_start_id"] = torch.tensor( + tokenizer.slice_start_id) + special_tokens["slice_end_id"] = torch.tensor( + tokenizer.slice_end_id) + return special_tokens + + @staticmethod + def repack_processor_outputs(outputs: Any) -> BatchFeature: + valid_keys = ["pixel_values", "image_sizes", "tgt_sizes"] + outputs = {key: outputs[key][0] for key in valid_keys} + return outputs + + def process_images(self, mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + images = mm_data.pop("images", []) + image_embeds = mm_data.pop("image_embeds", []) + if isinstance(images, Image.Image): + images = [images] + if isinstance(images, (list, torch.Tensor)) and len(images) > 0: + image_outputs = super()._call_hf_processor( + prompt=self.info.image_pattern * len(images), + mm_data={"images": images}, + mm_kwargs=mm_kwargs) + image_outputs = MiniCPMVMultiModalProcessor.\ + repack_processor_outputs(image_outputs) + elif len(image_embeds) > 0: + image_sizes = mm_data.pop("image_sizes", None) + image_outputs = { + "image_embeds": torch.cat(image_embeds), + "image_sizes": image_sizes + } else: - if isinstance(images, Image.Image): - images = [images] - image_size_list = [image.size for image in images] - - text_chunks = prompt.split(pattern) - new_prompt_chunks: List[str] = [] - for i in range(len(image_size_list)): - new_prompt_chunks += [ - text_chunks[i], - get_placeholder(image_size_list[i], i) - ] - new_prompt_chunks.append(text_chunks[-1]) - new_prompt = "".join(new_prompt_chunks) - new_token_ids = tokenizer.encode(new_prompt) - - multi_modal_data["image"] = [ - _build_image_input(ctx, image) for image in images - ] + image_outputs = {} + return image_outputs + + def process_videos(self, mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object]) -> Dict[str, object]: + videos = mm_data.pop("videos", []) + video_embeds = mm_data.pop("video_embeds", []) + if len(videos) > 0 and isinstance(videos[0], Image.Image): + videos = [videos] + 
if isinstance(videos, list) and len(videos) > 0: + video_outputs = { + "video_pixel_values": [], + "video_image_sizes": [], + "video_tgt_sizes": [], + "num_frames": [] + } + for video in videos: + parsed_video = [] + for frame in video: + if isinstance(frame, np.ndarray): + parsed_video.append(Image.fromarray(frame)) + else: + parsed_video.append(frame) + video = parsed_video + single_video_outputs = super()._call_hf_processor( + prompt=self.info.image_pattern * len(video), + mm_data={"images": video}, + mm_kwargs={ + **mm_kwargs, "max_slice_nums": + self.info.get_video_max_slice_num() + }) + video_outputs["num_frames"].append(len(video)) + for key in single_video_outputs: + if "video_" + key in video_outputs: + if key == "image_sizes": + video_outputs["video_" + key].append( + single_video_outputs[key][0][0]) + else: + video_outputs["video_" + + key] += single_video_outputs[key][0] + elif len(video_embeds): + image_sizes = mm_data.pop("image_sizes", None) + num_frames = mm_data.pop("num_frames", None) + video_outputs = { + "video_embeds": torch.cat(video_embeds), + "video_image_sizes": image_sizes, + "num_frames": num_frames + } + else: + video_outputs = {} + return video_outputs - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - ) + def get_placeholder_match_pattern(self) -> str: + return r"\(<(image|video)>./\)" + def get_placeholder_split_pattern(self) -> str: + return r"\(<(?:image|video)>./\)" -def input_mapper_for_minicpmv(ctx: InputContext, data: object): - model_config = ctx.model_config + def process_mm_inputs(self, mm_data, mm_kwargs) -> object: + return { + "image": self.process_images(mm_data, mm_kwargs), + "video": self.process_videos(mm_data, mm_kwargs) + } - image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) - if image_processor is None: - raise RuntimeError("No HuggingFace processor is available " - "to process the image object") + def get_input_modalities(self, mm_data) -> List[str]: + supported_mm_modalities = self.info.get_supported_mm_modalities() + input_modalities = [] + for modality in supported_mm_modalities: + if modality in mm_data and mm_data[modality] != {}: + input_modalities.append(modality) + return input_modalities + + def get_modality_num_counter(self, modality: str) -> str: + if modality == "image": + return "image_sizes" + elif modality == "video": + return "video_image_sizes" + + def get_num_slices_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> int: + if modality == "image": + return self.info.get_image_slice_nums( + inputs[modality]["image_sizes"][index], + self.info.get_max_slice_num()) + elif modality == "video": + return self.info.get_image_slice_nums( + inputs[modality]["video_image_sizes"][index], + self.info.get_video_max_slice_num() + ) * inputs[modality]["num_frames"][index] + else: + raise ValueError(f"UnExpected modality: {modality}") + + def check_mm_inputs(self, inputs: Dict[str, object], + matches: List[str]) -> None: + counts = Counter(matches) + for modality, count in counts.items(): + if modality not in inputs or not inputs[modality]: + raise ValueError(f"None input data of {modality}." 
+ "But prompt requires.") + counter_key = self.get_modality_num_counter(modality) + if len(inputs[modality][counter_key]) != count: + raise ValueError(f"The prompt requires {count} " + f"{modality} inputs while you pass " + f"{len(inputs[modality][counter_key])}") + + def get_prompt_texts_by_modality(self, inputs: Dict[str, object], + modality: str, index: int) -> str: + if modality == "image": + return self.get_image_prompt_texts( + inputs["image"]["image_sizes"][index], index) + elif modality == "video": + return self.get_video_prompt_texts( + inputs["video"]["video_image_sizes"][index], + inputs["video"]["num_frames"][index]) + else: + raise ValueError(f"UnExpected modality: {modality}") - if not isinstance(data, list): - raise ValueError( - "Image input must be list of MiniCPMVImageInput, got (%s)", data) + def call_base_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + return super()._call_hf_processor(prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + # Do not support combination inputs of images and videos for now + # Try to handle interleaved multimodal data + tokenizer = self.info.get_tokenizer() + inputs = self.process_mm_inputs(mm_data, mm_kwargs) + mm_input_modalities = self.get_input_modalities(inputs) + num_mm_slices = {modality: [] for modality in mm_input_modalities} + for modality in mm_input_modalities: + num_counter_key = self.get_modality_num_counter(modality) + for index in range(len(inputs[modality][num_counter_key])): + num_mm_slices[modality].append( + self.get_num_slices_by_modality(inputs, modality, index)) + return { + "input_ids": np.array([tokenizer.encode(prompt)]), + **{ + key: value + for modality in inputs + for key, value in inputs[modality].items() + }, + **{ + f"{modality}_num_slices": num_mm_slices[modality] + for modality in mm_input_modalities + } + } - if len(data) > 0 and isinstance(data[0]['image'], torch.Tensor): - batch_data = { - "image_embeds": data[0]['image'], + def _get_prompt_replacements( + self, mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargs) -> List[PromptReplacement]: + placeholder = { + "image": self.info.image_pattern, + "video": self.info.video_pattern, } - else: - batch_data = image_processor \ - .preprocess([img["image"] for img in data], return_tensors="pt") \ - .data - if len(data) > 0: - batch_data["im_start_id"] = data[0]["im_start_id"] - batch_data["im_end_id"] = data[0]["im_end_id"] - if "slice_start_id" in data[0]: - batch_data["slice_start_id"] = data[0]["slice_start_id"] - batch_data["slice_end_id"] = data[0]["slice_end_id"] + def get_replacement_minicpmv(item_idx: int, modality: str): + if modality == "image": + return self.get_image_prompt_texts( + mm_items["image"].get_image_size(item_idx), item_idx) + else: # video + return self.get_video_prompt_texts( + mm_items["video"].get_frame_size(item_idx), + mm_items["video"].get_num_frames(item_idx)) + + return [ + PromptReplacement(modality=modality, + target=placeholder[modality], + replacement=partial(get_replacement_minicpmv, + modality=modality)) + for modality in ("image", "video") + ] - return MultiModalKwargs(batch_data) + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + + def 
get_slices(num_slices: List[int]) -> List[int]: + slice_indices = [0] + list(accumulate(num_slices)) + slices = [(slice_indices[i], slice_indices[i + 1]) + for i in range(len(num_slices))] + return [slice(*slice_item) for slice_item in slices] + + image_slices = get_slices( + hf_inputs.get("image_num_slices", torch.empty(0))) + video_slices = get_slices( + hf_inputs.get("video_num_slices", torch.empty(0))) + + return dict( + pixel_values=MultiModalFieldConfig.flat("image", image_slices), + image_sizes=MultiModalFieldConfig.batched("image"), + tgt_sizes=MultiModalFieldConfig.flat("image", image_slices), + image_num_slices=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.flat("image", image_slices), + video_pixel_values=MultiModalFieldConfig.flat( + "video", video_slices), + video_image_sizes=MultiModalFieldConfig.batched("video"), + video_tgt_sizes=MultiModalFieldConfig.flat("video", video_slices), + video_embeds=MultiModalFieldConfig.flat("video", video_slices), + video_num_slices=MultiModalFieldConfig.batched("video")) + + def apply( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputs: + supported_mm_modalities = self.info.get_supported_mm_modalities() + if isinstance(prompt, list): + prompt = self.info.get_tokenizer().decode(prompt) + matches = re.findall(self.get_placeholder_match_pattern(), prompt) + mm_orders = { + f"{modality}_orders": + torch.tensor( + [index for index, m in enumerate(matches) if m == modality]) + for modality in supported_mm_modalities + } + result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) + # Exclude x from placeholders + if "image" in result["mm_placeholders"] and \ + self.info.get_model_version() == (2, 6): + result["mm_placeholders"]["image"] = [ + PlaceholderRange(offset=p["offset"] + 3 + idx // 10, + length=p["length"] - 3 - idx // 10) + for idx, p in enumerate(result["mm_placeholders"]["image"]) + ] + result["mm_kwargs"].update(**mm_orders) + result["mm_kwargs"].update(**self.get_special_tokens()) + return result class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): @@ -409,7 +858,7 @@ def sampler(self): return get_sampler() - def get_embedding( + def get_embedding_with_vision( self, input_ids: torch.Tensor, image_inputs: Optional[MiniCPMVImageInputs], @@ -471,25 +920,46 @@ def _get_image_bounds( image_end_tokens[:valid_image_nums].unsqueeze(-1), ]) - def _parse_and_validate_inputs( + def _parse_and_validate_image_inputs( self, input_ids: torch.Tensor, **kwargs: object, ) -> Optional[MiniCPMVImageInputs]: - pixel_values = kwargs.pop("pixel_values", []) - tgt_sizes = kwargs.pop("tgt_sizes", []) + mm_data = { + "image": { + key: kwargs.pop(key, []) + for key in ["pixel_values", "tgt_sizes", "image_num_slices"] + }, + "video": { + "pixel_values": kwargs.pop("video_pixel_values", []), + "tgt_sizes": kwargs.pop("video_tgt_sizes", []), + "video_num_slices": kwargs.pop("video_num_slices", []) + } + } im_start_id = kwargs.pop("im_start_id", None) im_end_id = kwargs.pop("im_end_id", None) slice_start_id = kwargs.pop("slice_start_id", None) slice_end_id = kwargs.pop("slice_end_id", None) + mm_orders = { + f"{modality}": kwargs.pop(f"{modality}_orders", None) + for modality in ["image", "video", "audio"] + } + batch_size = max(len(mm_data["image"]["pixel_values"]), + len(mm_data["video"]["pixel_values"])) image_embeds = kwargs.pop("image_embeds", None) - + video_embeds = kwargs.pop("video_embeds", None) + if image_embeds 
is not None and video_embeds is not None: + raise ValueError( + "Incorrect inputs for vision embeddings. " + "Image embeds and video embeds can not exist simultaneously.") + if video_embeds is not None: + image_embeds = video_embeds if image_embeds is not None: if not isinstance(image_embeds, (torch.Tensor, list)): raise ValueError(f"Incorrect type of image embeds. " f"Got type: {type(image_embeds)}") - if isinstance(image_embeds, list): - image_embeds = torch.concat(image_embeds) + image_embeds = torch.concat( + [image_embeds[i] for i in range(len(image_embeds))]) return MiniCPMVImageEmbeddingInputs( image_bounds=self._get_image_bounds(input_ids, im_start_id, @@ -498,29 +968,47 @@ def _parse_and_validate_inputs( data=image_embeds, type="image_embeds", ) - - if not isinstance(pixel_values, (torch.Tensor, list)): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - if not isinstance(tgt_sizes, (torch.Tensor, list)): - raise ValueError("Incorrect type of target sizes. " - f"Got type: {type(tgt_sizes)}") - - if len(pixel_values) != len(tgt_sizes): - raise ValueError("Inconsistent batch lengths, found: " - f"{len(pixel_values)} vs. {len(tgt_sizes)}") + for modality, modality_mm_data in mm_data.items(): + if not isinstance(modality_mm_data["pixel_values"], + (torch.Tensor, list)): + raise ValueError( + "Incorrect type of pixel values. " + f"Got type: {type(modality_mm_data['pixel_values'])}") + + if not isinstance(modality_mm_data["tgt_sizes"], + (torch.Tensor, list)): + raise ValueError( + "Incorrect type of target sizes. " + f"Got type: {type(modality_mm_data['tgt_sizes'])}") + + if len(modality_mm_data["pixel_values"]) != len( + modality_mm_data["tgt_sizes"]): + raise ValueError( + "Inconsistent batch lengths, found: " + f"{len(modality_mm_data['pixel_values'])} vs. 
" + f"{len(modality_mm_data['tgt_sizes'])}") pixel_values_flat: List[torch.Tensor] = [] tgt_sizes_flat: List[torch.Tensor] = [] - for pixel_b, tgt_b in zip(pixel_values, tgt_sizes): - if len(pixel_b) != len(tgt_b): - raise ValueError("Inconsistent N lengths, found: " - f"{len(pixel_b)} vs {len(tgt_b)}") - - for pixel_n, tgt_n in zip(pixel_b, tgt_b): - pixel_values_flat += pixel_n - tgt_sizes_flat += tgt_n + for b in range(batch_size): + mm_counts = {"image": 0, "video": 0} if self.version == (2, 6) \ + else {"image": 0} + mm_slice_counts = {"image": 0, "video": 0} \ + if self.version == (2, 6) else {"image": 0} + mm_orders_b = [(index, modality) for modality in mm_counts + for index in mm_orders[modality][b]] + for _, modality in sorted(mm_orders_b, key=lambda x: x[0]): + pos = mm_counts[modality] + num_slices = mm_data[modality][f"{modality}_num_slices"][b][ + pos] + slice_start_idx = mm_slice_counts[modality] + slice_end_idx = slice_start_idx + num_slices + pixel_values_flat += mm_data[modality]["pixel_values"][b][ + slice_start_idx:slice_end_idx] + tgt_sizes_flat += mm_data[modality]["tgt_sizes"][b][ + slice_start_idx:slice_end_idx] + mm_counts[modality] += 1 + mm_slice_counts[modality] += num_slices # NOTE: Input IDs does not contain image tokens during memory profiling, # so we allow it to be empty @@ -544,6 +1032,10 @@ def _parse_and_validate_inputs( type="pixel_values", ) + def _parse_and_validate_inputs(self, input_ids: torch.Tensor, + **kwargs: object): + return self._parse_and_validate_image_inputs(input_ids, **kwargs) + def forward( self, input_ids: torch.Tensor, @@ -556,9 +1048,10 @@ def forward( if intermediate_tensors is not None: vlm_embeddings = None else: - image_inputs = self._parse_and_validate_inputs(input_ids, **kwargs) - - vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs) + image_inputs = \ + self._parse_and_validate_inputs(input_ids, **kwargs) + vlm_embeddings, _ = self.get_embedding_with_vision( + input_ids, image_inputs) # always pass the input via `inputs_embeds` # to make sure the computation graph is consistent @@ -964,15 +1457,15 @@ def get_vision_hidden_states(self, _SUPPORT_VERSION = { (2, 0): MiniCPMV2_0, (2, 5): MiniCPMV2_5, - (2, 6): MiniCPMV2_6 + (2, 6): MiniCPMV2_6, } -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_minicpmv) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv) -@INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv) -class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): +@MULTIMODAL_REGISTRY.register_processor( + MiniCPMVMultiModalProcessor, + info=MiniCPMVProcessingInfo, + dummy_inputs=MiniCPMVDummyInputsBuilder) +class MiniCPMV(MiniCPMVBaseModel, SupportsMultiModal, SupportsLoRA): """ Different versions of MiniCPMV use different visual encoders and LLMs, which is not conducive to the current integration logic of LoRA and diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index da415cdae96ed..fbb3704fa080f 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -452,7 +452,11 @@ def load_weights(self, weights: Iterable[Tuple[str, # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue - + if name.endswith("scale"): + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index b8c40582b629e..5eda6f40a05e6 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -835,6 +835,7 @@ def _attention_with_mask( ) -> torch.Tensor: # Skip writing kv-cache for the initial profiling run. if len(kv_cache.shape) > 1: + i = torch.ones(1, dtype=torch.float32) if self.attn.backend in (_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1): cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) @@ -847,8 +848,8 @@ def _attention_with_mask( attn_metadata. cross_slot_mapping, # type: ignore[union-attr] "auto", - 1.0, - 1.0, + i, + i, ) elif self.attn.backend in (_Backend.XFORMERS, _Backend.TORCH_SDPA): key_cache, value_cache = PagedAttention.split_kv_cache( @@ -857,7 +858,7 @@ def _attention_with_mask( cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) PagedAttention.write_to_paged_cache( cached_k, cached_v, key_cache, value_cache, - attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0) + attn_metadata.cross_slot_mapping, "auto", i, i) else: raise ValueError( f"Unsupported Attention backend {self.attn.backend} " @@ -1381,8 +1382,8 @@ def forward( # For 1) text-only prefill and decode, 2) image-present decode. if image_inputs is None: full_text_row_masked_out_mask = ( - attn_metadata.encoder_seq_lens_tensor != 0).reshape(-1, 1).to( - input_ids.device) + attn_metadata.encoder_seq_lens_tensor + != 0).reshape(-1, 1).to(input_ids.device) skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0 # For image-present prefill. @@ -1501,14 +1502,23 @@ def convert_sparse_cross_attention_mask_to_dense( total_length = sum(lengths) total_tiles = sum([sum(tiles) for tiles in num_tiles]) dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64) - # A list of ranges, range[i] = [start, end] means - # if the i-th sample has N tiles in total, the tiles[start, end] - # will be used for cross-attention decoding. + # A list of ranges, range[i] = [start, end] means that the i-th image will + # use tiles[start, end] for cross-attention decoding. tile_range_for_decode = [] seq_start = 0 tile_start = 0 - for masks, tiles, length in zip(sparse_mask, num_tiles, lengths): + + # sparse_mask has an [] entry for each sequence that does not have images, + # but num_tiles does not have these entries... 
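# Concrete, made-up example of the bookkeeping described in the comment
# above: a text-only sequence contributes an empty mask entry but no
# num_tiles entry, so a separate index is needed to walk num_tiles.
example_sparse_mask = [
    [],                  # sequence 0: text only, no cross-attention ranges
    [[5, -1]],           # sequence 1: one image
    [[2, 7], [7, -1]],   # sequence 2: two images
]
example_num_tiles = [
    [4],     # tiles for sequence 1's image
    [4, 2],  # tiles for sequence 2's two images
]
example_idx = 0
for masks in example_sparse_mask:
    if len(masks) == 0:
        continue  # text-only sequence: consumes no num_tiles entry
    tiles = example_num_tiles[example_idx]
    example_idx += 1
    assert len(masks) == len(tiles)
assert example_idx == len(example_num_tiles)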
+ num_tiles_idx = 0 + for masks, length in zip(sparse_mask, lengths): + if len(masks) == 0: + # Text only + continue + + tiles = num_tiles[num_tiles_idx] + num_tiles_idx += 1 ts, td = -1, 0 for mask, tile in zip(masks, tiles): if len(mask) != 2: @@ -1528,6 +1538,7 @@ def convert_sparse_cross_attention_mask_to_dense( assert td != 0 tile_range_for_decode.append((ts, ts + td)) seq_start += length + assert num_tiles_idx == len(num_tiles) return dense_mask, tile_range_for_decode diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index d49da5f29aa14..f1d796ca26a16 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -81,8 +81,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: if self.tie_weights: assert ( - self.n_predict > - 1), "You cannot tie weights between stages when only 1 exists" + self.n_predict > 1 + ), "You cannot tie weights between stages when only 1 exists" embedding = VocabParallelEmbedding( config.vocab_size, self.inner_dim, diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index f9ad0c67adaba..5a28b1ffbb7b4 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -136,6 +136,17 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: @INPUT_REGISTRY.register_input_processor(input_processor_for_paligemma) class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 7a230e5beb367..0fcda81da2800 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -30,15 +30,19 @@ VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - NestedTensors, PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize, MultiModalDataItems) +# yapf conflicts with isort for this block +# yapf: disable from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo, BoundPromptReplacement, - PlaceholderInfo, PromptReplacement) + PlaceholderFeaturesInfo, + PromptReplacement, + PromptReplacementDetails) +# yapf: enable from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -437,7 +441,12 @@ def get_replacement_phi3v(item_idx: int): processor=hf_processor, ) - return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id] + image_tokens = [_IMAGE_TOKEN_ID] * num_image_tokens + + return PromptReplacementDetails( + full=image_tokens + [bos_token_id], + features=image_tokens, + ) num_images = mm_items.get_count("image", strict=False) @@ -454,7 +463,7 @@ def _apply_prompt_replacements( token_ids: list[int], mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, 
Mapping[str, list[PlaceholderInfo]]]: + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: token_ids, text, placeholders = super()._apply_prompt_replacements( token_ids=token_ids, mm_prompt_repls=mm_prompt_repls, @@ -467,11 +476,11 @@ def _apply_prompt_replacements( token_ids = [token_ids[0], *token_ids[2:]] placeholders = { modality: [ - PlaceholderInfo( + PlaceholderFeaturesInfo( modality=p.modality, item_idx=p.item_idx, start_idx=p.start_idx - 1, - replacement=p.replacement, + tokens=p.tokens, ) for p in ps ] for modality, ps in placeholders.items() @@ -479,26 +488,6 @@ def _apply_prompt_replacements( return token_ids, text, placeholders - def apply( - self, - prompt: Union[str, list[int]], - mm_data: MultiModalDataDict, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: - result = super().apply(prompt, mm_data, hf_processor_mm_kwargs) - - # Only <|image|> tokens should be considered as placeholders, - # so we ignore the trailing bos_token_id - result["mm_placeholders"] = { - modality: [ - PlaceholderRange(offset=p["offset"], length=p["length"] - 1) - for p in ps - ] - for modality, ps in result["mm_placeholders"].items() - } - - return result - @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor, info=Phi3VProcessingInfo, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 881c09ea9db99..6367b770a0aff 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -167,8 +167,8 @@ def sparsemixer(scores, jitter_eps=0.01): # compute mask for sparsity mask_logits_threshold, max_ind = scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates = scores.masked_fill(mask_logits_threshold, float("-inf")) @@ -192,8 +192,8 @@ def sparsemixer(scores, jitter_eps=0.01): mask_logits_threshold, max_ind = masked_scores.max(dim=-1, keepdim=True) factor = scores.abs().clamp(min=mask_logits_threshold) - mask_logits_threshold = ( - (mask_logits_threshold - scores) / factor) > (2 * jitter_eps) + mask_logits_threshold = ((mask_logits_threshold - scores) / + factor) > (2 * jitter_eps) # apply mask masked_gates_top2 = masked_scores.masked_fill(mask_logits_threshold, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 1345b381f0a99..86a9d3089c3ee 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -4,26 +4,28 @@ # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" +import copy import math import re -from functools import partial -from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Set, Tuple, TypedDict, Union) +import unicodedata +from functools import lru_cache, partial +from typing import (AbstractSet, Any, Callable, Collection, Dict, Iterable, + List, Literal, Mapping, Optional, Set, Tuple, TypedDict, + Union) -import numpy as np import torch -from PIL import Image from torch import nn from torchvision import transforms from torchvision.transforms import InterpolationMode -from transformers import PretrainedConfig +from transformers import (BatchFeature, PretrainedConfig, PreTrainedTokenizer, + TensorType) +from transformers.image_utils import 
ImageInput +from transformers.tokenization_utils_base import TextInput from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm @@ -42,15 +44,20 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.utils import cached_get_tokenizer -from vllm.sequence import IntermediateTensors, SequenceData -from vllm.utils import is_list_of +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (flatten_bn, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) @@ -353,8 +360,10 @@ def __init__(self, self.ln_post = norm_layer(output_dim) self.proj = nn.Parameter( (output_dim**-0.5) * torch.randn(output_dim, output_dim)) + self.image_start_id = image_start_id self.image_end_id = image_start_id + 1 + self.image_pad_id = image_start_id + 2 def forward(self, x: torch.Tensor) -> torch.Tensor: x = x.to( @@ -383,21 +392,6 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: return x - def get_image_positions(self, - input_ids: torch.Tensor) -> Optional[torch.Tensor]: - """Given the input IDs, extracts start/stop points corresponding to - images. - - args: - Returns: - Optional torch tensor corresponding to start/stop pairs of images. 
- """ - if torch.any(input_ids == self.image_start_id): - bos_pos = torch.where(input_ids == self.image_start_id) - eos_pos = torch.where(input_ids == self.image_end_id) - return torch.stack((bos_pos[0], eos_pos[0]), dim=1) - return None - class QWenMLP(nn.Module): """MLP for the language component of the Qwen model, which contains a @@ -579,9 +573,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) - self.visual = VisionTransformer(**config.visual, - quant_config=quant_config) if hasattr( - config, "visual") else None + + if (vision_config := getattr(config, "visual", None)): + self.visual = VisionTransformer(**vision_config, + quant_config=quant_config) + else: + self.visual = None def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.wte(input_ids) @@ -593,38 +590,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], - pixel_values: Optional[QwenImageInputs], inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - img_pos = None - # If pixel / visual embeddings are provided, this is a visual model - if pixel_values is not None and self.visual is not None: - if pixel_values["type"] != "image_embeds": - image_embeds = self.visual(pixel_values["data"]) - else: - image_embeds = pixel_values["data"] - - # features should be of shape (# images, 256, hidden_dim) - img_pos = self.visual.get_image_positions(input_ids) - if isinstance( - img_pos, - np.ndarray) and img_pos.shape[0] != image_embeds.shape[0]: - raise ValueError( - f"Number of placeholders: {img_pos.shape[0]} " - f"does not match number of images {image_embeds.shape[0]}." - ) - if get_pp_group().is_first_rank: if inputs_embeds is not None: hidden_states = inputs_embeds else: hidden_states = self.get_input_embeddings(input_ids) - hidden_states = self.wte(input_ids) - # Merge the image embeddings into the hidden states if actually have - # visual features and the corresponding image tokens - if img_pos is not None: - for idx, (img_bos, img_eos) in enumerate(img_pos): - hidden_states[img_bos + 1:img_eos] = image_embeds[idx] residual = None else: assert intermediate_tensors is not None @@ -648,159 +620,9 @@ def forward( return hidden_states -def get_image_text(image_num: int, padding: bool) -> str: - """Retrieves a placeholder text that when tokenized, will be expanded with - image pads. - - Args: - image_num: The number of the image that we want a text prompt for. - Images should be indexed starting at 1. - padding: Whether or not padding should be manually added. - - Returns: - Text placeholder prompt for the image being considered. - """ - image_start = f"Picture {image_num}: {IMG_START}" - image_end = f"{IMG_END}\n" - if not padding: - return f"{image_start}{image_end}" - return f"{image_start}{MAX_QWEN_IMG_TOKENS * IMG_PAD}{image_end}" - - -def input_processor_for_qwen(ctx: InputContext, - inputs: DecoderOnlyInputs) -> DecoderOnlyInputs: - """Processes the inputs, which may or may not be multimodal. - Multimodal inputs will only be processed if the model has a "visual" - component in its model config, otherwise they'll be ignored. - - Args: - ctx: Context of the loaded model. - inputs: LLM inputs which may have a multi_modal_data attribute. 
- - Returns: - If the model is language only or not multimodal inputs were provided, - returns inputs unmodified. Otherwise, processes the multimodal - images / image embeddings and adds the fixed-length image placeholders. - """ - multi_modal_data = inputs.get("multi_modal_data") - - # Only process images if we have multimodal data and a visual config - hf_config = ctx.get_hf_config() - if (multi_modal_data is None or "image" not in multi_modal_data - or not hasattr(hf_config, "visual")): - return inputs - - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_data = multi_modal_data["image"] - if isinstance(image_data, torch.Tensor): - num_dims = len(image_data.shape) - if num_dims < 2 or num_dims > 3: - raise ValueError( - f"Expected img embeds to be have 3 dimensions, got {num_dims}") - num_images = 1 if num_dims == 2 else image_data.shape[0] - elif isinstance(image_data, Image.Image): - num_images = 1 - elif is_list_of(image_data, Image.Image): - num_images = len(image_data) - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - if prompt is None: - prompt = tokenizer.decode(prompt_token_ids) - - # Drops anything between / tags; encoding with the tokenizer - # will automatically add the image pads for the context. - new_prompt, num_matched_images = re.subn( - r"(Picture \d*: ).*?(<\/img>\n)", - r"\1\2", - prompt, - ) - - if num_matched_images != num_images: - logger.warning( - "Number of matched image placeholders %s doesn't match the number " - "of expected images %s; check your placeholder formatting.", - num_matched_images, num_images) - - new_prompt_token_ids = tokenizer.encode(new_prompt) - - return token_inputs(prompt=new_prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data) - - -def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs: - """Maps the input data to its MultiModalKwargs (if any). - - Args: - ctx: Context of the loaded model. - data: data potentially containing image/image embeddings to be mapped - to pixel_values in .forward() for a visual QWenLMHeadModel model. - - Returns: - MultiModalKwargs containing the stacked normalized images tensor or - image embeddings. 
- """ - # Early exit if we have provided an image to a language only Qwen model - hf_config = ctx.get_hf_config() - if not hasattr(hf_config, "visual"): - logger.warning( - "Images were provided but this model has no visual config; " - "multimodal inputs will not be forwarded to the model.") - return MultiModalKwargs() - - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - image_pair_tok = tokenizer.encode(IMG_START + IMG_END, - add_special_tokens=False, - return_tensors="pt").squeeze() - image_start_id = image_pair_tok[0] - image_end_id = image_pair_tok[-1] - if (image_start_id + 1) != image_end_id: - raise ValueError( - f"Found image end ID {image_end_id}, but expected {IMG_START} + 1") - if len(image_pair_tok) != (MAX_QWEN_IMG_TOKENS + 2): - raise ValueError( - f"Expected image context length of {MAX_QWEN_IMG_TOKENS}, " - f"but got {image_pair_tok - 2}") - - hf_config = ctx.get_hf_config() - image_size = hf_config.visual["image_size"] - img_emb_size = hf_config.visual["output_dim"] - - if isinstance(data, torch.Tensor): - # It's expected that our values have already been processed - # by the visual transformer; shape is expected to be: - # (# images, 256, hidden_size) - if len(data.shape) == 2: - # Assume only one image embed was provided; unsqueeze the extra dim - data = data.unsqueeze(0) - if len(data.shape) != 3 or data.shape[ - 1] != MAX_QWEN_IMG_TOKENS or data.shape[2] != img_emb_size: - raise ValueError( - "Expected image embeds to be a tensor of shape" - f"[# images, {MAX_QWEN_IMG_TOKENS}, {img_emb_size}], but " - f"received shape [{data.shape}]") - pixel_values = data - else: - transform = build_normalization_transform(image_size) - if not isinstance(data, (list, tuple)): - data = [data] - transformed_images = [transform(datum) for datum in data] - pixel_values = torch.stack(transformed_images, dim=0) - return MultiModalKwargs({"pixel_values": pixel_values}) - - def build_normalization_transform(image_size: int) -> transforms.Compose: - """Builds a normalization transform which can be applied to one or + """ + Build a normalization transform which can be applied to one or more input images from which we want to extract visual features. Args: @@ -817,62 +639,251 @@ def build_normalization_transform(image_size: int) -> transforms.Compose: ]) -def dummy_data_for_qwen( - ctx: InputContext, - seq_len: int, - mm_counts: Mapping[str, int], -) -> DummyData: - """Build dummy data for warming up Qwen models; this will only contain text - matching the defaults for VLLM unless the model has a visual config. +@lru_cache(maxsize=1) +def _get_tokenizer_without_image_pad( + tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: + """ + The logic of adding image pad tokens should only be applied in + :class:`QWenVLProcessor`, so they are patched out here. - Args: - ctx: Context of the loaded model. - seq_len: Number of tokens in the text sequence. - mm_counts: multimodal data counts. - - Returns: - Tuple containing sequential and multimodal data. 
+ The definition of the wrapped tokenizer can be found here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py + """ + new_tokenizer = copy.deepcopy(tokenizer) + + class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore + + def tokenize( + self, + text: str, + allowed_special: Union[AbstractSet[str], str] = "all", + disallowed_special: Union[Collection[str], str] = (), + **kwargs, + ) -> list[Union[bytes, str]]: + text = unicodedata.normalize("NFC", text) + + return [ + self.decoder[t] for t in self.tokenizer.encode( + text, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ] + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + errors: Optional[str] = None, + **kwargs, + ) -> str: + if isinstance(token_ids, int): + token_ids = [token_ids] + + return self.tokenizer.decode( + token_ids, + errors=errors or self.errors, + ) + + TokenizerWithoutImagePad.__name__ = \ + f"{tokenizer.__class__.__name__}WithoutImagePad" + + new_tokenizer.__class__ = TokenizerWithoutImagePad + return new_tokenizer + + +class QWenVLProcessor: + """ + This model doesn't define its own HF processor, + so we implement our own one here. + + We call the wrapped tokenizer to automatically insert image pad tokens: + https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245 + + The image processor is defined here: + https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354 """ - hf_config = ctx.get_hf_config() - - # The presence of a visual config indicates this is a multimodal model. - # If we don't have it, the model is considered an LLM for warmup purposes. - if not hasattr(hf_config, "visual"): - seq_data = SequenceData.from_prompt_token_counts((0, seq_len)) - mm_data = None - return DummyData(seq_data, mm_data) - - # We have a visual component - use images to warm up - num_images = mm_counts["image"] - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - # Build the image prompts with no imgpads; the tokenizer will add img pads - image_prompt = ''.join( - [get_image_text(idx, False) for idx in range(1, num_images + 1)]) - toks = tokenizer.encode(image_prompt, add_special_tokens=False) - - # Make sure we actually get the fixed context size per tok padding - num_pads = toks.count(tokenizer.encode(IMG_PAD)[0]) - if num_pads != (num_images * MAX_QWEN_IMG_TOKENS): - raise ValueError( - f"Tokenized dummy data should encode {MAX_QWEN_IMG_TOKENS} pads" - f" per image, but got {num_pads} pads for {num_images} image(s)" - " in total. 
Are you using a qwen tokenizer?") - - # Ensure the number of tokens is at minimum the sequence length provided - if len(toks) < seq_len: - toks += [0] * (seq_len - len(toks)) - - seq_data = SequenceData.from_seqs(toks) - - # Build the input images; width/height doesn't actually matter here since - # the data will get resized and the # of tokens per image is constant - image = Image.new("RGB", (224, 224), color=0) - mm_data = {"image": image if num_images == 1 else [image] * num_images} - return DummyData(seq_data, mm_data) + + def __init__( + self, + config: PretrainedConfig, + tokenizer: PreTrainedTokenizer, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + if hasattr(self.config, "visual"): + self.image_transform = build_normalization_transform( + config.visual["image_size"]) + else: + self.image_transform = None + + special_tokens: dict[str, + int] = tokenizer.special_tokens # type: ignore + self.img_start_id = special_tokens[IMG_START] + self.img_end_id = special_tokens[IMG_END] + + def __call__( + self, + text: Optional[Union[TextInput, list[TextInput]]] = None, + images: Optional[Union[ImageInput, list[ImageInput]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchFeature: + if text is None: + text = [] + if not isinstance(text, list): + text = [text] + if images is None: + images = [] + if not isinstance(images, list): + images = [images] + + text_inputs = self.tokenizer(text) + + if len(images) == 0: + image_inputs = {} + else: + if self.image_transform is None: + raise ValueError("This model does not support image inputs") + + pixel_values = [self.image_transform(image) for image in images] + image_inputs = {"pixel_values": torch.stack(pixel_values)} + + return BatchFeature( + { + **text_inputs, + **image_inputs, + }, + tensor_type=return_tensors, + ) + + +class QWenVLProcessingInfo(BaseProcessingInfo): + + def get_tokenizer(self) -> PreTrainedTokenizer: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return _get_tokenizer_without_image_pad(tokenizer) + + def get_hf_processor(self) -> QWenVLProcessor: + tokenizer = self.ctx.tokenizer + assert isinstance(tokenizer, PreTrainedTokenizer) + + return QWenVLProcessor(self.get_hf_config(), tokenizer) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self.get_num_image_tokens()} + + def get_num_image_tokens(self) -> int: + return MAX_QWEN_IMG_TOKENS + + +class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]): + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.info.get_hf_config() + if not hasattr(hf_config, "visual"): + return ProcessorInputs(prompt_text="", mm_data={}) + + vision_config = hf_config.visual + + max_image_size = vision_config["image_size"] + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="".join(f"Picture {i}: {IMG_START}{IMG_END}\n" + for i in range(1, num_images + 1)), + mm_data=mm_data, + ) + + +class QWenVLMultiModalProcessor(BaseMultiModalProcessor[QWenVLProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> 
BatchFeature: + # Drops anything between <img>/</img> tags; encoding with the tokenizer + # will automatically add the image pads for the context. + prompt, num_matched_images = re.subn( + r"(Picture \d*: <img>).*?(<\/img>\n)", + r"\1\2", + prompt, + ) + + image_data = mm_data.get("images") + if image_data is not None: + assert isinstance(image_data, list) + + num_images = len(image_data) + if num_matched_images != num_images: + logger.warning( + "Number of matched image placeholders %s doesn't match " + "the number of expected images %s; check your placeholder " + "formatting.", num_matched_images, num_images) + + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + tokenizer = self.info.get_tokenizer() + special_tokens: dict[str, + int] = tokenizer.special_tokens # type: ignore + + img_start_id = special_tokens[IMG_START] + img_end_id = special_tokens[IMG_END] + img_pad_id = special_tokens[IMG_PAD] + + num_image_tokens = self.info.get_num_image_tokens() + image_tokens = [img_pad_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[img_start_id, img_end_id], + replacement=PromptReplacementDetails( + full=[img_start_id] + image_tokens + [img_end_id], + features=image_tokens, + ), + ) + ] class QWenBaseModel(nn.Module, SupportsPP, SupportsLoRA): @@ -898,38 +909,77 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) - def _get_image_input_type( - self, - pixel_values: Optional[torch.Tensor]) -> Optional[QwenImageInputs]: - """Determines if the provided pixel_values are normalized pixel values - or image embeddings. + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + h = w = self.config.visual["image_size"] + expected_dims = (3, h, w) + actual_dims = tuple(data.shape[1:]) - Args: - pixel_values: Optional data to processed into visual embeddings. + if actual_dims != expected_dims: + expected_expr = ("batch_size", *map(str, expected_dims)) + raise ValueError( + f"The expected shape of pixel values is {expected_expr}. " + f"You supplied {tuple(data.shape)}.") + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[QwenImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is not None: + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") + + return QwenImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) + + if image_embeds is not None: + if not isinstance(image_embeds, torch.Tensor): + raise ValueError("Incorrect type of image embeddings. 
" + f"Got type: {type(image_embeds)}") + + return QwenImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds), + ) - Returns: - None of the QwenImageInputs type used to determine whether or not - the visual transformer needs to process the pixel_values. - """ - if pixel_values is not None and self.transformer.visual is not None: - pixel_values = flatten_bn(pixel_values) - if len(pixel_values.shape) == 3 and pixel_values.shape[ - 1] == MAX_QWEN_IMG_TOKENS and pixel_values.shape[ - 2] == self.config.visual["output_dim"]: - return QwenImageEmbeddingInputs( - type="image_embeds", - data=pixel_values, - ) - else: - # If we have the wrong shape, assume we still need to process - return QwenImagePixelInputs( - type="pixel_values", - data=pixel_values, - ) return None - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.transformer.get_input_embeddings(input_ids) + def _process_image_input(self, + image_input: QwenImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.transformer.visual is not None + return self.transformer.visual(image_input["data"]) + + def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.transformer.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None: + assert self.transformer.visual is not None + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.transformer.visual.image_pad_id) + + return inputs_embeds def forward( self, @@ -938,18 +988,23 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, - pixel_values: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) input_ids = None - pixel_values = None - else: - pixel_values = self._get_image_input_type(pixel_values) hidden_states = self.transformer(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, - pixel_values, inputs_embeds) + inputs_embeds) return hidden_states def compute_logits( @@ -1063,10 +1118,9 @@ def get_mm_mapping(self) -> MultiModelKeys: tower_model="transformer.visual.transformer") -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen) -@MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen) -@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen) +@MULTIMODAL_REGISTRY.register_processor(QWenVLMultiModalProcessor, + info=QWenVLProcessingInfo, + dummy_inputs=QWenVLDummyInputsBuilder) class QWenLMHeadModel(QWenBaseModel, SupportsMultiModal, SupportsLoRA): """ QWenLMHeadModel is not only applicable to LLM but also to VL, which is not @@ -1084,7 +1138,7 @@ def __new__( cls, vllm_config: VllmConfig, prefix: str = "", - ) -> None: + ) -> QWenBaseModel: config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 51510eaeeb9f4..dbdefda8ae553 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -257,7 +257,15 @@ def forward( return hidden_states, residual -@support_torch_compile +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). 
+ "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + }) class Qwen2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 0dff9595c6c08..fc5aed5c94abb 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -41,7 +41,8 @@ from vllm.multimodal.parse import (AudioProcessorItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - BaseProcessingInfo, PromptReplacement) + BaseProcessingInfo, PromptReplacement, + PromptReplacementDetails) from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -153,29 +154,24 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, Any], ) -> BatchFeature: - mm_data = dict(mm_data) - audios = mm_data.pop("audios", []) - - if audios: - mm_data["audios"] = audios - - feature_extractor = self.info.get_feature_extractor(**mm_kwargs) - mm_kwargs = dict( - **mm_kwargs, - sampling_rate=feature_extractor.sampling_rate, - ) - else: - # NOTE: WhisperFeatureExtractor cannot handle empty list of audios - pass + # Text-only input not supported in composite processor + if not mm_data or not mm_data.get("audios", []): + prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + feature_extractor = self.info.get_feature_extractor(**mm_kwargs) + mm_kwargs = dict( + **mm_kwargs, + sampling_rate=feature_extractor.sampling_rate, + ) - processed_outputs = super()._call_hf_processor( + return super()._call_hf_processor( prompt=prompt, mm_data=mm_data, mm_kwargs=mm_kwargs, ) - return processed_outputs - def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -192,8 +188,20 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.info.get_hf_config() - placeholder = hf_config.audio_token_index + processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + # Use getattr with default to be compatible with transformers<4.48 + audio_token = getattr(processor, "audio_token", "<|AUDIO|>") + audio_bos_token = getattr(processor, "audio_bos_token", + "<|audio_bos|>") + audio_eos_token = getattr(processor, "audio_eos_token", + "<|audio_eos|>") + + audio_token_id = vocab[audio_token] + audio_bos_id = vocab[audio_bos_token] + audio_eos_id = vocab[audio_eos_token] feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") if feature_attention_mask is None: @@ -206,20 +214,25 @@ def _get_prompt_replacements( audio_output_lengths = audio_output_lens.tolist() def get_replacement_qwen2_audio(item_idx: int): - num_placeholders = audio_output_lengths[item_idx] - if num_placeholders == 0: + num_features = audio_output_lengths[item_idx] + if num_features == 0: audios = mm_items.get_items("audio", AudioProcessorItems) audio = audios.get(item_idx) raise ValueError( f"The audio {audio} (len={len(audio)}) is too short " "to be represented inside the model") - return [placeholder] * num_placeholders + audio_tokens = [audio_token_id] * num_features + + return PromptReplacementDetails( + full=[audio_bos_id] + audio_tokens + 
[audio_eos_id], + features=audio_tokens, + ) return [ PromptReplacement( modality="audio", - target=[placeholder], + target=audio_token, replacement=get_replacement_qwen2_audio, ) ] diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 988d682d36be3..593ce4857af0f 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -12,7 +12,7 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput @@ -32,7 +32,7 @@ def forward(self, input): return self.activation(input) -class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): +class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -60,7 +60,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -74,14 +73,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.hidden_size, quant_config=quant_config), ReLU(), - RowParallelLinear(config.hidden_size, 1, + RowParallelLinear(config.hidden_size, + config.num_labels, quant_config=quant_config), ) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.ALL, - normalize=False, - softmax=False) + self._pooler: SimplePooler self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -115,3 +111,31 @@ def load_weights(self, weights: Iterable[Tuple[str, loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) + + +class Qwen2ForRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 1 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.ALL, + normalize=False, + softmax=False) + + +class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): + + def __init__(self, *, vllm_config, prefix=""): + vllm_config.model_config.hf_config.num_labels = 2 + super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.STEP, + normalize=False, + softmax=True, + step_tag_id=151651, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index d00e5d362c8bc..a2778ee73810e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -55,7 +55,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalFieldConfig, MultiModalKwargs, - NestedTensors, VideoItem) + VideoItem) from vllm.multimodal.parse import (ImageSize, ModalityDataItems, MultiModalDataItems, MultiModalDataParser) from vllm.multimodal.processing import 
(BaseMultiModalProcessor, @@ -67,11 +67,15 @@ from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, - init_vllm_registered_model, maybe_prefix) + init_vllm_registered_model, maybe_prefix, + merge_multimodal_embeddings) from .vision import get_vit_attn_backend logger = init_logger(__name__) +# For profile run +_MAX_FRAMES_PER_VIDEO = 16 + # === Vision Inputs === # @@ -135,7 +139,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): - List[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features - (concatenation of all videos' feature tensors). + (concatenation of all videos' feature tensors). Tensor shape: `(num_image_features, hidden_size)` - `num_image_features` varies based on @@ -611,6 +615,7 @@ def forward( # adapter x = self.merger(x) + return x def load_weights(self, weights: Iterable[Tuple[str, @@ -874,8 +879,8 @@ def get_num_frames_with_most_features(self, seq_len: int) -> int: max_image_tokens = self.get_max_image_tokens() * max_images max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) - - num_frames = max(max_total_frames // max(max_videos, 1), 1) + num_frames = min(max(max_total_frames // max(max_videos, 1), 1), + _MAX_FRAMES_PER_VIDEO) # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 if num_frames > 1 and num_frames % 2 == 1: @@ -948,26 +953,29 @@ def _get_prompt_replacements( hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) image_processor = self.info.get_image_processor( **hf_processor_mm_kwargs) + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered placeholder = { - "image": hf_processor.image_token, - "video": hf_processor.video_token, + "image": vocab[hf_processor.image_token], + "video": vocab[hf_processor.video_token], } + merge_length = image_processor.merge_size**2 def get_replacement_qwen2vl(item_idx: int, modality: str): grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] assert isinstance(grid_thw, torch.Tensor) - num_tokens = grid_thw.prod() // merge_length - return placeholder[modality] * num_tokens + num_tokens = int(grid_thw.prod()) // merge_length + return [placeholder[modality]] * num_tokens return [ PromptReplacement( modality=modality, - target=placeholder[modality], + target=[placeholder[modality]], replacement=partial(get_replacement_qwen2vl, modality=modality), ) for modality in ("image", "video") @@ -1047,11 +1055,8 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config: Qwen2VLConfig = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config - assert not cache_config.enable_prefix_caching, \ - "Qwen2-VL currently does not support prefix caching" self.config = config self.multimodal_config = multimodal_config @@ -1173,85 +1178,122 @@ def _parse_and_validate_video_input( video_embeds=video_embeds, video_grid_thw=video_grid_thw) - def _process_image_input(self, - image_input: Qwen2VLImageInputs) -> torch.Tensor: + def _process_image_input( + self, image_input: Qwen2VLImageInputs) -> tuple[torch.Tensor, ...]: + + grid_thw = image_input["image_grid_thw"] + assert 
grid_thw.ndim == 2 + if image_input["type"] == "image_embeds": - return image_input["image_embeds"].type(self.visual.dtype) + image_embeds = image_input["image_embeds"].type(self.visual.dtype) + else: + pixel_values = image_input["pixel_values"].type(self.visual.dtype) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + + # Split concatenated embeddings for each image item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size + + return image_embeds.split(sizes.tolist()) + + def _process_video_input( + self, video_input: Qwen2VLVideoInputs) -> tuple[torch.Tensor, ...]: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, - grid_thw=image_input["image_grid_thw"]) - return image_embeds + grid_thw = video_input["video_grid_thw"] + assert grid_thw.ndim == 2 - def _process_video_input(self, - video_input: Qwen2VLVideoInputs) -> torch.Tensor: if video_input["type"] == "video_embeds": - return video_input["video_embeds"].type(self.visual.dtype) + video_embeds = video_input["video_embeds"].type(self.visual.dtype) + else: + pixel_values_videos = video_input["pixel_values_videos"].type( + self.visual.dtype) + video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) - pixel_values_videos = video_input["pixel_values_videos"].type( - self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, - grid_thw=video_input["video_grid_thw"]) - return video_embeds + # Split concatenated embeddings for each video item. + merge_size = self.visual.spatial_merge_size + sizes = grid_thw.prod(-1) // merge_size // merge_size - def _merge_multimodal_embeddings( - self, - input_ids: torch.Tensor, - inputs_embeds: torch.Tensor, - multimodal_embeddings: torch.Tensor, - placeholder_token_id: int, - ) -> torch.Tensor: - mask = (input_ids == placeholder_token_id) - inputs_embeds[mask, :] = multimodal_embeddings - return inputs_embeds + return video_embeds.split(sizes.tolist()) + + def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict: + modalities = {} + + # Preserve the order of modalities if there are multiple of them + # from the order of kwargs. + for input_key in kwargs: + if input_key in ("pixel_values", + "image_embeds") and "images" not in modalities: + modalities["images"] = self._parse_and_validate_image_input( + **kwargs) + if input_key in ("pixel_values_videos", + "video_embeds") and "videos" not in modalities: + modalities["videos"] = self._parse_and_validate_video_input( + **kwargs) + + return modalities def get_multimodal_embeddings( - self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]: + self, **kwargs) -> Optional[tuple[torch.Tensor, ...]]: - image_input = self._parse_and_validate_image_input(**kwargs) - video_input = self._parse_and_validate_video_input(**kwargs) - if image_input is None and video_input is None: + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: return None - # We make a tuple of each embedding with its modality string. This is a - # temporary workaround for models to handle mixed modalities when - # get_multimodal_embeddings and get_input_embeddings are called - # separately. - # TODO(ywang96): Add support for mixed-modality inference for v1. 
- multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] - - if image_input is not None: - image_embeds = self._process_image_input(image_input) - multimodal_embeddings.append((image_embeds, "image")) - if video_input is not None: - video_embeds = self._process_video_input(video_input) - multimodal_embeddings.append((video_embeds, "video")) + # The resulting multimodal_embeddings is a tuple of tensors, with each + # tensor corresponding to a multimodal data item (image or video). + multimodal_embeddings: tuple[torch.Tensor, ...] = () + + # NOTE: It is important to iterate over the keys in this dictionary + # to preserve the order of the modalities. + for modality in modalities: + if modality == "images": + image_input = modalities["images"] + vision_embeddings = self._process_image_input(image_input) + multimodal_embeddings += vision_embeddings + if modality == "videos": + video_input = modalities["videos"] + video_embeddings = self._process_video_input(video_input) + multimodal_embeddings += video_embeddings return multimodal_embeddings def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings: Optional[List[Tuple[NestedTensors, - str]]] = None, + multimodal_embeddings: Optional[tuple[torch.Tensor, ...]] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) if multimodal_embeddings is not None: - for embeddings, modality in multimodal_embeddings: - if modality == "image": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.image_token_id, - ) - if modality == "video": - inputs_embeds = self._merge_multimodal_embeddings( - input_ids, - inputs_embeds, - embeddings, - placeholder_token_id=self.config.video_token_id, - ) + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + [self.config.image_token_id, self.config.video_token_id]) + return inputs_embeds + + def get_input_embeddings_v0( + self, + input_ids: torch.Tensor, + image_input: Optional[tuple[torch.Tensor, ...]] = None, + video_input: Optional[tuple[torch.Tensor, ...]] = None, + ) -> torch.Tensor: + + inputs_embeds = self.get_input_embeddings(input_ids) + if image_input is not None: + image_embeds = self._process_image_input(image_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + image_embeds, + placeholder_token_id=self.config.image_token_id, + ) + + if video_input is not None: + video_embeds = self._process_video_input(video_input) + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + video_embeds, + placeholder_token_id=self.config.video_token_id, + ) return inputs_embeds def forward( @@ -1287,22 +1329,25 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None - # NOTE: In v1, inputs_embeds is always generated at model runner, this - # condition is for v0 compatibility. + # NOTE: In v1, inputs_embeds is always generated at the model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`; this + # condition is only for v0 compatibility. elif inputs_embeds is None: - multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) - - # We need to check for usage of mrope here in case there is - # multimodal data. - # TODO (ywang96): move this to model runner in V1. 
- if multimodal_embeddings is not None and uses_mrope(self.config): - assert positions.ndim == 2 and positions.size(0) == 3, ( - "multimodal section rotary embedding requires " - f"(3, seq_len) positions, but got {positions.size()}") - - inputs_embeds = self.get_input_embeddings(input_ids, - multimodal_embeddings) - input_ids = None + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + + if image_input is None and video_input is None: + inputs_embeds = None + else: + if uses_mrope(self.config): + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}") + inputs_embeds = self.get_input_embeddings_v0( + input_ids, + image_input=image_input, + video_input=video_input) + input_ids = None hidden_states = self.language_model.model( input_ids=input_ids, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a71f7f7029c7d..de05bf2b772f5 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -47,6 +47,7 @@ "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), + "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), @@ -126,6 +127,7 @@ "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForProcessRewardModel": ("qwen2_rm", "Qwen2ForProcessRewardModel"), "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 @@ -160,6 +162,7 @@ "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 "MantisForConditionalGeneration": ("llava", "MantisForConditionalGeneration"), # noqa: E501 + "MiniCPMO": ("minicpmo", "MiniCPMO"), "MiniCPMV": ("minicpmv", "MiniCPMV"), "MolmoForCausalLM": ("molmo", "MolmoForCausalLM"), "NVLM_D": ("nvlm_d", "NVLM_D_Model"), @@ -460,7 +463,8 @@ def is_hybrid_model( ModelRegistry = _ModelRegistry({ - model_arch: _LazyRegisteredModel( + model_arch: + _LazyRegisteredModel( module_name=f"vllm.model_executor.models.{mod_relname}", class_name=cls_name, ) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index cca42842bc06e..1e51018973e8c 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -344,10 +344,14 @@ def __init__( self.config = config self.activation_fn = get_act_fn(config.hidden_act) - - # For quantization, we require the hidden size to be a multiple of 64 - quantizable = (config.hidden_size % 64 == 0 - and config.intermediate_size % 64 == 0) + # Special handling for BNB quantization + if quant_config and quant_config.get_name() == "bitsandbytes": + quantizable = True + else: + # For other quantization, we require the hidden size to be a + # multiple of 64 + quantizable = (config.hidden_size % 64 == 0 + and config.intermediate_size % 64 == 
0) self.fc1 = ColumnParallelLinear( config.hidden_size, config.intermediate_size, diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 37c5a4b5713b8..e6d919f23c85d 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -30,8 +30,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -44,9 +43,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) + default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -535,32 +533,3 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params - - # If this function is called, it should always initialize KV cache scale - # factors (or else raise an exception). Thus, handled exceptions should - # make sure to leave KV cache scale factors in a known good (dummy) state - def load_kv_cache_scales(self, quantization_param_path: str) -> None: - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - for layer_idx, scaling_factor in kv_cache_scales_loader( - quantization_param_path, - tp_rank, - tp_size, - self.config.num_hidden_layers, - self.config.__class__.model_type, - ): - if not isinstance(self.model.layers[layer_idx], nn.Identity): - layer_self_attn = self.model.layers[layer_idx].self_attn - - if current_platform.is_rocm(): - # The scaling factor convention we are assuming is - # quantized_value * scaling_factor ~= true_value - # which is consistent with the practice of setting - # scaling_factor = tensor_amax / FPtype_max - scaling_factor *= 2 - if hasattr(layer_self_attn.attn, "_k_scale"): - layer_self_attn.attn._k_scale = scaling_factor - layer_self_attn.attn._v_scale = scaling_factor - else: - raise RuntimeError("Self attention has no KV cache scaling " - "factor attribute!") diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 587f18ccaf98f..605a0ecf4e0a9 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -137,7 +137,7 @@ def _call_hf_processor( mm_kwargs: Mapping[str, object], ) -> BatchFeature: # Text-only input not supported in composite processor - if not mm_data: + if not mm_data or not mm_data.get("audios", []): prompt_ids = self.info.get_tokenizer().encode(prompt) prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") @@ -146,13 +146,6 @@ def _call_hf_processor( audios = mm_data.pop("audios", []) assert isinstance(audios, list) - if not audios: - return 
super()._call_hf_processor( - prompt=prompt, - mm_data=mm_data, - mm_kwargs=mm_kwargs, - ) - feature_extractor = self.info.get_feature_extractor() mm_kwargs = dict( **mm_kwargs, @@ -212,11 +205,15 @@ def _get_prompt_replacements( out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - placeholder = hf_processor.audio_token_replacement # type: ignore + tokenizer = self.info.get_tokenizer() + vocab = tokenizer.get_vocab() + + replacement_id = vocab[ + hf_processor.audio_token_replacement] # type: ignore def get_replacement_ultravox(item_idx: int): audio_token_len = out_mm_kwargs["audio_token_len"][item_idx] - return placeholder * audio_token_len + return [replacement_id] * int(audio_token_len) # type: ignore return [ PromptReplacement( @@ -336,10 +333,10 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor, - info=UltravoxProcessingInfo, - dummy_inputs=UltravoxDummyInputsBuilder - ) +@MULTIMODAL_REGISTRY.register_processor( + UltravoxMultiModalProcessor, + info=UltravoxProcessingInfo, + dummy_inputs=UltravoxDummyInputsBuilder) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 43b3c973c97b8..01a232fdc76de 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -599,9 +599,8 @@ def make_empty_intermediate_tensors( device: torch.device, ) -> IntermediateTensors: return IntermediateTensors({ - key: torch.zeros((batch_size, hidden_size), - dtype=dtype, - device=device) + key: + torch.zeros((batch_size, hidden_size), dtype=dtype, device=device) for key in keys }) diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index a1395982af44c..57166f05cd9bf 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -82,23 +82,25 @@ def get_vit_attn_backend(support_fa: bool = False) -> _Backend: if backend_by_env_var is not None: selected_backend = backend_name_to_enum(backend_by_env_var) if selected_backend is None: - # For Volta and Turing GPUs, use xformers instead. - device_available = current_platform.has_device_capability(80) - if device_available and support_fa: - from transformers.utils import is_flash_attn_2_available - if is_flash_attn_2_available(): - selected_backend = _Backend.FLASH_ATTN + if current_platform.is_cuda(): + device_available = current_platform.has_device_capability(80) + if device_available and support_fa: + from transformers.utils import is_flash_attn_2_available + if is_flash_attn_2_available(): + selected_backend = _Backend.FLASH_ATTN + else: + logger.warning_once( + "Current `vllm-flash-attn` has a bug inside vision " + "module, so we use xformers backend instead. You can " + "run `pip install flash-attn` to use flash-attention " + "backend.") + selected_backend = _Backend.XFORMERS else: - logger.warning_once( - "Current `vllm-flash-attn` has a bug inside vision module, " - "so we use xformers backend instead. You can run " - "`pip install flash-attn` to use flash-attention backend.") + # For Volta and Turing GPUs, use xformers instead. 
selected_backend = _Backend.XFORMERS - elif current_platform.is_cpu() or current_platform.is_rocm(): - # ROCM doesn't support xformers - selected_backend = _Backend.TORCH_SDPA else: - selected_backend = _Backend.XFORMERS + # Default to torch SDPA for other non-GPU platforms. + selected_backend = _Backend.TORCH_SDPA return selected_backend diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index c1f3bb0ca33c2..15e35fa9cd2c9 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -729,7 +729,22 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) - loaded_weights = [(name, loaded_weight) - for name, loaded_weight in weights] mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}) - return loader.load_weights(loaded_weights, mapper=mapper) + # add fake zeros bias for k_proj to state_dict + weights = _create_fake_bias_for_k_proj(weights) + return loader.load_weights(weights, mapper=mapper) + + +def _create_fake_bias_for_k_proj( + weights: Iterable[Tuple[str, torch.Tensor]] +) -> Iterable[Tuple[str, torch.Tensor]]: + """ + Create full zeros bias for k_proj weight in self-attention layers. + So that the bias for k_proj in qkv_proj can be initialized with zeros. + """ + for name, weight in weights: + if name.endswith(".self_attn.k_proj.weight"): + bias = torch.zeros(weight.size(0)) + bias_name = name.replace("weight", "bias") + yield from [(name, weight), (bias_name, bias)] + yield name, weight diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py index 837960fd8d598..37d7cc66b70db 100644 --- a/vllm/model_executor/sampling_metadata.py +++ b/vllm/model_executor/sampling_metadata.py @@ -168,7 +168,8 @@ def prepare( pin_memory=pin_memory, ) categorized_sample_indices = { - t: async_tensor_h2d( + t: + async_tensor_h2d( seq_ids, dtype=torch.int, target_device=device, @@ -200,8 +201,12 @@ def _prepare_seq_groups( device: str, generators: Optional[Dict[str, torch.Generator]] = None, cache: Optional[SamplingMetadataCache] = None, -) -> Tuple[List[SequenceGroupToSample], List[int], Dict[SamplingType, - List[int]], int, ]: +) -> Tuple[ + List[SequenceGroupToSample], + List[int], + Dict[SamplingType, List[int]], + int, +]: """Prepare sequence groups and indices for sampling. 
Args: diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 4b63703585214..b35184f6855ab 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -491,7 +491,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: """ -class MultiModalInputsV2(TypedDict): +class MultiModalInputs(TypedDict): """ Represents the outputs of :class:`vllm.multimodal.processing.BaseMultiModalProcessor`, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index fa199a07b4cf8..750646ac6e431 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,7 +1,8 @@ import re from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence +from collections.abc import (Callable, Generator, ItemsView, Iterable, Mapping, + Sequence) from dataclasses import dataclass, field from functools import lru_cache from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol, @@ -18,8 +19,8 @@ from .hasher import MultiModalHasher from .inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalInputsV2, MultiModalKwargs, - MultiModalKwargsItem, PlaceholderRange) + MultiModalInputs, MultiModalKwargs, MultiModalKwargsItem, + PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser if TYPE_CHECKING: @@ -28,23 +29,101 @@ logger = init_logger(__name__) _S = TypeVar("_S", str, list[int]) -_PromptSeq = Union[str, list[int]] + +PromptSeq = Union[str, list[int]] +"""A token sequence (list of token IDs) or text.""" + + +@dataclass +class PromptReplacementDetails: + """Details about the replacement token sequence or text.""" + + full: PromptSeq + """The full replacement.""" + + features: PromptSeq + """ + The part of the replacement that corresponds to feature placeholders; + this will be replaced by the output of the vision encoder during model + inference. + """ + + @staticmethod + def from_seq(seq: PromptSeq) -> "PromptReplacementDetails": + return PromptReplacementDetails(full=seq, features=seq) + + +PromptRepl = Union[PromptSeq, PromptReplacementDetails] +""" +The replacement token sequence or text. + +If only part of the replacement corresponds to feature placeholders, you can +use :class:`PromptReplacementDetails` to specify which part. +""" @dataclass class PromptReplacement: """ Defines how to replace portions of an input prompt with placeholder tokens. + + Example: + + For each image, replace one ``<image>`` input placeholder in the prompt + with a number of ``<image>`` feature placeholders + equal to the feature size of the vision encoder: + + .. code-block:: python + + PromptReplacement( + modality="image", + target="<image>", + replacement="<image>" * image_feature_size, + ) + + As above, but further pad the feature placeholders with ``<image_bos>`` + and ``<image_eos>``, which are not supposed to be passed to the vision + encoder: + + .. code-block:: python + + PromptReplacement( + modality="image", + target="<image>", + replacement=PromptReplacementDetails( + full="".join([ + "<image_bos>", + "<image>" * image_feature_size, + "<image_eos>", + ]), + features="<image>" * image_feature_size, + ), + ) + + To avoid unnecessary tokenization during prompt replacement, + we recommend passing token sequences instead of text: + + .. 
code-block:: python + + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=PromptReplacementDetails( + full=([image_bos_id] + [image_token_id] * image_feature_size + + [image_eos_id]), + features=[image_token_id] * image_feature_size, + ), + ) """ modality: str """The modality for which the replacement is made.""" - target: _PromptSeq + target: PromptSeq """The token sequence (or text) to find and replace.""" - replacement: Union[Callable[[int], _PromptSeq], - _PromptSeq] = field(repr=False) + replacement: Union[Callable[[int], PromptRepl], + PromptRepl] = field(repr=False) """ Given the index of the processed item within :attr:`modality`, output the replacement token sequence (or text). @@ -107,11 +186,26 @@ def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]: @dataclass class _BoundPromptSequence: + """ + A :data:`_PromptSeq` bound to a tokenizer to automatically + convert between token sequence and text representations. + """ tokenizer: AnyTokenizer = field(repr=False) _text: Optional[str] _token_ids: Optional[list[int]] + @staticmethod + def from_seq( + tokenizer: AnyTokenizer, + seq: PromptSeq, + ) -> "_BoundPromptSequence": + return _BoundPromptSequence( + tokenizer=tokenizer, + _text=seq if isinstance(seq, str) else None, + _token_ids=seq if isinstance(seq, list) else None, + ) + def __post_init__(self) -> None: if self._text is None and self._token_ids is None: raise ValueError("At least one of 'text' and 'token_ids' must be " @@ -134,6 +228,12 @@ def token_ids(self) -> list[int]: return self._token_ids +@dataclass +class _BoundPromptReplacementGroup: + full: _BoundPromptSequence + features: _BoundPromptSequence + + @dataclass class BoundPromptReplacement: """ @@ -144,25 +244,19 @@ class BoundPromptReplacement: tokenizer: AnyTokenizer = field(repr=False) modality: str - _target: _PromptSeq - _replacement: Union[Callable[[int], _PromptSeq], - _PromptSeq] = field(repr=False) + _target: PromptSeq + _replacement: Union[Callable[[int], PromptRepl], + PromptRepl] = field(repr=False) def __post_init__(self) -> None: - self._replacement_cache = dict[int, _BoundPromptSequence]() + self._replacement_cache = dict[int, _BoundPromptReplacementGroup]() @property def target(self) -> _BoundPromptSequence: """The token sequence (or text) to find and replace.""" - target = self._target + return _BoundPromptSequence.from_seq(self.tokenizer, self._target) - return _BoundPromptSequence( - tokenizer=self.tokenizer, - _text=target if isinstance(target, str) else None, - _token_ids=target if isinstance(target, list) else None, - ) - - def get_replacement(self, item_idx: int) -> _BoundPromptSequence: + def get_replacement(self, item_idx: int) -> _BoundPromptReplacementGroup: """ Given the index of the processed item within :attr:`modality`, output the replacement token sequence (or text). 
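To make the `full` / `features` split above concrete, here is a minimal standalone sketch (an illustration only, not part of this patch; the token IDs are arbitrary placeholders). A bare sequence is promoted via `PromptReplacementDetails.from_seq`, so processors that return plain sequences keep their existing behaviour, while processors that add BOS/EOS framing can exclude it from the feature span:

.. code-block:: python

    from vllm.multimodal.processing import PromptReplacementDetails

    # Plain sequence: full == features, exactly the pre-patch behaviour.
    plain = PromptReplacementDetails.from_seq([32000, 32000])
    assert plain.full == plain.features == [32000, 32000]

    # Framed sequence: BOS/EOS belong to the replacement but are not
    # counted as feature placeholders.
    framed = PromptReplacementDetails(
        full=[32001, 32000, 32000, 32002],
        features=[32000, 32000],
    )
    assert framed.full[1:-1] == framed.features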
@@ -177,10 +271,16 @@ def get_replacement(self, item_idx: int) -> _BoundPromptSequence: else: cache_key = None - bound_replacement = _BoundPromptSequence( - tokenizer=self.tokenizer, - _text=replacement if isinstance(replacement, str) else None, - _token_ids=replacement if isinstance(replacement, list) else None, + if not isinstance(replacement, PromptReplacementDetails): + replacement = PromptReplacementDetails.from_seq(replacement) + + bound_full = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.full) + bound_features = _BoundPromptSequence.from_seq(self.tokenizer, + replacement.features) + bound_replacement = _BoundPromptReplacementGroup( + full=bound_full, + features=bound_features, ) if cache_key is not None: @@ -197,7 +297,7 @@ class _TokenMatch(NamedTuple): def iter_token_matches( token_ids: list[int], match_ids: list[int], -) -> Iterable[_TokenMatch]: +) -> Generator[_TokenMatch]: """ Yield each occurrence of :code:`match_ids` in :code:`token_ids`. @@ -272,15 +372,15 @@ def end_idx(self) -> int: @dataclass -class PlaceholderInfo: +class PlaceholderFeaturesInfo: modality: str item_idx: int start_idx: int - replacement: list[int] + tokens: list[int] @property def length(self) -> int: - return len(self.replacement) + return len(self.tokens) def to_range(self) -> PlaceholderRange: return PlaceholderRange( @@ -314,7 +414,7 @@ def find_text_matches( def _resolve_matches( - prompt: _PromptSeq, + prompt: PromptSeq, mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], ) -> list[_PromptReplacementMatch]: """ @@ -362,10 +462,10 @@ def _replace_matches( replacement = repl_info.get_replacement(item_idx) if isinstance(prompt, str): - repl_seq = replacement.text + repl_seq = replacement.full.text out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) else: - repl_seq = replacement.token_ids + repl_seq = replacement.full.token_ids out_seqs.append(prompt[prev_end_idx:start_idx] + repl_seq) prev_end_idx = end_idx @@ -408,7 +508,7 @@ def _iter_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Iterable[PlaceholderInfo]: +) -> Iterable[PlaceholderFeaturesInfo]: """ Yield each set of placeholder tokens found in :code:`prompt`. 
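Before the `_iter_placeholders` changes in the next hunk, a worked toy example may help: only the feature tokens inside a matched full replacement are recorded as placeholder positions, so any BOS/EOS framing is skipped. This is a pure-Python illustration with made-up token IDs, not the vLLM implementation:

.. code-block:: python

    def find_subsequence(haystack: list[int], needle: list[int]) -> int:
        # Return the first index at which `needle` occurs in `haystack`.
        for i in range(len(haystack) - len(needle) + 1):
            if haystack[i:i + len(needle)] == needle:
                return i
        raise ValueError("needle not found")

    BOS, FEAT, EOS = 101, 102, 103
    prompt = [7, 8, BOS, FEAT, FEAT, EOS, 9]
    full, features = [BOS, FEAT, FEAT, EOS], [FEAT, FEAT]

    start_idx = find_subsequence(prompt, full)                 # 2
    feat_start = start_idx + find_subsequence(full, features)  # 3 (skips BOS)
    placeholder = {"offset": feat_start, "length": len(features)}
    assert placeholder == {"offset": 3, "length": 2}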
@@ -432,23 +532,33 @@ def _iter_placeholders( for repl_info in modality_repls: replacement = repl_info.get_replacement(item_idx) - repl_tokens = replacement.token_ids - repl_len = len(repl_tokens) - end_idx = start_idx + repl_len + repl_tokens_full = replacement.full.token_ids + repl_len_full = len(repl_tokens_full) + end_idx_full = start_idx + repl_len_full - if repl_len == 0 or end_idx > prompt_len: + if repl_len_full == 0 or end_idx_full > prompt_len: continue - if prompt[start_idx:end_idx] == repl_tokens: - yield PlaceholderInfo( - modality=modality, - item_idx=item_idx, - start_idx=start_idx, - replacement=repl_tokens, - ) + if prompt[start_idx:end_idx_full] == repl_tokens_full: + repl_tokens_feat = replacement.features.token_ids + + try: + match = next( + iter_token_matches(repl_tokens_full, + repl_tokens_feat)) + yield PlaceholderFeaturesInfo( + modality=modality, + item_idx=item_idx, + start_idx=start_idx + match.start_idx, + tokens=repl_tokens_feat, + ) + except StopIteration: + raise AssertionError( + f"{repl_tokens_feat=} should be a " + f"subsequence of {repl_tokens_full=}") from None # Exclude overlapping matches - start_idx = end_idx + start_idx = end_idx_full item_idx_by_modality[modality] += 1 found = True break @@ -464,7 +574,7 @@ def find_mm_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], -) -> Mapping[str, list[PlaceholderInfo]]: +) -> Mapping[str, list[PlaceholderFeaturesInfo]]: it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) return dict(full_groupby_modality(it)) @@ -609,7 +719,7 @@ def __call__( prompt: str, mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: return self.apply(prompt, mm_data, hf_processor_mm_kwargs) def _get_data_parser(self) -> MultiModalDataParser: @@ -679,7 +789,7 @@ def _find_mm_placeholders( mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], new_token_ids: list[int], mm_item_counts: Mapping[str, int], - ) -> Mapping[str, list[PlaceholderInfo]]: + ) -> Mapping[str, list[PlaceholderFeaturesInfo]]: return find_mm_placeholders(mm_prompt_repls, new_token_ids, mm_item_counts) @@ -948,7 +1058,7 @@ def _apply_prompt_replacements( token_ids: list[int], mm_prompt_repls: Mapping[str, Sequence[BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, Mapping[str, list[PlaceholderInfo]]]: + ) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]: tokenizer = self.info.get_tokenizer() mm_token_matches = { @@ -1037,7 +1147,7 @@ def _validate_mm_kwargs( def _validate_mm_placeholders( self, - mm_placeholders: Mapping[str, list[PlaceholderInfo]], + mm_placeholders: Mapping[str, list[PlaceholderFeaturesInfo]], mm_item_counts: Mapping[str, int], *, allow_missing: bool = False, @@ -1067,7 +1177,7 @@ def apply( prompt: Union[str, list[int]], mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. 
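The surrounding hunks only rename the processor's return type, so as a quick orientation: `MultiModalInputs` is still a `TypedDict`, i.e. a plain dict at runtime. A hedged sketch follows (only the keys visible in this section are shown; the remaining declared keys are unchanged by the rename and omitted here, which a static type checker would flag):

.. code-block:: python

    from vllm.multimodal.inputs import MultiModalInputs

    # TypedDict construction is ordinary dict construction at runtime.
    mm_inputs = MultiModalInputs(
        type="multimodal",
        prompt="Picture 1: <img></img>\n",
        prompt_token_ids=[1, 2, 3],
    )
    assert isinstance(mm_inputs, dict)
    assert mm_inputs["type"] == "multimodal"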
@@ -1169,7 +1279,7 @@ def apply( for modality, placeholders in mm_placeholders.items() } - return MultiModalInputsV2( + return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index ec580cd6ecddd..c68edaff80167 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -11,7 +11,7 @@ from vllm.inputs import DummyData from vllm.logger import init_logger -from .inputs import MultiModalDataDict, MultiModalInputsV2 +from .inputs import MultiModalDataDict, MultiModalInputs from .processing import BaseMultiModalProcessor, BaseProcessingInfo logger = init_logger(__name__) @@ -106,7 +106,7 @@ def processing_info(self) -> BaseProcessingInfo: def dummy_inputs(self) -> BaseDummyInputsBuilder[_I]: return self.processor.dummy_inputs - def _get_mm_limits(self) -> Mapping[str, int]: + def get_mm_limits(self) -> Mapping[str, int]: mm_config = self.processing_info.ctx.get_mm_config() mm_limit_per_prompt = mm_config.limit_per_prompt @@ -131,7 +131,7 @@ def _get_dummy_mm_inputs( self, seq_len: int, mm_counts: Mapping[str, int], - ) -> MultiModalInputsV2: + ) -> MultiModalInputs: factory = self.dummy_inputs processor_inputs = factory.get_dummy_processor_inputs( seq_len, mm_counts) @@ -146,7 +146,7 @@ def get_dummy_data(self, seq_len: int) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData - mm_counts = self._get_mm_limits() + mm_counts = self.get_mm_limits() info = self.processing_info mm_max_tokens_per_item = info.get_mm_max_tokens_per_item(seq_len) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index aaf7ff34ca573..7a4b85385cac9 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -17,7 +17,7 @@ from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors from .processing import (BaseMultiModalProcessor, BaseProcessingInfo, ProcessingCache) -from .profiling import BaseDummyInputsBuilder +from .profiling import BaseDummyInputsBuilder, MultiModalProfiler from .utils import cached_get_tokenizer from .video import VideoPlugin @@ -282,13 +282,13 @@ def get_max_tokens_per_item_by_nonzero_modality( This is currently directly used only in V1 for profiling the memory usage of a model. """ - limits_per_plugin = self._limits_by_model[model_config] + mm_limits = self.get_mm_limits_per_prompt(model_config) return { key: max_tokens_per_mm_item for key, max_tokens_per_mm_item in self.get_max_tokens_per_item_by_modality(model_config).items() - if limits_per_plugin[key] > 0 + if mm_limits[key] > 0 } def get_max_tokens_by_modality( @@ -304,10 +304,10 @@ def get_max_tokens_by_modality( Note: This should be called after :meth:`init_mm_limits_per_prompt`. """ - limits_per_plugin = self._limits_by_model[model_config] + mm_limits = self.get_mm_limits_per_prompt(model_config) return { - key: limits_per_plugin[key] * max_tokens_per_mm_item + key: mm_limits[key] * max_tokens_per_mm_item for key, max_tokens_per_mm_item in self.get_max_tokens_per_item_by_modality(model_config).items() } @@ -371,6 +371,15 @@ def get_mm_limits_per_prompt( Note: This should be called after :meth:`init_mm_limits_per_prompt`. 
""" + if self.has_processor(model_config): + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) + processor = self.create_processor(model_config, tokenizer) + profiler = MultiModalProfiler(processor) + return profiler.get_mm_limits() + return self._limits_by_model[model_config] def register_processor( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 1c6bbf77b926f..900bed5929b3d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,4 +1,5 @@ from functools import lru_cache +from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING, Optional, TypeVar, Union from urllib.parse import ParseResult, urlparse @@ -26,7 +27,7 @@ if TYPE_CHECKING: from .hasher import MultiModalHashDict - from .inputs import MultiModalPlaceholderDict + from .inputs import MultiModalKwargs, MultiModalPlaceholderDict class MediaConnector: @@ -477,3 +478,39 @@ def merge_and_sort_multimodal_metadata( merged_hashes = None return sorted_modalities, merged_placeholders, merged_hashes + + +def group_mm_inputs_by_modality( + mm_inputs: list["MultiModalKwargs"]) -> list[list["MultiModalKwargs"]]: + """Group consecutive MultiModalKwargs from mm_inputs with the same modality + together into the same list for batching purpose. For MultiModalKwargs with + multiple modalities, put them into their own list. + + Args: + mm_inputs: List of MultiModalKwargs. + + Returns: + list[list[MultiModalKwargs]]: List of list of MultiModalKwargs, each + inner list contains consecutive MultiModalKwargs with same modality, or + one with multimodal modalities. + """ + if not mm_inputs: + return [] + + def modality_group_func(mm_input: "MultiModalKwargs") -> Union[str, int]: + # If the input has multiple modalities, return a id as the unique key + # for the mm_input input. + if len(mm_input.modalities) > 1: + return id(mm_input) + + elif len(mm_input.modalities) == 1: + return list(mm_input.modalities)[0] + + # FIXME(Isotr0py): Modality of mm_input from legacy pipeline is empty, + # this is used to make InternVL with legacy pipeline still work with v1. 
+ else: + return "" + + return [ + list(group) for _, group in groupby(mm_inputs, key=modality_group_func) + ] diff --git a/vllm/outputs.py b/vllm/outputs.py index b519c159b1531..25b2265285d16 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -1,6 +1,6 @@ import time from dataclasses import dataclass -from typing import Dict, Generic, List, Optional +from typing import Dict, Generic, List, MutableSequence, Optional from typing import Sequence as GenericSequence from typing import Union @@ -162,6 +162,26 @@ def new( finished=finished, ) + def add(self, next_output: "RequestOutput") -> None: + """Merge subsequent RequestOutput into this one""" + + self.prompt = next_output.prompt + self.prompt_token_ids = next_output.prompt_token_ids + self.prompt_logprobs = next_output.prompt_logprobs + self.finished |= next_output.finished + + #TODO assuming n == 1 for now + completion = self.outputs[0] + next_completion = next_output.outputs[0] + completion.text += next_completion.text + if not isinstance(completion.token_ids, MutableSequence): + completion.token_ids = list(completion.token_ids) + completion.token_ids.extend(next_completion.token_ids) + if next_completion.logprobs: + assert completion.logprobs is not None + completion.logprobs.extend(next_completion.logprobs) + completion.cumulative_logprob = next_completion.cumulative_logprob + @classmethod def from_seq_group( cls, seq_group: SequenceGroup, use_cache: bool, @@ -172,9 +192,9 @@ def from_seq_group( if seq_group.request_id in seq_id_to_seq_group: group: SequenceGroupBase = seq_id_to_seq_group[ seq_group.request_id] + assembled_seq_group = group.maybe_assemble_group(seq_group) if finished: group.finish_seq(seq_group) - assembled_seq_group = group.maybe_assemble_group(seq_group) if assembled_seq_group is None: return None return cls.from_seq_group(assembled_seq_group, use_cache, diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 88e35e1de1ec5..b09d29bd27241 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -110,6 +110,10 @@ def cpu_platform_plugin() -> Optional[str]: try: from importlib.metadata import version is_cpu = "cpu" in version("vllm") + if not is_cpu: + import platform + is_cpu = platform.machine().lower().startswith("arm") + except Exception: pass @@ -222,8 +226,11 @@ def __getattr__(name: str): global _init_trace _init_trace = "".join(traceback.format_stack()) return _current_platform - else: + elif name in globals(): return globals()[name] + else: + raise AttributeError( + f"No attribute named '{name}' exists in {__name__}.") __all__ = [ diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 8350177b68ade..e4b436edf7588 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -120,13 +120,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.worker_cls == "auto": if scheduler_config.is_multi_step: if envs.VLLM_USE_V1: - raise NotImplementedError + raise NotImplementedError( + "Multi-step scheduling is not supported (and not " + "needed) on VLLM V1. Please launch without " + "--num-scheduler-steps.") else: parallel_config.worker_cls = \ "vllm.worker.multi_step_worker.MultiStepWorker" elif vllm_config.speculative_config: if envs.VLLM_USE_V1: - raise NotImplementedError + raise NotImplementedError( + "Speculative decoding is not yet supported on VLLM V1." 
+ ) else: parallel_config.worker_cls = \ "vllm.spec_decode.spec_decode_worker.create_spec_worker" @@ -139,28 +144,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: else: parallel_config.worker_cls = "vllm.worker.worker.Worker" - world_size = parallel_config.world_size - tensor_parallel_size = parallel_config.tensor_parallel_size - - from vllm.utils import (cuda_device_count_stateless, - update_environment_variables) - - # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers - if "CUDA_VISIBLE_DEVICES" not in os.environ: - update_environment_variables({ - "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size)))) - }) - - cuda_device_count = cuda_device_count_stateless() - # Use confusing message for more common TP-only case. - assert tensor_parallel_size <= cuda_device_count, ( - f"please set tensor_parallel_size ({tensor_parallel_size}) " - f"to less than max local gpu count ({cuda_device_count})") - - assert world_size <= cuda_device_count, ( - f"please ensure that world_size ({world_size}) " - f"is less than than max local gpu count ({cuda_device_count})") - cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 16 diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py index 69c445766b824..344870032d57a 100644 --- a/vllm/platforms/hpu.py +++ b/vllm/platforms/hpu.py @@ -1,7 +1,9 @@ +import os from typing import TYPE_CHECKING, Optional import torch +from vllm import envs from vllm.logger import init_logger from .interface import Platform, PlatformEnum, _Backend @@ -21,7 +23,7 @@ class HpuPlatform(Platform): dispatch_key: str = "HPU" ray_device_key: str = "HPU" device_control_env_var: str = "HABANA_VISIBLE_MODULES" - supported_quantization: list[str] = ["inc"] + supported_quantization: list[str] = ["fp8", "inc", "awq_hpu", "gptq_hpu"] @classmethod def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, @@ -57,6 +59,22 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 128 + if (parallel_config.distributed_executor_backend == 'mp' + and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'): + if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD", + None) is not None: + logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " + "might cause application hangs on exit. Using " + "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, " + "as it was explicitly requested.") + else: + logger.warning( + "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork " + "might cause application hangs on exit. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "To override that behavior, please set " + "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @classmethod def is_pin_memory_available(cls): diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py index ead3dab05a6b1..23a7126fb05cf 100644 --- a/vllm/platforms/neuron.py +++ b/vllm/platforms/neuron.py @@ -38,8 +38,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: if parallel_config.world_size > 1: parallel_config.distributed_executor_backend = "uni" - assert (vllm_config.lora_config is - None), "LoRA is not supported for Neuron backend." + assert (vllm_config.lora_config + is None), "LoRA is not supported for Neuron backend." assert (not vllm_config.speculative_config ), "Speculative decoding not yet supported for Neuron backend." 
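The stricter fallback added to vllm/platforms/__init__.py above relies on the module-level `__getattr__` hook (PEP 562). A minimal sketch of the pattern under assumed names (`platform_registry`, `current_platform`, `_detect_platform` are illustrative, not the real implementation):

```python
# platform_registry.py -- hypothetical module showing the PEP 562 hook:
# unknown names raise a clear AttributeError instead of leaking a KeyError.
_CACHED_PLATFORM = None

def _detect_platform() -> str:
    return "cpu"  # placeholder detection logic

def __getattr__(name: str):
    # Called only when normal module attribute lookup fails.
    global _CACHED_PLATFORM
    if name == "current_platform":
        if _CACHED_PLATFORM is None:
            _CACHED_PLATFORM = _detect_platform()
        return _CACHED_PLATFORM
    raise AttributeError(f"No attribute named {name!r} exists in {__name__}.")
```

With this in place, `platform_registry.current_platform` resolves lazily, while a typo such as `platform_registry.curent_platform` fails loudly and `hasattr()` keeps its usual semantics.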
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index ff54174f634af..a78a054917756 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,6 +1,9 @@ import logging +import os from typing import Callable, Dict +import torch + import vllm.envs as envs logger = logging.getLogger(__name__) @@ -51,6 +54,26 @@ def load_general_plugins(): if plugins_loaded: return plugins_loaded = True + + # some platform-specific configurations + from vllm.platforms import current_platform + + if current_platform.is_xpu(): + # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158 + torch._dynamo.config.disable = True + elif current_platform.is_hpu(): + # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1) + # does not support torch.compile + # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for + # torch.compile support + is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1' + if is_lazy: + torch._dynamo.config.disable = True + # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only) + # requires enabling lazy collectives + # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501 + os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true' + plugins = load_plugins_by_group(group='vllm.general_plugins') # general plugins, we only need to execute the loaded functions for func in plugins.values(): diff --git a/vllm/profiler/layerwise_profile.py b/vllm/profiler/layerwise_profile.py index 33babfebdca1e..29c0edd0ee535 100644 --- a/vllm/profiler/layerwise_profile.py +++ b/vllm/profiler/layerwise_profile.py @@ -1,7 +1,7 @@ import copy from collections import defaultdict from dataclasses import asdict, dataclass, field -from typing import Callable, Dict, List, Optional, Tuple, TypeAlias, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, TypeAlias, Union import pandas as pd from torch._C._autograd import DeviceType, _KinetoEvent, _ProfilerResult @@ -128,7 +128,7 @@ def export_summary_stats_table_csv(self, filename: str): ]) df.to_csv(filename) - def convert_stats_to_dict(self) -> str: + def convert_stats_to_dict(self) -> dict[str, Any]: return { "metadata": { "num_running_seqs": self.num_running_seqs @@ -227,7 +227,7 @@ def _total_cuda_time(self): [self._cumulative_cuda_time(root) for root in self._module_tree]) def _build_stats_trees(self): - summary_dict: Dict[str, self.StatsTreeNode] = {} + summary_dict: Dict[str, _StatsTreeNode] = {} total_cuda_time = self._total_cuda_time() def pct_cuda_time(cuda_time_us): diff --git a/vllm/prompt_adapter/utils.py b/vllm/prompt_adapter/utils.py index 473b87c89c21d..8b2732923c4e7 100644 --- a/vllm/prompt_adapter/utils.py +++ b/vllm/prompt_adapter/utils.py @@ -89,6 +89,7 @@ def load_peft_weights(model_id: str, adapters_weights = safe_load_file(filename, device=device) else: adapters_weights = torch.load(filename, - map_location=torch.device(device)) + map_location=torch.device(device), + weights_only=True) return adapters_weights diff --git a/vllm/scalar_type.py b/vllm/scalar_type.py index 9d711b0debcd8..20063a5b4b085 100644 --- a/vllm/scalar_type.py +++ b/vllm/scalar_type.py @@ -121,8 +121,8 @@ def _raw_min(self) -> Union[int, float]: min_raw = max_raw | sign_bit_double return struct.unpack('!d', struct.pack('!Q', min_raw))[0] else: - assert (not self.is_signed() or - self.size_bits <= 64), "Cannot represent min as a int64_t" + assert (not self.is_signed() or self.size_bits + <= 64), "Cannot represent min as a int64_t" if 
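The `weights_only=True` change to `load_peft_weights` above restricts `torch.load` to a safe unpickler that only reconstructs tensors and basic containers, so loading an untrusted checkpoint cannot execute arbitrary pickled code. A hedged sketch with a hypothetical checkpoint path:

```python
import torch

# Hypothetical adapter checkpoint written by a trusted trainer.
state = {"prompt_embedding": torch.randn(8, 4096)}
torch.save(state, "adapter_model.bin")

# weights_only=True: a restricted unpickler that only allows tensors and
# plain containers, so a malicious pickle cannot run code at load time.
adapters_weights = torch.load("adapter_model.bin",
                              map_location=torch.device("cpu"),
                              weights_only=True)
print(adapters_weights["prompt_embedding"].shape)  # torch.Size([8, 4096])
```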
self.is_signed(): return -(1 << (self.size_bits - 1)) diff --git a/vllm/scripts.py b/vllm/scripts.py index 42e1c639eda10..8101e6b3af7ee 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -167,6 +167,7 @@ def main(): "Must be a YAML with the following options:" "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference" ) + serve_parser = make_arg_parser(serve_parser) serve_parser.set_defaults(dispatch_function=serve) diff --git a/vllm/sequence.py b/vllm/sequence.py index c657a909fdea4..e7df30e3aa784 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -839,7 +839,9 @@ def set_finished_time(self, time: Optional[float]) -> None: def get_max_num_running_seqs(self) -> int: """The maximum number of sequences running in parallel in the remaining lifetime of the request.""" - return 0 if self.first_seq.is_finished() else 1 + if self.is_single_seq: + return 0 if self.first_seq.is_finished() else 1 + return self.num_seqs() - self.num_finished_seqs() def get_seqs( self, @@ -848,7 +850,10 @@ def get_seqs( if status is None: return self.seqs - return self.seqs if self.first_seq.status == status else [] + if self.is_single_seq: + return self.seqs if self.first_seq.status == status else [] + + return [seq for seq in self.seqs if seq.status == status] def is_encoder_decoder(self) -> bool: return self.encoder_seq is not None @@ -857,19 +862,22 @@ def get_encoder_seq(self) -> Optional[Sequence]: return self.encoder_seq def get_finished_seqs(self) -> List[Sequence]: - return self.seqs if self.first_seq.is_finished() else [] + if self.is_single_seq: + return self.seqs if self.first_seq.is_finished() else [] + + return [seq for seq in self.seqs if seq.is_finished()] def update_num_computed_tokens(self, num_new_computed_tokens: int): """Update number of tokens computed so far.""" - seq = self.first_seq - if not seq.is_finished(): - seq.data.update_num_computed_tokens(num_new_computed_tokens) + for seq in self.seqs: + if not seq.is_finished(): + seq.data.update_num_computed_tokens(num_new_computed_tokens) def get_num_uncomputed_tokens(self) -> int: num_uncomputed_tokens = 0 - seq = self.first_seq - if not seq.is_finished(): - num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() + for seq in self.seqs: + if not seq.is_finished(): + num_uncomputed_tokens += seq.data.get_num_uncomputed_tokens() return num_uncomputed_tokens def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: @@ -884,10 +892,14 @@ def num_seqs(self, status: Optional[SequenceStatus] = None) -> int: return len(self.get_seqs(status)) def num_finished_seqs(self) -> int: - return 1 if self.first_seq.is_finished() else 0 + if self.is_single_seq: + return 1 if self.seqs[0].is_finished() else 0 + return len(self.get_finished_seqs()) def is_finished(self) -> bool: - return self.first_seq.is_finished() + if self.is_single_seq: + return self.first_seq.is_finished() + return all(seq.is_finished() for seq in self.seqs) def is_prefill(self) -> bool: return self.first_seq.is_prefill() @@ -1416,13 +1428,15 @@ class ParallelSampleSequenceGroup(SequenceGroupBase): @staticmethod def add_request(request_id: str, engine, params, **kwargs): original_params = params - params = original_params.clone() - params.n = 1 group = ParallelSampleSequenceGroup(request_id) seqs = [] for i in range(original_params.n): request_id_i = f"{request_id}_parallel_sample_{i}" group.seq_id_to_index[request_id_i] = i + params = copy.deepcopy(original_params) + params.n = 1 + if params.seed is not None: + params.seed += i seq_group = 
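The `ParallelSampleSequenceGroup.add_request` change above fans a single `n`-sample request out into `n` single-sample requests, giving each copy its own seed so seeded sampling still produces distinct outputs. A minimal sketch of that fan-out, using a hypothetical `Params` dataclass in place of `SamplingParams`:

```python
import copy
from dataclasses import dataclass
from typing import List, Optional, Tuple

@dataclass
class Params:          # hypothetical stand-in for SamplingParams
    n: int = 4
    seed: Optional[int] = 1234

def fan_out(request_id: str,
            original_params: Params) -> List[Tuple[str, Params]]:
    sub_requests = []
    for i in range(original_params.n):
        params = copy.deepcopy(original_params)
        params.n = 1                     # each child samples exactly once
        if params.seed is not None:
            params.seed += i             # distinct seed per child request
        sub_requests.append((f"{request_id}_parallel_sample_{i}", params))
    return sub_requests

for rid, p in fan_out("req-0", Params()):
    print(rid, p.seed)  # seeds 1234, 1235, 1236, 1237
```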
engine._add_processed_request( request_id_i, params=params, @@ -1457,33 +1471,34 @@ def maybe_assemble_group( self, seq_group: SequenceGroup) -> Optional[SequenceGroup]: # in the streaming mode, we will return the assembled sequence - # for the first sequence, and then return None for the rest of - # sequences + # for the first remaining sequence, and then return None for the + # rest of sequences if self.streaming: - if self.seq_id_to_index[seq_group.request_id] == 0: + first_remaining_id = next(iter(self.to_be_finished)) + if seq_group.request_id == first_remaining_id: return self.assembled_seq_group return None # in the non-streaming mode, we will return the assembled sequence - # once after all sequences finish, and then return None for the + # when the last sequences finishes, and then return None for the # rest of the time - - if len(self.to_be_finished) > 0: - return None - - assert self.assembled_seq_group is not None - params = self.assembled_seq_group.sampling_params - assert isinstance(params, SamplingParams) - if not self.output_produced: - self.output_produced = True - if params._real_n is not None: - # Get the top-n sequences. - n = params._real_n or params.n - seqs = self.assembled_seq_group.seqs - sorting_key = lambda seq: seq.get_cumulative_logprob() - sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) - top_n_seqs = sorted_seqs[:n] - self.assembled_seq_group.seqs = top_n_seqs - return self.assembled_seq_group - if self.output_produced: - return None + if (len(self.to_be_finished) == 1 + and seq_group.request_id in self.to_be_finished + and seq_group.is_finished()): + assert self.assembled_seq_group is not None + params = self.assembled_seq_group.sampling_params + assert isinstance(params, SamplingParams) + if not self.output_produced: + self.output_produced = True + if params._real_n is not None: + # Get the top-n sequences. 
+ n = params._real_n or params.n + seqs = self.assembled_seq_group.seqs + sorting_key = lambda seq: seq.get_cumulative_logprob() + sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) + top_n_seqs = sorted_seqs[:n] + self.assembled_seq_group.seqs = top_n_seqs + return self.assembled_seq_group + if self.output_produced: + return None + return None diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 53219042afeaa..0ed9535074106 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -84,14 +84,14 @@ def score_proposals( if not non_spec_indices: # All sequence groups in batch have spec decoding enabled - contracted = self._contract_batch_all_spec( + return self._contract_batch_all_spec( target_sampler_output=target_sampler_output, proposals=proposals, num_scoring_tokens=num_scoring_tokens, ) else: # Batch has a mix of spec decode enabled and disabled seq groups - contracted = self._contract_batch( + return self._contract_batch( execute_model_req.seq_group_metadata_list, target_sampler_output=target_sampler_output, proposals=proposals, @@ -101,14 +101,6 @@ def score_proposals( k=execute_model_req.num_lookahead_slots, ) - all_tokens, all_probs, spec_logprobs, all_hidden_states = contracted - return SpeculativeScores( - probs=all_probs, - token_ids=all_tokens, - logprobs=spec_logprobs, - hidden_states=all_hidden_states, - ) - def _expand_batch( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -145,13 +137,57 @@ def _expand_batch( return (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) + def _contract_non_speculative( + self, scores: SpeculativeScores, + seq_group_metadata_list: List[SequenceGroupMetadata], + non_spec_indices: List[int], non_spec_outputs: SpeculativeScores, + has_prompt_log: bool) -> SpeculativeScores: + """ + Augment input `scores` with non-speculative requests outputs. + This includes decode requests with speculation turned off, as well + as prefill requests when `enable_chunked_prefill` is set. + For the latter, prefills are further separated into terminal and + non-terminal chunks (from which no token is sampled). + """ + if not non_spec_indices: + return scores + + if has_prompt_log: + # When prompt_logprobs is enabled, prefills yield output token + # (and respective prob) in the last entry (prompt|out): + # [.|.|.|prefill0_out|.|prefill1_out|decode0_out|..]. + # With chunked prefill, non-terminal chunks have -1 on each + # position: they're still picked, but they're discarded later. + seq_meta = seq_group_metadata_list + nospec_sizes = torch.tensor([ + seq_meta[i].token_chunk_size if seq_meta[i].is_prompt else 1 + for i in non_spec_indices + ]) + nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1) + else: + # In this case only sampled tokens are returned, select all. 
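The `torch.cumsum(...).add_(-1)` trick in `_contract_non_speculative` above picks out the index of each request's last (sampled) token from the flattened sampler output. A small numeric sketch, with made-up chunk sizes:

```python
import torch

# Flattened output covers [prefill_A (4 tokens) | prefill_B (3 tokens) | decode_C (1)].
nospec_sizes = torch.tensor([4, 3, 1])

# Running end offsets minus one -> last (sampled) position of each request.
nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1)
print(nospec_sampled_token_idxs)  # tensor([3, 6, 7])

# Those indices then select exactly one sampled token id per request
# from the flattened tensor, e.g. token_ids[nospec_sampled_token_idxs].
token_ids = torch.arange(100, 108)
print(token_ids[nospec_sampled_token_idxs])  # tensor([103, 106, 107])
```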
+ nospec_sampled_token_idxs = list( + range(len(non_spec_outputs.token_ids))) + + scores.token_ids[non_spec_indices, :1] = \ + non_spec_outputs.token_ids[nospec_sampled_token_idxs].unsqueeze(1) + scores.probs[non_spec_indices, :1, :] = \ + non_spec_outputs.probs[nospec_sampled_token_idxs].unsqueeze(1) + scores.logprobs[non_spec_indices, :1, :] = \ + non_spec_outputs.logprobs[nospec_sampled_token_idxs].unsqueeze(1) + if scores.hidden_states is not None: + assert non_spec_outputs.hidden_states is not None + scores.hidden_states[non_spec_indices, :1, :] = \ + non_spec_outputs.hidden_states[nospec_sampled_token_idxs].unsqueeze(1) + return scores + def _contract_batch( - self, contracted_seq_group_metadata_list: List[SequenceGroupMetadata], - target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, - num_scoring_tokens: int, non_spec_indices: List[int], - spec_indices: List[int], k: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + self, + contracted_seq_group_metadata_list: List[SequenceGroupMetadata], + target_sampler_output: SamplerOutput, + proposals: SpeculativeProposals, num_scoring_tokens: int, + non_spec_indices: List[int], spec_indices: List[int], + k: int) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -204,23 +240,28 @@ def _contract_batch( else: all_hidden_states = None - # Rule out prefills that produce no tokens. - non_spec_indices = [ - idx for idx in non_spec_indices - if contracted_seq_group_metadata_list[idx].do_sample - ] - if len(non_spec_indices): - all_tokens[non_spec_indices, :1] = \ - non_spec_target_token_ids.unsqueeze(1) - all_probs[non_spec_indices, :1, :] = \ - non_spec_target_probs.unsqueeze(1) - all_logprobs[non_spec_indices, :1, :] = \ - non_spec_target_logprobs.unsqueeze(1) - if all_hidden_states is not None: - assert non_spec_target_hidden_states is not None - all_hidden_states[non_spec_indices, :1, :] = \ - non_spec_target_hidden_states.unsqueeze(1) - + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in contracted_seq_group_metadata_list) + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is. + prompt_logprobs = None + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + elif not has_prompt_log: + # When prompt logprobs are not to be returned, + # we can ignore non-terminal chunks (no out token). + non_spec_indices = [ + idx for idx in non_spec_indices + if contracted_seq_group_metadata_list[idx].do_sample + ] + + # "Contract" speculative. 
if spec_indices: all_tokens[spec_indices] = target_token_ids all_probs[spec_indices] = target_probs @@ -228,15 +269,28 @@ def _contract_batch( if all_hidden_states is not None: all_hidden_states[spec_indices] = target_hidden_states - return all_tokens, all_probs, all_logprobs, all_hidden_states + spec_scores = SpeculativeScores(probs=all_probs, + token_ids=all_tokens, + logprobs=all_logprobs, + hidden_states=all_hidden_states, + prompt_logprobs=prompt_logprobs) + + non_spec_outputs = SpeculativeScores( + probs=non_spec_target_probs, + token_ids=non_spec_target_token_ids, + logprobs=non_spec_target_logprobs, + hidden_states=non_spec_target_hidden_states) + # Contract remaining nonspec entries based on non_spec_indices, if any. + return self._contract_non_speculative( + spec_scores, contracted_seq_group_metadata_list, non_spec_indices, + non_spec_outputs, has_prompt_log) def _contract_batch_all_spec( self, target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, num_scoring_tokens: int, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: + ) -> SpeculativeScores: """Contract the expanded batch back into its original size. This maps the scores of speculative tokens back to their original sequences. @@ -284,8 +338,11 @@ def _contract_batch_all_spec( target_hidden_states = target_hidden_states.reshape( *target_token_ids.shape, target_hidden_states.shape[-1]) - return (target_token_ids, target_probs, target_logprobs, - target_hidden_states) + return SpeculativeScores(probs=target_probs, + token_ids=target_token_ids, + logprobs=target_logprobs, + hidden_states=target_hidden_states, + prompt_logprobs=None) def _create_scoring_model_input( self, diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py index a4fe0f13c8db1..c39e98b6cca12 100644 --- a/vllm/spec_decode/interfaces.py +++ b/vllm/spec_decode/interfaces.py @@ -1,10 +1,10 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional, Set, Union +from typing import List, Optional, Set, Union import torch -from vllm.sequence import ExecuteModelRequest +from vllm.sequence import ExecuteModelRequest, PromptLogprobs from vllm.worker.worker_base import WorkerBase @@ -54,6 +54,10 @@ class SpeculativeScores: # Optional last hidden states from the scoring model. hidden_states: Optional[torch.Tensor] = None + # Scoring model may also return logprobs for prompt tokens + # for each request, when chunked prefill is enabled. + prompt_logprobs: Optional[List[PromptLogprobs]] = None + def __repr__(self): return (f"SpeculativeScores(" f"probs={self.probs.shape}, " diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index cbf793e2043e3..3aea2eabb4144 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -72,9 +72,15 @@ def score_proposals( target_token_ids = target_sampler_output.sampled_token_ids target_probs = target_sampler_output.sampled_token_probs target_logprobs = target_sampler_output.logprobs + prompt_logprobs = None + # If all requests have the same number of query tokens, we can avoid # the for loop to build output for better performance. if min(all_proposal_lengths) == k: + # Regular decodes only. 
+ assert all(not sg.is_prompt + for sg in target_seq_group_metadata_list + if sg.is_prompt) bs, _ = proposals.proposal_token_ids.shape all_tokens = target_token_ids.reshape(bs, k + 1) all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) @@ -88,19 +94,56 @@ def score_proposals( all_logprobs = target_logprobs.new_full(size=all_probs.shape, fill_value=-float("inf")) target_token_ids = target_token_ids.flatten() - start_loc = 0 - for i, (proposed_len, seq_meta) in enumerate( - zip(all_proposal_lengths, target_seq_group_metadata_list)): + + # When prompt logprobs is enabled, lens of returned tensors go from + # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. + # We adjust stride accordingly to get the generated tokens and + # their probs, but pass on prompt_logprobs as is, since it may be + # that n_prompts >> K. + has_prompt_log = any((sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + for sg in target_seq_group_metadata_list) + # TODO (NickLucche) we should surface `disable_logprobs` as to not + # break abstraction to get its value. + if (not self._scorer_worker.model_runner.disable_logprobs\ + and has_prompt_log): + prompt_logprobs = [ + o.prompt_logprobs for o in target_sampler_output.outputs + ] + + # Split loop into prefill|decode for readability. + start_loc, i = 0, 0 + while i < len(target_seq_group_metadata_list + ) and target_seq_group_metadata_list[i].is_prompt: + seq_meta = target_seq_group_metadata_list[i] + end_loc = start_loc + if has_prompt_log: + end_loc += seq_meta.token_chunk_size + elif seq_meta.do_sample: + end_loc += 1 + # Skip chunks with no output tokens. if seq_meta.do_sample: - output_len = proposed_len + 1 - end_loc = start_loc + output_len - all_tokens[ - i, :output_len] = target_token_ids[start_loc:end_loc] - all_probs[i, :output_len] = target_probs[start_loc:end_loc] - all_logprobs[ - i, :output_len] = target_logprobs[start_loc:end_loc] - start_loc = end_loc + # Get sampled token (last position in chunk) and its prob. + all_tokens[i, 0] = target_token_ids[end_loc - 1] + all_probs[i, 0] = target_probs[end_loc - 1] + all_logprobs[i, 0] = target_logprobs[end_loc - 1] + + i += 1 + start_loc = end_loc + # Decodes. 
+ while i < len(target_seq_group_metadata_list): + proposed_len, seq_meta = all_proposal_lengths[ + i], target_seq_group_metadata_list[i] + output_len = proposed_len + 1 + end_loc = start_loc + output_len + all_tokens[ + i, :output_len] = target_token_ids[start_loc:end_loc] + all_probs[i, :output_len] = target_probs[start_loc:end_loc] + all_logprobs[ + i, :output_len] = target_logprobs[start_loc:end_loc] + start_loc = end_loc + i += 1 hidden_states = None if target_sampler_output.hidden_states is not None: @@ -110,4 +153,5 @@ def score_proposals( return SpeculativeScores(probs=all_probs, token_ids=all_tokens, logprobs=all_logprobs, - hidden_states=hidden_states) + hidden_states=hidden_states, + prompt_logprobs=prompt_logprobs) diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 6ee1ef6fb93bd..4e4acf1e1da60 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -2,6 +2,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.model_executor.layers.sampler import SamplerOutput from vllm.platforms import current_platform @@ -28,6 +29,10 @@ raise ValueError(f"Unsupported platform: {current_platform}") +class _DummyModel(nn.Module): + pass + + class NGramWorker(NonLLMProposerWorkerBase): """NGramWorker provides a light drafter without need for model. @@ -54,7 +59,6 @@ def set_ngram_window_size(self, ngram_prompt_lookup_min: int, def init_device(self): self.device = torch.device(f"{self.device_type}:{self.local_rank}") - self.load_model = lambda *args, **kwargs: None # Current NGramWorker only supports Top1Proposer self._proposer = Top1Proposer( @@ -63,6 +67,12 @@ def init_device(self): vocab_size=self.vocab_size, ) + def load_model(self) -> None: + pass # Dummy + + def get_model(self) -> nn.Module: + return _DummyModel() + def sampler_output( self, execute_model_req: ExecuteModelRequest, diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py index 8896b7dbc6b8a..c6ff5e52f9388 100644 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ b/vllm/spec_decode/smaller_tp_proposer_worker.py @@ -1,6 +1,7 @@ from typing import List, Optional, Set, Tuple import torch +import torch.nn as nn from vllm.distributed.parallel_state import (get_tp_group, init_model_parallel_group, @@ -15,6 +16,10 @@ logger = init_logger(__name__) +class _DummyModel(nn.Module): + pass + + class SmallerTpProposerWorker(ProposerWorkerBase): """Class which allows a speculative draft model to run with smaller tensor parallel degree than target model. 
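The `_DummyModel` additions above give proposers that have no real network (n-gram lookup, inactive smaller-TP ranks) something valid to return from `get_model()`. A minimal sketch of this null-object pattern; `NGramLikeWorker` is a hypothetical caller, not the real worker class:

```python
import torch.nn as nn

class _DummyModel(nn.Module):
    """Placeholder returned by workers that do not own a real model."""

class NGramLikeWorker:
    """Hypothetical stand-in for a drafter with no neural network."""

    def load_model(self) -> None:
        pass  # nothing to load for a statistics-based drafter

    def get_model(self) -> nn.Module:
        # Callers that iterate parameters() or move the model across devices
        # get a harmless empty module instead of an AttributeError.
        return _DummyModel()

worker = NGramLikeWorker()
print(sum(p.numel() for p in worker.get_model().parameters()))  # 0
```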
@@ -139,6 +144,13 @@ def get_spec_proposals( return self._worker.get_spec_proposals( execute_model_req, seq_ids_with_bonus_token_in_last_step) + def get_model(self) -> nn.Module: + if self._is_dummy: + return _DummyModel() + + with self._patch_tensor_parallel_group(): + return self._worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 64f59ab3bf4ff..dfe5dee6aa8e8 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Type import torch +import torch.nn as nn from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig from vllm.distributed.communication_op import broadcast_tensor_dict @@ -408,6 +409,9 @@ def initialize_cache(self, num_gpu_blocks: int, self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.scorer_worker.get_model() + @torch.inference_mode() def execute_model( self, @@ -511,8 +515,8 @@ def _should_disable_all_speculation( self, execute_model_req: ExecuteModelRequest) -> bool: # When the batch size is too large, disable speculative decoding # to stop trading off throughput for latency. - return (execute_model_req.running_queue_size >= - self.disable_by_batch_size) + return (execute_model_req.running_queue_size + >= self.disable_by_batch_size) def _maybe_disable_speculative_tokens( self, disable_all_speculation: bool, @@ -564,50 +568,57 @@ def _serialize_sampler_output_no_logprobs( (seq_id, seq_data) for sg in \ execute_model_req.seq_group_metadata_list \ for seq_id, seq_data in sg.seq_data.items() - if sg.do_sample # ignore empty token sequences ] completion_seq_group_output_list: List[ CompletionSequenceGroupOutput] = [] output_index = 0 # Make sure the non-terminal prefill chunks are still aligned with # their own empty output. - for seq_group_meta in execute_model_req.seq_group_metadata_list: - # Since we can get chunks here, we dont always have a sampled token - # (only on last chunk) but we still have to provide an output. - if not seq_group_meta.do_sample: - completion_seq_group_output_list.append( - CompletionSequenceGroupOutput(samples=[], - prompt_logprobs=None)) - else: - # Sequence with output. - seq_id, seq_data = seq_data_entries[output_index] - needs_prompt_logprobs = seq_output_prompt_logprobs[ - output_index] - if needs_prompt_logprobs: - prompt_token_ids = seq_data.get_prompt_token_ids() - prompt_logprobs = [ - create_logprobs_output( - token_id=p_token_id, - token_id_logprob_rank=-1, - token_id_logprob=0.0, - topk_token_ids=[], - topk_logprobs=[], - ) - # no prompt logprobs for the first token - for p_token_id in prompt_token_ids[1:] - ] - else: - prompt_logprobs = None - completion_seq_group_output_list.append( - create_sequence_group_output( - token_id=sampled_token_ids_list[output_index][0], + for idx, seq_group_meta in enumerate( + execute_model_req.seq_group_metadata_list): + needs_prompt_logprobs = seq_output_prompt_logprobs[idx] + seq_id, seq_data = seq_data_entries[idx] + if needs_prompt_logprobs: + prompt_token_ids = seq_data.get_prompt_token_ids() + + # Some of these sequences may belong to non-terminal chunks, + # which may still have to report logprobs for prompts. 
+ start = 1 if seq_data._num_computed_tokens == 0 \ + else seq_data._num_computed_tokens + end = (seq_data._num_computed_tokens + \ + seq_group_meta.token_chunk_size) + prompt_token_ids = prompt_token_ids[start:end] + prompt_logprobs = [ + create_logprobs_output( + token_id=p_token_id, token_id_logprob_rank=-1, token_id_logprob=0.0, - seq_id=seq_id, topk_token_ids=[], topk_logprobs=[], - prompt_logprobs=prompt_logprobs)) - output_index += 1 + ) for p_token_id in prompt_token_ids + ] + else: + prompt_logprobs = None + + # Since we can get chunks here, we dont always have a sampled token + # (only on last chunk) but we still have to provide an output. + if not seq_group_meta.do_sample: + completion_seq_group_output_list.append( + CompletionSequenceGroupOutput( + samples=[], prompt_logprobs=prompt_logprobs)) + continue + + # Sequence with output. + completion_seq_group_output_list.append( + create_sequence_group_output( + token_id=sampled_token_ids_list[output_index][0], + token_id_logprob_rank=-1, + token_id_logprob=0.0, + seq_id=seq_id, + topk_token_ids=[], + topk_logprobs=[], + prompt_logprobs=prompt_logprobs)) + output_index += 1 return [SamplerOutput(outputs=completion_seq_group_output_list)] @@ -625,24 +636,27 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, assert len(sampler_output) == 1 sampler_output = sampler_output[0] - # Store hidden states from target model execution. + # Store hidden states from target model execution, BxD. hidden_states = sampler_output.hidden_states if hidden_states is not None: - # remove hidden_states for prompt tokens - # TODO Enable `return_hidden_states`: prefill chunks hidden states - # are pruned by the logits processor. Also, they should be arranged - # back into full-prefill latent. Address it to enable MLPSpeculator. - if any(seq.is_prompt - for seq in execute_model_req.seq_group_metadata_list): + # Only decodes and prefill terminal chunks need a hidden state. + seq_group_meta_with_hidden = [ + sg for sg in execute_model_req.seq_group_metadata_list + if sg.do_sample + ] + if any(seq.is_prompt for seq in seq_group_meta_with_hidden): + # Drop hidden_states with no prediction (eg non-terminal chunks) hidden_states = hidden_states[ torch.where(sampler_output.sampled_token_ids - VLLM_INVALID_TOKEN_ID)[0]] - if self.previous_hidden_states is None: + if self.previous_hidden_states is None and len( + seq_group_meta_with_hidden): self.previous_hidden_states = HiddenStates( - hidden_states, execute_model_req.seq_group_metadata_list) - else: - self.previous_hidden_states.update( - hidden_states, execute_model_req.seq_group_metadata_list) + hidden_states, seq_group_meta_with_hidden) + elif self.previous_hidden_states and len( + seq_group_meta_with_hidden): + self.previous_hidden_states.update(hidden_states, + seq_group_meta_with_hidden) if not skip_proposer: # We prepare the prefill hidden states here so that there no @@ -753,13 +767,13 @@ def _run_speculative_decoding_step( ] if len(non_spec_indices): all_hidden_states = proposal_scores.hidden_states - # TODO fix `return_hidden_states`, same as in `_run_no_spec` if all_hidden_states is not None: prefill_hidden_states = all_hidden_states[non_spec_indices] execute_model_req.previous_hidden_states = \ prepare_prefill_hidden_states(prefill_hidden_states) # Sync proposer KV cache for prefills. prefill_req = execute_model_req.clone(non_spec_seqs) + # TODO avoid sampling here? 
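The chunk-window arithmetic in `_serialize_sampler_output_no_logprobs` above is easy to misread inline. A small sketch with made-up numbers showing which prompt positions receive a placeholder logprob per chunk (the very first prompt token never has one):

```python
def prompt_logprob_window(num_computed_tokens: int,
                          token_chunk_size: int) -> slice:
    # No logprob exists for the first prompt token, so the first chunk
    # starts at position 1; later chunks start where the previous one ended.
    start = 1 if num_computed_tokens == 0 else num_computed_tokens
    end = num_computed_tokens + token_chunk_size
    return slice(start, end)

prompt_token_ids = list(range(1000, 1008))  # 8-token prompt, chunked 4 + 4
print(prompt_token_ids[prompt_logprob_window(0, 4)])  # [1001, 1002, 1003]
print(prompt_token_ids[prompt_logprob_window(4, 4)])  # [1004, 1005, 1006, 1007]
# Together the chunks cover 7 positions = len(prompt) - 1, as expected.
```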
self.proposer_worker.execute_model(prefill_req) with Timer() as verification_timer: @@ -775,6 +789,8 @@ def _run_speculative_decoding_step( execute_model_req.seq_group_metadata_list, accepted_token_ids, target_logprobs=target_logprobs, + prompt_logprobs=proposal_scores.prompt_logprobs + if not self._disable_logprobs else None, k=execute_model_req.num_lookahead_slots, stage_times=stage_times) @@ -846,19 +862,32 @@ def _verify_tokens( # metadata. accepted_token_ids[original_indices] = accepted_token_ids.clone() + # B x K+1 x D hidden_states = proposal_scores.hidden_states if hidden_states is not None: + # Only get terminal hidden states for next step + terminal_metadata = [ + sg for sg in seq_group_metadata_list if sg.do_sample + ] + # Contract hidden states based on accepted tokens hs_size = hidden_states.shape[-1] - accepted_index = accepted_token_ids + 1 # Convert -1 to 0 - accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) - index = accepted_index[:, None, None].expand(-1, 1, hs_size) + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b + # Drop non-terminal prefill chunks hidden states. + hidden_states = hidden_states[accepted_index != + VLLM_INVALID_TOKEN_ID] + accepted_index = accepted_index[accepted_index != + VLLM_INVALID_TOKEN_ID] + assert len(accepted_index) == hidden_states.shape[0] == len( + terminal_metadata) + index = accepted_index[:, None, None].expand(-1, 1, + hs_size) # b x 1 x d second_last_token_hidden_states = hidden_states[:, -2] # b x d hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d # Store hidden states from target model for subsequent decode step self.previous_hidden_states = HiddenStates( - hidden_states, seq_group_metadata_list, + hidden_states, terminal_metadata, second_last_token_hidden_states) return accepted_token_ids, logprobs @@ -867,6 +896,8 @@ def _create_output_sampler_list( seq_group_metadata_list: List[SequenceGroupMetadata], accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] target_logprobs: torch.Tensor, # shape: [batch_size, k+1, vocab_size] + prompt_logprobs: Optional[ + torch.Tensor], # shape: [nprompt_tokens, vocab_size] k: int, stage_times: Tuple[float, float, float], ) -> List[SamplerOutput]: @@ -910,15 +941,89 @@ def _create_output_sampler_list( # Construct the output on a per-step, per-sequence basis. # Non-terminal prefill chunks will end up here as rows with just -1s - # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] + # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while + # terminal chunks will only have one generated token at time 0. sampler_output_list: List[SamplerOutput] = [] + + # Prefills are not multi-step (return at most 1 token), in order to + # avoid padding or repetition to fit decodes, we separate them. + for i, sg in enumerate(seq_group_metadata_list): + if not sg.is_prompt: + # Requests are ordered as prefills|decodes=>no more prefills. + break + num_logprobs = num_logprobs_per_seq[i] + seq_kwargs = dict(token_id=-1, + token_id_logprob_rank=0, + token_id_logprob=-float('inf'), + topk_token_ids=[-1] * num_logprobs, + topk_logprobs=[-float('inf')] * num_logprobs, + seq_id=seq_ids[i]) + # Terminal chunk, has token. 
+ if sg.do_sample: + seq_kwargs.update( + dict( + token_id=accepted_token_ids[i][0].item(), + token_id_logprob_rank=accepted_token_id_ranks_by_step[ + 0][i], + token_id_logprob=accepted_token_id_logprobs_by_step[0] + [i], + topk_token_ids=topk_indices_by_step[0][i] + [:num_logprobs], + # output only so step is 0 + topk_logprobs=topk_logprobs_by_step[0][i] + [:num_logprobs], + )) + needs_plogs = (sg.sampling_params.prompt_logprobs + and sg.sampling_params.prompt_logprobs > 0) + plogs = None + if prompt_logprobs is not None: + # Even non-terminal prompt chunks can have logprobs here. + plogs = prompt_logprobs[i] + elif needs_plogs: + # Prompt logprobs are requested but `_disable_logprobs` is set. + seq_data = next(iter(sg.seq_data.values())) + # Get only the tokens in this chunk! + prompt_token_ids = seq_data.get_prompt_token_ids() + prompt_token_ids = prompt_token_ids[ + seq_data. + _num_computed_tokens:seq_data._num_computed_tokens + + sg.token_chunk_size] + + is_first_chunk = seq_data._num_computed_tokens == 0 + # There's no prob generated for the first token in a sequence. + if is_first_chunk: + prompt_token_ids = prompt_token_ids[1:] + plogs = [ + create_logprobs_output( + token_id=p_token_id, + token_id_logprob_rank=-1, + token_id_logprob=0.0, + topk_token_ids=[], + topk_logprobs=[], + ) for p_token_id in prompt_token_ids + ] + seq_kwargs.update(dict(prompt_logprobs=plogs)) + + sampler_output_list.append( + SamplerOutput( + outputs=[create_sequence_group_output( + **seq_kwargs)])) # type: ignore + + # Decodes, create one SamplerOutput per-step (at most K+1). for step_index in range(num_steps): - if all(token_id == -1 - for token_id in accepted_token_ids_by_step[step_index]): + if all(token_id == -1 for sg, token_id in zip( + seq_group_metadata_list, + accepted_token_ids_by_step[step_index]) + if not sg.is_prompt): break step_output_token_ids: List[CompletionSequenceGroupOutput] = [] for sequence_index in range(batch_size): + seq_meta = seq_group_metadata_list[sequence_index] + # Prompts already processed above. + if seq_meta.is_prompt: + continue + # Each sequence may have a different num_logprobs; retrieve it. num_logprobs = num_logprobs_per_seq[sequence_index] step_output_token_ids.append( @@ -953,6 +1058,8 @@ def _create_output_sampler_list( # This is periodic because the rejection sampler emits metrics # periodically. self._maybe_log_stage_times(*stage_times) + # First `n_prefills` entries will contain prefills SamplerOutput when + # chunked prefill is enabled, the rest is decodes in multi-step format. 
return sampler_output_list def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float, diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index 5a7999a258b2d..6bf7587cdda19 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -104,11 +104,11 @@ def get_spec_proposals( sampler_transposed=transposed, ) - proposals = SpeculativeProposals( - proposal_token_ids=proposal_tokens, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens, - no_proposals=maybe_sampler_output is None) + proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens, + proposal_probs=proposal_probs, + proposal_lens=proposal_lens, + no_proposals=maybe_sampler_output + is None) return proposals def _split_by_proposal_len( diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py index da8706658d09a..c88820ab27b69 100644 --- a/vllm/spec_decode/util.py +++ b/vllm/spec_decode/util.py @@ -40,13 +40,15 @@ def get_sampled_token_logprobs( """ num_steps, batch_size, vocab_size = logprob_tensor.shape - selected_logprobs = logprob_tensor[torch.arange(num_steps).unsqueeze(1), - torch.arange(batch_size), - sampled_token_ids, ] + selected_logprobs = logprob_tensor[ + torch.arange(num_steps).unsqueeze(1), + torch.arange(batch_size), + sampled_token_ids, + ] expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( -1, -1, vocab_size) - sampled_token_ids_ranks = (logprob_tensor > - expanded_selected_logprobs).sum(-1).add_(1) + sampled_token_ids_ranks = (logprob_tensor + > expanded_selected_logprobs).sum(-1).add_(1) return sampled_token_ids_ranks, selected_logprobs diff --git a/vllm/tracing.py b/vllm/tracing.py index 50068d8cf9c25..72a3f85118d36 100644 --- a/vllm/tracing.py +++ b/vllm/tracing.py @@ -16,7 +16,6 @@ OTEL_EXPORTER_OTLP_TRACES_PROTOCOL) from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor - from opentelemetry.semconv_ai import SpanAttributes as BaseSpanAttributes from opentelemetry.trace import SpanKind, Tracer, set_tracer_provider from opentelemetry.trace.propagation.tracecontext import ( TraceContextTextMapPropagator) @@ -92,21 +91,30 @@ def extract_trace_headers(headers: Mapping[str, str]) -> Mapping[str, str]: return {h: headers[h] for h in TRACE_HEADERS if h in headers} -class SpanAttributes(BaseSpanAttributes): - # The following span attribute names are added here because they are missing - # from the Semantic Conventions for LLM. 
- LLM_REQUEST_ID = "gen_ai.request.id" - LLM_REQUEST_N = "gen_ai.request.n" - LLM_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" - LLM_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" - LLM_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" - LLM_LATENCY_E2E = "gen_ai.latency.e2e" - LLM_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler" +class SpanAttributes: + # Attribute names copied from here to avoid version conflicts: + # https://github.com/open-telemetry/semantic-conventions/blob/main/docs/gen-ai/gen-ai-spans.md + GEN_AI_USAGE_COMPLETION_TOKENS = "gen_ai.usage.completion_tokens" + GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens" + GEN_AI_REQUEST_MAX_TOKENS = "gen_ai.request.max_tokens" + GEN_AI_REQUEST_TOP_P = "gen_ai.request.top_p" + GEN_AI_REQUEST_TEMPERATURE = "gen_ai.request.temperature" + GEN_AI_RESPONSE_MODEL = "gen_ai.response.model" + # Attribute names added until they are added to the semantic conventions: + GEN_AI_REQUEST_ID = "gen_ai.request.id" + GEN_AI_REQUEST_N = "gen_ai.request.n" + GEN_AI_USAGE_NUM_SEQUENCES = "gen_ai.usage.num_sequences" + GEN_AI_LATENCY_TIME_IN_QUEUE = "gen_ai.latency.time_in_queue" + GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN = "gen_ai.latency.time_to_first_token" + GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e" + GEN_AI_LATENCY_TIME_IN_SCHEDULER = "gen_ai.latency.time_in_scheduler" # Time taken in the forward pass for this across all workers - LLM_LATENCY_TIME_IN_MODEL_FORWARD = "gen_ai.latency.time_in_model_forward" + GEN_AI_LATENCY_TIME_IN_MODEL_FORWARD = ( + "gen_ai.latency.time_in_model_forward") # Time taken in the model execute function. This will include model # forward, block/sync across workers, cpu-gpu sync time and sampling time. - LLM_LATENCY_TIME_IN_MODEL_EXECUTE = "gen_ai.latency.time_in_model_execute" + GEN_AI_LATENCY_TIME_IN_MODEL_EXECUTE = ( + "gen_ai.latency.time_in_model_execute") def contains_trace_headers(headers: Mapping[str, str]) -> bool: diff --git a/vllm/transformers_utils/configs/aria.py b/vllm/transformers_utils/configs/aria.py deleted file mode 100644 index d253da0d96a34..0000000000000 --- a/vllm/transformers_utils/configs/aria.py +++ /dev/null @@ -1,47 +0,0 @@ -from transformers.models.idefics2.configuration_idefics2 import ( - Idefics2VisionConfig) -from transformers.models.llama.configuration_llama import LlamaConfig - - -class AriaVisionConfig(Idefics2VisionConfig): - model_type = "aria_vision_model" - - -class AriaMoELMConfig(LlamaConfig): - """ - Configuration class for AriaMoE language model. - - This class extends the LlamaConfig to include additional parameters specific - to the Mixture of Experts (MoE) architecture. - """ - - model_type = "aria_moe_lm" - - def __init__( - self, - moe_intermediate_size: int = 4096, - moe_num_experts: int = 8, - moe_topk: int = 2, - moe_num_shared_experts: int = 2, - **kwargs, - ): - """ - Initialize the AriaMoELMConfig. - - Args: - moe_intermediate_size (int): The intermediate size for MoE layers. - Default is 4096. - moe_num_experts (int): The number of experts in the MoE layer. - Default is 8. - moe_topk (int): The number of top experts to route to for each - token. Default is 2. - moe_num_shared_experts (int): The number of shared experts. Default - is 2. - **kwargs: Additional keyword arguments to be passed to the parent - LlamaConfig. 
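Since `SpanAttributes` in the vllm/tracing.py hunk above is now a plain namespace of string constants rather than a subclass of the semconv package's class, call sites keep passing the same strings to `Span.set_attribute`. A hedged sketch of such a call site (the span name and values are illustrative):

```python
from opentelemetry import trace

class SpanAttributes:  # trimmed copy of the constants introduced above
    GEN_AI_REQUEST_ID = "gen_ai.request.id"
    GEN_AI_USAGE_PROMPT_TOKENS = "gen_ai.usage.prompt_tokens"
    GEN_AI_LATENCY_E2E = "gen_ai.latency.e2e"

tracer = trace.get_tracer(__name__)
with tracer.start_as_current_span("llm_request") as span:
    # The attribute keys are just strings, so no particular version of the
    # opentelemetry semconv package is required at runtime.
    span.set_attribute(SpanAttributes.GEN_AI_REQUEST_ID, "req-42")
    span.set_attribute(SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS, 17)
    span.set_attribute(SpanAttributes.GEN_AI_LATENCY_E2E, 0.73)
```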
- """ - super().__init__(**kwargs) - self.moe_intermediate_size = moe_intermediate_size - self.moe_num_experts = moe_num_experts - self.moe_topk = moe_topk - self.moe_num_shared_experts = moe_num_shared_experts diff --git a/vllm/transformers_utils/configs/nemotron.py b/vllm/transformers_utils/configs/nemotron.py index 93fec667d1cf3..1edf36329d83b 100644 --- a/vllm/transformers_utils/configs/nemotron.py +++ b/vllm/transformers_utils/configs/nemotron.py @@ -182,8 +182,8 @@ def _rope_scaling_validation(self): if self.rope_scaling is None: return - if not isinstance(self.rope_scaling, - dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_scaling, dict) or len( + self.rope_scaling) != 2: raise ValueError( "`rope_scaling` must be a dictionary with two fields, " f"`type` and `factor`, got {self.rope_scaling}") diff --git a/vllm/transformers_utils/processors/__init__.py b/vllm/transformers_utils/processors/__init__.py new file mode 100644 index 0000000000000..9c71b8cada32e --- /dev/null +++ b/vllm/transformers_utils/processors/__init__.py @@ -0,0 +1,4 @@ +from vllm.transformers_utils.processors.deepseek_vl2 import ( + DeepseekVLV2Processor) + +__all__ = ["DeepseekVLV2Processor"] diff --git a/vllm/transformers_utils/processors/deepseek_vl2.py b/vllm/transformers_utils/processors/deepseek_vl2.py new file mode 100644 index 0000000000000..27cdf6bc22d0e --- /dev/null +++ b/vllm/transformers_utils/processors/deepseek_vl2.py @@ -0,0 +1,361 @@ +# yapf: disable +# ruff: noqa: E501 +# coding=utf-8 +# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/ff23960c5cf9e6874b44be38af930cfb0ccbb620/deepseek_vl2/models/processing_deepseek_vl_v2.py +# Copyright (c) 2023-2024 DeepSeek. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of +# this software and associated documentation files (the "Software"), to deal in +# the Software without restriction, including without limitation the rights to +# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +# the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +import math +from typing import List, Tuple + +import torch +import torchvision.transforms as T +from PIL import Image, ImageOps +from transformers import AutoProcessor, BatchFeature, LlamaTokenizerFast +from transformers.processing_utils import ProcessorMixin + + +class ImageTransform: + + def __init__(self, + mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), + std: Tuple[float, float, float] = (0.5, 0.5, 0.5), + normalize: bool = True): + self.mean = mean + self.std = std + self.normalize = normalize + + transform_pipelines = [T.ToTensor()] + + if normalize: + transform_pipelines.append(T.Normalize(mean, std)) + + self.transform = T.Compose(transform_pipelines) + + def __call__(self, pil_img: Image.Image): + x = self.transform(pil_img) + return x + + +class DeepseekVLV2Processor(ProcessorMixin): + tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast") + attributes = ["tokenizer"] + + def __init__( + self, + tokenizer: LlamaTokenizerFast, + candidate_resolutions: Tuple[Tuple[int, int]], + patch_size: int, + downsample_ratio: int, + image_mean: Tuple[float, float, float] = (0.5, 0.5, 0.5), + image_std: Tuple[float, float, float] = (0.5, 0.5, 0.5), + normalize: bool = True, + image_token: str = "", + pad_token: str = "<|▁pad▁|>", + add_special_token: bool = False, + sft_format: str = "deepseek", + mask_prompt: bool = True, + ignore_id: int = -100, + **kwargs, + ): + + self.candidate_resolutions = candidate_resolutions + self.image_size = candidate_resolutions[0][0] + self.patch_size = patch_size + self.image_mean = image_mean + self.image_std = image_std + self.normalize = normalize + self.downsample_ratio = downsample_ratio + + self.image_transform = ImageTransform(mean=image_mean, std=image_std, normalize=normalize) + self.tokenizer = tokenizer + self.tokenizer.padding_side = 'left' # must set this,padding side with make a difference in batch inference + + # add the pad_token as special token to use 'tokenizer.pad_token' and 'tokenizer.pad_token_id' + if tokenizer.pad_token is None: + self.tokenizer.add_special_tokens({'pad_token': pad_token}) + + # add image token + image_token_id = self.tokenizer.vocab.get(image_token) + if image_token_id is None: + special_tokens = [image_token] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + self.image_token_id = self.tokenizer.vocab.get(image_token) + + # add five special tokens for grounding-related tasks + # <|ref|>, <|/ref|>, <|det|>, <|/det|>, <|grounding|> + special_tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>'] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + + # add special tokens for SFT data + special_tokens = ["<|User|>", "<|Assistant|>"] + special_tokens_dict = {"additional_special_tokens": special_tokens} + self.tokenizer.add_special_tokens(special_tokens_dict) + + self.image_token = image_token + self.pad_token = pad_token + self.add_special_token = add_special_token + self.sft_format = sft_format + self.mask_prompt = mask_prompt + self.ignore_id = ignore_id + + super().__init__( + tokenizer, + **kwargs, + ) + + def select_best_resolution(self, image_size): + # used for cropping + original_width, original_height = image_size + best_fit = None + max_effective_resolution = 0 + min_wasted_resolution = float("inf") + + for width, height in self.candidate_resolutions: + scale = min(width / original_width, height / original_height) + downscaled_width, 
downscaled_height = int( + original_width * scale), int(original_height * scale) + effective_resolution = min(downscaled_width * downscaled_height, + original_width * original_height) + wasted_resolution = (width * height) - effective_resolution + + if effective_resolution > max_effective_resolution or ( + effective_resolution == max_effective_resolution + and wasted_resolution < min_wasted_resolution): + max_effective_resolution = effective_resolution + min_wasted_resolution = wasted_resolution + best_fit = (width, height) + + return best_fit + + @property + def bos_id(self): + return self.tokenizer.bos_token_id + + @property + def eos_id(self): + return self.tokenizer.eos_token_id + + @property + def pad_id(self): + return self.tokenizer.pad_token_id + + def encode(self, text: str, bos: bool = True, eos: bool = False): + t = self.tokenizer.encode(text, add_special_tokens=False) + + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + + return t + + def decode(self, t: List[int], **kwargs) -> str: + return self.tokenizer.decode(t, **kwargs) + + def process_one( + self, + prompt: str, + images: List[Image.Image], + inference_mode: bool = True, + **kwargs, + ): + """ + + Args: + prompt (str): the formatted prompt; + conversations (List[Dict]): conversations with a list of messages; + images (List[ImageType]): the list of images; + inference_mode (bool): if True, then remove the last eos token; + system_prompt (str): the system prompt; + **kwargs: + + Returns: + outputs (BaseProcessorOutput): the output of the processor, + - input_ids (torch.LongTensor): [N + image tokens] + - target_ids (torch.LongTensor): [N + image tokens] + - pixel_values (torch.FloatTensor): [n_patches, 3, H, W] + - image_id (int): the id of the image token + - num_image_tokens (List[int]): the number of image tokens + """ + + assert (prompt is not None and images is not None + ), "prompt and images must be used at the same time." 
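`select_best_resolution` above picks, for a given image, the candidate canvas that preserves the most image pixels while wasting the least area. A standalone restatement of that logic with a worked example (the candidate list is made up):

```python
def select_best_resolution(image_size, candidate_resolutions):
    original_width, original_height = image_size
    best_fit, max_effective, min_wasted = None, 0, float("inf")
    for width, height in candidate_resolutions:
        scale = min(width / original_width, height / original_height)
        down_w = int(original_width * scale)
        down_h = int(original_height * scale)
        # Pixels of the image that survive the downscale, capped at the original.
        effective = min(down_w * down_h, original_width * original_height)
        wasted = width * height - effective  # unused canvas area
        if effective > max_effective or (effective == max_effective
                                         and wasted < min_wasted):
            max_effective, min_wasted, best_fit = effective, wasted, (width, height)
    return best_fit

# A wide 800x400 image fits the 768x384 canvas almost perfectly.
print(select_best_resolution((800, 400),
                             [(384, 384), (768, 384), (384, 768)]))  # (768, 384)
```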
+ + sft_format = prompt + tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens = self.tokenize_with_images( + sft_format, images, bos=True, eos=True, cropping=len(images) <= 2) + masked_tokenized_str = [] + for token_index in tokenized_str: + if token_index != self.image_token_id: + masked_tokenized_str.append(token_index) + else: + masked_tokenized_str.append(self.ignore_id) + + assert len(tokenized_str) == len(images_seq_mask) == len(masked_tokenized_str), \ + (f"tokenized_str's length {len(tokenized_str)}, input_ids' length {len(masked_tokenized_str)}, " + f"imags_seq_mask's length {len(images_seq_mask)}, are not equal") + + input_ids = torch.LongTensor(tokenized_str) + target_ids = torch.LongTensor(masked_tokenized_str) + images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool) + + # set input_ids < 0 | input_ids == self.image_token_id as ignore_id + target_ids[(input_ids < 0) | + (input_ids == self.image_token_id)] = self.ignore_id + input_ids[input_ids < 0] = self.pad_id + + if inference_mode: + # 去掉结尾的eos token + assert input_ids[-1] == self.eos_id + input_ids = input_ids[:-1] + target_ids = target_ids[:-1] + images_seq_mask = images_seq_mask[:-1] + + if len(images_list) == 0: + pixel_values = torch.zeros((1, 3, self.image_size, self.image_size)) + images_spatial_crop = torch.zeros((1, 2), dtype=torch.long) + else: + pixel_values = torch.stack(images_list, dim=0) + images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long) + + input_ids = input_ids.unsqueeze(0) + + prepare = BatchFeature( + data=dict( + input_ids=input_ids, + pixel_values=pixel_values, + images_seq_mask=images_seq_mask, + images_spatial_crop=images_spatial_crop, + num_image_tokens=num_image_tokens, + ), + tensor_type="pt", + ) + return prepare + + def __call__( + self, + *, + prompt: str, + images: List[Image.Image], + inference_mode: bool = True, + **kwargs, + ): + """ + + Args: + prompt (str): the formatted prompt; + images (List[ImageType]): the list of images; + inference_mode (bool): if True, then remove the last eos token; + **kwargs: + + Returns: + outputs (BaseProcessorOutput): the output of the processor, + - input_ids (torch.LongTensor): [N + image tokens] + - images (torch.FloatTensor): [n_images, 3, H, W] + - image_id (int): the id of the image token + - num_image_tokens (List[int]): the number of image tokens + """ + + prepare = self.process_one( + prompt=prompt, + images=images, + inference_mode=inference_mode, + ) + + return prepare + + def tokenize_with_images( + self, + conversation: str, + images: List[Image.Image], + bos: bool = True, + eos: bool = True, + cropping: bool = True, + ): + """Tokenize text with tags.""" + assert conversation.count(self.image_token) == len(images) + text_splits = conversation.split(self.image_token) + images_list, images_seq_mask, images_spatial_crop = [], [], [] + num_image_tokens = [] + tokenized_str = [] + for text_sep, image in zip(text_splits, images): + """encode text_sep""" + tokenized_sep = self.encode(text_sep, bos=False, eos=False) + tokenized_str += tokenized_sep + images_seq_mask += [False] * len(tokenized_sep) + + """select best resolution for anyres""" + if cropping: + best_width, best_height = self.select_best_resolution(image.size) + else: + best_width, best_height = self.image_size, self.image_size + + """process the global view""" + global_view = ImageOps.pad(image, (self.image_size, self.image_size), + color=tuple(int(x * 255) for x in self.image_transform.mean)) + 
images_list.append(self.image_transform(global_view)) + + """process the local views""" + local_view = ImageOps.pad(image, (best_width, best_height), + color=tuple(int(x * 255) for x in self.image_transform.mean)) + for i in range(0, best_height, self.image_size): + for j in range(0, best_width, self.image_size): + images_list.append( + self.image_transform(local_view.crop((j, i, j + self.image_size, i + self.image_size)))) + + """record height / width crop num""" + num_width_tiles, num_height_tiles = best_width // self.image_size, best_height // self.image_size + images_spatial_crop.append([num_width_tiles, num_height_tiles]) + + """add image tokens""" + h = w = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio) + # global views tokens h * (w + 1), 1 is for line separator + tokenized_image = [self.image_token_id] * h * (w + 1) + # add a separator between global and local views + tokenized_image += [self.image_token_id] + # local views tokens, (num_height_tiles * h) * (num_width_tiles * w + 1) + tokenized_image += [self.image_token_id] * (num_height_tiles * h) * (num_width_tiles * w + 1) + + tokenized_str += tokenized_image + images_seq_mask += [True] * len(tokenized_image) + num_image_tokens.append(len(tokenized_image)) + + """process the last text split""" + tokenized_sep = self.encode(text_splits[-1], bos=False, eos=False) + tokenized_str += tokenized_sep + images_seq_mask += [False] * len(tokenized_sep) + + """add the bos and eos tokens""" + if bos: + tokenized_str = [self.bos_id] + tokenized_str + images_seq_mask = [False] + images_seq_mask + if eos: + tokenized_str = tokenized_str + [self.eos_id] + images_seq_mask = images_seq_mask + [False] + + assert len(tokenized_str) == len( + images_seq_mask), f"tokenize_with_images func: tokenized_str's length {len(tokenized_str)} is not equal to imags_seq_mask's length {len(images_seq_mask)}" + + return tokenized_str, images_list, images_seq_mask, images_spatial_crop, num_image_tokens + + +AutoProcessor.register("DeepseekVLV2Processor", DeepseekVLV2Processor) diff --git a/vllm/transformers_utils/s3_utils.py b/vllm/transformers_utils/s3_utils.py index 6ae68161bbd97..74a56cbf57ec3 100644 --- a/vllm/transformers_utils/s3_utils.py +++ b/vllm/transformers_utils/s3_utils.py @@ -145,7 +145,8 @@ def pull_files(self, return for file in files: - destination_file = self.dir + file.removeprefix(base_dir) + destination_file = os.path.join(self.dir, + file.removeprefix(base_dir)) local_dir = Path(destination_file).parent os.makedirs(local_dir, exist_ok=True) self.s3.download_file(bucket_name, file, destination_file) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 294262484f2fb..1f1d67fabb243 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -67,9 +67,10 @@ def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: tokenizer_all_special_tokens_extended = ( tokenizer.all_special_tokens_extended) tokenizer_all_special_tokens = set(tokenizer.all_special_tokens) + tokenizer_vocab = tokenizer.get_vocab() tokenizer_len = len(tokenizer) - max_token_id = max(tokenizer.get_vocab().values()) + max_token_id = max(tokenizer_vocab.values()) # Some tokenizers (e.g., QwenTokenizer) have special tokens that # are added and included in the implementation of the vocab_size # property, but not in get_vocab(); if there is an implementation @@ -96,6 +97,9 @@ def all_special_tokens_extended(self): def max_token_id(self): return max_token_id + def 
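The number of image placeholder tokens produced per image by `tokenize_with_images` above follows directly from the tile grid. A small worked sketch of that arithmetic; the 384/14/2 values and the 2x1 tile grid are illustrative only, not taken from any specific checkpoint config:

```python
import math

def num_image_tokens(image_size: int, patch_size: int, downsample_ratio: int,
                     num_width_tiles: int, num_height_tiles: int) -> int:
    h = w = math.ceil((image_size // patch_size) / downsample_ratio)
    global_view = h * (w + 1)       # +1 column acts as a line separator
    separator = 1                   # between global and local views
    local_views = (num_height_tiles * h) * (num_width_tiles * w + 1)
    return global_view + separator + local_views

# e.g. 384px tiles, 14px patches, 2x downsample, a 2 x 1 local crop grid:
# h = w = ceil(27 / 2) = 14 -> 14*15 + 1 + 14*29 = 617 tokens for this image.
print(num_image_tokens(384, 14, 2, num_width_tiles=2, num_height_tiles=1))  # 617
```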
get_vocab(self): + return tokenizer_vocab + def __len__(self): return tokenizer_len diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 17d722e3d88fe..d801cf4e4c7b1 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -18,6 +18,7 @@ Tekkenizer) from vllm.logger import init_logger +from vllm.utils import is_list_of if TYPE_CHECKING: from vllm.entrypoints.chat_utils import ChatCompletionMessageParam @@ -27,7 +28,7 @@ @dataclass class Encoding: - input_ids: List[int] + input_ids: Union[List[int], List[List[int]]] def maybe_serialize_tool_calls(request: ChatCompletionRequest): @@ -223,17 +224,25 @@ def __len__(self) -> int: def __call__( self, - prompt: str, + prompt: Union[str, List[str], List[int]], add_special_tokens: bool = False, truncation: bool = False, max_length: Optional[int] = None, ): - # Mistral Tokenizers should not add special tokens - input_ids = self.encode(prompt) - - if truncation: - input_ids = input_ids[:max_length] - + input_ids: Union[List[int], List[List[int]]] + # For List[str], original prompt text + if is_list_of(prompt, str): + input_ids_: List[List[int]] = [] + for p in prompt: + each_input_ids = self.encode_one(p, truncation, max_length) + input_ids_.append(each_input_ids) + input_ids = input_ids_ + # For List[int], apply chat template output, already tokens. + elif is_list_of(prompt, int): + input_ids = prompt + # For str, single prompt text + else: + input_ids = self.encode_one(prompt, truncation, max_length) return Encoding(input_ids=input_ids) def get_vocab(self) -> Dict[str, int]: @@ -245,6 +254,19 @@ def get_added_vocab(self) -> Dict[str, int]: # Mistral tokenizers have no added vocabulary return {} + def encode_one( + self, + prompt: str, + truncation: bool = False, + max_length: Optional[int] = None, + ) -> List[int]: + # Mistral Tokenizers should not add special tokens + input_ids = self.encode(prompt) + + if truncation: + input_ids = input_ids[:max_length] + return input_ids + def encode(self, prompt: str) -> List[int]: # `encode` should only be used for prompt completion # it should never be used for chat_completion. 
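The `__call__` change in `mistral.py` above dispatches on the prompt type: a plain `str` is encoded directly via `encode_one`, a `List[str]` is encoded element by element, and a `List[int]` (already-tokenized chat-template output) is passed through untouched. Below is a minimal, self-contained sketch of that dispatch pattern; the `encode_one` stub and the simplified list check only stand in for the real Mistral tokenizer and `vllm.utils.is_list_of`, so treat it as an illustration of the control flow, not the actual implementation.

```python
from typing import List, Optional, Union


def is_list_of(value, typ) -> bool:
    # Simplified stand-in for vllm.utils.is_list_of: a non-empty list whose
    # elements are all instances of `typ`.
    return isinstance(value, list) and bool(value) and all(
        isinstance(v, typ) for v in value)


def encode_one(prompt: str,
               truncation: bool = False,
               max_length: Optional[int] = None) -> List[int]:
    # Stand-in for the tokenizer's encode_one: encode without special
    # tokens, then optionally truncate to max_length.
    input_ids = [ord(c) for c in prompt]  # toy "tokenization"
    if truncation:
        input_ids = input_ids[:max_length]
    return input_ids


def call_tokenizer(
    prompt: Union[str, List[str], List[int]],
    truncation: bool = False,
    max_length: Optional[int] = None,
) -> Union[List[int], List[List[int]]]:
    if is_list_of(prompt, str):
        # Batch of prompt strings -> one token list per prompt.
        return [encode_one(p, truncation, max_length) for p in prompt]
    if is_list_of(prompt, int):
        # Already-tokenized input (e.g. chat-template output) passes through.
        return prompt
    # Single prompt string.
    return encode_one(prompt, truncation, max_length)


print(call_tokenizer("hi"))           # [104, 105]
print(call_tokenizer(["hi", "yo"]))   # [[104, 105], [121, 111]]
print(call_tokenizer([104, 105]))     # [104, 105]
```

The real method additionally wraps the result in `Encoding(input_ids=...)`; the sketch just returns the ids to keep the branching visible.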
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index a9deee881f41a..7f5cc906382af 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -27,6 +27,17 @@ _GLOBAL_RUNTIME_DATA: Dict[str, Union[str, int, bool]] = {} +_USAGE_ENV_VARS_TO_COLLECT = [ + "VLLM_USE_MODELSCOPE", + "VLLM_USE_TRITON_FLASH_ATTN", + "VLLM_ATTENTION_BACKEND", + "VLLM_USE_FLASHINFER_SAMPLER", + "VLLM_PP_LAYER_PARTITION", + "VLLM_USE_TRITON_AWQ", + "VLLM_USE_V1", + "VLLM_ENABLE_V1_MULTIPROCESSING", +] + def set_runtime_usage_data(key: str, value: Union[str, int, bool]) -> None: """Set global usage data that will be sent with every usage heartbeat.""" @@ -119,9 +130,11 @@ def __init__(self) -> None: self.total_memory: Optional[int] = None self.architecture: Optional[str] = None self.platform: Optional[str] = None + self.cuda_runtime: Optional[str] = None self.gpu_count: Optional[int] = None self.gpu_type: Optional[str] = None self.gpu_memory_per_device: Optional[int] = None + self.env_var_json: Optional[str] = None # vLLM Information self.model_architecture: Optional[str] = None @@ -157,6 +170,8 @@ def _report_usage_once(self, model_architecture: str, self.gpu_count = torch.cuda.device_count() self.gpu_type = device_property.name self.gpu_memory_per_device = device_property.total_memory + if current_platform.is_cuda(): + self.cuda_runtime = torch.version.cuda self.provider = _detect_cloud_provider() self.architecture = platform.machine() self.platform = platform.platform() @@ -176,6 +191,12 @@ def _report_usage_once(self, model_architecture: str, self.vllm_version = VLLM_VERSION self.model_architecture = model_architecture + # Environment variables + self.env_var_json = json.dumps({ + env_var: getattr(envs, env_var) + for env_var in _USAGE_ENV_VARS_TO_COLLECT + }) + # Metadata self.log_time = _get_current_timestamp_ns() self.source = envs.VLLM_USAGE_SOURCE diff --git a/vllm/utils.py b/vllm/utils.py index afdaa9e08171c..3aab6a17324bc 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -30,7 +30,7 @@ from collections import OrderedDict, UserDict, defaultdict from collections.abc import Hashable, Iterable, Mapping from dataclasses import dataclass, field -from functools import lru_cache, partial, wraps +from functools import cache, lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, Dict, Generator, Generic, Iterator, List, Literal, NamedTuple, Optional, Tuple, Type, TypeVar, Union, @@ -354,24 +354,24 @@ def reset(self): self._index = 0 -@lru_cache(maxsize=None) +@cache def is_fake_hpu() -> bool: return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0' -@lru_cache(maxsize=None) +@cache def hpu_device_string(): device_string = 'hpu' if not is_fake_hpu() else 'cpu' return device_string -@lru_cache(maxsize=None) +@cache def hpu_backend_string(): backend_string = 'hccl' if not is_fake_hpu() else 'gloo' return backend_string -@lru_cache(maxsize=None) +@cache def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" from vllm import _custom_ops as ops @@ -716,7 +716,7 @@ def create_kv_caches_with_random( return key_caches, value_caches -@lru_cache(maxsize=None) +@cache def is_pin_memory_available() -> bool: from vllm.platforms import current_platform return current_platform.is_pin_memory_available() @@ -957,7 +957,7 @@ def init_cached_hf_modules() -> None: init_hf_modules() -@lru_cache(maxsize=None) +@cache def find_library(lib_name: str) -> str: """ Find the library file in the system. 
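The `usage_lib.py` hunks above add a fixed allowlist of environment-derived settings (`_USAGE_ENV_VARS_TO_COLLECT`) and serialize their current values into `env_var_json` for the usage report. A rough sketch of that pattern is shown below; it reads the variables straight from `os.environ` rather than from `vllm.envs` (which returns parsed, typed values), so the defaults and value types differ from the real code and the shortened allowlist here is only illustrative.

```python
import json
import os
from typing import Dict, Optional

# Shortened allowlist, mirroring the shape of _USAGE_ENV_VARS_TO_COLLECT.
USAGE_ENV_VARS_TO_COLLECT = [
    "VLLM_USE_MODELSCOPE",
    "VLLM_ATTENTION_BACKEND",
    "VLLM_USE_V1",
]


def collect_env_vars_json() -> str:
    """Serialize the allowlisted environment variables to a JSON string.

    Unset variables are reported as None (JSON null) so the report schema
    stays stable regardless of the local environment.
    """
    values: Dict[str, Optional[str]] = {
        name: os.environ.get(name)
        for name in USAGE_ENV_VARS_TO_COLLECT
    }
    return json.dumps(values)


if __name__ == "__main__":
    os.environ["VLLM_USE_V1"] = "1"
    print(collect_env_vars_json())
    # e.g. {"VLLM_USE_MODELSCOPE": null, "VLLM_ATTENTION_BACKEND": null,
    #       "VLLM_USE_V1": "1"}
```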
@@ -1704,7 +1704,7 @@ def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): return module -@lru_cache(maxsize=None) +@cache def get_vllm_optional_dependencies(): metadata = importlib.metadata.metadata("vllm") requirements = metadata.get_all("Requires-Dist", []) @@ -2020,36 +2020,57 @@ def kill_process_tree(pid: int): @dataclass class MemorySnapshot: """Memory snapshot.""" - torch_peak_in_bytes: int = 0 - torch_memory_in_bytes: int = 0 + torch_peak: int = 0 + cuda_memory: int = 0 + torch_memory: int = 0 + non_torch_memory: int = 0 timestamp: float = 0.0 + auto_measure: bool = True + + def __post_init__(self): + if self.auto_measure: + self.measure() def measure(self): - self.torch_peak_in_bytes = torch.cuda.max_memory_reserved() + # we measure the torch peak memory usage via allocated_bytes, + # rather than `torch.cuda.memory_reserved()` . + # After `torch.cuda.reset_peak_memory_stats()`, + # `torch.cuda.memory_reserved()` will keep growing, and only shrink + # when we call `torch.cuda.empty_cache()` or OOM happens. + self.torch_peak = torch.cuda.memory_stats().get( + "allocated_bytes.all.peak", 0) + + self.cuda_memory = torch.cuda.mem_get_info( + )[1] - torch.cuda.mem_get_info()[0] + # torch.cuda.memory_reserved() is how many bytes # PyTorch gets from cuda (by calling cudaMalloc, etc.) - self.torch_memory_in_bytes = torch.cuda.memory_reserved() + # this is used to measure the non-torch memory usage + self.torch_memory = torch.cuda.memory_reserved() + + self.non_torch_memory = self.cuda_memory - self.torch_memory self.timestamp = time.time() def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": - """support a - b""" return MemorySnapshot( - torch_peak_in_bytes=self.torch_peak_in_bytes - - other.torch_peak_in_bytes, - torch_memory_in_bytes=self.torch_memory_in_bytes - - other.torch_memory_in_bytes, - timestamp=self.timestamp - other.timestamp) + torch_peak=self.torch_peak - other.torch_peak, + cuda_memory=self.cuda_memory - other.cuda_memory, + torch_memory=self.torch_memory - other.torch_memory, + non_torch_memory=self.non_torch_memory - other.non_torch_memory, + timestamp=self.timestamp - other.timestamp, + auto_measure=False, + ) @dataclass class MemoryProfilingResult: - """Memory profiling result. - """ # noqa - baseline_memory_in_bytes: int = 0 - non_kv_cache_memory_in_bytes: int = 0 - torch_peak_increase_in_bytes: int = 0 - non_torch_increase_in_bytes: int = 0 - weights_memory_in_bytes: float = 0 + """Memory profiling result. All numbers are in bytes. + """ + non_kv_cache_memory: int = 0 + torch_peak_increase: int = 0 + non_torch_increase: int = 0 + weights_memory: float = 0 + before_create: MemorySnapshot = field(default_factory=MemorySnapshot) before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) profile_time: float = 0.0 @@ -2057,18 +2078,14 @@ class MemoryProfilingResult: @contextlib.contextmanager def memory_profiling( - baseline_memory_in_bytes: int, weights_memory_in_bytes: int -) -> Generator[MemoryProfilingResult, None, None]: + baseline_snapshot: MemorySnapshot, + weights_memory: int) -> Generator[MemoryProfilingResult, None, None]: """Memory profiling context manager. - baseline_memory_in_bytes: memory used by all the components other than - the current vLLM instance. It contains: memory used by other processes, memory - used by another vLLM instance in the same process, etc. It is usually measured - before the current vLLM instance initialize the device. 
And we assume it is - constant during the profiling of the current vLLM instance. - weights_memory_in_bytes: memory used by PyTorch when loading the model weights. + baseline_snapshot: the memory snapshot before the current vLLM instance. + weights_memory: memory used by PyTorch when loading the model weights. Note that, before loading the model weights, we also initialize the device and distributed environment, which may consume some memory. This part is not - included in the weights_memory_in_bytes because PyTorch does not control it. + included in the weights_memory because PyTorch does not control it. The memory in one GPU can be classified into 3 categories: 1. memory used by anything other than the current vLLM instance. @@ -2103,20 +2120,21 @@ def memory_profiling( b. 2 GiB reserved for the peak activation tensors (category 2) c. 1 GiB used by non-torch components (category 3) - The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`. + The memory used for loading weights (a.) is directly given from the argument `weights_memory`. - The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). + The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.). - (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`), - subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_reserved()`. + The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.). """ # noqa + gc.collect() + torch.cuda.empty_cache() torch.cuda.reset_peak_memory_stats() result = MemoryProfilingResult() - result.baseline_memory_in_bytes = baseline_memory_in_bytes + result.before_create = baseline_snapshot # the part of memory used for holding the model weights - result.weights_memory_in_bytes = weights_memory_in_bytes + result.weights_memory = weights_memory result.before_profile.measure() @@ -2127,13 +2145,12 @@ def memory_profiling( result.after_profile.measure() - diff = result.after_profile - result.before_profile - result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes - current_cuda_memory_bytes = torch.cuda.mem_get_info( - )[1] - torch.cuda.mem_get_info()[0] - result.non_torch_increase_in_bytes = current_cuda_memory_bytes - baseline_memory_in_bytes - weights_memory_in_bytes - diff.torch_memory_in_bytes # noqa - result.profile_time = diff.timestamp - result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes # noqa + diff_profile = result.after_profile - result.before_profile + diff_from_create = result.after_profile - result.before_create + result.torch_peak_increase = diff_profile.torch_peak + result.non_torch_increase = diff_from_create.non_torch_memory + result.profile_time = diff_profile.timestamp + result.non_kv_cache_memory = result.non_torch_increase + result.torch_peak_increase + result.weights_memory # noqa # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py old mode 100644 new mode 100755 index 7b0786261a6a6..ce83b1fac6c0b --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -9,8 +9,15 @@ from vllm.attention.backends.abstract import 
(AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) +from vllm.envs import VLLM_FLASH_ATTN_VERSION +from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils import cdiv -from vllm.vllm_flash_attn import flash_attn_varlen_func +from vllm.vllm_flash_attn import (fa_version_unsupported_reason, + flash_attn_varlen_func, + is_fa_version_supported) + +logger = init_logger(__name__) class FlashAttentionBackend(AttentionBackend): @@ -63,7 +70,7 @@ class FlashAttentionMetadata: max_query_len: int query_start_loc: torch.Tensor max_seq_len: int - seq_start_loc: torch.Tensor + seq_lens: torch.Tensor block_table: torch.Tensor slot_mapping: torch.Tensor @@ -71,8 +78,8 @@ class FlashAttentionMetadata: use_cascade: bool common_prefix_len: int cu_prefix_query_lens: Optional[torch.Tensor] - cu_prefix_kv_lens: Optional[torch.Tensor] - cu_suffix_kv_lens: Optional[torch.Tensor] + prefix_kv_lens: Optional[torch.Tensor] + suffix_kv_lens: Optional[torch.Tensor] # For logging. num_input_tokens: int = 0 # Number of tokens including padding. @@ -128,15 +135,33 @@ def __init__( "are not implemented for " "FlashAttentionImpl") + # if hopper default to FA3, otherwise stick to FA2 for now + # TODO(lucas): profile FA3 on ampere to see if it makes sense to + # use FA3 as default for both + if current_platform.get_device_capability()[0] >= 9: + self.fa_version = 3 if is_fa_version_supported(3) else 2 + else: + self.fa_version = 2 + + if VLLM_FLASH_ATTN_VERSION is not None: + assert VLLM_FLASH_ATTN_VERSION in [2, 3] + self.fa_version = VLLM_FLASH_ATTN_VERSION + + if not is_fa_version_supported(self.fa_version): + logger.error("Cannot use FA version %d is not supported due to %s", + self.fa_version, + fa_version_unsupported_reason(self.fa_version)) + + assert is_fa_version_supported(self.fa_version) + def forward( self, + layer: torch.nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: FlashAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -150,10 +175,6 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - # NOTE(woosuk): FlashAttention does not support FP8 KV cache. - assert k_scale == 1.0 and v_scale == 1.0, ( - "key/v_scale is not supported in FlashAttention.") - assert output is not None, "Output tensor must be provided." if attn_metadata is None: @@ -183,8 +204,8 @@ def forward( value_cache, attn_metadata.slot_mapping, self.kv_cache_dtype, - k_scale, - v_scale, + layer._k_scale, + layer._v_scale, ) # Compute attention and update output up to `num_actual_tokens`. 
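The constructor change above selects a FlashAttention version at init time: FA3 by default on devices with compute capability 9.x (Hopper) when the installed build supports it, FA2 everywhere else, with `VLLM_FLASH_ATTN_VERSION` acting as an explicit override that is still validated against the supported versions. The standalone sketch below captures that selection logic; `is_fa_version_supported` is stubbed out and the capability is passed in as a plain integer, so it mirrors only the control flow, not the real `vllm.vllm_flash_attn` API.

```python
from typing import Optional


def is_fa_version_supported(version: int) -> bool:
    # Stub: in vLLM this comes from the vllm_flash_attn build and reflects
    # which kernels were actually compiled in.
    return version in (2, 3)


def choose_fa_version(device_capability_major: int,
                      override: Optional[int] = None) -> int:
    """Hopper-class devices default to FA3 if available, everything else FA2."""
    if device_capability_major >= 9:
        fa_version = 3 if is_fa_version_supported(3) else 2
    else:
        fa_version = 2

    # An explicit override (VLLM_FLASH_ATTN_VERSION in the real code) wins,
    # but must still name a known version.
    if override is not None:
        assert override in (2, 3)
        fa_version = override

    if not is_fa_version_supported(fa_version):
        raise RuntimeError(
            f"FlashAttention version {fa_version} is not supported by this build")
    return fa_version


print(choose_fa_version(9))              # 3 on Hopper-class devices
print(choose_fa_version(8))              # 2 on Ampere/Ada
print(choose_fa_version(9, override=2))  # explicit downgrade to FA2
```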
@@ -197,7 +218,7 @@ def forward( out=output[:num_actual_tokens], cu_seqlens_q=attn_metadata.query_start_loc, max_seqlen_q=attn_metadata.max_query_len, - cu_seqlens_k=attn_metadata.seq_start_loc, + seqused_k=attn_metadata.seq_lens, max_seqlen_k=attn_metadata.max_seq_len, softmax_scale=self.scale, causal=True, @@ -205,6 +226,7 @@ def forward( window_size=self.sliding_window, block_table=attn_metadata.block_table, softcap=self.logits_soft_cap, + fa_version=self.fa_version, ) return output @@ -217,8 +239,8 @@ def forward( cu_query_lens=attn_metadata.query_start_loc, max_query_len=attn_metadata.max_query_len, cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens, - cu_prefix_kv_lens=attn_metadata.cu_prefix_kv_lens, - cu_suffix_kv_lens=attn_metadata.cu_suffix_kv_lens, + prefix_kv_lens=attn_metadata.prefix_kv_lens, + suffix_kv_lens=attn_metadata.suffix_kv_lens, max_kv_len=attn_metadata.max_seq_len, softmax_scale=self.scale, alibi_slopes=self.alibi_slopes, @@ -226,6 +248,7 @@ def forward( logits_soft_cap=self.logits_soft_cap, block_table=attn_metadata.block_table, common_prefix_len=attn_metadata.common_prefix_len, + fa_version=self.fa_version, ) return output @@ -306,8 +329,8 @@ def cascade_attention( cu_query_lens: torch.Tensor, max_query_len: int, cu_prefix_query_lens: torch.Tensor, - cu_prefix_kv_lens: torch.Tensor, - cu_suffix_kv_lens: torch.Tensor, + prefix_kv_lens: torch.Tensor, + suffix_kv_lens: torch.Tensor, max_kv_len: int, softmax_scale: float, alibi_slopes: Optional[torch.Tensor], @@ -315,6 +338,7 @@ def cascade_attention( logits_soft_cap: float, block_table: torch.Tensor, common_prefix_len: int, + fa_version: int, ) -> torch.Tensor: assert alibi_slopes is None, ("Cascade attention does not support ALiBi.") # TODO: Support sliding window. @@ -333,7 +357,7 @@ def cascade_attention( k=key_cache, v=value_cache, cu_seqlens_q=cu_prefix_query_lens, - cu_seqlens_k=cu_prefix_kv_lens, + seqused_k=prefix_kv_lens, max_seqlen_q=num_tokens, max_seqlen_k=common_prefix_len, softmax_scale=softmax_scale, @@ -342,6 +366,7 @@ def cascade_attention( block_table=block_table[:1], softcap=logits_soft_cap, return_softmax_lse=True, + fa_version=fa_version, ) # Process suffix per query. @@ -350,7 +375,7 @@ def cascade_attention( k=key_cache, v=value_cache, cu_seqlens_q=cu_query_lens, - cu_seqlens_k=cu_suffix_kv_lens, + seqused_k=suffix_kv_lens, max_seqlen_q=max_query_len, max_seqlen_k=max_kv_len - common_prefix_len, softmax_scale=softmax_scale, @@ -359,6 +384,7 @@ def cascade_attention( block_table=block_table[:, num_common_kv_blocks:], softcap=logits_soft_cap, return_softmax_lse=True, + fa_version=fa_version, ) # Merge prefix and suffix outputs, and store the result in output. diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index bac77443c8560..18fdfdfe4a010 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -285,6 +285,56 @@ def free(self, request: Request) -> None: if block.ref_cnt == 0: self.free_block_queue.append(block) + def uncache_blocks(self, request: Request) -> int: + """Uncache the blocks that are no longer full based on the + num_computed_tokens in the given request. This happens when + the blocks were full and cached due to speculative tokens, but the + speculative tokens are not accepted. + + Args: + request: The request. + + Returns: + The number of uncached blocks. 
+ """ + blocks = self.req_to_blocks[request.request_id] + num_computed_tokens = request.num_computed_tokens + num_full_blocks = num_computed_tokens // self.block_size + num_uncached_blocks = 0 + for block in blocks[num_full_blocks:]: + # If the block is not cached, the following blocks are not cached. + if not self._maybe_evict_cached_block(block): + break + num_uncached_blocks += 1 + return num_uncached_blocks + + def reset_prefix_cache(self) -> bool: + """Reset prefix cache. This function may be used in RLHF + flows to invalid prefix caching after the weights are updated, + or used for resetting prefix caching status for benchmarking. + + Returns: + bool: True if the prefix cache is successfully reset, + False otherwise. + """ + num_used_blocks = (self.num_gpu_blocks - + self.free_block_queue.num_free_blocks) + if num_used_blocks > 0: + logger.warning( + "Failed to reset prefix cache because some " + "blocks (%d) are not freed yet", num_used_blocks) + return False + + # Remove all hashes so that no new blocks will hit. + self.cached_block_hash_to_block = defaultdict(dict) + + # Remove all hashes from all blocks. + for block in self.block_pool: + block.reset_hash() + + logger.info("Successfully reset prefix cache") + return True + def get_num_common_prefix_blocks( self, request: Request, @@ -359,7 +409,7 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: # If the block is cached, evict it. if self.enable_caching: - self._evict_cached_block(curr_block) + self._maybe_evict_cached_block(curr_block) curr_block.incr_ref() ret.append(curr_block) @@ -367,13 +417,16 @@ def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: return ret - def _evict_cached_block(self, block: KVCacheBlock) -> None: + def _maybe_evict_cached_block(self, block: KVCacheBlock) -> bool: """ If a block is cached in `cached_block_hash_to_block`, we reset its hash metadata and evict it from the cache. Args: block: The block to evict. + + Returns: + True if the block is evicted, False otherwise. """ block_hash = block.block_hash if block_hash and block_hash in self.cached_block_hash_to_block: @@ -383,6 +436,9 @@ def _evict_cached_block(self, block: KVCacheBlock) -> None: if len(self.cached_block_hash_to_block[block_hash]) == 0: del self.cached_block_hash_to_block[block_hash] + return True + return False + def _get_cached_block(self, block_hash: BlockHashType) -> Optional[KVCacheBlock]: """Get a cached block by the block hash, or None if cache miss. diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 64df21d59fef4..7a88cc9433b32 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -247,8 +247,8 @@ def schedule(self) -> "SchedulerOutput": token_budget -= num_new_tokens request.status = RequestStatus.RUNNING request.num_computed_tokens = num_computed_tokens - has_partial_request = (num_computed_tokens + num_new_tokens < - request.num_tokens) + has_partial_request = (num_computed_tokens + num_new_tokens + < request.num_tokens) # Encoder-related. if encoder_inputs_to_schedule: @@ -411,6 +411,10 @@ def update_from_output( num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] outputs: List[EngineCoreOutput] = [] + + # NOTE(woosuk): As len(self.running) can be up to 1K or more, the below + # loop can be a performance bottleneck. We should do our best to avoid + # expensive operations inside the loop. 
for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -421,13 +425,15 @@ def update_from_output( cached_encoder_input_ids = ( self.encoder_cache_manager.get_cached_input_ids(request)) - for input_id in list(cached_encoder_input_ids): - start_pos = request.mm_positions[input_id]["offset"] - num_tokens = request.mm_positions[input_id]["length"] - if start_pos + num_tokens <= request.num_computed_tokens: - # The encoder output is already processed and stored - # in the decoder's KV cache. - self.encoder_cache_manager.free(request, input_id) + # OPTIMIZATION: Avoid list(set) if the set is empty. + if cached_encoder_input_ids: + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + self.encoder_cache_manager.free(request, input_id) if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] @@ -529,6 +535,9 @@ def get_num_unfinished_requests(self) -> int: def has_unfinished_requests(self) -> bool: return self.get_num_unfinished_requests() > 0 + def reset_prefix_cache(self) -> bool: + return self.kv_cache_manager.reset_prefix_cache() + def make_stats(self) -> SchedulerStats: return SchedulerStats( num_running_reqs=len(self.running), diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 6d90c38c72cf5..abe4952c4baff 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -66,6 +66,11 @@ class EngineCoreProfile: is_start: bool +@dataclass +class EngineCoreResetPrefixCache: + pass + + class EngineCoreRequestType(enum.Enum): """ Request types defined as hex byte strings, so it can be sent over sockets @@ -74,6 +79,8 @@ class EngineCoreRequestType(enum.Enum): ADD = b'\x00' ABORT = b'\x01' PROFILE = b'\x02' + RESET_PREFIX_CACHE = b'\x03' -EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, List[str]] +EngineCoreRequestUnion = Union[EngineCoreRequest, EngineCoreProfile, + EngineCoreResetPrefixCache, List[str]] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index a74699f7513e6..b9dc3561d1750 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -2,9 +2,12 @@ import os from typing import AsyncGenerator, List, Mapping, Optional, Type, Union +import numpy as np + from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient +from vllm.envs import VLLM_V1_OUTPUT_PROC_CHUNK_SIZE from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger @@ -12,16 +15,17 @@ from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.utils import kill_process_tree +from vllm.utils import cdiv, kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from 
vllm.v1.engine.output_processor import OutputProcessor from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import LoggingStatLogger, StatLoggerBase +from vllm.v1.metrics.loggers import (LoggingStatLogger, PrometheusStatLogger, + StatLoggerBase) from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -43,13 +47,14 @@ def __init__( assert start_engine_loop + self.model_config = vllm_config.model_config + self.log_requests = log_requests self.log_stats = log_stats self.stat_loggers: List[StatLoggerBase] = [ LoggingStatLogger(), - # TODO(rob): PrometheusStatLogger(), + PrometheusStatLogger(vllm_config.model_config), ] - self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( @@ -205,17 +210,23 @@ async def generate( # The output_handler task pushes items into the queue. # This task pulls from the queue and yields to caller. - while True: + finished = False + while not finished: # Note: drain queue without await if possible (avoids # task switching under load which helps performance). - out = q.get_nowait() if q.qsize() > 0 else await q.get() + out = q.get_nowait() if not q.empty() else await q.get() + + # Coalesce any additional queued outputs + while not q.empty(): + next_out = q.get_nowait() + if sampling_params.output_kind == RequestOutputKind.DELTA: + out.add(next_out) + else: + out = next_out # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. - if out.finished: - yield out - break - + finished = out.finished yield out # If the request is disconnected by the client, the @@ -233,22 +244,41 @@ async def _run_output_handler(self): # 1) Pull EngineCoreOutputs from the EngineCore. outputs = await self.engine_core.get_output_async() - # 2) Process EngineCoreOutputs. - processed_outputs = self.output_processor.process_outputs( - outputs.outputs) - # NOTE: RequestOutputs are pushed to their queues. - assert len(processed_outputs.request_outputs) == 0 - - # 3) Abort any reqs that finished due to stop strings. - await self.engine_core.abort_requests_async( - processed_outputs.reqs_to_abort) + # Split outputs into chunks of at most + # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the + # event loop for too long. + num_outputs = len(outputs.outputs) + if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: + slices = (outputs.outputs, ) + else: + slices = np.array_split( + outputs.outputs, + cdiv(num_outputs, VLLM_V1_OUTPUT_PROC_CHUNK_SIZE)) + + iteration_stats = None + for i, outputs_slice in enumerate(slices): + # 2) Process EngineCoreOutputs. + processed_outputs = self.output_processor.process_outputs( + outputs_slice, iteration_stats) + # NOTE: RequestOutputs are pushed to their queues. + assert not processed_outputs.request_outputs + iteration_stats = processed_outputs.iteration_stats + + # Allow other asyncio tasks to run between chunks + if i + 1 < len(slices): + await asyncio.sleep(0) + + # 3) Abort any reqs that finished due to stop strings. + await self.engine_core.abort_requests_async( + processed_outputs.reqs_to_abort) # 4) Logging. # TODO(rob): make into a coroutine and launch it in - # background thread once we add Prometheus. + # background thread once Prometheus overhead is non-trivial. 
+ assert iteration_stats is not None self._log_stats( scheduler_stats=outputs.scheduler_stats, - iteration_stats=processed_outputs.iteration_stats, + iteration_stats=iteration_stats, ) except Exception as e: @@ -274,7 +304,8 @@ def _log_stats( return for logger in self.stat_loggers: - logger.log(scheduler_stats=scheduler_stats) + logger.log(scheduler_stats=scheduler_stats, + iteration_stats=iteration_stats) def encode( self, @@ -321,6 +352,9 @@ async def start_profile(self) -> None: async def stop_profile(self) -> None: await self.engine_core.profile_async(False) + async def reset_prefix_cache(self) -> None: + await self.engine_core.reset_prefix_cache_async() + @property def is_running(self) -> bool: return True diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 26ebc7edcf03e..f50303bda58fd 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -20,7 +20,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, - EngineCoreRequestUnion) + EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.mm_input_mapper import MMInputMapperServer from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus @@ -42,7 +42,7 @@ def __init__( ): assert vllm_config.model_config.runner_type != "pooling" - logger.info("Initializing an LLM engine (v%s) with config: %s", + logger.info("Initializing a V1 LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) # Setup Model. @@ -135,6 +135,9 @@ def shutdown(self): def profile(self, is_start: bool = True): self.model_executor.profile(is_start) + def reset_prefix_cache(self): + self.scheduler.reset_prefix_cache() + class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" @@ -247,6 +250,8 @@ def _handle_client_request(self, request: EngineCoreRequestUnion) -> None: self.add_request(request) elif isinstance(request, EngineCoreProfile): self.model_executor.profile(request.is_start) + elif isinstance(request, EngineCoreResetPrefixCache): + self.reset_prefix_cache() else: # TODO: make an EngineCoreAbort wrapper assert isinstance(request, list) @@ -271,7 +276,9 @@ def process_input_socket(self, input_path: str): request = decoder_add_req.decode(request_data) elif request_type == EngineCoreRequestType.ABORT.value: request = decoder_abort_req.decode(request_data) - elif request_type == EngineCoreRequestType.PROFILE.value: + elif request_type in ( + EngineCoreRequestType.PROFILE.value, + EngineCoreRequestType.RESET_PREFIX_CACHE.value): request = pickle.loads(request_data) else: raise ValueError(f"Unknown RequestType: {request_type}") diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index ac0f0f14bf1ab..f3b992d6873e7 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,8 +1,9 @@ +import asyncio import os import signal import weakref from abc import ABC, abstractmethod -from typing import List, Type +from typing import List, Optional, Type import msgspec import zmq @@ -14,7 +15,7 @@ make_zmq_socket) from vllm.v1.engine import (EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, - EngineCoreRequestUnion) + EngineCoreRequestUnion, EngineCoreResetPrefixCache) from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder @@ -69,6 +70,9 @@ def add_request(self, request: 
EngineCoreRequest) -> None: def profile(self, is_start: bool = True) -> None: raise NotImplementedError + def reset_prefix_cache(self) -> None: + raise NotImplementedError + def abort_requests(self, request_ids: List[str]) -> None: raise NotImplementedError @@ -81,6 +85,9 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: async def profile_async(self, is_start: bool = True) -> None: raise NotImplementedError + async def reset_prefix_cache_async(self) -> None: + raise NotImplementedError + async def abort_requests_async(self, request_ids: List[str]) -> None: raise NotImplementedError @@ -108,12 +115,15 @@ def abort_requests(self, request_ids: List[str]) -> None: if len(request_ids) > 0: self.engine_core.abort_requests(request_ids) - def shutdown(self): + def shutdown(self) -> None: self.engine_core.shutdown() def profile(self, is_start: bool = True) -> None: self.engine_core.profile(is_start) + def reset_prefix_cache(self) -> None: + self.engine_core.reset_prefix_cache() + class MPClient(EngineCoreClient): """ @@ -229,6 +239,10 @@ def profile(self, is_start: bool = True) -> None: self._send_input(EngineCoreRequestType.PROFILE, EngineCoreProfile(is_start)) + def reset_prefix_cache(self) -> None: + self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, + EngineCoreResetPrefixCache()) + class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" @@ -242,10 +256,24 @@ def __init__(self, vllm_config: VllmConfig, log_stats=True, ) + self.outputs_queue: Optional[asyncio.Queue[bytes]] = None + self.queue_task: Optional[asyncio.Task] = None + async def get_output_async(self) -> EngineCoreOutputs: + if self.outputs_queue is None: + # Perform IO in separate task to parallelize as much as possible + self.outputs_queue = asyncio.Queue() + + async def process_outputs_socket(): + assert self.outputs_queue is not None + while True: + (frame, ) = await self.output_socket.recv_multipart( + copy=False) + self.outputs_queue.put_nowait(frame.buffer) - frames = await self.output_socket.recv_multipart(copy=False) - return self.decoder.decode(frames[0].buffer) + self.queue_task = asyncio.create_task(process_outputs_socket()) + + return self.decoder.decode(await self.outputs_queue.get()) async def _send_input(self, request_type: EngineCoreRequestType, request: EngineCoreRequestUnion) -> None: @@ -266,3 +294,7 @@ async def abort_requests_async(self, request_ids: List[str]) -> None: async def profile_async(self, is_start: bool = True) -> None: await self._send_input(EngineCoreRequestType.PROFILE, EngineCoreProfile(is_start)) + + async def reset_prefix_cache_async(self) -> None: + await self._send_input(EngineCoreRequestType.RESET_PREFIX_CACHE, + EngineCoreResetPrefixCache()) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f5999ccda6447..55d314ebeb955 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -162,6 +162,9 @@ def start_profile(self): def stop_profile(self): self.engine_core.profile(False) + def reset_prefix_cache(self): + self.engine_core.reset_prefix_cache() + def get_tokenizer_group( self, group_type: Type[_G] = BaseTokenizerGroup, diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 749f4f5043c97..234ef8194ca93 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -8,7 +8,7 @@ from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest from vllm.v1.engine.detokenizer import (DetokenizerOutput, 
IncrementalDetokenizer) -from vllm.v1.metrics.stats import IterationStats +from vllm.v1.metrics.stats import IterationStats, RequestStateStats @dataclass @@ -27,6 +27,7 @@ def __init__( prompt: Optional[str], prompt_token_ids: List[int], detokenizer: IncrementalDetokenizer, + arrival_time: float, queue: Optional[asyncio.Queue[RequestOutput]], ): self.request_id = request_id @@ -37,6 +38,8 @@ def __init__( self.is_prefilling = True self.queue = queue + self.stats = RequestStateStats(last_token_time=arrival_time) + @classmethod def from_new_request( cls, @@ -52,6 +55,7 @@ def from_new_request( tokenizer=tokenizer, request=request, ), + arrival_time=request.arrival_time, queue=queue, ) @@ -101,6 +105,7 @@ def add_request( def process_outputs( self, engine_core_outputs: List[EngineCoreOutput], + iteration_stats: Optional[IterationStats] = None, ) -> OutputProcessorOutput: """ Process the EngineCoreOutputs: @@ -133,7 +138,8 @@ def process_outputs( request_outputs: List[RequestOutput] = [] reqs_to_abort: List[str] = [] - iteration_stats = IterationStats(self.log_stats) + if not iteration_stats: + iteration_stats = IterationStats(self.log_stats) for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id req_state = self.request_states.get(req_id) @@ -144,7 +150,8 @@ def process_outputs( # 1) Compute stats for this iteration. iteration_stats.update_from_output(engine_core_output, req_state.is_prefilling, - req_state.prompt_len) + req_state.prompt_len, + req_state.stats) req_state.is_prefilling = False # 2) Detokenize the token ids into text. @@ -169,14 +176,18 @@ def process_outputs( # detected stop string, abort needed in EngineCore. reqs_to_abort.append(req_id) + # Track per-request stats + iteration_stats.update_from_finished_request( + request_output, req_state.stats) + return OutputProcessorOutput( request_outputs=request_outputs, reqs_to_abort=reqs_to_abort, iteration_stats=iteration_stats, ) + @staticmethod def _make_request_output( - self, request_state: RequestState, detokenizer_output: Optional[DetokenizerOutput], ) -> Optional[RequestOutput]: diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 5240778ebf330..131be759842c7 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,63 +1,92 @@ -from abc import ABC, abstractmethod from typing import Type from vllm.config import VllmConfig +from vllm.executor.executor_base import ExecutorBase +from vllm.executor.ray_distributed_executor import ( # noqa + RayDistributedExecutor as RayDistributedExecutorV0) +from vllm.executor.uniproc_executor import ( # noqa + ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0) +from vllm.executor.uniproc_executor import ( # noqa + UniProcExecutor as UniProcExecutorV0) from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput -class Executor(ABC): - """Abstract class for executors.""" +class Executor(ExecutorBase): + """ + Abstract class for v1 executors, mainly define some methods for v1. 
+ For methods shared by v0 and v1, define them in ExecutorBase""" @staticmethod def get_class(vllm_config: VllmConfig) -> Type["Executor"]: executor_class: Type[Executor] + parallel_config = vllm_config.parallel_config distributed_executor_backend = ( - vllm_config.parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend) + if distributed_executor_backend is None: + # If the user does not specify the distributed executor backend, + # we will choose the backend based on the world size. + if parallel_config.world_size > 1: + distributed_executor_backend = "mp" + else: + distributed_executor_backend = "uni" + if distributed_executor_backend == "ray": - from vllm.executor.ray_distributed_executor import ( # noqa - RayDistributedExecutor) executor_class = RayDistributedExecutor elif distributed_executor_backend == "mp": from vllm.v1.executor.multiproc_executor import MultiprocExecutor executor_class = MultiprocExecutor + elif distributed_executor_backend == "uni": + executor_class = UniProcExecutor + elif distributed_executor_backend == "external_launcher": + # TODO: make v1 scheduling deterministic + # to support external launcher + executor_class = ExecutorWithExternalLauncher else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor + raise ValueError("Unknown distributed executor backend: " + f"{distributed_executor_backend}") return executor_class - @abstractmethod - def __init__(self, vllm_config: VllmConfig) -> None: - raise NotImplementedError - - @abstractmethod def initialize(self, kv_cache_config: KVCacheConfig) -> None: - raise NotImplementedError + """ + Initialize the KV caches and begin the model execution loop of the + underlying workers. + """ + self.collective_rpc("initialize_cache", args=(kv_cache_config, )) + self.collective_rpc("compile_or_warm_up_model") - @abstractmethod def determine_available_memory(self) -> int: # in bytes - raise NotImplementedError + output = self.collective_rpc("determine_available_memory") + # Since we use a shared centralized controller, we take the minimum + # memory size across all workers to make sure all the memory + # operators can be applied to all workers. 
+ return min(output) - @abstractmethod def get_kv_cache_spec(self) -> KVCacheSpec: - raise NotImplementedError + output = self.collective_rpc("get_kv_cache_spec") + for x in output: + assert x == output[0] + return output[0] - @abstractmethod def execute_model( self, scheduler_output, ) -> ModelRunnerOutput: - raise NotImplementedError + output = self.collective_rpc("execute_model", + args=(scheduler_output, )) + return output[0] - @abstractmethod def profile(self, is_start: bool = True): - raise NotImplementedError + self.collective_rpc("profile", args=(is_start, )) + + +class UniProcExecutor(UniProcExecutorV0, Executor): + pass + + +class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor): + pass - @abstractmethod - def shutdown(self): - pass - @abstractmethod - def check_health(self) -> None: - raise NotImplementedError +class RayDistributedExecutor(RayDistributedExecutorV0, Executor): + pass diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index fd977d07e8d81..f6cf35da0106b 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -25,8 +25,6 @@ from vllm.utils import (get_distributed_init_method, get_mp_context, get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) from vllm.v1.executor.abstract import Executor -from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec -from vllm.v1.outputs import ModelRunnerOutput from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -37,7 +35,7 @@ class MultiprocExecutor(Executor): - def __init__(self, vllm_config: VllmConfig) -> None: + def _init_executor(self) -> None: # Call self.shutdown at exit to clean up # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) @@ -55,9 +53,6 @@ def sigusr1_handler(signum, frame): signal.signal(signal.SIGUSR1, sigusr1_handler) - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.world_size = self.parallel_config.world_size tensor_parallel_size = self.parallel_config.tensor_parallel_size assert self.world_size == tensor_parallel_size, ( @@ -82,7 +77,8 @@ def sigusr1_handler(signum, frame): # Create workers self.workers: List[WorkerProcHandle] = [] for rank in range(self.world_size): - worker = WorkerProc.make_worker_process(vllm_config, rank, rank, + worker = WorkerProc.make_worker_process(self.vllm_config, rank, + rank, distributed_init_method, scheduler_output_handle) self.workers.append(worker) @@ -93,55 +89,17 @@ def sigusr1_handler(signum, frame): for w in self.workers: w.worker_response_mq.wait_until_ready() - def initialize(self, kv_cache_config: KVCacheConfig) -> None: - """ - Initialize the KV caches and begin the model execution loop of the - underlying workers. - """ - self.collective_rpc("initialize_cache", args=(kv_cache_config, )) - self.collective_rpc("compile_or_warm_up_model") - - def determine_available_memory(self) -> int: - """ - Determine the available memory (in bytes) for KV cache by invoking the - underlying worker. - """ - memory_sizes = self.collective_rpc("determine_available_memory") - - # Since we use a shared centralized controller, we take the minimum - # memory size across all workers to make sure all the memory - # operators can be applied to all workers. - return min(memory_sizes) - - def get_kv_cache_spec(self) -> KVCacheSpec: - """ - Get all kv cache needed by the model by invoking the underlying worker. 
- """ - kv_cache_specs = self.collective_rpc("get_kv_cache_spec") - assert all(s == kv_cache_specs[0] for s in kv_cache_specs) - return kv_cache_specs[0] - def collective_rpc(self, method: Union[str, Callable], timeout: Optional[float] = None, args: Tuple = (), kwargs: Optional[Dict] = None) -> List[Any]: - """ - Execute an RPC call on workers. - - Args: - method: Name of the worker method to execute - timeout: Maximum time in seconds to wait for execution. Rases a - TimeoutError on timeout. None means wait indefinitely. - args: Positional arguments to pass to the worker method - kwargs: Keyword arguments to pass to the worker method - - Returns: - List of results from each worker - """ start_time = time.monotonic() kwargs = kwargs or {} + # NOTE: If the args are heterogeneous, then we pack them into a list, + # and unpack them in the method of every worker, because every worker + # knows their own rank. try: if isinstance(method, str): send_method = method @@ -172,18 +130,6 @@ def collective_rpc(self, # Re-raise any other exceptions raise e - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - model_output = self.collective_rpc("execute_model", - args=(scheduler_output, ))[0] - return model_output - - def profile(self, is_start: bool = True): - self.collective_rpc("profile", args=(is_start, )) - return - def _ensure_worker_termination(self): """Ensure that all worker processes are terminated. Assumes workers have received termination requests. Waits for processing, then sends diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py deleted file mode 100644 index fd67fa2235770..0000000000000 --- a/vllm/v1/executor/ray_executor.py +++ /dev/null @@ -1,344 +0,0 @@ -import os -from collections import defaultdict -from itertools import islice, repeat -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple - -import vllm.envs as envs -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.executor.ray_utils import (RayWorkerWrapper, - initialize_ray_cluster, ray) -from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec -from vllm.v1.outputs import ModelRunnerOutput - -if ray is not None: - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - -if TYPE_CHECKING: - from ray.util.placement_group import PlacementGroup - -logger = init_logger(__name__) - - -class RayExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.parallel_config = vllm_config.parallel_config - self.model_config = vllm_config.model_config - self.forward_dag: Optional[ray.dag.CompiledDAG] = None - - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - - initialize_ray_cluster(self.parallel_config) - placement_group = self.parallel_config.placement_group - - # Create the parallel GPU workers. - self._init_workers_ray(placement_group) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - # A list of workers to run a model. - self.workers: List[RayWorkerWrapper] = [] - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - - # Create the workers. 
- driver_ip = get_ip() - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("GPU", 0): - # Skip bundles that don't have GPUs, - # as each worker needs one GPU. - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - - worker = ray.remote( - num_cpus=0, - num_gpus=1, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config) - self.workers.append(worker) - - logger.debug("workers: %s", self.workers) - worker_ips = [ - ray.get(worker.get_node_ip.remote()) # type: ignore[attr-defined] - for worker in self.workers - ] - ip_counts: Dict[str, int] = {} - for ip in worker_ips: - ip_counts[ip] = ip_counts.get(ip, 0) + 1 - - worker_to_ip = dict(zip(self.workers, worker_ips)) - - def sort_by_driver_then_worker_ip(worker): - """ - Sort the workers based on 3 properties: - 1. If the worker is on the same node as the driver (vllm engine), - it should be placed first. - 2. Then, if the worker is on a node with fewer workers, it should - be placed first. - 3. Finally, if the work is on a node with smaller IP address, it - should be placed first. This is simply a tiebreaker to make - sure the workers are sorted in a deterministic way. - """ - ip = worker_to_ip[worker] - return (ip != driver_ip, ip_counts[ip], ip) - - # After sorting, the workers on the same node will be - # close to each other, and the workers on the driver - # node will be placed first. - self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids") - - node_workers = defaultdict(list) # node id -> list of worker ranks - node_gpus = defaultdict(list) # node id -> list of gpu ids - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - # `gpu_ids` can be a list of strings or integers. - # convert them to integers for consistency. - # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), - # string sorting is not sufficient. - # see https://github.com/vllm-project/vllm/issues/5590 - gpu_ids = [int(x) for x in gpu_ids] - node_gpus[node_id].extend(gpu_ids) - - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - all_ips = set(worker_ips) - n_ips = len(all_ips) - n_nodes = len(node_workers) - - if n_nodes != n_ips: - raise RuntimeError( - f"Every node should have a unique IP address. Got {n_nodes}" - f" nodes with node ids {list(node_workers.keys())} and " - f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP` or " - "`HOST_IP` environment variable, make sure it is unique for" - " each node.") - - # Set environment variables for the driver and workers. 
- all_args_to_update_environment_variables = [({ - "CUDA_VISIBLE_DEVICES": - ",".join(map(str, node_gpus[node_id])), - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - "VLLM_USE_V1": - str(int(envs.VLLM_USE_V1)), - **({ - "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND - } if envs.VLLM_ATTENTION_BACKEND is not None else {}) - }, ) for (node_id, _) in worker_node_and_gpu_ids] - - self._env_vars_for_all_workers = ( - all_args_to_update_environment_variables) - - self._run_workers("update_environment_variables", - all_args=self._get_env_vars_to_be_updated()) - - if len(node_gpus) == 1: - # in single node case, we don't need to get the IP address. - # the loopback address is sufficient - # NOTE: a node may have several IP addresses, one for each - # network interface. `get_ip()` might return any of them, - # while they might not work for communication inside the node - # if the network setup is complicated. Using the loopback address - # solves this issue, as it always works for communication inside - # the node. - driver_ip = "127.0.0.1" - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), - rank=rank, - distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - self._run_workers("initialize") - self._run_workers("load_model") - - def _configure_ray_workers_use_nsight(self, - ray_remote_kwargs) -> Dict[str, Any]: - # If nsight profiling is enabled, we need to set the profiling - # configuration for the ray workers as runtime env. - runtime_env = ray_remote_kwargs.setdefault("runtime_env", {}) - runtime_env.update({ - "nsight": { - "t": "cuda,cudnn,cublas", - "o": "'worker_process_%p'", - "cuda-graph-trace": "node", - } - }) - - return ray_remote_kwargs - - def _get_env_vars_to_be_updated(self): - return self._env_vars_for_all_workers - - def _get_worker_kwargs( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Dict[str, Any]: - """ - Return worker init args for a given rank. - """ - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_available_memory(self) -> int: - """ - Determine the available GPU memory in bytes. - - This invokes `determine_available_memory` on each worker and takes - the min of the results, guaranteeing that the selected cache sizes are - compatible with all workers. - """ - - memory_sizes = self._run_workers("determine_available_memory") - - # Since we use a shared centralized controller, we take the minimum - # memory size across all workers to make sure all the memory - # operators can be applied to all workers. - return min(memory_sizes) - - def initialize(self, kv_cache_config: KVCacheConfig) -> None: - """ - Initialize the KV cache in all workers. - """ - self._run_workers("initialize_cache", kv_cache_config) - self._run_workers("compile_or_warm_up_model") - - def get_kv_cache_spec(self) -> KVCacheSpec: - """ - Get all kv cache needed by the model - - This invokes `get_kv_cache_spec` on each worker and asserts that - they are identical. 
The KVCacheSpec is then returned. - """ - kv_cache_specs = self._run_workers("get_kv_cache_spec") - assert all(s == kv_cache_specs[0] for s in kv_cache_specs) - return kv_cache_specs[0] - - def _run_workers( - self, - method: str, - *args, - all_args: Optional[List[Tuple[Any, ...]]] = None, - all_kwargs: Optional[List[Dict[str, Any]]] = None, - **kwargs, - ) -> Any: - """ - Runs the given method on all workers. Can be used in the following - ways: - - Args: - - args/kwargs: All workers share the same args/kwargs - - all_args/all_kwargs: args/kwargs for each worker are specified - individually - """ - count = len(self.workers) - all_worker_args = repeat(args, count) if all_args is None \ - else islice(all_args, 0, None) - all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ - else islice(all_kwargs, 0, None) - - ray_worker_refs = [ - worker.execute_method.remote( # type: ignore[attr-defined] - method, *worker_args, **worker_kwargs) - for (worker, worker_args, worker_kwargs - ) in zip(self.workers, all_worker_args, all_worker_kwargs) - ] - return ray.get(ray_worker_refs) - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - if self.forward_dag is None: - self.forward_dag = self._compiled_ray_dag() - # Only the first worker (with rank 0) returns the execution result. - # Others return None. - output = ray.get(self.forward_dag.execute(scheduler_output))[0] - return output - - def profile(self, is_start=True): - raise NotImplementedError - - def shutdown(self): - if hasattr(self, "forward_dag") and self.forward_dag is not None: - self.forward_dag.teardown() - import ray - for worker in self.workers: - ray.kill(worker) - self.forward_dag = None - - def check_health(self) -> None: - logger.debug("Called check_health.") - - def _check_ray_compiled_graph_installation(self): - import pkg_resources - from packaging import version - - required_version = version.parse("2.39") - current_version = version.parse( - pkg_resources.get_distribution("ray").version) - if current_version < required_version: - raise ValueError(f"Ray version {required_version} is " - f"required, but found {current_version}") - - import importlib.util - raycg = importlib.util.find_spec("ray.experimental.compiled_dag_ref") - if raycg is None: - raise ValueError("Ray Compiled Graph is not installed. " - "Run `pip install ray[adag]` to install it.") - - cupy_spec = importlib.util.find_spec("cupy") - if cupy_spec is None and envs.VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL: - raise ValueError( - "cupy is not installed but required since " - "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL is set." 
- "Run `pip install ray[adag]` and check cupy installation.") - - def _compiled_ray_dag(self): - assert self.parallel_config.use_ray - self._check_ray_compiled_graph_installation() - from ray.dag import InputNode, MultiOutputNode - - with InputNode() as input_batches: - outputs = [ - worker.execute_model.bind( # type: ignore[attr-defined] - input_batches) for worker in self.workers - ] - forward_dag = MultiOutputNode(outputs) - - return forward_dag.experimental_compile() - - def __del__(self): - self.shutdown() diff --git a/vllm/v1/executor/ray_utils.py b/vllm/v1/executor/ray_utils.py deleted file mode 100644 index fc9715b7a5909..0000000000000 --- a/vllm/v1/executor/ray_utils.py +++ /dev/null @@ -1,280 +0,0 @@ -import time -from collections import defaultdict -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple - -from vllm.config import ParallelConfig -from vllm.logger import init_logger -from vllm.platforms import current_platform -from vllm.utils import get_ip -from vllm.v1.outputs import ModelRunnerOutput -from vllm.worker.worker_base import WorkerWrapperBase - -if TYPE_CHECKING: - from vllm.v1.core.scheduler import SchedulerOutput - -logger = init_logger(__name__) -PG_WAIT_TIMEOUT = 60 - -try: - import ray - from ray.util import placement_group_table - from ray.util.placement_group import PlacementGroup - try: - from ray._private.state import available_resources_per_node - except ImportError: - # Ray 2.9.x doesn't expose `available_resources_per_node` - from ray._private.state import state as _state - available_resources_per_node = _state._available_resources_per_node - - class RayWorkerWrapper(WorkerWrapperBase): - - def __init__(self, *args, **kwargs) -> None: - super().__init__(*args, **kwargs) - # Since the compiled DAG runs a main execution - # in a different thread that calls cuda.set_device. - # The flag indicates is set_device is called on - # that thread. It will be removed soon. - self.compiled_dag_cuda_device_set = False - - def get_node_ip(self) -> str: - return get_ip() - - def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]: - node_id = ray.get_runtime_context().get_node_id() - device_key = current_platform.ray_device_key - if not device_key: - raise RuntimeError("current platform %s does not support ray.", - current_platform.device_name) - gpu_ids = ray.get_runtime_context().get_accelerator_ids( - )[device_key] - return node_id, gpu_ids - - def setup_device_if_necessary(self): - # TODO(swang): This is needed right now because Ray CG executes - # on a background thread, so we need to reset torch's current - # device. - # We can remove this API after it is fixed in compiled graph. - import torch - assert self.worker is not None, "Worker is not initialized" - if not self.compiled_dag_cuda_device_set: - torch.cuda.set_device(self.worker.device) - self.compiled_dag_cuda_device_set = True - - def execute_model( - self, - scheduler_output: "SchedulerOutput", - ) -> ModelRunnerOutput: - self.setup_device_if_necessary() - assert self.worker is not None, "Worker is not initialized" - output = self.worker.model_runner.execute_model(scheduler_output) - return output - - ray_import_err = None - -except ImportError as e: - ray = None # type: ignore - ray_import_err = e - RayWorkerWrapper = None # type: ignore - - -def ray_is_available() -> bool: - """Returns True if Ray is available.""" - return ray is not None - - -def assert_ray_available(): - """ - Raise an exception if Ray is not available. 
- """ - if ray is None: - raise ValueError("Failed to import Ray, please install Ray with " - "`pip install ray`.") from ray_import_err - - -def _verify_bundles(placement_group: "PlacementGroup", - parallel_config: ParallelConfig, device_str: str): - """ - Verify a given placement group has bundles located in the right place. - - There are 2 rules. - - Warn if all tensor parallel workers cannot fit in a single node. - - Fail if driver node is not included in a placement group. - - Args: - placement_group: The placement group to verify. - parallel_config: The parallel configuration. - device_str: The required device. - """ - assert ray.is_initialized(), ( - "Ray is not initialized although distributed-executor-backend is ray.") - pg_data = placement_group_table(placement_group) - # bundle_idx -> node_id - bundle_to_node_ids = pg_data["bundles_to_node_id"] - # bundle_idx -> bundle (e.g., {"GPU": 1}) - bundles = pg_data["bundles"] - # node_id -> List of bundle (e.g., {"GPU": 1}) - node_id_to_bundle: Dict[str, List[Dict[str, float]]] = defaultdict(list) - - for bundle_idx, node_id in bundle_to_node_ids.items(): - node_id_to_bundle[node_id].append(bundles[bundle_idx]) - driver_node_id = ray.get_runtime_context().get_node_id() - - if driver_node_id not in node_id_to_bundle: - raise RuntimeError( - f"driver node id {driver_node_id} is not included in a placement " - f"group {placement_group.id}. Node id -> bundles " - f"{node_id_to_bundle}. " - "You don't have enough GPUs available in a current node. Check " - "`ray status` to see if you have available GPUs in a node " - f"{driver_node_id} before starting an vLLM engine.") - - for node_id, bundles in node_id_to_bundle.items(): - if len(bundles) < parallel_config.tensor_parallel_size: - logger.warning( - "tensor_parallel_size=%d " - "is bigger than a reserved number of %ss (%d " - "%ss) in a node %s. Tensor parallel workers can be " - "spread out to 2+ nodes which can degrade the performance " - "unless you have fast interconnect across nodes, like " - "Infiniband. To resolve this issue, make sure you have more " - "than %d GPUs available at each node.", - parallel_config.tensor_parallel_size, device_str, len(bundles), - device_str, node_id, parallel_config.tensor_parallel_size) - - -def _wait_until_pg_ready(current_placement_group: "PlacementGroup"): - """Wait until a placement group is ready. - - It prints the informative log messages if the placement group is - not created within time. - - """ - # Wait until PG is ready - this will block until all - # requested resources are available, and will timeout - # if they cannot be provisioned. - placement_group_specs = current_placement_group.bundle_specs - - s = time.time() - pg_ready_ref = current_placement_group.ready() - wait_interval = 10 - while time.time() - s < PG_WAIT_TIMEOUT: - ready, _ = ray.wait([pg_ready_ref], timeout=wait_interval) - if len(ready) > 0: - break - - # Exponential backoff for warning print. - wait_interval *= 2 - logger.info( - "Waiting for creating a placement group of specs for " - "%d seconds. specs=%s. Check " - "`ray status` to see if you have enough resources.", - int(time.time() - s), placement_group_specs) - - try: - ray.get(pg_ready_ref, timeout=0) - except ray.exceptions.GetTimeoutError: - raise ValueError( - "Cannot provide a placement group of " - f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See " - "`ray status` to make sure the cluster has enough resources." 
- ) from None - - -def initialize_ray_cluster( - parallel_config: ParallelConfig, - ray_address: Optional[str] = None, -): - """Initialize the distributed cluster with Ray. - - it will connect to the Ray cluster and create a placement group - for the workers, which includes the specification of the resources - for each distributed worker. - - Args: - parallel_config: The configurations for parallel execution. - ray_address: The address of the Ray cluster. If None, uses - the default Ray cluster address. - """ - assert_ray_available() - - # Connect to a ray cluster. - if current_platform.is_rocm() or current_platform.is_xpu(): - # Try to connect existing ray instance and create a new one if not found - try: - ray.init("auto") - except ConnectionError: - logger.warning( - "No existing RAY instance detected. " - "A new instance will be launched with current node resources.") - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) - else: - ray.init(address=ray_address, ignore_reinit_error=True) - - if parallel_config.placement_group: - # Placement group is already set. - return - - device_str = current_platform.ray_device_key - if not device_str: - raise ValueError( - f"current platform {current_platform.device_name} does not " - "support ray.") - # Create placement group for worker processes - current_placement_group = ray.util.get_current_placement_group() - if current_placement_group: - # We are in a placement group - bundles = current_placement_group.bundle_specs - # Verify that we can use the placement group. - device_bundles = 0 - for bundle in bundles: - bundle_devices = bundle.get(device_str, 0) - if bundle_devices > 1: - raise ValueError( - "Placement group bundle cannot have more than 1 " - f"{device_str}.") - if bundle_devices: - device_bundles += 1 - if parallel_config.world_size > device_bundles: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group." - f"Required number of devices: {parallel_config.world_size}. " - f"Total number of devices: {device_bundles}.") - else: - num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) - if parallel_config.world_size > num_devices_in_cluster: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group.") - # Create a new placement group - placement_group_specs: List[Dict[str, float]] = ([{ - device_str: 1.0 - } for _ in range(parallel_config.world_size)]) - - # vLLM engine is also a worker to execute model with an accelerator, - # so it requires to have the device in a current node. Check if - # the current node has at least one device. - current_ip = get_ip() - current_node_id = ray.get_runtime_context().get_node_id() - current_node_resource = available_resources_per_node()[current_node_id] - if current_node_resource.get(device_str, 0) < 1: - raise ValueError( - f"Current node has no {device_str} available. " - f"{current_node_resource=}. vLLM engine cannot start without " - f"{device_str}. Make sure you have at least 1 {device_str} " - f"available in a node {current_node_id=} {current_ip=}.") - # This way, at least bundle is required to be created in a current - # node. - placement_group_specs[0][f"node:{current_ip}"] = 0.001 - - # By default, Ray packs resources as much as possible. 
- current_placement_group = ray.util.placement_group( - placement_group_specs, strategy="PACK") - _wait_until_pg_ready(current_placement_group) - - assert current_placement_group is not None - _verify_bundles(current_placement_group, parallel_config, device_str) - # Set the placement group in the parallel config - parallel_config.placement_group = current_placement_group diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py deleted file mode 100644 index b3997caac726b..0000000000000 --- a/vllm/v1/executor/uniproc_executor.py +++ /dev/null @@ -1,88 +0,0 @@ -import os -from typing import Optional - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.v1.executor.abstract import Executor -from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec -from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.worker.gpu_worker import Worker - -logger = init_logger(__name__) - - -class UniprocExecutor(Executor): - - def __init__(self, vllm_config: VllmConfig) -> None: - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - self.cache_config = vllm_config.cache_config - self.lora_config = vllm_config.lora_config - self.load_config = vllm_config.load_config - self.parallel_config = vllm_config.parallel_config - self.scheduler_config = vllm_config.scheduler_config - self.device_config = vllm_config.device_config - self.speculative_config = vllm_config.speculative_config - self.prompt_adapter_config = vllm_config.prompt_adapter_config - self.observability_config = vllm_config.observability_config - - self.worker: Worker = self._create_worker() - self.worker.init_device() - self.worker.load_model() - - def _create_worker( - self, - local_rank: int = 0, - rank: int = 0, - distributed_init_method: Optional[str] = None) -> Worker: - """Return worker init args for a given rank.""" - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' - - if distributed_init_method is None: - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - return Worker( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - ) - - def determine_available_memory(self) -> int: - """Determine the available memory (in bytes) for KV cache by invoking - the underlying worker. - """ - return self.worker.determine_available_memory() - - def get_kv_cache_spec(self) -> KVCacheSpec: - """Get all kv cache needed by the model by invoking the underlying - worker. - """ - return self.worker.get_kv_cache_spec() - - def initialize(self, kv_cache_config: KVCacheConfig) -> None: - """Initialize the KV cache by invoking the underlying worker. - """ - self.worker.initialize_cache(kv_cache_config) - self.worker.compile_or_warm_up_model() - - def execute_model( - self, - scheduler_output, - ) -> ModelRunnerOutput: - output = self.worker.execute_model(scheduler_output) - assert output is not None - return output - - def profile(self, is_start: bool = True): - self.worker.profile(is_start) - - def shutdown(self): - pass - - def check_health(self) -> None: - # UniprocExecutor will always be healthy as long as - # it's running. 
- return diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 8feeef17542e6..9bb24d1948651 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -1,8 +1,13 @@ import time from abc import ABC, abstractmethod +from typing import List +import numpy as np +import prometheus_client + +from vllm.config import ModelConfig from vllm.logger import init_logger -from vllm.v1.metrics.stats import SchedulerStats +from vllm.v1.metrics.stats import IterationStats, SchedulerStats logger = init_logger(__name__) @@ -12,27 +17,181 @@ class StatLoggerBase(ABC): @abstractmethod - def log(self, scheduler_stats: SchedulerStats): + def log(self, scheduler_stats: SchedulerStats, + iteration_stats: IterationStats): ... class LoggingStatLogger(StatLoggerBase): def __init__(self): - self.last_log_time = time.monotonic() + self._reset(time.monotonic()) - def log(self, scheduler_stats: SchedulerStats): - """Log Stats to standard output.""" + def _reset(self, now): + self.last_log_time = now + # Tracked stats over current local logging interval. + self.num_prompt_tokens: List[int] = [] + self.num_generation_tokens: List[int] = [] + + def _local_interval_elapsed(self, now: float) -> bool: # Log every _LOCAL_LOGGING_INTERVAL_SEC. + elapsed_time = now - self.last_log_time + return elapsed_time > _LOCAL_LOGGING_INTERVAL_SEC + + def _track_iteration_stats(self, iteration_stats: IterationStats): + # Save tracked stats for token counters. + self.num_prompt_tokens.append(iteration_stats.num_prompt_tokens) + self.num_generation_tokens.append( + iteration_stats.num_generation_tokens) + + def _get_throughput(self, tracked_stats: List[int], now: float) -> float: + # Compute summary metrics for tracked stats + return float(np.sum(tracked_stats) / (now - self.last_log_time)) + + def log(self, scheduler_stats: SchedulerStats, + iteration_stats: IterationStats): + """Log Stats to standard output.""" + + self._track_iteration_stats(iteration_stats) + now = time.monotonic() - if now - self.last_log_time < _LOCAL_LOGGING_INTERVAL_SEC: + if not self._local_interval_elapsed(now): return - self.last_log_time = now + + prompt_throughput = self._get_throughput(self.num_prompt_tokens, now) + generation_throughput = self._get_throughput( + self.num_generation_tokens, now) + + self._reset(now) # Format and print output. 
logger.info( + "Avg prompt throughput: %.1f tokens/s, " + "Avg generation throughput: %.1f tokens/s, " "Running: %d reqs, Waiting: %d reqs ", + prompt_throughput, + generation_throughput, scheduler_stats.num_running_reqs, scheduler_stats.num_waiting_reqs, ) + + +class PrometheusStatLogger(StatLoggerBase): + + def __init__(self, model_config: ModelConfig): + self._unregister_vllm_metrics() + + labelnames = ["model_name"] + labelvalues = [model_config.served_model_name] + + max_model_len = model_config.max_model_len + + self.gauge_scheduler_running = prometheus_client.Gauge( + name="vllm:num_requests_running", + documentation="Number of requests in model execution batches.", + labelnames=labelnames).labels(*labelvalues) + + self.gauge_scheduler_waiting = prometheus_client.Gauge( + name="vllm:num_requests_waiting", + documentation="Number of requests waiting to be processed.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_prompt_tokens = prometheus_client.Counter( + name="vllm:prompt_tokens_total", + documentation="Number of prefill tokens processed.", + labelnames=labelnames).labels(*labelvalues) + + self.counter_generation_tokens = prometheus_client.Counter( + name="vllm:generation_tokens_total", + documentation="Number of generation tokens processed.", + labelnames=labelnames).labels(*labelvalues) + + self.histogram_num_prompt_tokens_request = \ + prometheus_client.Histogram( + name="vllm:request_prompt_tokens", + documentation="Number of prefill tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + + self.histogram_num_generation_tokens_request = \ + prometheus_client.Histogram( + name="vllm:request_generation_tokens", + documentation="Number of generation tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames).labels(*labelvalues) + + self.histogram_time_to_first_token = \ + prometheus_client.Histogram( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0 + ], + labelnames=labelnames).labels(*labelvalues) + + self.histogram_time_per_output_token = \ + prometheus_client.Histogram( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, + 0.75, 1.0, 2.5 + ], + labelnames=labelnames).labels(*labelvalues) + + def log(self, scheduler_stats: SchedulerStats, + iteration_stats: IterationStats): + """Log to prometheus.""" + self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + + self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) + self.counter_generation_tokens.inc( + iteration_stats.num_generation_tokens) + + for finished_request in iteration_stats.finished_requests: + self.histogram_num_prompt_tokens_request.observe( + finished_request.num_prompt_tokens) + self.histogram_num_generation_tokens_request.observe( + finished_request.num_generation_tokens) + + for ttft in iteration_stats.time_to_first_tokens_iter: + self.histogram_time_to_first_token.observe(ttft) + for tpot in iteration_stats.time_per_output_tokens_iter: + self.histogram_time_per_output_token.observe(tpot) + + @staticmethod + def _unregister_vllm_metrics(): + # Unregister any existing vLLM collectors (for CI/CD + for collector in 
list(prometheus_client.REGISTRY._collector_to_names): + if hasattr(collector, "_name") and "vllm" in collector._name: + prometheus_client.REGISTRY.unregister(collector) + + +def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: + """ + Builds a list of buckets with increasing powers of 10 multiplied by + mantissa values until the value exceeds the specified maximum. + + """ + exponent = 0 + buckets: List[int] = [] + while True: + for m in mantissa_lst: + value = m * 10**exponent + if value <= max_value: + buckets.append(value) + else: + return buckets + exponent += 1 + + +def build_1_2_5_buckets(max_value: int) -> List[int]: + """ + Example: + >>> build_1_2_5_buckets(100) + [1, 2, 5, 10, 20, 50, 100] + """ + return build_buckets([1, 2, 5], max_value) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 60cb986f8bbce..f4c276f0b6902 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -1,7 +1,9 @@ +import time from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, List if TYPE_CHECKING: + from vllm.outputs import RequestOutput from vllm.v1.engine import EngineCoreOutput @@ -16,6 +18,22 @@ class SchedulerStats: # gpu_prefix_cache_hit_rate: float = 0.0 +@dataclass +class RequestStateStats: + """Stats that need to be tracked across delta updates.""" + + num_generation_tokens: int = 0 + last_token_time: float = 0.0 + + +@dataclass +class FinishedRequestStats: + """Stats associated with a finished request.""" + + num_prompt_tokens: int = 0 + num_generation_tokens: int = 0 + + class IterationStats: """Stats associated with a single set of EngineCoreOutputs.""" @@ -23,17 +41,38 @@ def __init__(self, log_stats: bool): self.log_stats = log_stats self.num_generation_tokens = 0 self.num_prompt_tokens = 0 + self.finished_requests: List[FinishedRequestStats] = [] + self.time_to_first_tokens_iter: List[float] = [] + self.time_per_output_tokens_iter: List[float] = [] def update_from_output(self, output: "EngineCoreOutput", - is_prefilling: bool, prompt_len: int): + is_prefilling: bool, prompt_len: int, + request_state_stats: RequestStateStats): if not self.log_stats: return - self.num_generation_tokens += len(output.new_token_ids) + num_new_generation_tokens = len(output.new_token_ids) + now = time.time() + last_token_latency = now - request_state_stats.last_token_time + + self.num_generation_tokens += num_new_generation_tokens if is_prefilling: # This relies on the invariant that EngineCore does # not stream outputs for partially completed prefills # (scheduler.update_from_output makes EngineCoreOutput # iff num_computed_tokens == num_tokens). 
- assert (len(output.new_token_ids) > 0) + assert (num_new_generation_tokens > 0) self.num_prompt_tokens += prompt_len + + self.time_to_first_tokens_iter.append(last_token_latency) + else: + self.time_per_output_tokens_iter.append(last_token_latency) + + request_state_stats.num_generation_tokens += num_new_generation_tokens + request_state_stats.last_token_time = now + + def update_from_finished_request(self, request_output: "RequestOutput", + request_state_stats: RequestStateStats): + self.finished_requests.append( + FinishedRequestStats(len(request_output.prompt_token_ids), + request_state_stats.num_generation_tokens)) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index acc3a944e21b9..32aee44e3f374 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -8,7 +8,7 @@ class SamplerOutput: # [num_reqs] - sampled_token_ids: List[int] + sampled_token_ids: torch.Tensor # [num_reqs, max_num_logprobs + 1] logprob_token_ids: Optional[torch.Tensor] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 45450165eaefe..2cfcd8b63ccb2 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -58,12 +58,19 @@ def __init__( # Sanity check assert len(self.mm_inputs) == len(self.mm_positions) - assert len(self.mm_inputs) == len(self.mm_hashes) + if self.mm_hashes: + assert len(self.mm_inputs) == len(self.mm_hashes) # Cache the computed kv block hashes of the request to avoid # recomputing. self._kv_block_hashes: List[BlockHashType] = [] + # Read-only views + # Prevent directly appending to the these lists since + # they should also be updated simultaneously. + self.output_token_ids = ConstantList(self._output_token_ids) + self.all_token_ids = ConstantList(self._all_token_ids) + @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": return cls( @@ -79,18 +86,6 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": lora_request=request.lora_request, ) - @property - def output_token_ids(self) -> ConstantList[int]: - # Prevent directly appending to the output_token_ids since - # all_token_ids should also be updated simultaneously. - return ConstantList(self._output_token_ids) - - @property - def all_token_ids(self) -> ConstantList[int]: - # Prevent directly appending to the all_token_ids since - # output_token_ids should also be updated simultaneously - return ConstantList(self._all_token_ids) - def append_output_token_ids( self, token_ids: Union[int, List[int]], diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 7cd42ca211a22..9ad665a64894c 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -50,9 +50,8 @@ def forward( # Use int32 to reduce the tensor size. sampled = sampled.to(torch.int32) - # NOTE: CPU-GPU synchronization happens here. 
sampler_output = SamplerOutput( - sampled_token_ids=sampled.tolist(), + sampled_token_ids=sampled, logprob_token_ids=topk_indices, logprobs=topk_logprobs, prompt_logprob_token_ids=None, diff --git a/vllm/v1/stats/__init__.py b/vllm/v1/stats/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py new file mode 100644 index 0000000000000..902800e0573bf --- /dev/null +++ b/vllm/v1/stats/common.py @@ -0,0 +1,451 @@ +import time +from dataclasses import dataclass +from dataclasses import field as dataclass_field +from enum import IntEnum +from typing import ClassVar, Dict, List, Optional, Set + +import msgspec +from msgspec import field as msgspec_field + +from vllm.sampling_params import SamplingParams + + +class RequestStatsUpdate( + msgspec.Struct, # type: ignore + array_like=True, + omit_defaults=True, + gc=False): + """ + An update to the request stats. + + This represents a stats update at a specific timestamp with metadata + associated with the update. + + NOTE: since there might be multiple processes generating updates at + different parts of the engine (e.g. input processor, scheduler, engine core, + etc.), we use the monotonic timestamp to record the update to compute any + intervals, and explicit wall-clock timestamp should be used for timestamps. + + WARNING: This assumes stats are generated in a single machine. If there are + potentially multiple machines, one should always generate the stats updates + on one single machine or use something else. + """ + + class Type(IntEnum): + """See `RequestStats` for the lifecycle of a request.""" + + # Request arrived at the engine frontend. + ARRIVED = 0 + # Input processed by the input processor. + INPUT_PROCESSED = 1 + # Queued on the engine core. + QUEUED = 2 + # Scheduled running prefill by the scheduler. + # A request could be running a new prefill on the prompt tokens or + # a resumed prefill on the original prefill tokens + generated output + # tokens before preemption. + PREFILLING = 3 + # Preempted by the scheduler. + PREEMPTED = 4 + # Output token is generated by the engine core. + DECODING = 5 + # Token detokenized by the detokenizer. + # We will record the timestamp for each output token, as well as the + # finish reason. + DETOKENIZED = 6 + # Request finishes (or aborts). + FINISHED = 7 + + """ + Valid state updates: + ARRIVED + │ + ├──────► INPUT_PROCESSED ──────► QUEUED ──────► PREFILLING ◄────┐ + │ │ │ │ │ + │ │ │ ▼ │ + │ │ │ -──► DECODING │ + │ │ │ | │ │ + │ │ │ | ▼ │ + │ │ │ └─ DETOKENIZED │ + │ │ │ │ │ + │ │ │ ▼ │ + │ ▼ ▼ PREEMPTED ◄──────┘ + │ │ │ │ + └──────────────┴───────────────────┴──────────────┴ + │ + ▼ + FINISHED (All could go to FINISHED) + """ + _VALID_TRANSITIONS: ClassVar[Dict[Type, Set[Type]]] = { + Type.ARRIVED: { + Type.INPUT_PROCESSED, + Type.FINISHED, + }, + Type.INPUT_PROCESSED: { + Type.QUEUED, + Type.FINISHED, + }, + Type.QUEUED: { + Type.PREFILLING, + Type.FINISHED, + }, + Type.PREFILLING: { + Type.DECODING, + Type.PREEMPTED, + Type.FINISHED, + }, + Type.DECODING: { + Type.DETOKENIZED, + Type.FINISHED, + }, + Type.DETOKENIZED: { + Type.DECODING, + Type.PREEMPTED, + Type.FINISHED, + }, + Type.PREEMPTED: {Type.PREFILLING, Type.FINISHED}, + Type.FINISHED: set(), + } + + request_id: str + + type: Type + + # Timestamp when the update is recorded. This is used to record time + # intervals between events rather than wall clock time. 
+ monotonic_ts_s: float = msgspec_field( + default_factory=lambda: time.monotonic()) + + ############################################################ + # Metadata associated with the update. + ############################################################ + # For input_processed. Metadata needed for stats logging. + num_prompt_tokens: Optional[int] = None + sampling_params: Optional[SamplingParams] = None + + # For running. + # Number of tokens computed when scheduled to run. + num_computed_tokens: Optional[int] = None + # Number of cached tokens when scheduled to run. + num_cached_tokens: Optional[int] = None + + # For decoded. + # The number of new output tokens generated. + num_new_tokens: Optional[int] = None + + # For both detokenized and decoded. + # Finished reason. + finish_reason: Optional[str] = None + + # Non-optional fields for each update type. + _REQUIRED_FIELDS: ClassVar[Dict[Type, List[str]]] = { + Type.INPUT_PROCESSED: ["num_prompt_tokens", "sampling_params"], + Type.PREFILLING: ["num_computed_tokens", "num_cached_tokens"], + Type.DETOKENIZED: ["num_new_tokens"], + Type.FINISHED: ["finish_reason"], + } + + def __post_init__(self): + required_fields = self._REQUIRED_FIELDS.get(self.type, []) + for field in required_fields: + if getattr(self, field) is None: + raise ValueError( + f"Field {field} is required for update type {self.type}.") + + @staticmethod + def check_valid_update( + update: "RequestStatsUpdate", + last_update_type: Optional[Type], + last_updated_ts_s: Optional[float], + ): + if last_update_type is None: + assert update.type == RequestStatsUpdate.Type.ARRIVED + else: + valid_cur_update_types = RequestStatsUpdate._VALID_TRANSITIONS[ + last_update_type] + assert update.type in valid_cur_update_types, ( + f"Invalid update type: {update.type} for last_update_type: " + f"{last_update_type}.") + + if last_updated_ts_s is not None: + assert update.monotonic_ts_s >= last_updated_ts_s, ( + "Update timestamp must be monotonically increasing, but " + f"last_updated_ts_s={last_updated_ts_s} and " + f"update.monotonic_ts_s={update.monotonic_ts_s}.") + + +@dataclass +class RequestStats: + """Stats associated with a request (`Request`).""" + + ############################################################ + # Metadata + ############################################################ + request_id: str + sampling_params: Optional[SamplingParams] = None + num_prompt_tokens: Optional[int] = None + + ############################################################ + # Metrics and Stats + ############################################################ + # Timestamp when the request was last updated. + last_updated_ts_s: Optional[float] = None + + # Last update stats type. + last_update_type: Optional[RequestStatsUpdate.Type] = None + + # Timestamp when the request arrived at the llm engine. + arrival_ts_s: Optional[float] = None + + # Number of tokens cached. When part of the request prefix is cached, + # this will be set. + num_cached_tokens: int = 0 + + # Number of tokens computed. + num_computed_tokens: int = 0 + + # The timestamp when the request become waiting in the queue. + queued_ts_s: Optional[float] = None + + # When the input processor is completed. + input_processor_end_ts_s: Optional[float] = None + + # A sorted list of timestamps when the request was scheduled to prefill. + # This could be when: + # 1. the request is newly scheduled, so it's a new prefill. + # 2. the request was preempted and resumed. 
It is equivalent to running + # a prefill of the original prefill tokens + generated output tokens + # before preemption. + prefill_start_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # A list of timestamps when a token is decoded by the engine core. + decoding_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # A sorted list of timestamps for each output token. + output_token_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # First token's timestamp. + first_token_ts_s: Optional[float] = None + + # TODO(rickyx): we need model runner to surface these. + model_forward_duration_s: float = 0.0 + # Includes model forward, block/sync across workers, cpu-gpu sync time + # and sampling time. + model_execute_duration_s: float = 0.0 + + # A sorted list of timestamps when the request was preempted at the + # scheduler. + # TODO(rickyx): right now, we don't actually have a good high-level + # metric to measure the impact of preemption other than observation of + # large P99 TPOT. Ideally we could quantify the impact of preemption by + # measuring the number of tokens re-computed due to preemption. + preempted_ts_s_lst: List[float] = dataclass_field(default_factory=list) + + # Timestamp when the request was finished at the engine core. + finished_ts_s: Optional[float] = None + + # Finish reason. + finish_reason: Optional[str] = None + + ############################################################ + # Derived properties. + ############################################################ + @property + def prefill_ts_s(self) -> Optional[float]: + """The timestamp when the request started prefilling. + Since a request could be preempted in decoding and later resumed + to prefill the decoded tokens, we use the first prefill start timestamp. + """ + return (self.prefill_start_ts_s_lst[0] + if self.prefill_start_ts_s_lst else None) + + @property + def e2e_latency_s(self) -> Optional[float]: + if self.finished_ts_s is None or self.arrival_ts_s is None: + return None + assert self.finished_ts_s >= self.arrival_ts_s + return self.finished_ts_s - self.arrival_ts_s + + @property + def queue_duration_s(self) -> Optional[float]: + """How long the request was waiting to run.""" + if self.queued_ts_s is None or self.prefill_ts_s is None: + # Either not queued or not running yet. 
+ return None + assert self.queued_ts_s <= self.prefill_ts_s + return self.prefill_ts_s - self.queued_ts_s + + @property + def inference_latency_s(self) -> Optional[float]: + """How long the request was running inference + (prefill and decode).""" + if self.finished_ts_s is None or self.prefill_ts_s is None: + return None + assert self.finished_ts_s >= self.prefill_ts_s + return self.finished_ts_s - self.prefill_ts_s + + @property + def first_token_latency_s(self) -> Optional[float]: + if self.first_token_ts_s is None or self.arrival_ts_s is None: + return None + assert self.first_token_ts_s >= self.arrival_ts_s + return self.first_token_ts_s - self.arrival_ts_s + + @property + def prefill_latency_s(self) -> Optional[float]: + if self.first_token_ts_s is None or self.prefill_ts_s is None: + return None + assert self.first_token_ts_s >= self.prefill_ts_s + return self.first_token_ts_s - self.prefill_ts_s + + @property + def decode_latency_s(self) -> Optional[float]: + if self.e2e_latency_s is None or self.first_token_latency_s is None: + return None + assert self.e2e_latency_s >= self.first_token_latency_s + return self.e2e_latency_s - self.first_token_latency_s + + @property + def output_token_latency_s_lst(self) -> List[float]: + if len(self.output_token_ts_s_lst) == 0: + return [] + latency_s_lst = [] + for i in range(1, len(self.output_token_ts_s_lst)): + assert (self.output_token_ts_s_lst[i] + >= self.output_token_ts_s_lst[i - 1]) + latency_s = (self.output_token_ts_s_lst[i] - + self.output_token_ts_s_lst[i - 1]) + latency_s_lst.append(latency_s) + return latency_s_lst + + @property + def num_output_tokens(self) -> int: + return len(self.output_token_ts_s_lst) + + @property + def is_finished(self) -> bool: + return self.finished_ts_s is not None + + def update_from(self, update: "RequestStatsUpdate"): + RequestStatsUpdate.check_valid_update(update, self.last_update_type, + self.last_updated_ts_s) + ts = update.monotonic_ts_s + self.last_updated_ts_s = ts + self.last_update_type = update.type + if update.type == RequestStatsUpdate.Type.ARRIVED: + self.arrival_ts_s = ts + elif update.type == RequestStatsUpdate.Type.INPUT_PROCESSED: + self.input_processor_end_ts_s = ts + self.sampling_params = update.sampling_params + self.num_prompt_tokens = update.num_prompt_tokens + elif update.type == RequestStatsUpdate.Type.QUEUED: + self.queued_ts_s = ts + elif update.type == RequestStatsUpdate.Type.PREFILLING: + self.prefill_start_ts_s_lst.append(ts) + self.num_cached_tokens = update.num_cached_tokens or 0 + self.num_computed_tokens = update.num_computed_tokens or 0 + elif update.type == RequestStatsUpdate.Type.PREEMPTED: + self._reset_for_preemption(ts) + elif update.type == RequestStatsUpdate.Type.DECODING: + self.decoding_ts_s_lst.append(ts) + elif update.type == RequestStatsUpdate.Type.DETOKENIZED: + self._record_detokenized_output( + ts, + update.num_new_tokens or 0, + ) + elif update.type == RequestStatsUpdate.Type.FINISHED: + self.finished_ts_s = ts + self.finish_reason = update.finish_reason + else: + raise ValueError(f"Unknown update type: {update.type}") + + def _record_detokenized_output( + self, + ts_s: float, + num_new_tokens: int, + ): + # Update if first output token is generated. + if len(self.output_token_ts_s_lst) == 0: + self.first_token_ts_s = ts_s + assert ( + self.prefill_ts_s is not None + ), "Request must be running before generating output tokens." + + # Some X new tokens were generated at the ts. 
+ self.output_token_ts_s_lst.extend([ts_s] * num_new_tokens) + + def _reset_for_preemption(self, ts_s: float): + self.preempted_ts_s_lst.append(ts_s) + # Reset the computed tokens since it might restart the prefill. + self.num_computed_tokens = 0 + # Cached token count might also change when resumed. + self.num_cached_tokens = 0 + # These stats don't change since they happen before request running. + # - arrival_ts_s + # - input_processor_end_ts_s + # - sampling_params + # - num_prompt_tokens + # - first_token_ts_s + # + # These stats are accumulated over preemptions: + # - output_token_ts_s_lst + # - prefill_start_ts_s_lst (after preemption, it will prefill the + # original prefill tokens and any output tokens generated before + # preemption.) + + +@dataclass +class KVCacheStats: + # KV Cache Usage in % + gpu_cache_usage_sys: float = 0.0 + gpu_prefix_cache_hit_rate: float = 0.0 + + +@dataclass +class SchedulerStats: + """Stats associated with the scheduler.""" + + # Number of requests currently running. + num_running_reqs: int = 0 + # Number of requests currently waiting. + num_waiting_reqs: int = 0 + + kv_cache_stats: KVCacheStats = dataclass_field( + default_factory=KVCacheStats) + + +@dataclass +class EngineCoreProcessStats: + """Stats associated with the engine core process.""" + + # Number of requests currently in the input queue. None if the engine core + # is not running in multiprocess mode. + input_queue_size: Optional[int] = None + # Number of outputs currently in the output queue. None if the engine core + # is not running in multiprocess mode. + output_queue_size: Optional[int] = None + + +class EngineCoreStatsSnapshot( + msgspec.Struct, # type: ignore + array_like=True, + omit_defaults=True, + gc=False): + """ + A snapshot of the EngineCore's current stats over a period of time. + """ + + # Snapshot of the scheduler stats. + scheduler_stats: SchedulerStats = msgspec_field( + default_factory=SchedulerStats) + + # Per request stats updates. + requests_stats_updates: List[RequestStatsUpdate] = msgspec_field( + default_factory=list) + + # Engine core's queue stats. + engine_core_process_stats: EngineCoreProcessStats = msgspec_field( + default_factory=EngineCoreProcessStats) + + # TODO(rickyx): Add other components' stats, + # e.g. model runner/worker and etc. 
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 40494e64b22f0..28d8e39053874 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -30,6 +30,9 @@ class CachedRequestState: num_computed_tokens: int output_token_ids: List[int] + mrope_positions: Optional[torch.Tensor] = None + mrope_position_delta: Optional[int] = None + @property def num_tokens(self) -> int: return len(self.prompt_token_ids) + len(self.output_token_ids) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index aa63d9414c296..a00c00c307335 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,6 @@ import gc import time -from typing import TYPE_CHECKING, Dict, List, Tuple, cast +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, cast import numpy as np import torch @@ -14,8 +14,10 @@ from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.model_executor.model_loader import get_model from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.sampling_params import SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, LayerBlockType, cdiv, is_pin_memory_available) @@ -126,7 +128,8 @@ def __init__( # self.cudagraph_batch_sizes sorts in ascending order. # The batch sizes in the config are in descending order. self.cudagraph_batch_sizes = list( - reversed(self.vllm_config.compilation_config.capture_sizes)) + reversed( + self.vllm_config.compilation_config.cudagraph_capture_sizes)) # Cache the device properties. self.device_properties = torch.cuda.get_device_properties(self.device) @@ -139,6 +142,28 @@ def __init__( self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + # NOTE: `mrope_positions` is implemented with one additional dummy + # position on purpose to make it non-contiguous so that it can work + # with torch compile. + # See detailed explanation in https://github.com/vllm-project/vllm/pull/12128#discussion_r1926431923 + + # NOTE: When M-RoPE is enabled, position ids are 3D regardless of + # the modality of inputs. For text-only inputs, each dimension has + # identical position IDs, making M-RoPE functionally equivalent to + # 1D-RoPE. + # See page 5 of https://arxiv.org/abs/2409.12191 + self.mrope_positions = torch.zeros((3, self.max_num_tokens + 1), + dtype=torch.int64, + device=self.device) + self.mrope_positions_cpu = torch.zeros( + (3, self.max_num_tokens + 1), + dtype=torch.int64, + device="cpu", + pin_memory=self.pin_memory) + self.inputs_embeds = torch.zeros( (self.max_num_tokens, self.hidden_size), dtype=self.dtype, @@ -146,7 +171,8 @@ def __init__( # OPTIMIZATION: Cache the tensors rather than creating them every step. self.arange_np = np.arange(max(self.max_num_reqs + 1, - self.max_model_len), + self.max_model_len, + self.max_num_tokens), dtype=np.int32) # NOTE(woosuk): These tensors are "stateless", i.e., they are literally # a faster version of creating a new tensor every time. 
Thus, we should @@ -171,15 +197,15 @@ def __init__( device="cpu", pin_memory=self.pin_memory) self.query_start_loc_np = self.query_start_loc_cpu.numpy() - self.seq_start_loc_cpu = torch.zeros(self.max_num_reqs + 1, - dtype=torch.int32, - device="cpu", - pin_memory=self.pin_memory) - self.seq_start_loc_np = self.seq_start_loc_cpu.numpy() + self.seq_lens_cpu = torch.zeros(self.max_num_reqs, + dtype=torch.int32, + device="cpu", + pin_memory=self.pin_memory) + self.seq_lens_np = self.seq_lens_cpu.numpy() def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. - # Keep the states of the pre-empted requests. + # Keep the states of the preempted requests. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) self.encoder_cache.pop(req_id, None) @@ -246,6 +272,35 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: num_computed_tokens=new_req_data.num_computed_tokens, output_token_ids=[], ) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config. + spatial_merge_size, + ) + req_ids_to_add.append(req_id) # Update the cached states of the resumed requests. @@ -304,8 +359,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Get batched arange. # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - arange = np.concatenate( - [self.arange_np[:n] for n in num_scheduled_tokens]) + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_scheduled_tokens) + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, + num_scheduled_tokens) + # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets # Get positions. positions_np = self.positions_np[:total_num_scheduled_tokens] @@ -313,6 +375,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): arange, out=positions_np) + # Calculate M-RoPE positions. + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.model_config.uses_mrope: + self._calc_mrope_positions(scheduler_output) + # Get token indices. # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] @@ -347,24 +414,30 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # Prepare the attention metadata. 
self.query_start_loc_np[0] = 0 - np.cumsum(num_scheduled_tokens, - out=self.query_start_loc_np[1:num_reqs + 1]) + self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens - seq_lens = (self.input_batch.num_computed_tokens_cpu[:num_reqs] + - num_scheduled_tokens) - max_seq_len = seq_lens.max() - self.seq_start_loc_np[0] = 0 - np.cumsum(seq_lens, out=self.seq_start_loc_np[1:num_reqs + 1]) + self.seq_lens_np[:num_reqs] = ( + self.input_batch.num_computed_tokens_cpu[:num_reqs] + + num_scheduled_tokens) + max_seq_len = self.seq_lens_np[:num_reqs].max() # Copy the tensors to the GPU. self.input_ids[:total_num_scheduled_tokens].copy_( self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True) - self.positions[:total_num_scheduled_tokens].copy_( - self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True) + if self.model_config.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + else: + # Common case (1D positions) + self.positions[:total_num_scheduled_tokens].copy_( + self.positions_cpu[:total_num_scheduled_tokens], + non_blocking=True) query_start_loc = self.query_start_loc_cpu[:num_reqs + 1].to( self.device, non_blocking=True) - seq_start_loc = self.seq_start_loc_cpu[:num_reqs + 1].to( - self.device, non_blocking=True) + seq_lens = self.seq_lens_cpu[:num_reqs].to(self.device, + non_blocking=True) slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to( self.device, non_blocking=True).long() @@ -436,33 +509,30 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): [0, total_num_scheduled_tokens], dtype=torch.int32, device=self.device) - cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], - dtype=torch.int32, - device=self.device) - cu_suffix_kv_lens = ( - self.seq_start_loc_np[:num_reqs + 1] - - self.arange_np[:num_reqs + 1] * common_prefix_len) - cu_suffix_kv_lens = torch.from_numpy(cu_suffix_kv_lens).to( - self.device) + prefix_kv_lens = torch.tensor([common_prefix_len], + dtype=torch.int32, + device=self.device) + suffix_kv_lens = (self.seq_lens_np[:num_reqs] - common_prefix_len) + suffix_kv_lens = torch.from_numpy(suffix_kv_lens).to(self.device) else: cu_prefix_query_lens = None - cu_prefix_kv_lens = None - cu_suffix_kv_lens = None + prefix_kv_lens = None + suffix_kv_lens = None attn_metadata = FlashAttentionMetadata( num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, query_start_loc=query_start_loc, max_seq_len=max_seq_len, - seq_start_loc=seq_start_loc, + seq_lens=seq_lens, block_table=( self.input_batch.block_table.get_device_tensor()[:num_reqs]), slot_mapping=slot_mapping, use_cascade=use_cascade, common_prefix_len=common_prefix_len, cu_prefix_query_lens=cu_prefix_query_lens, - cu_prefix_kv_lens=cu_prefix_kv_lens, - cu_suffix_kv_lens=cu_suffix_kv_lens, + prefix_kv_lens=prefix_kv_lens, + suffix_kv_lens=suffix_kv_lens, ) # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial # request in the batch. 
While we should not sample any token from this @@ -472,6 +542,61 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): logits_indices = query_start_loc[1:] - 1 return attn_metadata, logits_indices + def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"): + mrope_pos_ptr = 0 + num_reqs = self.input_batch.num_reqs + for index, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): + assert req_id is not None + + req = self.requests[req_id] + assert req.mrope_positions is not None + + num_computed_tokens = \ + self.input_batch.num_computed_tokens_cpu[index] + num_scheduled_tokens = \ + scheduler_output.num_scheduled_tokens[req_id] + num_prompt_tokens = len(req.prompt_token_ids) + + if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens: + prompt_part_len = max(0, + num_prompt_tokens - num_computed_tokens) + completion_part_len = max( + 0, num_scheduled_tokens - prompt_part_len) + else: + prompt_part_len = num_scheduled_tokens + completion_part_len = 0 + + assert num_scheduled_tokens == prompt_part_len + completion_part_len + + if prompt_part_len > 0: + # prompt's mrope_positions are pre-computed + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + prompt_part_len + src_start = num_computed_tokens + src_end = num_computed_tokens + prompt_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + req.mrope_positions[:,src_start:src_end] + + mrope_pos_ptr += prompt_part_len + + if completion_part_len > 0: + # compute completion's mrope_positions on-the-fly + dst_start = mrope_pos_ptr + dst_end = mrope_pos_ptr + completion_part_len + + self.mrope_positions_cpu[:, dst_start:dst_end] = \ + MRotaryEmbedding.get_next_input_positions_tensor( + req.mrope_position_delta, + context_len=num_computed_tokens + + prompt_part_len, + seq_len=num_computed_tokens + + prompt_part_len + + completion_part_len, + ) + + mrope_pos_ptr += completion_part_len + def _prepare_sampling( self, scheduler_output: "SchedulerOutput", @@ -505,19 +630,34 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): for input_id in encoder_input_ids: mm_inputs.append(req_state.mm_inputs[input_id]) req_input_ids.append((req_id, input_id)) - batched_mm_inputs = MultiModalKwargs.batch(mm_inputs) - batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, - device=self.device) - - # Run the encoder. - # `encoder_outputs` is either of the following: - # 1. A tensor of shape [num_images, feature_size, hidden_size] - # in case when feature_size is fixed across all images. - # 2. A list (length: num_images) of tensors, each of shape - # [feature_size, hidden_size] in case when the feature size is - # dynamic depending on input images. - encoder_outputs = self.model.get_multimodal_embeddings( - **batched_mm_inputs) + + # Batch mm inputs as much as we can: if a request in the batch has + # multiple modalities or a different modality than the previous one, + # we process it separately to preserve item order. + # FIXME(ywang96): This is a hacky way to deal with multiple modalities + # in the same batch while still being able to benefit from batching + # multimodal inputs. The proper solution should be reordering the + # encoder outputs. + grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs) + + encoder_outputs = [] + for grouped_mm_inputs in grouped_mm_inputs_list: + batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. 
+ # `curr_group_outputs` is either of the following: + # 1. A tensor of shape (num_items, feature_size, hidden_size) + # in case feature_size is fixed across all multimodal items. + # 2. A list or tuple (length: num_items) of tensors, each of shape + # (feature_size, hidden_size) in case the feature size is dynamic + # depending on the input multimodal items. + curr_group_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) + + for output in curr_group_outputs: + encoder_outputs.append(output) # Cache the encoder outputs. for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): @@ -565,6 +705,9 @@ def _gather_encoder_outputs( encoder_outputs.append(encoder_output[start_idx:end_idx]) return encoder_outputs + def get_model(self) -> nn.Module: + return self.model + @torch.inference_mode() def execute_model( self, @@ -618,9 +761,12 @@ def execute_model( # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): + positions = self.mrope_positions[:, :num_input_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_input_tokens] hidden_states = self.model( input_ids=input_ids, - positions=self.positions[:num_input_tokens], + positions=positions, kv_caches=self.kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, @@ -636,10 +782,10 @@ def execute_model( sampling_metadata=sampling_metadata, ) - sampled_token_ids = sampler_output.sampled_token_ids # TODO(woosuk): The following loop can be slow since it iterates over # the requests one by one. Optimize. num_reqs = self.input_batch.num_reqs + request_seq_lens: List[Tuple[int, CachedRequestState, int]] = [] for i, req_id in enumerate(self.input_batch.req_ids[:num_reqs]): assert req_id is not None req_state = self.requests[req_id] @@ -648,10 +794,10 @@ def execute_model( assert seq_len <= req_state.num_tokens if seq_len == req_state.num_tokens: # Append the sampled token to the output token ids. - token_id = sampled_token_ids[i] - self.input_batch.token_ids_cpu[i, seq_len] = token_id self.input_batch.num_tokens[i] += 1 - req_state.output_token_ids.append(token_id) + # OPTIMIZATION: Priming the state updates for later updates. + req_state.output_token_ids.append(0) + request_seq_lens.append((i, req_state, seq_len)) else: # Ignore the sampled token from the partial request. # Rewind the generator state as if the token was not sampled. @@ -660,6 +806,21 @@ def execute_model( # This relies on cuda-specific torch-internal impl details generator.set_offset(generator.get_offset() - 4) + # num_reqs entries should be non-None + assert all( + req_id is not None for req_id in + self.input_batch.req_ids[:num_reqs]), "req_ids contains None" + req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) + + # NOTE: GPU -> CPU Sync happens here. + # Move as many CPU operations as possible before this sync point. 
+ sampled_token_ids = sampler_output.sampled_token_ids.tolist() + # Update with the actual token ids + for i, req_state, seq_len in request_seq_lens: + token_id = sampled_token_ids[i] + self.input_batch.token_ids_cpu[i, seq_len] = token_id + req_state.output_token_ids[-1] = token_id + if sampler_output.logprob_token_ids is None: logprob_token_ids = None else: @@ -669,12 +830,6 @@ def execute_model( else: logprobs = sampler_output.logprobs.cpu() - # num_reqs entries should be non-None - assert all( - req_id is not None for req_id in - self.input_batch.req_ids[:num_reqs]), "req_ids contains None" - req_ids = cast(List[str], self.input_batch.req_ids[:num_reqs]) - model_runner_output = ModelRunnerOutput( req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, @@ -696,10 +851,12 @@ def load_model(self) -> None: @torch.inference_mode() def _dummy_run( self, - model: nn.Module, num_tokens: int, - kv_caches: List[torch.Tensor], + kv_caches: Optional[List[torch.Tensor]] = None, ) -> torch.Tensor: + model = self.model + if kv_caches is None: + kv_caches = self.kv_caches if self.is_multimodal_model: input_ids = None inputs_embeds = self.inputs_embeds[:num_tokens] @@ -707,9 +864,12 @@ def _dummy_run( input_ids = self.input_ids[:num_tokens] inputs_embeds = None with set_forward_context(None, self.vllm_config): + positions = self.mrope_positions[:, :num_tokens] \ + if self.model_config.uses_mrope \ + else self.positions[:num_tokens] hidden_states = model( input_ids=input_ids, - positions=self.positions[:num_tokens], + positions=positions, kv_caches=kv_caches, attn_metadata=None, inputs_embeds=inputs_embeds, @@ -822,8 +982,7 @@ def profile_run(self) -> None: self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) # Trigger compilation for general shape. - hidden_states = self._dummy_run(self.model, self.max_num_tokens, - dummy_kv_caches) + hidden_states = self._dummy_run(self.max_num_tokens, dummy_kv_caches) logits = self.model.compute_logits(hidden_states, None) logits = logits[:self.max_num_tokens] # TODO(woosuk): Consider the memory usage of the sampler. @@ -849,8 +1008,8 @@ def capture_model(self) -> None: for num_tokens in reversed(self.cudagraph_batch_sizes): for _ in range(self.vllm_config.compilation_config. 
cudagraph_num_of_warmups): - self._dummy_run(self.model, num_tokens, self.kv_caches) - self._dummy_run(self.model, num_tokens, self.kv_caches) + self._dummy_run(num_tokens) + self._dummy_run(num_tokens) end_time = time.perf_counter() end_free_gpu_memory = torch.cuda.mem_get_info()[0] diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 4fb4197f1822f..a8cf0aec3f17b 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -5,16 +5,18 @@ import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs -from vllm.config import CacheConfig, ModelConfig, ParallelConfig, VllmConfig +from vllm.config import ParallelConfig, VllmConfig +from vllm.device_allocator.cumem import CuMemAllocator from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, get_dtype_size +from vllm.utils import GiB_bytes from vllm.v1.core.scheduler import SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput @@ -77,6 +79,23 @@ def __init__( else: self.profiler = None + def sleep(self, level: int = 1) -> None: + free_bytes_before_sleep = torch.cuda.mem_get_info()[0] + allocator = CuMemAllocator.get_instance() + allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) + free_bytes_after_sleep, total = torch.cuda.mem_get_info() + freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep + used_bytes = total - free_bytes_after_sleep + assert freed_bytes >= 0, "Memory usage increased after sleeping." + logger.info( + "Sleep mode freed %.2f GiB memory, " + "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, + used_bytes / GiB_bytes) + + def wake_up(self) -> None: + allocator = CuMemAllocator.get_instance() + allocator.wake_up() + def init_device(self): if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until @@ -110,7 +129,17 @@ def init_device(self): self.model_runner = GPUModelRunner(self.vllm_config, self.device) def load_model(self) -> None: - self.model_runner.load_model() + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + assert allocator.get_current_usage() == 0, ( + "Sleep mode can only be " + "used for one instance per process.") + context = allocator.use_memory_pool(tag="weights") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self.model_runner.load_model() @torch.inference_mode() def determine_available_memory(self) -> int: @@ -167,15 +196,37 @@ def get_kv_cache_spec(self) -> KVCacheSpec: def initialize_cache(self, kv_cache_config: KVCacheConfig) -> None: """Allocate GPU KV cache with the specified kv_cache_config.""" - self.model_runner.initialize_kv_cache(kv_cache_config) + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + context = allocator.use_memory_pool(tag="kv_cache") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self.model_runner.initialize_kv_cache(kv_cache_config) def compile_or_warm_up_model(self) -> None: + # warm up sizes that are not in cudagraph capture sizes, + # but users still want to compile for better performance, + # e.g. 
for the max-num-batched token size in chunked prefill. + warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() + if not self.model_config.enforce_eager: + warmup_sizes = [ + x for x in warmup_sizes if x not in + self.vllm_config.compilation_config.cudagraph_capture_sizes + ] + for size in sorted(warmup_sizes, reverse=True): + logger.info("Compile and warming up model for size %d", size) + self.model_runner._dummy_run(size) if not self.model_config.enforce_eager: self.model_runner.capture_model() # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. set_random_seed(self.model_config.seed) + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, @@ -231,24 +282,3 @@ def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " "You can use float16 instead by explicitly setting the" "`dtype` flag in CLI, for example: --dtype=half.") - - -def _get_cache_block_size( - cache_config: CacheConfig, - model_config: ModelConfig, - parallel_config: ParallelConfig, -) -> int: - head_size = model_config.get_head_size() - num_heads = model_config.get_num_kv_heads(parallel_config) - num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) - - key_cache_block = cache_config.block_size * num_heads * head_size - value_cache_block = key_cache_block - total = num_attention_layers * (key_cache_block + value_cache_block) - if cache_config.cache_dtype == "auto": - dtype = model_config.dtype - else: - dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - dtype_size = get_dtype_size(dtype) - return dtype_size * total diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 303d9a15e9c3c..4b429b67b36f8 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -144,9 +144,7 @@ def __init__(self, runner: "CPUModelRunner", finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.runner = runner - self.chunked_prefill = (runner.scheduler_config.chunked_prefill_enabled or runner.cache_config.enable_prefix_caching) self.model_input_cls = self.runner._model_input_cls @@ -156,10 +154,17 @@ def __init__(self, self.device = self.runner.device self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper self.enable_lora = self.runner.lora_config is not None + if self.runner.attn_backend is not None: + # spec decode (e.g. 
Medusa) does not have atten backend + attn_backend = self.runner.attn_backend + self.att_metadata_builder = attn_backend.get_builder_cls()(self) + + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.input_data = ModelInputForCPUBuilder.ModelInputData( self.runner.model_config.uses_mrope) - self.att_metadata_builder = self.runner.attn_backend.get_builder_cls()( - self) + self.att_metadata_builder.prepare() def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -431,6 +436,7 @@ class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): """ _model_input_cls: Type[TModelInputForCPU] _builder_cls: Type[ModelInputForCPUBuilder] + builder: ModelInputForCPUBuilder def __init__( self, @@ -477,6 +483,10 @@ def __init__( # Set after load_model. self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None + if hasattr(self, "_builder_cls"): + # multi-step model runner does not have `_builder_cls` + self.builder = self._builder_cls(weakref.proxy(self)) + def load_model(self) -> None: self.model = get_model(vllm_config=self.vllm_config) @@ -509,6 +519,9 @@ def load_model(self) -> None: ) self.model = self.lora_manager.create_lora_manager(self.model) + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -519,10 +532,10 @@ def _prepare_model_input_tensors( metadata for possible additional steps, e.g., sampling. """ - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) - builder.set_seq_group_list(seq_group_metadata_list) + self.builder.prepare(finished_requests_ids) + self.builder.set_seq_group_list(seq_group_metadata_list) - return builder.build() # type: ignore + return self.builder.build() # type: ignore # sampler property will be used by spec_decode_worker @property diff --git a/vllm/worker/hpu_enc_dec_model_runner.py b/vllm/worker/hpu_enc_dec_model_runner.py index ca65701191c27..13d6758a34976 100644 --- a/vllm/worker/hpu_enc_dec_model_runner.py +++ b/vllm/worker/hpu_enc_dec_model_runner.py @@ -4,19 +4,22 @@ import math from array import array from functools import partial -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, cast +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union, cast import habana_frameworks.torch as htorch import torch from vllm_hpu_extension.ops import batch2block, block2batch from vllm.attention import AttentionMetadata +from vllm.distributed import broadcast_tensor_dict from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.sampling_metadata import SequenceGroupToSample from vllm.sampling_params import SamplingParams -from vllm.sequence import (IntermediateTensors, SequenceData, - SequenceGroupMetadata) +from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, + Logprob, SequenceData, SequenceGroupMetadata, + SequenceOutput) from vllm.utils import is_fake_hpu from vllm.worker.hpu_model_runner import (HpuModelAdapter, HPUModelRunnerBase, ModelInputForHPUWithSamplingMetadata, @@ -404,7 +407,27 @@ def warmup_scenario(self, profiler.start() for _ in range(times): inputs = self.prepare_model_input(seqs) - self.execute_model(inputs, kv_caches, warmup_mode=True) + is_single_step = \ + 
self.vllm_config.scheduler_config.num_scheduler_steps == 1 + if is_prompt or is_single_step: + self.execute_model(inputs, kv_caches, warmup_mode=True) + else: # decode with multi-step + inputs = dataclasses.replace(inputs, + is_first_multi_step=True, + is_last_step=False) + self.execute_model(inputs, + kv_caches, + warmup_mode=True, + num_steps=2, + seqs=seqs) + inputs = dataclasses.replace(inputs, + is_first_multi_step=False, + is_last_step=True) + self.execute_model(inputs, + kv_caches, + warmup_mode=True, + num_steps=2, + seqs=seqs) torch.hpu.synchronize() if profiler: profiler.step() @@ -419,7 +442,7 @@ def create_dummy_seq_group_metadata(self, is_prompt, lora_request=None, temperature=0): - sampling_params = SamplingParams(temperature=0) + sampling_params = SamplingParams(temperature=temperature) num_blocks = math.ceil(seq_len / self.block_size) cross_block_table: Optional[List[int]] = None encoder_dummy_data \ @@ -523,6 +546,19 @@ def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode): logger.warning("Configuration: (%s, %s, %s) was not warmed-up!", phase, batch_size, seq_len) + def add_dummy_seq(self, seq_group_metadata_list, is_prompt): + real_batch_size = len(seq_group_metadata_list) + batch_size_padded = self.bucketing_ctx.get_padded_batch_size( + real_batch_size, is_prompt) + batch_size_padding = batch_size_padded - real_batch_size + seq_group_metadata_list = seq_group_metadata_list.copy() + if batch_size_padding > 0: + dummy_seq_group_metadata = self.create_dummy_seq_group_metadata( + 0, 0, is_prompt) + seq_group_metadata_list.extend(dummy_seq_group_metadata + for _ in range(batch_size_padding)) + return seq_group_metadata_list + @torch.inference_mode() def execute_model( self, @@ -531,95 +567,264 @@ def execute_model( intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, warmup_mode=False, - ) -> Optional[List[SamplerOutput]]: - if num_steps > 1: - raise ValueError( - "num_steps > 1 is not supported in HPUEncoderDecoderModelRunner" - ) - - input_tokens = model_input.input_tokens - input_positions = model_input.input_positions - attn_metadata = model_input.attn_metadata - sampling_metadata = model_input.sampling_metadata - real_batch_size = model_input.real_batch_size - batch_size_padded = model_input.batch_size_padded - assert input_tokens is not None - assert input_positions is not None - assert sampling_metadata is not None - assert attn_metadata is not None - is_prompt = attn_metadata.is_prompt - assert is_prompt is not None - batch_size = input_tokens.size(0) - seq_len = self._seq_len(attn_metadata) - use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) - self._check_config(batch_size, seq_len, is_prompt, warmup_mode) - - execute_model_kwargs = { - "input_ids": input_tokens, - "positions": input_positions, - "kv_caches": kv_caches, - "attn_metadata": self.trim_attn_metadata(attn_metadata), - "intermediate_tensors": intermediate_tensors, - **(model_input.multi_modal_kwargs or {}), - } - if htorch.utils.internal.is_lazy(): - execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs}) - - htorch.core.mark_step() - if self.is_driver_worker: - model_event_name = ("model_" - f"{'prompt' if is_prompt else 'decode'}_" - f"bs{batch_size}_" - f"seq{seq_len}_" - f"graphs{'T' if use_graphs else 'F'}") + previous_hidden_states: Optional[torch.Tensor] = None, + seqs=None, + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + if not model_input.is_first_multi_step: + if not model_input.is_last_step: + # not first or 
last multi-step + return [] + # last multi-step + output = self._decode_sampler_outputs( + model_input) if self.is_driver_worker else [] + torch.hpu.synchronize() + if model_input.is_first_multi_step: + # first multi-step + input_tokens = model_input.input_tokens + input_positions = model_input.input_positions + attn_metadata = model_input.attn_metadata + sampling_metadata = model_input.sampling_metadata + real_batch_size = model_input.real_batch_size + batch_size_padded = model_input.batch_size_padded + assert input_tokens is not None + assert input_positions is not None + assert sampling_metadata is not None + assert attn_metadata is not None + is_prompt = attn_metadata.is_prompt + assert is_prompt is not None + batch_size = input_tokens.size(0) + seq_len = self._seq_len(attn_metadata) + use_graphs = self._use_graphs(batch_size, seq_len, is_prompt) + self._check_config(batch_size, seq_len, is_prompt, warmup_mode) + + execute_model_kwargs = { + "input_ids": input_tokens, + "positions": input_positions, + "kv_caches": kv_caches, + "attn_metadata": self.trim_attn_metadata(attn_metadata), + "intermediate_tensors": intermediate_tensors, + **(model_input.multi_modal_kwargs or {}), + } + if previous_hidden_states is not None: + execute_model_kwargs.update( + {"previous_hidden_states": previous_hidden_states}) + if htorch.utils.internal.is_lazy(): + execute_model_kwargs.update( + {"bypass_hpu_graphs": not use_graphs}) + + htorch.core.mark_step() + if self.is_driver_worker: + model_event_name = ("model_" + f"{'prompt' if is_prompt else 'decode'}_" + f"bs{batch_size}_" + f"seq{seq_len}_" + f"graphs{'T' if use_graphs else 'F'}") + else: + model_event_name = 'model_executable' + if num_steps > 1: + # in case of multi-step scheduling + # we only want to pythonize in the last step + sampling_metadata.skip_sampler_cpu_output = True + self.model.model.sampler.include_gpu_probs_tensor = True + cache_orig_output_tokens_len: List[Dict] = [] + + def try_revert_dummy_output_tokens(): + if len(cache_orig_output_tokens_len) > 0: + # Reuse the original output token ids length + for i in range(len(cache_orig_output_tokens_len)): + seq_group_metadata = seq_group_metadata_list[i] + for j, data in seq_group_metadata.seq_data.items(): + orig_output_tokens_len = \ + cache_orig_output_tokens_len[i][j] + data.output_token_ids = \ + data.output_token_ids[:orig_output_tokens_len] + + for i in range(num_steps): + if i != 0 and not self.is_driver_worker: + broadcast_data = broadcast_tensor_dict(src=0) + if 'early_exit' in broadcast_data and broadcast_data[ + 'early_exit']: + return [output] if num_steps == 1 else [] + execute_model_kwargs.update({ + "input_ids": + broadcast_data["input_ids"], + "positions": + broadcast_data["positions"], + "attn_metadata": + self.trim_attn_metadata( + broadcast_data["attn_metadata"]) + }) + with self.profiler.record_event('internal', model_event_name): + hidden_states = self.model.forward( + **execute_model_kwargs, + selected_token_indices=sampling_metadata. + selected_token_indices) + + # Compute the logits. + with self.profiler.record_event( + 'internal', + ('compute_logits_' + f'{"prompt" if is_prompt else "decode"}_bs' + f'{batch_size}_' + f'seq{seq_len}')): + if num_steps == 1: + sampling_metadata.selected_token_indices = None + logits = self.model.compute_logits(hidden_states, + sampling_metadata) + htorch.core.mark_step() + # Only perform sampling in the driver worker. 
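The `cache_orig_output_tokens_len` list and the `try_revert_dummy_output_tokens` helper above record the original output-token lengths so that placeholder tokens appended for intermediate multi-step decodes can be rolled back if the loop exits early. A simplified, self-contained sketch of that cache-and-revert idea, with plain dicts of lists standing in for `SequenceData` (540 is the same arbitrary placeholder value the diff uses):

from typing import Dict, List

def append_dummy_tokens(seqs: List[Dict[int, List[int]]],
                        dummy_token: int = 540) -> List[Dict[int, int]]:
    # Remember the original output length per sequence id, then pad with a
    # placeholder so the next prepare_decode step has a token to read.
    cached_lens = [{sid: len(toks) for sid, toks in seq.items()} for seq in seqs]
    for seq in seqs:
        for toks in seq.values():
            toks.append(dummy_token)
    return cached_lens

def revert_dummy_tokens(seqs: List[Dict[int, List[int]]],
                        cached_lens: List[Dict[int, int]]) -> None:
    # Truncate back to the cached lengths, dropping every placeholder.
    for seq, lens in zip(seqs, cached_lens):
        for sid, toks in seq.items():
            del toks[lens[sid]:]

seqs = [{0: [7, 8]}, {1: [9]}]
cached = append_dummy_tokens(seqs)
assert seqs == [{0: [7, 8, 540]}, {1: [9, 540]}]
revert_dummy_tokens(seqs, cached)
assert seqs == [{0: [7, 8]}, {1: [9]}]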
+ if not self.is_driver_worker: + continue + + if model_input.async_callback is not None: + model_input.async_callback() + # Sample the next token. + with self.profiler.record_event( + 'internal', ('sample_' + f'{"prompt" if is_prompt else "decode"}_' + f'bs{batch_size}_' + f'seq{seq_len}')): + output = self.model.sample( + logits=logits, + sampling_metadata=sampling_metadata, + ) + if num_steps > 1: + output = output.sampled_token_ids + self.cached_step_outputs.append( + output.detach().clone()) + htorch.core.mark_step() + if i < num_steps - 1: + if i == 0: + if model_input.async_callback is not None: + ctx = model_input.async_callback.keywords[ # type: ignore + "ctx"] + seq_group_metadata_list = \ + ctx.seq_group_metadata_list + elif seqs is not None: + seq_group_metadata_list = seqs + else: + raise RuntimeError( + "seq_group_metadata_list is uninitialized") + for seq_idx, seq_group_metadata in enumerate( + seq_group_metadata_list): + # Skip empty steps + seq_group_metadata.state.current_step += ( + num_steps - 2) + # Cache the original output token ids + cache_orig_output_tokens_len.append({}) + for j, data in seq_group_metadata.seq_data.items(): + cache_orig_output_tokens_len[seq_idx][j] = \ + len(data.output_token_ids) + seq_group_metadata_list = self.add_dummy_seq( + seq_group_metadata_list, is_prompt=False) + for seq_group_metadata in seq_group_metadata_list: + for data in seq_group_metadata.seq_data.values(): + max_output_len = sampling_metadata.seq_groups[ + 0].sampling_params.max_tokens + if len(data.output_token_ids) < max_output_len - 1: + # add a place holder for prepare_decode + # arbitrary value, this could be any token + dummy_token = (540, ) + data.output_token_ids += (dummy_token) + else: + broadcast_tensor_dict({'early_exit': True}, + src=0) + if num_steps == 1: + return [output] + else: + try_revert_dummy_output_tokens() + return [] + + result = self._prepare_decode(seq_group_metadata_list, + output=output) + execute_model_kwargs.update({ + "input_ids": + result.input_tokens, + "positions": + result.input_positions, + "attn_metadata": + self.trim_attn_metadata(result.attn_metadata) + }) + model_kwargs_broadcast_data = { + "input_ids": result.input_tokens, + "positions": result.input_positions, + "attn_metadata": vars(result.attn_metadata) + } + broadcast_tensor_dict(model_kwargs_broadcast_data, src=0) + else: + try_revert_dummy_output_tokens() + + if self.is_driver_worker and self.profiler.enabled: + # Stop recording 'execute_model' event + self.profiler.end() + event_end = self.profiler.get_timestamp_us() + counters = self.profiler_counter_helper.get_counter_dict( + cache_config=self.cache_config, + duration=event_end - self.event_start, + seq_len=seq_len, + batch_size_padded=batch_size_padded, + real_batch_size=real_batch_size, + is_prompt=is_prompt) + self.profiler.record_counter(self.event_start, counters) + if num_steps == 1: + if self.return_hidden_states: + # we only need to pass hidden states of most recent token + assert model_input.sampling_metadata is not None + if model_input.is_prompt: + output.prefill_hidden_states = hidden_states + output.hidden_states = hidden_states + return [output] if self.is_driver_worker else [] + else: + return [] + + return output if type(output) is list else [output] + + def _decode_sampler_outputs(self, model_input): + use_async_out_proc = model_input.async_callback is not None + sampler_outputs = [] + num_outputs = len(self.cached_step_outputs) + for i in range(num_outputs): + next_token_ids = self.cached_step_outputs.pop(0) + 
next_token_ids = next_token_ids.cpu().tolist() + sampler_output = self._make_decode_output( + next_token_ids, model_input.sampling_metadata.seq_groups) + sampler_outputs.append(sampler_output) + + if i < num_outputs - 1 and use_async_out_proc: + assert model_input.async_callback is not None + ctx = model_input.async_callback.keywords[ # type: ignore + "ctx"] + ctx.append_output( + outputs=[sampler_output], + seq_group_metadata_list=ctx.seq_group_metadata_list, + scheduler_outputs=ctx.scheduler_outputs, + is_async=False, + is_last_step=False, + is_first_step_output=False) + model_input.async_callback() + + if use_async_out_proc: + return [sampler_outputs[-1]] else: - model_event_name = 'model_executable' - with self.profiler.record_event('internal', model_event_name): - hidden_states = self.model.forward( - **execute_model_kwargs, - selected_token_indices=sampling_metadata.selected_token_indices - ) + return sampler_outputs - # Compute the logits. - with self.profiler.record_event( - 'internal', ('compute_logits_' - f'{"prompt" if is_prompt else "decode"}_bs' - f'{batch_size}_' - f'seq{seq_len}')): - sampling_metadata.selected_token_indices = None - logits = self.model.compute_logits(hidden_states, - sampling_metadata) - htorch.core.mark_step() - # Only perform sampling in the driver worker. - if not self.is_driver_worker: - return [] - - if model_input.async_callback is not None: - model_input.async_callback() - - # Sample the next token. - with self.profiler.record_event( - 'internal', ('sample_' - f'{"prompt" if is_prompt else "decode"}_' - f'bs{batch_size}_' - f'seq{seq_len}')): - output = self.model.sample( - logits=logits, - sampling_metadata=sampling_metadata, - ) - output.outputs = output.outputs[:real_batch_size] - htorch.core.mark_step() - - if self.is_driver_worker and self.profiler.enabled: - # Stop recording 'execute_model' event - self.profiler.end() - event_end = self.profiler.get_timestamp_us() - counters = self.profiler_counter_helper.get_counter_dict( - cache_config=self.cache_config, - duration=event_end - self.event_start, - seq_len=seq_len, - batch_size_padded=batch_size_padded, - real_batch_size=real_batch_size, - is_prompt=is_prompt) - self.profiler.record_counter(self.event_start, counters) - return [output] + def _make_decode_output( + self, + next_token_ids: List[List[int]], + seq_groups: List[SequenceGroupToSample], + ) -> SamplerOutput: + zero_logprob = Logprob(0.0) + sampler_outputs = [] + batch_idx = 0 + for seq_group in seq_groups: + seq_ids = seq_group.seq_ids + seq_outputs = [] + for seq_id in seq_ids: + next_token_id = next_token_ids[batch_idx][0] + seq_outputs.append( + SequenceOutput(seq_id, next_token_id, + {next_token_id: zero_logprob})) + batch_idx += 1 + sampler_outputs.append( + CompletionSequenceGroupOutput(seq_outputs, None)) + return SamplerOutput(sampler_outputs) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 404c5b70b6274..5c074a0ec7cad 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -19,6 +19,7 @@ import habana_frameworks.torch as htorch import habana_frameworks.torch.internal.bridge_config as bc import torch +import torch.nn as nn import vllm_hpu_extension.environment as environment from vllm_hpu_extension.bucketing import HPUBucketingContext from vllm_hpu_extension.flags import enabled_flags @@ -822,6 +823,11 @@ def _maybe_wrap_in_hpu_graph(self, *args, **kwargs): ) if htorch.utils.internal.is_lazy() else HpuModelAdapter( *args, **kwargs) + def get_model(self) -> 
nn.Module: + if isinstance(self.model, HpuModelAdapter): + return self.model.model + return self.model + def _use_graphs(self, batch_size, seq_len, is_prompt): if self.enforce_eager: return False @@ -1069,7 +1075,9 @@ def _prepare_prompt( num_prefill_tokens=num_prefill_tokens, num_decode_tokens=0, slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=placeholder_index_maps) + multi_modal_placeholder_index_maps=placeholder_index_maps, + enable_kv_scales_calculation=False, + ) multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) for t in multi_modal_kwargs: if torch.is_tensor(multi_modal_kwargs[t]): @@ -1097,11 +1105,14 @@ def _prepare_decode( input_positions: List[List[int]] = [] slot_mapping: List[List[int]] = [] seq_lens: List[int] = [] + encoder_seq_lens: List[int] = [] + cross_block_tables: List[List[int]] = [] block_tables: List[List[int]] = [] lora_index_mapping: List[List[int]] = [] lora_prompt_mapping: List[List[int]] = [] lora_requests: Set[LoRARequest] = set() + is_enc_dec_model = self.model_config.is_encoder_decoder if len(seq_group_metadata_list) == 0: return PrepareDecodeMetadata.empty() lora_ids: List[int] = [] @@ -1116,6 +1127,15 @@ def _prepare_decode( seq_ids = list(seq_group_metadata.seq_data.keys()) lora_id = seq_group_metadata.lora_int_id lora_ids.append(lora_id) + if is_enc_dec_model: + for _ in range(len(seq_group_metadata.seq_data)): + encoder_seq_len = ( + seq_group_metadata.encoder_seq_data.get_len() + if seq_group_metadata.encoder_seq_data else 0) + encoder_seq_lens.append(encoder_seq_len) + cross_block_table = seq_group_metadata.cross_block_table + cross_block_tables.append([] if ( + cross_block_table is None) else cross_block_table) if lora_id > 0: lora_requests.add(seq_group_metadata.lora_request) @@ -1186,6 +1206,30 @@ def _prepare_decode( assert len(block_list) == len(block_groups) assert len(block_list) == len(block_usage) + if is_enc_dec_model: + last_cross_block_usage = [ + (encoder_seq_len - 1) % self.block_size + 1 + for encoder_seq_len in encoder_seq_lens + ] + cross_block_groups = [[i] * len(bt) + for i, bt in enumerate(cross_block_tables)] + cross_block_usage = [ + [self.block_size] * (len(bt) - 1) + [lbu] + for bt, lbu in zip(cross_block_tables, last_cross_block_usage) + if bt + ] + cross_block_list = flatten(cross_block_tables) + cross_block_groups = flatten(cross_block_groups) + cross_block_usage = flatten(cross_block_usage) + assert len(cross_block_list) == len(cross_block_groups) + assert len(cross_block_list) == len(cross_block_usage) + + else: + cross_block_list = None + cross_block_groups = None + cross_block_usage = None + encoder_seq_lens_tensor = None + padding_fn = None if self.use_contiguous_pa: block_bucket_size = max(max(block_list) + 1, len(block_list)) @@ -1207,6 +1251,50 @@ def _prepare_decode( block_groups = padding_fn(block_groups, -1) block_usage = padding_fn(block_usage, 1) + if is_enc_dec_model: + if self.use_contiguous_pa: + cross_block_bucket_size = max( + max(cross_block_list) + + 1, len(cross_block_list)) if cross_block_list else 0 + cross_block_bucket_size = \ + self.bucketing_ctx.get_padded_decode_num_blocks( + cross_block_bucket_size) + indices = [None] * cross_block_bucket_size + for i, bid in enumerate(cross_block_list): + indices[bid] = i + padding_fn = lambda tensor, pad_value: gather_list( + tensor, indices, pad_value) + else: + cross_block_bucket_size = \ + self.bucketing_ctx.get_padded_decode_num_blocks( + len(cross_block_list)) + padding_fn = lambda tensor, pad_value: pad_list( + tensor, 
cross_block_bucket_size, pad_value) + + real_batch_size = len(seq_group_metadata_list) + batch_size_padded = self.bucketing_ctx.get_padded_batch_size( + real_batch_size, False) + batch_size_padding = batch_size_padded - real_batch_size + if batch_size_padding > 0: + encoder_seq_lens.extend(encoder_seq_lens[0] + for _ in range(batch_size_padding)) + cross_block_list = padding_fn(cross_block_list, _PAD_BLOCK_ID) + cross_block_groups = padding_fn(cross_block_groups, -1) + cross_block_usage = padding_fn(cross_block_usage, 1) + + cross_block_list = torch.tensor(cross_block_list, + dtype=torch.int, + device='cpu') + cross_block_groups = torch.tensor(cross_block_groups, + dtype=torch.int, + device='cpu') + cross_block_usage = torch.tensor(cross_block_usage, + dtype=self.model_config.dtype, + device='cpu') + encoder_seq_lens_tensor = torch.tensor(encoder_seq_lens, + dtype=torch.long, + device='cpu') + block_list = torch.tensor(block_list, dtype=torch.int, device='cpu') block_groups = torch.tensor(block_groups, dtype=torch.int, @@ -1230,6 +1318,15 @@ def _prepare_decode( self.device, non_blocking=True) slot_mapping = slot_mapping.to( # type: ignore self.device, non_blocking=True) + if is_enc_dec_model: + cross_block_list = cross_block_list.to( # type: ignore + self.device, non_blocking=True) + cross_block_groups = cross_block_groups.to( # type: ignore + self.device, non_blocking=True) + cross_block_usage = cross_block_usage.to( # type: ignore + self.device, non_blocking=True) + encoder_seq_lens_tensor = encoder_seq_lens_tensor.to( # type: ignore + self.device, non_blocking=True) attn_metadata = self.attn_backend.make_metadata( is_prompt=False, @@ -1242,12 +1339,19 @@ def _prepare_decode( block_groups=block_groups, attn_bias=None, seq_lens_tensor=None, + encoder_seq_lens=encoder_seq_lens, + encoder_seq_lens_tensor=encoder_seq_lens_tensor, + cross_block_list=cross_block_list, + cross_block_groups=cross_block_groups, + cross_block_usage=cross_block_usage, context_lens_tensor=None, num_prefills=0, num_prefill_tokens=0, num_decode_tokens=num_decode_tokens, slot_mapping=slot_mapping, - multi_modal_placeholder_index_maps=None) + multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, + ) return PrepareDecodeMetadata(input_tokens=input_tokens, input_positions=input_positions, attn_metadata=attn_metadata, @@ -2277,8 +2381,14 @@ def try_revert_dummy_output_tokens(): broadcast_data["attn_metadata"]) }) - # Model forward - with self.profiler.record_event('internal', model_event_name): + profiler_args = { + 'real_seq_len': model_input.seq_lens, + 'real_batch_size': real_batch_size + } + + with self.profiler.record_event('internal', + model_event_name, + args=profiler_args): hidden_states = self.model.forward( **execute_model_kwargs, selected_token_indices=sampling_metadata. 
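The `_prepare_decode` changes above assemble cross-attention block metadata for encoder-decoder models as three flat lists: block ids, the owning sequence index per block, and per-block usage (full blocks plus a partially filled last block). A condensed sketch of that flattening, assuming a fixed block size and omitting the bucketing and padding the real code adds (`flatten_cross_blocks` is a hypothetical helper):

from typing import List, Tuple

def flatten_cross_blocks(
    cross_block_tables: List[List[int]],
    encoder_seq_lens: List[int],
    block_size: int,
) -> Tuple[List[int], List[int], List[int]]:
    block_list: List[int] = []
    block_groups: List[int] = []
    block_usage: List[int] = []
    for seq_idx, (table, enc_len) in enumerate(
            zip(cross_block_tables, encoder_seq_lens)):
        if not table:
            continue
        # Number of tokens actually stored in the last block of this sequence.
        last_usage = (enc_len - 1) % block_size + 1
        block_list.extend(table)
        block_groups.extend([seq_idx] * len(table))
        block_usage.extend([block_size] * (len(table) - 1) + [last_usage])
    return block_list, block_groups, block_usage

blocks, groups, usage = flatten_cross_blocks(
    cross_block_tables=[[3, 4], [7]],
    encoder_seq_lens=[150, 60],
    block_size=128,
)
assert blocks == [3, 4, 7]
assert groups == [0, 0, 1]
assert usage == [128, 22, 60]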
@@ -2332,7 +2442,8 @@ def try_revert_dummy_output_tokens(): ('compute_logits_' f'{"prompt" if is_prompt else "decode"}_bs' f'{batch_size}_' - f'seq{seq_len}')): + f'seq{seq_len}'), + args=profiler_args): if num_steps == 1: sampling_metadata.selected_token_indices = None logits = self.model.compute_logits(hidden_states, @@ -2375,7 +2486,8 @@ def try_revert_dummy_output_tokens(): 'internal', ('sample_' f'{"prompt" if is_prompt else "decode"}_' f'bs{batch_size}_' - f'seq{seq_len}')): + f'seq{seq_len}'), + args=profiler_args): output = self.model.sample( logits=logits, sampling_metadata=sampling_metadata, diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 969971f2e25cd..f851a5d3515c3 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -235,8 +235,8 @@ def execute_model( 'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0' log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS', '0') != '0' or log_cpu_fallbacks_all - if (log_graph_compilation or log_cpu_fallbacks) \ - and execute_model_req is not None: + if (log_graph_compilation or log_cpu_fallbacks) and \ + execute_model_req is not None: from habana_frameworks.torch.hpu.metrics import metric_localcontext seq_group_metadata_list = execute_model_req.seq_group_metadata_list is_prompt = any([ @@ -265,13 +265,13 @@ def execute_model( cpu_fallback_ctx as cpu_fallback_local_metric: output = LocalOrDistributedWorkerBase.execute_model( self, execute_model_req) - if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0 - ) or log_graph_compilation_all: + if (log_graph_compilation and gc_local_metric.stats()[0][1] + > 0) or log_graph_compilation_all: msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: " f"{gc_local_metric.stats()}, {input_stats}") logger.warning(msg) - if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] > - 0) or log_cpu_fallbacks_all: + if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] + > 0) or log_cpu_fallbacks_all: msg = ("VLLM_HPU_STEP_CPU_FALLBACK: " f"{cpu_fallback_local_metric.stats()}, {input_stats}") logger.warning(msg) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ae8b7f97c827d..bf1a40d48a789 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -3,7 +3,6 @@ import inspect import itertools import time -import warnings import weakref from contextlib import contextmanager from dataclasses import dataclass @@ -41,7 +40,6 @@ from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalKwargs, MultiModalPlaceholderMap, MultiModalRegistry) -from vllm.platforms import current_platform from vllm.prompt_adapter.layers import PromptAdapterMapping from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.prompt_adapter.worker_manager import ( @@ -457,17 +455,12 @@ def __init__(self, self.enable_prompt_adapter = (self.runner.prompt_adapter_config is not None) self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper - self.finished_requests_ids = finished_requests_ids - self.decode_only = True - - # Intermediate data (data in CPU before going to GPU) for - # the current sequence group. - self.inter_data_list: List[ - ModelInputForGPUBuilder.InterDataForSeqGroup] = [] # Attention metadata inputs. - self.attn_metadata_builder = self.attn_backend.make_metadata_builder( - weakref.proxy(self)) + if self.attn_backend is not None: + # spec decode (e.g. 
Medusa) does not have atten backend + self.attn_metadata_builder = self.attn_backend.get_builder_cls()( + weakref.proxy(self)) # Engine/Model configurations. self.chunked_prefill_enabled = ( @@ -479,6 +472,21 @@ def __init__(self, self.block_aligned_sliding_window = \ self.sliding_window_blocks * self.block_size + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.finished_requests_ids = finished_requests_ids + + # if the current batch is decode-only. + # will be set to False if there is any non-decode request. + self.decode_only = True + + # Intermediate data (data in CPU before going to GPU) for + # the current sequence group. + self.inter_data_list: List[ + ModelInputForGPUBuilder.InterDataForSeqGroup] = [] + + self.attn_metadata_builder.prepare() + def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, seq_group_metadata: SequenceGroupMetadata): """Compute context length, sequence length and tokens @@ -993,6 +1001,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): """ _model_input_cls: Type[TModelInputForGPU] _builder_cls: Type[ModelInputForGPUBuilder] + builder: ModelInputForGPUBuilder def __init__( self, @@ -1093,6 +1102,10 @@ def __init__( SamplingMetadataCache() \ if self.parallel_config.pipeline_parallel_size == 1 else None + if hasattr(self, "_builder_cls"): + # multi-step model runner does not have `_builder_cls` + self.builder = self._builder_cls(weakref.proxy(self)) + def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: @@ -1139,34 +1152,6 @@ def load_model(self) -> None: self.prompt_adapter_manager.create_prompt_adapter_manager( self.model)) - if self.kv_cache_dtype == "fp8" and (current_platform.is_rocm() - or current_platform.is_cuda()): - # Currently only ROCm accepts kv-cache scaling factors - # via quantization_param_path and this will be deprecated - # in the future. - if self.model_config.quantization_param_path is not None: - if callable(getattr(self.model, "load_kv_cache_scales", None)): - warnings.warn( - "Loading kv cache scaling factor from JSON is " - "deprecated and will be removed. Please include " - "kv cache scaling factors in the model checkpoint.", - FutureWarning, - stacklevel=2) - self.model.load_kv_cache_scales( - self.model_config.quantization_param_path) - logger.info("Loaded KV cache scaling factors from %s", - self.model_config.quantization_param_path) - else: - raise RuntimeError( - "Using FP8 KV cache and scaling factors provided but " - "model %s does not support loading scaling factors.", - self.model.__class__) - else: - logger.warning( - "Using FP8 KV cache but no scaling factors " - "provided. Defaulting to scaling factors of 1.0. " - "This may lead to less accurate results!") - if self.vllm_config.compilation_config.level ==\ CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): backend = self.vllm_config.compilation_config.init_backend( @@ -1176,6 +1161,9 @@ def load_model(self) -> None: fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, backend=backend) + def get_model(self) -> nn.Module: + return self.model + def save_sharded_state( self, path: str, @@ -1223,13 +1211,13 @@ def _prepare_model_input_tensors( If cuda graph is required, this API automatically pads inputs. 
""" - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) + self.builder.prepare(finished_requests_ids) for seq_group_metadata in seq_group_metadata_list: - builder.add_seq_group(seq_group_metadata) + self.builder.add_seq_group(seq_group_metadata) - builder.reset_cached_inter_data() + self.builder.reset_cached_inter_data() - return builder.build() # type: ignore + return self.builder.build() # type: ignore @contextmanager def set_in_profile_run(self): @@ -1241,13 +1229,19 @@ def set_in_profile_run(self): @torch.inference_mode() def profile_run(self) -> None: + max_num_batched_tokens = \ + self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + self._dummy_run(max_num_batched_tokens, max_num_seqs) + + def _dummy_run(self, + max_num_batched_tokens: int, + max_num_seqs: int = 1) -> None: with self.set_in_profile_run(): # Enable top-k sampling to reflect the accurate memory usage. sampling_params = \ SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = \ - self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs + # This represents the maximum number of different requests # that will have unique loras, an therefore the max amount of memory # consumption create dummy lora request copies from the lora request @@ -1345,6 +1339,10 @@ def profile_run(self) -> None: dtype=self.model_config.dtype, device=self.device) + # Disable KV Scale Calculation for dummy data during profile run + if model_input.attn_metadata is not None: + model_input.attn_metadata.enable_kv_scales_calculation = False + self.execute_model(model_input, kv_caches, intermediate_tensors) torch.cuda.synchronize() return @@ -1476,19 +1474,21 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: for virtual_engine in range( self.parallel_config.pipeline_parallel_size): # Only rank 0 should print progress bar during capture - capture_sizes = ( - tqdm( - self.vllm_config.compilation_config.capture_sizes, - desc="Capturing CUDA graph shapes", - ) if get_tensor_model_parallel_rank() == 0 else - self.vllm_config.compilation_config.capture_sizes) - for batch_size in capture_sizes: + cudagraph_capture_sizes = (tqdm( + self.vllm_config.compilation_config. + cudagraph_capture_sizes, + desc="Capturing CUDA graph shapes", + ) if get_tensor_model_parallel_rank() == 0 else + self.vllm_config.compilation_config. + cudagraph_capture_sizes) + for batch_size in cudagraph_capture_sizes: attn_metadata = ( self.attn_state.graph_capture_get_metadata_for_batch( batch_size, is_encoder_decoder_model=self.model_config. is_encoder_decoder)) - + # Disable KV Scale Calculation for graph capture + attn_metadata.enable_kv_scales_calculation = False if self.lora_config: lora_mapping = LoRAMapping( **dict(index_mapping=[0] * batch_size, diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index c7abad7e0258d..aef4bdcdd4bf9 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -7,6 +7,7 @@ Optional, Type, TypeVar) import torch +import torch.nn as nn from torch import is_tensor from vllm.config import VllmConfig @@ -199,6 +200,11 @@ class ModelRunnerInputBuilderBase(ABC, Generic[T]): """A builder to create ModelRunnerInputBase objects. 
""" + @abstractmethod + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + raise NotImplementedError + @abstractmethod def add_seq_group(self, seq_group_metadata): """TBA""" @@ -264,6 +270,10 @@ def prepare_model_input( """ raise NotImplementedError + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + def execute_model( self, model_input: T, @@ -297,9 +307,9 @@ class ModelRunnerWrapperBase: def __init__( self, - moderl_runner: ModelRunnerBase, + model_runner: ModelRunnerBase, ) -> None: - self.model_runner: ModelRunnerBase = moderl_runner + self.model_runner: ModelRunnerBase = model_runner def __getattr__(self, attr): return getattr(self.model_runner, attr) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index a35f5467e1a1f..596c26eac28bd 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -113,6 +113,9 @@ def load_model(self) -> None: raise NotImplementedError( "Supports only Transformer-NeuronX based models.") + def get_model(self) -> nn.Module: + return self.model + def _prepare_prompt( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index a38b5a4e6e8d5..42fe2cf668ad8 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -84,6 +84,9 @@ def load_model(self) -> None: kv_cache_dtype=self.kv_cache_dtype, ov_core=self.ov_core) + def get_model(self) -> nn.Module: + return self.model + def _prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], @@ -279,6 +282,7 @@ def _prepare_model_input( block_indices_begins=block_indices_begins_tensor, max_context_len=max_context_len_tensor, multi_modal_placeholder_index_maps=placeholder_index_maps, + enable_kv_scales_calculation=False, ) multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index 50a155d22c666..f5b46cde3969c 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -4,6 +4,7 @@ import openvino as ov import torch import torch.distributed +import torch.nn as nn import vllm.envs as envs from vllm.attention import get_attn_backend @@ -362,6 +363,9 @@ def cache_copy( ) -> None: self.cache_engine.copy(blocks_to_copy) # type: ignore + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + @torch.inference_mode() def execute_model( self, diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 52c577bccab9c..8749518284288 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -158,6 +158,9 @@ def load_model(self) -> None: fullgraph=True, dynamic=False) + def get_model(self) -> nn.Module: + return self.model.model + def _dummy_run( self, batch_size: int, @@ -187,6 +190,7 @@ def _dummy_run( num_decode_tokens=0, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, block_tables=None, context_lens=None, effective_query_lens=None, @@ -205,6 +209,7 @@ def _dummy_run( num_decode_tokens=0, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, block_tables=block_tables, context_lens=context_lens, effective_query_lens=effective_query_lens, @@ -236,6 +241,7 @@ def _dummy_run( num_decode_tokens=batch_size * seq_len, slot_mapping=slot_mapping, 
multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, block_tables=block_tables, context_lens=context_lens, ) @@ -310,8 +316,8 @@ def warmup_model( logger.info("batch_size: %d, seq_len: %d", batch_size, seq_len) num_tokens = batch_size * seq_len - if (num_tokens >= - self.scheduler_config.max_num_batched_tokens): + if (num_tokens + >= self.scheduler_config.max_num_batched_tokens): break seq_len = seq_len * 2 end = time.time() @@ -422,6 +428,7 @@ def _prepare_prompt( num_decode_tokens=0, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, block_tables=block_tables, context_lens=context_lens, effective_query_lens=prompt_lens, @@ -493,6 +500,7 @@ def _prepare_decode( num_decode_tokens=batch_size, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, block_tables=block_tables, context_lens=context_lens, ) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 43eeb287d64eb..24bba79fedd75 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -8,6 +8,7 @@ import vllm.envs as envs from vllm.config import VllmConfig +from vllm.device_allocator.cumem import CuMemAllocator from vllm.distributed import (ensure_kv_transfer_initialized, ensure_model_parallel_initialized, init_distributed_environment, @@ -21,7 +22,8 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.utils import GiB_bytes, bind_kv_cache, memory_profiling +from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache, + memory_profiling) from vllm.worker.cache_engine import CacheEngine from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner @@ -119,6 +121,23 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() + def sleep(self, level: int = 1) -> None: + free_bytes_before_sleep = torch.cuda.mem_get_info()[0] + allocator = CuMemAllocator.get_instance() + allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple()) + free_bytes_after_sleep, total = torch.cuda.mem_get_info() + freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep + used_bytes = total - free_bytes_after_sleep + assert freed_bytes >= 0, "Memory usage increased after sleeping." 
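The `sleep()` implementation above brackets the allocator call with `torch.cuda.mem_get_info()` to report how much device memory was actually released and how much remains in use. The same accounting can be expressed as a small reusable probe; a sketch assuming a CUDA device is available (the commented usage is hypothetical):

from contextlib import contextmanager
import torch

@contextmanager
def cuda_free_memory_delta(label: str):
    # Record free device memory before and after the wrapped operation.
    free_before = torch.cuda.mem_get_info()[0]
    yield
    free_after, total = torch.cuda.mem_get_info()
    freed_gib = (free_after - free_before) / (1 << 30)
    in_use_gib = (total - free_after) / (1 << 30)
    print(f"{label}: freed {freed_gib:.2f} GiB, {in_use_gib:.2f} GiB still in use")

# Hypothetical usage: wrap any call that is expected to release GPU memory.
# with cuda_free_memory_delta("sleep"):
#     CuMemAllocator.get_instance().sleep(offload_tags=("weights",))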
+ logger.info( + "Sleep mode freed %.2f GiB memory, " + "%.2f GiB memory is still in use.", freed_bytes / GiB_bytes, + used_bytes / GiB_bytes) + + def wake_up(self) -> None: + allocator = CuMemAllocator.get_instance() + allocator.wake_up() + def init_device(self) -> None: if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until @@ -137,7 +156,8 @@ def init_device(self) -> None: _check_if_gpu_supports_dtype(self.model_config.dtype) gc.collect() torch.cuda.empty_cache() - self.init_gpu_memory = torch.cuda.mem_get_info()[0] + torch.cuda.reset_peak_memory_stats() + self.baseline_snapshot = MemorySnapshot() else: raise RuntimeError( f"Not support device type: {self.device_config.device}") @@ -149,7 +169,17 @@ def init_device(self) -> None: set_random_seed(self.model_config.seed) def load_model(self): - self.model_runner.load_model() + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + assert allocator.get_current_usage() == 0, ( + "Sleep mode can only be " + "used for one instance per process.") + context = allocator.use_memory_pool(tag="weights") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self.model_runner.load_model() def save_sharded_state( self, @@ -192,10 +222,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: # Execute a forward pass with dummy inputs to profile the memory usage # of the model. - with memory_profiling(baseline_memory_in_bytes=total_gpu_memory - - self.init_gpu_memory, - weights_memory_in_bytes=self.model_runner. - model_memory_usage) as result: + with memory_profiling( + self.baseline_snapshot, + weights_memory=self.model_runner.model_memory_usage) as result: self.model_runner.profile_run() self._assert_memory_footprint_increased_during_profiling() @@ -203,7 +232,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: memory_for_current_instance = total_gpu_memory * \ self.cache_config.gpu_memory_utilization available_kv_cache_memory = (memory_for_current_instance - - result.non_kv_cache_memory_in_bytes) + result.non_kv_cache_memory) # Calculate the number of blocks that can be allocated with the # profiled peak memory. @@ -226,11 +255,11 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: f"({self.cache_config.gpu_memory_utilization:.2f})" f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" "model weights take " - f"{(result.weights_memory_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.weights_memory / GiB_bytes):.2f}GiB;" " non_torch_memory takes " - f"{(result.non_torch_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.non_torch_increase / GiB_bytes):.2f}GiB;" " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase_in_bytes / GiB_bytes):.2f}GiB;" + f"{(result.torch_peak_increase / GiB_bytes):.2f}GiB;" " the rest of the memory reserved for KV Cache is " f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB.") @@ -246,11 +275,13 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: def _assert_memory_footprint_increased_during_profiling(self): # NOTE(woosuk): Here we assume that the other processes using the same # GPU did not change their memory usage during the profiling. - free_gpu_memory, _ = torch.cuda.mem_get_info() - assert self.init_gpu_memory - free_gpu_memory > 0, ( + free_gpu_memory, total = torch.cuda.mem_get_info() + cuda_memory = total - free_gpu_memory + assert self.baseline_snapshot.cuda_memory < cuda_memory, ( "Error in memory profiling. 
" - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. This happens when the GPU memory was " + f"Initial used memory {self.baseline_snapshot.cuda_memory}, " + f"currently used memory {cuda_memory}. " + f"This happens when the GPU memory was " "not properly cleaned up before initializing the vLLM instance.") def initialize_cache(self, num_gpu_blocks: int, @@ -267,7 +298,14 @@ def initialize_cache(self, num_gpu_blocks: int, self.cache_config.num_gpu_blocks = num_gpu_blocks self.cache_config.num_cpu_blocks = num_cpu_blocks - self._init_cache_engine() + if self.vllm_config.model_config.enable_sleep_mode: + allocator = CuMemAllocator.get_instance() + context = allocator.use_memory_pool(tag="kv_cache") + else: + from contextlib import nullcontext + context = nullcontext() + with context: + self._init_cache_engine() self._warm_up_model() def _init_cache_engine(self): @@ -285,6 +323,18 @@ def _init_cache_engine(self): self.gpu_cache) def _warm_up_model(self) -> None: + # warm up sizes that are not in cudagraph capture sizes, + # but users still want to compile for better performance, + # e.g. for the max-num-batched token size in chunked prefill. + warmup_sizes = self.vllm_config.compilation_config.compile_sizes.copy() + if not self.model_config.enforce_eager: + warmup_sizes = [ + x for x in warmup_sizes if x not in + self.vllm_config.compilation_config.cudagraph_capture_sizes + ] + for size in sorted(warmup_sizes, reverse=True): + logger.info("Compile and warming up model for size %d", size) + self.model_runner._dummy_run(size) if not self.model_config.enforce_eager: self.model_runner.capture_model(self.gpu_cache) # Reset the seed to ensure that the random state is not affected by diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index f434c7082bd2b..783b61e85238b 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -6,8 +6,10 @@ import cloudpickle import torch +import torch.nn as nn -from vllm.config import ObservabilityConfig, VllmConfig +from vllm.config import (ObservabilityConfig, VllmConfig, + set_current_vllm_config) from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -90,6 +92,11 @@ def start_worker_execution_loop(self) -> None: if output is None: return None + @abstractmethod + def get_model(self) -> nn.Module: + raise NotImplementedError + + @abstractmethod def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -151,6 +158,9 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) + def get_model(self) -> nn.Module: + return self.worker.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None @@ -367,6 +377,9 @@ def prepare_input( else: return self._get_worker_input_from_broadcast() + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + def execute_model( self, execute_model_req: Optional[ExecuteModelRequest] = None, @@ -490,8 +503,11 @@ def __init__( group. """ self.rpc_rank = rpc_rank - self.vllm_config = vllm_config self.worker: Optional[WorkerBase] = None + # do not store this `vllm_config`, `init_worker` will set the final + # one. TODO: investigate if we can remove this field in + # `WorkerWrapperBase`, `init_cached_hf_modules` should be + # unnecessary now. 
if vllm_config.model_config is not None: # it can be None in tests trust_remote_code = vllm_config.model_config.trust_remote_code @@ -525,11 +541,11 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: Arguments are passed to the worker class constructor. """ kwargs = all_kwargs[self.rpc_rank] + self.vllm_config = kwargs.get("vllm_config", None) + assert self.vllm_config is not None, ( + "vllm_config is required to initialize the worker") enable_trace_function_call_for_thread(self.vllm_config) - from vllm import configure_as_vllm_process - configure_as_vllm_process() - from vllm.plugins import load_general_plugins load_general_plugins() @@ -541,8 +557,10 @@ def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None: bytes) worker_class = cloudpickle.loads( self.vllm_config.parallel_config.worker_cls) - self.worker = worker_class(**kwargs) - assert self.worker is not None + with set_current_vllm_config(self.vllm_config): + # To make vLLM config available during worker initialization + self.worker = worker_class(**kwargs) + assert self.worker is not None def execute_method(self, method: Union[str, bytes], *args, **kwargs): try: diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 82b8f22a5af33..b7b7b7227b22c 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -113,7 +113,6 @@ def __init__(self, runner: "XPUModelRunner", finished_requests_ids: Optional[List[str]] = None) -> None: super().__init__() - self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] self.runner = runner self.model_input_cls = self.runner._model_input_cls self.attn_backend = self.runner.attn_backend @@ -121,6 +120,10 @@ def __init__(self, self.block_size = self.runner.block_size self.device = self.runner.device + def prepare(self, + finished_requests_ids: Optional[List[str]] = None) -> None: + self.seq_group_metadata_list: List[SequenceGroupMetadata] = [] + def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -258,6 +261,7 @@ def _prepare_prompt( is_prompt=True, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=placeholder_index_maps, + enable_kv_scales_calculation=False, seq_lens=seq_lens, seqlen_q=seqlen_q, max_seqlen=max_seqlen, @@ -342,6 +346,7 @@ def _prepare_decode( is_prompt=False, slot_mapping=slot_mapping, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, seq_lens=seq_lens, seqlen_q=torch.tensor([]), max_seqlen=0, @@ -408,6 +413,8 @@ def __init__( SamplingMetadataCache() \ if self.parallel_config.pipeline_parallel_size == 1 else None + self.builder = self._builder_cls(weakref.proxy(self)) + def load_model(self) -> None: with DeviceMemoryProfiler() as m: self.model = get_model(vllm_config=self.vllm_config) @@ -416,6 +423,9 @@ def load_model(self) -> None: logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30)) + def get_model(self) -> nn.Module: + return self.model + @property def vocab_size(self) -> int: return self.model_config.get_vocab_size() @@ -514,7 +524,8 @@ def _prepare_model_input_tensors( metadata for possible additional steps, e.g., sampling. """ - builder = self._builder_cls(weakref.proxy(self), finished_requests_ids) + builder = self.builder + builder.prepare(finished_requests_ids) for seq_group_metadata in seq_group_metadata_list: builder.add_seq_group(seq_group_metadata)
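Several of the runners touched above (CPU, GPU, and the XPU runner in this last hunk) switch from constructing a fresh input builder for every batch to constructing it once and calling `prepare()` to reset per-batch state. A condensed sketch of that reuse pattern with hypothetical minimal classes:

from typing import List, Optional

class InputBuilder:
    def __init__(self, runner) -> None:
        self.runner = runner                      # long-lived wiring happens once
        self.seq_groups: List[str] = []
        self.finished_requests_ids: Optional[List[str]] = None

    def prepare(self, finished_requests_ids: Optional[List[str]] = None) -> None:
        # Per-step reset; avoids re-creating the builder for every batch.
        self.seq_groups = []
        self.finished_requests_ids = finished_requests_ids

    def add_seq_group(self, seq_group: str) -> None:
        self.seq_groups.append(seq_group)

    def build(self) -> List[str]:
        return list(self.seq_groups)

class Runner:
    def __init__(self) -> None:
        self.builder = InputBuilder(self)         # constructed once in __init__

    def prepare_inputs(self, seq_groups: List[str],
                       finished: Optional[List[str]] = None) -> List[str]:
        self.builder.prepare(finished)
        for sg in seq_groups:
            self.builder.add_seq_group(sg)
        return self.builder.build()

runner = Runner()
assert runner.prepare_inputs(["a", "b"]) == ["a", "b"]
assert runner.prepare_inputs(["c"]) == ["c"]      # state reset between steps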