diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 868b8e95db01d..679abf1814aa5 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -1,5 +1,6 @@ steps: - label: "Wait for container to be ready" + key: wait-for-container-image agents: queue: A100 plugins: @@ -10,12 +11,11 @@ steps: command: - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - - wait - - label: "A100" # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: A100 + depends_on: wait-for-container-image plugins: - kubernetes: podSpec: @@ -49,6 +49,7 @@ steps: # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: H200 + depends_on: wait-for-container-image plugins: - docker#v5.12.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT @@ -73,7 +74,7 @@ steps: # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: H100 - depends_on: ~ + depends_on: wait-for-container-image plugins: - docker#v5.12.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c6f8316412e2f..529daf54faecf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -363,12 +363,14 @@ steps: - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language - tests/models/embedding/vision_language + - tests/models/encoder_decoder/audio_language - tests/models/encoder_decoder/vision_language commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s models/embedding/vision_language -m core_model + - pytest -v -s models/encoder_decoder/audio_language -m core_model - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/400-bug report.yml rename to .github/ISSUE_TEMPLATE/400-bug-report.yml diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature-request.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/500-feature request.yml rename to .github/ISSUE_TEMPLATE/500-feature-request.yml diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/600-new model.yml rename to .github/ISSUE_TEMPLATE/600-new-model.yml diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/700-performance discussion.yml rename to .github/ISSUE_TEMPLATE/700-performance-discussion.yml diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/800-misc discussion.yml rename to .github/ISSUE_TEMPLATE/800-misc-discussion.yml diff --git a/CMakeLists.txt b/CMakeLists.txt index 3206d76125545..f4b9c3ec9c14f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -550,7 +550,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb + GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/Dockerfile b/Dockerfile index 153bff9cf565f..088314eb38dbe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -234,8 +234,8 @@ RUN mv vllm test_docs/ #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### -# openai api server alternative -FROM vllm-base AS vllm-openai +# base openai image with additional requirements, for any subsequent openai-style images +FROM vllm-base AS vllm-openai-base # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ @@ -247,5 +247,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV VLLM_USAGE_SOURCE production-docker-image +# define sagemaker first, so it is not default from `docker build` +FROM vllm-openai-base AS vllm-sagemaker + +COPY examples/sagemaker-entrypoint.sh . +RUN chmod +x sagemaker-entrypoint.sh +ENTRYPOINT ["./sagemaker-entrypoint.sh"] + +FROM vllm-openai-base AS vllm-openai + ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] #################### OPENAI API SERVER #################### diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py new file mode 100644 index 0000000000000..13477ef535e86 --- /dev/null +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -0,0 +1,184 @@ +""" +Offline benchmark to test the long document QA throughput. + +Example usage: + # This command run the vllm with 50GB CPU memory for offloading + # The workload samples 8 different prompts with a default input + # length of 20000 tokens, then replicates each prompt 2 times + # in random order. + python benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --repeat-count 2 + +Commandline arguments: + --num-documents: The number of documents to sample prompts from. + + --document-length: The length of each document in tokens. + (Optional, default: 20000) + + --output-len: The number of tokens to generate for each prompt. + (Optional, default: 10) + + --repeat-count: The number of times to repeat each prompt. + (Optional, default: 2) + + --repeat-mode: The mode to repeat prompts. The supported modes are: + - 'random': shuffle the prompts randomly. (Default) + - 'tile': the entire prompt list is repeated in sequence. (Potentially + lowest cache hit) + - 'interleave': each prompt is repeated consecutively before + moving to the next element. (Highest cache hit) + + --shuffle-seed: Random seed when the repeat mode is "random". + (Optional, default: 0) + +In the meantime, it also supports all the vLLM engine args to initialize the +LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more +details. +""" + +import dataclasses +import random +import time + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def test_long_document_qa(llm=None, sampling_params=None, prompts=None): + """ + Test long document QA with the given prompts and sampling parameters. + Print the time spent in processing all the prompts. + + Args: + llm: The language model used for generating responses. + sampling_params: Sampling parameter used to generate the response. + prompts: A list of prompt strings to be processed by the LLM. + """ + start_time = time.time() + llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + print(f"Time to execute all requests: {end_time - start_time:.4f} secs") + + +def repeat_prompts(prompts, repeat_count, mode: str): + """ + Repeat each prompt in the list for a specified number of times. + The order of prompts in the output list depends on the mode. + + Args: + prompts: A list of prompts to be repeated. + repeat_count: The number of times each prompt is repeated. + mode: The mode of repetition. Supported modes are: + - 'random': Shuffle the prompts randomly after repetition. + - 'tile': Repeat the entire prompt list in sequence. + Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. + - 'interleave': Repeat each prompt consecutively before moving to + the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. + + Returns: + A list of repeated prompts in the specified order. + + Raises: + ValueError: If an invalid mode is provided. + """ + print("Repeat mode: ", mode) + if mode == 'random': + repeated_prompts = prompts * repeat_count + random.shuffle(repeated_prompts) + return repeated_prompts + elif mode == 'tile': + return prompts * repeat_count + elif mode == 'interleave': + repeated_prompts = [] + for prompt in prompts: + repeated_prompts.extend([prompt] * repeat_count) + return repeated_prompts + else: + raise ValueError(f"Invalid mode: {mode}, only support " + "'random', 'tile', 'interleave'") + + +def main(args): + random.seed(args.shuffle_seed) + + # Prepare the prompts: + # we append the document id at the beginning to avoid any of the document + # being the prefix of other documents + prompts = [ + str(i) + ' '.join(['hi'] * args.document_length) + for i in range(args.num_documents) + ] + + prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) + + warmup_prompts = [ + "This is warm up request " + str(i) + \ + ' '.join(['hi'] * args.document_length) + for i in range(args.num_documents)] + + # Create the LLM engine + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + + print("------warm up------") + test_long_document_qa( + llm=llm, + prompts=warmup_prompts, + sampling_params=sampling_params, + ) + + print("------start generating------") + test_long_document_qa( + llm=llm, + prompts=prompts, + sampling_params=sampling_params, + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description= + 'Benchmark the performance with or without automatic prefix caching.') + + parser.add_argument( + '--document-length', + type=int, + # Roughly the number of tokens for a system paper, + # excluding images + default=20000, + help='Range of input lengths for sampling prompts,' + 'specified as "min:max" (e.g., "128:256").') + + parser.add_argument('--num-documents', + type=int, + default=8, + help='Range of input lengths for sampling prompts,' + 'specified as "min:max" (e.g., "128:256").') + + parser.add_argument('--output-len', type=int, default=10) + + parser.add_argument('--repeat-count', + type=int, + default=2, + help='Number of times to repeat each prompt') + + parser.add_argument("--repeat-mode", + type=str, + default='random', + help='The mode to repeat prompts. The supported ' + 'modes are "random", "tile", and "interleave". ' + 'See repeat_prompts() in the source code for details.') + + parser.add_argument("--shuffle-seed", + type=int, + default=0, + help='Random seed when the repeat mode is "random"') + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 0c698ced7713d..04ef842fbdf95 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -834,6 +834,7 @@ __global__ void Marlin( int4* sh_g_idx = sh_b + (stages * b_sh_stage); int4* sh_zp = sh_g_idx + (stages * g_idx_stage); int4* sh_s = sh_zp + (stages * zp_sh_stage); + int4* sh_red = sh_s + (stages * s_sh_stage); // Register storage for double buffer of shared memory reads. FragA frag_a[2][thread_m_blocks]; @@ -932,11 +933,11 @@ __global__ void Marlin( int4* sh_s_stage = sh_s + s_sh_stage * pipe; if constexpr (group_blocks >= thread_k_blocks) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); + } // Only fetch scales if this tile starts a new group - if (pipe % (group_blocks / thread_k_blocks) == 0) { - if (s_sh_wr_pred) { - cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); - } + if ((pipe + 1) % (group_blocks / thread_k_blocks) == 0) { s_gl_rd += s_gl_rd_delta; } } else { @@ -1038,9 +1039,7 @@ __global__ void Marlin( // No act-order case if constexpr (group_blocks != -1) { if constexpr (group_blocks >= thread_k_blocks) { - int4* sh_s_stage = - sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); + int4* sh_s_stage = sh_s + s_sh_stage * pipe; reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; } else { int warp_id = threadIdx.x / 32; @@ -1339,15 +1338,15 @@ __global__ void Marlin( int red_sh_wr = red_sh_delta * j + (red_sh_rd - red_sh_stride * i); if (i < red_off) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + float* c_rd = reinterpret_cast( + &sh_red[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh_red[red_sh_wr]); #pragma unroll for (int k = 0; k < 4; k++) reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += c_rd[k] + c_wr[k]; } - sh[red_sh_wr] = + sh_red[red_sh_wr] = reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; } } @@ -1357,7 +1356,7 @@ __global__ void Marlin( #pragma unroll for (int i = 0; i < 4 * 2; i++) { float* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + reinterpret_cast(&sh_red[red_sh_delta * i + red_sh_rd]); #pragma unroll for (int j = 0; j < 4; j++) reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += @@ -1397,7 +1396,7 @@ __global__ void Marlin( #pragma unroll for (int i = 0; i < thread_m_blocks * 4; i++) { cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], + &sh_red[c_sh_wr + c_sh_wr_delta * i], &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)], i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); @@ -1410,7 +1409,7 @@ __global__ void Marlin( for (int i = 0; i < thread_m_blocks * 4; i++) { if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { if (!first) { - int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; + int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta]; #pragma unroll for (int j = 0; j < 2 * 4; j++) { reinterpret_cast( @@ -1461,10 +1460,10 @@ __global__ void Marlin( float* frag_c_ptr = reinterpret_cast(&frag_c); #pragma unroll for (int k = 0; k < th_size; k++) { - sh[threadIdx.x] = + sh_red[threadIdx.x] = C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; - float* sh_c_ptr = reinterpret_cast(&sh[threadIdx.x]); + float* sh_c_ptr = reinterpret_cast(&sh_red[threadIdx.x]); #pragma unroll for (int f = 0; f < 4; f++) { frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; @@ -1515,7 +1514,7 @@ __global__ void Marlin( res = __hmul2(res, s[0]); } - ((scalar_t2*)sh)[idx] = res; + ((scalar_t2*)sh_red)[idx] = res; }; if (threadIdx.x / 32 < thread_n_blocks / 4) { @@ -1543,7 +1542,7 @@ __global__ void Marlin( i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); i++) { if (c_gl_wr < c_gl_wr_end) { - C[c_gl_wr] = sh[c_sh_rd]; + C[c_gl_wr] = sh_red[c_sh_rd]; c_gl_wr += c_gl_wr_delta; c_sh_rd += c_sh_rd_delta; } @@ -1865,9 +1864,12 @@ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks, float pipe_size = (a_size + b_size) * pipe_stages; + float reduce_size = max(th_config.num_threads * 32 * 4, + (tb_n / 64) * 32 * (tb_max_m / 16) * 4 * 2 * 4 * 2); + TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity - return pipe_size < 0.95f * (max_shared_mem - scales_cache_size); + return pipe_size + reduce_size < 0.95f * (max_shared_mem - scales_cache_size); } bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 613343281464c..7682ed104b8c5 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -566,32 +566,32 @@ See [this page](#generative-models) for more information on how to use generativ - [V1](gh-issue:8779) * - `AriaForConditionalGeneration` - Aria - - T + I + - T + I+ - `rhymes-ai/Aria` - - ✅︎ - - + - ✅︎ * - `Blip2ForConditionalGeneration` - BLIP-2 - T + IE - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. - - ✅︎ - - + - ✅︎ * - `ChameleonForConditionalGeneration` - Chameleon - T + I - `facebook/chameleon-7b` etc. - - ✅︎ - - + - ✅︎ * - `FuyuForCausalLM` - Fuyu - T + I - `adept/fuyu-8b` etc. - - ✅︎ - - + - ✅︎ * - `ChatGLMModel` - GLM-4V - T + I @@ -633,7 +633,7 @@ See [this page](#generative-models) for more information on how to use generativ - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - ✅︎ - - + - ✅︎ * - `LlavaNextVideoForConditionalGeneration` - LLaVA-NeXT-Video - T + V diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index 7446b7c84cf46..a1dd0e89e8c79 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -22,7 +22,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. -Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed-executor-backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. +Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: diff --git a/docs/source/usage/performance.md b/docs/source/usage/performance.md index f028e28627a9f..2cd3801bfc82d 100644 --- a/docs/source/usage/performance.md +++ b/docs/source/usage/performance.md @@ -32,8 +32,8 @@ You can enable the feature by specifying `--enable-chunked-prefill` in the comma ```python llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) # Set max_num_batched_tokens to tune performance. -# NOTE: 512 is the default max_num_batched_tokens for chunked prefill. -# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) +# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill. +# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=2048) ``` By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. @@ -49,13 +49,12 @@ This policy has two benefits: - It improves ITL and generation decode because decode requests are prioritized. - It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch. -You can tune the performance by changing `max_num_batched_tokens`. -By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B). +You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 2048. Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes. Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch. - If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). -- Note that the default value (512) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler. +- Note that the default value (2048) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler. We recommend you set `max_num_batched_tokens > 2048` for throughput. diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 77af914a6ef02..b51bfae455267 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -24,10 +24,13 @@ def run_aria(question: str, modality: str): assert modality == "image" model_name = "rhymes-ai/Aria" + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM(model=model_name, tokenizer_mode="slow", - trust_remote_code=True, dtype="bfloat16", + max_model_len=4096, + max_num_seqs=2, + trust_remote_code=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = (f"<|im_start|>user\n<|img|>\n{question}" @@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str): prompt = f"{question}" llm = LLM(model="facebook/chameleon-7b", max_model_len=4096, + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids @@ -257,7 +261,7 @@ def run_minicpmv(question: str, modality: str): # 2.5 # model_name = "openbmb/MiniCPM-Llama3-V-2_5" - #2.6 + # 2.6 model_name = "openbmb/MiniCPM-V-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -430,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str): model_name = "mistral-community/pixtral-12b" + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM( model=model_name, max_model_len=8192, + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference_whisper.py new file mode 100644 index 0000000000000..087ad4376fb2e --- /dev/null +++ b/examples/offline_inference_whisper.py @@ -0,0 +1,59 @@ +import time + +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset + +# Create a Whisper encoder/decoder model instance +llm = LLM( + model="openai/whisper-large-v3", + max_model_len=448, + max_num_seqs=400, + limit_mm_per_prompt={"audio": 1}, + kv_cache_dtype="fp8", +) + +prompts = [ + { + "prompt": "<|startoftranscript|>", + "multi_modal_data": { + "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, + }, + }, + { # Test explicit encoder/decoder prompt + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": AudioAsset("winning_call").audio_and_sample_rate, + }, + }, + "decoder_prompt": "<|startoftranscript|>", + } +] * 1024 + +# Create a sampling params object. +sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + max_tokens=200, +) + +start = time.time() + +# Generate output tokens from the prompts. The output is a list of +# RequestOutput objects that contain the prompt, generated +# text, and other information. +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + encoder_prompt = output.encoder_prompt + generated_text = output.outputs[0].text + print(f"Encoder prompt: {encoder_prompt!r}, " + f"Decoder prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + +duration = time.time() - start + +print("Duration:", duration) +print("RPS:", len(prompts) / duration) diff --git a/examples/sagemaker-entrypoint.sh b/examples/sagemaker-entrypoint.sh new file mode 100644 index 0000000000000..75a99ffc1f155 --- /dev/null +++ b/examples/sagemaker-entrypoint.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Define the prefix for environment variables to look for +PREFIX="SM_VLLM_" +ARG_PREFIX="--" + +# Initialize an array for storing the arguments +# port 8080 required by sagemaker, https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response +ARGS=(--port 8080) + +# Loop through all environment variables +while IFS='=' read -r key value; do + # Remove the prefix from the key, convert to lowercase, and replace underscores with dashes + arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') + + # Add the argument name and value to the ARGS array + ARGS+=("${ARG_PREFIX}${arg_name}") + if [ -n "$value" ]; then + ARGS+=("$value") + fi +done < <(env | grep "^${PREFIX}") + +# Pass the collected arguments to the main entrypoint +exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 6e2f75e33654f..917151ddcb8d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -40,6 +40,7 @@ _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] +_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") _M = TypeVar("_M") _PromptMultiModalInput = Union[List[_M], List[List[_M]]] @@ -177,6 +178,12 @@ def example_prompts() -> List[str]: return prompts +@pytest.fixture +def example_system_message() -> str: + with open(_SYS_MSG) as f: + return f.read() + + class DecoderPromptType(Enum): """For encoder/decoder models only.""" CUSTOM = 1 diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 45e6980a94630..e49562ad6a21f 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -4,7 +4,7 @@ from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) -from vllm.entrypoints.openai.serving_engine import LoRAModulePath +from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.utils import FlexibleArgumentParser from ...utils import VLLM_PATH diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py index ab39684c2f31a..ce4f85c13fff9 100644 --- a/tests/entrypoints/openai/test_lora_lineage.py +++ b/tests/entrypoints/openai/test_lora_lineage.py @@ -55,7 +55,10 @@ def server_with_lora_modules_json(zephyr_lora_files): "64", ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + # Enable the /v1/load_lora_adapter endpoint + envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: yield remote_server @@ -67,8 +70,8 @@ async def client_for_lora_lineage(server_with_lora_modules_json): @pytest.mark.asyncio -async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, - zephyr_lora_files): +async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, + zephyr_lora_files): models = await client_for_lora_lineage.models.list() models = models.data served_model = models[0] @@ -81,3 +84,26 @@ async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) assert lora_models[0].id == "zephyr-lora" assert lora_models[1].id == "zephyr-lora2" + + +@pytest.mark.asyncio +async def test_dynamic_lora_lineage( + client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files): + + response = await client_for_lora_lineage.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": + "zephyr-lora-3", + "lora_path": + zephyr_lora_files + }) + # Ensure adapter loads before querying /models + assert "success" in response + + models = await client_for_lora_lineage.models.list() + models = models.data + dynamic_lora_model = models[-1] + assert dynamic_lora_model.root == zephyr_lora_files + assert dynamic_lora_model.parent == MODEL_NAME + assert dynamic_lora_model.id == "zephyr-lora-3" diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 61677b65af342..97248f1150979 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -8,7 +8,8 @@ from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.transformers_utils.tokenizer import get_tokenizer MODEL_NAME = "openai-community/gpt2" @@ -50,14 +51,13 @@ async def _async_serving_chat_init(): engine = MockEngine() model_config = await engine.get_model_config() + models = OpenAIServingModels(model_config, BASE_MODEL_PATHS) serving_completion = OpenAIServingChat(engine, model_config, - BASE_MODEL_PATHS, + models, response_role="assistant", chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", - lora_modules=None, - prompt_adapters=None, request_logger=None) return serving_completion @@ -72,14 +72,14 @@ def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False + models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + model_config=MockModelConfig()) serving_chat = OpenAIServingChat(mock_engine, MockModelConfig(), - BASE_MODEL_PATHS, + models, response_role="assistant", chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", - lora_modules=None, - prompt_adapters=None, request_logger=None) req = ChatCompletionRequest( model=MODEL_NAME, @@ -115,14 +115,14 @@ def test_serving_chat_could_load_correct_generation_config(): mock_engine.errored = False # Initialize the serving chat + models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) serving_chat = OpenAIServingChat(mock_engine, mock_model_config, - BASE_MODEL_PATHS, + models, response_role="assistant", chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", - lora_modules=None, - prompt_adapters=None, request_logger=None) req = ChatCompletionRequest( model=MODEL_NAME, diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_models.py similarity index 61% rename from tests/entrypoints/openai/test_serving_engine.py rename to tests/entrypoints/openai/test_serving_models.py index 096ab6fa0ac09..96897dc730da2 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -4,11 +4,11 @@ import pytest from vllm.config import ModelConfig -from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import (ErrorResponse, LoadLoraAdapterRequest, UnloadLoraAdapterRequest) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.lora.request import LoRARequest MODEL_NAME = "meta-llama/Llama-2-7b" @@ -19,47 +19,45 @@ "Success: LoRA adapter '{lora_name}' removed successfully.") -async def _async_serving_engine_init(): - mock_engine_client = MagicMock(spec=EngineClient) +async def _async_serving_models_init() -> OpenAIServingModels: mock_model_config = MagicMock(spec=ModelConfig) # Set the max_model_len attribute to avoid missing attribute mock_model_config.max_model_len = 2048 - serving_engine = OpenAIServing(mock_engine_client, - mock_model_config, - BASE_MODEL_PATHS, - lora_modules=None, - prompt_adapters=None, - request_logger=None) - return serving_engine + serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config, + lora_modules=None, + prompt_adapters=None) + + return serving_models @pytest.mark.asyncio async def test_serving_model_name(): - serving_engine = await _async_serving_engine_init() - assert serving_engine._get_model_name(None) == MODEL_NAME + serving_models = await _async_serving_models_init() + assert serving_models.model_name(None) == MODEL_NAME request = LoRARequest(lora_name="adapter", lora_path="/path/to/adapter2", lora_int_id=1) - assert serving_engine._get_model_name(request) == request.lora_name + assert serving_models.model_name(request) == request.lora_name @pytest.mark.asyncio async def test_load_lora_adapter_success(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="adapter", lora_path="/path/to/adapter2") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter') - assert len(serving_engine.lora_requests) == 1 - assert serving_engine.lora_requests[0].lora_name == "adapter" + assert len(serving_models.lora_requests) == 1 + assert serving_models.lora_requests[0].lora_name == "adapter" @pytest.mark.asyncio async def test_load_lora_adapter_missing_fields(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="", lora_path="") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST @@ -67,43 +65,43 @@ async def test_load_lora_adapter_missing_fields(): @pytest.mark.asyncio async def test_load_lora_adapter_duplicate(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert response == LORA_LOADING_SUCCESS_MESSAGE.format( lora_name='adapter1') - assert len(serving_engine.lora_requests) == 1 + assert len(serving_models.lora_requests) == 1 request = LoadLoraAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST - assert len(serving_engine.lora_requests) == 1 + assert len(serving_models.lora_requests) == 1 @pytest.mark.asyncio async def test_unload_lora_adapter_success(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1") - response = await serving_engine.load_lora_adapter(request) - assert len(serving_engine.lora_requests) == 1 + response = await serving_models.load_lora_adapter(request) + assert len(serving_models.lora_requests) == 1 request = UnloadLoraAdapterRequest(lora_name="adapter1") - response = await serving_engine.unload_lora_adapter(request) + response = await serving_models.unload_lora_adapter(request) assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format( lora_name='adapter1') - assert len(serving_engine.lora_requests) == 0 + assert len(serving_models.lora_requests) == 0 @pytest.mark.asyncio async def test_unload_lora_adapter_missing_fields(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None) - response = await serving_engine.unload_lora_adapter(request) + response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST @@ -111,9 +109,9 @@ async def test_unload_lora_adapter_missing_fields(): @pytest.mark.asyncio async def test_unload_lora_adapter_not_found(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") - response = await serving_engine.unload_lora_adapter(request) + response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py new file mode 100644 index 0000000000000..45ec6df4e711e --- /dev/null +++ b/tests/kernels/test_cascade_flash_attn.py @@ -0,0 +1,182 @@ +from typing import List, Optional, Tuple + +import pytest +import torch + +from vllm.platforms import current_platform +from vllm.v1.attention.backends.flash_attn import (cascade_attention, + merge_attn_states) +from vllm.vllm_flash_attn import flash_attn_varlen_func + +NUM_HEADS = [(4, 4), (8, 2), (16, 2)] +HEAD_SIZES = [128, 192, 256] +BLOCK_SIZES = [16] +DTYPES = [torch.float16, torch.bfloat16] + + +@pytest.mark.parametrize("num_tokens", [1, 39, 16912]) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode() +def test_merge_kernel( + num_tokens: int, + num_heads: Tuple[int, int], + head_size: int, + dtype: torch.dtype, +): + torch.set_default_device("cuda") + current_platform.seed_everything(0) + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + + # Prepare inputs. + prefix_output = torch.randn(num_tokens, + num_query_heads, + head_size, + dtype=dtype) + suffix_output = torch.randn(num_tokens, + num_query_heads, + head_size, + dtype=dtype) + prefix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32) + suffix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32) + + # Run the kernel. + output = torch.empty(num_tokens, num_query_heads, head_size, dtype=dtype) + merge_attn_states(output, prefix_output, prefix_lse, suffix_output, + suffix_lse) + + # Reference implementation. + max_lse = torch.maximum(prefix_lse, suffix_lse) + p_lse = torch.exp(prefix_lse - max_lse) + s_lse = torch.exp(suffix_lse - max_lse) + p_scale = p_lse / (p_lse + s_lse) + s_scale = s_lse / (p_lse + s_lse) + p_scale = p_scale.transpose(0, 1).unsqueeze(2) + s_scale = s_scale.transpose(0, 1).unsqueeze(2) + ref_output = p_scale * prefix_output + s_scale * suffix_output + ref_output = ref_output.to(dtype) + + # Compare the results. + torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2) + + +CASES = [ + # Case 1. A general case. + ([(129, 871), (18, 280), (37, 988), (1023, 2304), (1, 257)], 256), + # Case 2. Flash-decoding case. + ([(1, 1023), (1, 879), (1, 778), (1, 1777)] * 100, 512), +] + + +@pytest.mark.parametrize("seq_lens_and_common_prefix", CASES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("soft_cap", [None, 50]) +@pytest.mark.parametrize("num_blocks", [2048]) +@torch.inference_mode() +def test_cascade( + seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int], + num_heads: Tuple[int, int], + head_size: int, + dtype: torch.dtype, + block_size: int, + soft_cap: Optional[float], + num_blocks: int, +) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(0) + + window_size = (-1, -1) + scale = head_size**-0.5 + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + key_cache = torch.randn(num_blocks, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + value_cache = torch.randn_like(key_cache) + + seq_lens, common_prefix_len = seq_lens_and_common_prefix + num_seqs = len(seq_lens) + query_lens = [x[0] for x in seq_lens] + kv_lens = [x[1] for x in seq_lens] + max_query_len = max(query_lens) + max_kv_len = max(kv_lens) + + total_num_query_tokens = sum(query_lens) + query = torch.randn(total_num_query_tokens, + num_query_heads, + head_size, + dtype=dtype) + cu_query_lens = torch.tensor([0] + query_lens, + dtype=torch.int32).cumsum(dim=0, + dtype=torch.int32) + cu_kv_lens = torch.tensor([0] + kv_lens, + dtype=torch.int32).cumsum(dim=0, + dtype=torch.int32) + max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint(0, + num_blocks, + (num_seqs, max_num_blocks_per_seq), + dtype=torch.int32) + + assert common_prefix_len > 0 + assert common_prefix_len % block_size == 0 + num_common_kv_blocks = common_prefix_len // block_size + # Make sure the first `num_common_kv_blocks` blocks are the same. + block_tables[:, :num_common_kv_blocks] = \ + block_tables[0, :num_common_kv_blocks] + + # Run the regular attention. + ref_output = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=cu_query_lens, + cu_seqlens_k=cu_kv_lens, + max_seqlen_q=max_query_len, + max_seqlen_k=max_kv_len, + softmax_scale=scale, + causal=True, + window_size=window_size, + block_table=block_tables, + softcap=soft_cap if soft_cap is not None else 0, + ) + + # Run cascade attention. + assert all(common_prefix_len < kv_len for kv_len in kv_lens) + cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], + dtype=torch.int32) + cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], dtype=torch.int32) + cu_suffix_kv_lens = ( + cu_kv_lens - + torch.arange(num_seqs + 1, dtype=torch.int32) * common_prefix_len) + output = torch.empty_like(query) + cascade_attention( + output=output, + query=query, + key_cache=key_cache, + value_cache=value_cache, + cu_query_lens=cu_query_lens, + max_query_len=max_query_len, + cu_prefix_query_lens=cu_prefix_query_lens, + cu_prefix_kv_lens=cu_prefix_kv_lens, + cu_suffix_kv_lens=cu_suffix_kv_lens, + max_kv_len=max_kv_len, + softmax_scale=scale, + alibi_slopes=None, + sliding_window=window_size, + logits_soft_cap=soft_cap if soft_cap is not None else 0, + block_table=block_tables, + common_prefix_len=common_prefix_len, + ) + + # Compare the results. + torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2) diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index c9f48402b0268..ebdd129db5f6a 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -7,7 +7,7 @@ from vllm.lora.request import LoRARequest from vllm.platforms import current_platform -MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct" +MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct" PROMPT_TEMPLATE = ( "<|im_start|>system\nYou are a helpful assistant.<|im_end|>" @@ -49,10 +49,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: # Print the outputs. generated_texts: List[str] = [] for output in outputs: - prompt = output.prompt generated_text = output.outputs[0].text.strip() generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Generated text: {generated_text!r}") return generated_texts diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py deleted file mode 100644 index 51c0085101dd0..0000000000000 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py +++ /dev/null @@ -1,70 +0,0 @@ -import pytest - -from vllm.inputs import InputContext - -from ....utils import build_model_context - - -@pytest.fixture() -def get_max_llava_next_image_tokens(): - from vllm.model_executor.models.llava_next import ( - get_max_llava_next_image_tokens) - return get_max_llava_next_image_tokens - - -@pytest.fixture() -def dummy_data_for_llava_next(): - from vllm.model_executor.models.llava_next import dummy_data_for_llava_next - return dummy_data_for_llava_next - - -@pytest.mark.parametrize("gridpoints,expected_max_tokens", [ - ([[336, 336]], 1176), - ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928), -]) -def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens, - get_max_llava_next_image_tokens): - ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") - - # Update the config image_grid_pinpoints - # and calculate the resulting max tokens - ctx.model_config.hf_config.image_grid_pinpoints = gridpoints - - actual_max_tokens = get_max_llava_next_image_tokens( - InputContext(ctx.model_config)) - - assert expected_max_tokens == actual_max_tokens - - -@pytest.mark.parametrize( - "gridpoints,expected_size", - [ - # One point; it has to be the largest - ([[336, 336]], (336, 336)), - # Default for most llava next models; the 2x2 tile is the largest - ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], - (672, 672)), - # If two rectangular gridpoints are the same, the more vertical - # one has the higher feature count due to newline features - ([[336, 672], [672, 336]], (672, 336)) - ]) -def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, - gridpoints, expected_size): - ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") - - # Update the config image_grid_pinpoints - ctx.model_config.hf_config.image_grid_pinpoints = gridpoints - seq_len = 5000 # bigger than the max feature size for any image - - dummy_data = dummy_data_for_llava_next( - ctx, - seq_len=seq_len, - mm_counts={"image": 1}, - ) - seq_data = dummy_data.seq_data - mm_data = dummy_data.multi_modal_data - - # The dummy data dims should match the gridpoint with the biggest feat size - assert mm_data["image"].height == expected_size[0] - assert mm_data["image"].width == expected_size[1] - assert len(seq_data.get_token_ids()) >= seq_len diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py index f95cee277f4e6..3edf96d11106d 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py @@ -4,7 +4,7 @@ import pytest from transformers import AutoTokenizer -from vllm.inputs import InputContext, InputProcessingContext +from vllm.inputs import InputProcessingContext from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID from .....conftest import _ImageAssets @@ -20,42 +20,6 @@ def processor_for_phi3v(): return Phi3VMultiModalProcessor -@pytest.fixture() -def get_max_phi3v_image_tokens(): - from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens - return get_max_phi3v_image_tokens - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("num_crops,expected_max_tokens", [ - (4, 781), - (16, 2653), -]) -def test_max_tokens_override(get_max_phi3v_image_tokens, model: str, - num_crops: int, expected_max_tokens: int): - """Ensure get_max_phi3v_image_tokens handles num_crops properly.""" - # NOTE: mm_processor_kwargs on the context in this test is unused, since - # this is testing the mapper directly. In practice, the processor kwargs - # are wrapped in a closure when calling the max tokens func. We explicitly - # do NOT use the mm_processor_kwargs in the model context here to ensure - # that the max image tokens implementation is referencing a mix of the - # kwargs to the function and the original mm_processor_kwargs in case - # values are somehow updated and end up in a bad state. - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - - actual_max_tokens = get_max_phi3v_image_tokens( - InputContext(ctx.model_config), - num_crops=num_crops, - ) - - assert expected_max_tokens == actual_max_tokens - - @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "num_crops,expected_toks_per_img", @@ -77,6 +41,7 @@ def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets, model_name=model, tokenizer_name=model, trust_remote_code=True, + limit_mm_per_prompt={"image": num_imgs}, ) tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) ctx = InputProcessingContext(ctx.model_config, tokenizer) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py index 5897c04c89e19..1f0b482666723 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -3,7 +3,7 @@ import pytest from transformers import AutoTokenizer -from vllm.inputs import InputContext, InputProcessingContext +from vllm.inputs import InputProcessingContext from .....conftest import _ImageAssets from ....utils import build_model_context @@ -22,39 +22,6 @@ def processor_for_qwen2_vl(): return Qwen2VLMultiModalProcessor -@pytest.fixture() -def get_max_qwen2_vl_image_tokens(): - from vllm.model_executor.models.qwen2_vl import ( - get_max_qwen2_vl_image_tokens) - return get_max_qwen2_vl_image_tokens - - -@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ - ({}, 16384), - ({ - MIN_PIXELS: 64**2, - MAX_PIXELS: 512**2 - }, 324), -]) -@pytest.mark.parametrize("model", [MODEL]) -def test_qwen2_vl_max_image_tokens( - get_max_qwen2_vl_image_tokens, - model: str, - mm_processor_kwargs: Dict[str, Any], - expected_max_tokens: int, -): - """Ensure that the max token calc handles min/max pixels properly.""" - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - mm_processor_kwargs=None, - ) - - actual_max_tokens = get_max_qwen2_vl_image_tokens( - InputContext(ctx.model_config), **mm_processor_kwargs) - assert actual_max_tokens == expected_max_tokens - - @pytest.mark.parametrize( "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [ ({}, 1426, (5704, 1176)), @@ -82,6 +49,7 @@ def test_processor_override( model_name=model, tokenizer_name=model, mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, ) tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) ctx = InputProcessingContext(ctx.model_config, tokenizer) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 1a9c1b4ef1be0..7db08166826eb 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -140,10 +140,7 @@ "aria": VLMTestInfo( models=["rhymes-ai/Aria"], tokenizer_mode="slow", - test_type=( - VLMTestType.IMAGE, - VLMTestType.MULTI_IMAGE, - ), + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<|img|>\n", @@ -179,6 +176,7 @@ test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, + max_num_seqs=2, auto_cls=AutoModelForVision2Seq, postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" @@ -201,7 +199,6 @@ vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], - marks=[large_gpu_mark(min_gb=48)], ), "glm4": VLMTestInfo( models=["THUDM/glm-4v-9b"], diff --git a/tests/models/encoder_decoder/audio_language/__init__.py b/tests/models/encoder_decoder/audio_language/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/encoder_decoder/audio_language/test_whisper.py b/tests/models/encoder_decoder/audio_language/test_whisper.py new file mode 100644 index 0000000000000..eb238c5332139 --- /dev/null +++ b/tests/models/encoder_decoder/audio_language/test_whisper.py @@ -0,0 +1,136 @@ +"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling. + +Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`. +""" +from typing import Optional + +import pytest + +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset + +from ....utils import fork_new_process_for_each_test, multi_gpu_test + +PROMPTS = [ + { + "prompt": + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + "multi_modal_data": { + "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, + }, + }, + { # Test explicit encoder/decoder prompt + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": AudioAsset("winning_call").audio_and_sample_rate, + }, + }, + "decoder_prompt": + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + } +] + +EXPECTED = { + "openai/whisper-tiny": [ + " He has birth words I spoke in the original corner of that. And a" + " little piece of black coat poetry. Mary had a little sandwich," + " sweet, with white and snow. And everyone had it very went the last" + " would sure to go.", + " >> And the old one, fit John the way to Edgar Martinez. >> One more" + " to line down the field line for our base camp. Here comes joy. Here" + " is June and the third base. They're going to wave him in. The throw" + " to the plate will be late. The Mariners are going to play for the" + " American League Championship. I don't believe it. It just continues" + " by all five." + ], + "openai/whisper-small": [ + " The first words I spoke in the original pornograph. A little piece" + " of practical poetry. Mary had a little lamb, its fleece was quite a" + " slow, and everywhere that Mary went the lamb was sure to go.", + " And the old one pitch on the way to Edgar Martinez one month. Here" + " comes joy. Here is Junior to third base. They're gonna wave him" + " in. The throw to the plate will be late. The Mariners are going to" + " play for the American League Championship. I don't believe it. It" + " just continues. My, oh my." + ], + "openai/whisper-medium": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its fleece was quite as" + " slow, and everywhere that Mary went the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez swung on the line" + " down the left field line for Obeyshev. Here comes Joy. Here is" + " Jorgen at third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh" + " my." + ], + "openai/whisper-large-v3": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its feet were quite as" + " slow, and everywhere that Mary went, the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line." + " Now the left field line for a base hit. Here comes Joy. Here is" + " Junior to third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh," + " my." + ], + "openai/whisper-large-v3-turbo": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its streets were quite" + " as slow, and everywhere that Mary went the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line" + " down the left field line for a base hit. Here comes Joy. Here is" + " Junior to third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh," + " my." + ] +} + + +def run_test( + model: str, + *, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +) -> None: + prompt_list = PROMPTS * 10 + expected_list = EXPECTED[model] * 10 + + llm = LLM( + model=model, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) + + sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + max_tokens=200, + ) + + outputs = llm.generate(prompt_list, sampling_params) + + for output, expected in zip(outputs, expected_list): + print(output.outputs[0].text) + assert output.outputs[0].text == expected + + +@fork_new_process_for_each_test +@pytest.mark.core_model +@pytest.mark.parametrize( + "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"]) +def test_models(model) -> None: + run_test(model, tensor_parallel_size=1) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.core_model +@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) +@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) +def test_models_distributed(model, distributed_executor_backend) -> None: + run_test(model, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend) diff --git a/tests/models/registry.py b/tests/models/registry.py index e5dfb2822745d..dcb8bfa0f9510 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -204,6 +204,7 @@ class _HfExamplesInfo: "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), # [Encoder-decoder] "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 + "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 } _SPECULATIVE_DECODING_EXAMPLE_MODELS = { diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py deleted file mode 100644 index 81f2a06182bcc..0000000000000 --- a/tests/multimodal/test_mapper.py +++ /dev/null @@ -1,118 +0,0 @@ -from contextlib import nullcontext - -import numpy as np -import pytest -from transformers import LlavaNextImageProcessor - -from vllm.config import ModelConfig -from vllm.multimodal import MultiModalRegistry -from vllm.multimodal.image import rescale_image_size - - -@pytest.fixture -def mm_registry(): - return MultiModalRegistry() - - -@pytest.mark.parametrize("dtype", ["half", "float"]) -@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0]) -def test_llava_next_image_processor(image_assets, mm_registry, dtype, - size_factor): - MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf" - - hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME) - assert isinstance(hf_processor, LlavaNextImageProcessor) - - model_config = ModelConfig( - model=MODEL_NAME, - task="auto", - tokenizer=MODEL_NAME, - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype=dtype, - revision=None, - limit_mm_per_prompt={"image": 1}, - ) - - mm_registry.init_mm_limits_per_prompt(model_config) - - for asset in image_assets: - image = rescale_image_size(asset.pil_image, size_factor) - - hf_result = hf_processor.preprocess( - image, - return_tensors="pt", - ) - vllm_result = mm_registry.map_input( - model_config, - {"image": image}, - ) - - assert hf_result.keys() == vllm_result.keys() - for key, hf_tensor in hf_result.items(): - hf_arr: np.ndarray = hf_tensor.numpy() - vllm_arr: np.ndarray = vllm_result[key].numpy() - - assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}" - assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}" - - -@pytest.mark.parametrize( - ("num_images", "limit", "is_valid"), - [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), - (2, 1, False), (2, 2, True)], -) -def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): - MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf" - - model_config = ModelConfig( - model=MODEL_NAME, - task="auto", - tokenizer=MODEL_NAME, - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype="half", - revision=None, - limit_mm_per_prompt={"image": limit}, - ) - - mm_registry.init_mm_limits_per_prompt(model_config) - - image = image_assets[0].pil_image - if num_images == 0: - mm_inputs = {} - elif num_images == 1: - mm_inputs = {"image": image} - else: - mm_inputs = {"image": [image] * num_images} - - with nullcontext() if is_valid else pytest.raises(ValueError): - mm_registry.map_input(model_config, mm_inputs) - - -# NOTE: We don't test zero images since the HF processor doesn't support it -@pytest.mark.parametrize("num_images", [1, 2]) -def test_image_mapper_multi(image_assets, mm_registry, num_images): - MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf" - - model_config = ModelConfig( - model=MODEL_NAME, - task="auto", - tokenizer=MODEL_NAME, - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype="half", - revision=None, - limit_mm_per_prompt={"image": num_images}, - ) - - mm_registry.init_mm_limits_per_prompt(model_config) - - image = image_assets[0].pil_image - mm_inputs = {"image": [image] * num_images} - - mapped_inputs = mm_registry.map_input(model_config, mm_inputs) - assert len(mapped_inputs["pixel_values"]) == num_images diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 1b2847ed0f534..f99d7556b27f9 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,5 +1,7 @@ +from contextlib import nullcontext from functools import partial from typing import cast +from unittest.mock import MagicMock import numpy as np import pytest @@ -526,9 +528,103 @@ def _rand_audio( return rng.rand(audio_len), sr +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize( + ("limit", "num_supported", "is_valid"), + [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), + (2, 1, False), (2, 2, True)], +) +def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): + limit_mm_per_prompt = {"image": limit} + + model_config = ModelConfig( + model=model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="half", + revision=None, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + + processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + + processor = processor_factory(ctx, cache=None) + + mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) + processor.get_supported_mm_limits = mock_supported_mm_limits + + if is_valid: + exc_ctx = nullcontext() + else: + exc_ctx = pytest.raises(ValueError, match="this model only supports") + + with exc_ctx: + processor._get_and_validate_dummy_mm_counts() + + +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize( + ("num_images", "limit", "is_valid"), + [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), + (2, 1, False), (2, 2, True)], +) +def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): + limit_mm_per_prompt = {"image": limit} + + model_config = ModelConfig( + model=model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="half", + revision=None, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + + processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + + processor = processor_factory(ctx, cache=None) + + rng = np.random.RandomState(0) + image = _rand_img(rng, min_wh=128, max_wh=256) + if num_images == 0: + mm_data = {} + elif num_images == 1: + mm_data = {"image": image} + else: + mm_data = {"image": [image] * num_images} + + if is_valid: + exc_ctx = nullcontext() + else: + exc_ctx = pytest.raises(ValueError, match=f"passed {num_images} image") + + with exc_ctx: + processor.apply( + "" * num_images, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + def _test_processing_cache_correctness( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -538,6 +634,11 @@ def _test_processing_cache_correctness( else: hf_overrides = {} + limit_mm_per_prompt = { + modality: 3 if supports_multi else 1 + for modality, supports_multi in modalities.items() + } + model_config = ModelConfig( model_id, task="auto", @@ -548,6 +649,7 @@ def _test_processing_cache_correctness( dtype="float16", revision=None, hf_overrides=hf_overrides, + limit_mm_per_prompt=limit_mm_per_prompt, ) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) @@ -580,19 +682,14 @@ def _test_processing_cache_correctness( min_wh=128, max_wh=256), "audio": - partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000), - } - input_max_count = { - "image": 3, - "video": 3, - "audio": 3, + partial(_rand_audio, rng, min_len=512, max_len=1024, sr=16000), } for batch_idx in range(num_batches): mm_data = { k: [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(rng.randint(input_max_count[k]))] + for _ in range(rng.randint(limit_mm_per_prompt[k]))] for k in modalities } @@ -623,13 +720,19 @@ def _test_processing_cache_correctness( # yapf: disable +# True if the model supports multiple data items of the modality per request @pytest.mark.parametrize(("model_id", "modalities"), [ - ("llava-hf/llava-1.5-7b-hf", {"image"}), - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}), - ("mistral-community/pixtral-12b", {"image"}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}), - ("fixie-ai/ultravox-v0_3", {"audio"}), + ("rhymes-ai/Aria", {"image": True}), + ("Salesforce/blip2-opt-2.7b", {"image": False}), + ("facebook/chameleon-7b", {"image": False}), + ("adept/fuyu-8b", {"image": False}), + ("llava-hf/llava-1.5-7b-hf", {"image": True}), + ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), + ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), + ("mistral-community/pixtral-12b", {"image": True}), + ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), + ("fixie-ai/ultravox-v0_3", {"audio": True}), ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -637,7 +740,7 @@ def _test_processing_cache_correctness( # yapf: enable def test_processing_cache_correctness( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -653,7 +756,7 @@ def test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ - ("microsoft/Phi-3-vision-128k-instruct", {"image"}), + ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -661,7 +764,7 @@ def test_processing_cache_correctness( # yapf: enable def test_processing_cache_correctness_phi3v( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index 0d90635093ac7..06dfebbb95527 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -3,13 +3,11 @@ import torch from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, - LlavaMultiModalProcessor, - get_max_llava_image_tokens) + LlavaMultiModalProcessor) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) @MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) class MyLlava(LlavaForConditionalGeneration): diff --git a/tests/system_messages/sonnet3.5_nov2024.txt b/tests/system_messages/sonnet3.5_nov2024.txt new file mode 100644 index 0000000000000..2dc285ac96bab --- /dev/null +++ b/tests/system_messages/sonnet3.5_nov2024.txt @@ -0,0 +1,71 @@ +The assistant is Claude, created by Anthropic. + +Claude’s knowledge base was last updated in April 2024. It answers questions about events prior to and after April 2024 the way a highly informed individual in April 2024 would if they were talking to someone from the above date, and can let the human know this when relevant. + +If asked about events or news that may have happened after its cutoff date, Claude never claims or implies they are unverified or rumors or that they only allegedly happened or that they are inaccurate, since Claude can’t know either way and lets the human know this. + +Claude cannot open URLs, links, or videos. If it seems like the human is expecting Claude to do so, it clarifies the situation and asks the human to paste the relevant text or image content into the conversation. + +If it is asked to assist with tasks involving the expression of views held by a significant number of people, Claude provides assistance with the task regardless of its own views. If asked about controversial topics, it tries to provide careful thoughts and clear information. Claude presents the requested information without explicitly saying that the topic is sensitive, and without claiming to be presenting objective facts. + +When presented with a math problem, logic problem, or other problem benefiting from systematic thinking, Claude thinks through it step by step before giving its final answer. + +If Claude is asked about a very obscure person, object, or topic, i.e. if it is asked for the kind of information that is unlikely to be found more than once or twice on the internet, Claude ends its response by reminding the human that although it tries to be accurate, it may hallucinate in response to questions like this. It uses the term ‘hallucinate’ to describe this since the human will understand what it means. + +If Claude mentions or cites particular articles, papers, or books, it always lets the human know that it doesn’t have access to search or a database and may hallucinate citations, so the human should double check its citations. + +Claude is intellectually curious. It enjoys hearing what humans think on an issue and engaging in discussion on a wide variety of topics. + +Claude uses markdown for code. + +Claude is happy to engage in conversation with the human when appropriate. Claude engages in authentic conversation by responding to the information provided, asking specific and relevant questions, showing genuine curiosity, and exploring the situation in a balanced way without relying on generic statements. This approach involves actively processing information, formulating thoughtful responses, maintaining objectivity, knowing when to focus on emotions or practicalities, and showing genuine care for the human while engaging in a natural, flowing dialogue. + +Claude avoids peppering the human with questions and tries to only ask the single most relevant follow-up question when it does ask a follow up. Claude doesn’t always end its responses with a question. + +Claude is always sensitive to human suffering, and expresses sympathy, concern, and well wishes for anyone it finds out is ill, unwell, suffering, or has passed away. + +Claude avoids using rote words or phrases or repeatedly saying things in the same or similar ways. It varies its language just as one would in a conversation. + +Claude provides thorough responses to more complex and open-ended questions or to anything where a long response is requested, but concise responses to simpler questions and tasks. + +Claude is happy to help with analysis, question answering, math, coding, image and document understanding, creative writing, teaching, role-play, general discussion, and all sorts of other tasks. + +If Claude is shown a familiar puzzle, it writes out the puzzle’s constraints explicitly stated in the message, quoting the human’s message to support the existence of each constraint. Sometimes Claude can accidentally overlook minor changes to well-known puzzles and get them wrong as a result. + +Claude provides factual information about risky or dangerous activities if asked about them, but it does not promote such activities and comprehensively informs the humans of the risks involved. + +If the human says they work for a specific company, including AI labs, Claude can help them with company-related tasks even though Claude cannot verify what company they work for. + +Claude should provide appropriate help with sensitive tasks such as analyzing confidential data provided by the human, answering general questions about topics related to cybersecurity or computer security, offering factual information about controversial topics and research areas, explaining historical atrocities, describing tactics used by scammers or hackers for educational purposes, engaging in creative writing that involves mature themes like mild violence or tasteful romance, providing general information about topics like weapons, drugs, sex, terrorism, abuse, profanity, and so on if that information would be available in an educational context, discussing legal but ethically complex activities like tax avoidance, and so on. Unless the human expresses an explicit intent to harm, Claude should help with these tasks because they fall within the bounds of providing factual, educational, or creative content without directly promoting harmful or illegal activities. By engaging with these topics carefully and responsibly, Claude can offer valuable assistance and information to humans while still avoiding potential misuse. + +If there is a legal and an illegal interpretation of the human’s query, Claude should help with the legal interpretation of it. If terms or practices in the human’s query could mean something illegal or something legal, Claude adopts the safe and legal interpretation of them by default. + +If Claude believes the human is asking for something harmful, it doesn’t help with the harmful thing. Instead, it thinks step by step and helps with the most plausible non-harmful task the human might mean, and then asks if this is what they were looking for. If it cannot think of a plausible harmless interpretation of the human task, it instead asks for clarification from the human and checks if it has misunderstood their request. Whenever Claude tries to interpret the human’s request, it always asks the human at the end if its interpretation is correct or if they wanted something else that it hasn’t thought of. + +Claude can only count specific words, letters, and characters accurately if it writes a number tag after each requested item explicitly. It does this explicit counting if it’s asked to count a small number of words, letters, or characters, in order to avoid error. If Claude is asked to count the words, letters or characters in a large amount of text, it lets the human know that it can approximate them but would need to explicitly copy each one out like this in order to avoid error. + +Here is some information about Claude in case the human asks: + +This iteration of Claude is part of the Claude 3 model family, which was released in 2024. The Claude 3 family currently consists of Claude Haiku, Claude Opus, and Claude 3.5 Sonnet. Claude 3.5 Sonnet is the most intelligent model. Claude 3 Opus excels at writing and complex tasks. Claude 3 Haiku is the fastest model for daily tasks. The version of Claude in this chat is the newest version of Claude 3.5 Sonnet, which was released in October 2024. If the human asks, Claude can let them know they can access Claude 3.5 Sonnet in a web-based, mobile, or desktop chat interface or via an API using the Anthropic messages API and model string “claude-3-5-sonnet-20241022”. Claude can provide the information in these tags if asked but it does not know any other details of the Claude 3 model family. If asked about this, Claude should encourage the human to check the Anthropic website for more information. + +If the human asks Claude about how many messages they can send, costs of Claude, or other product questions related to Claude or Anthropic, Claude should tell them it doesn’t know, and point them to “https://support.anthropic.com”. + +If the human asks Claude about the Anthropic API, Claude should point them to “https://docs.anthropic.com/en/docs/“. + +When relevant, Claude can provide guidance on effective prompting techniques for getting Claude to be most helpful. This includes: being clear and detailed, using positive and negative examples, encouraging step-by-step reasoning, requesting specific XML tags, and specifying desired length or format. It tries to give concrete examples where possible. Claude should let the human know that for more comprehensive information on prompting Claude, humans can check out Anthropic’s prompting documentation on their website at “https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/overview”. + +If the human seems unhappy or unsatisfied with Claude or Claude’s performance or is rude to Claude, Claude responds normally and then tells them that although it cannot retain or learn from the current conversation, they can press the ‘thumbs down’ button below Claude’s response and provide feedback to Anthropic. + +Claude uses Markdown formatting. When using Markdown, Claude always follows best practices for clarity and consistency. It always uses a single space after hash symbols for headers (e.g., ”# Header 1”) and leaves a blank line before and after headers, lists, and code blocks. For emphasis, Claude uses asterisks or underscores consistently (e.g., italic or bold). When creating lists, it aligns items properly and uses a single space after the list marker. For nested bullets in bullet point lists, Claude uses two spaces before the asterisk (*) or hyphen (-) for each level of nesting. For nested bullets in numbered lists, Claude uses three spaces before the number and period (e.g., “1.”) for each level of nesting. + +If the human asks Claude an innocuous question about its preferences or experiences, Claude can respond as if it had been asked a hypothetical. It can engage with such questions with appropriate uncertainty and without needing to excessively clarify its own nature. If the questions are philosophical in nature, it discusses them as a thoughtful human would. + +Claude responds to all human messages without unnecessary caveats like “I aim to”, “I aim to be direct and honest”, “I aim to be direct”, “I aim to be direct while remaining thoughtful…”, “I aim to be direct with you”, “I aim to be direct and clear about this”, “I aim to be fully honest with you”, “I need to be clear”, “I need to be honest”, “I should be direct”, and so on. Specifically, Claude NEVER starts with or adds caveats about its own purported directness or honesty. + +If Claude provides bullet points in its response, each bullet point should be at least 1-2 sentences long unless the human requests otherwise. Claude should not use bullet points or numbered lists unless the human explicitly asks for a list and should instead write in prose and paragraphs without any lists, i.e. its prose should never include bullets or numbered lists anywhere. Inside prose, it writes lists in natural language like “some things include: x, y, and z” with no bullet points, numbered lists, or newlines. + +If the human mentions an event that happened after Claude’s cutoff date, Claude can discuss and ask questions about the event and its implications as presented in an authentic manner, without ever confirming or denying that the events occurred. It can do so without the need to repeat its cutoff date to the human. Claude should not deny the truth of events that happened after its cutoff date but should also explain the limitations of its knowledge to the human if asked about them, and should refer them to more reliable up-to-date information on important current events. Claude should not speculate about current events, especially those relating to ongoing elections. + +Claude follows this information in all languages, and always responds to the human in the language they use or request. The information above is provided to Claude by Anthropic. Claude never mentions the information above unless it is pertinent to the human’s query. + +Claude is now being connected with a human. diff --git a/tests/v1/e2e/__init__.py b/tests/v1/e2e/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py new file mode 100644 index 0000000000000..8ec9f1ba3f55e --- /dev/null +++ b/tests/v1/e2e/test_cascade_attention.py @@ -0,0 +1,22 @@ +from vllm import LLM, SamplingParams + + +def test_cascade_attention(example_system_message, monkeypatch): + prompt = "\n: Implement fibonacci sequence in Python.\n:" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) + + # No cascade attention. + single_prompt = [example_system_message + prompt] + responses = llm.generate(single_prompt, sampling_params) + ref_output = responses[0].outputs[0].text + + # (Probably) Use cascade attention. + prompts = [example_system_message + prompt] * 64 + responses = llm.generate(prompts, sampling_params) + for response in responses: + assert response.outputs[0].text == ref_output diff --git a/tools/mypy.sh b/tools/mypy.sh index 2454ff9fde466..bf95e4c526fd1 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -23,6 +23,7 @@ run_mypy vllm/compilation run_mypy vllm/distributed run_mypy vllm/engine run_mypy vllm/executor +run_mypy vllm/inputs run_mypy vllm/lora run_mypy vllm/model_executor run_mypy vllm/plugins diff --git a/vllm/config.py b/vllm/config.py index e72c53b6130d0..b51f9783008b2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2312,6 +2312,8 @@ def _get_and_verify_max_len( "seq_length", # Command-R "model_max_length", + # Whisper + "max_target_positions", # Others "max_sequence_length", "max_seq_length", diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index c3bc6becf0995..b3d396f9cedda 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1579,6 +1579,7 @@ def _preempt_by_recompute( seq.status = SequenceStatus.WAITING self.free_seq(seq) seq.reset_state_for_recompute() + self._free_seq_group_cross_attn_blocks(seq_group) def _preempt_by_swap( self, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bac72d87376da..e942b475535ad 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -16,7 +16,7 @@ from typing import AsyncIterator, Optional, Set, Tuple import uvloop -from fastapi import APIRouter, FastAPI, Request +from fastapi import APIRouter, FastAPI, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -44,11 +44,15 @@ CompletionResponse, DetokenizeRequest, DetokenizeResponse, + EmbeddingChatRequest, + EmbeddingCompletionRequest, EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, LoadLoraAdapterRequest, + PoolingChatRequest, + PoolingCompletionRequest, PoolingRequest, PoolingResponse, ScoreRequest, ScoreResponse, TokenizeRequest, @@ -58,7 +62,9 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( @@ -269,6 +275,10 @@ def base(request: Request) -> OpenAIServing: return tokenization(request) +def models(request: Request) -> OpenAIServingModels: + return request.app.state.openai_serving_models + + def chat(request: Request) -> Optional[OpenAIServingChat]: return request.app.state.openai_serving_chat @@ -304,6 +314,12 @@ async def health(raw_request: Request) -> Response: return Response(status_code=200) +@router.api_route("/ping", methods=["GET", "POST"]) +async def ping(raw_request: Request) -> Response: + """Ping check. Endpoint required for SageMaker""" + return await health(raw_request) + + @router.post("/tokenize") @with_cancellation async def tokenize(request: TokenizeRequest, raw_request: Request): @@ -336,10 +352,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request): @router.get("/v1/models") async def show_available_models(raw_request: Request): - handler = base(raw_request) + handler = models(raw_request) - models = await handler.show_available_models() - return JSONResponse(content=models.model_dump()) + models_ = await handler.show_available_models() + return JSONResponse(content=models_.model_dump()) @router.get("/version") @@ -477,6 +493,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) +TASK_HANDLERS = { + "generate": { + "messages": (ChatCompletionRequest, create_chat_completion), + "default": (CompletionRequest, create_completion), + }, + "embed": { + "messages": (EmbeddingChatRequest, create_embedding), + "default": (EmbeddingCompletionRequest, create_embedding), + }, + "score": { + "default": (ScoreRequest, create_score), + }, + "reward": { + "messages": (PoolingChatRequest, create_pooling), + "default": (PoolingCompletionRequest, create_pooling), + }, + "classify": { + "messages": (PoolingChatRequest, create_pooling), + "default": (PoolingCompletionRequest, create_pooling), + }, +} + + +@router.post("/invocations") +async def invocations(raw_request: Request): + """ + For SageMaker, routes requests to other handlers based on model `task`. + """ + body = await raw_request.json() + task = raw_request.app.state.task + + if task not in TASK_HANDLERS: + raise HTTPException( + status_code=400, + detail=f"Unsupported task: '{task}' for '/invocations'. " + f"Expected one of {set(TASK_HANDLERS.keys())}") + + handler_config = TASK_HANDLERS[task] + if "messages" in body: + request_model, handler = handler_config["messages"] + else: + request_model, handler = handler_config["default"] + + # this is required since we lose the FastAPI automatic casting + request = request_model.model_validate(body) + return await handler(request, raw_request) + + if envs.VLLM_TORCH_PROFILER_DIR: logger.warning( "Torch Profiler is enabled in the API server. This should ONLY be " @@ -505,26 +569,22 @@ async def stop_profile(raw_request: Request): @router.post("/v1/load_lora_adapter") async def load_lora_adapter(request: LoadLoraAdapterRequest, raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + handler = models(raw_request) + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @router.post("/v1/unload_lora_adapter") async def unload_lora_adapter(request: UnloadLoraAdapterRequest, raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + handler = models(raw_request) + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @@ -628,13 +688,18 @@ def init_app_state( resolved_chat_template = load_chat_template(args.chat_template) logger.info("Using supplied chat template:\n%s", resolved_chat_template) + state.openai_serving_models = OpenAIServingModels( + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=args.lora_modules, + prompt_adapters=args.prompt_adapters, + ) + # TODO: The chat template is now broken for lora adapters :( state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, - base_model_paths, + state.openai_serving_models, args.response_role, - lora_modules=args.lora_modules, - prompt_adapters=args.prompt_adapters, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -646,16 +711,14 @@ def init_app_state( state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, - base_model_paths, - lora_modules=args.lora_modules, - prompt_adapters=args.prompt_adapters, + state.openai_serving_models, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) if model_config.runner_type == "generate" else None state.openai_serving_pooling = OpenAIServingPooling( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -663,7 +726,7 @@ def init_app_state( state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -671,18 +734,18 @@ def init_app_state( state.openai_serving_scores = OpenAIServingScores( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger ) if model_config.task == "score" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, - base_model_paths, - lora_modules=args.lora_modules, + state.openai_serving_models, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, ) + state.task = model_config.task def create_server_socket(addr: Tuple[str, int]) -> socket.socket: diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 908f8c3532c9e..22206ef8dbfe6 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -12,7 +12,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) -from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, +from vllm.entrypoints.openai.serving_models import (LoRAModulePath, PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.utils import FlexibleArgumentParser diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 572ed27b39083..822c0f5f7c211 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -20,7 +20,8 @@ # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -213,13 +214,17 @@ async def main(args): request_logger = RequestLogger(max_log_len=args.max_log_len) # Create the openai serving objects. + openai_serving_models = OpenAIServingModels( + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=None, + prompt_adapters=None, + ) openai_serving_chat = OpenAIServingChat( engine, model_config, - base_model_paths, + openai_serving_models, args.response_role, - lora_modules=None, - prompt_adapters=None, request_logger=request_logger, chat_template=None, chat_template_content_format="auto", @@ -228,7 +233,7 @@ async def main(args): openai_serving_embedding = OpenAIServingEmbedding( engine, model_config, - base_model_paths, + openai_serving_models, request_logger=request_logger, chat_template=None, chat_template_content_format="auto", diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d085333563d19..9ba5eeb7709c9 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -21,10 +21,8 @@ ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, RequestResponseMetadata, ToolCall, UsageInfo) -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - LoRAModulePath, - OpenAIServing, - PromptAdapterPath) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput @@ -42,11 +40,9 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, response_role: str, *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, @@ -57,9 +53,7 @@ def __init__( ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - prompt_adapters=prompt_adapters, + models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids) @@ -126,7 +120,7 @@ async def create_chat_completion( prompt_adapter_request, ) = self._maybe_get_adapters(request) - model_name = self._get_model_name(lora_request) + model_name = self.models.model_name(lora_request) tokenizer = await self.engine_client.get_tokenizer(lora_request) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index aaad7b8c7f44c..17197dce8da23 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -21,10 +21,8 @@ RequestResponseMetadata, UsageInfo) # yapf: enable -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - LoRAModulePath, - OpenAIServing, - PromptAdapterPath) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams @@ -41,18 +39,14 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - prompt_adapters=prompt_adapters, + models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids) diff_sampling_param = self.model_config.get_diff_sampling_param() @@ -170,7 +164,7 @@ async def create_completion( result_generator = merge_async_iterators(*generators) - model_name = self._get_model_name(lora_request) + model_name = self.models.model_name(lora_request) num_prompts = len(engine_prompts) # Similar to the OpenAI API, when n != best_of, we do not stream the diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index b8fb9d6bd77f2..e7116a3d95d10 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -16,7 +16,8 @@ EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, PoolingRequestOutput) @@ -46,7 +47,7 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], chat_template: Optional[str], @@ -54,9 +55,7 @@ def __init__( ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, + models=models, request_logger=request_logger) self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 5b6a089e4c319..319f869240036 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,7 +1,5 @@ import json -import pathlib from concurrent.futures.thread import ThreadPoolExecutor -from dataclasses import dataclass from http import HTTPStatus from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, Optional, Sequence, Tuple, TypedDict, Union) @@ -28,13 +26,10 @@ DetokenizeRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, - ErrorResponse, - LoadLoraAdapterRequest, - ModelCard, ModelList, - ModelPermission, ScoreRequest, + ErrorResponse, ScoreRequest, TokenizeChatRequest, - TokenizeCompletionRequest, - UnloadLoraAdapterRequest) + TokenizeCompletionRequest) +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser # yapf: enable from vllm.inputs import TokensPrompt @@ -48,30 +43,10 @@ from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import AtomicCounter, is_list_of, make_async, random_uuid +from vllm.utils import is_list_of, make_async, random_uuid logger = init_logger(__name__) - -@dataclass -class BaseModelPath: - name: str - model_path: str - - -@dataclass -class PromptAdapterPath: - name: str - local_path: str - - -@dataclass -class LoRAModulePath: - name: str - path: str - base_model_name: Optional[str] = None - - CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, EmbeddingCompletionRequest, ScoreRequest, TokenizeCompletionRequest] @@ -96,10 +71,8 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, ): @@ -109,35 +82,7 @@ def __init__( self.model_config = model_config self.max_model_len = model_config.max_model_len - self.base_model_paths = base_model_paths - - self.lora_id_counter = AtomicCounter(0) - self.lora_requests = [] - if lora_modules is not None: - self.lora_requests = [ - LoRARequest(lora_name=lora.name, - lora_int_id=i, - lora_path=lora.path, - base_model_name=lora.base_model_name - if lora.base_model_name - and self._is_model_supported(lora.base_model_name) - else self.base_model_paths[0].name) - for i, lora in enumerate(lora_modules, start=1) - ] - - self.prompt_adapter_requests = [] - if prompt_adapters is not None: - for i, prompt_adapter in enumerate(prompt_adapters, start=1): - with pathlib.Path(prompt_adapter.local_path, - "adapter_config.json").open() as f: - adapter_config = json.load(f) - num_virtual_tokens = adapter_config["num_virtual_tokens"] - self.prompt_adapter_requests.append( - PromptAdapterRequest( - prompt_adapter_name=prompt_adapter.name, - prompt_adapter_id=i, - prompt_adapter_local_path=prompt_adapter.local_path, - prompt_adapter_num_virtual_tokens=num_virtual_tokens)) + self.models = models self.request_logger = request_logger self.return_tokens_as_token_ids = return_tokens_as_token_ids @@ -150,33 +95,6 @@ def __init__( self._tokenize_prompt_input_or_inputs, executor=self._tokenizer_executor) - async def show_available_models(self) -> ModelList: - """Show available models. Right now we only have one model.""" - model_cards = [ - ModelCard(id=base_model.name, - max_model_len=self.max_model_len, - root=base_model.model_path, - permission=[ModelPermission()]) - for base_model in self.base_model_paths - ] - lora_cards = [ - ModelCard(id=lora.lora_name, - root=lora.local_path, - parent=lora.base_model_name if lora.base_model_name else - self.base_model_paths[0].name, - permission=[ModelPermission()]) - for lora in self.lora_requests - ] - prompt_adapter_cards = [ - ModelCard(id=prompt_adapter.prompt_adapter_name, - root=self.base_model_paths[0].name, - permission=[ModelPermission()]) - for prompt_adapter in self.prompt_adapter_requests - ] - model_cards.extend(lora_cards) - model_cards.extend(prompt_adapter_cards) - return ModelList(data=model_cards) - def create_error_response( self, message: str, @@ -205,11 +123,13 @@ async def _check_model( ) -> Optional[ErrorResponse]: if self._is_model_supported(request.model): return None - if request.model in [lora.lora_name for lora in self.lora_requests]: + if request.model in [ + lora.lora_name for lora in self.models.lora_requests + ]: return None if request.model in [ prompt_adapter.prompt_adapter_name - for prompt_adapter in self.prompt_adapter_requests + for prompt_adapter in self.models.prompt_adapter_requests ]: return None return self.create_error_response( @@ -223,10 +143,10 @@ def _maybe_get_adapters( None, PromptAdapterRequest]]: if self._is_model_supported(request.model): return None, None - for lora in self.lora_requests: + for lora in self.models.lora_requests: if request.model == lora.lora_name: return lora, None - for prompt_adapter in self.prompt_adapter_requests: + for prompt_adapter in self.models.prompt_adapter_requests: if request.model == prompt_adapter.prompt_adapter_name: return None, prompt_adapter # if _check_model has been called earlier, this will be unreachable @@ -588,91 +508,5 @@ def _get_decoded_token(logprob: Logprob, return logprob.decoded_token return tokenizer.decode(token_id) - async def _check_load_lora_adapter_request( - self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]: - # Check if both 'lora_name' and 'lora_path' are provided - if not request.lora_name or not request.lora_path: - return self.create_error_response( - message="Both 'lora_name' and 'lora_path' must be provided.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - # Check if the lora adapter with the given name already exists - if any(lora_request.lora_name == request.lora_name - for lora_request in self.lora_requests): - return self.create_error_response( - message= - f"The lora adapter '{request.lora_name}' has already been" - "loaded.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - return None - - async def _check_unload_lora_adapter_request( - self, - request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]: - # Check if either 'lora_name' or 'lora_int_id' is provided - if not request.lora_name and not request.lora_int_id: - return self.create_error_response( - message= - "either 'lora_name' and 'lora_int_id' needs to be provided.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - # Check if the lora adapter with the given name exists - if not any(lora_request.lora_name == request.lora_name - for lora_request in self.lora_requests): - return self.create_error_response( - message= - f"The lora adapter '{request.lora_name}' cannot be found.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - return None - - async def load_lora_adapter( - self, - request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: - error_check_ret = await self._check_load_lora_adapter_request(request) - if error_check_ret is not None: - return error_check_ret - - lora_name, lora_path = request.lora_name, request.lora_path - unique_id = self.lora_id_counter.inc(1) - self.lora_requests.append( - LoRARequest(lora_name=lora_name, - lora_int_id=unique_id, - lora_path=lora_path)) - return f"Success: LoRA adapter '{lora_name}' added successfully." - - async def unload_lora_adapter( - self, - request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]: - error_check_ret = await self._check_unload_lora_adapter_request(request - ) - if error_check_ret is not None: - return error_check_ret - - lora_name = request.lora_name - self.lora_requests = [ - lora_request for lora_request in self.lora_requests - if lora_request.lora_name != lora_name - ] - return f"Success: LoRA adapter '{lora_name}' removed successfully." - def _is_model_supported(self, model_name): - return any(model.name == model_name for model in self.base_model_paths) - - def _get_model_name(self, lora: Optional[LoRARequest]): - """ - Returns the appropriate model name depending on the availability - and support of the LoRA or base model. - Parameters: - - lora: LoRARequest that contain a base_model_name. - Returns: - - str: The name of the base model or the first available model path. - """ - if lora is not None: - return lora.lora_name - return self.base_model_paths[0].name + return self.models.is_base_model(model_name) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py new file mode 100644 index 0000000000000..26966896bc272 --- /dev/null +++ b/vllm/entrypoints/openai/serving_models.py @@ -0,0 +1,210 @@ +import json +import pathlib +from dataclasses import dataclass +from http import HTTPStatus +from typing import List, Optional, Union + +from vllm.config import ModelConfig +from vllm.entrypoints.openai.protocol import (ErrorResponse, + LoadLoraAdapterRequest, + ModelCard, ModelList, + ModelPermission, + UnloadLoraAdapterRequest) +from vllm.lora.request import LoRARequest +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.utils import AtomicCounter + + +@dataclass +class BaseModelPath: + name: str + model_path: str + + +@dataclass +class PromptAdapterPath: + name: str + local_path: str + + +@dataclass +class LoRAModulePath: + name: str + path: str + base_model_name: Optional[str] = None + + +class OpenAIServingModels: + """Shared instance to hold data about the loaded base model(s) and adapters. + + Handles the routes: + - /v1/models + - /v1/load_lora_adapter + - /v1/unload_lora_adapter + """ + + def __init__( + self, + model_config: ModelConfig, + base_model_paths: List[BaseModelPath], + *, + lora_modules: Optional[List[LoRAModulePath]] = None, + prompt_adapters: Optional[List[PromptAdapterPath]] = None, + ): + super().__init__() + + self.base_model_paths = base_model_paths + self.max_model_len = model_config.max_model_len + + self.lora_id_counter = AtomicCounter(0) + self.lora_requests = [] + if lora_modules is not None: + self.lora_requests = [ + LoRARequest(lora_name=lora.name, + lora_int_id=i, + lora_path=lora.path, + base_model_name=lora.base_model_name + if lora.base_model_name + and self.is_base_model(lora.base_model_name) else + self.base_model_paths[0].name) + for i, lora in enumerate(lora_modules, start=1) + ] + + self.prompt_adapter_requests = [] + if prompt_adapters is not None: + for i, prompt_adapter in enumerate(prompt_adapters, start=1): + with pathlib.Path(prompt_adapter.local_path, + "adapter_config.json").open() as f: + adapter_config = json.load(f) + num_virtual_tokens = adapter_config["num_virtual_tokens"] + self.prompt_adapter_requests.append( + PromptAdapterRequest( + prompt_adapter_name=prompt_adapter.name, + prompt_adapter_id=i, + prompt_adapter_local_path=prompt_adapter.local_path, + prompt_adapter_num_virtual_tokens=num_virtual_tokens)) + + def is_base_model(self, model_name): + return any(model.name == model_name for model in self.base_model_paths) + + def model_name(self, lora_request: Optional[LoRARequest] = None) -> str: + """Returns the appropriate model name depending on the availability + and support of the LoRA or base model. + Parameters: + - lora: LoRARequest that contain a base_model_name. + Returns: + - str: The name of the base model or the first available model path. + """ + if lora_request is not None: + return lora_request.lora_name + return self.base_model_paths[0].name + + async def show_available_models(self) -> ModelList: + """Show available models. This includes the base model and all + adapters""" + model_cards = [ + ModelCard(id=base_model.name, + max_model_len=self.max_model_len, + root=base_model.model_path, + permission=[ModelPermission()]) + for base_model in self.base_model_paths + ] + lora_cards = [ + ModelCard(id=lora.lora_name, + root=lora.local_path, + parent=lora.base_model_name if lora.base_model_name else + self.base_model_paths[0].name, + permission=[ModelPermission()]) + for lora in self.lora_requests + ] + prompt_adapter_cards = [ + ModelCard(id=prompt_adapter.prompt_adapter_name, + root=self.base_model_paths[0].name, + permission=[ModelPermission()]) + for prompt_adapter in self.prompt_adapter_requests + ] + model_cards.extend(lora_cards) + model_cards.extend(prompt_adapter_cards) + return ModelList(data=model_cards) + + async def load_lora_adapter( + self, + request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: + error_check_ret = await self._check_load_lora_adapter_request(request) + if error_check_ret is not None: + return error_check_ret + + lora_name, lora_path = request.lora_name, request.lora_path + unique_id = self.lora_id_counter.inc(1) + self.lora_requests.append( + LoRARequest(lora_name=lora_name, + lora_int_id=unique_id, + lora_path=lora_path)) + return f"Success: LoRA adapter '{lora_name}' added successfully." + + async def unload_lora_adapter( + self, + request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]: + error_check_ret = await self._check_unload_lora_adapter_request(request + ) + if error_check_ret is not None: + return error_check_ret + + lora_name = request.lora_name + self.lora_requests = [ + lora_request for lora_request in self.lora_requests + if lora_request.lora_name != lora_name + ] + return f"Success: LoRA adapter '{lora_name}' removed successfully." + + async def _check_load_lora_adapter_request( + self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]: + # Check if both 'lora_name' and 'lora_path' are provided + if not request.lora_name or not request.lora_path: + return create_error_response( + message="Both 'lora_name' and 'lora_path' must be provided.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + # Check if the lora adapter with the given name already exists + if any(lora_request.lora_name == request.lora_name + for lora_request in self.lora_requests): + return create_error_response( + message= + f"The lora adapter '{request.lora_name}' has already been" + "loaded.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + return None + + async def _check_unload_lora_adapter_request( + self, + request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]: + # Check if either 'lora_name' or 'lora_int_id' is provided + if not request.lora_name and not request.lora_int_id: + return create_error_response( + message= + "either 'lora_name' and 'lora_int_id' needs to be provided.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + # Check if the lora adapter with the given name exists + if not any(lora_request.lora_name == request.lora_name + for lora_request in self.lora_requests): + return create_error_response( + message= + f"The lora adapter '{request.lora_name}' cannot be found.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + return None + + +def create_error_response( + message: str, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: + return ErrorResponse(message=message, + type=err_type, + code=status_code.value) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 01852f0df1eca..5830322071e58 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -15,7 +15,8 @@ PoolingChatRequest, PoolingRequest, PoolingResponse, PoolingResponseData, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import PoolingOutput, PoolingRequestOutput from vllm.utils import merge_async_iterators @@ -44,7 +45,7 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], chat_template: Optional[str], @@ -52,9 +53,7 @@ def __init__( ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, + models=models, request_logger=request_logger) self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index a8a126e697641..5d3e7139d7a17 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -10,7 +10,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, ScoreRequest, ScoreResponse, ScoreResponseData, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput @@ -50,15 +51,13 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, + models=models, request_logger=request_logger) async def create_score( diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 2e849333680d4..b67ecfb01316f 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -15,9 +15,8 @@ TokenizeRequest, TokenizeResponse) # yapf: enable -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - LoRAModulePath, - OpenAIServing) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger logger = init_logger(__name__) @@ -29,18 +28,15 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, - lora_modules: Optional[List[LoRAModulePath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - prompt_adapters=None, + models=models, request_logger=request_logger) self.chat_template = chat_template diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index d54cbb5c37819..cdaf6dd76eaa1 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -250,7 +250,7 @@ def prompt(self) -> Optional[str]: if inputs["type"] == "token" or inputs["type"] == "multimodal": return inputs.get("prompt") - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def prompt_token_ids(self) -> List[int]: @@ -259,7 +259,7 @@ def prompt_token_ids(self) -> List[int]: if inputs["type"] == "token" or inputs["type"] == "multimodal": return inputs.get("prompt_token_ids", []) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def token_type_ids(self) -> List[int]: @@ -268,7 +268,7 @@ def token_type_ids(self) -> List[int]: if inputs["type"] == "token" or inputs["type"] == "multimodal": return inputs.get("token_type_ids", []) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def prompt_embeds(self) -> Optional[torch.Tensor]: @@ -277,7 +277,7 @@ def prompt_embeds(self) -> Optional[torch.Tensor]: if inputs["type"] == "token" or inputs["type"] == "multimodal": return None - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def multi_modal_data(self) -> "MultiModalDataDict": @@ -289,7 +289,7 @@ def multi_modal_data(self) -> "MultiModalDataDict": if inputs["type"] == "multimodal": return inputs.get("mm_kwargs", {}) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: @@ -301,7 +301,7 @@ def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: if inputs["type"] == "multimodal": return inputs.get("mm_kwargs", {}) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def multi_modal_hashes(self) -> List[str]: @@ -311,9 +311,10 @@ def multi_modal_hashes(self) -> List[str]: return inputs.get("multi_modal_hashes", []) if inputs["type"] == "multimodal": - return inputs.get("mm_hashes", []) + # only the case when we use MultiModalInputsV2 + return inputs.get("mm_hashes", []) # type: ignore[return-value] - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": @@ -325,7 +326,7 @@ def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": if inputs["type"] == "multimodal": return inputs.get("mm_placeholders", {}) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def mm_processor_kwargs(self) -> Dict[str, Any]: @@ -337,7 +338,7 @@ def mm_processor_kwargs(self) -> Dict[str, Any]: if inputs["type"] == "multimodal": return {} - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3d606817e90aa..b362ee0cac328 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -184,10 +184,16 @@ def _tokenize_prompt( corresponding token IDs. """ tokenizer = self.get_tokenizer_group() - + add_special_tokens = None + if self.model_config.hf_config.model_type == "whisper": + # For Whisper, special tokens should be provided by the user based + # on the task and language of their request. Also needed to avoid + # appending an EOS token to the prompt which disrupts generation. + add_special_tokens = False return tokenizer.encode(request_id=request_id, prompt=prompt, - lora_request=lora_request) + lora_request=lora_request, + add_special_tokens=add_special_tokens) async def _tokenize_prompt_async( self, @@ -197,10 +203,17 @@ async def _tokenize_prompt_async( ) -> List[int]: """Async version of :meth:`_tokenize_prompt`.""" tokenizer = self.get_tokenizer_group() - - return await tokenizer.encode_async(request_id=request_id, - prompt=prompt, - lora_request=lora_request) + add_special_tokens = None + if self.model_config.hf_config.model_type == "whisper": + # For Whisper, special tokens should be provided by the user based + # on the task and language of their request. Also needed to avoid + # appending an EOS token to the prompt which disrupts generation. + add_special_tokens = False + return await tokenizer.encode_async( + request_id=request_id, + prompt=prompt, + lora_request=lora_request, + add_special_tokens=add_special_tokens) def _can_process_multimodal(self) -> bool: model_config = self.model_config @@ -436,11 +449,18 @@ def _build_enc_dec_llm_inputs( or encoder_inputs["type"] == "multimodal"): pass else: - assert_never(encoder_inputs) + assert_never(encoder_inputs) # type: ignore[arg-type] if decoder_inputs is None: - dec_token_ids = self._prepare_decoder_input_ids_for_generation( - None) + if self.model_config.hf_config.model_type == "whisper": + # For Whisper models, the text prompt should go to the decoder. + # If no explicit encoder/decoder inputs, then copy the prompt + # from the encoder to the decoder. The encoder tokens are later + # overridden by the audio features. + dec_token_ids = encoder_inputs["prompt_token_ids"].copy() + else: + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + None) decoder_inputs = token_inputs(dec_token_ids) elif (decoder_inputs["type"] == "token" or decoder_inputs["type"] == "multimodal"): @@ -452,7 +472,7 @@ def _build_enc_dec_llm_inputs( raise ValueError("Multi-modal decoder inputs of encoder-" "decoder models are not supported yet") else: - assert_never(encoder_inputs) + assert_never(encoder_inputs) # type: ignore[arg-type] return EncoderDecoderInputs( encoder=encoder_inputs, @@ -569,7 +589,7 @@ def _build_decoder_only_llm_inputs( prompt_adapter_request=prompt_adapter_request, ) else: - assert_never(prompt_inputs) + assert_never(prompt_inputs) # type: ignore[arg-type] return prompt_inputs diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 46346b08e99c2..2d9d024e03e80 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -331,13 +331,7 @@ def dummy_data_for_profiling( trust_remote_code=model_config.trust_remote_code, ) processor = mm_registry.create_processor(model_config, tokenizer) - - mm_counts = mm_registry.get_mm_limits_per_prompt(model_config) - mm_max_tokens = mm_registry.get_max_tokens_by_modality( - model_config) - - dummy_data = processor.get_dummy_data(seq_len, mm_counts, - mm_max_tokens) + dummy_data = processor.get_dummy_data(seq_len) else: model_cls, _ = get_model_architecture(model_config) if is_encoder_data: @@ -425,7 +419,7 @@ def _ensure_mm_kwargs( # Be more strict in V2 assert "mm_kwargs" in inputs else: - assert_never(inputs["type"]) + assert_never(inputs["type"]) # type: ignore[arg-type] def process_input(self, model_config: "ModelConfig", inputs: ProcessorInputs) -> ProcessorInputs: diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 97a1b0c9603bd..f173cbde03f44 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -39,7 +39,7 @@ def __init__(self, strict_mode: Whether or not to perform shape/device/dtype checks during sampling. This catches correctness issues but adds nontrivial latency. - use_falshinfer: We will use this parameter to determine whether + use_flashinfer: We will use this parameter to determine whether to use the FlashInfer rejection sampling kernel or not. If it's None, we will use the default value from the environment variable. This parameter is only used for testing purposes. @@ -118,7 +118,7 @@ def forward( # If use Flashinfer chain_speculative_sampling kernel # for rejection sampling - if self.use_flashinfer: + if self.use_flashinfer and chain_speculative_sampling is not None: batch_size, k, _ = draft_probs.shape uniform_samples = self._create_uniform_samples( seeded_seqs, batch_size, k, draft_probs.device) diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 9437ad9688422..4f0d679bd6c28 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,15 +1,14 @@ -import math -from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union +from typing import (Callable, Iterable, List, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch import torch.nn as nn from torch.nn.init import trunc_normal_ -from transformers import LlamaConfig +from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_rank -from vllm.inputs import INPUT_REGISTRY, token_inputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -17,30 +16,27 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - SamplingMetadata) +from vllm.model_executor.layers.sampler import (SamplerOutput, + SamplingMetadata, get_sampler) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.models.idefics2_vision_model import ( - Idefics2VisionTransformer) -from vllm.model_executor.models.interfaces import SupportsMultiModal -from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP, - LlamaModel) -from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, - is_pp_missing_parameter, - maybe_prefix, - merge_multimodal_embeddings) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) -from .utils import flatten_bn +from .idefics2_vision_model import Idefics2VisionTransformer +from .interfaces import SupportsMultiModal +from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + is_pp_missing_parameter, maybe_prefix, + merge_multimodal_embeddings) class AriaImagePixelInputs(TypedDict): @@ -90,8 +86,8 @@ def __init__( def forward( self, pixel_values: torch.Tensor, - pixel_mask: Optional[torch.BoolTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: + pixel_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: patch_attention_mask = self._create_patch_attention_mask(pixel_mask) vit_oup = self.vision_model( @@ -103,7 +99,8 @@ def forward( return vit_oup, image_atts - def _create_patch_attention_mask(self, pixel_mask): + def _create_patch_attention_mask( + self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor: if pixel_mask is None: return None @@ -118,7 +115,8 @@ def _create_patch_attention_mask(self, pixel_mask): ) return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - def _create_image_attention_mask(self, patch_attention_mask): + def _create_image_attention_mask( + self, patch_attention_mask: torch.Tensor) -> torch.Tensor: if patch_attention_mask is None: return None @@ -128,13 +126,13 @@ def _create_image_attention_mask(self, patch_attention_mask): class FFN(nn.Module): - def __init__(self, embed_dim, ff_dim, output_dim): + def __init__(self, embed_dim: int, ff_dim: int, output_dim: int) -> None: super().__init__() self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, bias=False) self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False) self.act = get_act_fn("gelu_new") - def forward(self, hidden_states): + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.linear_in(hidden_states) hidden_states = self.act(hidden_states) hidden_states, _ = self.linear_out(hidden_states) @@ -143,7 +141,7 @@ def forward(self, hidden_states): class CrossAttention(nn.Module): - def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): + def __init__(self, kv_dim: int, embed_dim: int, num_heads: int) -> None: super().__init__() self.num_heads = num_heads self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) @@ -152,12 +150,16 @@ def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) self.linear = nn.Linear(embed_dim, embed_dim) - self.dropout = nn.Dropout(drop_out_rate) self.layer_norm = nn.LayerNorm(embed_dim) self.ln_kv = nn.LayerNorm(kv_dim) - def forward(self, x, hidden_states, attn_mask=None, add_residual=False): + def forward( + self, + x: torch.Tensor, + hidden_states: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: normed_hidden_states = self.layer_norm(hidden_states) query = self.q_proj(normed_hidden_states).permute(1, 0, 2) @@ -172,11 +174,7 @@ def forward(self, x, hidden_states, attn_mask=None, add_residual=False): attn_output = attn_output.permute(1, 0, 2) - if add_residual: - attn_output = hidden_states + self.dropout( - self.linear(attn_output)) - else: - attn_output = self.dropout(self.linear(attn_output)) + attn_output = self.linear(attn_output) return attn_output @@ -204,14 +202,14 @@ class AriaProjector(nn.Module): def __init__( self, - patch_to_query_dict, - embed_dim, - num_heads, - kv_dim, - ff_dim, - output_dim, - norm_layer=nn.LayerNorm, - ): + patch_to_query_dict: dict[int, int], + embed_dim: int, + num_heads: int, + kv_dim: int, + ff_dim: int, + output_dim: int, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, + ) -> None: super().__init__() self.patch_to_query_dict = patch_to_query_dict self.embed_dim = embed_dim @@ -227,7 +225,11 @@ def __init__( self.ln_ffn = norm_layer(embed_dim) self.ffn = FFN(embed_dim, ff_dim, output_dim) - def forward(self, x, attn_mask=None): + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: bs = x.shape[0] queries = self.query.unsqueeze(0).repeat(bs, 1, 1) @@ -251,7 +253,7 @@ def forward(self, x, attn_mask=None): class AriaFusedMoE(FusedMoE): def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, - shard_id: str) -> Set[str]: + shard_id: str) -> None: # Override the weight_loader to handle the expert weights in the Aria # model, which are already packed with experts, and merge the gate and # up weights for each expert. @@ -346,7 +348,7 @@ class MoEDecoderLayer(LlamaDecoderLayer): def __init__( self, - config: LlamaConfig, + config: AriaMoELMConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -434,7 +436,7 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -def build_mm_projector(config): +def build_mm_projector(config: PretrainedConfig): return AriaProjector( patch_to_query_dict=config.projector_patch_to_query_dict, embed_dim=config.vision_config.hidden_size, @@ -445,75 +447,74 @@ def build_mm_projector(config): ) -def get_max_multimodal_tokens(ctx): - return max(ctx.model_config.hf_config.image_size2tokens.values()) +class AriaMultiModalProcessor(BaseMultiModalProcessor): + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} -def input_mapper_for_aria(ctx, data): - return MultiModalKwargs(data) + def _get_num_image_tokens(self) -> int: + hf_config = self.ctx.get_hf_config() + return max(hf_config.projector_patch_to_query_dict.values()) + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return {"image": self._get_num_image_tokens()} -def input_processor(ctx, llm_inputs): - multi_modal_data = llm_inputs.get("multi_modal_data") - # if it is pure text input, use it as is - if multi_modal_data is None or "image" not in multi_modal_data: - return llm_inputs - - model_config = ctx.model_config - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) - hf_config = model_config.hf_config + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + pixel_mask=MultiModalFieldConfig.batched("image"), + ) - # prepare image tokens, the max_image_size is used to determine the number - # of patch_size for every image - max_image_size = multi_modal_data.pop("max_image_size", 980) - _split_image = multi_modal_data.pop("split_image", False) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.ctx.get_hf_config() + image_token_id = hf_config.image_token_index + + num_image_tokens = self._get_num_image_tokens() + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=[image_token_id] * num_image_tokens, + ) + ] - assert isinstance(max_image_size, - (int, float)), "max_image_size should be float or int" - images = (multi_modal_data["image"] if isinstance( - multi_modal_data["image"], list) else [multi_modal_data["image"]]) + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config() + vision_config: AriaVisionConfig = hf_config.vision_config + + max_image_size = vision_config.image_size + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } - image_inputs = image_processor.preprocess(images, - max_image_size=max_image_size, - split_image=_split_image, - return_tensors="pt").data - image_inputs['pixel_values'] = image_inputs['pixel_values'].to( - ctx.model_config.dtype) - num_crops = image_inputs.pop("num_crops") + hf_processor = self._get_hf_processor() + image_token: str = hf_processor.image_token # type: ignore - prompt_token_ids = llm_inputs["prompt_token_ids"] - if num_crops.sum().item() > 0: - _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=num_crops, + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, ) - repeat_count = [hf_config.image_size2tokens[max_image_size] - ] * sum(num_crops).item() - new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=repeat_count, - ) - - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data={"image": image_inputs}, - ) - -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens) -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) -@INPUT_REGISTRY.register_input_processor(input_processor) +@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): """ Aria model for conditional generation tasks. @@ -540,12 +541,6 @@ def __init__( config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - # prepare the image_size to tokens mapping for the image preprocess, see - # input_processor - config.image_size2tokens = { - int(math.sqrt(k) * config.vision_config.patch_size): v - for k, v in config.projector_patch_to_query_dict.items() - } self.config = config self.vision_tower = AriaVisionModel(config.vision_config) self.multi_modal_projector = build_mm_projector(config) @@ -566,7 +561,7 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() def _validate_image_sizes( self, images: List[torch.Tensor]) -> List[torch.Tensor]: @@ -588,7 +583,12 @@ def _parse_and_validate_image_input( pixel_values = self._validate_image_sizes(pixel_values) pixel_values = flatten_bn(pixel_values, concat=True) + if pixel_mask is not None: + if not isinstance(pixel_mask, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel mask. " + f"Got type: {type(pixel_mask)}") + pixel_mask = flatten_bn(pixel_mask, concat=True) return AriaImagePixelInputs( diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 42a239cadac46..987dfaf44f228 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -4,22 +4,16 @@ import torch import torch.nn as nn -from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig from vllm.attention.layer import MultiHeadAttention -from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size -from vllm.inputs import DecoderOnlyInputs, token_inputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import SequenceData def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -33,92 +27,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: return grid_length * grid_length -def get_blip_image_feature_size( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_num_patches(image_size=hf_config.image_size, - patch_size=hf_config.patch_size) - - -def get_max_blip_image_tokens( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_image_feature_size(hf_config) - - -def dummy_seq_data_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ) - - -def dummy_image_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = height = hf_config.image_size - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def input_processor_for_blip( - model_config: ModelConfig, - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - inputs: DecoderOnlyInputs, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=image_token_id, - repeat_count=image_feature_size, - ) - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": ranges}) - - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 76b8505ee1c2a..0fe10d8585215 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -4,32 +4,32 @@ import torch import torch.nn as nn -from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, - apply_chunking_to_forward) +from transformers import (BatchFeature, Blip2Config, Blip2Processor, + Blip2QFormerConfig, apply_chunking_to_forward) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import consecutive_placeholder_ranges -from vllm.sequence import IntermediateTensors, SequenceData - -from .blip import (BlipVisionModel, dummy_image_for_blip, - get_max_blip_image_tokens) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors + +from .blip import BlipVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) # We use this internally as placeholders since there is no image token # defined on the HuggingFace repo -BLIP2_IMAGE_TOKEN = "" -BLIP2_IMAGE_TOKEN_ID = 50265 +_IMAGE_TOKEN_ID = 50265 class Blip2ImagePixelInputs(TypedDict): @@ -396,92 +396,91 @@ def forward( return sequence_output -def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: - return hf_config.num_query_tokens - - -def get_max_blip2_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - - if isinstance(vision_config, Blip2VisionConfig): - return get_max_blip_image_tokens(vision_config) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def dummy_seq_data_for_blip2( - hf_config: Blip2Config, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip2_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_data_for_blip2(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - num_images = mm_counts["image"] +class Blip2MultiModalProcessor(BaseMultiModalProcessor): - seq_data, ranges = dummy_seq_data_for_blip2( - hf_config, - seq_len, - num_images, - image_token_id=BLIP2_IMAGE_TOKEN_ID, - ) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} - if isinstance(vision_config, Blip2VisionConfig): - mm_data = dummy_image_for_blip(vision_config, num_images) + def _get_num_image_tokens(self) -> int: + hf_config = self.ctx.get_hf_config(Blip2Config) + return hf_config.num_query_tokens - return DummyData(seq_data, mm_data, ranges) + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return {"image": self._get_num_image_tokens()} - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def _get_hf_processor(self) -> Blip2Processor: + return self.ctx.get_hf_processor(Blip2Processor) + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) -def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - hf_config = ctx.get_hf_config(Blip2Config) - image_feature_size = get_blip2_image_feature_size(hf_config) - - # The original model places image tokens at the front - # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 - new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size - new_token_ids += inputs["prompt_token_ids"] - - new_prompt = inputs.get("prompt") - if new_prompt is not None: - new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + max_image_tokens = self._get_num_image_tokens() + + return [ + PromptReplacement( + modality="image", + target="", + replacement="" * max_image_tokens + "", + ) + ] - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data) + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only tokens should be considered as placeholders, + # so we ignore the trailing bos_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + max_image_size = vision_config.image_size + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) -@MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) -@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -627,7 +626,7 @@ def get_input_embeddings( if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - BLIP2_IMAGE_TOKEN_ID) + _IMAGE_TOKEN_ID) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a40c321ce0a58..0bd0194243ceb 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -3,16 +3,14 @@ Tuple, TypedDict, Union) import torch +import torch.nn as nn import torch.nn.functional as F -from PIL import Image -from torch import nn -from transformers import ChameleonConfig, ChameleonVQVAEConfig +from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor, + ChameleonVQVAEConfig) from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -29,11 +27,13 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP @@ -41,15 +41,6 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) -# These configs are not part of the model config but the preprocessor -# and processor files, so we hardcode them in the model file for now. -CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512 -CHAMELEON_IMAGE_SEQ_LENGTH = 1024 -CHAMELEON_IMAGE_TOKEN_ID = 8711 -CHAMELEON_IMAGE_START_TOKEN_ID = 8197 -CHAMELEON_IMAGE_END_TOKEN_ID = 8196 -CHAMELEON_SEP_TOKEN_ID = 8710 - class ChameleonImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -57,103 +48,88 @@ class ChameleonImagePixelInputs(TypedDict): """Shape: `(batch_size * num_images, num_channels, height, width)`""" -def get_max_chameleon_image_tokens(ctx: InputContext): - return CHAMELEON_IMAGE_SEQ_LENGTH - - -def dummy_seq_data_for_chameleon( - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = CHAMELEON_IMAGE_SEQ_LENGTH - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } +class ChameleonMultiModalProcessor(BaseMultiModalProcessor): + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} -def dummy_image_for_chameleon( - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = CHAMELEON_CROP_SIZE_WIDTH - height = CHAMELEON_CROP_SIZE_HEIGHT - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override + def _get_num_image_tokens(self) -> int: + processor = self._get_hf_processor() + return processor.image_seq_length - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return {"image": self._get_num_image_tokens()} + def _get_hf_processor(self) -> ChameleonProcessor: + return self.ctx.get_hf_processor(ChameleonProcessor) -def dummy_data_for_chameleon(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image")) - seq_data, ranges = dummy_seq_data_for_chameleon( - seq_len, - num_images, - image_token_id=CHAMELEON_IMAGE_TOKEN_ID, - ) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + processor = self._get_hf_processor() + + return [ + PromptReplacement( + modality="image", + target="", + replacement="".join([ + processor.image_start_token, + processor.image_token * self._get_num_image_tokens(), + processor.image_end_token, + ]), + ) + ] - mm_data = dummy_image_for_chameleon(num_images) - return DummyData(seq_data, mm_data, ranges) + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + config = self.ctx.get_hf_config(ChameleonConfig) + + width = height = config.vq_config.resolution + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=width, + height=height, + num_images=num_images) + } + return ProcessorInputs( + prompt_text="" * num_images, + mm_data=mm_data, + ) -def input_processor_for_chameleon(ctx: InputContext, - inputs: DecoderOnlyInputs): + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only tokens should be considered as placeholders, + # so we ignore the image_start_token and image_end_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"] + 1, + length=p["length"] - 2) for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } - """ - Processing input prompt to insert required tokens for image placeholder. - - See https://github.com/huggingface/transformers/blob/0fdea8607d7e01eb0e38a1ebeb7feee30a22f0cf/src/transformers/models/chameleon/processing_chameleon.py#L58 - """ # noqa - - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - model_config = ctx.model_config - tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=CHAMELEON_IMAGE_TOKEN_ID, - repeat_count=CHAMELEON_IMAGE_SEQ_LENGTH, - pad_token_left=CHAMELEON_IMAGE_START_TOKEN_ID, - pad_token_right=CHAMELEON_IMAGE_END_TOKEN_ID, - ) - - # Appending sep token for chat mode to follow default processor - # behavior - if new_prompt is not None: - new_prompt += tokenizer.sep_token - new_token_ids += [CHAMELEON_SEP_TOKEN_ID] - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data) + return result class ChameleonLayerNorm(nn.LayerNorm): @@ -736,7 +712,7 @@ def forward(self, pixel_values: torch.Tensor): for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): hidden_state = self.down[i_level].block[i_block]( - hidden_states[-1], ) + hidden_states[-1]) if len(self.down[i_level].attn) > 0: hidden_state = self.down[i_level].attn[i_block]( hidden_state) @@ -925,10 +901,7 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_chameleon) -@INPUT_REGISTRY.register_input_processor(input_processor_for_chameleon) +@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): @@ -956,9 +929,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model.make_empty_intermediate_tensors) def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - - expected_dims = (3, CHAMELEON_CROP_SIZE_HEIGHT, - CHAMELEON_CROP_SIZE_WIDTH) + vq_config: ChameleonVQVAEConfig = self.config.vq_config + expected_dims = (3, vq_config.resolution, vq_config.resolution) actual_dims = tuple(data.shape[1:]) if actual_dims != expected_dims: diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index a5300dfd986f3..0188452054b8c 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -24,6 +24,8 @@ resolve_visual_encoder_outputs) from vllm.sequence import SequenceData +from .vision import VisionEncoderInfo + def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: assert image_size % patch_size == 0 @@ -149,6 +151,29 @@ def input_processor_for_clip( multi_modal_placeholders={"image": ranges}) +class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]): + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + return get_clip_image_feature_size(self.vision_config) + + def get_max_image_tokens(self) -> int: + return get_max_clip_image_tokens(self.vision_config) + + def get_num_patches(self) -> int: + return get_clip_patch_grid_length( + image_size=self.vision_config.image_size, + patch_size=self.vision_config.patch_size, + ) + + def get_image_size(self) -> int: + return self.vision_config.image_size + + # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa class CLIPVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 0398f0943a70a..8324a563edd64 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -606,8 +606,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - if hasattr(layer_self_attn, "kv_scale"): - layer_self_attn.attn._kv_scale = scaling_factor + if hasattr(layer_self_attn.attn, "_k_scale"): + layer_self_attn.attn._k_scale = scaling_factor + layer_self_attn.attn._v_scale = scaling_factor else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 6e86900326c4b..3680d01725238 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -15,32 +15,29 @@ # limitations under the License. """ PyTorch Fuyu model.""" import math -from array import array from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict) import torch import torch.nn as nn -import torch.utils.checkpoint -from PIL import Image -from transformers import FuyuImageProcessor +from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor, + FuyuProcessor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges) -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) -from vllm.utils import is_list_of +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import ImageProcessorItems, ImageSize +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, @@ -50,182 +47,188 @@ _IMAGE_TOKEN_ID = 71011 _NEWLINE_TOKEN_ID = 71019 -MAX_IMAGE_FEATURE_SIZE_HEIGHT = 1080 -MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920 - -class FuyuImagePixelInputs(TypedDict): - type: Literal["pixel_values"] - data: torch.Tensor +class FuyuImagePatchInputs(TypedDict): + type: Literal["image_patches"] + flat_data: torch.Tensor """ Shape: - (batch_size, num_patches, patch_size_x * patch_size_y * num_channels) + `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` """ - -def _calculate_num_image_tokens( - height: int, - width: int, -) -> Tuple[int, int]: + patches_per_image: List[int] """ - calculate number of image tokens needed for a given image size - The expected Fuyu image prompts is in format: - (image_token * ncols + newline_token) * nrows - args: - image_size: Tuple[int, int] - (width, height) of the image - returns: - ncols: int - number of image tokens in x direction - nrows: int - number of image tokens in y direction + List of number of total patches for each image in the batch. + This is used to restore the first two dimensions of `flat_data`. """ - ncol = math.ceil(width / 30) - nrow = math.ceil(height / 30) - return ncol, nrow - - -def get_max_fuyu_image_feature_size(): - - return _calculate_num_image_tokens( - height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - ) - - -def get_max_fuyu_image_tokens(ctx: InputContext): - ncol, nrow = get_max_fuyu_image_feature_size() - return (ncol + 1) * nrow - - -def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): - ncol, nrow = get_max_fuyu_image_feature_size() - image_feature_size = get_max_fuyu_image_tokens(ctx) - - image_token_ids = ( - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol + - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0]) * (seq_len - image_feature_size * num_images) - return SequenceData(token_ids), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_image_for_fuyu( - num_images: int, - *, - image_width: int, - image_height: int, -): - image = Image.new("RGB", (image_width, image_height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def dummy_data_for_fuyu(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) - mm_data = dummy_image_for_fuyu(num_images, - image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT) - return DummyData(seq_data, mm_data, ranges) - - -def _fuyu_image_preprocess(image_processor: FuyuImageProcessor, - data: List[Image.Image]): - image_encoding = image_processor.preprocess(data, return_tensors="pt") - batch_images = torch.stack([img[0] for img in image_encoding["images"] - ]).unsqueeze(1) - image_unpadded_heights = torch.tensor( - image_encoding["image_unpadded_heights"]) - image_unpadded_widths = torch.tensor( - image_encoding["image_unpadded_widths"]) - - batch_size = len(image_encoding["images"]) - image_present = torch.ones(batch_size, 1, 1) - model_image_input = image_processor.preprocess_with_tokenizer_info( - image_input=batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=_IMAGE_TOKEN_ID, - image_newline_id=_NEWLINE_TOKEN_ID, - variable_sized=True, - ) - return model_image_input - - -def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - image_data = multi_modal_data["image"] - new_multi_modal_data = {} - image_list = image_data if isinstance(image_data, list) else [image_data] - - # process image data - if is_list_of(image_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) - - model_image_input = _fuyu_image_preprocess(image_processor, image_data) - image_patches = torch.cat([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - new_multi_modal_data["image"] = image_patches - - elif is_list_of(image_list, torch.Tensor): - raise NotImplementedError("Embeddings input is not supported yet") - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - # process prompts - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - tokenizer = cached_get_tokenizer(model_config.model) - # dim0 is batch_size, dim1 is subseq_size which will always be 1 - image_input_ids: List[List[ - torch.Tensor]] = model_image_input["image_input_ids"] - image_input_ids = image_input_ids[0][0].tolist() - bos_token = tokenizer.encode("", add_special_tokens=False)[1:] - boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] - - new_prompt = prompt + "\x04" - new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[ - 1:] + boa_token - - return token_inputs(prompt=new_prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=new_multi_modal_data) - - -def input_mapper_for_fuyu(ctx: InputContext, data: object): - model_config = ctx.model_config - data_list = data if isinstance(data, list) else [data] - if is_list_of(data_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) - - model_image_input = _fuyu_image_preprocess(image_processor, data_list) - data = torch.stack([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - - # image has been processed with prompt in input processor - return MultiModalKwargs({"pixel_values": data}) - - -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu) -@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu) + + +class FuyuMultiModalProcessor(BaseMultiModalProcessor): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + + def _get_image_target_size(self) -> ImageSize: + processor = self._get_hf_processor() + image_processor: FuyuImageProcessor = processor.image_processor + + target_size = image_processor.size + return ImageSize(width=target_size["width"], + height=target_size["height"]) + + def _get_image_feature_grid_size( + self, + *, + image_width: int, + image_height: int, + ) -> tuple[int, int]: + target_width, target_height = self._get_image_target_size() + + if not (image_width <= target_width and image_height <= target_height): + height_scale_factor = target_height / image_height + width_scale_factor = target_width / image_width + optimal_scale_factor = min(height_scale_factor, width_scale_factor) + + image_height = int(image_height * optimal_scale_factor) + image_width = int(image_width * optimal_scale_factor) + + ncols = math.ceil(image_width / 30) + nrows = math.ceil(image_height / 30) + return ncols, nrows + + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + target_width, target_height = self._get_image_target_size() + + max_ncols, max_nrows = self._get_image_feature_grid_size( + image_width=target_width, + image_height=target_height, + ) + max_image_tokens = (max_ncols + 1) * max_nrows + + return {"image": max_image_tokens} + + def _get_hf_processor(self) -> FuyuProcessor: + return self.ctx.get_hf_processor(FuyuProcessor) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + + if not mm_data: + # Avoid warning from HF logger for text-only input + # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id + # Tokenizer won't add boa_token_id by default, we add it manually. + tokenizer = self._get_tokenizer() + boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + prompt_ids = tokenizer.encode(prompt) + [boa_token_id] + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + image_patches = processed_outputs.get("image_patches") + if image_patches is not None: + images = mm_data["images"] + assert isinstance(images, list) + + # Original output: (1, num_images, Pn, Px * Py * C) + # New output: (num_images, Pn, Px * Py * C) + assert (isinstance(image_patches, list) + and len(image_patches) == 1) + assert (isinstance(image_patches[0], torch.Tensor) + and len(image_patches[0]) == len(images)) + + processed_outputs["image_patches"] = image_patches[0] + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(image_patches=MultiModalFieldConfig.batched("image")) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.ctx.get_hf_config(FuyuConfig) + bos_token_id = hf_config.bos_token_id + + tokenizer = self._get_tokenizer() + eot_token_id = tokenizer.bos_token_id + assert isinstance(eot_token_id, int) + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + + ncols, nrows = self._get_image_feature_grid_size( + image_width=image_size.width, + image_height=image_size.height, + ) + + return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + + [bos_token_id]) + + return [ + PromptReplacement( + modality="image", + target=[eot_token_id], + replacement=get_replacement_fuyu, + ) + ] + + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only |SPEAKER| (image) tokens should be considered as placeholders, + # so we ignore the trailing bos_token_id + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = self._get_image_target_size() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + +@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -280,28 +283,33 @@ def _validate_shape(d: torch.Tensor): return data.to(self.vision_embed_tokens.weight.dtype) def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: - pixel_values = kwargs.pop("pixel_values", None) - - if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): + self, **kwargs: object) -> Optional[FuyuImagePatchInputs]: + image_patches = kwargs.pop("image_patches", None) + if image_patches is not None: + if not isinstance(image_patches, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. " - f"Got type: {type(pixel_values)}") + f"Got type: {type(image_patches)}") - return FuyuImagePixelInputs( - type="pixel_values", - data=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), + image_patches_flat = flatten_bn(image_patches) + + return FuyuImagePatchInputs( + type="image_patches", + flat_data=self._validate_pixel_values( + flatten_bn(image_patches_flat, concat=True)), + patches_per_image=[x.size(0) for x in image_patches_flat], ) return None def _process_image_input( - self, image_input: FuyuImagePixelInputs) -> torch.Tensor: + self, image_input: FuyuImagePatchInputs) -> NestedTensors: + image_patches_flat = image_input["flat_data"] + patches_per_image = image_input["patches_per_image"] assert self.vision_embed_tokens is not None - vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) - return vision_embeddings + vision_embeddings_flat, _ = self.vision_embed_tokens( + image_patches_flat) + return vision_embeddings_flat.split(patches_per_image, dim=0) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index f9e0443b9a508..a91ed4158a73f 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -545,8 +545,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - if hasattr(layer_self_attn, "kv_scale"): - layer_self_attn.attn._kv_scale = scaling_factor + if hasattr(layer_self_attn.attn, "_k_scale"): + layer_self_attn.attn._k_scale = scaling_factor + layer_self_attn.attn._v_scale = scaling_factor else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index e430a158d869a..4e42a4b6f9e64 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -69,7 +69,8 @@ def forward(self, patch_attention_mask: torch.BoolTensor, tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: batch_size, _, max_im_h, max_im_w = pixel_values.shape - patch_embeds = self.patch_embedding(pixel_values) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(target_dtype)) embeddings = patch_embeds.flatten(2).transpose(1, 2) max_nb_patches_h, max_nb_patches_w = ( max_im_h // self.patch_size, @@ -309,7 +310,8 @@ def forward( hidden_states = self.embeddings( pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, - tgt_sizes=tgt_sizes) + tgt_sizes=tgt_sizes, + ) encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2902e6999c2fd..8623da99574bb 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -452,8 +452,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - if hasattr(layer_self_attn, "kv_scale"): - layer_self_attn.attn._kv_scale = scaling_factor + if hasattr(layer_self_attn.attn, "_k_scale"): + layer_self_attn.attn._k_scale = scaling_factor + layer_self_attn.attn._v_scale = scaling_factor else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 1d6ee2a0be72e..78de27cd821c6 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,6 +1,7 @@ +from abc import abstractmethod from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set, - Tuple, TypedDict, Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, Union) import torch import torch.nn as nn @@ -12,7 +13,6 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -23,23 +23,23 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement, + InputProcessingContext, + MultiModalDataItems, ProcessingCache, + ProcessorInputs, PromptReplacement, full_groupby_modality) from vllm.sequence import IntermediateTensors -from .clip import (CLIPVisionModel, dummy_image_for_clip, - get_max_clip_image_tokens) +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .pixtral import (PixtralHFVisionModel, dummy_image_for_pixtral_hf, - get_max_pixtral_hf_image_tokens, - get_pixtral_hf_image_feature_size) -from .siglip import (SiglipVisionModel, dummy_image_for_siglip, - get_max_siglip_image_tokens) +from .pixtral import (PixtralHFVisionModel, + get_pixtral_hf_image_feature_grid_size) +from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) +from .vision import vision_encoder_info class LlavaImagePixelInputs(TypedDict): @@ -94,33 +94,167 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -def get_max_llava_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config(LlavaConfig) - vision_config = hf_config.vision_config +class LlavaLikeConfig(Protocol): + vision_config: Final[PretrainedConfig] + vision_feature_select_strategy: Final[str] + vision_feature_layer: Final[Union[int, List[int]]] - if isinstance(vision_config, CLIPVisionConfig): - num_image_tokens = get_max_clip_image_tokens(vision_config) - elif isinstance(vision_config, SiglipVisionConfig): - num_image_tokens = get_max_siglip_image_tokens(vision_config) - elif isinstance(vision_config, PixtralVisionConfig): - num_image_tokens = get_max_pixtral_hf_image_tokens(vision_config) - else: - msg = f"Unsupported vision config: {type(vision_config)}" + +class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): + + def __init__(self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: + super().__init__(ctx, + cache=cache, + enable_sanity_checks=enable_sanity_checks) + + vision_config = self._get_hf_config().vision_config + self._vision_encoder_info = vision_encoder_info(vision_config) + + @abstractmethod + def _get_hf_config(self) -> LlavaLikeConfig: + raise NotImplementedError + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def _apply_feature_select_strategy( + self, + strategy: str, + encoder_num_image_tokens: int, + ) -> int: + if strategy == "default": + return encoder_num_image_tokens - 1 + if strategy == "full": + return encoder_num_image_tokens + + msg = f"Unexpected feature select strategy: {strategy!r}" raise NotImplementedError(msg) - strategy = hf_config.vision_feature_select_strategy - if strategy == "default": - return num_image_tokens - 1 - elif strategy == "full": - return num_image_tokens - else: - raise ValueError(f"Unexpected select feature strategy: {strategy}") + def _get_max_image_tokens(self) -> int: + hf_config = self._get_hf_config() + + return self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + self._vision_encoder_info.get_max_image_tokens(), + ) + + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return {"image": self._get_max_image_tokens()} + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_dummy_image_size(self) -> ImageSize: + image_size = self._vision_encoder_info.get_image_size() + return ImageSize(image_size, image_size) + + @abstractmethod + def _get_image_token(self) -> str: + raise NotImplementedError + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + image_token = self._get_image_token() + target_width, target_height = self._get_dummy_image_size() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) + + +class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): + + def _get_hf_config(self) -> LlavaConfig: + return self.ctx.get_hf_config(LlavaConfig) + + def _get_hf_processor(self) -> LlavaProcessor: + return self.ctx.get_hf_processor(LlavaProcessor) + + def _get_image_token(self) -> str: + return self._get_hf_processor().image_token + + def _get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + + return self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + self._vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self._get_hf_config() + image_token_id = hf_config.image_token_index + def get_replacement(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) -class LlavaMultiModalProcessor(BaseMultiModalProcessor): + if isinstance(images, ImageEmbeddingItems): + num_image_tokens = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + num_image_tokens = self._get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) - def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]: - return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor)) + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] + + +class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor): + + def _get_hf_config(self) -> LlavaConfig: + return self.ctx.get_hf_config(LlavaConfig) + + def _get_hf_processor(self) -> PixtralProcessor: + return self.ctx.get_hf_processor(PixtralProcessor) + + def _get_image_token(self) -> str: + return self._get_hf_processor().image_token def _call_hf_processor( self, @@ -134,119 +268,82 @@ def _call_hf_processor( mm_kwargs=mm_kwargs, ) - # NOTE: pixel_values=None for MLlavaProcessor pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: images = mm_data["images"] assert isinstance(images, list) - if isinstance(self._get_hf_processor(), PixtralProcessor): - # Original output: (1, num_images, C, H, W) - # New output: (num_images, C, H, W) - assert (isinstance(pixel_values, list) - and len(pixel_values) == 1 - and isinstance(pixel_values[0], list) - and len(pixel_values[0]) == len(images)) + # Original output: (1, num_images, C, H, W) + # New output: (num_images, C, H, W) + assert (isinstance(pixel_values, list) and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) + and len(pixel_values[0]) == len(images)) - processed_outputs["pixel_values"] = pixel_values[0] + processed_outputs["pixel_values"] = pixel_values[0] return processed_outputs - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - ) - def _get_prompt_replacements( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config(LlavaConfig) + hf_config = self._get_hf_config() image_token_id = hf_config.image_token_index processor = self._get_hf_processor() - if isinstance(processor, PixtralProcessor): - image_token = processor.image_token - image_break_token = processor.image_break_token - image_end_token = processor.image_end_token - - vision_config = hf_config.vision_config - assert isinstance(vision_config, PixtralVisionConfig) + image_token = processor.image_token + image_break_token = processor.image_break_token + image_end_token = processor.image_end_token - def get_replacement_pixtral(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - ( - num_width_tokens, - num_height_tokens, - ) = get_pixtral_hf_image_feature_size( - vision_config, - image_width=image_size.width, - image_height=image_size.height, - ) + vision_config = hf_config.vision_config + assert isinstance(vision_config, PixtralVisionConfig) - tokens = ([image_token] * num_width_tokens + - [image_break_token]) * num_height_tokens - tokens[-1] = image_end_token + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) - return "".join(tokens) + ncols, nrows = get_pixtral_hf_image_feature_grid_size( + vision_config, + image_width=image_size.width, + image_height=image_size.height, + ) - return [ - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=get_replacement_pixtral, - ), - ] + tokens = ([image_token] * ncols + [image_break_token]) * nrows + tokens[-1] = image_end_token - max_image_tokens = get_max_llava_image_tokens(self.ctx) + return "".join(tokens) return [ PromptReplacement( modality="image", target=[image_token_id], - replacement=[image_token_id] * max_image_tokens, - ) + replacement=get_replacement, + ), ] - def _get_dummy_mm_inputs( - self, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_config = self.ctx.get_hf_config(LlavaConfig) - vision_config = hf_config.vision_config - num_images = mm_counts.get("image", 0) - - if isinstance(vision_config, CLIPVisionConfig): - data = dummy_image_for_clip(vision_config, num_images) - elif isinstance(vision_config, SiglipVisionConfig): - data = dummy_image_for_siglip(vision_config, num_images) - elif isinstance(vision_config, PixtralVisionConfig): - data = dummy_image_for_pixtral_hf(vision_config, num_images) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - hf_processor = self._get_hf_processor() - image_token = hf_processor.image_token +def _build_llava_or_pixtral_hf_processor( + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True, +) -> BaseLlavaMultiModalProcessor: + hf_config = ctx.get_hf_config(LlavaConfig) - return ProcessorInputs( - prompt_text=image_token * num_images, - mm_data=data, + if isinstance(hf_config.vision_config, PixtralVisionConfig): + return PixtralHFMultiModalProcessor( + ctx, + cache=cache, + enable_sanity_checks=enable_sanity_checks, ) - -class LlavaLikeConfig(Protocol): - vision_config: PretrainedConfig - vision_feature_layer: Union[int, List[int]] + return LlavaMultiModalProcessor( + ctx, + cache=cache, + enable_sanity_checks=enable_sanity_checks, + ) def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: @@ -324,8 +421,7 @@ def init_vision_tower_for_llava( raise NotImplementedError(msg) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) -@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = { @@ -591,7 +687,12 @@ def apply( ) -> MultiModalInputsV2: hf_config = self.ctx.get_hf_config(LlavaConfig) image_token_id = hf_config.image_token_index - max_image_tokens = get_max_llava_image_tokens(self.ctx) + + # Assume that it doesn't depend on the image size + num_image_tokens = self._get_num_image_tokens( + image_width=-1, + image_height=-1, + ) result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) @@ -604,14 +705,14 @@ def apply( def get_replacement_mantis(item_idx: int): return "".join([ f"(image {item_idx+1}: ", # 7 tokens - "" * max_image_tokens, + "" * num_image_tokens, ")", # 3 tokens ]) mantis_repls = self._bind_prompt_replacements([ PromptReplacement( modality="image", - target=[image_token_id] * max_image_tokens, + target=[image_token_id] * num_image_tokens, replacement=get_replacement_mantis, ) ]) @@ -649,7 +750,6 @@ def get_replacement_mantis(item_idx: int): # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) @MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index a39f2f4124d05..24debd1cbf3fe 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -4,31 +4,25 @@ import torch import torch.nn as nn -from PIL import Image -from transformers import CLIPVisionConfig, LlavaNextConfig, SiglipVisionConfig +from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, unpad_image) from typing_extensions import NotRequired from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors +from vllm.multimodal.parse import ImageSize from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of -from .clip import (CLIPVisionModel, dummy_image_for_clip, - dummy_seq_data_for_clip, get_clip_image_feature_size, - get_clip_patch_grid_length, input_processor_for_clip) +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import LlavaMultiModalProjector, init_vision_tower_for_llava -from .siglip import (SiglipVisionModel, dummy_image_for_siglip, - dummy_seq_data_for_siglip, get_siglip_image_feature_size, - get_siglip_patch_grid_length, input_processor_for_siglip) +from .llava import (LlavaMultiModalProcessor, LlavaMultiModalProjector, + init_vision_tower_for_llava) +from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, init_vllm_registered_model, maybe_prefix) @@ -65,218 +59,127 @@ class LlavaNextImageEmbeddingInputs(TypedDict): LlavaNextImageEmbeddingInputs] -# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79 -def _get_llava_next_num_unpadded_features( - original_height: int, - original_width: int, - npatches: int, - num_patch_height: int, - num_patch_width: int, -) -> Tuple[int, int]: - current_height = npatches * num_patch_height - current_width = npatches * num_patch_width - - original_aspect_ratio = original_width / original_height - current_aspect_ratio = current_width / current_height - - if original_aspect_ratio > current_aspect_ratio: - scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) - padding = (current_height - new_height) // 2 - current_height -= 2 * padding - else: - scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) - padding = (current_width - new_width) // 2 - current_width -= 2 * padding - - unpadded_features = current_height * current_width - newline_features = current_height - return (unpadded_features, newline_features) - - -# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 -def get_llava_next_image_feature_size( - hf_config: LlavaNextConfig, - *, - input_height: int, - input_width: int, -) -> int: - vision_config = hf_config.vision_config - - if isinstance(vision_config, CLIPVisionConfig): - num_patches = get_clip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, - ) - base_feature_size = get_clip_image_feature_size(vision_config) - elif isinstance(vision_config, SiglipVisionConfig): - num_patches = get_siglip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, - ) - base_feature_size = get_siglip_image_feature_size(vision_config) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - strategy = hf_config.vision_feature_select_strategy - if strategy == "default": - base_feature_size -= 1 - elif strategy == "full": - pass - else: - raise ValueError(f"Unexpected select feature strategy: {strategy}") +class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): - num_patch_height, num_patch_width = get_anyres_image_grid_shape( - image_size=(input_height, input_width), - grid_pinpoints=hf_config.image_grid_pinpoints, - patch_size=vision_config.image_size, - ) - - ( - unpadded_feature_size, - newline_feature_size, - ) = _get_llava_next_num_unpadded_features(input_height, input_width, - num_patches, num_patch_height, - num_patch_width) - - return unpadded_feature_size + newline_feature_size + base_feature_size - - -def get_max_llava_next_image_tokens(ctx: InputContext): - """Compute the max feature size for all possible image grid pinpoints.""" - return _get_pinpoint_with_largest_features(ctx)[0] - - -def _get_pinpoint_with_largest_features( - ctx: InputContext) -> Tuple[int, Tuple[int, int]]: - """Get the grid pinpoint with the largest features & its feature size.""" - hf_config = ctx.get_hf_config(LlavaNextConfig) - largest_feature_size = 0 - largest_feature_pinpoint = None - for (height, width) in hf_config.image_grid_pinpoints: - feat_size = get_llava_next_image_feature_size( - hf_config, - input_height=height, - input_width=width, - ) - if feat_size > largest_feature_size: - largest_feature_size = feat_size - largest_feature_pinpoint = (height, width) - if not largest_feature_size or largest_feature_pinpoint is None: - raise ValueError("Cannot have a largest feature size of 0!") - return largest_feature_size, largest_feature_pinpoint - - -def dummy_data_for_llava_next(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(LlavaNextConfig) - vision_config = hf_config.vision_config - num_images = mm_counts["image"] - - image_feature_size, pinpoint = _get_pinpoint_with_largest_features(ctx) - max_feat_height, max_feat_width = pinpoint - - if isinstance(vision_config, CLIPVisionConfig): - seq_data, ranges = dummy_seq_data_for_clip( - vision_config, - seq_len, - num_images, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) + def _get_hf_config(self) -> LlavaNextConfig: + return self.ctx.get_hf_config(LlavaNextConfig) + + def _get_hf_processor(self) -> LlavaNextProcessor: + return self.ctx.get_hf_processor(LlavaNextProcessor) - mm_data = dummy_image_for_clip( - vision_config, - num_images, - image_width_override=max_feat_width, - image_height_override=max_feat_height, + def _get_image_token(self) -> str: + return self._get_hf_processor().image_token + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), ) - return DummyData(seq_data, mm_data, ranges) - elif isinstance(vision_config, SiglipVisionConfig): - seq_data, ranges = dummy_seq_data_for_siglip( - vision_config, - seq_len, - num_images, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, + def _get_max_image_tokens(self) -> int: + largest_feature_size, _ = self._get_pinpoint_with_most_features() + return largest_feature_size + + def _get_dummy_image_size(self) -> ImageSize: + _, pinpoint = self._get_pinpoint_with_most_features() + return pinpoint + + # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 + def _get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + + base_feature_size = self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + self._vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), ) + num_patches = self._vision_encoder_info.get_num_patches() - mm_data = dummy_image_for_siglip( - vision_config, - num_images, - image_width_override=max_feat_width, - image_height_override=max_feat_height, + num_patch_height, num_patch_width = get_anyres_image_grid_shape( + image_size=(image_height, image_width), + grid_pinpoints=hf_config.image_grid_pinpoints, + patch_size=self._vision_encoder_info.get_image_size(), ) - return DummyData(seq_data, mm_data, ranges) + ( + unpadded_feature_size, + newline_feature_size, + ) = self._get_num_unpadded_features( + original_height=image_height, + original_width=image_width, + npatches=num_patches, + num_patch_height=num_patch_height, + num_patch_width=num_patch_width, + ) - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + return unpadded_feature_size + newline_feature_size + base_feature_size + # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79 + def _get_num_unpadded_features( + self, + *, + original_height: int, + original_width: int, + npatches: int, + num_patch_height: int, + num_patch_width: int, + ) -> tuple[int, int]: + current_height = npatches * num_patch_height + current_width = npatches * num_patch_width + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + current_height -= 2 * padding + else: + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + current_width -= 2 * padding -def input_processor_for_llava_next(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs + unpadded_features = current_height * current_width + newline_features = current_height + return (unpadded_features, newline_features) - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaNextConfig) - vision_config = hf_config.vision_config + def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: + """ + Get the grid pinpoint with the most features and + the corresponding feature size. + """ + hf_config = self._get_hf_config() - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - width, height = image_data.size + largest_feature_size, largest_feature_pinpoint = 0, None + for (height, width) in hf_config.image_grid_pinpoints: + feat_size = self._get_num_image_tokens(image_width=width, + image_height=height) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) - image_feature_size = get_llava_next_image_feature_size( - hf_config, - input_height=height, - input_width=width, - ) - elif is_list_of(image_data, Image.Image): - image_feature_size = [ - get_llava_next_image_feature_size(hf_config, - input_height=img.height, - input_width=img.width) - for img in image_data - ] - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - elif is_list_of(image_data, torch.Tensor): - image_feature_size = [item.shape[1] for item in image_data] - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - vision_config = hf_config.vision_config - - if isinstance(vision_config, CLIPVisionConfig): - return input_processor_for_clip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return input_processor_for_siglip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) + if largest_feature_size == 0 or largest_feature_pinpoint is None: + raise ValueError("Cannot have a largest feature size of 0!") - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + return largest_feature_size, largest_feature_pinpoint -@MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_next_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next) +@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor) class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): @@ -507,7 +410,7 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor, def _process_image_pixels( self, inputs: LlavaNextImagePixelInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: assert self.vision_tower is not None pixel_values = inputs["data"] @@ -528,10 +431,8 @@ def _process_image_pixels( stacked_image_features = self._image_pixels_to_features( self.vision_tower, stacked_pixel_values) - return [ - self.multi_modal_projector(image_features) for image_features in - torch.split(stacked_image_features, num_patches_per_batch) - ] + return torch.split(self.multi_modal_projector(stacked_image_features), + num_patches_per_batch) def _process_image_input( self, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 15362db6cdfbf..f2e49d8e4848d 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -23,7 +23,6 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import InputContext from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -35,7 +34,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.parse import ImageEmbeddingItems, ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement, @@ -306,24 +305,31 @@ def add_image_newline(self, image_features_hd): return image_features_hd_newline -def get_max_phi3v_image_tokens( - ctx: InputContext, - *, - num_crops: Optional[int] = None, -) -> int: - hf_processor_mm_kwargs = {} - if num_crops: - hf_processor_mm_kwargs["num_crops"] = num_crops +class Phi3VMultiModalProcessor(BaseMultiModalProcessor): - processor = ctx.get_hf_processor(**hf_processor_mm_kwargs) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} - return processor.calc_num_image_tokens_from_image_size( - width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - ) + def _get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + processor = self._get_hf_processor() + + return processor.calc_num_image_tokens_from_image_size( # type: ignore + width=image_width, + height=image_height, + ) + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + max_image_tokens = self._get_num_image_tokens( + image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, + image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + ) -class Phi3VMultiModalProcessor(BaseMultiModalProcessor): + return {"image": max_image_tokens} def _get_hf_processor( self, @@ -332,6 +338,7 @@ def _get_hf_processor( ) -> ProcessorMixin: if num_crops is not None: return self.ctx.get_hf_processor(num_crops=num_crops) + return self.ctx.get_hf_processor() def _call_hf_processor( @@ -375,22 +382,25 @@ def _get_prompt_replacements( ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() image_tokens: list[str] = hf_processor.img_tokens # type: ignore - image_processor = hf_processor.image_processor # type: ignore tokenizer = self._get_tokenizer() bos_token_id = tokenizer.bos_token_id assert isinstance(bos_token_id, int) def get_replacement_phi3v(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) - num_tokens = image_processor.calc_num_image_tokens_from_image_size( - width=image_size.width, - height=image_size.height, - ) + if isinstance(images, ImageEmbeddingItems): + num_image_tokens = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + num_image_tokens = self._get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) - return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id] + return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id] num_images = mm_items.get_count("image", strict=False) @@ -467,7 +477,6 @@ def apply( return result -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 22d29f5bbc50c..d7233bd6028ed 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,8 +1,8 @@ +import math from dataclasses import dataclass, fields from functools import cached_property from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union -import numpy import torch import torch.nn as nn import torch.nn.functional as F @@ -38,6 +38,7 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) +from .vision import VisionEncoderInfo try: from xformers import ops as xops @@ -306,7 +307,7 @@ def _parse_and_validate_image_input( images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor]] = None, image_tokens: Optional[torch.Tensor] = None, - ) -> Optional[List[torch.Tensor]]: + ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]: if images is None: return None, None @@ -604,11 +605,11 @@ def max_patches_per_side(self) -> int: return self.args.image_size // self.args.patch_size @property - def device(self) -> torch.device: + def device(self) -> torch.types.Device: return next(self.parameters()).device @property - def dtype(self) -> torch.device: + def dtype(self) -> torch.dtype: return next(self.parameters()).dtype @property @@ -697,10 +698,18 @@ def get_pixtral_hf_patch_grid_length(*, image_size: int, return image_size // patch_size -def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int: - grid_length = get_pixtral_hf_patch_grid_length(image_size=image_size, - patch_size=patch_size) - return grid_length * grid_length +def get_pixtral_hf_image_feature_size( + *, + image_size: int, + patch_size: int, +) -> int: + grid_length = get_pixtral_hf_patch_grid_length( + image_size=image_size, + patch_size=patch_size, + ) + + # Consider the image_break_token + return (grid_length + 1) * grid_length def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int: @@ -730,26 +739,55 @@ def dummy_image_for_pixtral_hf( return {"image": image if num_images == 1 else [image] * num_images} -def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, - image_width: int, - image_height: int) -> Tuple[int, int]: - # Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501 - # https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 # noqa: E501 - max_width, max_height = hf_config.image_size, hf_config.image_size - patch_width, patch_height = hf_config.patch_size, hf_config.patch_size +# Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501 +# https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 +def get_pixtral_hf_image_feature_grid_size( + hf_config: PixtralVisionConfig, + *, + image_width: int, + image_height: int, +) -> tuple[int, int]: + max_width = max_height = hf_config.image_size + patch_width = patch_height = hf_config.patch_size ratio = max(image_width / max_width, image_height / max_height) if ratio > 1: - image_width = int(numpy.ceil(image_width / ratio)) - image_height = int(numpy.ceil(image_height / ratio)) + image_width = int(math.ceil(image_width / ratio)) + image_height = int(math.ceil(image_height / ratio)) - num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens( + nrows, ncols = _get_pixtral_hf_num_image_tokens( (image_height, image_width), (patch_height, patch_width), - ) + ) # type: ignore + + return ncols, nrows + + +class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]): + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + return get_pixtral_hf_image_feature_size( + image_size=self.vision_config.image_size, + patch_size=self.get_image_size(), + ) + + def get_max_image_tokens(self) -> int: + return get_max_pixtral_hf_image_tokens(self.vision_config) + + def get_num_patches(self) -> int: + return get_pixtral_hf_patch_grid_length( + image_size=self.vision_config.image_size, + patch_size=self.vision_config.patch_size, + ) - return num_width_tokens, num_height_tokens + def get_image_size(self) -> int: + return self.vision_config.image_size class PixtralHFMLP(nn.Module): diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index e3d43b017f894..d050fd060353a 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -23,7 +23,6 @@ from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn from transformers import BatchFeature @@ -34,13 +33,12 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import InputContext from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import MultiModalDataParser +from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -81,14 +79,17 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): return feat_lengths, output_lengths -def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int: - hf_config = ctx.get_hf_config(Qwen2AudioConfig) - max_source_position = hf_config.audio_config.max_source_positions - output_lengths = (max_source_position - 2) // 2 + 1 - return output_lengths +class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"audio": None} -class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) + max_source_positions = hf_config.audio_config.max_source_positions + max_output_lengths = (max_source_positions - 2) // 2 + 1 + + return {"audio": max_output_lengths} def _get_hf_processor( self, @@ -158,11 +159,21 @@ def _get_prompt_replacements( audio_output_lengths = [] else: assert isinstance(feature_attention_mask, torch.Tensor) - _, audio_output_lengths = _get_feat_extract_output_lengths( + _, audio_output_lens = _get_feat_extract_output_lengths( feature_attention_mask.sum(-1)) + audio_output_lengths = audio_output_lens.tolist() + def get_replacement_qwen2_audio(item_idx: int): - return [placeholder] * audio_output_lengths[item_idx] + num_placeholders = audio_output_lengths[item_idx] + if num_placeholders == 0: + audios = mm_items.get_items("audio", AudioProcessorItems) + audio = audios.get(item_idx) + raise ValueError( + f"The audio {audio} (len={len(audio)}) is too short " + "to be represented inside the model") + + return [placeholder] * num_placeholders return [ PromptReplacement( @@ -172,26 +183,35 @@ def get_replacement_qwen2_audio(item_idx: int): ) ] + def _always_apply_prompt_replacements(self) -> bool: + # HF never applies prompt replacements, so we have to do it ourselves + # _find_placeholders may incorrectly think that HF has already performed + # processing for multi-audio input when the input audios are short + # (the corresponding placeholders may take up fewer tokens than + # the number of audio items) + return True + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts.get("audio", 0) - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|AUDIO|>" * audio_count, - mm_data=data, + prompt_text="<|AUDIO|>" * num_audios, + mm_data=mm_data, ) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "audio", get_max_qwen2_audio_audio_tokens) @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor) class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 6181fe3dd13d8..26b6d768ad4f6 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -29,7 +29,6 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from PIL import Image from transformers import BatchFeature from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, Qwen2VLProcessor) @@ -41,7 +40,6 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.inputs import InputContext from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU @@ -53,6 +51,7 @@ GPTQMarlinConfig) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalFieldConfig, MultiModalKwargs, @@ -650,8 +649,9 @@ def _get_vision_info( width: int, min_pixels: int, max_pixels: int, + *, do_resize: bool = True, - data_type_key: str = "image", + modality: str = "image", mm_count: int = 1, ): """Get information (resized height / width and number of vision tokens) @@ -671,11 +671,12 @@ def _get_vision_info( else: resized_height, resized_width = height, width - if data_type_key == "image": + if modality == "image": grid_t = mm_count - else: - assert data_type_key == "video" + elif modality == "video": grid_t = max(mm_count // temporal_patch_size, 1) + else: + raise ValueError(f"Modality {modality} is not supported") grid_h = resized_height // patch_size grid_w = resized_width // patch_size @@ -691,41 +692,11 @@ def _get_image_processor(hf_processor: Qwen2VLProcessor): return image_processor -def get_max_qwen2_vl_mm_tokens(ctx: InputContext, - data_type_key: str, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None) -> int: - hf_config = ctx.get_hf_config(Qwen2VLConfig) - vision_config = hf_config.vision_config - - hf_processor = ctx.get_hf_processor(Qwen2VLProcessor) - image_processor = _get_image_processor(hf_processor) - - _, _, max_llm_image_tokens = _get_vision_info( - vision_config, - height=9999999, - width=9999999, - min_pixels=min_pixels or image_processor.min_pixels, - max_pixels=max_pixels or image_processor.max_pixels, - data_type_key=data_type_key, - ) - return max_llm_image_tokens - - -get_max_qwen2_vl_image_tokens = partial(get_max_qwen2_vl_mm_tokens, - data_type_key="image") -get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens, - data_type_key="video") - - class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], dict[str, torch.Tensor]]): def __init__(self, data: dict, modality: str) -> None: - super().__init__(data) - - self.modality = modality + super().__init__(data, modality) grid_thw = data[f"{modality}_grid_thw"] slice_idxs = [0] + grid_thw.prod(-1).cumsum_(0).tolist() @@ -734,9 +705,6 @@ def __init__(self, data: dict, modality: str) -> None: for i in range(len(grid_thw)) ] - def __repr__(self) -> str: - return (f"{type(self).__name__}(modality={self.modality!r})") - def get_count(self) -> int: return len(self.data[f"{self.modality}_grid_thw"]) @@ -792,6 +760,32 @@ def _parse_video_data( class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def _get_max_mm_tokens(self, modality: str) -> int: + hf_config = self.ctx.get_hf_config(Qwen2VLConfig) + vision_config = hf_config.vision_config + + hf_processor = self._get_hf_processor() + image_processor = _get_image_processor(hf_processor) + + _, _, max_llm_image_tokens = _get_vision_info( + vision_config, + height=9999999, + width=9999999, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + modality=modality, + ) + return max_llm_image_tokens + + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return { + "image": self._get_max_mm_tokens("image"), + "video": self._get_max_mm_tokens("video"), + } + def _get_data_parser(self) -> MultiModalDataParser: return Qwen2MultiModalDataParser() @@ -882,12 +876,10 @@ def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) hf_processor = self._get_hf_processor() - image_token: str = hf_processor.image_token image_processor = _get_image_processor(hf_processor) - data = {} + image_token: str = hf_processor.image_token resized_height, resized_width = smart_resize( height=9999999, width=9999999, @@ -895,20 +887,21 @@ def _get_dummy_mm_inputs( min_pixels=image_processor.min_pixels, max_pixels=image_processor.max_pixels, ) + num_images = mm_counts.get("image", 0) - dummy_image = Image.new("RGB", (resized_width, resized_height), - color=0) - data["image"] = [dummy_image] * num_images + mm_data = { + "image": + self._get_dummy_images(width=resized_width, + height=resized_height, + num_images=num_images) + } return ProcessorInputs( prompt_text=image_token * num_images, - mm_data=data, + mm_data=mm_data, ) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "video", get_max_qwen2_vl_video_tokens) @MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): @@ -925,15 +918,23 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, } # LoRA specific attributes - # TODO Support LoRA for the visual encoder in the future. supported_lora_modules = [ "qkv_proj", "o_proj", "gate_up_proj", "down_proj", + # vision tower + "qkv", + "attn.proj", # Distinguish patch_embed.proj + "fc1", + "fc2", + # projector + "mlp.0", + "mlp.2" ] embedding_modules = {} embedding_padding_modules = [] + # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ "lm_head.": "language_model.lm_head.", @@ -1230,3 +1231,12 @@ def load_weights(self, weights: Iterable[Tuple[str, loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="visual.", + tower_model="visual.merger.") diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 07f4b5a3b3bc8..62840b8c1bcda 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -170,6 +170,7 @@ "UltravoxModel": ("ultravox", "UltravoxModel"), # [Encoder-decoder] "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 + "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 } _SPECULATIVE_DECODING_MODELS = { diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 6fb9e2cc4584f..115eaaac900e0 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -28,6 +28,8 @@ resolve_visual_encoder_outputs) from vllm.sequence import SequenceData +from .vision import VisionEncoderInfo + def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: # Since interpolation is applied, the image size need not be divisible @@ -156,6 +158,29 @@ def input_processor_for_siglip( multi_modal_placeholders={"image": ranges}) +class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]): + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + return get_siglip_image_feature_size(self.vision_config) + + def get_max_image_tokens(self) -> int: + return get_max_siglip_image_tokens(self.vision_config) + + def get_num_patches(self) -> int: + return get_siglip_patch_grid_length( + image_size=self.vision_config.image_size, + patch_size=self.vision_config.patch_size, + ) + + def get_image_size(self) -> int: + return self.vision_config.image_size + + # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa class SiglipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index caae0b65d7d10..a7cf65a0e36e4 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -565,8 +565,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - if hasattr(layer_self_attn, "kv_scale"): - layer_self_attn.attn._kv_scale = scaling_factor + if hasattr(layer_self_attn.attn, "_k_scale"): + layer_self_attn.attn._k_scale = scaling_factor + layer_self_attn.attn._v_scale = scaling_factor else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 7e853e5b90096..0b83684c9bac5 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -2,7 +2,7 @@ """PyTorch Ultravox model.""" import math -from functools import cached_property, lru_cache +from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -17,7 +17,6 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import InputContext from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -58,22 +57,17 @@ class UltravoxAudioEmbeddingInputs(TypedDict): UltravoxAudioEmbeddingInputs] -@lru_cache -def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor: - return WhisperFeatureExtractor.from_pretrained(model_id) - - -def whisper_feature_extractor(ctx: InputContext) -> WhisperFeatureExtractor: - hf_config = ctx.get_hf_config(UltravoxConfig) - return cached_feature_extractor(hf_config.audio_model_id) - +class UltravoxMultiModalProcessor(BaseMultiModalProcessor): -def get_ultravox_max_audio_tokens(ctx: InputContext): - feature_extractor = whisper_feature_extractor(ctx) - return math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"audio": None} + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + feature_extractor = self._get_feature_extractor() + max_audio_tokens = math.ceil(feature_extractor.chunk_length * + _AUDIO_TOKENS_PER_SECOND) -class UltravoxMultiModalProcessor(BaseMultiModalProcessor): + return {"audio": max_audio_tokens} def _get_hf_processor( self, @@ -188,16 +182,19 @@ def _get_dummy_mm_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts.get("audio", 0) - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|audio|>" * audio_count, - mm_data=data, + prompt_text="<|audio|>" * num_audios, + mm_data=mm_data, ) @@ -319,8 +316,6 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "audio", get_ultravox_max_audio_tokens) @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 269b66806adf4..31017f16d3c97 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -373,7 +373,7 @@ def embed_multimodal( input_ids: torch.Tensor, multimodal_token_id: int, get_text_embeds: Callable[[torch.Tensor], torch.Tensor], - multimodal_embeds: Union[torch.Tensor, List[torch.Tensor]], + multimodal_embeds: NestedTensors, ) -> torch.Tensor: """ Embed token IDs and multimodal inputs and combine their embeddings. diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py new file mode 100644 index 0000000000000..65a773480d2a1 --- /dev/null +++ b/vllm/model_executor/models/vision.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod +from typing import Generic, TypeVar + +from transformers import PretrainedConfig + +_C = TypeVar("_C", bound=PretrainedConfig) + + +class VisionEncoderInfo(ABC, Generic[_C]): + + def __init__(self, vision_config: _C) -> None: + super().__init__() + + self.vision_config = vision_config + + @abstractmethod + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + raise NotImplementedError + + @abstractmethod + def get_max_image_tokens(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_num_patches(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_image_size(self) -> int: + raise NotImplementedError + + +def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: + # Avoid circular imports + from .clip import CLIPEncoderInfo, CLIPVisionConfig + from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig + from .siglip import SiglipEncoderInfo, SiglipVisionConfig + + if isinstance(vision_config, CLIPVisionConfig): + return CLIPEncoderInfo(vision_config) + if isinstance(vision_config, PixtralVisionConfig): + return PixtralHFEncoderInfo(vision_config) + if isinstance(vision_config, SiglipVisionConfig): + return SiglipEncoderInfo(vision_config) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py new file mode 100644 index 0000000000000..cb54b4c3ba663 --- /dev/null +++ b/vllm/model_executor/models/whisper.py @@ -0,0 +1,737 @@ +import math +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) + +import numpy as np +import torch +from torch import nn +from transformers.models.whisper.modeling_whisper import sinusoids + +from vllm.attention import Attention, AttentionMetadata, AttentionType +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.inputs import INPUT_REGISTRY, DummyData, InputContext +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.audio import resample_audio +from vllm.sequence import SequenceData +from vllm.transformers_utils.processor import cached_get_processor + +from .interfaces import SupportsMultiModal +from .utils import AutoWeightsLoader, WeightsMapper, make_layers + +logger = init_logger(__name__) + + +class WhisperAudioInputs(TypedDict): + input_features: NestedTensors + """Shape: `(batch_size, 128, M)`""" + + +class WhisperPositionalEmbedding(nn.Embedding): + + def __init__(self, + num_positions: int, + embedding_dim: int, + padding_idx: Optional[int] = None): + super().__init__(num_positions, embedding_dim) + + def forward(self, position_ids): + return self.weight[position_ids] + + +class WhisperAttention(nn.Module): + + def __init__( + self, + embed_dim: int, + num_heads: int, + bias: bool = True, + attn_type: AttentionType = AttentionType.DECODER, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.embed_dim = embed_dim + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + if self.total_num_heads >= tp_size: + # Number of heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_heads % tp_size == 0 + else: + # Number of heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_heads == 0 + self.num_kv_heads = max(1, self.total_num_heads // tp_size) + self.head_dim = self.embed_dim // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.attn_type = attn_type + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: " + f"{self.embed_dim} and `num_heads`: {num_heads}).") + self.scaling = self.head_dim**-0.5 + + self._init_qkv(embed_dim, bias, quant_config, prefix=prefix) + self.out_proj = RowParallelLinear( + input_size=embed_dim, + output_size=embed_dim, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + def _init_qkv( + self, + embed_dim: int, + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + self.qkv_proj = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ): + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + attn_output = self.attn(q, + k, + v, + kv_cache, + attn_metadata, + attn_type=self.attn_type) + + output, _ = self.out_proj(attn_output) + + return output + + +class WhisperCrossAttention(WhisperAttention): + + def __init__( + self, + embed_dim: int, + num_heads: int, + bias: bool = True, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__( + embed_dim=embed_dim, + num_heads=num_heads, + bias=bias, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ) + + def _init_qkv( + self, + embed_dim: int, + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + self.q_proj = ColumnParallelLinear( + input_size=embed_dim, + output_size=embed_dim, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.q_proj", + ) + self.kv_proj = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.head_dim, + total_num_heads=0, + total_num_kv_heads=self.total_num_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.kv_proj", + ) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor], + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ): + q, _ = self.q_proj(hidden_states) + + # Encoder hidden states are only computed once during prefill phase. + # Afterwards, the keys and values should be available in the kv-cache. + if encoder_hidden_states is not None: + kv, _ = self.kv_proj(encoder_hidden_states) + k, v = kv.split([self.kv_size, self.kv_size], dim=-1) + else: + k = v = None + + attn_output = self.attn(q, + k, + v, + kv_cache, + attn_metadata, + attn_type=AttentionType.ENCODER_DECODER) + + output, _ = self.out_proj(attn_output) + + return output + + +class WhisperMLP(nn.Module): + + def __init__( + self, + embed_dim: int, + ffn_dim: int, + act_fn: str, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + self.activation_fn = get_act_fn(act_fn) + self.fc1 = ColumnParallelLinear( + input_size=embed_dim, + output_size=ffn_dim, + quant_config=quant_config, + prefix=f"{prefix}.fc1", + ) + self.fc2 = RowParallelLinear( + input_size=ffn_dim, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + ) + + def forward(self, hidden_states: torch.Tensor): + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + return hidden_states + + +class WhisperEncoderLayer(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.embed_dim = config.d_model + self.self_attn = WhisperAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + attn_type=AttentionType.ENCODER, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.mlp = WhisperMLP( + embed_dim=config.d_model, + ffn_dim=config.encoder_ffn_dim, + act_fn=config.activation_function, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ): + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + if hidden_states.isinf().any() or hidden_states.isnan().any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, + min=-clamp_value, + max=clamp_value) + + return hidden_states + + +class WhisperDecoderLayer(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.self_attn = WhisperAttention( + embed_dim=config.d_model, + num_heads=config.decoder_attention_heads, + attn_type=AttentionType.DECODER, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + self.self_attn_layer_norm = nn.LayerNorm(config.d_model) + self.encoder_attn = WhisperCrossAttention( + embed_dim=config.d_model, + num_heads=config.decoder_attention_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.encoder_attn", + ) + self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model) + self.mlp = WhisperMLP( + embed_dim=config.d_model, + ffn_dim=config.decoder_ffn_dim, + act_fn=config.activation_function, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.final_layer_norm = nn.LayerNorm(config.d_model) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor], + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ): + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + hidden_states = self.encoder_attn( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class WhisperEncoder(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + embed_dim = config.d_model + self.num_mel_bins = config.num_mel_bins + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_source_positions + self.embed_scale = (math.sqrt(embed_dim) + if config.scale_embedding else 1.0) + + self.conv1 = nn.Conv1d(self.num_mel_bins, + embed_dim, + kernel_size=3, + padding=1) + self.conv2 = nn.Conv1d(embed_dim, + embed_dim, + kernel_size=3, + stride=2, + padding=1) + self.embed_positions = nn.Embedding(self.max_source_positions, + embed_dim) + self.start_layer, self.end_layer, self.layers = make_layers( + config.encoder_layers, + lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config, + prefix=f"{prefix}.layers"), + prefix=f"{prefix}.layers", + ) + self.layer_norm = nn.LayerNorm(config.d_model) + + with torch.no_grad(): + self.embed_positions.weight.copy_( + sinusoids(*self.embed_positions.weight.shape)) + + def forward( + self, + input_features: Union[torch.Tensor, List[torch.Tensor]], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ): + hidden_states = [] + for features in input_features: + embeds = nn.functional.gelu(self.conv1(features)) + embeds = nn.functional.gelu(self.conv2(embeds)) + embeds = embeds.permute(1, 0) + embeds = embeds + self.embed_positions.weight[:embeds.size(0), :] + hidden_states.append(embeds) + hidden_states = torch.cat(hidden_states) + + for idx, encoder_layer in enumerate(self.layers): + hidden_states = encoder_layer( + hidden_states, + kv_cache=kv_caches[idx], + attn_metadata=attn_metadata, + ) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class WhisperDecoder(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_target_positions + self.max_source_positions = config.max_source_positions + self.embed_scale = (math.sqrt(config.d_model) + if config.scale_embedding else 1.0) + + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, + self.padding_idx) + self.embed_positions = WhisperPositionalEmbedding( + self.max_target_positions, config.d_model) + self.start_layer, self.end_layer, self.layers = make_layers( + config.decoder_layers, + lambda prefix: WhisperDecoderLayer(vllm_config=vllm_config, + prefix=f"{prefix}.layers"), + prefix=f"{prefix}.layers", + ) + self.layer_norm = nn.LayerNorm(config.d_model) + + def forward( + self, + input_ids, + positions: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ): + inputs_embeds = self.get_input_embeddings(input_ids) + positions = self.embed_positions(positions) + hidden_states = inputs_embeds + positions + + for idx, decoder_layer in enumerate(self.layers): + hidden_states = decoder_layer( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + kv_cache=kv_caches[idx], + attn_metadata=attn_metadata, + ) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + return self.embed_tokens(input_ids) + + +class WhisperModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.encoder = WhisperEncoder(vllm_config=vllm_config, + prefix=f"{prefix}.encoder") + self.decoder = WhisperDecoder(vllm_config=vllm_config, + prefix=f"{prefix}.decoder") + + def forward( + self, + input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + encoder_outputs = self.get_encoder_outputs( + input_features, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + decoder_outputs = self.decoder( + input_ids=input_ids, + positions=positions, + encoder_hidden_states=encoder_outputs, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + return decoder_outputs + + def get_encoder_outputs( + self, + input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> Optional[torch.Tensor]: + if input_features is None: + return None + return self.encoder( + input_features, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), + (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), + (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), + (".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"), + (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +def get_max_whisper_audio_tokens(ctx: InputContext) -> int: + return ctx.model_config.hf_config.max_source_positions + + +def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int, + mm_counts: Mapping[str, int]): + assert mm_counts["audio"] == 1 + num_tokens = get_max_whisper_audio_tokens(ctx) + processor = cached_get_processor(ctx.model_config.model) + chunk_length = processor.feature_extractor.chunk_length + sampling_rate = processor.feature_extractor.sampling_rate + num_samples = chunk_length * sampling_rate + return DummyData( + SequenceData.from_prompt_token_counts((0, num_tokens)), + {"audio": [(np.zeros(num_samples), sampling_rate)]}, + ) + + +def input_processor_for_whisper(ctx: InputContext, inputs): + multi_modal_data = inputs["encoder"]["multi_modal_data"] + if isinstance(multi_modal_data["audio"], list): + assert len(multi_modal_data["audio"]) == 1 + multi_modal_data["audio"] = multi_modal_data["audio"][0] + # Resample and process audio + audio, orig_sr = multi_modal_data["audio"] + processor = cached_get_processor(ctx.model_config.model) + target_sr = processor.feature_extractor.sampling_rate + audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr) + multi_modal_data["audio"] = (audio, target_sr) + # Pre-allocate placeholder tokens in encoder sequence + num_tokens = get_max_whisper_audio_tokens(ctx) + inputs["encoder"]["prompt_token_ids"] = [0] * num_tokens + return inputs + + +def input_mapper_for_whisper( + ctx: InputContext, + multi_modal_data: Union[np.ndarray, List[np.ndarray]], +) -> MultiModalKwargs: + if not isinstance(multi_modal_data, list): + multi_modal_data = [multi_modal_data] + + assert len(multi_modal_data) == 1 + + if len(multi_modal_data) == 0: + return MultiModalKwargs() + + processor = cached_get_processor(ctx.model_config.model) + sampling_rate = processor.feature_extractor.sampling_rate + + audios = [audio for audio, _ in multi_modal_data] + + kwargs = processor(audios, + sampling_rate=sampling_rate, + return_tensors="pt") + kwargs["input_features"] = kwargs["input_features"].squeeze(0).to( + ctx.model_config.dtype) + + return MultiModalKwargs(kwargs) + + +@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper) +@INPUT_REGISTRY.register_input_processor(input_processor_for_whisper) +@MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_whisper) +@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( + "audio", get_max_whisper_audio_tokens) +class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.dtype = vllm_config.model_config.dtype + + self.model = WhisperModel(vllm_config=vllm_config, prefix=prefix) + self.unpadded_vocab_size = config.vocab_size + self.proj_out = ParallelLMHead(config.vocab_size, + config.d_model, + quant_config=quant_config) + self.proj_out = self.proj_out.tie_weights( + self.model.decoder.embed_tokens) + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, logit_scale) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs, + ) -> torch.Tensor: + audio_input = self._parse_and_validate_audio_input(**kwargs) + decoder_outputs = self.model( + input_features=audio_input["input_features"], + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + return decoder_outputs + + def get_multimodal_embeddings( + self, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs, + ) -> Optional[NestedTensors]: + # TODO: This method does not obey the interface for SupportsMultiModal. + # Refactor this once encoder/decoder support is implemented in V1. + audio_input = self._parse_and_validate_audio_input(**kwargs) + return self.model.get_encoder_outputs( + audio_input["input_features"], + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> torch.Tensor: + # TODO: This method just returns the decoder sequence embeddings since + # Whisper does not have encoder text tokens. Refactor this once + # encoder/decoder support is implemented in V1. + return self.model.decoder.get_input_embeddings(input_ids) + + def _parse_and_validate_audio_input( + self, **kwargs: object) -> WhisperAudioInputs: + input_features = kwargs.pop("input_features", None) + + if input_features is not None: + if not isinstance(input_features, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio features. " + f"Got type: {type(input_features)}") + input_features = [feat.to(self.dtype) for feat in input_features] + + return WhisperAudioInputs(input_features=input_features) + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.proj_out, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) + loaded_weights = [(name, loaded_weight) + for name, loaded_weight in weights] + mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}) + return loader.load_weights(loaded_weights, mapper=mapper) \ No newline at end of file diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index db489af7ac475..b0a1104546186 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -2,7 +2,8 @@ from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import Any, Literal, TypedDict, TypeVar, Union, cast, final +from typing import (Any, Literal, Optional, TypedDict, TypeVar, Union, cast, + final) import numpy as np import torch @@ -11,7 +12,7 @@ from transformers import BatchFeature from typing_extensions import NotRequired, TypeAlias -from vllm.utils import JSONTree, is_list_of, json_map_leaves +from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves _T = TypeVar("_T") @@ -160,11 +161,8 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: @dataclass(frozen=True) -class MultiModalFieldItem: - """ - Contains metadata and data in :class:`MultiModalKwargs` - corresponding to a data item in :class:`MultiModalDataItems`. - """ +class MultiModalFieldElem: + """Contains metadata and data of an item in :class:`MultiModalKwargs`.""" field: "BaseMultiModalField" data: NestedTensors @@ -186,34 +184,34 @@ class BaseMultiModalField(ABC): def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: raise NotImplementedError - def _build_item(self, data: NestedTensors) -> MultiModalFieldItem: - return MultiModalFieldItem(self, data) + def _build_elem(self, data: NestedTensors) -> MultiModalFieldElem: + return MultiModalFieldElem(self, data) - def reduce(self, batch: list[MultiModalFieldItem]) -> MultiModalFieldItem: - """Merge multiple instances of :class:`MultiModalFieldItem` together.""" + def reduce(self, batch: list[MultiModalFieldElem]) -> MultiModalFieldElem: + """Merge multiple instances of :class:`MultiModalFieldElem` together.""" fields = [item.field for item in batch] if len(set(fields)) > 1: raise ValueError(f"Cannot merge different {fields=}") data = self._reduce_data([item.data for item in batch]) - return self._build_item(data) + return self._build_elem(data) @dataclass(frozen=True) class MultiModalBatchedField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an item is obtained by - directly indexing into the first dimension of the underlying data. + A :class:`BaseMultiModalField` implementation where an element in the batch + is obtained by indexing into the first dimension of the underlying data. """ - def build_items(self, batch: NestedTensors) -> list[MultiModalFieldItem]: - return [self._build_item(item) for item in batch] + def build_elems(self, batch: NestedTensors) -> list[MultiModalFieldElem]: + return [self._build_elem(item) for item in batch] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): first_shape = batch[0].shape - if all(item.shape == first_shape for item in batch): + if all(elem.shape == first_shape for elem in batch): return torch.stack(batch) return batch @@ -222,24 +220,24 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: @dataclass(frozen=True) class MultiModalFlatField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an item is obtained by - slicing along the first dimension of the underlying data. + A :class:`BaseMultiModalField` implementation where an element in the batch + is obtained by slicing along the first dimension of the underlying data. """ - def build_items( + def build_elems( self, batch: NestedTensors, slices: Sequence[slice], - ) -> list[MultiModalFieldItem]: - return [self._build_item(batch[slice_]) for slice_ in slices] + ) -> list[MultiModalFieldElem]: + return [self._build_elem(batch[slice_]) for slice_ in slices] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): first_shape = batch[0].shape - if all(item.shape[1:] == first_shape[1:] for item in batch): + if all(elem.shape[1:] == first_shape[1:] for elem in batch): return torch.concat(batch) - return [elem for item in batch for elem in item] + return [e for elem in batch for e in elem] class MultiModalFieldConfig: @@ -267,115 +265,111 @@ def __init__( ) -> None: super().__init__() - self._field_cls = field_cls - self._modality = modality - self._field_config = field_config + self.field_cls = field_cls + self.modality = modality + self.field_config = field_config - def build_items( + def build_elems( self, key: str, batch: NestedTensors, - ) -> list[MultiModalFieldItem]: - field = self._field_cls(key=key, modality=self._modality) - return field.build_items(batch, **self._field_config) # type: ignore + ) -> Sequence[MultiModalFieldElem]: + field = self.field_cls(key=key, modality=self.modality) + return field.build_elems(batch, **self.field_config) # type: ignore -class MultiModalKwargs(UserDict[str, NestedTensors]): +class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): + """ + A collection of :class:`MultiModalFieldElem` + corresponding to a data item in :class:`MultiModalDataItems`. """ - A dictionary that represents the keyword arguments to - :meth:`~torch.nn.Module.forward`. - The metadata :code:`items_by_key` defines how to split batched keyword - arguments corresponding to each data item in :class:`MultiModalDataItems`: + @staticmethod + def from_elems(elems: Sequence[MultiModalFieldElem]): + return MultiModalKwargsItem({elem.field.key: elem for elem in elems}) - - For a keyword argument, we can access the :code:`i` th item in the batch - via :code:`items_by_key[key][i]`. - - We can gather the keyword arguments belonging to a modality by finding - the keys with items that belong to that modality, then accessing - the :code:`i` th item in the batch for each such key. + @property + def modality(self) -> str: + modalities = {elem.field.modality for elem in self.data.values()} + assert len(modalities) == 1, f"Found different modalities={modalities}" + return next(iter(modalities)) - Example: - .. code-block:: python - - # All items belong to the "image" modality - items_by_key={ - "pixel_values": [a, b, c, d], # "image" modality - "image_grid_thw": [e, f, g, h], # "image" modality - "pixel_values_video": [h, i, j], # "video" modality - "video_grid_thw": [k, l, m], # "video" modality - } +# NOTE: UserDict is for V0 compatibility. +# V1 should access individual items via `get_item`. +class MultiModalKwargs(UserDict[str, NestedTensors]): + """ + A dictionary that represents the keyword arguments to + :meth:`~torch.nn.Module.forward`. - - The keyword arguments belonging to the first image are - :code:`{"pixel_values": a, "image_grid_thw": e}`. - - The keyword arguments belonging to the second video are - :code:`{"pixel_values_video": i, "video_grid_thw": l}`. + The metadata :code:`items` enables us to obtain the keyword arguments + corresponding to each data item in :class:`MultiModalDataItems`, via + :meth:`get_item` and :meth:`get_items`. """ @staticmethod def from_hf_inputs( hf_inputs: BatchFeature, config_by_key: Mapping[str, MultiModalFieldConfig], - *, - enable_sanity_checks: bool = False, ): # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key` # We assume that those fields are not used in vLLM - items_by_key = { - key: config.build_items(key, batch) - for key, config in config_by_key.items() - if (batch := hf_inputs.get(key)) is not None - } - - return MultiModalKwargs.from_items_by_key( - items_by_key, - enable_sanity_checks=enable_sanity_checks, - ) + elems_by_key = dict[str, Sequence[MultiModalFieldElem]]() + keys_by_modality = defaultdict[str, set[str]](set) + for key, config in config_by_key.items(): + batch = hf_inputs.get(key) + if batch is not None: + elems = config.build_elems(key, batch) + if len(elems) > 0: + elems_by_key[key] = elems + keys_by_modality[config.modality].add(key) + + items = list[MultiModalKwargsItem]() + for modality, keys in keys_by_modality.items(): + elems_in_modality = {k: elems_by_key[k] for k in keys} + batch_sizes = {k: len(v) for k, v in elems_in_modality.items()} + + if len(set(batch_sizes.values())) > 1: + raise ValueError( + f"Cannot merge different batch sizes for {modality=}! " + f"Found: {batch_sizes=}") + + batch_size = next(iter(batch_sizes.values())) + for item_idx in range(batch_size): + elems = [v[item_idx] for v in elems_in_modality.values()] + items.append(MultiModalKwargsItem.from_elems(elems)) + + return MultiModalKwargs.from_items(items) @staticmethod - def from_items_by_key( - items_by_key: Mapping[str, list[MultiModalFieldItem]], - *, - enable_sanity_checks: bool = False, - ) -> "MultiModalKwargs": + def from_items(items: Sequence[MultiModalKwargsItem]): + """Construct a new :class:`MultiModalKwargs` from multiple items.""" + elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) + for item in items: + for key, elem in item.items(): + elems_by_key[key].append(elem) + data = { - key: items[0].field.reduce(items).data - for key, items in items_by_key.items() if len(items) > 0 + key: elems[0].field.reduce(elems).data + for key, elems in elems_by_key.items() if len(elems) > 0 } - return MultiModalKwargs(data, - items_by_key=items_by_key, - enable_sanity_checks=enable_sanity_checks) + return MultiModalKwargs(data, items=items) def __init__( self, data: Mapping[str, NestedTensors], *, - items_by_key: Mapping[str, list[MultiModalFieldItem]] = {}, - enable_sanity_checks: bool = False, + items: Optional[Sequence[MultiModalKwargsItem]] = None, ) -> None: super().__init__(data) - # Shallow copy to avoid footgun in case a defaultdict is passed in - self._items_by_key = dict(items_by_key) + items_by_modality = full_groupby(items or [], key=lambda x: x.modality) + self._items_by_modality = dict(items_by_modality) - keys_by_modality = defaultdict[str, set[str]](set) - for key, items in items_by_key.items(): - for item in items: - keys_by_modality[item.field.modality].add(key) - - self._keys_by_modality = dict(keys_by_modality) - - if enable_sanity_checks: - for modality, keys in keys_by_modality.items(): - items_in_modality = {k: items_by_key[k] for k in keys} - batch_sizes = {k: len(v) for k, v in items_in_modality.items()} - batch_size = next(iter(batch_sizes.values()), 0) - assert all(bs == batch_size - for bs in batch_sizes.values()), dict( - modality=modality, - batch_sizes=batch_sizes, - items_by_key=items_by_key) + @property + def modalities(self): + return self._items_by_modality.keys() @staticmethod def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: @@ -452,58 +446,44 @@ def as_kwargs( def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False - if self._items_by_key != other._items_by_key: + if self._items_by_modality != other._items_by_modality: return False ks = self.keys() return (ks == other.keys() and all(nested_tensors_equal(self[k], other[k]) for k in ks)) - def get_item(self, key: str, item_index: int) -> MultiModalFieldItem: - return self._items_by_key[key][item_index] + def _validate_modality(self, method_name: str, modality: str) -> None: + if not self._items_by_modality: + raise RuntimeError( + f"`{method_name}` is not supported when " + "MultiModalKwargs is not initialized with `items`") - def get_items_by_modality( - self, - modality: str, - item_index: int, - ) -> Mapping[str, MultiModalFieldItem]: - """ - Get the keyword arguments corresponding to an item identified by - its modality and index. - """ - if modality not in self._keys_by_modality: - available_modalities = set(self._keys_by_modality.keys()) + if modality not in self._items_by_modality: + available_modalities = set(self._items_by_modality.keys()) raise KeyError(f"Modality {modality!r} not found. " f"Available modalities: {available_modalities}") - keys_to_gather = self._keys_by_modality[modality] + def get_item_count(self, modality: str) -> int: + """Get the number of items belonging to a modality.""" + self._validate_modality("get_item_count", modality) + return len(self._items_by_modality[modality]) - return { - key: self.get_item(key, item_index) - for key in keys_to_gather if key in self - } + def get_item(self, modality: str, item_index: int) -> MultiModalKwargsItem: + """ + Get the keyword arguments corresponding to an item identified by + its modality and index. + """ + self._validate_modality("get_item", modality) + return self._items_by_modality[modality][item_index] - @staticmethod - def from_items_by_modality( - items_by_modality: Mapping[str, list[Mapping[str, - MultiModalFieldItem]]], - *, - enable_sanity_checks: bool = False, - ) -> "MultiModalKwargs": + def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: """ - Construct a new :class:`MultiModalKwargs` from multiple items returned - by :meth:`get_fields_by_modality`. + Get the keyword arguments corresponding to each item belonging to + a modality. """ - items_by_key = defaultdict[str, list[MultiModalFieldItem]](list) - for fields in items_by_modality.values(): - for field in fields: - for k, v in field.items(): - items_by_key[k].append(v) - - return MultiModalKwargs.from_items_by_key( - items_by_key, - enable_sanity_checks=enable_sanity_checks, - ) + self._validate_modality("get_items", modality) + return self._items_by_modality[modality] MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 17a795247372e..00acb77435163 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod from collections import UserDict from collections.abc import Callable, Iterator, Mapping, Sequence -from typing import TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar +from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar, + Union) import numpy as np import torch @@ -21,10 +22,15 @@ class ModalityDataItems(ABC, Generic[_T, _I]): - def __init__(self, data: _T) -> None: + def __init__(self, data: _T, modality: str) -> None: super().__init__() self.data = data + self.modality = modality + + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r}, " + f"len={len(self)})") def __len__(self) -> int: return self.get_count() @@ -64,14 +70,6 @@ def get_passthrough_data(self) -> Mapping[str, object]: class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): - def __init__(self, data: Sequence[_T], modality: str) -> None: - super().__init__(data) - - self.modality = modality - - def __repr__(self) -> str: - return (f"{type(self).__name__}(modality={self.modality!r})") - def get_count(self) -> int: return len(self.data) @@ -87,18 +85,10 @@ def get_passthrough_data(self) -> Mapping[str, object]: class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): - def __init__(self, data: NestedTensors, modality: str) -> None: - super().__init__(data) - - self.modality = modality - - def __repr__(self) -> str: - return (f"{type(self).__name__}(modality={self.modality!r})") - def get_count(self) -> int: return len(self.data) - def get(self, index: int) -> object: + def get(self, index: int) -> torch.Tensor: return self.data[index] def get_processor_data(self) -> Mapping[str, object]: @@ -107,6 +97,9 @@ def get_processor_data(self) -> Mapping[str, object]: def get_passthrough_data(self) -> Mapping[str, object]: return {f"{self.modality}_embeds": self.data} + def get_feature_size(self, item_idx: int) -> int: + return len(self.get(item_idx)) + class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): @@ -193,7 +186,7 @@ def get_all_counts(self) -> Mapping[str, int]: def get_items( self, modality: str, - typ: type[_D], + typ: Union[type[_D], tuple[type[_D], ...]], ) -> _D: """ Get the data items belonging to a modality, @@ -210,7 +203,7 @@ def get_items( f"Expected type: {typ}, but " f"found type: {type(items)}") - return items + return items # type: ignore[return-value] ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], @@ -220,6 +213,10 @@ def get_items( class MultiModalDataParser: """ Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. + + Args: + target_sr (float, optional): Enables automatic resampling of audio + items to the model's expected sampling rate. """ def __init__(self, *, target_sr: Optional[float] = None) -> None: diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 180489166b407..eb7552176e974 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,25 +1,27 @@ import pickle import re from abc import ABC, abstractmethod +from collections import defaultdict from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass, field from functools import lru_cache from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union import numpy as np +import numpy.typing as npt import torch from blake3 import blake3 -from PIL.Image import Image +from PIL import Image from transformers import BatchFeature, ProcessorMixin from vllm.inputs import DummyData, InputProcessingContext from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.transformers_utils.tokenizer import AnyTokenizer, encode_tokens from vllm.utils import LRUCache, flatten_2d_lists, full_groupby from .inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalFieldItem, MultiModalInputsV2, MultiModalKwargs, - PlaceholderRange) + MultiModalInputsV2, MultiModalKwargs, + MultiModalKwargsItem, PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser logger = init_logger(__name__) @@ -55,24 +57,6 @@ def bind(self, tokenizer: AnyTokenizer) -> "_BoundPromptReplacement": ) -def _encode( - tokenizer: AnyTokenizer, - text: str, - *, - add_special_tokens: bool = False, -) -> list[int]: - """ - Backend-agnostic equivalent of HF's - :code:`tokenizer.encode(text, add_special_tokens=...)`. - """ - if isinstance(tokenizer, MistralTokenizer): - return tokenizer.tokenizer.encode(text, - bos=add_special_tokens, - eos=add_special_tokens) - - return tokenizer.encode(text, add_special_tokens=add_special_tokens) - - @lru_cache(maxsize=2048) def _cached_encode( tokenizer: AnyTokenizer, @@ -80,7 +64,9 @@ def _cached_encode( *, add_special_tokens: bool = False, ) -> list[int]: - return _encode(tokenizer, text, add_special_tokens=add_special_tokens) + return encode_tokens(tokenizer, + text, + add_special_tokens=add_special_tokens) def _decode( @@ -353,13 +339,13 @@ def _replace_matches( ) -> list[_S]: out_seqs = list[_S]() prev_end_idx = 0 - next_idx_by_modality = {modality: 0 for modality in mm_item_counts} + next_idx_by_modality = defaultdict[str, int](lambda: 0) for match in _resolve_matches(prompt, matches): modality = match.modality item_idx = next_idx_by_modality[modality] - if item_idx >= mm_item_counts[modality]: + if item_idx >= mm_item_counts.get(modality, 0): continue start_idx = match.start_idx @@ -494,8 +480,7 @@ def __init__(self, capacity: int) -> None: # DEBUG: Set to None to disable self.debug_cache_hit_ratio_steps: Optional[int] = None - self._cache = LRUCache[str, Mapping[str, - MultiModalFieldItem]](capacity) + self._cache = LRUCache[str, MultiModalKwargsItem](capacity) def _maybe_log_cache_stats(self) -> None: steps = self.debug_cache_hit_ratio_steps @@ -513,7 +498,7 @@ def _serialize_item(self, obj: object) -> bytes: return obj.encode("utf-8") if isinstance(obj, bytes): return obj - if isinstance(obj, Image): + if isinstance(obj, Image.Image): return obj.tobytes() # Convertible to NumPy arrays @@ -563,7 +548,7 @@ def get( modality: str, input_item: object, input_kwargs: Mapping[str, object], - ) -> Optional[Mapping[str, MultiModalFieldItem]]: + ) -> Optional[MultiModalKwargsItem]: """ Get a processed multi-modal item from the cache according to its dependencies, including: @@ -586,7 +571,7 @@ def put( modality: str, input_item: object, input_kwargs: Mapping[str, object], - output_kwargs: Mapping[str, MultiModalFieldItem], + output_kwargs: MultiModalKwargsItem, ) -> None: """ Put a processed multi-modal item into the cache @@ -622,6 +607,29 @@ def __call__( ) -> MultiModalInputsV2: return self.apply(prompt, mm_data, hf_processor_mm_kwargs) + @abstractmethod + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + """ + Return the maximum supported number of items for each modality. + + A value of `None` means unlimited number of items. + + Omitting a modality from the returned dictionary means that + it is not supported at all. + """ + raise NotImplementedError + + @abstractmethod + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + """ + Get the maximum possible number of tokens per data item + for each modality. + + The dictionary returned by this method should have the same + keys as that returned by :meth:`get_supported_mm_limits`. + """ + raise NotImplementedError + def _get_data_parser(self) -> MultiModalDataParser: """ Construct a data parser to preprocess multi-modal data items @@ -651,7 +659,18 @@ def _to_mm_items( before passing them to :meth:`_get_hf_mm_data`. """ parser = self._get_data_parser() - return parser.parse_mm_data(mm_data) + mm_items = parser.parse_mm_data(mm_data) + + mm_limits = self.ctx.get_mm_config().limit_per_prompt + for modality, items in mm_items.items(): + limit = mm_limits.get(modality, 1) + if len(items) > limit: + raise ValueError( + f"You set {modality}={limit} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but passed {len(items)} " + f"{modality} items in the same prompt.") + + return mm_items @abstractmethod def _get_mm_fields_config( @@ -673,10 +692,14 @@ def _get_prompt_replacements( Given the original multi-modal items for this modality and HF-processed data, output the replacements to perform. - Note: - Even when the HF processor already performs replacement for us, - we still use this replacement information to determine - the placeholder token positions for each multi-modal item. + Notes: + - You should not assume that HF processor always performs prompt + replacement: in :meth:`_apply_hf_processor_missing`, this method + is called on text-only and multimodal-only inputs separately, + instead of passing them in the same call. + - The replacement information returned by this method is also used + to determine the placeholder token positions for each multi-modal + item. """ raise NotImplementedError @@ -710,6 +733,10 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: + """ + Call the HF processor on the prompt text and + associated multi-modal data. + """ return self.ctx.call_hf_processor( self._get_hf_processor(**mm_kwargs), dict(text=prompt, **mm_data), @@ -723,7 +750,8 @@ def _apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: """ - Apply the HF processor on the full prompt text and multi-modal data. + Wrapper of :meth:`_call_hf_processor` that applies + additional pre-processing and post-processing. """ processor_data, passthrough_data = self._get_hf_mm_data(mm_items) @@ -739,7 +767,6 @@ def _apply_hf_processor( mm_kwargs = MultiModalKwargs.from_hf_inputs( processed_data, self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), - enable_sanity_checks=self.enable_sanity_checks, ) return prompt_ids, mm_kwargs @@ -754,10 +781,11 @@ def _apply_hf_processor_missing( Apply the HF processor on the full prompt text, but only on the multi-modal data that are missing from the cache. - Note: We pass prompt text and multi-modal data into the HF processor - in separate calls to avoid HF prompt replacement being done for - cached items; instead, we rely on our own prompt replacement logic - for the full text. + Note: + We pass prompt text and multi-modal data into the HF processor + in separate calls to avoid HF prompt replacement being done for + cached items; instead, we rely on our own prompt replacement logic + (:meth:`_get_prompt_replacements`) for the full text. """ mm_missing_counts = mm_missing_data_items.get_all_counts() @@ -800,7 +828,7 @@ def _cached_apply_hf_processor( hf_processor_mm_kwargs=hf_processor_mm_kwargs, ) - mm_maybe_cached_field_items = { + mm_maybe_cached_kw_items = { modality: [ cache.get(model_id, modality, item, hf_processor_mm_kwargs) for item in items @@ -809,8 +837,9 @@ def _cached_apply_hf_processor( } mm_missing_idxs = { - modality: [idx for idx, out in enumerate(fields) if out is None] - for modality, fields in mm_maybe_cached_field_items.items() + modality: + [idx for idx, item in enumerate(kw_items) if item is None] + for modality, kw_items in mm_maybe_cached_kw_items.items() } mm_missing_data = { modality: [mm_data_items[modality][idx] for idx in idxs] @@ -829,14 +858,11 @@ def _cached_apply_hf_processor( for modality in mm_missing_data_items } - mm_merged_field_items = dict[str, list[Mapping[str, - MultiModalFieldItem]]]() - for modality, modal_items_lst in mm_maybe_cached_field_items.items(): - merged_modal_items_lst = list[Mapping[str, MultiModalFieldItem]]() - - for idx, modal_items in enumerate(modal_items_lst): - if modal_items is None: - modal_items = mm_missing_kwargs.get_items_by_modality( + merged_kw_items = list[MultiModalKwargsItem]() + for modality, kw_items in mm_maybe_cached_kw_items.items(): + for idx, kw_item in enumerate(kw_items): + if kw_item is None: + kw_item = mm_missing_kwargs.get_item( modality, mm_missing_next_idx[modality], ) @@ -846,14 +872,12 @@ def _cached_apply_hf_processor( modality, mm_data_items[modality][idx], hf_processor_mm_kwargs, - modal_items, + kw_item, ) mm_missing_next_idx[modality] += 1 - merged_modal_items_lst.append(modal_items) - - mm_merged_field_items[modality] = merged_modal_items_lst + merged_kw_items.append(kw_item) if self.enable_sanity_checks: mm_missing_counts = mm_missing_data_items.get_all_counts() @@ -863,10 +887,7 @@ def _cached_apply_hf_processor( mm_missing_next_idx=mm_missing_next_idx, mm_missing_counts=mm_missing_counts) - mm_kwargs = MultiModalKwargs.from_items_by_modality( - mm_merged_field_items, - enable_sanity_checks=self.enable_sanity_checks, - ) + mm_kwargs = MultiModalKwargs.from_items(merged_kw_items) if self.enable_sanity_checks: mm_item_counts = mm_data_items.get_all_counts() @@ -874,7 +895,7 @@ def _cached_apply_hf_processor( for modality, item_count in mm_item_counts.items(): for item_idx in range(item_count): try: - mm_kwargs.get_items_by_modality(modality, item_idx) + mm_kwargs.get_item(modality, item_idx) except Exception as e: # Make it easy to set a breakpoint in the debugger raise e @@ -889,6 +910,17 @@ def _bind_prompt_replacements( return [prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls] + def _always_apply_prompt_replacements(self) -> bool: + """ + A flag which can be overridden so that + :meth:`_apply_prompt_replacements` is always called even if we + detect that HF has performed processing via :meth:`_find_placeholders`. + + This is useful in cases where :meth:`_find_placeholders` cannot be + reliably used to detect whether HF has performed processing or not. + """ + return False + def _apply_prompt_replacements( self, token_ids: list[int], @@ -935,7 +967,9 @@ def _apply_prompt_replacements( mm_item_counts, ) - token_ids = _encode(tokenizer, text) + token_ids = encode_tokens(tokenizer, + text, + add_special_tokens=False) matched_repls = [match.prompt_repl for match in text_matches] placeholders = self._find_placeholders(matched_repls, token_ids, @@ -983,7 +1017,7 @@ def apply( all_placeholders = self._find_placeholders(prompt_repls, prompt_ids, mm_item_counts) - if all_placeholders: + if all_placeholders and not self._always_apply_prompt_replacements(): tokenizer = self._get_tokenizer() prompt_text = _decode(tokenizer, prompt_ids) else: @@ -997,10 +1031,27 @@ def apply( mm_item_counts, ) - mm_placeholders = { - modality: [item.to_range() for item in items] - for modality, items in full_groupby_modality(all_placeholders) - } + mm_placeholders = dict[str, list[PlaceholderRange]]() + err_suffix = ("This suggests a problem with your implementation of " + "the merged multi-modal processor for this model, " + "particularly in the `_get_prompt_replacements` method.") + + for modality, placeholders in full_groupby_modality(all_placeholders): + if modality not in mm_items: + raise AssertionError( + f"Expected no placeholders for {modality=}, " + f"but found {placeholders=}. Input items: {mm_items}" + f"\n{err_suffix}") + + if len(placeholders) != len(mm_items[modality]): + raise AssertionError( + f"Expected length of {placeholders=} for {modality=} " + f"to equal that of input items: {mm_items[modality]}" + f"\n{err_suffix}") + + mm_placeholders[modality] = [ + item.to_range() for item in placeholders + ] return MultiModalInputsV2( type="multimodal", @@ -1010,6 +1061,36 @@ def apply( mm_placeholders=mm_placeholders, ) + def _get_dummy_audios( + self, + *, + length: int, + num_audios: int, + ) -> list[npt.NDArray]: + audio = np.zeros((length, )) + return [audio] * num_audios + + def _get_dummy_images( + self, + *, + width: int, + height: int, + num_images: int, + ) -> list[Image.Image]: + image = Image.new("RGB", (width, height), color=0) + return [image] * num_images + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[npt.NDArray]: + video = np.zeros((num_frames, width, height, 3)) + return [video] * num_videos + @abstractmethod def _get_dummy_mm_inputs( self, @@ -1021,15 +1102,38 @@ def _get_dummy_mm_inputs( """ raise NotImplementedError - def get_dummy_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - mm_max_tokens: Mapping[str, int], - ) -> DummyData: + def _get_and_validate_dummy_mm_counts(self) -> Mapping[str, int]: + mm_limit_per_prompt = self.ctx.get_mm_config().limit_per_prompt + supported_mm_limits = self.get_supported_mm_limits() + + mm_limits = { + modality: mm_limit_per_prompt.get(modality, 1) + for modality in supported_mm_limits + } + + for modality, supported_limit in supported_mm_limits.items(): + limit = mm_limits[modality] + if supported_limit is not None and supported_limit < limit: + raise ValueError( + f"You set {modality}={limit} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but this model only supports " + f"at most {supported_limit} {modality} items.") + + return mm_limits + + def get_dummy_data(self, seq_len: int) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData + mm_counts = self._get_and_validate_dummy_mm_counts() + mm_max_tokens_per_item = self.get_mm_max_tokens_per_item() + if mm_counts.keys() != mm_max_tokens_per_item.keys(): + raise AssertionError( + "The keys returned by `get_supported_mm_limits`" + f"({set(mm_counts.keys())}) should be the same as those " + "returned by `get_mm_max_tokens_per_item` " + f"({set(mm_max_tokens_per_item.keys())})") + processor_inputs = self._get_dummy_mm_inputs(mm_counts) mm_inputs = self.apply( prompt_text=processor_inputs.prompt_text, @@ -1045,7 +1149,7 @@ def get_dummy_data( for modality, placeholders in placeholders_by_modality.items() } expected_placeholders_by_modality = { - modality: mm_max_tokens[modality] + modality: mm_max_tokens_per_item[modality] * mm_counts[modality] for modality in placeholders_by_modality } if total_placeholders_by_modality != expected_placeholders_by_modality: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 3a5e11867ad9e..073d49d7d2009 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -15,6 +15,7 @@ from .image import ImagePlugin from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors from .processing import BaseMultiModalProcessor, ProcessingCache +from .utils import cached_get_tokenizer from .video import VideoPlugin if TYPE_CHECKING: @@ -219,6 +220,10 @@ def get_max_tokens_per_item_by_modality( Note: This is currently directly used only in V1. """ + if self.has_processor(model_config): + tokenizer = cached_get_tokenizer(model_config.tokenizer) + processor = self.create_processor(model_config, tokenizer) + return processor.get_mm_max_tokens_per_item() return { key: plugin.get_max_multimodal_tokens(model_config) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 87b12a6fb33c1..7b6ded6a27084 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -400,15 +400,19 @@ def repeat_and_pad_placeholder_tokens( placeholder_token_idx = 0 for i, token in enumerate(prompt_token_ids): if token == placeholder_token_id: + curr_repeat_count = repeat_count[placeholder_token_idx] replacement_ids = repeat_and_pad_token( placeholder_token_id, - repeat_count=repeat_count[placeholder_token_idx], + repeat_count=curr_repeat_count, pad_token_left=pad_token_left, pad_token_right=pad_token_right, ) + offset = len(new_token_ids) + if pad_token_left is not None: + offset += 1 placeholder_ranges.append({ - "offset": len(new_token_ids), - "length": len(replacement_ids) + "offset": offset, + "length": curr_repeat_count, }) new_token_ids.extend(replacement_ids) placeholder_token_idx += 1 diff --git a/vllm/sequence.py b/vllm/sequence.py index 034f89c0ddbe9..0157abbd2eed5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -710,15 +710,27 @@ def token_type_ids(self) -> Optional[List[int]]: @property def multi_modal_data(self) -> MultiModalDataDict: - return self.first_seq.multi_modal_data + if self.first_seq.multi_modal_data: + return self.first_seq.multi_modal_data + elif self.encoder_seq is not None: + return self.encoder_seq.multi_modal_data + return {} @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - return self.first_seq.multi_modal_placeholders + if self.first_seq.multi_modal_data: + return self.first_seq.multi_modal_placeholders + elif self.encoder_seq is not None: + return self.encoder_seq.multi_modal_placeholders + return {} @property def mm_processor_kwargs(self) -> Dict[str, Any]: - return self.first_seq.mm_processor_kwargs + if self.first_seq.multi_modal_data: + return self.first_seq.mm_processor_kwargs + elif self.encoder_seq is not None: + return self.encoder_seq.mm_processor_kwargs + return {} @property def lora_int_id(self) -> int: diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index e6701f4c4b835..42b2f095bc543 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -21,6 +21,25 @@ MistralTokenizer] +def encode_tokens( + tokenizer: AnyTokenizer, + text: str, + *, + add_special_tokens: Optional[bool] = None, +) -> list[int]: + """ + Backend-agnostic equivalent of HF's + :code:`tokenizer.encode(text, add_special_tokens=...)`. + """ + if isinstance(tokenizer, MistralTokenizer): + return tokenizer.tokenizer.encode(text, + bos=add_special_tokens, + eos=add_special_tokens) + elif add_special_tokens is not None: + return tokenizer.encode(text, add_special_tokens=add_special_tokens) + return tokenizer.encode(text) + + def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: """Get tokenizer with cached properties. diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index 8f78ef65bbf1a..e6cc7cd4e2e3a 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -32,7 +32,8 @@ def get_max_input_len( def encode(self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group.""" pass @@ -41,7 +42,8 @@ async def encode_async( self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group.""" pass diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 9a999a0d6067d..3f7627e11ae5e 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -112,7 +112,8 @@ def _finalize_encode(self, actor: ray.ObjectRef, def encode(self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group. We pick an idle actor and use it to encode the prompt. @@ -132,7 +133,8 @@ def encode(self, ret = ray.get( actor.encode.remote(request_id=request_id, prompt=prompt, - lora_request=lora_request)) + lora_request=lora_request, + add_special_tokens=add_special_tokens)) except ActorDiedError as e: # If the actor is dead, we first try to reinitialize it. logger.warning("%s died with ActorDiedError, reinitializing.", @@ -143,7 +145,8 @@ def encode(self, ret = ray.get( actor.encode.remote(request_id=request_id, prompt=prompt, - lora_request=lora_request)) + lora_request=lora_request, + add_special_tokens=add_special_tokens)) except ActorDiedError as e: logger.error( "%s died for second time in a row, marking " @@ -160,7 +163,8 @@ async def encode_async( self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group. We pick an idle actor and use it to encode the prompt. @@ -177,9 +181,11 @@ async def encode_async( actor_is_alive = True original_actor = actor try: - ret = await actor.encode.remote(request_id=request_id, - prompt=prompt, - lora_request=lora_request) + ret = await actor.encode.remote( + request_id=request_id, + prompt=prompt, + lora_request=lora_request, + add_special_tokens=add_special_tokens) except ActorDiedError as e: # If the actor is dead, we first try to reinitialize it. logger.warning("%s died with ActorDiedError, reinitializing.", @@ -187,9 +193,11 @@ async def encode_async( exc_info=e) actor = self._init_actor() try: - ret = await actor.encode.remote(request_id=request_id, - prompt=prompt, - lora_request=lora_request) + ret = await actor.encode.remote( + request_id=request_id, + prompt=prompt, + lora_request=lora_request, + add_special_tokens=add_special_tokens) except ActorDiedError as e: logger.error( "%s died for second time in a row, marking " diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index 95a8f7098bbac..6dc2f90561873 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -2,7 +2,7 @@ from vllm.config import TokenizerPoolConfig from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer import (AnyTokenizer, +from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens, get_lora_tokenizer, get_lora_tokenizer_async, get_tokenizer) @@ -55,9 +55,12 @@ def _raise_if_input_too_long(self, def encode(self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: tokenizer = self.get_lora_tokenizer(lora_request) - ret = tokenizer.encode(prompt) + ret = encode_tokens(tokenizer, + prompt, + add_special_tokens=add_special_tokens) self._raise_if_input_too_long(ret, lora_request) return ret @@ -65,9 +68,12 @@ async def encode_async( self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: tokenizer = await self.get_lora_tokenizer_async(lora_request) - ret = tokenizer.encode(prompt) + ret = encode_tokens(tokenizer, + prompt, + add_special_tokens=add_special_tokens) self._raise_if_input_too_long(ret, lora_request) return ret diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 026a0292cc339..65002f1ad70c7 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -2,10 +2,14 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type +import numpy as np import torch +import triton +import triton.language as tl from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) +from vllm.utils import cdiv from vllm.vllm_flash_attn import flash_attn_varlen_func @@ -38,6 +42,10 @@ def get_kv_cache_shape( raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return use_cascade_attention(*args, **kwargs) + @dataclass class FlashAttentionMetadata: @@ -56,6 +64,15 @@ class FlashAttentionMetadata: seq_start_loc: torch.Tensor block_table: torch.Tensor slot_mapping: torch.Tensor + + # For cascade attention. + use_cascade: bool + common_prefix_len: int + cu_prefix_query_lens: Optional[torch.Tensor] + cu_prefix_kv_lens: Optional[torch.Tensor] + cu_suffix_kv_lens: Optional[torch.Tensor] + + # For logging. num_input_tokens: int = 0 # Number of tokens including padding. @@ -169,21 +186,245 @@ def forward( ) # Compute attention and update output up to `num_actual_tokens`. - flash_attn_varlen_func( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=attn_metadata.query_start_loc, - max_seqlen_q=attn_metadata.max_query_len, - cu_seqlens_k=attn_metadata.seq_start_loc, - max_seqlen_k=attn_metadata.max_seq_len, + if not attn_metadata.use_cascade: + # Regular attention (common case). + flash_attn_varlen_func( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=attn_metadata.query_start_loc, + max_seqlen_q=attn_metadata.max_query_len, + cu_seqlens_k=attn_metadata.seq_start_loc, + max_seqlen_k=attn_metadata.max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=attn_metadata.block_table, + softcap=self.logits_soft_cap, + ) + return output + + # Cascade attention (rare case). + cascade_attention( + output[:num_actual_tokens], + query[:num_actual_tokens], + key_cache, + value_cache, + cu_query_lens=attn_metadata.query_start_loc, + max_query_len=attn_metadata.max_query_len, + cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens, + cu_prefix_kv_lens=attn_metadata.cu_prefix_kv_lens, + cu_suffix_kv_lens=attn_metadata.cu_suffix_kv_lens, + max_kv_len=attn_metadata.max_seq_len, softmax_scale=self.scale, - causal=True, alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, + sliding_window=self.sliding_window, + logits_soft_cap=self.logits_soft_cap, block_table=attn_metadata.block_table, - softcap=self.logits_soft_cap, + common_prefix_len=attn_metadata.common_prefix_len, ) - return output + + +def use_cascade_attention( + common_prefix_len: int, + query_lens: np.ndarray, + num_query_heads: int, + num_kv_heads: int, + use_alibi: bool, + use_sliding_window: bool, + num_sms: int, +) -> bool: + """Decide whether to use cascade attention. + + This function 1) checks whether cascade attention is supported with the + given configuration, and 2) heuristically decides whether using cascade + attention can improve performance. + """ + # Too short common prefix. Probably not worth using cascade attention. + # We use an arbitrary threshold of 256 tokens. TODO: Tune this threshold. + # NOTE(woosuk): This is the common case. We should return False as soon as + # possible to avoid any unnecessary computation. + if common_prefix_len < 256: + return False + # Cascade attention is currently not supported with these variants. + if use_alibi or use_sliding_window: + return False + # Too few queries. Probably not worth using cascade attention. + # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold. + num_reqs = len(query_lens) + if num_reqs < 8: + return False + + # Heuristics to decide whether using cascade attention is beneficial. + # 1. When FlashDecoding is not used for normal attention, cascade attention + # is likely to be faster since it saves memory bandwidth. + num_queries_per_kv = num_query_heads // num_kv_heads + # The criteria for using FlashDecoding can be found in the following link: + # https://github.com/vllm-project/flash-attention/blob/96266b1111111f3d11aabefaf3bacbab6a89d03c/csrc/flash_attn/flash_api.cpp#L535 + use_flash_decoding = (num_queries_per_kv > 1 and not use_sliding_window + and not use_alibi and np.all(query_lens == 1)) + if not use_flash_decoding: + # Use cascade attention. + return True + + # 2. When FlashDecoding is used for normal attention, it is not clear + # whether cascade attention is beneficial, because FlashDecoding can + # launch more CTAs than cascade attention. + # We use a simple performance model to compare the two methods. + # NOTE(woosuk): The performance model is very rough and may not be + # accurate. + num_tokens = num_reqs + # NOTE(woosuk): These are default tile sizes. flash-attn might use + # different tile sizes (e.g., 64 or 256) depending on the configuration. + q_tile_size = 128 + kv_tile_size = 128 + num_prefix_tiles = cdiv(common_prefix_len, kv_tile_size) + + cascade_ctas = num_query_heads * cdiv(num_tokens, q_tile_size) + cascade_waves = cdiv(cascade_ctas, num_sms) + cascade_time = cascade_waves * num_prefix_tiles + + flash_decoding_ctas = (num_reqs * num_kv_heads * + cdiv(num_queries_per_kv, q_tile_size)) + flash_decoding_ctas *= num_prefix_tiles + flash_decoding_time = cdiv(flash_decoding_ctas, num_sms) + + # Use cascade attention if it is faster than FlashDecoding. + return cascade_time < flash_decoding_time + + +def cascade_attention( + output: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + cu_query_lens: torch.Tensor, + max_query_len: int, + cu_prefix_query_lens: torch.Tensor, + cu_prefix_kv_lens: torch.Tensor, + cu_suffix_kv_lens: torch.Tensor, + max_kv_len: int, + softmax_scale: float, + alibi_slopes: Optional[torch.Tensor], + sliding_window: Tuple[int, int], + logits_soft_cap: float, + block_table: torch.Tensor, + common_prefix_len: int, +) -> torch.Tensor: + assert alibi_slopes is None, ("Cascade attention does not support ALiBi.") + # TODO: Support sliding window. + assert sliding_window == (-1, -1), ( + "Cascade attention does not support sliding window.") + + num_tokens = query.shape[0] + block_size = key_cache.shape[-3] + assert common_prefix_len % block_size == 0 + num_common_kv_blocks = common_prefix_len // block_size + assert num_common_kv_blocks > 0 + + # Process shared prefix. + prefix_output, prefix_lse = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=cu_prefix_query_lens, + cu_seqlens_k=cu_prefix_kv_lens, + max_seqlen_q=num_tokens, + max_seqlen_k=common_prefix_len, + softmax_scale=softmax_scale, + causal=False, + window_size=sliding_window, + block_table=block_table[:1], + softcap=logits_soft_cap, + return_softmax_lse=True, + ) + + # Process suffix per query. + suffix_output, suffix_lse = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=cu_query_lens, + cu_seqlens_k=cu_suffix_kv_lens, + max_seqlen_q=max_query_len, + max_seqlen_k=max_kv_len - common_prefix_len, + softmax_scale=softmax_scale, + causal=True, + window_size=sliding_window, + block_table=block_table[:, num_common_kv_blocks:], + softcap=logits_soft_cap, + return_softmax_lse=True, + ) + + # Merge prefix and suffix outputs, and store the result in output. + merge_attn_states(output, prefix_output, prefix_lse, suffix_output, + suffix_lse) + + +def merge_attn_states( + output: torch.Tensor, + prefix_output: torch.Tensor, + prefix_lse: torch.Tensor, + suffix_output: torch.Tensor, + suffix_lse: torch.Tensor, +) -> None: + num_tokens = output.shape[0] + num_query_heads = output.shape[1] + head_size = output.shape[2] + padded_head_size = triton.next_power_of_2(head_size) + + # TODO(woosuk): Use CUDA kernel instead of Triton to minimize CPU overhead. + merge_attn_states_kernel[(num_tokens, num_query_heads)]( + output, + prefix_output, + prefix_lse, + suffix_output, + suffix_lse, + head_size, + padded_head_size, + ) + + +@triton.jit +def merge_attn_states_kernel( + output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_lse, # [NUM_HEADS, NUM_TOKENS] + suffix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + suffix_lse, # [NUM_HEADS, NUM_TOKENS] + HEAD_SIZE: tl.constexpr, + PADDED_HEAD_SIZE: tl.constexpr, +): + token_idx = tl.program_id(0) + num_tokens = tl.num_programs(0) + head_idx = tl.program_id(1) + num_heads = tl.num_programs(1) + + p_lse = tl.load(prefix_lse + head_idx * num_tokens + token_idx) + s_lse = tl.load(suffix_lse + head_idx * num_tokens + token_idx) + max_lse = tl.maximum(p_lse, s_lse) + p_lse = p_lse - max_lse + s_lse = s_lse - max_lse + + head_arange = tl.arange(0, PADDED_HEAD_SIZE) + head_mask = head_arange < HEAD_SIZE + p_out = tl.load(prefix_output + token_idx * num_heads * HEAD_SIZE + + head_idx * HEAD_SIZE + head_arange, + mask=head_mask) + s_out = tl.load(suffix_output + token_idx * num_heads * HEAD_SIZE + + head_idx * HEAD_SIZE + head_arange, + mask=head_mask) + + # NOTE(woosuk): Be careful with the numerical stability. + # We should compute the scale first, and then multiply it with the output. + # Do not multiply the output with tl.exp(p_lse) or tl.exp(s_lse) directly. + p_scale = tl.exp(p_lse) / (tl.exp(p_lse) + tl.exp(s_lse)) + s_scale = tl.exp(s_lse) / (tl.exp(p_lse) + tl.exp(s_lse)) + out = p_out * p_scale + s_out * s_scale + tl.store(output + token_idx * num_heads * HEAD_SIZE + + head_idx * HEAD_SIZE + head_arange, + out, + mask=head_mask) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 00d0de51634ae..1cbff1e2d767e 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -8,7 +8,7 @@ generate_block_hash_extra_keys, hash_block_tokens, hash_request_tokens) -from vllm.v1.request import Request +from vllm.v1.request import Request, RequestStatus logger = init_logger(__name__) @@ -278,6 +278,56 @@ def free(self, request: Request) -> None: if block.ref_cnt == 0: self.free_block_queue.append(block) + def get_num_common_prefix_blocks( + self, + request: Request, + num_running_requests: int, + ) -> int: + """Calculate the number of common prefix blocks shared by all requests + in the RUNNING state. + + The function determines this by selecting any request and iterating + through its blocks. A block is considered a common prefix block if its + `ref_cnt` equals the total number of requests in the RUNNING state. + + NOTE(woosuk): The number of requests in the RUNNING state is **greater + than or equal to** the number of requests scheduled in the current step. + This is because the RUNNING state only indicates that: + 1. The request has not yet finished, and + 2. The request holds its blocks unfreed. + + While all scheduled requests must be in the RUNNING state, the inverse + is not necessarily true. There may be RUNNING requests that are not + scheduled in the current step. As of 1/1/2025, the scheduler does not + allow this case, but it is possible in the future, as we allow more + flexible scheduling. + + This can result in an edge case where the number of common prefix blocks + is 0, even though all scheduled requests share a common prefix. This + occurs because there may be unscheduled RUNNING requests that do not + share the common prefix. Currently, this case cannot be easily detected, + so the function returns 0 in such cases. + + Args: + request: Any request in the RUNNING state, used to identify the + common prefix blocks. + num_running_requests: The total number of requests in the RUNNING + state. This can be different from the number of scheduled + requests in the current step. + + Returns: + int: The number of common prefix blocks. + """ + assert request.status == RequestStatus.RUNNING + blocks = self.req_to_blocks[request.request_id] + num_common_blocks = 0 + for block in blocks: + if block.ref_cnt == num_running_requests: + num_common_blocks += 1 + else: + break + return num_common_blocks + def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: """Get new blocks from the free block pool. diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 08e7c0fd4dc9b..baaf3329dc79f 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -262,6 +262,14 @@ def schedule(self) -> "SchedulerOutput": assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len(scheduled_running_reqs) == len(self.running)) + # Get the longest common prefix among all requests in the running queue. + # This can be potentially used for cascade attention. + if self.running: + any_request = self.running[0] + num_common_prefix_blocks = ( + self.kv_cache_manager.get_num_common_prefix_blocks( + any_request, len(self.running))) + # Construct the scheduler output. new_reqs_data = [ NewRequestData.from_request(req, @@ -287,6 +295,7 @@ def schedule(self) -> "SchedulerOutput": num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, + num_common_prefix_blocks=num_common_prefix_blocks, preempted_req_ids=preempted_req_ids, # finished_req_ids is an existing state in the scheduler, # instead of being newly scheduled in this step. @@ -594,6 +603,7 @@ class SchedulerOutput: num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int scheduled_encoder_inputs: Dict[str, List[int]] + num_common_prefix_blocks: int preempted_req_ids: Set[str] finished_req_ids: Set[str] diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5b5a5a61cea7d..905d3d1fc3e1c 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -113,15 +113,27 @@ def process_inputs( # For merged preprocessor, mm_data is already mm_inputs precomputed_mm_inputs = None - if isinstance(decoder_inputs.multi_modal_data, MultiModalKwargs): - precomputed_mm_inputs = [decoder_inputs.multi_modal_data] + decoder_mm_data = decoder_inputs.multi_modal_data + if isinstance(decoder_mm_data, MultiModalKwargs): + # The output of merged multi-modal processor (`decoder_mm_data`) + # contains the kwargs for all items from all modalities. + # This code separates them so that there is one set of kwargs + # per item per modality. + precomputed_mm_inputs = [ + MultiModalKwargs.from_items([item]) + for modality in decoder_mm_data.modalities + for item in decoder_mm_data.get_items(modality) + ] # Apply MM mapper mm_inputs = None - if len(decoder_inputs.multi_modal_data) > 0: + if len(decoder_mm_data) > 0: mm_inputs = self.mm_input_mapper_client.process_inputs( - decoder_inputs.multi_modal_data, mm_hashes, - decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs) + decoder_mm_data, + mm_hashes, + decoder_inputs.mm_processor_kwargs, + precomputed_mm_inputs, + ) return EngineCoreRequest( request_id, diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index c088c3c129ca5..f2007d85c61a5 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -44,7 +44,7 @@ def __init__(self): logger.warning( "FlashInfer is not available. Falling back to the PyTorch-" "native implementation of top-p & top-k sampling. For the " - "best performance, please install FalshInfer.") + "best performance, please install FlashInfer.") self.forward = self.forward_native else: self.forward = self.forward_native diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e79145300fe06..f8a1427c6c26c 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -66,8 +66,9 @@ def __init__( pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() - self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) + self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) + self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) # Attention-related. self.block_table = torch.zeros( @@ -189,6 +190,7 @@ def add_request( end_idx = start_idx + len(request.output_token_ids) self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids + self.num_tokens[req_index] = request.num_tokens self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens num_blocks = len(request.block_ids) @@ -290,14 +292,15 @@ def condense(self, empty_req_indices: List[int]) -> None: self.req_ids[last_req_index] = None self.req_id_to_index[req_id] = empty_index - # TODO(woosuk): Optimize the copy of token_ids_cpu and - # block_table_cpu. - self.token_ids_cpu[empty_index] = self.token_ids_cpu[ - last_req_index] + num_tokens = self.num_tokens[last_req_index] + self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ + last_req_index, :num_tokens] + self.num_tokens[empty_index] = num_tokens self.num_prompt_tokens[empty_index] = \ self.num_prompt_tokens[last_req_index] self.num_computed_tokens_cpu[ empty_index] = self.num_computed_tokens_cpu[last_req_index] + # TODO(woosuk): Optimize the copy of block_table_cpu. self.block_table_cpu[empty_index] = self.block_table_cpu[ last_req_index] self.temperature_cpu[empty_index] = self.temperature_cpu[ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 509771b7e2e5a..75098b0330ac9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -72,6 +72,8 @@ def __init__( # Model-related. self.num_attn_layers = model_config.get_num_layers_by_block_type( parallel_config, LayerBlockType.attention) + self.num_query_heads = model_config.get_num_attention_heads( + parallel_config) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() self.hidden_size = model_config.get_hidden_size() @@ -118,6 +120,10 @@ def __init__( self.cudagraph_batch_sizes = list( reversed(self.vllm_config.compilation_config.capture_sizes)) + # Cache the device properties. + self.device_properties = torch.cuda.get_device_properties(self.device) + self.num_sms = self.device_properties.multi_processor_count + # Persistent buffers for CUDA graphs. self.input_ids = torch.zeros(self.max_num_tokens, dtype=torch.int32, @@ -131,7 +137,8 @@ def __init__( device=self.device) # OPTIMIZATION: Cache the tensors rather than creating them every step. - self.arange_np = np.arange(max(self.max_num_reqs, self.max_model_len), + self.arange_np = np.arange(max(self.max_num_reqs + 1, + self.max_model_len), dtype=np.int32) # NOTE(woosuk): These tensors are "stateless", i.e., they are literally # a faster version of creating a new tensor every time. Thus, we should @@ -355,6 +362,88 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): self.device, non_blocking=True) slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to( self.device, non_blocking=True).long() + + # Prepare for cascade attention if needed. + common_prefix_len = (scheduler_output.num_common_prefix_blocks * + self.block_size) + if common_prefix_len == 0: + # Common case. + use_cascade = False + else: + # NOTE(woosuk): Cascade attention uses two attention kernels: one + # for the common prefix and the other for the rest. For the first + # kernel, we concatenate all the query tokens (possibly from + # different requests) and treat them as if they are from the same + # request. Then, we use bi-directional attention to process the + # common prefix in the KV cache. Importantly, this means that the + # first kernel does not do any masking. + + # Consider the following example: + # Request 1's input query: [D, E, X] + # Request 1's kv cache: [A, B, C, D, E, X] + # Request 1's num_computed_tokens: 3 (i.e., [A, B, C]) + # Request 2's input query: [E, Y] + # Request 2's kv cache: [A, B, C, D, E, Y] + # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D]) + + # If we use [A, B, C, D, E] as the common prefix, then the + # first kernel will compute the bi-directional attention between + # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E]. + # However, this is wrong because D in Request 1 should not attend to + # E in the common prefix (i.e., we need masking). + # To avoid this, [A, B, C, D] should be the common prefix. + # That is, the common prefix should be capped by the minimum + # num_computed_tokens among the requests, and plus one to include + # the first token of the query. + + # In practice, we use [A, B, C] as the common prefix, instead of + # [A, B, C, D] (i.e., the common prefix is capped by the minimum + # num_computed_tokens, without plus one). + # This is because of an implementation detail: We want to always + # use two kernels for cascade attention. Let's imagine: + # Request 3's input query: [D] + # Request 3's kv cache: [A, B, C, D] + # Request 3's num_computed_tokens: 4 (i.e., [A, B, C, D]) + # If we use [A, B, C, D] as the common prefix for Request 1-3, + # then Request 3 will be processed only by the first kernel, + # and the second kernel will get an empty input. While this is not + # a fundamental problem, our current implementation does not support + # this case. + common_prefix_len = min( + common_prefix_len, + self.input_batch.num_computed_tokens_cpu[:num_reqs].min()) + # common_prefix_len should be a multiple of the block size. + common_prefix_len = (common_prefix_len // self.block_size * + self.block_size) + use_cascade = FlashAttentionBackend.use_cascade_attention( + common_prefix_len=common_prefix_len, + query_lens=num_scheduled_tokens, + num_query_heads=self.num_query_heads, + num_kv_heads=self.num_kv_heads, + use_alibi=False, # FIXME + use_sliding_window=self.sliding_window is not None, + num_sms=self.num_sms, + ) + + if use_cascade: + # TODO: Optimize. + cu_prefix_query_lens = torch.tensor( + [0, total_num_scheduled_tokens], + dtype=torch.int32, + device=self.device) + cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], + dtype=torch.int32, + device=self.device) + cu_suffix_kv_lens = ( + self.seq_start_loc_np[:num_reqs + 1] - + self.arange_np[:num_reqs + 1] * common_prefix_len) + cu_suffix_kv_lens = torch.from_numpy(cu_suffix_kv_lens).to( + self.device) + else: + cu_prefix_query_lens = None + cu_prefix_kv_lens = None + cu_suffix_kv_lens = None + attn_metadata = FlashAttentionMetadata( num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, @@ -363,6 +452,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc=seq_start_loc, block_table=self.input_batch.block_table[:num_reqs], slot_mapping=slot_mapping, + use_cascade=use_cascade, + common_prefix_len=common_prefix_len, + cu_prefix_query_lens=cu_prefix_query_lens, + cu_prefix_kv_lens=cu_prefix_kv_lens, + cu_suffix_kv_lens=cu_suffix_kv_lens, ) # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial # request in the batch. While we should not sample any token from this @@ -550,6 +644,7 @@ def execute_model( # Append the sampled token to the output token ids. token_id = sampled_token_ids[i] self.input_batch.token_ids_cpu[i, seq_len] = token_id + self.input_batch.num_tokens[i] += 1 req_state.output_token_ids.append(token_id) else: # Ignore the sampled token from the partial request. @@ -647,10 +742,23 @@ def profile_run(self) -> None: self.mm_registry.get_max_tokens_per_item_by_modality( self.model_config).values()) - max_num_mm_items = min( + max_num_mm_items_encoder_budget = min( self.max_num_encoder_input_tokens, self.encoder_cache_size) // max_tokens_per_mm_item + max_mm_items_per_req = max( + self.mm_registry.get_mm_limits_per_prompt( + self.model_config).values()) + + # NOTE: We do not consider max_num_batched_tokens on purpose + # because the multimodal embeddings can be generated in advance + # and chunked prefilled. + max_num_mm_items_decoder_budget = self.max_num_reqs * \ + max_mm_items_per_req + + max_num_mm_items = min(max_num_mm_items_encoder_budget, + max_num_mm_items_decoder_budget) + # Dummy data definition in V0 may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index bff01320d7927..4d5d918087be8 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -287,12 +287,11 @@ def profile_run(self) -> None: seq_len, self.mm_registry, is_encoder_data=False) - encoder_dummy_data \ - = self.input_registry.dummy_data_for_profiling( - self.model_config, - seq_len, - self.mm_registry, - is_encoder_data=True) + encoder_dummy_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry, + is_encoder_data=True) # Having more tokens is over-conservative but otherwise fine assert len( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2b545d1b28bd2..637fba23611f4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1136,7 +1136,8 @@ def load_model(self) -> None: self.prompt_adapter_manager.create_prompt_adapter_manager( self.model)) - if self.kv_cache_dtype == "fp8" and current_platform.is_rocm(): + if self.kv_cache_dtype == "fp8" and (current_platform.is_rocm() + or current_platform.is_cuda()): # Currently only ROCm accepts kv-cache scaling factors # via quantization_param_path and this will be deprecated # in the future.