[Model] Whisper model implementation (vllm-project#11280)
Co-authored-by: Aurick Qiao <aurick.qiao@snowflake.com>
aurickq and sfc-gh-aqiao authored Jan 3, 2025
1 parent fd3a62a commit e1a5c2f
Showing 16 changed files with 1,045 additions and 55 deletions.
2 changes: 2 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -363,12 +363,14 @@ steps:
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
  - tests/models/encoder_decoder/audio_language
  - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
  - pytest -v -s models/embedding/vision_language -m core_model
  - pytest -v -s models/encoder_decoder/audio_language -m core_model
  - pytest -v -s models/encoder_decoder/language -m core_model
  - pytest -v -s models/encoder_decoder/vision_language -m core_model

59 changes: 59 additions & 0 deletions examples/offline_inference_whisper.py
@@ -0,0 +1,59 @@
import time

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

# Create a Whisper encoder/decoder model instance
llm = LLM(
    model="openai/whisper-large-v3",
    max_model_len=448,  # Whisper's decoder context length (max_target_positions)
    max_num_seqs=400,
    limit_mm_per_prompt={"audio": 1},  # at most one audio clip per prompt
    kv_cache_dtype="fp8",  # FP8 KV cache to fit the large batch in memory
)

prompts = [
    {
        "prompt": "<|startoftranscript|>",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    },
    {   # Test explicit encoder/decoder prompt
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                "audio": AudioAsset("winning_call").audio_and_sample_rate,
            },
        },
        "decoder_prompt": "<|startoftranscript|>",
    }
] * 1024

# Create a sampling params object.
sampling_params = SamplingParams(
    temperature=0,
    top_p=1.0,
    max_tokens=200,
)

start = time.time()

# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    encoder_prompt = output.encoder_prompt
    generated_text = output.outputs[0].text
    print(f"Encoder prompt: {encoder_prompt!r}, "
          f"Decoder prompt: {prompt!r}, "
          f"Generated text: {generated_text!r}")

duration = time.time() - start

print("Duration:", duration)
print("RPS:", len(prompts) / duration)
136 changes: 136 additions & 0 deletions tests/models/encoder_decoder/audio_language/test_whisper.py
@@ -0,0 +1,136 @@
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio_language/test_whisper.py`.
"""
from typing import Optional

import pytest

from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset

from ....utils import fork_new_process_for_each_test, multi_gpu_test

PROMPTS = [
    {
        "prompt":
        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
        "multi_modal_data": {
            "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
        },
    },
    {   # Test explicit encoder/decoder prompt
        "encoder_prompt": {
            "prompt": "",
            "multi_modal_data": {
                "audio": AudioAsset("winning_call").audio_and_sample_rate,
            },
        },
        "decoder_prompt":
        "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
    }
]

EXPECTED = {
    "openai/whisper-tiny": [
        " He has birth words I spoke in the original corner of that. And a"
        " little piece of black coat poetry. Mary had a little sandwich,"
        " sweet, with white and snow. And everyone had it very went the last"
        " would sure to go.",
        " >> And the old one, fit John the way to Edgar Martinez. >> One more"
        " to line down the field line for our base camp. Here comes joy. Here"
        " is June and the third base. They're going to wave him in. The throw"
        " to the plate will be late. The Mariners are going to play for the"
        " American League Championship. I don't believe it. It just continues"
        " by all five."
    ],
    "openai/whisper-small": [
        " The first words I spoke in the original pornograph. A little piece"
        " of practical poetry. Mary had a little lamb, its fleece was quite a"
        " slow, and everywhere that Mary went the lamb was sure to go.",
        " And the old one pitch on the way to Edgar Martinez one month. Here"
        " comes joy. Here is Junior to third base. They're gonna wave him"
        " in. The throw to the plate will be late. The Mariners are going to"
        " play for the American League Championship. I don't believe it. It"
        " just continues. My, oh my."
    ],
    "openai/whisper-medium": [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its fleece was quite as"
        " slow, and everywhere that Mary went the lamb was sure to go.",
        " And the 0-1 pitch on the way to Edgar Martinez swung on the line"
        " down the left field line for Obeyshev. Here comes Joy. Here is"
        " Jorgen at third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh"
        " my."
    ],
    "openai/whisper-large-v3": [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its feet were quite as"
        " slow, and everywhere that Mary went, the lamb was sure to go.",
        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line."
        " Now the left field line for a base hit. Here comes Joy. Here is"
        " Junior to third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh,"
        " my."
    ],
    "openai/whisper-large-v3-turbo": [
        " The first words I spoke in the original phonograph, a little piece"
        " of practical poetry. Mary had a little lamb, its streets were quite"
        " as slow, and everywhere that Mary went the lamb was sure to go.",
        " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line"
        " down the left field line for a base hit. Here comes Joy. Here is"
        " Junior to third base. They're going to wave him in. The throw to the"
        " plate will be late. The Mariners are going to play for the American"
        " League Championship. I don't believe it. It just continues. My, oh,"
        " my."
    ]
}


def run_test(
    model: str,
    *,
    tensor_parallel_size: int,
    distributed_executor_backend: Optional[str] = None,
) -> None:
    prompt_list = PROMPTS * 10
    expected_list = EXPECTED[model] * 10

    llm = LLM(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        distributed_executor_backend=distributed_executor_backend,
    )

    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=200,
    )

    outputs = llm.generate(prompt_list, sampling_params)

    for output, expected in zip(outputs, expected_list):
        print(output.outputs[0].text)
        assert output.outputs[0].text == expected


@fork_new_process_for_each_test
@pytest.mark.core_model
@pytest.mark.parametrize(
    "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
def test_models(model) -> None:
    run_test(model, tensor_parallel_size=1)


@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_models_distributed(model, distributed_executor_backend) -> None:
    run_test(model,
             tensor_parallel_size=2,
             distributed_executor_backend=distributed_executor_backend)
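
The single-GPU suite can be reproduced locally with the same style of pytest invocation the CI pipeline uses, run from the tests/ directory, for example:

pytest -v -s models/encoder_decoder/audio_language/test_whisper.py -m core_model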
1 change: 1 addition & 0 deletions tests/models/registry.py
@@ -204,6 +204,7 @@ class _HfExamplesInfo:
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"),
# [Encoder-decoder]
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
}

_SPECULATIVE_DECODING_EXAMPLE_MODELS = {
2 changes: 2 additions & 0 deletions vllm/config.py
@@ -2312,6 +2312,8 @@ def _get_and_verify_max_len(
"seq_length",
# Command-R
"model_max_length",
# Whisper
"max_target_positions",
# Others
"max_sequence_length",
"max_seq_length",
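
As a sanity check on the new lookup key, Whisper's Hugging Face config exposes the decoder context length directly. A minimal sketch using transformers; the value matches the max_model_len=448 used in the example script above:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("openai/whisper-large-v3")
# Whisper reports its decoder context length as max_target_positions,
# which _get_and_verify_max_len now recognizes.
print(config.max_target_positions)  # 448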
36 changes: 28 additions & 8 deletions vllm/inputs/preprocess.py
@@ -184,10 +184,16 @@ def _tokenize_prompt(
        corresponding token IDs.
        """
        tokenizer = self.get_tokenizer_group()

        add_special_tokens = None
        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user
            # based on the task and language of their request. Disabling
            # them also avoids appending an EOS token to the prompt, which
            # would disrupt generation.
            add_special_tokens = False
        return tokenizer.encode(request_id=request_id,
                                prompt=prompt,
                                lora_request=lora_request,
                                add_special_tokens=add_special_tokens)

    async def _tokenize_prompt_async(
        self,
@@ -197,10 +203,17 @@ async def _tokenize_prompt_async(
    ) -> List[int]:
        """Async version of :meth:`_tokenize_prompt`."""
        tokenizer = self.get_tokenizer_group()

        add_special_tokens = None
        if self.model_config.hf_config.model_type == "whisper":
            # For Whisper, special tokens should be provided by the user
            # based on the task and language of their request. Disabling
            # them also avoids appending an EOS token to the prompt, which
            # would disrupt generation.
            add_special_tokens = False
        return await tokenizer.encode_async(
            request_id=request_id,
            prompt=prompt,
            lora_request=lora_request,
            add_special_tokens=add_special_tokens)

    def _can_process_multimodal(self) -> bool:
        model_config = self.model_config
@@ -439,8 +452,15 @@ def _build_enc_dec_llm_inputs(
        assert_never(encoder_inputs)  # type: ignore[arg-type]

        if decoder_inputs is None:
            if self.model_config.hf_config.model_type == "whisper":
                # For Whisper models, the text prompt should go to the
                # decoder. If no explicit encoder/decoder inputs are given,
                # copy the prompt from the encoder to the decoder. The
                # encoder tokens are later overridden by the audio features.
                dec_token_ids = encoder_inputs["prompt_token_ids"].copy()
            else:
                dec_token_ids = self._prepare_decoder_input_ids_for_generation(
                    None)
            decoder_inputs = token_inputs(dec_token_ids)
        elif (decoder_inputs["type"] == "token"
              or decoder_inputs["type"] == "multimodal"):
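
To see why add_special_tokens must be disabled for Whisper, compare the two encodings with the Hugging Face tokenizer. A minimal sketch; the exact prefix tokens printed depend on the tokenizer's language/task settings:

from transformers import WhisperTokenizer

tok = WhisperTokenizer.from_pretrained("openai/whisper-large-v3")
prompt = "<|startoftranscript|>"

# Default encoding wraps the text in the tokenizer's forced prefix and
# appends <|endoftext|>, which would end generation immediately.
print(tok.convert_ids_to_tokens(tok.encode(prompt)))

# With add_special_tokens=False, the user-supplied control tokens pass
# through unchanged, which is the behavior the preprocessor now selects.
print(tok.convert_ids_to_tokens(tok.encode(prompt, add_special_tokens=False)))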
1 change: 1 addition & 0 deletions vllm/model_executor/models/registry.py
@@ -170,6 +170,7 @@
"UltravoxModel": ("ultravox", "UltravoxModel"),
# [Encoder-decoder]
"MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501
"WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501
}

_SPECULATIVE_DECODING_MODELS = {
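
A quick way to confirm the registration from user code, assuming vLLM's public ModelRegistry helper:

from vllm import ModelRegistry

# The new architecture should now appear among the supported ones.
assert "WhisperForConditionalGeneration" in ModelRegistry.get_supported_archs()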
(Diffs for the remaining changed files, including the Whisper model implementation itself, are not shown here.)
