Commit 0246246

[CI] Prune tests/models/decoder_only/language/* tests (vllm-project#9940)

Signed-off-by: mgoin <michael@neuralmagic.com>
mgoin authored Nov 5, 2024
1 parent b9c64c0 commit 0246246
Showing 9 changed files with 70 additions and 270 deletions.
3 changes: 1 addition & 2 deletions .buildkite/test-pipeline.yaml
@@ -321,15 +321,14 @@ steps:
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language/test_models.py
- pytest -v -s models/decoder_only/language/test_big_models.py

- label: Decoder-only Language Models Test (Extended) # 1h20min
nightly: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
- pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py

- label: Decoder-only Multi-Modal Models Test (Standard)
#mirror_hardwares: [amd]
93 changes: 0 additions & 93 deletions tests/models/decoder_only/language/test_big_models.py

This file was deleted.
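
The deleted test_big_models.py appears to have covered larger (multi-billion-parameter) checkpoints; after this commit the Standard step relies on test_models.py alone, which follows the usual HF-vs-vLLM logprob comparison used across this directory. Below is a minimal sketch of that pattern, assuming vLLM's hf_runner/vllm_runner pytest fixtures and the check_logprobs_close helper from tests/models/utils.py; the model and prompts are placeholders, not the suite's actual lists.

```python
import pytest

# check_logprobs_close lives in vLLM's tests/models/utils.py; the relative
# import assumes this sketch sits in tests/models/decoder_only/language/.
from ...utils import check_logprobs_close

# Illustrative model and prompts; the real lists live in test_models.py.
MODELS = ["facebook/opt-125m"]
PROMPTS = ["Hello, my name is", "The capital of France is"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, model, dtype, max_tokens, num_logprobs):
    # Run the same prompts greedily through HuggingFace transformers...
    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            PROMPTS, max_tokens, num_logprobs)

    # ...and through vLLM, then require the top-k logprobs to stay close.
    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            PROMPTS, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
```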

10 changes: 5 additions & 5 deletions tests/models/decoder_only/language/test_fp8.py
@@ -21,19 +21,19 @@
"kv_cache_dtype,base_model,test_model,scale_path",
[
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
"nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Meta-Llama-3-8B-Instruct", None),
("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
"meta-llama/Llama-3.2-1B-Instruct", None),
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
"meta-llama/Llama-2-7b-chat-hf",
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("enforce_eager", [False, True])
@pytest.mark.parametrize("enforce_eager", [True])
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
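
The FP8 KV-cache cases now target the 1B Llama-3.2 checkpoints instead of the 8B Llama-3 ones, and eager mode is the only enforce_eager setting that still runs. Outside the harness, the same configurations map onto the public LLM entry point roughly as follows; this is a sketch, and the engine argument that feeds kv_cache_scales.json (the scale_path case) is deliberately omitted because its name varies across vLLM versions.

```python
from vllm import LLM, SamplingParams

# FP16/BF16 weights with an fp8_e5m2 KV cache (no scaling factors required).
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    kv_cache_dtype="fp8_e5m2",
    enforce_eager=True,  # mirrors the pruned parametrization above
    max_model_len=1024,
)

# FP8 checkpoint that ships fp8_e4m3 KV-cache scaling factors:
# llm = LLM(model="nm-testing/Llama-3.2-1B-Instruct-FP8-KV",
#           kv_cache_dtype="fp8_e4m3", enforce_eager=True)

params = SamplingParams(max_tokens=4, temperature=0.0, logprobs=5)
out = llm.generate(["Hello, my name is"], params)
print(out[0].outputs[0].text)
```
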
13 changes: 0 additions & 13 deletions tests/models/decoder_only/language/test_gptq_marlin.py
@@ -22,24 +22,11 @@
MAX_MODEL_LEN = 1024

MODELS = [
# act_order==False, group_size=channelwise
("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
# act_order==False, group_size=128
("TheBloke/Llama-2-7B-GPTQ", "main"),

# act_order==True, group_size=128
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
# act_order==True, group_size=64
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
# act_order==True, group_size=32
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),

# 8-bit, act_order==True, group_size=channelwise
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
# 8-bit, act_order==True, group_size=128
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
# 8-bit, act_order==True, group_size=32
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),

# 4-bit, act_order==True, group_size=128
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
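
The pruned MODELS list drops most of its (repo, revision) pairs; each remaining entry names a GPTQ checkpoint plus the branch holding a particular group_size/act_order export. A rough sketch of loading one surviving entry through the public API, assuming the standard revision and quantization arguments behave here as they do elsewhere in vLLM (the test itself drives this through its own fixtures):

```python
from vllm import LLM, SamplingParams

# One of the TinyLlama revisions kept by this commit.
model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
revision = "gptq-8bit-128g-actorder_True"

# Ask vLLM to use the optimized Marlin kernel for this GPTQ checkpoint.
llm = LLM(
    model=model_id,
    revision=revision,
    quantization="gptq_marlin",
    dtype="half",
    max_model_len=1024,
)

# The test compares against the reference GPTQ kernel as well; forcing it
# looks like this (run in a separate process to avoid holding two engines):
# llm = LLM(model=model_id, revision=revision, quantization="gptq",
#           dtype="half", max_model_len=1024)

params = SamplingParams(max_tokens=16, temperature=0.0)
print(llm.generate(["The capital of France is"], params)[0].outputs[0].text)
```
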
12 changes: 6 additions & 6 deletions tests/models/decoder_only/language/test_gptq_marlin_24.py
@@ -25,16 +25,16 @@ class ModelPair:
# 4-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
# 4-bit, group_size == channelwise
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
# # 4-bit, group_size == channelwise
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
# model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),

# 8-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
# 8-bit, group_size == channelwise
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
# # 8-bit, group_size == channelwise
# ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
# model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
]


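
Only the group_size == 128 pairs stay active here; the channelwise pairs are commented out rather than deleted, so they can be re-enabled later. A short sketch of the data the test is parametrized over, with the ModelPair container reconstructed from the field names visible in the hunk (the dataclass definition itself is assumed):

```python
from dataclasses import dataclass


@dataclass
class ModelPair:
    model_marlin: str
    model_gptq: str


# Pairs that remain active after this commit (group_size == 128 only).
MODEL_PAIRS = [
    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
]

# Each pair is loaded twice (sparse Marlin 2:4 kernel vs. reference GPTQ) and
# the greedy logprobs are compared, following the same check_logprobs_close
# pattern sketched above for test_models.py.
```
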
69 changes: 0 additions & 69 deletions tests/models/decoder_only/language/test_marlin.py

This file was deleted.

37 changes: 21 additions & 16 deletions tests/models/decoder_only/language/test_mistral.py
@@ -4,7 +4,7 @@
"""
import pytest

from vllm import LLM, SamplingParams
from vllm import SamplingParams

from ...utils import check_logprobs_close

@@ -15,6 +15,10 @@
# "mistralai/Mistral-Nemo-Instruct-2407"
]

MISTRAL_FORMAT_MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3",
]

SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
SYMBOLIC_LANG_PROMPTS = [
"勇敢な船乗りについての詩を書く", # japanese
@@ -95,7 +99,7 @@ def test_models(
)


@pytest.mark.parametrize("model", MODELS[1:])
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
@@ -135,28 +139,29 @@ def test_mistral_format(
)


@pytest.mark.parametrize("model", MODELS[1:])
@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
def test_mistral_symbolic_languages(
vllm_runner,
model: str,
dtype: str,
prompt: str,
) -> None:
prompt = "hi"
msg = {"role": "user", "content": prompt}
llm = LLM(model=model,
dtype=dtype,
max_model_len=8192,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral")
outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
assert "�" not in outputs[0].outputs[0].text.strip()
with vllm_runner(model,
dtype=dtype,
max_model_len=8192,
tokenizer_mode="mistral",
config_format="mistral",
load_format="mistral") as vllm_model:
for prompt in SYMBOLIC_LANG_PROMPTS:
msg = {"role": "user", "content": prompt}
outputs = vllm_model.model.chat([msg],
sampling_params=SAMPLING_PARAMS)
assert "�" not in outputs[0].outputs[0].text.strip()


@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("model", MODELS[1:]) # v1 can't do func calling
@pytest.mark.parametrize("model",
MISTRAL_FORMAT_MODELS) # v1 can't do func calling
def test_mistral_function_calling(
vllm_runner,
model: str,
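
The symbolic-language test no longer constructs an LLM by hand: it now goes through the shared vllm_runner fixture, a context manager that tears the engine down and frees GPU memory between parametrized runs, and it loops over SYMBOLIC_LANG_PROMPTS internally instead of parametrizing on the prompt. For reference, the direct-API usage the old version exercised looks roughly like this (a sketch assembled from the removed lines, not part of the suite):

```python
from vllm import LLM, SamplingParams

SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)

# Load a Mistral-format checkpoint with the mistral tokenizer, config and weights.
llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    dtype="bfloat16",
    max_model_len=8192,
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
)

msg = {"role": "user", "content": "勇敢な船乗りについての詩を書く"}  # "write a poem about a brave sailor"
outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)

# The test asserts that decoding a non-Latin-script prompt never produces the
# Unicode replacement character.
assert "�" not in outputs[0].outputs[0].text.strip()
```
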
