From 0e39a33c6d6dd25687db83c410b6210201880bdd Mon Sep 17 00:00:00 2001
From: Gordon Wong
Date: Sat, 17 Aug 2024 00:05:18 +0800
Subject: [PATCH] [Bugfix][Hardware][AMD][Frontend] add quantization param to embedding checking method (#7513)

---
 vllm/entrypoints/openai/api_server.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index ad5ba4a93ba63..a641dcc24aaae 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -60,11 +60,13 @@ _running_tasks: Set[asyncio.Task] = set()
 
 
-def model_is_embedding(model_name: str, trust_remote_code: bool) -> bool:
+def model_is_embedding(model_name: str, trust_remote_code: bool,
+                       quantization: str) -> bool:
     return ModelConfig(model=model_name,
                        tokenizer=model_name,
                        tokenizer_mode="auto",
                        trust_remote_code=trust_remote_code,
+                       quantization=quantization,
                        seed=0,
                        dtype="auto").embedding_mode
 
 
@@ -97,7 +99,8 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]:
 
     # If manually triggered or embedding model, use AsyncLLMEngine in process.
     # TODO: support embedding model via RPC.
-    if (model_is_embedding(args.model, args.trust_remote_code)
+    if (model_is_embedding(args.model, args.trust_remote_code,
+                           args.quantization)
             or args.disable_frontend_multiprocessing):
         async_engine_client = AsyncLLMEngine.from_engine_args(
             engine_args, usage_context=UsageContext.OPENAI_API_SERVER)