From 0d8451c3a45d309e58de5e1c546f043de461d478 Mon Sep 17 00:00:00 2001 From: Jiaxin Shan Date: Fri, 13 Dec 2024 12:17:37 -0800 Subject: [PATCH] [Distributed] Allow the placement group more time to wait for resources to be ready (#11138) Signed-off-by: Jiaxin Shan --- vllm/executor/ray_utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 4f28efd639084..426aa1b5c728f 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -277,10 +277,14 @@ def initialize_ray_cluster( f"Total number of devices: {device_bundles}.") else: num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) + # Log a warning and defer the resource allocation failure response. + # Avoid immediate rejection so a user-initiated placement group can + # be created and the cluster has time to become ready. if parallel_config.world_size > num_devices_in_cluster: - raise ValueError( - f"The number of required {device_str}s exceeds the total " - f"number of available {device_str}s in the placement group.") + logger.warning( + "The number of required %ss exceeds the total " + "number of available %ss in the placement group.", device_str, + device_str) # Create a new placement group placement_group_specs: List[Dict[str, float]] = ([{ device_str: 1.0