Skip to content

Commit

Permalink
stash
Browse files: browse the repository at this point in the history
  • Loading branch information
robertgshaw2-redhat committed Jan 3, 2025
1 parent 1c4b92a commit eb9b00b
Showing 1 changed file with 10 additions and 9 deletions.
19 changes: 10 additions & 9 deletions vllm/v1/executor/multiproc_executor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -37,23 +37,18 @@ class MultiprocExecutor(Executor):
def __init__(self, vllm_config: VllmConfig) -> None:

# The child processes will send SIGQUIT when unrecoverable
# errors happen. We kill the process tree here so that the
# stack trace is very evident.
# TODO: rather than killing the main process, we should
# figure out how to raise an AsyncEngineDeadError and
# handle at the API server level so we can return a better
# error code to the clients calling VLLM.

# errors happen.
def sigquit_handler(signum, frame):
logger.fatal(
"MulitprocExecutor got SIGQUIT from worker processes, shutting "
"down. See stack trace above for root cause issue.")
# Propagate error up to parent process.
parent_process = psutil.Process().parent()
parent_process.send_signal(signal.SIGQUIT)
kill_process_tree(os.getpid())
self.shutdown()

signal.signal(signal.SIGQUIT, sigquit_handler)

self.vllm_config = vllm_config
self.parallel_config = vllm_config.parallel_config

Expand Down Expand Up @@ -356,6 +351,7 @@ def signal_handler(signum, frame):
traceback = get_exception_traceback()
logger.error("Worker hit an exception: %s", traceback)
parent_process.send_signal(signal.SIGQUIT)
raise

finally:
# Clean up once worker exits busy loop
Expand Down Expand Up @@ -390,12 +386,17 @@ class ResponseStatus(Enum):

def worker_busy_loop(self):
"""Main busy loop for Multiprocessing Workers"""

i = 0
while True:
method, args, kwargs = self.rpc_broadcast_mq.dequeue()

try:
if i == 10:
raise ValueError("SIMULATE CUDA EXCEPTION")
i += 1
output = getattr(self.worker, method)(*args, **kwargs)
except BaseException as e:
except Exception as e:
self.worker_response_mq.enqueue(
(WorkerProc.ResponseStatus.FAILURE, e))
continue
Expand Down

0 comments on commit eb9b00b

Please sign in to comment.