diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index abaf807335ef2..9d9d03b0228ee 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -37,13 +37,7 @@ class MultiprocExecutor(Executor):
 
     def __init__(self, vllm_config: VllmConfig) -> None:
         # The child processes will send SIGQUIT when unrecoverable
-        # errors happen. We kill the process tree here so that the
-        # stack trace is very evident.
-        # TODO: rather than killing the main process, we should
-        # figure out how to raise an AsyncEngineDeadError and
-        # handle at the API server level so we can return a better
-        # error code to the clients calling VLLM.
-
+        # errors happen.
         def sigquit_handler(signum, frame):
             logger.fatal(
                 "MulitprocExecutor got SIGQUIT from worker processes, shutting "
@@ -51,9 +45,10 @@ def sigquit_handler(signum, frame):
             # Propagate error up to parent process.
             parent_process = psutil.Process().parent()
             parent_process.send_signal(signal.SIGQUIT)
-            kill_process_tree(os.getpid())
+            self.shutdown()
 
         signal.signal(signal.SIGQUIT, sigquit_handler)
+
         self.vllm_config = vllm_config
         self.parallel_config = vllm_config.parallel_config
 
@@ -356,6 +351,7 @@ def signal_handler(signum, frame):
             traceback = get_exception_traceback()
             logger.error("Worker hit an exception: %s", traceback)
             parent_process.send_signal(signal.SIGQUIT)
+            raise
 
         finally:
             # Clean up once worker exits busy loop
@@ -390,12 +386,17 @@ class ResponseStatus(Enum):
 
     def worker_busy_loop(self):
         """Main busy loop for Multiprocessing Workers"""
+
+        i = 0
         while True:
             method, args, kwargs = self.rpc_broadcast_mq.dequeue()
 
             try:
+                if i == 10:
+                    raise ValueError("SIMULATE CUDA EXCEPTION")
+                i += 1
                 output = getattr(self.worker, method)(*args, **kwargs)
-            except BaseException as e:
+            except Exception as e:
                 self.worker_response_mq.enqueue(
                     (WorkerProc.ResponseStatus.FAILURE, e))
                 continue