Skip to content

Commit

Permalink
stash
Browse files Browse the repository at this point in the history
  • Loading branch information
robertgshaw2-redhat committed Jan 3, 2025
1 parent dcfd3b8 commit 6e0e0d4
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 11 deletions.
20 changes: 9 additions & 11 deletions vllm/v1/executor/multiproc_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import signal
import sys
import time
import weakref
from dataclasses import dataclass
from enum import Enum, auto
from multiprocessing.process import BaseProcess
Expand All @@ -19,9 +20,8 @@
from vllm.executor.multiproc_worker_utils import (
_add_prefix, set_multiprocessing_worker_envs)
from vllm.logger import init_logger
from vllm.utils import (get_distributed_init_method, get_exception_traceback,
get_mp_context, get_open_port, get_open_zmq_ipc_path,
kill_process_tree, zmq_socket_ctx)
from vllm.utils import (get_distributed_init_method, get_mp_context,
get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx)
from vllm.v1.executor.abstract import Executor
from vllm.v1.outputs import ModelRunnerOutput
from vllm.worker.worker_base import WorkerWrapperBase
Expand All @@ -35,6 +35,9 @@
class MultiprocExecutor(Executor):

def __init__(self, vllm_config: VllmConfig) -> None:
# Call self.shutdown at exit to clean up
# and ensure workers will be terminated.
self._finalizer = weakref.finalize(self, self.shutdown)

# The child processes will send SIGQUIT when unrecoverable
# errors happen.
Expand Down Expand Up @@ -344,15 +347,12 @@ def signal_handler(signum, frame):
worker.worker_busy_loop()

except SystemExit:
# worker_busy_loop sends exceptions to Executor and raises
# SystemExit.
shutdown_requested = True
logger.debug("Worker interrupted.")

except Exception:
# worker_busy_loop sends exceptions to Executor
# for shutdown, but if there is an error in startup or an
# error with IPC
# error with IPC itself, we need to alert the parent.
# itself, we need to alert the parent so we can shut down.
psutil.Process().parent().send_signal(signal.SIGQUIT)
raise
Expand Down Expand Up @@ -390,18 +390,16 @@ class ResponseStatus(Enum):

def worker_busy_loop(self):
"""Main busy loop for Multiprocessing Workers"""

while True:
method, args, kwargs = self.rpc_broadcast_mq.dequeue()

try:
if self.rank == 0:
raise ValueError("SIMULATE CUDA ERROR")
output = getattr(self.worker, method)(*args, **kwargs)
except Exception as e:
self.worker_response_mq.enqueue(
(WorkerProc.ResponseStatus.FAILURE, e))
traceback = get_exception_traceback()
logger.error("WorkerProc hit an exception: %s", traceback)
logger.exception("WorkerProc hit an exception: %s", exc_info=e)
continue

self.worker_response_mq.enqueue(
Expand Down
6 changes: 6 additions & 0 deletions vllm/v1/worker/gpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ def __init__(
distributed_init_method: str,
):

self.i = 0

# TODO: use WorkerBase.__init__(self, vllm_config=vllm_config)
self.vllm_config = vllm_config
self.model_config = vllm_config.model_config
Expand Down Expand Up @@ -201,6 +203,10 @@ def execute_model(
self,
scheduler_output: "SchedulerOutput",
) -> ModelRunnerOutput:
if self.rank == 0 and self.i == 10:
raise ValueError("ERROR FROM HERE :)")
self.i += 1

output = self.model_runner.execute_model(scheduler_output)
return output if self.rank == 0 else None

Expand Down

0 comments on commit 6e0e0d4

Please sign in to comment.