-
Notifications
You must be signed in to change notification settings - Fork 161
feat(BA-1213): Add detection and event notifications for kernel/container mismatches #4252
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
for private_port, host_ports in src["NetworkSettings"]["Ports"].items(): | ||
private_port = int(private_port.split("/")[0]) | ||
if host_ports is None: | ||
host_ip = "127.0.0.1" |
Check notice
Code scanning / devskim
Accessing localhost could indicate debug code, or could hinder scaling. Note
version: int | ||
agent_config: Mapping[str, Any] | ||
resource_spec: KernelResourceSpec | ||
service_ports: Any # TODO: type-annotation |
Check notice
Code scanning / devskim
A "TODO" or similar was left in source code, possibly indicating incomplete functionality Note
async def handle_dangling_kernel( | ||
context: AgentRegistry, source: AgentId, event: DanglingKernelDetected | ||
) -> None: | ||
# TODO: Impl dangling kernel handler |
Check notice
Code scanning / devskim
A "TODO" or similar was left in source code, possibly indicating incomplete functionality Note
async def handle_dangling_container( | ||
context: AgentRegistry, source: AgentId, event: DanglingContainerDetected | ||
) -> None: | ||
# TODO: Impl dangling container handler |
Check notice
Code scanning / devskim
A "TODO" or similar was left in source code, possibly indicating incomplete functionality Note
src/ai/backend/agent/agent.py
Outdated
def _get_probe_runner(self) -> ProbeRunner: | ||
probe = AgentProbe( | ||
self.enumerate_containers, | ||
self.get_kernel_registry, | ||
self.event_producer, | ||
) | ||
return ProbeRunner(11.0, [probe]) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't create a new object inside a getter.
network_id: str, | ||
image: ImageRef, | ||
version: int, | ||
args: KernelInitArgs, | ||
network_driver: str, | ||
*, | ||
agent_config: Mapping[str, Any], | ||
resource_spec: KernelResourceSpec, | ||
service_ports: Any, # TODO: type-annotation | ||
environ: Mapping[str, Any], | ||
data: Dict[str, Any], | ||
) -> None: | ||
super().__init__( | ||
ownership_data, | ||
network_id, | ||
image, | ||
version, | ||
agent_config=agent_config, | ||
resource_spec=resource_spec, | ||
service_ports=service_ports, | ||
data=data, | ||
environ=environ, | ||
) | ||
super().__init__(args) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍
src/ai/backend/agent/dummy/kernel.py
Outdated
def _get_probe_runner(self) -> ProbeRunner: | ||
return ProbeRunner.nop() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rather than having private methods in common, it seems better to have interfaces injected.
src/ai/backend/agent/probe.py
Outdated
class BaseKernelProbe(ABC): | ||
def __init__( | ||
self, | ||
kernel_id: KernelId, | ||
kernel_state_getter: Callable[..., KernelLifecycleStatus], | ||
container_id_getter: Callable[..., Optional[ContainerId]], | ||
event_producer: EventProducer, | ||
) -> None: | ||
self._kernel_id = kernel_id | ||
self._container_id_getter = container_id_getter | ||
self._kernel_state_getter = kernel_state_getter | ||
self._event_producer = event_producer | ||
|
||
@abstractmethod | ||
async def _get_container_info(self) -> Optional[Container]: | ||
raise NotImplementedError | ||
|
||
def _compare_with_container(self, container: Optional[Container]) -> None: | ||
kernel_state = self._kernel_state_getter() | ||
match kernel_state: | ||
case KernelLifecycleStatus.PREPARING: | ||
if container is not None: | ||
# container exists but kernel is hanging in PREPARING state | ||
raise DanglingKernel | ||
case KernelLifecycleStatus.RUNNING: | ||
if container is None or container.status != ContainerStatus.RUNNING: | ||
raise DanglingKernel | ||
case KernelLifecycleStatus.TERMINATING: | ||
# There might be a delay in the container status change | ||
# after the kernel is being terminated. | ||
pass |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If possible, I'd like to keep the implementation out of the ABC.
@abstractmethod | ||
def _init_probe_runner_obj(self) -> ProbeRunner: | ||
raise NotImplementedError |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think a private abstract method is harmful.
agent_config: Mapping[str, Any] | ||
resource_spec: KernelResourceSpec | ||
service_ports: Any # TODO: type-annotation | ||
data: dict[Any, Any] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pin the key type down to a concrete type where possible.
pass | ||
|
||
|
||
class ProbeRunner(Generic[TResourceCtx]): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove the `Probe` prefix. However, `Runner` on its own is too generic a name, so please rename it to something more descriptive.
resolves #4242 (BA-1213)
Checklist: (if applicable)