Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

♻️ Containers are also removed via agent when the dynamic-sidecar is stopped (⚠️ devops) #6924

Merged
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
dc1bd4f
added force container cleanup step when stopping services
Dec 9, 2024
e8c1aaa
fixed imports
Dec 9, 2024
23825e8
fixed imports
Dec 9, 2024
66bd31f
Merge remote-tracking branch 'upstream/master' into pr-osparc-orphane…
Dec 9, 2024
b15b4a0
fixed broken tests
Dec 9, 2024
2e7c6cd
display removed containers
Dec 9, 2024
9686a88
Merge remote-tracking branch 'upstream/master' into pr-osparc-orphane…
Dec 9, 2024
f21f43a
fixed failing test
Dec 9, 2024
e68afac
refactor test
Dec 9, 2024
4424d0f
Merge remote-tracking branch 'upstream/master' into pr-osparc-orphane…
Dec 10, 2024
83d9286
Merge remote-tracking branch 'upstream/master' into pr-osparc-orphane…
Dec 11, 2024
e41d9c5
adding message that is always visible if orphans are detected
Dec 11, 2024
688f10d
added validator for docker_node_id
Dec 13, 2024
2354e68
Merge remote-tracking branch 'upstream/master' into pr-osparc-orphane…
Dec 13, 2024
d426f7c
refactor
Dec 13, 2024
aa239cf
update specs
Dec 13, 2024
b71b054
refactor
Dec 16, 2024
f9ddcd0
Merge remote-tracking branch 'upstream/master' into pr-osparc-orphane…
Dec 16, 2024
039f781
Merge branch 'master' into pr-osparc-orphaned-containers-removal
GitHK Dec 16, 2024
2d5cd71
Merge remote-tracking branch 'upstream/master' into pr-osparc-orphane…
Dec 16, 2024
d9bbfa5
fixed regex
Dec 16, 2024
a950fc3
Merge branch 'pr-osparc-orphaned-containers-removal' of github.com:Gi…
Dec 16, 2024
45e9d2e
fixed openapispecs
Dec 17, 2024
e452ef4
Merge remote-tracking branch 'upstream/master' into pr-osparc-orphane…
Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,7 @@ class ServiceExtras(BaseModel):


# number of characters a volume name consumes before the directory-name suffix starts
CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME: Final[NonNegativeInt] = 89


# prefixes used when naming the docker services/containers spawned for a dynamic service
DYNAMIC_SIDECAR_SERVICE_PREFIX: Final[str] = "dy-sidecar"
DYNAMIC_PROXY_SERVICE_PREFIX: Final[str] = "dy-proxy"
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import logging
from datetime import timedelta
from typing import Final

from models_library.projects_nodes_io import NodeID
from models_library.rabbitmq_basic_types import RPCMethodName, RPCNamespace
from pydantic import NonNegativeInt, TypeAdapter
from servicelib.logging_utils import log_decorator
from servicelib.rabbitmq import RabbitMQRPCClient

_logger = logging.getLogger(__name__)

_REQUEST_TIMEOUT: Final[NonNegativeInt] = int(timedelta(minutes=60).total_seconds())


@log_decorator(_logger, level=logging.DEBUG)
async def force_container_cleanup(
    rabbitmq_rpc_client: RabbitMQRPCClient,
    *,
    docker_node_id: str,
    swarm_stack_name: str,
    node_id: NodeID,
) -> None:
    """Ask the agent running on ``docker_node_id`` to force-remove all
    containers belonging to ``node_id``.

    The call blocks until the agent finishes (or the timeout expires).
    """
    # the namespace routes the request to the agent instance on that node
    namespace = RPCNamespace.from_entries(
        {
            "service": "agent",
            "docker_node_id": docker_node_id,
            "swarm_stack_name": swarm_stack_name,
        }
    )
    method_name = TypeAdapter(RPCMethodName).validate_python(
        "force_container_cleanup"
    )
    result = await rabbitmq_rpc_client.request(
        namespace,
        method_name,
        node_id=node_id,
        timeout_s=_REQUEST_TIMEOUT,
    )
    assert result is None  # nosec
20 changes: 20 additions & 0 deletions services/agent/src/simcore_service_agent/api/rpc/_containers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import logging

from fastapi import FastAPI
from models_library.projects_nodes_io import NodeID
from servicelib.logging_utils import log_context
from servicelib.rabbitmq import RPCRouter

from ...services.containers_manager import ContainersManager

_logger = logging.getLogger(__name__)

router = RPCRouter()


@router.expose()
async def force_container_cleanup(app: FastAPI, *, node_id: NodeID) -> None:
    """RPC endpoint: force-remove every container on this node that belongs
    to the dynamic service identified by ``node_id``."""
    with log_context(
        # fixed grammar: multiple containers may be removed
        _logger, logging.INFO, f"removing all orphan containers for {node_id=}"
    ):
        await ContainersManager.get_from_app_state(app).force_container_cleanup(node_id)
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from servicelib.rabbitmq.rpc_interfaces.agent.errors import (
NoServiceVolumesFoundRPCError,
)
from simcore_service_agent.services.volumes_manager import VolumesManager

from ...services.volumes_manager import VolumesManager

_logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
from simcore_service_agent.core.settings import ApplicationSettings

from ...services.rabbitmq import get_rabbitmq_rpc_server
from . import _volumes
from . import _containers, _volumes

# all RPC routers exposed by the agent service; registration order is not significant
ROUTERS: list[RPCRouter] = [
    _containers.router,
    _volumes.router,
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
)
from ..api.rest.routes import setup_rest_api
from ..api.rpc.routes import setup_rpc_api_routes
from ..services.containers_manager import setup_containers_manager
from ..services.instrumentation import setup_instrumentation
from ..services.rabbitmq import setup_rabbitmq
from ..services.volumes_manager import setup_volume_manager
Expand Down Expand Up @@ -58,6 +59,7 @@ def create_app() -> FastAPI:

setup_rabbitmq(app)
setup_volume_manager(app)
setup_containers_manager(app)
setup_rest_api(app)
setup_rpc_api_routes(app)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import logging
from dataclasses import dataclass, field

from aiodocker import Docker
from fastapi import FastAPI
from models_library.api_schemas_directorv2.services import (
DYNAMIC_PROXY_SERVICE_PREFIX,
DYNAMIC_SIDECAR_SERVICE_PREFIX,
)
from models_library.projects_nodes_io import NodeID
from servicelib.fastapi.app_state import SingletonInAppStateMixin
from servicelib.utils import limited_gather

from .docker_utils import get_containers_with_prefixes, remove_container_forcefully

_logger = logging.getLogger(__name__)


@dataclass
class ContainersManager(SingletonInAppStateMixin):
    """Removes leftover (orphan) containers belonging to a dynamic service node."""

    app_state_name: str = "containers_manager"

    docker: Docker = field(default_factory=Docker)

    async def force_container_cleanup(self, node_id: NodeID) -> None:
        """Find and force-remove every container whose name matches one of the
        naming schemes used for this node's dynamic service."""
        # every container-name prefix a dynamic service can use for this node
        candidate_prefixes = {
            f"{DYNAMIC_PROXY_SERVICE_PREFIX}_{node_id}",
            f"{DYNAMIC_SIDECAR_SERVICE_PREFIX}_{node_id}",
            f"{DYNAMIC_SIDECAR_SERVICE_PREFIX}-{node_id}",
        }

        detected_orphans = await get_containers_with_prefixes(
            self.docker, candidate_prefixes
        )
        _logger.info(
            "Orphan containers for node_id='%s': %s", node_id, detected_orphans
        )

        # remove all detected containers with bounded concurrency
        await limited_gather(
            *(
                remove_container_forcefully(self.docker, container_name)
                for container_name in detected_orphans
            ),
        )

    async def shutdown(self) -> None:
        # the Docker client holds an HTTP session that must be released
        await self.docker.close()


def get_containers_manager(app: FastAPI) -> ContainersManager:
    """Return the app-wide ContainersManager stored on the application state."""
    manager: ContainersManager = ContainersManager.get_from_app_state(app)
    return manager


def setup_containers_manager(app: FastAPI) -> None:
    """Wire ContainersManager creation and teardown into the app lifecycle."""

    async def _create_manager() -> None:
        # instantiate lazily at startup so the Docker client is created in-loop
        ContainersManager().set_to_app_state(app)

    async def _dispose_manager() -> None:
        await ContainersManager.get_from_app_state(app).shutdown()

    app.add_event_handler("startup", _create_manager)
    app.add_event_handler("shutdown", _dispose_manager)
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,27 @@ async def remove_volume(
get_instrumentation(app).agent_metrics.remove_volumes(
settings.AGENT_DOCKER_NODE_ID
)


async def get_containers_with_prefixes(docker: Docker, prefixes: set[str]) -> set[str]:
    """Returns a set of container names matching any of the given prefixes

    Inspects every container on the node (including stopped/created ones) and
    keeps those whose name starts with one of ``prefixes``.
    """
    all_containers = await docker.containers.list(all=True)

    # str.startswith accepts a tuple of options: hoist the conversion out of
    # the loop so each name is checked with a single C-level call instead of
    # a Python-level any(...) over all prefixes
    prefixes_tuple = tuple(prefixes)

    result: set[str] = set()
    for container in all_containers:
        container_info = await container.show()
        # docker reports names with a leading "/"
        container_name = container_info.get("Name", "").lstrip("/")
        if container_name.startswith(prefixes_tuple):
            result.add(container_name)

    return result


async def remove_container_forcefully(docker: Docker, container_id: str) -> None:
    """Removes a container regardless of its state (created, running or stopped)"""
    try:
        container = await docker.containers.get(container_id)
        await container.delete(force=True)
    except DockerError as e:
        # a missing container means the cleanup goal is already achieved
        if e.status != status.HTTP_404_NOT_FOUND:
            raise
54 changes: 54 additions & 0 deletions services/agent/tests/unit/test_api_rpc__containers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# pylint:disable=redefined-outer-name
# pylint:disable=unused-argument

from collections.abc import Awaitable, Callable
from unittest.mock import AsyncMock

import pytest
import pytest_mock
from faker import Faker
from fastapi import FastAPI
from models_library.projects_nodes_io import NodeID
from servicelib.rabbitmq import RabbitMQRPCClient
from servicelib.rabbitmq.rpc_interfaces.agent import containers

# pytest-simcore: external services (docker-compose) required by these tests
pytest_simcore_core_services_selection = [
    "rabbit",
]


@pytest.fixture
def node_id(faker: Faker) -> NodeID:
    """A random node identifier (UUID object, not string)."""
    a_node_id: NodeID = faker.uuid4(cast_to=None)
    return a_node_id


@pytest.fixture
async def rpc_client(
    initialized_app: FastAPI,
    rabbitmq_rpc_client: Callable[[str], Awaitable[RabbitMQRPCClient]],
) -> RabbitMQRPCClient:
    """RPC client connected to the same broker as the initialized app."""
    client = await rabbitmq_rpc_client("client")
    return client


@pytest.fixture
def mocked_force_container_cleanup(mocker: pytest_mock.MockerFixture) -> AsyncMock:
    """Patches the manager's cleanup so no real docker calls are made."""
    patch_target = (
        "simcore_service_agent.services.containers_manager"
        ".ContainersManager.force_container_cleanup"
    )
    return mocker.patch(patch_target)


async def test_force_container_cleanup(
    rpc_client: RabbitMQRPCClient,
    swarm_stack_name: str,
    docker_node_id: str,
    node_id: NodeID,
    mocked_force_container_cleanup: AsyncMock,
):
    """The RPC call reaches the (mocked) ContainersManager exactly once."""
    mocked_force_container_cleanup.assert_not_called()

    await containers.force_container_cleanup(
        rpc_client,
        docker_node_id=docker_node_id,
        swarm_stack_name=swarm_stack_name,
        node_id=node_id,
    )

    mocked_force_container_cleanup.assert_called_once()
107 changes: 107 additions & 0 deletions services/agent/tests/unit/test_services_containers_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# pylint: disable=redefined-outer-name


import logging
from collections.abc import AsyncIterable, Awaitable, Callable
from enum import Enum

import pytest
from aiodocker import Docker, DockerError
from asgi_lifespan import LifespanManager
from faker import Faker
from fastapi import FastAPI, status
from models_library.api_schemas_directorv2.services import (
DYNAMIC_PROXY_SERVICE_PREFIX,
DYNAMIC_SIDECAR_SERVICE_PREFIX,
)
from models_library.projects_nodes_io import NodeID
from simcore_service_agent.services.containers_manager import (
get_containers_manager,
setup_containers_manager,
)


@pytest.fixture
async def app() -> AsyncIterable[FastAPI]:
    """Minimal app with only the containers manager wired in."""
    minimal_app = FastAPI()
    setup_containers_manager(minimal_app)

    # LifespanManager triggers the startup/shutdown event handlers
    async with LifespanManager(minimal_app):
        yield minimal_app


@pytest.fixture
def node_id(faker: Faker) -> NodeID:
    """A random node identifier (UUID object, not string)."""
    generated: NodeID = faker.uuid4(cast_to=None)
    return generated


@pytest.fixture
async def docker() -> AsyncIterable[Docker]:
    """Stand-alone docker client, closed after the test."""
    client = Docker()
    try:
        yield client
    finally:
        # equivalent to `async with Docker() as ...`: releases the HTTP session
        await client.close()


class _ContainerMode(Enum):
    """Lifecycle state a test container should be left in after creation."""

    CREATED = "CREATED"
    RUNNING = "RUNNING"
    STOPPED = "STOPPED"


@pytest.fixture
async def create_container(
    docker: Docker,
) -> AsyncIterable[Callable[[str, _ContainerMode], Awaitable[str]]]:
    """Factory creating named alpine containers in a requested lifecycle state.

    Every created container is force-removed when the test finishes.
    """
    spawned_ids: set[str] = set()

    async def _factory(name: str, container_mode: _ContainerMode) -> str:
        new_container = await docker.containers.create(
            config={
                "Image": "alpine",
                "Cmd": ["sh", "-c", "while true; do sleep 1; done"],
            },
            name=name,
        )

        # drive the container to the requested state
        if container_mode is not _ContainerMode.CREATED:
            await new_container.start()
            if container_mode is _ContainerMode.STOPPED:
                await new_container.stop()

        spawned_ids.add(new_container.id)
        return new_container.id

    yield _factory

    # cleanup: tolerate containers already removed by the test itself
    for spawned_id in spawned_ids:
        try:
            leftover = await docker.containers.get(spawned_id)
            await leftover.delete(force=True)
        except DockerError as e:
            if e.status != status.HTTP_404_NOT_FOUND:
                raise


async def test_force_container_cleanup(
    app: FastAPI,
    node_id: NodeID,
    create_container: Callable[[str, _ContainerMode], Awaitable[str]],
    faker: Faker,
    caplog: pytest.LogCaptureFixture,
    docker: Docker,
):
    """Containers in any state and with any known prefix scheme are detected
    AND actually removed (the original test only checked the log output)."""
    caplog.set_level(logging.DEBUG)
    caplog.clear()

    # NOTE: the manager matches both "_" and "-" separators after the prefix,
    # so all three naming schemes below must be cleaned up
    proxy_name = f"{DYNAMIC_PROXY_SERVICE_PREFIX}_{node_id}{faker.pystr()}"
    dynamic_sidecar_name = f"{DYNAMIC_SIDECAR_SERVICE_PREFIX}-{node_id}{faker.pystr()}"
    user_service_name = f"{DYNAMIC_SIDECAR_SERVICE_PREFIX}_{node_id}{faker.pystr()}"

    proxy_id = await create_container(proxy_name, _ContainerMode.CREATED)
    sidecar_id = await create_container(dynamic_sidecar_name, _ContainerMode.RUNNING)
    user_service_id = await create_container(user_service_name, _ContainerMode.STOPPED)

    await get_containers_manager(app).force_container_cleanup(node_id)

    # detected orphans are reported in the logs
    assert proxy_name in caplog.text
    assert dynamic_sidecar_name in caplog.text
    assert user_service_name in caplog.text

    # ...and the containers are actually gone from the docker engine
    for container_id in (proxy_id, sidecar_id, user_service_id):
        with pytest.raises(DockerError) as exc_info:
            await docker.containers.get(container_id)
        assert exc_info.value.status == status.HTTP_404_NOT_FOUND
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from typing import Final
# dynamic services

DYNAMIC_SIDECAR_SERVICE_PREFIX: Final[str] = "dy-sidecar"
DYNAMIC_PROXY_SERVICE_PREFIX: Final[str] = "dy-proxy"
from models_library.api_schemas_directorv2.services import (
DYNAMIC_PROXY_SERVICE_PREFIX,
DYNAMIC_SIDECAR_SERVICE_PREFIX,
)

# dynamic services

# label storing scheduler_data to allow service
# monitoring recovery after director-v2 reboots
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
from models_library.api_schemas_directorv2.dynamic_services_service import (
CommonServiceDetails,
)
from models_library.api_schemas_directorv2.services import (
DYNAMIC_PROXY_SERVICE_PREFIX,
DYNAMIC_SIDECAR_SERVICE_PREFIX,
)
from models_library.basic_types import PortInt
from models_library.callbacks_mapping import CallbacksMapping
from models_library.generated_models.docker_rest_api import ContainerState, Status2
Expand All @@ -39,9 +43,7 @@
from servicelib.exception_utils import DelayedExceptionHandler

from ..constants import (
DYNAMIC_PROXY_SERVICE_PREFIX,
DYNAMIC_SIDECAR_SCHEDULER_DATA_LABEL,
DYNAMIC_SIDECAR_SERVICE_PREFIX,
REGEX_DY_SERVICE_PROXY,
REGEX_DY_SERVICE_SIDECAR,
)
Expand Down
Loading
Loading