diff --git a/src/ai/backend/manager/api/manager.py b/src/ai/backend/manager/api/manager.py index fd692b8824..32deec1bde 100644 --- a/src/ai/backend/manager/api/manager.py +++ b/src/ai/backend/manager/api/manager.py @@ -111,15 +111,13 @@ async def detect_status_update(root_ctx: RootContext) -> None: async def report_status_bgtask(root_ctx: RootContext) -> None: interval = cast(Optional[float], root_ctx.local_config["manager"]["status-update-interval"]) if interval is None: - # Do not report if interval is not set + # Do not run bgtask if interval is not set return try: while True: await asyncio.sleep(interval) try: await report_manager_status(root_ctx) - except asyncio.CancelledError: - raise except Exception as e: log.exception(f"Failed to report manager health status (e:{str(e)})") except asyncio.CancelledError: diff --git a/src/ai/backend/manager/config.py b/src/ai/backend/manager/config.py index f6bfe3644b..d580ad1ede 100644 --- a/src/ai/backend/manager/config.py +++ b/src/ai/backend/manager/config.py @@ -290,8 +290,8 @@ ], t.Key("aiomonitor-webui-port", default=49100): t.ToInt[1:65535], t.Key("use-experimental-redis-event-dispatcher", default=False): t.ToBool, - t.Key("status-update-interval", default=None): t.Null | t.ToFloat[0:], - t.Key("status-lifetime", default=None): t.Null | t.ToFloat[0:], + t.Key("status-update-interval", default=None): t.Null | t.ToFloat[0:], # second + t.Key("status-lifetime", default=None): t.Null | t.ToInt[0:], # second t.Key("public-metrics-port", default=None): t.Null | t.ToInt[1:65535], }).allow_extra("*"), t.Key("docker-registry"): t.Dict({ # deprecated in v20.09 diff --git a/src/ai/backend/manager/models/health.py b/src/ai/backend/manager/models/health.py index 364c07e43e..71662e3656 100644 --- a/src/ai/backend/manager/models/health.py +++ b/src/ai/backend/manager/models/health.py @@ -142,7 +142,7 @@ async def _get_connnection_info(root_ctx: RootContext) -> ConnectionInfoOfProces async def report_manager_status(root_ctx: RootContext) -> None: - lifetime = cast(Optional[float], root_ctx.local_config["manager"]["status-lifetime"]) + lifetime = cast(Optional[int], root_ctx.local_config["manager"]["status-lifetime"]) cxn_info = await _get_connnection_info(root_ctx) _data = msgpack.packb(cxn_info.model_dump(mode="json"))