From 5d58743bbef9d54aaa5544fafcbcfc9d710c795b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 15:37:04 +0100 Subject: [PATCH 01/23] chore(docs): print default URI in conf ref/ex --- docs/utils/conf-ex.ini.j2 | 4 ++++ docs/utils/conf-ref.adoc.j2 | 7 ++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/utils/conf-ex.ini.j2 b/docs/utils/conf-ex.ini.j2 index ce0757fa..155a79e9 100644 --- a/docs/utils/conf-ex.ini.j2 +++ b/docs/utils/conf-ex.ini.j2 @@ -34,6 +34,8 @@ {%- set default = parameter.default %} {%- if parameter._type == "bool" %} {%- set default = "yes" if default == True else "no" %} +{%- elif parameter._type == "uri" %} +{%- set default = default.geturl() %} {%- endif %} # {%- if default.__class__.__name__ == 'list' %} @@ -56,6 +58,8 @@ {%- endif %} {%- if parameter._type == "bool" %} {%- set value = "yes" if value == True else "no" %} +{%- elif parameter._type == "uri" %} +{%- set value = value.geturl() %} {%- endif %} {%- if value.__class__.__name__ == 'list' %} {%- set value = "\n " + value | join('\n ') %} diff --git a/docs/utils/conf-ref.adoc.j2 b/docs/utils/conf-ref.adoc.j2 index a72e26b8..ed52ec6e 100644 --- a/docs/utils/conf-ref.adoc.j2 +++ b/docs/utils/conf-ref.adoc.j2 @@ -43,7 +43,12 @@ * `{{ default }}` {% endfor %} {% else %} -*Default:* `{{ parameter.default }}` +{%- if parameter._type == "uri" %} +{%- set default = parameter.default.geturl() %} +{%- else %} +{%- set default = parameter.default %} +{%- endif %} +*Default:* `{{ default }}` {% endif %} {%- else %} _No default value_ From bb4d80c438d9622011986f1d6c936f4db843cd2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 15:30:54 +0100 Subject: [PATCH 02/23] chore(dev): update slurm conf and racksdb for dev MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Increase RAM size and cores on emulated nodes - Enable metrics on tiny and emulator - Set 
select/cons_tres on emulator - Define a partition with DefMemPerCPU on tiny - Adopt new FireHPC syntax to define partition parameters - Rename slurm_partitions > node→nodes to reflect latest FireHPC expectations. --- dev/firehpc/conf/emulator/group_vars/all.yml | 37 +++++++++++++------- dev/firehpc/conf/tiny/group_vars/all.yml | 18 ++++++++-- dev/firehpc/db/types/nodes.yml | 22 ++++++------ 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/dev/firehpc/conf/emulator/group_vars/all.yml b/dev/firehpc/conf/emulator/group_vars/all.yml index 7127951f..f1c616bc 100644 --- a/dev/firehpc/conf/emulator/group_vars/all.yml +++ b/dev/firehpc/conf/emulator/group_vars/all.yml @@ -7,24 +7,33 @@ slurm_params: PriorityType: priority/multifactor PriorityWeightFairshare: 1000 PriorityWeightQOS: 100 + SelectType: select/cons_tres slurm_partitions: - name: batch - node: cn[001-080,101-980] + nodes: cn[001-080,101-980] default: yes - maxtime: INFINITE - state: UP + params: + MaxTime: INFINITE + State: UP + DefMemPerCPU: 512 - name: debug - node: cn[081-099] - maxtime: "1:0:0" - state: UP + nodes: cn[081-099] + params: + MaxTime: "1:0:0" + State: UP + DefMemPerCPU: 512 - name: interactive - node: cn100 - maxtime: "10:00" - state: UP + nodes: cn100 + params: + MaxTime: "10:00" + State: UP + DefMemPerCPU: 512 - name: ia - node: gn[01-40] - maxtime: "10:00" - state: UP + nodes: gn[01-40] + params: + MaxTime: "10:00" + State: UP + DefMemPerCPU: 512 slurm_qos: - name: critical args: @@ -35,4 +44,8 @@ slurm_qos: - MaxSubmitJobsPA=30 - Priority=200 slurmweb_enabled: true +slurmweb_agent_settings: + metrics: + enabled: true redis_enabled: true +metrics_enabled: true diff --git a/dev/firehpc/conf/tiny/group_vars/all.yml b/dev/firehpc/conf/tiny/group_vars/all.yml index d00c7304..ae2b4b59 100644 --- a/dev/firehpc/conf/tiny/group_vars/all.yml +++ b/dev/firehpc/conf/tiny/group_vars/all.yml @@ -14,13 +14,25 @@ slurm_qos: - Flags=OverPartQOS - GrpTRES=node=50 - GrpJobs=60 - - 
MaxTRES=cpu=48 + - MaxTRES=cpu=128 - MaxWall=8:00:00 - - MaxTRESPU=cpu=10,mem=5 - - MaxTRESPA=cpu=15,mem=10 + - MaxTRESPU=cpu=10,mem=65536 + - MaxTRESPA=cpu=15,mem=131072 - MaxJobsPU=10 - MaxSubmitJobsPU=20 - MaxSubmitJobsPA=30 - Priority=100 +slurm_partitions: +- name: normal + nodes: cn[1-2] + default: yes + params: + MaxTime: "2:0:0" + State: UP + DefMemPerCPU: 512 slurmweb_enabled: true +slurmweb_agent_settings: + metrics: + enabled: true redis_enabled: true +metrics_enabled: true diff --git a/dev/firehpc/db/types/nodes.yml b/dev/firehpc/db/types/nodes.yml index b0932230..3630c939 100644 --- a/dev/firehpc/db/types/nodes.yml +++ b/dev/firehpc/db/types/nodes.yml @@ -3,12 +3,12 @@ height: 1u width: full cpu: - cores: 8 + cores: 32 model: Emulated CPU - sockets: 1 + sockets: 2 ram: - dimm: 2 - size: 8GB + dimm: 8 + size: 16GB storage: - model: Samsung 980 Pro size: 256GB @@ -21,11 +21,11 @@ height: 3u width: 1/12 cpu: - cores: 8 + cores: 24 model: Emulated CPU - sockets: 1 + sockets: 2 ram: - dimm: 2 + dimm: 8 size: 8GB storage: - model: Samsung 980 Pro @@ -39,9 +39,9 @@ height: 2u width: 1 cpu: - cores: 8 + cores: 16 model: Emulated CPU - sockets: 1 + sockets: 2 gpu: - model: Nvidia H100 memory: 80GB @@ -52,8 +52,8 @@ - model: Nvidia H100 memory: 80GB ram: - dimm: 2 - size: 8GB + dimm: 16 + size: 32GB storage: - model: Samsung 980 Pro size: 256GB From 5e1c3850bde6893c6278e6780a41a658c7ce27db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 15:35:00 +0100 Subject: [PATCH 03/23] chore(dev): redirect prometheus to dev clusters Create SSH tunnel to request metrics on prometheus running on emulated clusters in development environment, except on pocket cluster to test a cluster without metrics. 
--- dev/conf/agent.ini.j2 | 3 +++ dev/setup-dev-environment | 2 ++ 2 files changed, 5 insertions(+) diff --git a/dev/conf/agent.ini.j2 b/dev/conf/agent.ini.j2 index 0d6fa775..88dff46b 100644 --- a/dev/conf/agent.ini.j2 +++ b/dev/conf/agent.ini.j2 @@ -27,5 +27,8 @@ enabled={{ cache_enabled }} port={{ redis_port }} password={{ redis_password }} +{% if cluster_name != "pocket" %} [metrics] enabled=yes +host=http://localhost:{{ prometheus_port }} +{% endif %} diff --git a/dev/setup-dev-environment b/dev/setup-dev-environment index 071e52cb..ea6dcf0c 100755 --- a/dev/setup-dev-environment +++ b/dev/setup-dev-environment @@ -62,6 +62,7 @@ class PortAllocator: "ldap": PortForward(389, 3389), # LDAP "slurmrestd": PortForward(2375, 2375), # slurmrestd "redis": PortForward(6379, 6379), # redis + "prometheus": PortForward(9090, 9090), # prometheus } def allocate(self): @@ -217,6 +218,7 @@ class SlurmwebAgent: policy_path=self.policy_path, redis_port=self.forwards["redis"].local, redis_password=redis_password, + prometheus_port=self.forwards["prometheus"].local, infrastructure=self.name if self.ui_name != self.name else None, ) ) From 15544dfe590a20dc1d9d072694dce08de63796d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 15:42:05 +0100 Subject: [PATCH 04/23] feat(agent): boolean to flag metrics in /info Add a boolean to indicate if metrics feature is enabled in agent /info endpoint, in addition to the cluster and infrastructure names. --- CHANGELOG.md | 4 ++-- slurmweb/views/agent.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb7dfec5..01df0119 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - agent: - - Return RacksDB infrastructure name in `/info` endpoint in complement of - the cluster name. 
+ - Return RacksDB infrastructure name and a boolean to indicate if metrics + feature is enabled in `/info` endpoint, in addition to the cluster name. - Add optional `/metrics` endpoint with various Slurm metrics in OpenMetrics format designed to be scraped by Prometheus or compatible (#274). - gateway: diff --git a/slurmweb/views/agent.py b/slurmweb/views/agent.py index cfe68be1..b293dcca 100644 --- a/slurmweb/views/agent.py +++ b/slurmweb/views/agent.py @@ -35,6 +35,7 @@ def info(): data = { "cluster": current_app.settings.service.cluster, "infrastructure": current_app.settings.racksdb.infrastructure, + "metrics": current_app.settings.metrics.enabled, } return jsonify(data) From 4f3222084791ab363a03d3b0a69db0666e0dac2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Tue, 5 Nov 2024 15:21:17 +0100 Subject: [PATCH 05/23] tests(agent): update test to reflect changes --- slurmweb/tests/test_agent.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/slurmweb/tests/test_agent.py b/slurmweb/tests/test_agent.py index e3fba254..91aa9cc4 100644 --- a/slurmweb/tests/test_agent.py +++ b/slurmweb/tests/test_agent.py @@ -144,11 +144,13 @@ def test_info(self): response = self.client.get(f"/v{get_version()}/info") self.assertEqual(response.status_code, 200) self.assertIsInstance(response.json, dict) - self.assertEqual(len(response.json.keys()), 2) + self.assertEqual(len(response.json.keys()), 3) self.assertIn("cluster", response.json) self.assertEqual(response.json["cluster"], "test") self.assertIn("infrastructure", response.json) self.assertEqual(response.json["infrastructure"], "test") + self.assertIn("metrics", response.json) + self.assertIsInstance(response.json["metrics"], bool) def test_permissions(self): response = self.client.get(f"/v{get_version()}/permissions") From fb465c6104273cd5e36cf7a848c05d2ee4c83d95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 15:46:26 +0100 Subject: [PATCH 
06/23] feat(gateway): metrics flag in /clusters Return boolean metrics feature flag of every clusters in gateway /clusters endpoint. --- CHANGELOG.md | 4 ++-- slurmweb/apps/gateway.py | 5 +++-- slurmweb/views/gateway.py | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01df0119..5b337d82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,8 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add optional `/metrics` endpoint with various Slurm metrics in OpenMetrics format designed to be scraped by Prometheus or compatible (#274). - gateway: - - Return RacksDB infrastructure name of every clusters in `/clusters` - endpoint. + - Return RacksDB infrastructure name and boolean metrics feature flag of every + clusters in `/clusters` endpoint. - Return optional markdown login service message as rendered HTML page with `/messages/login` enpoint. - frontend: diff --git a/slurmweb/apps/gateway.py b/slurmweb/apps/gateway.py index 3669e683..283eb326 100644 --- a/slurmweb/apps/gateway.py +++ b/slurmweb/apps/gateway.py @@ -20,15 +20,16 @@ class SlurmwebAgent: - def __init__(self, cluster, infrastructure, url): + def __init__(self, cluster, infrastructure, metrics, url): self.cluster = cluster self.infrastructure = infrastructure + self.metrics = metrics self.url = url @classmethod def from_json(cls, url, data): try: - return cls(data["cluster"], data["infrastructure"], url) + return cls(data["cluster"], data["infrastructure"], data["metrics"], url) except KeyError as err: raise SlurmwebAgentError( "Unable to retrieve cluster name from agent" diff --git a/slurmweb/views/gateway.py b/slurmweb/views/gateway.py index b5ad7b14..528a44a5 100644 --- a/slurmweb/views/gateway.py +++ b/slurmweb/views/gateway.py @@ -148,6 +148,7 @@ async def get_cluster(agent): cluster = { "name": agent.cluster, "infrastructure": agent.infrastructure, + "metrics": agent.metrics, "permissions": permissions, } From 
b7af5f5d8b98b7e8173b9ce4f426ab8cfc5a9df3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 16:00:58 +0100 Subject: [PATCH 07/23] refactor(gateway): generic management query params Manage query parameters forwarded to agent more generically in request_agent() instead of specifically in racksdb view. --- slurmweb/views/gateway.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/slurmweb/views/gateway.py b/slurmweb/views/gateway.py index 528a44a5..4a6fa3ef 100644 --- a/slurmweb/views/gateway.py +++ b/slurmweb/views/gateway.py @@ -222,6 +222,8 @@ def request_agent( ) else: url = f"{current_app.agents[cluster].url}/{query}" + if len(request.query_string): + url += f"?{request.query_string.decode()}" if request.method == "GET": return session.get(url, headers=headers) elif request.method == "POST": @@ -324,8 +326,7 @@ def accounts(cluster: str): def racksdb(cluster: str, query: str): return proxy_agent( cluster, - f"racksdb/v{current_app.settings.agents.racksdb_version}/{query}" - f"{'?' if len(request.query_string) else '' }{request.query_string.decode()}", + f"racksdb/v{current_app.settings.agents.racksdb_version}/{query}", request.token, json=False, with_version=False, From 47367e5144b377a3f6384485e9de710117a99bb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 16:03:29 +0100 Subject: [PATCH 08/23] refactor(agent): move metrics module in subdir This subdirectory will also hold all other upcoming metrics related modules. 
--- slurmweb/metrics/__init__.py | 0 slurmweb/{metrics.py => metrics/collector.py} | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 slurmweb/metrics/__init__.py rename slurmweb/{metrics.py => metrics/collector.py} (98%) diff --git a/slurmweb/metrics/__init__.py b/slurmweb/metrics/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/slurmweb/metrics.py b/slurmweb/metrics/collector.py similarity index 98% rename from slurmweb/metrics.py rename to slurmweb/metrics/collector.py index 1fd1a856..08bd92cc 100644 --- a/slurmweb/metrics.py +++ b/slurmweb/metrics/collector.py @@ -11,8 +11,8 @@ import prometheus_client import prometheus_client.core -from .errors import SlurmwebCacheError -from .slurmrestd.errors import ( +from ..errors import SlurmwebCacheError +from ..slurmrestd.errors import ( SlurmrestdNotFoundError, SlurmrestdInvalidResponseError, SlurmrestConnectionError, From 3dec1fc122e29ccbf95e698be9733f023053b42b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 16:15:39 +0100 Subject: [PATCH 09/23] tests(agent): adapt tests after module move --- slurmweb/tests/test_agent.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/slurmweb/tests/test_agent.py b/slurmweb/tests/test_agent.py index 91aa9cc4..29541622 100644 --- a/slurmweb/tests/test_agent.py +++ b/slurmweb/tests/test_agent.py @@ -663,8 +663,8 @@ def test_request_metrics_forbidden(self): self.assertEqual( cm.output, [ - "WARNING:slurmweb.metrics:IP address 127.0.0.1 not authorized to " - "request metrics" + "WARNING:slurmweb.metrics.collector:IP address 127.0.0.1 not " + "authorized to request metrics" ], ) @@ -681,8 +681,8 @@ def test_request_metrics_slurmrest_connection_error(self): self.assertEqual( cm.output, [ - "ERROR:slurmweb.metrics:Unable to collect metrics due to slurmrestd " - "connection error: connection error" + "ERROR:slurmweb.metrics.collector:Unable to collect metrics 
due to " + "slurmrestd connection error: connection error" ], ) @@ -699,8 +699,8 @@ def test_request_metrics_slurmrestd_invalid_type(self): self.assertEqual( cm.output, [ - "ERROR:slurmweb.metrics:Unable to collect metrics due to slurmrestd " - "invalid response: invalid type" + "ERROR:slurmweb.metrics.collector:Unable to collect metrics due to " + "slurmrestd invalid response: invalid type" ], ) @@ -722,8 +722,8 @@ def test_request_metrics_slurmrestd_internal_error(self): self.assertEqual( cm.output, [ - "ERROR:slurmweb.metrics:Unable to collect metrics due to slurmrestd " - "internal error: fake error description (fake error source)" + "ERROR:slurmweb.metrics.collector:Unable to collect metrics due to " + "slurmrestd internal error: fake error description (fake error source)" ], ) @@ -741,8 +741,8 @@ def test_request_metrics_slurmrestd_not_found(self, slurm_version): self.assertEqual( cm.output, [ - "ERROR:slurmweb.metrics:Unable to collect metrics due to URL not found " - "on slurmrestd: /unfound" + "ERROR:slurmweb.metrics.collector:Unable to collect metrics due to URL " + "not found on slurmrestd: /unfound" ], ) @@ -762,7 +762,7 @@ def test_request_metrics_cache_error(self, slurm_version): self.assertEqual( cm.output, [ - "ERROR:slurmweb.metrics:Unable to collect metrics due to cache error: " - "fake error" + "ERROR:slurmweb.metrics.collector:Unable to collect metrics due to " + "cache error: fake error" ], ) From 3fe0d29ab78756d91f94b27672373711864c9a67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 16:11:20 +0100 Subject: [PATCH 10/23] feat(conf): add metrics > host agent parameter --- CHANGELOG.md | 1 + conf/vendor/agent.yml | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5b337d82..9800b9f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `racksdb` > `infrastructure` parameter 
for the agent. - Add `metrics` > `enabled` parameter for the agent. - Add `metrics` > `restrict` parameter for the agent. + - Add `metrics` > `host` parameter for the agent. - Add `ui` > `templates`, `message_template`, `message_login` parameters for the gateway. - Select `alloc_cpus` and `alloc_idle_cpus` nodes fields on `slurmrestd` diff --git a/conf/vendor/agent.yml b/conf/vendor/agent.yml index 72ac3ab0..29de7697 100644 --- a/conf/vendor/agent.yml +++ b/conf/vendor/agent.yml @@ -373,3 +373,8 @@ metrics: - ::1/128 doc: | Restricted list of IP networks permitted to request metrics. + host: + type: uri + default: http://localhost:9090 + doc: | + URL of Prometheus server (or compatible) to requests metrics with PromQL. From b36f8efacdfb649ca262bad917c7164844a90749 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 16:13:29 +0100 Subject: [PATCH 11/23] chore(front): add dep on chart.js w/ luxon adapter Add dependency on chart.js and luxon adapter to draw charts with timeseries metrics. --- CHANGELOG.md | 2 ++ frontend/package-lock.json | 43 ++++++++++++++++++++++++++++++++++++++ frontend/package.json | 4 ++++ 3 files changed, 49 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9800b9f2..b9f1ce6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Request RacksDB with the infrastructure name provided by the gateway (#348). - Display time limit of running jobs in job details page (#352). - Display service message below login form if defined (#253). + - Add dependency on _charts.js_ and _luxon_ adapter to draw charts with + timeseries metrics. - conf: - Add `racksdb` > `infrastructure` parameter for the agent. - Add `metrics` > `enabled` parameter for the agent. 
diff --git a/frontend/package-lock.json b/frontend/package-lock.json index af723e2b..db380e87 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -13,6 +13,9 @@ "@microsoft/fetch-event-source": "^2.0.1", "@tailwindcss/forms": "^0.5.6", "axios": "^1.5.0", + "chart.js": "^4.4.6", + "chartjs-adapter-luxon": "^1.3.1", + "luxon": "^3.5.0", "pinia": "^2.1.6", "vue": "^3.4.21", "vue-router": "^4.2.4" @@ -21,6 +24,7 @@ "@pinia/testing": "^0.1.5", "@rushstack/eslint-patch": "^1.3.2", "@tsconfig/node18": "^18.2.0", + "@types/luxon": "^3.4.2", "@types/node": "^18.17.5", "@vitejs/plugin-vue": "^4.3.1", "@vue/eslint-config-prettier": "^8.0.0", @@ -649,6 +653,11 @@ "@jridgewell/sourcemap-codec": "^1.4.14" } }, + "node_modules/@kurkle/color": { + "version": "0.3.2", + "resolved": "https://registry.npmjs.org/@kurkle/color/-/color-0.3.2.tgz", + "integrity": "sha512-fuscdXJ9G1qb7W8VdHi+IwRqij3lBkosAm4ydQtEmbY58OzHXqQhvlxqEkoz0yssNVn38bcpRWgA9PP+OGoisw==" + }, "node_modules/@microsoft/fetch-event-source": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/@microsoft/fetch-event-source/-/fetch-event-source-2.0.1.tgz", @@ -1012,6 +1021,12 @@ "integrity": "sha512-Hr5Jfhc9eYOQNPYO5WLDq/n4jqijdHNlDXjuAQkkt+mWdQR+XJToOHrsD4cPaMXpn6KO7y2+wM8AZEs8VpBLVA==", "dev": true }, + "node_modules/@types/luxon": { + "version": "3.4.2", + "resolved": "https://registry.npmjs.org/@types/luxon/-/luxon-3.4.2.tgz", + "integrity": "sha512-TifLZlFudklWlMBfhubvgqTXRzLDI5pCbGa4P8a3wPyUQSW+1xQ5eDsreP9DWHX3tjq1ke96uYG/nwundroWcA==", + "dev": true + }, "node_modules/@types/node": { "version": "18.17.14", "resolved": "https://registry.npmjs.org/@types/node/-/node-18.17.14.tgz", @@ -2000,6 +2015,26 @@ "url": "https://github.com/chalk/chalk?sponsor=1" } }, + "node_modules/chart.js": { + "version": "4.4.6", + "resolved": "https://registry.npmjs.org/chart.js/-/chart.js-4.4.6.tgz", + "integrity": 
"sha512-8Y406zevUPbbIBA/HRk33khEmQPk5+cxeflWE/2rx1NJsjVWMPw/9mSP9rxHP5eqi6LNoPBVMfZHxbwLSgldYA==", + "dependencies": { + "@kurkle/color": "^0.3.0" + }, + "engines": { + "pnpm": ">=8" + } + }, + "node_modules/chartjs-adapter-luxon": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/chartjs-adapter-luxon/-/chartjs-adapter-luxon-1.3.1.tgz", + "integrity": "sha512-yxHov3X8y+reIibl1o+j18xzrcdddCLqsXhriV2+aQ4hCR66IYFchlRXUvrJVoxglJ380pgytU7YWtoqdIgqhg==", + "peerDependencies": { + "chart.js": ">=3.0.0", + "luxon": ">=1.0.0" + } + }, "node_modules/check-error": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/check-error/-/check-error-1.0.3.tgz", @@ -4115,6 +4150,14 @@ "node": ">=10" } }, + "node_modules/luxon": { + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/luxon/-/luxon-3.5.0.tgz", + "integrity": "sha512-rh+Zjr6DNfUYR3bPwJEnuwDdqMbxZW7LOQfUN4B54+Cl+0o5zaU9RJ6bcidfDtC1cWCZXQ+nvX8bf6bAji37QQ==", + "engines": { + "node": ">=12" + } + }, "node_modules/magic-string": { "version": "0.30.8", "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.8.tgz", diff --git a/frontend/package.json b/frontend/package.json index f81e7127..69ef5944 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -18,6 +18,9 @@ "@microsoft/fetch-event-source": "^2.0.1", "@tailwindcss/forms": "^0.5.6", "axios": "^1.5.0", + "chart.js": "^4.4.6", + "chartjs-adapter-luxon": "^1.3.1", + "luxon": "^3.5.0", "pinia": "^2.1.6", "vue": "^3.4.21", "vue-router": "^4.2.4" @@ -26,6 +29,7 @@ "@pinia/testing": "^0.1.5", "@rushstack/eslint-patch": "^1.3.2", "@tsconfig/node18": "^18.2.0", + "@types/luxon": "^3.4.2", "@types/node": "^18.17.5", "@vitejs/plugin-vue": "^4.3.1", "@vue/eslint-config-prettier": "^8.0.0", From 654e143c2fd36ce46ab6b7a2dae4204660071a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Tue, 5 Nov 2024 14:21:44 +0100 Subject: [PATCH 12/23] feat(conf): add metrics > job agent parameter --- 
CHANGELOG.md | 1 + conf/vendor/agent.yml | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9f1ce6a..d94777dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `metrics` > `enabled` parameter for the agent. - Add `metrics` > `restrict` parameter for the agent. - Add `metrics` > `host` parameter for the agent. + - Add `metrics` > `job` parameter for the agent. - Add `ui` > `templates`, `message_template`, `message_login` parameters for the gateway. - Select `alloc_cpus` and `alloc_idle_cpus` nodes fields on `slurmrestd` diff --git a/conf/vendor/agent.yml b/conf/vendor/agent.yml index 29de7697..b73bb226 100644 --- a/conf/vendor/agent.yml +++ b/conf/vendor/agent.yml @@ -378,3 +378,7 @@ metrics: default: http://localhost:9090 doc: | URL of Prometheus server (or compatible) to requests metrics with PromQL. + job: + type: str + default: slurm + doc: Name of Prometheus job which scrapes Slurm-web metrics. From 60053ab5ea1fc15fa066e42529d66d92a252b26a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Mon, 4 Nov 2024 16:12:04 +0100 Subject: [PATCH 13/23] docs: update conf references --- docs/modules/conf/examples/agent.ini | 10 ++++++++++ docs/modules/conf/partials/conf-agent.adoc | 23 ++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/docs/modules/conf/examples/agent.ini b/docs/modules/conf/examples/agent.ini index 4fcfbcc0..b437c619 100644 --- a/docs/modules/conf/examples/agent.ini +++ b/docs/modules/conf/examples/agent.ini @@ -478,3 +478,13 @@ enabled=no restrict= 127.0.0.0/24 ::1/128 + +# URL of Prometheus server (or compatible) to requests metrics with PromQL. +# +# Default value: http://localhost:9090 +host=http://localhost:9090 + +# Name of Prometheus job which scrapes Slurm-web metrics. 
+# +# Default value: slurm +job=slurm diff --git a/docs/modules/conf/partials/conf-agent.adoc b/docs/modules/conf/partials/conf-agent.adoc index 0e045ad0..63d9e45d 100644 --- a/docs/modules/conf/partials/conf-agent.adoc +++ b/docs/modules/conf/partials/conf-agent.adoc @@ -882,6 +882,29 @@ compatible) is enabled. * `::1/128` +|- + +|host +|uri +|URL of Prometheus server (or compatible) to requests metrics with PromQL. + + + + + +*Default:* `http://localhost:9090` + +|- + +|job +|str +|Name of Prometheus job which scrapes Slurm-web metrics. + + + + +*Default:* `slurm` + |- From 33b56adba1a4b7ad58f9a46d7d1c78a9afdb4eae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Tue, 5 Nov 2024 14:29:52 +0100 Subject: [PATCH 14/23] feat(agent): query metrics in agent Add possibility to query metrics from Prometheus in agent. This commit introduces slurmweb.metrics.db module with SlurmwebMetricsDB class designed to query nodes, cores and jobs metrics, with various ranges and resolutions in PromQL (Prometheus Query Language) from a Prometheus database. This class is instantiated in agent application when metrics feature is enabled, its methods are called by metrics view, accessible through /v/metrics/ API endpoint. Note that SlurmwebAppAgent.metrics attribute is renamed metrics_collector to avoid name conflict with new metrics_db attribute. --- CHANGELOG.md | 2 ++ slurmweb/apps/agent.py | 9 +++-- slurmweb/errors.py | 4 +++ slurmweb/metrics/db.py | 73 +++++++++++++++++++++++++++++++++++++++++ slurmweb/views/agent.py | 36 +++++++++++++++++++- 5 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 slurmweb/metrics/db.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d94777dc..7accf30c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 feature is enabled in `/info` endpoint, in addition to the cluster name. 
- Add optional `/metrics` endpoint with various Slurm metrics in OpenMetrics format designed to be scraped by Prometheus or compatible (#274). + - Add possibility to query metrics from Prometheus database with + `/v/metrics/` endpoint. - gateway: - Return RacksDB infrastructure name and boolean metrics feature flag of every clusters in `/clusters` endpoint. diff --git a/slurmweb/apps/agent.py b/slurmweb/apps/agent.py index 7642aff7..cc5bd7f8 100644 --- a/slurmweb/apps/agent.py +++ b/slurmweb/apps/agent.py @@ -40,6 +40,7 @@ class SlurmwebAppAgent(SlurmwebWebApp, RFLTokenizedRBACWebApp): SlurmwebAppRoute(f"/v{get_version()}/qos", views.qos), SlurmwebAppRoute(f"/v{get_version()}/reservations", views.reservations), SlurmwebAppRoute(f"/v{get_version()}/accounts", views.accounts), + SlurmwebAppRoute(f"/v{get_version()}/metrics/", views.metrics), } def __init__(self, seed): @@ -110,9 +111,13 @@ def __init__(self, seed): if self.settings.metrics.enabled: # Lazy load metrics module to avoid failing on missing optional external # dependency when feature is actually disabled. 
- from ..metrics import SlurmWebMetricsCollector, make_wsgi_app + from ..metrics.collector import SlurmWebMetricsCollector, make_wsgi_app + from ..metrics.db import SlurmwebMetricsDB - self.metrics = SlurmWebMetricsCollector(self.slurmrestd) + self.metrics_collector = SlurmWebMetricsCollector(self.slurmrestd) self.wsgi_app = dispatcher.DispatcherMiddleware( self.wsgi_app, {"/metrics": make_wsgi_app(self.settings.metrics)} ) + self.metrics_db = SlurmwebMetricsDB( + self.settings.metrics.host, self.settings.metrics.job + ) diff --git a/slurmweb/errors.py b/slurmweb/errors.py index 577c995c..4c995219 100644 --- a/slurmweb/errors.py +++ b/slurmweb/errors.py @@ -23,3 +23,7 @@ class SlurmwebAuthenticationError(Exception): class SlurmwebCacheError(Exception): pass + + +class SlurmwebMetricsDBError(Exception): + pass diff --git a/slurmweb/metrics/db.py b/slurmweb/metrics/db.py new file mode 100644 index 00000000..e4169b45 --- /dev/null +++ b/slurmweb/metrics/db.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024 Rackslab +# +# This file is part of Slurm-web. 
+# +# SPDX-License-Identifier: GPL-3.0-or-later + +import collections + +import requests + +from ..errors import SlurmwebMetricsDBError + +SlurmWebRangeResolution = collections.namedtuple( + "SlurmWebRangeResolution", ["resolution", "range"] +) + + +class SlurmwebMetricsDB: + RANGE_RESOLUTIONS = { + "hour": SlurmWebRangeResolution("30s", "1h"), + "day": SlurmWebRangeResolution("10m", "1d"), + "week": SlurmWebRangeResolution("1h", "1w"), + } + REQUEST_BASE_PATH = "/api/v1/query?query=" + + def __init__(self, base_uri, job): + self.base_uri = base_uri + self.job = job + + def request(self, metric, last): + return self._request(self._query(metric, last)) + + def _request(self, query): + url = f"{self.base_uri.geturl()}{self.REQUEST_BASE_PATH}{query}" + try: + response = requests.get(url) + except requests.exceptions.ConnectionError as err: + raise SlurmwebMetricsDBError( + f"Connection error on {self.base_uri.geturl()}: {err}" + ) from err + json = response.json() + # Check response status code + if response.status_code != 200: + raise SlurmwebMetricsDBError( + f"Prometheus error for query {query}: {json['error']}" + ) + # Check result is not empty + if not json["data"]["result"]: + raise SlurmwebMetricsDBError(f"Empty result for query {query}") + try: + return { + result["metric"]["state"]: [ + # Convert timestamp for second to millisecond and values from + # string to floats. 
+ [t_v_pair[0] * 1000, float(t_v_pair[1])] + for t_v_pair in result["values"] + ] + for result in json["data"]["result"] + } + except KeyError as err: + raise SlurmwebMetricsDBError( + f"Unexpected result on metrics query {query}" + ) from err + + def _query(self, metric, last): + if last not in self.RANGE_RESOLUTIONS.keys(): + raise SlurmwebMetricsDBError(f"Unsupported metric range {last}") + return ( + f'avg_over_time(slurm_{metric}{{job="{self.job}"}}' + f"[{self.RANGE_RESOLUTIONS[last].resolution}])" + f"[{self.RANGE_RESOLUTIONS[last].range}:" + f"{self.RANGE_RESOLUTIONS[last].resolution}]" + ) diff --git a/slurmweb/views/agent.py b/slurmweb/views/agent.py index b293dcca..dda1ecef 100644 --- a/slurmweb/views/agent.py +++ b/slurmweb/views/agent.py @@ -11,7 +11,7 @@ from rfl.web.tokens import rbac_action, check_jwt from ..version import get_version -from ..errors import SlurmwebCacheError +from ..errors import SlurmwebCacheError, SlurmwebMetricsDBError from ..slurmrestd.errors import ( SlurmrestdNotFoundError, @@ -155,3 +155,37 @@ def reservations(): @rbac_action("view-accounts") def accounts(): return jsonify(slurmrest("accounts")) + + +@check_jwt +def metrics(metric): + # Dictionary of metrics and required policy actions associations + metrics_policy_actions = { + "nodes": "view-nodes", + "cores": "view-nodes", + "jobs": "view-jobs", + } + + # Check metric is supported or send HTTP/404 + if metric not in metrics_policy_actions.keys(): + abort(404, f"Metric {metric} not found") + + # Check permission to request metric or send HTTP/403 + action = metrics_policy_actions[metric] + if not current_app.policy.allowed_user_action(request.user, action): + logger.warning( + "Unauthorized access from user %s to %s metric (missing permission on %s)", + request.user, + metric, + action, + ) + abort(403, f"Access to {metric} metric not permitted") + + # Send metrics from DB + + try: + return jsonify( + current_app.metrics_db.request(metric, request.args.get("range",
+ "hour")) + ) + except SlurmwebMetricsDBError as err: + abort(500, str(err)) From 705b89c5af5235908c804768c10a68c6d3a37192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Tue, 5 Nov 2024 14:30:29 +0100 Subject: [PATCH 15/23] feat(gateway): proxy metrics endpoint Proxy metrics requests to agent through /api/agents/<cluster>/metrics/<metric> endpoint. --- CHANGELOG.md | 4 +++- slurmweb/apps/gateway.py | 1 + slurmweb/views/gateway.py | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7accf30c..e31591d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,7 +19,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Return RacksDB infrastructure name and boolean metrics feature flag of every clusters in `/clusters` endpoint. - Return optional markdown login service message as rendered HTML page with - `/messages/login` enpoint. + `/messages/login` endpoint. + - Proxy metrics requests to agent through + `/api/agents/<cluster>/metrics/<metric>` endpoint. - frontend: - Request RacksDB with the infrastructure name provided by the gateway (#348). - Display time limit of running jobs in job details page (#352).
diff --git a/slurmweb/apps/gateway.py b/slurmweb/apps/gateway.py index 283eb326..1c30aafe 100644 --- a/slurmweb/apps/gateway.py +++ b/slurmweb/apps/gateway.py @@ -49,6 +49,7 @@ class SlurmwebAppGateway(SlurmwebWebApp, RFLTokenizedWebApp): SlurmwebAppRoute("/api/clusters", views.clusters), SlurmwebAppRoute("/api/users", views.users), SlurmwebAppRoute("/api/agents//stats", views.stats), + SlurmwebAppRoute("/api/agents//metrics/", views.metrics), SlurmwebAppRoute("/api/agents//jobs", views.jobs), SlurmwebAppRoute("/api/agents//job/", views.job), SlurmwebAppRoute("/api/agents//nodes", views.nodes), diff --git a/slurmweb/views/gateway.py b/slurmweb/views/gateway.py index 4a6fa3ef..d547e331 100644 --- a/slurmweb/views/gateway.py +++ b/slurmweb/views/gateway.py @@ -321,6 +321,12 @@ def accounts(cluster: str): return proxy_agent(cluster, "accounts", request.token) +@check_jwt +@validate_cluster +def metrics(cluster: str, metric: str): + return proxy_agent(cluster, f"metrics/{metric}", request.token) + + @check_jwt @validate_cluster def racksdb(cluster: str, query: str): From e53c211d8006c2ea24f7b48b36345ddad1568981 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Tue, 5 Nov 2024 14:30:56 +0100 Subject: [PATCH 16/23] tests(agent): adapt tests to reflect changes --- slurmweb/tests/test_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slurmweb/tests/test_agent.py b/slurmweb/tests/test_agent.py index 29541622..0897f1cc 100644 --- a/slurmweb/tests/test_agent.py +++ b/slurmweb/tests/test_agent.py @@ -614,7 +614,7 @@ def setUp(self): ) def tearDown(self): - self.app.metrics.unregister() + self.app.metrics_collector.unregister() @all_slurm_versions def test_request_metrics(self, slurm_version): From 10347cbcf4f9be04c124e2d2872d9ef7442dd3cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Palancher?= Date: Tue, 5 Nov 2024 14:31:22 +0100 Subject: [PATCH 17/23] feat(front): resources/jobs charts in dashboard Display charts of 
resources (nodes/cores) status and jobs queue in dashboard page based on metrics from Prometheus, when metrics feature is enabled. fix #275 --- CHANGELOG.md | 2 + frontend/public/chart_placeholder.svg | 1 + .../dashboard/ChartJobsHistogram.vue | 52 +++++ .../dashboard/ChartResourcesHistogram.vue | 97 ++++++++ .../components/dashboard/DashboardCharts.vue | 82 +++++++ frontend/src/composables/DataGetter.ts | 25 +- frontend/src/composables/DataPoller.ts | 32 ++- frontend/src/composables/GatewayAPI.ts | 54 ++++- .../src/composables/dashboard/LiveChart.ts | 213 ++++++++++++++++++ frontend/src/stores/runtime.ts | 33 ++- frontend/src/views/DashboardView.vue | 5 + 11 files changed, 585 insertions(+), 11 deletions(-) create mode 100644 frontend/public/chart_placeholder.svg create mode 100644 frontend/src/components/dashboard/ChartJobsHistogram.vue create mode 100644 frontend/src/components/dashboard/ChartResourcesHistogram.vue create mode 100644 frontend/src/components/dashboard/DashboardCharts.vue create mode 100644 frontend/src/composables/dashboard/LiveChart.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index e31591d5..92beece1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Display service message below login form if defined (#253). - Add dependency on _charts.js_ and _luxon_ adapter to draw charts with timeseries metrics. + - Display charts of resources (nodes/cores) status and jobs queue in dashboard + page based on metrics from Prometheus (#275). - conf: - Add `racksdb` > `infrastructure` parameter for the agent. - Add `metrics` > `enabled` parameter for the agent. 
diff --git a/frontend/public/chart_placeholder.svg b/frontend/public/chart_placeholder.svg new file mode 100644 index 00000000..a69665b2 --- /dev/null +++ b/frontend/public/chart_placeholder.svg @@ -0,0 +1 @@ + diff --git a/frontend/src/components/dashboard/ChartJobsHistogram.vue b/frontend/src/components/dashboard/ChartJobsHistogram.vue new file mode 100644 index 00000000..ed2b68e9 --- /dev/null +++ b/frontend/src/components/dashboard/ChartJobsHistogram.vue @@ -0,0 +1,52 @@ + + + + + diff --git a/frontend/src/components/dashboard/ChartResourcesHistogram.vue b/frontend/src/components/dashboard/ChartResourcesHistogram.vue new file mode 100644 index 00000000..2ec65c2c --- /dev/null +++ b/frontend/src/components/dashboard/ChartResourcesHistogram.vue @@ -0,0 +1,97 @@ + + + + + diff --git a/frontend/src/components/dashboard/DashboardCharts.vue b/frontend/src/components/dashboard/DashboardCharts.vue new file mode 100644 index 00000000..9a0dfed3 --- /dev/null +++ b/frontend/src/components/dashboard/DashboardCharts.vue @@ -0,0 +1,82 @@ + + + + + diff --git a/frontend/src/composables/DataGetter.ts b/frontend/src/composables/DataGetter.ts index ea052225..0109bf3d 100644 --- a/frontend/src/composables/DataGetter.ts +++ b/frontend/src/composables/DataGetter.ts @@ -70,9 +70,11 @@ export function useGatewayDataGetter( } export function useClusterDataGetter( - callback: GatewayAnyClusterApiKey, - otherParam?: string | number + initialCallback: GatewayAnyClusterApiKey, + initialOtherParam?: string | number ) { + let callback = initialCallback + let otherParam = initialOtherParam const data: Ref = ref() const unable: Ref = ref(false) const loaded: Ref = ref(false) @@ -98,7 +100,6 @@ export function useClusterDataGetter( async function get(cluster: string) { try { unable.value = false - if (gateway.isValidGatewayClusterWithStringAPIKey(callback)) { data.value = (await gateway[callback](cluster, otherParam as string)) as Type } else if 
(gateway.isValidGatewayClusterWithNumberAPIKey(callback)) { @@ -124,6 +125,22 @@ export function useClusterDataGetter( } } + function setCallback(newCallback: GatewayAnyClusterApiKey) { + callback = newCallback + loaded.value = false + if (runtime.currentCluster) { + get(runtime.currentCluster.name) + } + } + + function setParam(newOtherParam: string | number) { + otherParam = newOtherParam + loaded.value = false + if (runtime.currentCluster) { + get(runtime.currentCluster.name) + } + } + watch( () => runtime.currentCluster, (newCluster, oldCluster) => { @@ -141,5 +158,5 @@ export function useClusterDataGetter( get(runtime.currentCluster.name) } }) - return { data, unable, loaded } + return { data, unable, loaded, setCallback, setParam } } diff --git a/frontend/src/composables/DataPoller.ts b/frontend/src/composables/DataPoller.ts index 57657948..7413e5b5 100644 --- a/frontend/src/composables/DataPoller.ts +++ b/frontend/src/composables/DataPoller.ts @@ -14,17 +14,21 @@ import { useGatewayAPI } from '@/composables/GatewayAPI' import type { GatewayAnyClusterApiKey } from '@/composables/GatewayAPI' import { useRuntimeStore } from '@/stores/runtime' -type ClusterDataPoller = { - data: Ref +export interface ClusterDataPoller { + data: Ref unable: Ref loaded: Ref + setCallback: (newCallback: GatewayAnyClusterApiKey) => void + setParam: (newOtherParam: string | number) => void } export function useClusterDataPoller( - callback: GatewayAnyClusterApiKey, + initialCallback: GatewayAnyClusterApiKey, timeout: number, - otherParam?: number | string + initialOtherParam?: number | string ): ClusterDataPoller { + let callback = initialCallback + let otherParam = initialOtherParam const data: Ref = ref() const unable: Ref = ref(false) const loaded: Ref = ref(false) @@ -95,6 +99,24 @@ export function useClusterDataPoller( gateway.abort() } + function setCallback(newCallback: GatewayAnyClusterApiKey) { + if (runtime.currentCluster) stop(runtime.currentCluster.name) + callback = 
newCallback + loaded.value = false + if (runtime.currentCluster) { + start(runtime.currentCluster.name) + } + } + + function setParam(newOtherParam: string | number) { + if (runtime.currentCluster) stop(runtime.currentCluster.name) + otherParam = newOtherParam + loaded.value = false + if (runtime.currentCluster) { + start(runtime.currentCluster.name) + } + } + watch( () => runtime.currentCluster, (newCluster, oldCluster) => { @@ -123,5 +145,5 @@ export function useClusterDataPoller( } }) - return { data, unable, loaded } + return { data, unable, loaded, setCallback, setParam } } diff --git a/frontend/src/composables/GatewayAPI.ts b/frontend/src/composables/GatewayAPI.ts index c78a08af..51203bea 100644 --- a/frontend/src/composables/GatewayAPI.ts +++ b/frontend/src/composables/GatewayAPI.ts @@ -24,6 +24,7 @@ interface loginIdents { export interface ClusterDescription { name: string infrastructure: string + metrics: boolean permissions: ClusterPermissions stats?: ClusterStats } @@ -255,6 +256,22 @@ export interface ClusterReservation { flags: string[] } +export type MetricValue = [number, number] +const MetricRanges = ['week', 'day', 'hour'] as const +export type MetricRange = (typeof MetricRanges)[number] +export type MetricResourceState = 'idle' | 'down' | 'mixed' | 'allocated' | 'drain' | 'unknown' +export type MetricJobState = + | 'unknown' + | 'cancelled' + | 'completed' + | 'completing' + | 'running' + | 'pending' + +export function isMetricRange(range: unknown): range is MetricRange { + return typeof range === 'string' && MetricRanges.includes(range as MetricRange) +} + export function renderClusterOptionalNumber(optionalNumber: ClusterOptionalNumber): string { if (!optionalNumber.set) { return '-' @@ -356,7 +373,12 @@ const GatewayClusterAPIKeys = [ export type GatewayClusterAPIKey = (typeof GatewayClusterAPIKeys)[number] const GatewayClusterWithNumberAPIKeys = ['job'] as const export type GatewayClusterWithNumberAPIKey = (typeof 
GatewayClusterWithNumberAPIKeys)[number] -const GatewayClusterWithStringAPIKeys = ['node'] as const +const GatewayClusterWithStringAPIKeys = [ + 'node', + 'metrics_nodes', + 'metrics_cores', + 'metrics_jobs' +] as const export type GatewayClusterWithStringAPIKey = (typeof GatewayClusterWithStringAPIKeys)[number] export type GatewayAnyClusterApiKey = | GatewayClusterAPIKey @@ -525,6 +547,33 @@ export function useGatewayAPI() { return await get(`/agents/${cluster}/accounts`) } + async function metrics_nodes( + cluster: string, + last: string + ): Promise> { + return await get>( + `/agents/${cluster}/metrics/nodes?range=${last}` + ) + } + + async function metrics_cores( + cluster: string, + last: string + ): Promise> { + return await get>( + `/agents/${cluster}/metrics/cores?range=${last}` + ) + } + + async function metrics_jobs( + cluster: string, + last: string + ): Promise> { + return await get>( + `/agents/${cluster}/metrics/jobs?range=${last}` + ) + } + async function infrastructureImagePng( cluster: string, infrastructure: string, @@ -599,6 +648,9 @@ export function useGatewayAPI() { qos, reservations, accounts, + metrics_nodes, + metrics_cores, + metrics_jobs, infrastructureImagePng, abort, isValidGatewayGenericAPIKey, diff --git a/frontend/src/composables/dashboard/LiveChart.ts b/frontend/src/composables/dashboard/LiveChart.ts new file mode 100644 index 00000000..afcd3c2f --- /dev/null +++ b/frontend/src/composables/dashboard/LiveChart.ts @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2023-2024 Rackslab + * + * This file is part of Slurm-web. 
+ * + * SPDX-License-Identifier: GPL-3.0-or-later + */ + +import { watch, onMounted } from 'vue' +import type { Ref } from 'vue' +import type { GatewayAnyClusterApiKey } from '@/composables/GatewayAPI' +import { useRuntimeStore } from '@/stores/runtime' +import { useClusterDataPoller } from '@/composables/DataPoller' +import type { ClusterDataPoller } from '@/composables/DataPoller' +import type { MetricValue } from '@/composables/GatewayAPI' +import { Chart } from 'chart.js/auto' +import type { ChartOptions, TimeScaleOptions, TimeUnit } from 'chart.js' +import 'chartjs-adapter-luxon' +import { DateTime } from 'luxon' +import type { Point } from 'node_modules/chart.js/dist/core/core.controller' + +export interface DashboardLiveChart { + metrics: ClusterDataPoller> + setCallback: (callback: GatewayAnyClusterApiKey) => void +} + +export function useDashboardLiveChart( + callback: GatewayAnyClusterApiKey, + chartCanvas: Ref, + stateColors: Record, + possibleStates: MetricKeyType[] +): DashboardLiveChart { + const runtimeStore = useRuntimeStore() + const metrics = useClusterDataPoller>( + callback, + 30000, + runtimeStore.dashboard.range + ) + let chart: Chart | null + + /* Update charts datasets when metrics values change. */ + watch( + () => metrics.data.value, + () => { + /* If chart is null, stop here. */ + if (!chart) return + + /* If poller data is undefined, just set an empty dataset and leave. */ + if (!metrics.data.value) { + chart.data.datasets = [] + return + } + + for (const state of possibleStates) { + /* If current state is not present in poller data keys, skip it. 
*/ + if (!(state in metrics.data.value)) continue + /* Compute new data array */ + const new_data = metrics.data.value[state as MetricKeyType].map((value) => ({ + x: value[0], + y: value[1] + })) + /* Search for existing dataset which has the current state as label */ + const matching_datasets = chart.data.datasets.filter((dataset) => dataset.label == state) + if (!matching_datasets.length) { + /* If matching dataset has not been found, push a new dataset with all + * its parameters. */ + chart.data.datasets.push({ + label: state, + data: new_data, + barPercentage: 1, + fill: 'stack', + backgroundColor: stateColors[state as MetricKeyType] + }) + continue + } else { + /* If matching dataset has been found, get the timestamp of the last + * datapoint. */ + const last_timestamp = (matching_datasets[0].data.slice(-1)[0] as Point).x + /* Iterate over new data to insert in the dataset only the datapoints + * with a timestamp after the timestamp of the last datapoint in + * current dataset, and count inserted values. */ + let nb_new_values = 0 + new_data.forEach((item) => { + if (item.x > last_timestamp) { + matching_datasets[0].data.push(item) + nb_new_values += 1 + } + }) + /* Remove n datapoints from the beginning of the dataset, where n is + * the number of the inserted points, in order to keep a consistent + * number of datapoints. */ + matching_datasets[0].data.splice(0, nb_new_values) + } + } + /* Update suggested min and unit of x-axis. */ + if (chart.options.scales && chart.options.scales.x) { + chart.options.scales.x.suggestedMin = suggestedMin() + ;(chart.options.scales.x as TimeScaleOptions).time.unit = timeframeUnit() + } + /* Finally update the chart. */ + chart.update() + } + ) + + /* Clear chart datasets and set new poller param when dashboard range is + * modified. 
*/ + watch( + () => runtimeStore.dashboard.range, + () => { + if (chart) chart.data.datasets = [] + metrics.setParam(runtimeStore.dashboard.range) + } + ) + + /* Compute the suggested min of the x-axis depending on the current dashboard + * range. */ + function suggestedMin() { + const now = Date.now() + let result = 0 + if (runtimeStore.dashboard.range == 'hour') { + result = now - 60 * 60 * 1000 + } + if (runtimeStore.dashboard.range == 'day') { + result = now - 24 * 60 * 60 * 1000 + } + if (runtimeStore.dashboard.range == 'week') { + result = now - 7 * 24 * 60 * 60 * 1000 + } + return result + } + + /* Determine the timeframe unit of the x-axis depending on the current + * dashboard range. */ + function timeframeUnit(): TimeUnit { + if (runtimeStore.dashboard.range == 'hour') { + return 'minute' + } + return 'hour' + } + + /* Determine tick labels on y-axis */ + function yTicksCallback(value: number | string) { + /* y-axis represents nodes, cores or jobs: select only integer values */ + if (typeof value !== 'number') return value + if (value % 1 === 0) { + return value + } + } + + /* Determine tick labels on x-axis. */ + function xTicksCallback(value: number | string) { + if (typeof value === 'number') { + const dt = DateTime.fromMillis(value) + // localized time simple every five minutes with hour range. + if (runtimeStore.dashboard.range == 'hour' && value % (1000 * 60 * 5) === 0) + return dt.toLocaleString(DateTime.TIME_SIMPLE) + // localized time simple every hour with day range. + if (runtimeStore.dashboard.range == 'day' && value % (1000 * 60 * 60) === 0) + return dt.toLocaleString(DateTime.TIME_SIMPLE) + // localized numeric date at midnight and empty tick at noon.
+ if (runtimeStore.dashboard.range == 'week') { + if (value % (1000 * 60 * 60 * 24) === 0) { + return dt.toLocaleString({ month: 'numeric', day: 'numeric' }) + } + if (value % (1000 * 60 * 60 * 12) === 0) { + return '' + } + } + } + } + + const genericOptions: ChartOptions = { + responsive: true, + maintainAspectRatio: false, + scales: { + y: { + stacked: true, + beginAtZero: true, + ticks: { + callback: yTicksCallback + } + }, + x: { + type: 'time', + stacked: true, + grid: { + offset: false + }, + ticks: { + callback: xTicksCallback + } + } + } + } + + /* Clear chart datasets and set new metrics callback */ + function setCallback(callback: GatewayAnyClusterApiKey) { + if (chart) chart.data.datasets = [] + metrics.setCallback(callback) + } + + onMounted(() => { + if (chartCanvas.value) { + chart = new Chart(chartCanvas.value, { + type: 'bar', + data: { datasets: [] }, + options: genericOptions + }) + } + }) + + return { metrics, setCallback } +} diff --git a/frontend/src/stores/runtime.ts b/frontend/src/stores/runtime.ts index d4d4b8cd..1372a423 100644 --- a/frontend/src/stores/runtime.ts +++ b/frontend/src/stores/runtime.ts @@ -11,7 +11,36 @@ import { ref } from 'vue' import type { Ref } from 'vue' import type { RouteLocation } from 'vue-router' import { getNodeMainState } from '@/composables/GatewayAPI' -import type { ClusterDescription, ClusterJob, ClusterNode } from '@/composables/GatewayAPI' +import type { + ClusterDescription, + ClusterJob, + ClusterNode, + MetricRange +} from '@/composables/GatewayAPI' + +/* + * Dashboard view settings + */ + +interface DashboardQueryParameters { + range?: string + cores?: boolean +} + +class DashboardViewSettings { + range: MetricRange = 'hour' + coresToggle = false + query() { + const result: DashboardQueryParameters = {} + if (this.range != 'hour') { + result.range = this.range + } + if (this.coresToggle) { + result.cores = this.coresToggle + } + return result + } +} /* * Jobs view settings @@ -277,6 +306,7 @@ export 
const useRuntimeStore = defineStore('runtime', () => { const routePath: Ref = ref('/') const beforeSettingsRoute: Ref = ref(undefined) + const dashboard = ref(new DashboardViewSettings()) const jobs: Ref = ref(new JobsViewSettings()) const resources: Ref = ref(new ResourcesViewSettings()) @@ -336,6 +366,7 @@ export const useRuntimeStore = defineStore('runtime', () => { navigation, routePath, beforeSettingsRoute, + dashboard, jobs, resources, errors, diff --git a/frontend/src/views/DashboardView.vue b/frontend/src/views/DashboardView.vue index c1d4d14b..50082d3a 100644 --- a/frontend/src/views/DashboardView.vue +++ b/frontend/src/views/DashboardView.vue @@ -8,10 +8,14 @@