Jobs and resources charts in dashboard page #378

Merged
merged 23 commits into from Nov 11, 2024
Changes from all commits (23 commits)
5d58743
chore(docs): print default URI in conf ref/ex
rezib Nov 4, 2024
bb4d80c
chore(dev): update slurm conf and racksdb for dev
rezib Nov 4, 2024
5e1c385
chore(dev): redirect prometheus to dev clusters
rezib Nov 4, 2024
15544df
feat(agent): boolean to flag metrics in /info
rezib Nov 4, 2024
4f32220
tests(agent): update test to reflect changes
rezib Nov 5, 2024
fb465c6
feat(gateway): metrics flag in /clusters
rezib Nov 4, 2024
b7af5f5
refactor(gateway): generic management query params
rezib Nov 4, 2024
47367e5
refactor(agent): move metrics module in subdir
rezib Nov 4, 2024
3dec1fc
tests(agent): adapt tests after module move
rezib Nov 4, 2024
3fe0d29
feat(conf): add metrics > host agent parameter
rezib Nov 4, 2024
b36f8ef
chore(front): add dep on chart.js w/ luxon adapter
rezib Nov 4, 2024
654e143
feat(conf): add metrics > job agent parameter
rezib Nov 5, 2024
60053ab
docs: update conf references
rezib Nov 4, 2024
33b56ad
feat(agent): query metrics in agent
rezib Nov 5, 2024
705b89c
feat(gateway): proxy metrics endpoint
rezib Nov 5, 2024
e53c211
tests(agent): adapt tests to reflect changes
rezib Nov 5, 2024
10347cb
feat(front): resources/jobs charts in dashboard
rezib Nov 5, 2024
feb485f
chore(dev): prometheus and metrics assets
rezib Nov 6, 2024
2c3c385
tests(front): setup vitest with canvas mock
rezib Nov 6, 2024
019170c
tests(front): cover dashboard resource/jobs charts
rezib Nov 6, 2024
6f7cc93
tests(agent): cover metrics DB requests code
rezib Nov 7, 2024
8ed2a63
chore(assets): add screenshots of dashboard charts
rezib Nov 7, 2024
2889297
docs: mention metrics charts feature
rezib Nov 7, 2024
30 changes: 21 additions & 9 deletions CHANGELOG.md
@@ -9,23 +9,33 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added
- agent:
- Return RacksDB infrastructure name in `/info` endpoint in complement of
the cluster name.
- Return RacksDB infrastructure name and a boolean to indicate if metrics
feature is enabled in `/info` endpoint, in addition to the cluster name.
- Add optional `/metrics` endpoint with various Slurm metrics in OpenMetrics
format designed to be scraped by Prometheus or compatible (#274).
- Add possibility to query metrics from Prometheus database with
`/v<version>/metrics/<metric>` endpoint.
- gateway:
- Return RacksDB infrastructure name of every clusters in `/clusters`
endpoint.
- Return RacksDB infrastructure name and boolean metrics feature flag of every
clusters in `/clusters` endpoint.
- Return optional markdown login service message as rendered HTML page with
`/messages/login` enpoint.
`/messages/login` endpoint.
- Proxy metrics requests to agent through
`/api/agents/<cluster>/metrics/<metric>` endpoint.
- frontend:
- Request RacksDB with the infrastructure name provided by the gateway (#348).
- Display time limit of running jobs in job details page (#352).
- Display service message below login form if defined (#253).
- Add dependency on _charts.js_ and _luxon_ adapter to draw charts with
timeseries metrics.
- Display charts of resources (nodes/cores) status and jobs queue in dashboard
page based on metrics from Prometheus (#275).
- conf:
- Add `racksdb` > `infrastructure` parameter for the agent.
- Add `metrics` > `enabled` parameter for the agent.
- Add `metrics` > `restrict` parameter for the agent.
- Add `metrics` > `host` parameter for the agent.
- Add `metrics` > `job` parameter for the agent.
- Add `ui` > `templates`, `message_template`, `message_login` parameters for
the gateway.
- Select `alloc_cpus` and `alloc_idle_cpus` nodes fields on `slurmrestd`
@@ -36,9 +46,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
which can either be configuration definition file or site override (#349).
- docs:
- Add manpage for `slurm-web-show-conf` command.
- Add metrics export configuration documentation.
- Mention metrics export optional feature in quickstart guide.
- Mention metrics export feature in overview page.
- Add metrics feature configuration documentation page.
- Mention metrics optional feature in quickstart guide.
- Mention metrics export and charts feature in overview page.
- Mention possible Prometheus integration in architecture page.
- Mention login service message feature in overview page.
- Add page to document _Service Messages_ configuration.
@@ -47,7 +57,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add requirement on markdown external library for `gateway` extra package.

### Changed
- docs: Update configuration reference documentation.
- docs:
- Update configuration reference documentation.
- Update dashboard screenshot in overview page with example of resource chart.
- conf:
- Convert `[cache]` > `password` agent parameter from string to password type.
- Convert `[ldap]` > `bind_password` gateway parameter from string to password
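The changelog entries above introduce metrics query endpoints on both components, including the gateway proxy at `/api/agents/<cluster>/metrics/<metric>`. As a rough sketch of how a client could exercise it (the gateway base URL `http://localhost:5011` and the bearer token are placeholders, not values from this PR; the cluster name `atlas` and `range=hour` come from the test crawler below):

```python
import urllib.parse


def metrics_url(base: str, cluster: str, metric: str, range_: str = "hour") -> str:
    """Compose the gateway metrics proxy URL added by this PR:
    /api/agents/<cluster>/metrics/<metric>?range=<range>."""
    query = urllib.parse.urlencode({"range": range_})
    return f"{base}/api/agents/{cluster}/metrics/{metric}?{query}"


if __name__ == "__main__":
    # Hypothetical gateway address and JWT; authenticate first to obtain a token.
    import requests

    url = metrics_url("http://localhost:5011", "atlas", "nodes")
    response = requests.get(url, headers={"Authorization": "Bearer <token>"})
    print(response.json())
```

The same path shape, minus the `/api/agents/<cluster>` prefix, applies to the agent's own `/v<version>/metrics/<metric>` endpoint.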
76 changes: 76 additions & 0 deletions assets/screenshots/assemblies/slurm-web_charts.svg
2 changes: 2 additions & 0 deletions assets/screenshots/build.yaml
@@ -12,6 +12,8 @@ assemblies:
- large
slurm-web_responsive.svg:
- large
slurm-web_charts.svg:
- medium
# List raw screenshots for which versions with dropped shadow must be generated.
shadowed:
- screenshot_auth.png
Binary file added assets/screenshots/raw/screenshot_charts.png
Binary file modified assets/screenshots/raw/screenshot_dashboard_tablet.png
Binary file modified assets/screenshots/shadowed/screenshot_dashboard_tablet.png
9 changes: 9 additions & 0 deletions conf/vendor/agent.yml
@@ -373,3 +373,12 @@ metrics:
- ::1/128
doc: |
Restricted list of IP networks permitted to request metrics.
host:
type: uri
default: http://localhost:9090
doc: |
URL of Prometheus server (or compatible) to requests metrics with PromQL.
job:
type: str
default: slurm
doc: Name of Prometheus job which scrapes Slurm-web metrics.
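The `host` and `job` parameters point the agent at a Prometheus-compatible server and select which scrape job's series to query with PromQL. A minimal sketch of the request parameters such a query could carry (the metric name `slurm_nodes` and the 60-second step are illustrative assumptions, not values from this PR):

```python
import time


def prometheus_range_params(job: str, metric: str, hours: int = 1) -> dict:
    """Build query parameters for Prometheus /api/v1/query_range,
    filtering series by the configured scrape job label."""
    now = int(time.time())
    return {
        "query": f'{metric}{{job="{job}"}}',  # e.g. slurm_nodes{job="slurm"}
        "start": now - hours * 3600,
        "end": now,
        "step": "60s",
    }
```

Sent against `<host>/api/v1/query_range`, such a query returns only the timeseries scraped under the `slurm` job configured above.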
3 changes: 3 additions & 0 deletions dev/conf/agent.ini.j2
@@ -27,5 +27,8 @@ enabled={{ cache_enabled }}
port={{ redis_port }}
password={{ redis_password }}

{% if cluster_name != "pocket" %}
[metrics]
enabled=yes
host=http://localhost:{{ prometheus_port }}
{% endif %}
129 changes: 119 additions & 10 deletions dev/crawl-tests-assets
@@ -27,17 +27,84 @@ from rfl.settings.errors import (
)
from racksdb import RacksDB
from slurmweb.slurmrestd.unix import SlurmrestdUnixAdapter
from slurmweb.metrics.db import SlurmwebMetricsDB
from slurmweb.version import get_version

logger = logging.getLogger("crawl-tests-assets")

DEBUG_FLAGS = ["slurmweb", "rfl", "werkzeug", "urllib3"]
DEV_HOST = "firehpc.dev.rackslab.io"
USER = getpass.getuser()
METRICS_PREFERRED_CLUSTER = "emulator"
# Map between infrastructure names and cluster names that are visible in Slurm-web.
MAP_CLUSTER_NAMES = {"emulator": "atlas"}


def slurmweb_cluster_name(infrastructure: str):
if infrastructure in MAP_CLUSTER_NAMES:
return MAP_CLUSTER_NAMES[infrastructure]
return infrastructure


ASSETS = Path(__file__).parent.resolve() / ".." / "slurmweb" / "tests" / "assets"


def crawl_prometheus(url: str, job: str) -> None:
"""Crawl and save test assets from Prometheus."""
# Check assets directory
assets_path = ASSETS / "prometheus"
if not assets_path.exists():
assets_path.mkdir(parents=True)

# Save requests status
status_file = assets_path / "status.json"
if status_file.exists():
with open(status_file) as fh:
requests_statuses = json.load(fh)
else:
requests_statuses = {}

headers = {}
db = SlurmwebMetricsDB(url, job)

for metric in ["nodes", "cores", "jobs"]:
for _range in ["hour"]:
dump_component_query(
requests_statuses,
url,
f"{db.REQUEST_BASE_PATH}{db._query(metric, _range)}",
headers,
assets_path,
f"{metric}-{_range}",
prettify=False,
)

# query unexisting metric
dump_component_query(
requests_statuses,
url,
f"{db.REQUEST_BASE_PATH}{db._query('fail', 'hour')}",
headers,
assets_path,
"unknown-metric",
)

# query unknown API path
dump_component_query(
requests_statuses,
url,
f"{db.REQUEST_BASE_PATH}/fail",
headers,
assets_path,
"unknown-path",
)

# Save resulting status file
with open(status_file, "w+") as fh:
json.dump(requests_statuses, fh, indent=2)
fh.write("\n")


def query_slurmrestd(session: requests.Session, prefix: str, query: str) -> Any:
"""Send GET HTTP request to slurmrestd and return JSON result. Raise RuntimeError in
case of connection error or not JSON result."""
@@ -385,8 +452,9 @@ def dump_component_query(
query: str,
headers: str,
assets_path: Path,
asset_name: dict[int,str]|str,
asset_name: dict[int, str] | str,
skip_exist: bool = True,
prettify: bool = True,
) -> Any:
"""Send GET HTTP request to Slurm-web component pointed by URL and save JSON result
in assets directory."""
@@ -425,17 +493,17 @@
else:
with open(asset, "w+") as fh:
if asset.suffix == ".json":
fh.write(json.dumps(data, indent=2))
fh.write(json.dumps(data, indent=2 if prettify else None))
else:
fh.write(data)
return data


def crawl_gateway(cluster: str, dev_tmp_dir: Path) -> str:
def crawl_gateway(cluster: str, infrastructure: str, dev_tmp_dir: Path) -> str:
"""Crawl and save test assets from Slurm-web gateway component and return
authentication JWT."""
# Retrieve admin user account to connect
user = admin_user(cluster)
user = admin_user(infrastructure)
logger.info("Found user %s in group admin on cluster %s", user, cluster)

# Get gateway HTTP base URL from configuration
@@ -471,9 +539,9 @@ def crawl_gateway(cluster: str, dev_tmp_dir: Path) -> str:
headers,
assets_path,
{
200: "message_login",
404: "message_login_not_found",
500: "message_login_error",
200: "message_login",
404: "message_login_not_found",
500: "message_login_error",
},
)

@@ -592,6 +660,19 @@ def crawl_gateway(cluster: str, dev_tmp_dir: Path) -> str:
"accounts",
)

# metrics
for metric in ["nodes", "cores", "jobs"]:
for _range in ["hour"]:
dump_component_query(
requests_statuses,
url,
f"/api/agents/{cluster}/metrics/{metric}?range={_range}",
headers,
assets_path,
f"metrics-{metric}-{_range}",
prettify=False,
)

# Save resulting status file
with open(status_file, "w+") as fh:
json.dump(requests_statuses, fh, indent=2)
@@ -600,7 +681,7 @@ def crawl_gateway(cluster: str, dev_tmp_dir: Path) -> str:
return token


def crawl_agent(port: int, token: str) -> None:
def crawl_agent(port: int, token: str, metrics: bool) -> None:
"""Crawl and save test assets from Slurm-web agent component."""
# Compose and return the URL to the gateway
url = f"http://localhost:{port}"
@@ -678,6 +759,20 @@ def crawl_agent(port: int, token: str) -> None:
"accounts",
)

# metrics
if metrics:
for metric in ["nodes", "cores", "jobs"]:
for _range in ["hour"]:
dump_component_query(
requests_statuses,
url,
f"/v{get_version()}/metrics/{metric}?range={_range}",
headers,
assets_path,
f"metrics-{metric}-{_range}",
prettify=False,
)

# FIXME: Download unknown job/node
# Save resulting status file
with open(status_file, "w+") as fh:
@@ -714,8 +809,16 @@ def main() -> None:
db = RacksDB.load(db="dev/firehpc/db", schema="../RacksDB/schemas/racksdb.yml")
logger.info("List of clusters: %s", db.infrastructures.keys())

gateway_infrastructure = list(
db.infrastructures.filter(name=METRICS_PREFERRED_CLUSTER)
)[0].name

# Crawl gateway and get bearer token
token = crawl_gateway(list(db.infrastructures.keys())[0], dev_tmp_dir)
token = crawl_gateway(
slurmweb_cluster_name(gateway_infrastructure),
gateway_infrastructure,
dev_tmp_dir,
)

for cluster in db.infrastructures.keys():
# Load agent configuration
@@ -730,8 +833,14 @@ def main() -> None:
logger.critical(err)
sys.exit(1)

crawl_metrics = cluster == METRICS_PREFERRED_CLUSTER

# Crawl agent
crawl_agent(settings.service.port, token)
crawl_agent(settings.service.port, token, metrics=crawl_metrics)

# Crawl prometheus
if crawl_metrics:
crawl_prometheus(settings.metrics.host.geturl(), settings.metrics.job)

# Crawl slurmrestd
try: