diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py index bf3a977e..a8433c8a 100644 --- a/lib/charms/grafana_k8s/v0/grafana_dashboard.py +++ b/lib/charms/grafana_k8s/v0/grafana_dashboard.py @@ -218,7 +218,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 21 +LIBPATCH = 22 logger = logging.getLogger(__name__) @@ -1047,7 +1047,7 @@ def __init__( self._dashboards_path = dashboards_path # No peer relation bucket we can rely on providers, keep StoredState here, too - self._stored.set_default(dashboard_templates={}) + self._stored.set_default(dashboard_templates={}) # type: ignore self.framework.observe(self._charm.on.leader_elected, self._update_all_dashboards_from_dir) self.framework.observe(self._charm.on.upgrade_charm, self._update_all_dashboards_from_dir) @@ -1073,7 +1073,7 @@ def add_dashboard(self, content: str, inject_dropdowns: bool = True) -> None: """ # Update of storage must be done irrespective of leadership, so # that the stored state is there when this unit becomes leader. - stored_dashboard_templates = self._stored.dashboard_templates + stored_dashboard_templates = self._stored.dashboard_templates # type: Any encoded_dashboard = _encode_dashboard_content(content) @@ -1094,7 +1094,7 @@ def remove_non_builtin_dashboards(self) -> None: """Remove all dashboards to the relation added via :method:`add_dashboard`.""" # Update of storage must be done irrespective of leadership, so # that the stored state is there when this unit becomes leader. 
- stored_dashboard_templates = self._stored.dashboard_templates + stored_dashboard_templates = self._stored.dashboard_templates # type: Any for dashboard_id in list(stored_dashboard_templates.keys()): if dashboard_id.startswith("prog:"): @@ -1121,7 +1121,7 @@ def _update_all_dashboards_from_dir( # Ensure we do not leave outdated dashboards by removing from stored all # the encoded dashboards that start with "file/". if self._dashboards_path: - stored_dashboard_templates = self._stored.dashboard_templates + stored_dashboard_templates = self._stored.dashboard_templates # type: Any for dashboard_id in list(stored_dashboard_templates.keys()): if dashboard_id.startswith("file:"): @@ -1175,7 +1175,7 @@ def _reinitialize_dashboard_data(self, inject_dropdowns: bool = True) -> None: e.grafana_dashboards_absolute_path, e.message, ) - stored_dashboard_templates = self._stored.dashboard_templates + stored_dashboard_templates = self._stored.dashboard_templates # type: Any for dashboard_id in list(stored_dashboard_templates.keys()): if dashboard_id.startswith("file:"): @@ -1205,7 +1205,7 @@ def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> event: The `RelationChangedEvent` that triggered this handler. 
""" if self._charm.unit.is_leader(): - data = json.loads(event.relation.data[event.app].get("event", "{}")) + data = json.loads(event.relation.data[event.app].get("event", "{}")) # type: ignore if not data: return @@ -1251,7 +1251,7 @@ def _juju_topology(self) -> Dict: @property def dashboard_templates(self) -> List: """Return a list of the known dashboard templates.""" - return [v for v in self._stored.dashboard_templates.values()] + return [v for v in self._stored.dashboard_templates.values()] # type: ignore class GrafanaDashboardConsumer(Object): @@ -1305,7 +1305,7 @@ def __init__( self._relation_name = relation_name self._tranformer = CosTool(self._charm) - self._stored.set_default(dashboards=dict()) + self._stored.set_default(dashboards=dict()) # type: ignore self.framework.observe( self._charm.on[self._relation_name].relation_changed, @@ -1588,9 +1588,8 @@ class GrafanaDashboardAggregator(Object): The :class:`GrafanaDashboardAggregator` object provides a way to collate and aggregate Grafana dashboards from reactive/machine charms and transport them into Charmed Operators, using Juju topology. - For detailed usage instructions, see the documentation for - :module:`lma-proxy-operator`, as this class is intended for use as a + :module:`cos-proxy-operator`, as this class is intended for use as a single point of intersection rather than use in individual charms. Since :class:`GrafanaDashboardAggregator` serves as a bridge between @@ -1602,7 +1601,6 @@ class GrafanaDashboardAggregator(Object): In its most streamlined usage, :class:`GrafanaDashboardAggregator` is integrated in a charmed operator as follows: - self.grafana = GrafanaDashboardAggregator(self) Args: @@ -1630,7 +1628,7 @@ def __init__( # Reactive charms may be RPC-ish and not leave reliable data around. 
Keep # StoredState here - self._stored.set_default( + self._stored.set_default( # type: ignore dashboard_templates={}, id_mappings={}, ) @@ -1672,11 +1670,11 @@ def _upset_dashboards_on_event(self, event: RelationEvent) -> None: return for id in dashboards: - self._stored.dashboard_templates[id] = self._content_to_dashboard_object( + self._stored.dashboard_templates[id] = self._content_to_dashboard_object( # type: ignore dashboards[id], event ) - self._stored.id_mappings[event.app.name] = dashboards + self._stored.id_mappings[event.app.name] = dashboards # type: ignore self._update_remote_grafana(event) def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None: @@ -1692,11 +1690,11 @@ def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None: def remove_dashboards(self, event: RelationBrokenEvent) -> None: """Remove a dashboard if the relation is broken.""" - app_ids = _type_convert_stored(self._stored.id_mappings[event.app.name]) + app_ids = _type_convert_stored(self._stored.id_mappings[event.app.name]) # type: ignore - del self._stored.id_mappings[event.app.name] + del self._stored.id_mappings[event.app.name] # type: ignore for id in app_ids: - del self._stored.dashboard_templates[id] + del self._stored.dashboard_templates[id] # type: ignore stored_data = { "templates": _type_convert_stored(self._stored.dashboard_templates), @@ -1740,6 +1738,19 @@ def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 and dash["templating"]["list"][i]["name"] == "host" ): dash["templating"]["list"][i] = REACTIVE_CONVERTER + + # Strip out newly-added 'juju_application' template variables which + # don't line up with our drop-downs + dash_mutable = dash + for i in range(len(dash["templating"]["list"])): + if ( + "name" in dash["templating"]["list"][i] + and dash["templating"]["list"][i]["name"] == "app" + ): + del dash_mutable["templating"]["list"][i] + + if dash_mutable: + dash = dash_mutable except KeyError: 
logger.debug("No existing templating data in dashboard") @@ -1764,17 +1775,18 @@ def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: # Reactive data can reliably be pulled out of events. In theory, if we got an event, # it's on the bucket, but using event explicitly keeps the mental model in # place for reactive - for k in event.relation.data[event.unit].keys(): + for k in event.relation.data[event.unit].keys(): # type: ignore if k.startswith("request_"): - templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"]) + templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"]) # type: ignore - for k in event.relation.data[event.app].keys(): + for k in event.relation.data[event.app].keys(): # type: ignore if k.startswith("request_"): - templates.append(json.loads(event.relation.data[event.app][k])["dashboard"]) + templates.append(json.loads(event.relation.data[event.app][k])["dashboard"]) # type: ignore builtins = self._maybe_get_builtin_dashboards(event) if not templates and not builtins: + logger.warning("NOTHING!") return {} dashboards = {} @@ -1793,11 +1805,19 @@ def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: # Replace the old-style datasource templates dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash) dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash) + dash = re.sub( + r'"datasource": "(!?\w)[\w|\s|-]+?Juju generated.*?"', + r'"datasource": "${prometheusds}"', + dash, + ) + + # Yank out "new"+old LMA topology + dash = re.sub(r'(,?juju_application=~)"\$app"', r'\1"\$juju_application"', dash) from jinja2 import Template content = _encode_dashboard_content( - Template(dash).render(host=event.unit.name, datasource="prometheus") + Template(dash).render(host=r"$host", datasource=r"${prometheusds}") # type: ignore ) id = "prog:{}".format(content[-24:-16]) @@ -1828,12 +1848,12 @@ def _maybe_get_builtin_dashboards(self, event: 
RelationEvent) -> Dict: if dashboards_path: - def _is_dashboard(p: Path) -> bool: + def is_dashboard(p: Path) -> bool: return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl")) - for path in filter(_is_dashboard, Path(dashboards_path).glob("*")): + for path in filter(is_dashboard, Path(dashboards_path).glob("*")): # path = Path(path) - if event.app.name in path.name: + if event.app.name in path.name: # type: ignore id = "file:{}".format(path.stem) builtins[id] = self._content_to_dashboard_object( _encode_dashboard_content(path.read_bytes()), event @@ -1843,7 +1863,7 @@ def _is_dashboard(p: Path) -> bool: def _content_to_dashboard_object(self, content: str, event: RelationEvent) -> Dict: return { - "charm": event.app.name, + "charm": event.app.name, # type: ignore "content": content, "juju_topology": self._juju_topology(event), "inject_dropdowns": True, @@ -1856,8 +1876,8 @@ def _juju_topology(self, event: RelationEvent) -> Dict: return { "model": self._charm.model.name, "model_uuid": self._charm.model.uuid, - "application": event.app.name, - "unit": event.unit.name, + "application": event.app.name, # type: ignore + "unit": event.unit.name, # type: ignore } diff --git a/lib/charms/grafana_k8s/v0/grafana_source.py b/lib/charms/grafana_k8s/v0/grafana_source.py index db7e63b9..a94d78c2 100644 --- a/lib/charms/grafana_k8s/v0/grafana_source.py +++ b/lib/charms/grafana_k8s/v0/grafana_source.py @@ -160,7 +160,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 13 +LIBPATCH = 14 logger = logging.getLogger(__name__) @@ -459,6 +459,9 @@ def _set_unit_details(self, _: Union[BoundEvent, RelationEvent, Relation]): """ for relation in self._charm.model.relations[self._relation_name]: url = self._source_url or "{}:{}".format(socket.getfqdn(), self._source_port) + if self._source_type == "mimir": + url = f"{url}/prometheus" + 
relation.data[self._charm.unit]["grafana_source_host"] = url @@ -495,7 +498,7 @@ def __init__( # We're stuck with this forever now so upgrades work, or until such point as we can # break compatibility - self._stored.set_default( + self._stored.set_default( # type: ignore sources=dict(), sources_to_delete=set(), ) @@ -664,13 +667,13 @@ def upgrade_keys(self) -> None: ) # If there's stored data, merge it and purge it - if self._stored.sources: + if self._stored.sources: # type: ignore self._stored.sources = {} peer_sources = self.get_peer_data("sources") sources.update(peer_sources) self.set_peer_data("sources", sources) - if self._stored.sources_to_delete: + if self._stored.sources_to_delete: # type: ignore old_sources_to_delete = _type_convert_stored(self._stored.sources_to_delete) self._stored.sources_to_delete = set() peer_sources_to_delete = set(self.get_peer_data("sources_to_delete")) diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index f080fb84..c3ed5e5b 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -342,13 +342,22 @@ def _on_scrape_targets_changed(self, event): import tempfile from collections import defaultdict from pathlib import Path -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from urllib.parse import urlparse import yaml from charms.observability_libs.v0.juju_topology import JujuTopology from ops.charm import CharmBase, RelationRole -from ops.framework import BoundEvent, EventBase, EventSource, Object, ObjectEvents +from ops.framework import ( + BoundEvent, + EventBase, + EventSource, + Object, + ObjectEvents, + StoredDict, + StoredList, + StoredState, +) from ops.model import Relation # The unique Charmhub library identifier, never change it @@ -359,7 +368,7 @@ def _on_scrape_targets_changed(self, event): # Increment this 
PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 27 +LIBPATCH = 29 logger = logging.getLogger(__name__) @@ -683,6 +692,19 @@ class MetricsEndpointProviderEvents(ObjectEvents): alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) +def _type_convert_stored(obj): + """Convert Stored* to their appropriate types, recursively.""" + if isinstance(obj, StoredList): + return list(map(_type_convert_stored, obj)) + elif isinstance(obj, StoredDict): + rdict = {} # type: Dict[Any, Any] + for k in obj.keys(): + rdict[k] = _type_convert_stored(obj[k]) + return rdict + else: + return obj + + def _validate_relation_by_interface_and_direction( charm: CharmBase, relation_name: str, @@ -1569,20 +1591,10 @@ def __init__( if not isinstance(refresh_event, list): refresh_event = [refresh_event] + self.framework.observe(events.relation_joined, self.set_scrape_job_spec) for ev in refresh_event: self.framework.observe(ev, self.set_scrape_job_spec) - # Update relation data every reinit. If instead we used event hooks then observing only - # relation-joined would not be sufficient: - # - Would need to observe leader-elected, in case there was no leader during - # relation-joined. - # - If later related to an ingress provider, then would need to register and wait for - # update-status interval to elapse before changes would apply. - # - The ingerss-ready custom event is currently emitted prematurely and cannot be relied - # upon: https://github.com/canonical/traefik-k8s-operator/issues/78 - # NOTE We may still end up waiting for update-status before changes are applied. 
- self.set_scrape_job_spec() - def _on_relation_changed(self, event): """Check for alert rule messages in the relation data before moving on.""" if self._charm.unit.is_leader(): @@ -1823,7 +1835,6 @@ class MetricsEndpointAggregator(Object): derive from `MetricsEndpointAggregator` overriding the `_get_targets()` method, which is responsible for aggregating the unit name, host address ("hostname") and port of the scrape target. - `MetricsEndpointAggregator` also assumes that each unit of a scrape target sets in its unit-level relation data a key named "groups". The value of this key is expected to be the string @@ -1858,7 +1869,9 @@ class MetricsEndpointAggregator(Object): constructing an aggregator object. """ - def __init__(self, charm, relation_names, relabel_instance=True): + _stored = StoredState() + + def __init__(self, charm, relation_names: Optional[dict] = None, relabel_instance=True): """Construct a `MetricsEndpointAggregator`. Args: @@ -1875,12 +1888,19 @@ def __init__(self, charm, relation_names, relabel_instance=True): relabel_instance: A boolean flag indicating if Prometheus scrape job "instance" labels must refer to Juju Topology. 
""" - super().__init__(charm, relation_names["prometheus"]) - self._charm = charm - self._target_relation = relation_names["scrape_target"] - self._prometheus_relation = relation_names["prometheus"] - self._alert_rules_relation = relation_names["alert_rules"] + + relation_names = relation_names or {} + + self._prometheus_relation = relation_names.get( + "prometheus", "downstream-prometheus-scrape" + ) + self._target_relation = relation_names.get("scrape_target", "prometheus-target") + self._alert_rules_relation = relation_names.get("alert_rules", "prometheus-rules") + + super().__init__(charm, self._prometheus_relation) + self._stored.set_default(jobs=[], alert_rules=[]) + self._relabel_instance = relabel_instance # manage Prometheus charm relation events @@ -1889,13 +1909,15 @@ def __init__(self, charm, relation_names, relabel_instance=True): # manage list of Prometheus scrape jobs from related scrape targets target_events = self._charm.on[self._target_relation] - self.framework.observe(target_events.relation_changed, self._update_prometheus_jobs) - self.framework.observe(target_events.relation_departed, self._remove_prometheus_jobs) + self.framework.observe(target_events.relation_changed, self._on_prometheus_targets_changed) + self.framework.observe( + target_events.relation_departed, self._on_prometheus_targets_departed + ) # manage alert rules for Prometheus from related scrape targets alert_rule_events = self._charm.on[self._alert_rules_relation] - self.framework.observe(alert_rule_events.relation_changed, self._update_alert_rules) - self.framework.observe(alert_rule_events.relation_departed, self._remove_alert_rules) + self.framework.observe(alert_rule_events.relation_changed, self._on_alert_rules_changed) + self.framework.observe(alert_rule_events.relation_departed, self._on_alert_rules_departed) def _set_prometheus_data(self, event): """Ensure every new Prometheus instances is updated. 
@@ -1904,25 +1926,41 @@ def _set_prometheus_data(self, event): `MetricsEndpointAggregator`, that Prometheus unit is provided with the complete set of existing scrape jobs and alert rules. """ - jobs = [] # list of scrape jobs, one per relation + jobs = [] + _type_convert_stored( + self._stored.jobs + ) # list of scrape jobs, one per relation for relation in self.model.relations[self._target_relation]: targets = self._get_targets(relation) if targets and relation.app: jobs.append(self._static_scrape_job(targets, relation.app.name)) - groups = [] # list of alert rule groups, one group per relation + groups = [] + _type_convert_stored(self._stored.alert_rules) # list of alert rule groups for relation in self.model.relations[self._alert_rules_relation]: unit_rules = self._get_alert_rules(relation) if unit_rules and relation.app: appname = relation.app.name rules = self._label_alert_rules(unit_rules, appname) - group = {"name": self._group_name(appname), "rules": rules} + group = {"name": self.group_name(appname), "rules": rules} groups.append(group) event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) - def _set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: + def _on_prometheus_targets_changed(self, event): + """Update scrape jobs in response to scrape target changes. + + When there is any change in relation data with any scrape + target, the Prometheus scrape job, for that specific target is + updated. + """ + targets = self._get_targets(event.relation) + if not targets: + return + + # new scrape job for the relation that has changed + self.set_target_job_data(targets, event.relation.app.name) + + def set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: """Update scrape jobs in response to scrape target changes. 
When there is any change in relation data with any scrape @@ -1944,28 +1982,10 @@ def _set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: jobs.append(updated_job) relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - def _update_prometheus_jobs(self, event): - """Update scrape jobs in response to scrape target changes. - - When there is any change in relation data with any scrape - target, the Prometheus scrape job, for that specific target is - updated. - """ - targets = self._get_targets(event.relation) - if not targets: - return + if not _type_convert_stored(self._stored.jobs) == jobs: + self._stored.jobs = jobs - # new scrape job for the relation that has changed - updated_job = self._static_scrape_job(targets, event.relation.app.name) - - for relation in self.model.relations[self._prometheus_relation]: - jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) - # list of scrape jobs that have not changed - jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] - jobs.append(updated_job) - relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - - def _remove_prometheus_jobs(self, event): + def _on_prometheus_targets_departed(self, event): """Remove scrape jobs when a target departs. Any time a scrape target departs, any Prometheus scrape job @@ -1973,7 +1993,18 @@ def _remove_prometheus_jobs(self, event): """ job_name = self._job_name(event.relation.app.name) unit_name = event.unit.name + self.remove_prometheus_jobs(job_name, unit_name) + def remove_prometheus_jobs(self, job_name: str, unit_name: Optional[str] = ""): + """Given a job name and unit name, remove scrape jobs associated. 
+ + The `unit_name` parameter is used for automatic, relation data bag-based + generation, where the unit name in labels can be used to ensure that jobs with + similar names (which are generated via the app name when scanning relation data + bags) are not accidentally removed, as their unit name labels will differ. + For NRPE, the job name is calculated from an ID sent via the NRPE relation, and is + sufficient to uniquely identify the target. + """ for relation in self.model.relations[self._prometheus_relation]: jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) if not jobs: @@ -2000,7 +2031,125 @@ def _remove_prometheus_jobs(self, event): relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - def _update_alert_rules(self, event): + if not _type_convert_stored(self._stored.jobs) == jobs: + self._stored.jobs = jobs + + def _job_name(self, appname) -> str: + """Construct a scrape job name. + + Each relation has its own unique scrape job name. All units in + the relation are scraped as part of the same scrape job. + + Args: + appname: string name of a related application. + + Returns: + a string Prometheus scrape job name for the application. + """ + return "juju_{}_{}_{}_prometheus_scrape".format( + self.model.name, self.model.uuid[:7], appname + ) + + def _get_targets(self, relation) -> dict: + """Fetch scrape targets for a relation. + + Scrape target information is returned for each unit in the + relation. This information contains the unit name, network + hostname (or address) for that unit, and port on which a + metrics endpoint is exposed in that unit. + + Args: + relation: an `ops.model.Relation` object for which scrape + targets are required. + + Returns: + a dictionary whose keys are names of the units in the + relation. 
The values associated with each key is itself + a dictionary of the form + ``` + {"hostname": hostname, "port": port} + ``` + """ + targets = {} + for unit in relation.units: + port = relation.data[unit].get("port", 80) + hostname = relation.data[unit].get("hostname") + if hostname: + targets.update({unit.name: {"hostname": hostname, "port": port}}) + + return targets + + def _static_scrape_job(self, targets, application_name, **kwargs) -> dict: + """Construct a static scrape job for an application. + + Args: + targets: a dictionary providing hostname and port for all + scrape target. The keys of this dictionary are unit + names. Values corresponding to these keys are + themselves a dictionary with keys "hostname" and + "port". + application_name: a string name of the application for + which this static scrape job is being constructed. + + Returns: + A dictionary corresponding to a Prometheus static scrape + job configuration for one application. The returned + dictionary may be transformed into YAML and appended to + the list of any existing list of Prometheus static configs. + """ + juju_model = self.model.name + juju_model_uuid = self.model.uuid + job = { + "job_name": self._job_name(application_name), + "static_configs": [ + { + "targets": ["{}:{}".format(target["hostname"], target["port"])], + "labels": { + "juju_model": juju_model, + "juju_model_uuid": juju_model_uuid, + "juju_application": application_name, + "juju_unit": unit_name, + "host": target["hostname"], + }, + } + for unit_name, target in targets.items() + ], + "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []), + } + job.update(kwargs.get("updates", {})) + + return job + + @property + def _relabel_configs(self) -> list: + """Create Juju topology relabeling configuration. + + Using Juju topology for instance labels ensures that these + labels are stable across unit recreation. + + Returns: + a list of Prometheus relabeling configurations. 
Each item in + this list is one relabel configuration. + """ + return ( + [ + { + "source_labels": [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_unit", + ], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + ] + if self._relabel_instance + else [] + ) + + def _on_alert_rules_changed(self, event): """Update alert rules in response to scrape target changes. When there is any change in alert rule relation data for any @@ -2011,28 +2160,49 @@ def _update_alert_rules(self, event): if not unit_rules: return - appname = event.relation.app.name - rules = self._label_alert_rules(unit_rules, appname) - # the alert rule group that has changed - updated_group = {"name": self._group_name(appname), "rules": rules} + app_name = event.relation.app.name + self.set_alert_rule_data(app_name, unit_rules) + + def set_alert_rule_data(self, name: str, unit_rules: dict, label_rules: bool = True) -> None: + """Update alert rule data. + + The unit rules should be a dict, which has additional Juju topology labels added. For + rules generated by the NRPE exporter, they are pre-labeled so lookups can be performed. 
+ """ + if label_rules: + rules = self._label_alert_rules(unit_rules, name) + else: + rules = [unit_rules] + updated_group = {"name": self.group_name(name), "rules": rules} for relation in self.model.relations[self._prometheus_relation]: alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) groups = alert_rules.get("groups", []) # list of alert rule groups that have not changed - groups = [group for group in groups if updated_group["name"] != group["name"]] - groups.append(updated_group) + for group in groups: + if group["name"] == updated_group["name"]: + group["rules"] = [r for r in group["rules"] if r not in updated_group["rules"]] + group["rules"].extend(updated_group["rules"]) + + if updated_group["name"] not in [g["name"] for g in groups]: + groups.append(updated_group) relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) - def _remove_alert_rules(self, event): + if not _type_convert_stored(self._stored.alert_rules) == groups: + self._stored.alert_rules = groups + + def _on_alert_rules_departed(self, event): """Remove alert rules for departed targets. Any time a scrape target departs any alert rules associated with that specific scrape target is removed. """ - group_name = self._group_name(event.relation.app.name) + group_name = self.group_name(event.relation.app.name) unit_name = event.unit.name + self.remove_alert_rules(group_name, unit_name) + def remove_alert_rules(self, group_name: str, unit_name: str) -> None: + """Remove an alert rule group from relation data.""" for relation in self.model.relations[self._prometheus_relation]: alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) if not alert_rules: @@ -2065,34 +2235,8 @@ def _remove_alert_rules(self, event): json.dumps({"groups": groups}) if groups else "{}" ) - def _get_targets(self, relation) -> dict: - """Fetch scrape targets for a relation. - - Scrape target information is returned for each unit in the - relation. 
This information contains the unit name, network - hostname (or address) for that unit, and port on which a - metrics endpoint is exposed in that unit. - - Args: - relation: an `ops.model.Relation` object for which scrape - targets are required. - - Returns: - a dictionary whose keys are names of the units in the - relation. There values associated with each key is itself - a dictionary of the form - ``` - {"hostname": hostname, "port": port} - ``` - """ - targets = {} - for unit in relation.units: - port = relation.data[unit].get("port", 80) - hostname = relation.data[unit].get("hostname") - if hostname: - targets.update({unit.name: {"hostname": hostname, "port": port}}) - - return targets + if not _type_convert_stored(self._stored.alert_rules) == groups: + self._stored.alert_rules = groups def _get_alert_rules(self, relation) -> dict: """Fetch alert rules for a relation. @@ -2120,23 +2264,7 @@ def _get_alert_rules(self, relation) -> dict: return rules - def _job_name(self, appname) -> str: - """Construct a scrape job name. - - Each relation has its own unique scrape job name. All units in - the relation are scraped as part of the same scrape job. - - Args: - appname: string name of a related application. - - Returns: - a string Prometheus scrape job name for the application. - """ - return "juju_{}_{}_{}_prometheus_scrape".format( - self.model.name, self.model.uuid[:7], appname - ) - - def _group_name(self, appname) -> str: + def group_name(self, unit_name: str) -> str: """Construct name for an alert rule group. Each unit in a relation may define its own alert rules. All @@ -2144,20 +2272,21 @@ def _group_name(self, appname) -> str: given a single alert rule group name. Args: - appname: string name of a related application. + unit_name: string name of a related application. Returns: - a string Prometheus alert rules group name for the application. + a string Prometheus alert rules group name for the unit. 
""" - return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], appname) + unit_name = re.sub(r"/", "_", unit_name) + return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], unit_name) - def _label_alert_rules(self, unit_rules, appname) -> list: + def _label_alert_rules(self, unit_rules, app_name: str) -> list: """Apply juju topology labels to alert rules. Args: unit_rules: a list of alert rules, where each rule is in dictionary format. - appname: a string name of the application to which the + app_name: a string name of the application to which the alert rules belong. Returns: @@ -2169,7 +2298,7 @@ def _label_alert_rules(self, unit_rules, appname) -> list: # the new JujuTopology removed this, so build it up by hand matchers = { "juju_{}".format(k): v - for k, v in JujuTopology(self.model.name, self.model.uuid, appname, unit_name) + for k, v in JujuTopology(self.model.name, self.model.uuid, app_name, unit_name) .as_dict(excluded_keys=["charm_name"]) .items() } @@ -2178,76 +2307,6 @@ def _label_alert_rules(self, unit_rules, appname) -> list: return labeled_rules - def _static_scrape_job(self, targets, application_name, **kwargs) -> dict: - """Construct a static scrape job for an application. - - Args: - targets: a dictionary providing hostname and port for all - scrape target. The keys of this dictionary are unit - names. Values corresponding to these keys are - themselves a dictionary with keys "hostname" and - "port". - application_name: a string name of the application for - which this static scrape job is being constructed. - - Returns: - A dictionary corresponding to a Prometheus static scrape - job configuration for one application. The returned - dictionary may be transformed into YAML and appended to - the list of any existing list of Prometheus static configs. 
- """ - juju_model = self.model.name - juju_model_uuid = self.model.uuid - job = { - "job_name": self._job_name(application_name), - "static_configs": [ - { - "targets": ["{}:{}".format(target["hostname"], target["port"])], - "labels": { - "juju_model": juju_model, - "juju_model_uuid": juju_model_uuid, - "juju_application": application_name, - "juju_unit": unit_name, - "host": target["hostname"], - }, - } - for unit_name, target in targets.items() - ], - "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []), - } - job.update(kwargs.get("updates", {})) - - return job - - @property - def _relabel_configs(self) -> list: - """Create Juju topology relabeling configuration. - - Using Juju topology for instance labels ensures that these - labels are stable across unit recreation. - - Returns: - a list of Prometheus relabeling configurations. Each item in - this list is one relabel configuration. - """ - return ( - [ - { - "source_labels": [ - "juju_model", - "juju_model_uuid", - "juju_application", - "juju_unit", - ], - "separator": "_", - "target_label": "instance", - "regex": "(.*)", - } - ] - if self._relabel_instance - else [] - ) - class CosTool: """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" diff --git a/lib/charms/traefik_k8s/v1/ingress.py b/lib/charms/traefik_k8s/v1/ingress.py index 898b609d..978f01a5 100644 --- a/lib/charms/traefik_k8s/v1/ingress.py +++ b/lib/charms/traefik_k8s/v1/ingress.py @@ -69,7 +69,7 @@ def _on_ingress_revoked(self, event: IngressPerAppRevokedEvent): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 8 +LIBPATCH = 9 DEFAULT_RELATION_NAME = "ingress" RELATION_INTERFACE = "ingress" @@ -202,7 +202,7 @@ def __init__(self, handle, relation, *args, **kwargs): obj = kwargs.get(attr, default) setattr(self, attr, obj) - def snapshot(self) -> dict: + def snapshot(self): dct = super().snapshot() 
for attr in self.__attrs__(): obj = getattr(self, attr) @@ -217,7 +217,7 @@ def snapshot(self) -> dict: return dct - def restore(self, snapshot: dict) -> None: + def restore(self, snapshot) -> None: super().restore(snapshot) for attr, obj in snapshot.items(): setattr(self, attr, obj) @@ -250,7 +250,7 @@ class IngressPerAppProviderEvents(ObjectEvents): class IngressPerAppProvider(_IngressPerAppBase): """Implementation of the provider of ingress.""" - on = IngressPerAppProviderEvents() + on = IngressPerAppProviderEvents() # type: ignore def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): """Constructor for IngressPerAppProvider. @@ -406,7 +406,8 @@ class IngressPerAppRequirerEvents(ObjectEvents): class IngressPerAppRequirer(_IngressPerAppBase): """Implementation of the requirer of the ingress relation.""" - on = IngressPerAppRequirerEvents() + on = IngressPerAppRequirerEvents() # type: ignore + + # used to prevent spurious urls to be sent out if the event we're currently # handling is a relation-broken one. _stored = StoredState()