From e28bf9b2dc0e80fd1add5a6e464955577d0b33b6 Mon Sep 17 00:00:00 2001 From: wf-jenkins <34043090+wf-jenkins@users.noreply.github.com> Date: Wed, 4 Oct 2023 14:50:53 -0700 Subject: [PATCH] Release operator version: 2.14.1 * Reuse the last copied image since release failed due to clean-cluster issue Co-authored-by: Mark Michael Co-authored-by: Jerry Belmonte Co-authored-by: Anil Kodali Co-authored-by: Yuqi Jin Co-authored-by: Priya Selvaganesan * Update operator version from the file locally for the test Co-authored-by: Jerry Belmonte Co-authored-by: Anil Kodali Co-authored-by: Yuqi Jin Co-authored-by: Priya Selvaganesan Co-authored-by: John Cornish * Fix the operator version file path Co-authored-by: Anil Kodali Co-authored-by: Yuqi Jin Co-authored-by: Priya Selvaganesan Co-authored-by: John Cornish Co-authored-by: Mark Michael * Release operator version: 2.14.1 --------- Co-authored-by: John Cornish Co-authored-by: Mark Michael Co-authored-by: Jerry Belmonte Co-authored-by: Anil Kodali Co-authored-by: Yuqi Jin Co-authored-by: Priya Selvaganesan --- README.md | 33 ++- collector/release/NEXT_RELEASE_VERSION | 2 +- collector/release/VERSION | 2 +- deploy/crd/wavefront.com_wavefronts.yaml | 18 +- deploy/scenarios/wavefront-full-config.yaml | 11 +- deploy/scenarios/wavefront-pod-resources.yaml | 4 +- deploy/wavefront-operator.yaml | 24 +- docs/alerts/alerts.md | 93 ++++--- docs/alerts/create-alert.sh | 211 ++++++++++++--- docs/alerts/create-all-alerts.sh | 246 ++++++++++++++++++ .../container-cpu-overutilization.json.tmpl | 51 +++- .../container-cpu-throttling.json.tmpl | 53 +++- ...container-memory-overutilization.json.tmpl | 67 +++++ .../alerts/templates/etcd-no-leader.json.tmpl | 53 ++++ .../node-condition-not-ready.json.tmpl | 53 ++++ .../node-cpu-overutilization.json.tmpl | 64 +++++ .../node-cpu-request-saturation.json.tmpl | 70 +++++ .../templates/node-disk-pressure.json.tmpl | 53 ++++ .../node-filesystem-overutilization.json.tmpl | 71 +++++ 
.../node-memory-overutilization.json.tmpl | 71 +++++ .../templates/node-memory-pressure.json.tmpl | 60 +++++ .../node-memory-request-saturation.json.tmpl | 70 +++++ .../observability-status-unhealthy.json.tmpl | 53 ++++ ...ent-volume-claim-overutilization.json.tmpl | 61 +++++ .../persistent-volumes-error.json.tmpl | 46 ++++ .../persistent-volumes-no-claim.json.tmpl | 46 ++++ .../templates/pod-backoff-event.json.tmpl | 47 ++-- .../pod-out-of-memory-kills.json.tmpl | 47 ++-- .../templates/pod-stuck-in-pending.json.tmpl | 45 ++-- .../pod-stuck-in-terminating.json.tmpl | 47 ++-- .../templates/workload-not-ready.json.tmpl | 46 +++- docs/operator/custom-configuration.md | 6 +- .../config/manager/component_versions.yaml | 2 +- operator/config/manager/kustomization.yaml | 2 +- .../deploy/wavefront-operator.yaml | 24 +- .../docs/operator/custom-configuration.md | 4 +- operator/release/NEXT_RELEASE_VERSION | 2 +- operator/release/OPERATOR_VERSION | 2 +- release.Jenkinsfile | 2 +- scripts/promote-release-images.sh | 4 +- 40 files changed, 1634 insertions(+), 232 deletions(-) create mode 100755 docs/alerts/create-all-alerts.sh create mode 100644 docs/alerts/templates/container-memory-overutilization.json.tmpl create mode 100644 docs/alerts/templates/etcd-no-leader.json.tmpl create mode 100644 docs/alerts/templates/node-condition-not-ready.json.tmpl create mode 100644 docs/alerts/templates/node-cpu-overutilization.json.tmpl create mode 100644 docs/alerts/templates/node-cpu-request-saturation.json.tmpl create mode 100644 docs/alerts/templates/node-disk-pressure.json.tmpl create mode 100644 docs/alerts/templates/node-filesystem-overutilization.json.tmpl create mode 100644 docs/alerts/templates/node-memory-overutilization.json.tmpl create mode 100644 docs/alerts/templates/node-memory-pressure.json.tmpl create mode 100644 docs/alerts/templates/node-memory-request-saturation.json.tmpl create mode 100644 docs/alerts/templates/observability-status-unhealthy.json.tmpl create mode 
100644 docs/alerts/templates/persistent-volume-claim-overutilization.json.tmpl create mode 100644 docs/alerts/templates/persistent-volumes-error.json.tmpl create mode 100644 docs/alerts/templates/persistent-volumes-no-claim.json.tmpl diff --git a/README.md b/README.md index 1f9f3e965..f4d424b84 100644 --- a/README.md +++ b/README.md @@ -147,11 +147,17 @@ We have templates for common scenarios. See the comments in each file for usage You can see all configuration options in the [wavefront-full-config.yaml](deploy/scenarios/wavefront-full-config.yaml). -# Creating Alerts +## Creating Alerts We have alerts on common Kubernetes issues. For details on creating alerts, see [alerts.md](docs/alerts/alerts.md). -### Pod Failure +### Observability Failures + +| Alert name | Description | +|---|---| +| [Observability Status is Unhealthy](docs/alerts/templates/observability-status-unhealthy.json.tmpl) | The status of the Observability for Kubernetes is unhealthy. | + +### Pod Failures | Alert name | Description | |---|---| @@ -162,6 +168,29 @@ We have alerts on common Kubernetes issues. For details on creating alerts, see | [Pod Out-of-memory Kills](docs/alerts/templates/pod-out-of-memory-kills.json.tmpl) | Workload has pod with container status `OOMKilled`. | | [Container CPU Throttling](docs/alerts/templates/container-cpu-throttling.json.tmpl) | Workload has a container with high CPU throttling. | | [Container CPU Overutilization](docs/alerts/templates/container-cpu-overutilization.json.tmpl) | Workload has a container with high CPU utilization. | +| [Container Memory Overutilization](docs/alerts/templates/container-memory-overutilization.json.tmpl) | Workload has a container with high memory utilization. | +| [Missing etcd leader](docs/alerts/templates/etcd-no-leader.json.tmpl) | etcd cannot elect a leader. 
| + +### Persistent Volume Failures + +| Alert name | Description | +|---|---| +| [Persistent Volumes No Claim](docs/alerts/templates/persistent-volumes-no-claim.json.tmpl) | Persistent Volume has no claim. | +| [Persistent Volumes Error](docs/alerts/templates/persistent-volumes-error.json.tmpl) | Persistent Volume has issues with provisioning. | +| [Persistent Volume Claim Overutilization](docs/alerts/templates/persistent-volume-claim-overutilization.json.tmpl) | Workload has low available disk space for a claimed Persistent Volume. | + +### Node Failures + +| Alert name | Description | +|----------------------------------------------------------------------------------------------------|-------------| +| [Node Memory Overutilization](docs/alerts/templates/node-memory-overutilization.json.tmpl) | Node has high memory utilization. | +| [Node CPU Overutilization](docs/alerts/templates/node-cpu-overutilization.json.tmpl) | Node has high CPU utilization. | +| [Node Filesystem Overutilization](docs/alerts/templates/node-filesystem-overutilization.json.tmpl) | Node storage is almost full. | +| [Node CPU-request Saturation](docs/alerts/templates/node-cpu-request-saturation.json.tmpl) | Node has overcommitted cpu resource requests. | +| [Node Memory-request Saturation](docs/alerts/templates/node-memory-request-saturation.json.tmpl) | Node has overcommitted memory resource requests. | +| [Node Disk Pressure](docs/alerts/templates/node-disk-pressure.json.tmpl) | Node has problematic `DiskPressure` condition. | +| [Node Memory Pressure](docs/alerts/templates/node-memory-pressure.json.tmpl) | Node has problematic `MemoryPressure` condition. | +| [Node Condition Not Ready](docs/alerts/templates/node-condition-not-ready.json.tmpl) | Node Condition not in Ready state. 
| ## Bring Your Own Logs Shipper diff --git a/collector/release/NEXT_RELEASE_VERSION b/collector/release/NEXT_RELEASE_VERSION index dd43a143f..5db08bf2d 100644 --- a/collector/release/NEXT_RELEASE_VERSION +++ b/collector/release/NEXT_RELEASE_VERSION @@ -1 +1 @@ -1.26.1 +1.27.0 diff --git a/collector/release/VERSION b/collector/release/VERSION index ad2191947..dd43a143f 100644 --- a/collector/release/VERSION +++ b/collector/release/VERSION @@ -1 +1 @@ -1.25.0 +1.26.1 diff --git a/deploy/crd/wavefront.com_wavefronts.yaml b/deploy/crd/wavefront.com_wavefronts.yaml index 72f99388e..2789b2050 100644 --- a/deploy/crd/wavefront.com_wavefronts.yaml +++ b/deploy/crd/wavefront.com_wavefronts.yaml @@ -158,7 +158,7 @@ spec: default: resources: limits: - cpu: 400m + cpu: 2000m ephemeral-storage: 1Gi memory: 512Mi requests: @@ -297,7 +297,7 @@ spec: default: resources: limits: - cpu: 200m + cpu: 1000m ephemeral-storage: 512Mi memory: 256Mi requests: @@ -712,19 +712,21 @@ spec: type: object type: object type: object - kubernetesEvents: - description: KubernetesEvents is deprecated, please use aria-insights-secret - instead + insights: + description: Insights properties: enable: default: false - description: Enable is whether to enable events. Defaults + description: Enable is whether to enable Insights. Defaults to false. type: boolean - externalEndpointURL: + ingestionUrl: + description: Ingestion Url is the endpoint to send kubernetes + events. + pattern: ^http(s)?:\/\/.+ type: string required: - - externalEndpointURL + - ingestionUrl type: object type: object imagePullSecret: diff --git a/deploy/scenarios/wavefront-full-config.yaml b/deploy/scenarios/wavefront-full-config.yaml index a5e9b0930..7c71a5e71 100644 --- a/deploy/scenarios/wavefront-full-config.yaml +++ b/deploy/scenarios/wavefront-full-config.yaml @@ -1,5 +1,6 @@ # Need to change YOUR_CLUSTER_NAME and YOUR_WAVEFRONT_URL accordingly # This is not a valid configuration since some options are not compatible. 
See notes for more information. +# Unless otherwise specified, the values here are set to their default values. apiVersion: wavefront.com/v1alpha1 kind: Wavefront metadata: @@ -56,16 +57,16 @@ spec: - kubernetes.collector.runtime.* tagGuaranteeList: - label.env - defaultCollectionInterval: 90s #defaults to 60s + defaultCollectionInterval: 60s # Rules based and Prometheus endpoints auto-discovery. - enableDiscovery: true #defaults to true + enableDiscovery: true # controlPlane can enable/disable control plane metrics controlPlane: - enable: true #defaults to true + enable: true clusterCollector: resources: limits: - cpu: 400m + cpu: 2000m ephemeral-storage: 1Gi memory: 512Mi requests: @@ -75,7 +76,7 @@ spec: nodeCollector: resources: limits: - cpu: 200m + cpu: 1000m ephemeral-storage: 512Mi memory: 256Mi requests: diff --git a/deploy/scenarios/wavefront-pod-resources.yaml b/deploy/scenarios/wavefront-pod-resources.yaml index 6d8c2f4f5..3c9937e20 100644 --- a/deploy/scenarios/wavefront-pod-resources.yaml +++ b/deploy/scenarios/wavefront-pod-resources.yaml @@ -16,7 +16,7 @@ spec: cpu: 200m memory: 10Mi limits: - cpu: 400m + cpu: 2000m memory: 512Mi nodeCollector: resources: @@ -24,7 +24,7 @@ spec: cpu: 200m memory: 10Mi limits: - cpu: 200m + cpu: 1000m memory: 256Mi dataExport: wavefrontProxy: diff --git a/deploy/wavefront-operator.yaml b/deploy/wavefront-operator.yaml index e259eb02b..26aa5fcaa 100644 --- a/deploy/wavefront-operator.yaml +++ b/deploy/wavefront-operator.yaml @@ -165,7 +165,7 @@ spec: default: resources: limits: - cpu: 400m + cpu: 2000m ephemeral-storage: 1Gi memory: 512Mi requests: @@ -304,7 +304,7 @@ spec: default: resources: limits: - cpu: 200m + cpu: 1000m ephemeral-storage: 512Mi memory: 256Mi requests: @@ -719,19 +719,21 @@ spec: type: object type: object type: object - kubernetesEvents: - description: KubernetesEvents is deprecated, please use aria-insights-secret - instead + insights: + description: Insights properties: enable: default: false - 
description: Enable is whether to enable events. Defaults + description: Enable is whether to enable Insights. Defaults to false. type: boolean - externalEndpointURL: + ingestionUrl: + description: Ingestion Url is the endpoint to send kubernetes + events. + pattern: ^http(s)?:\/\/.+ type: string required: - - externalEndpointURL + - ingestionUrl type: object type: object imagePullSecret: @@ -1441,9 +1443,9 @@ subjects: --- apiVersion: v1 data: - collector: 1.25.0 + collector: 1.26.1 logging: 2.1.9 - proxy: "13.1" + proxy: "13.2" kind: ConfigMap metadata: labels: @@ -1513,7 +1515,7 @@ spec: configMapKeyRef: key: logging name: wavefront-component-versions - image: projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.13.0 + image: projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.14.1 imagePullPolicy: Always livenessProbe: httpGet: diff --git a/docs/alerts/alerts.md b/docs/alerts/alerts.md index 0aa391495..8e225202e 100644 --- a/docs/alerts/alerts.md +++ b/docs/alerts/alerts.md @@ -1,64 +1,81 @@ # Alerts -This page contains the steps to create an alert template. -We have alert templates on common Kubernetes issues. +This page contains the steps to create alerts for the Observability for Kubernetes Operator. 
-* [Detect pod stuck in pending](templates/pod-stuck-in-pending.json.tmpl) -* [Detect pod stuck in terminating](templates/pod-stuck-in-terminating.json.tmpl) -* [Detect pod backoff event](templates/pod-backoff-event.json.tmpl) -* [Detect workload with non-ready pods](templates/workload-not-ready.json.tmpl) -* [Detect pod out-of-memory kills](templates/pod-out-of-memory-kills.json.tmpl) -* [Detect container cpu throttling](templates/container-cpu-throttling.json.tmpl) -* [Detect container cpu overutilization](templates/container-cpu-overutilization.json.tmpl) +## Table of Contents -## Flags +- [Alert Templates](#alert-templates) +- [Creating Alerts](#creating-alerts) +- [Example: Creating All the Alerts](#example-creating-all-the-alerts) +- [Example: Creating a Single Alert](#example-creating-a-single-alert) +- [Customizing Alerts](#customizing-alerts) -``` -Usage of ./create-alert.sh: - -t (Required) Wavefront API token - -c (Required) Wavefront instance name - -f (Required) path to alert file template - -n (Required) kubernetes cluster name - -h print usage info and exit -``` +## Alert Templates + +We have alert templates on common Kubernetes issues. 
-## Create an alert +| Alert | Template | +|---|---| +| [Detect if observability status is unhealthy](templates/observability-status-unhealthy.json.tmpl) | `observability-status-unhealthy.json.tmpl` | +| [Detect pod stuck in pending](templates/pod-stuck-in-pending.json.tmpl) | `pod-stuck-in-pending.json.tmpl` | +| [Detect pod stuck in terminating](templates/pod-stuck-in-terminating.json.tmpl) | `pod-stuck-in-terminating.json.tmpl` | +| [Detect pod backoff event](templates/pod-backoff-event.json.tmpl) | `pod-backoff-event.json.tmpl` | +| [Detect workload with non-ready pods](templates/workload-not-ready.json.tmpl) | `workload-not-ready.json.tmpl` | +| [Detect pod out-of-memory kills](templates/pod-out-of-memory-kills.json.tmpl) | `pod-out-of-memory-kills.json.tmpl` | +| [Detect container cpu throttling](templates/container-cpu-throttling.json.tmpl) | `container-cpu-throttling.json.tmpl` | +| [Detect container cpu overutilization](templates/container-cpu-overutilization.json.tmpl) | `container-cpu-overutilization.json.tmpl` | +| [Detect persistent volumes with no claims](templates/persistent-volumes-no-claim.json.tmpl) | `persistent-volumes-no-claim.json.tmpl` | +| [Detect persistent volumes with error](templates/persistent-volumes-error.json.tmpl) | `persistent-volumes-error.json.tmpl` | +| [Detect persistent volumes filling up](templates/persistent-volume-claim-overutilization.json.tmpl) | `persistent-volume-claim-overutilization.json.tmpl` | +| [Detect node memory overutilization](templates/node-memory-overutilization.json.tmpl) | `node-memory-overutilization.json.tmpl` | +| [Detect node cpu overutilization](templates/node-cpu-overutilization.json.tmpl) | `node-cpu-overutilization.json.tmpl` | +| [Detect node filesystem overutilization](templates/node-filesystem-overutilization.json.tmpl) | `node-filesystem-overutilization.json.tmpl` | +| [Detect node cpu-request saturation](templates/node-cpu-request-saturation.json.tmpl) | `node-cpu-request-saturation.json.tmpl` 
| +| [Detect node memory-request saturation](templates/node-memory-request-saturation.json.tmpl) | `node-memory-request-saturation.json.tmpl` | +| [Detect node disk pressure condition](templates/node-disk-pressure.json.tmpl) | `node-disk-pressure.json.tmpl` | +| [Detect node memory pressure condition](templates/node-memory-pressure.json.tmpl) | `node-memory-pressure.json.tmpl` | +| [Detect node condition not ready](templates/node-condition-not-ready.json.tmpl) | `node-condition-not-ready.json.tmpl` | +| [Detect etcd has no leader](templates/etcd-no-leader.json.tmpl) | `etcd-no-leader.json.tmpl` | -### Step 1: Download the alert template file. +## Creating Alerts -1. Replace ``, (ex: `/tmp/pod-stuck-in-pending.json`). -2. Replace ``, (ex: `pod-stuck-in-pending.json.tmpl`). +1. Ensure that you have the information for the required fields: + - **Wavefront API token**. See [Managing API Tokens](https://docs.wavefront.com/wavefront_api.html#managing-api-tokens) page. + - **Wavefront instance**. For example, the value of `<YOUR_WAVEFRONT_INSTANCE>` from your wavefront url (`https://<YOUR_WAVEFRONT_INSTANCE>.wavefront.com`). + - **Cluster name**. For example, the value of `clusterName` from your Wavefront Custom Resource configuration (ex: `mycluster-us-west-1`). + - **(Optional) Alert template**. For example, the value of `<ALERT_TEMPLATE_FILE>` from the list of alert templates (ex: `pod-backoff-event.json.tmpl`). + - **(Optional) Alert target**. For example, an email address, PagerDuty key, or [alert target](https://docs.wavefront.com/webhooks_alert_notification.html). Alert targets can be a comma separated list. 
+ +### Example: Creating All the Alerts ```bash -export ALERT_FILE_OUTPUT_PATH= -export ALERT_TEMPLATE_FILE= -curl -sSL -o "$ALERT_FILE_OUTPUT_PATH" "https://raw.githubusercontent.com/wavefrontHQ/observability-for-kubernetes/main/docs/alerts/templates/$ALERT_TEMPLATE_FILE" +curl -sSL https://raw.githubusercontent.com/wavefrontHQ/observability-for-kubernetes/main/docs/alerts/create-all-alerts.sh | bash -s -- \ + -t <YOUR_API_TOKEN> \ + -c <YOUR_WAVEFRONT_INSTANCE> \ + -e <YOUR_ALERT_TARGET> \ + -n <YOUR_CLUSTER_NAME> ``` -### Step 2: Create the alert template. +>**Note:** You will need to change <YOUR_API_TOKEN>, <YOUR_WAVEFRONT_INSTANCE>, <YOUR_ALERT_TARGET>, and <YOUR_CLUSTER_NAME> in the above example. -1. Ensure that you have the information for the required fields: - - **Wavefront API token**. See [Managing API Tokens](https://docs.wavefront.com/wavefront_api.html#managing-api-tokens) page. - - **Wavefront instance**. For example, the value of `` from your wavefront url (`https://.wavefront.com`). - - **Cluster name**. For example, the value of `clusterName` from your Wavefront Custom Resource configuration (ex: `mycluster-us-west-1`). - - **Alert template file**. For example, the download output path of the alert template file from **Step 1**. +### Example: Creating a Single Alert ```bash curl -sSL https://raw.githubusercontent.com/wavefrontHQ/observability-for-kubernetes/main/docs/alerts/create-alert.sh | bash -s -- \ -t <YOUR_API_TOKEN> \ -c <YOUR_WAVEFRONT_INSTANCE> \ -n <YOUR_CLUSTER_NAME> \ -f <PATH_TO_ALERT_FILE> + -e <YOUR_ALERT_TARGET> \ + -f <ALERT_TEMPLATE_FILE> ``` -**Note:** You will need to change YOUR_API_TOKEN, YOUR_WAVEFRONT_INSTANCE, YOUR_CLUSTER_NAME, and PATH_TO_ALERT_FILE in the above example. +>**Note:** You will need to change <YOUR_API_TOKEN>, <YOUR_WAVEFRONT_INSTANCE>, <YOUR_CLUSTER_NAME>, <YOUR_ALERT_TARGET>, and <ALERT_TEMPLATE_FILE> in the above example. -### Step 3: Customize the alert. +## Customizing Alerts -1. Log in to your service instance `https://.wavefront.com` as a user with the Alerts permission. Click **Alerting** > **All Alerts** from the toolbar to display the Alerts Browser. +1. Log in to your service instance `https://<YOUR_WAVEFRONT_INSTANCE>.wavefront.com` as a user with the Alerts permission. Click **Alerting** > **All Alerts** from the toolbar to display the Alerts Browser. 2. 
Click the alert name, or click the ellipsis icon next to the alert and select **Edit**. You can search for the alert by typing the alert name in the search field. 3. Change the alert properties when you edit the alert. -4. Specify alert recipients to receive notifications when the alert changes state. -5. Click **Save** in the top right to save your changes. +4. Click **Save** in the top right to save your changes. -See [Create and Manage Alerts](https://docs.wavefront.com/alerts_manage.html) for an overview on how to create and manage alerts. +>**Note:** See [Create and Manage Alerts](https://docs.wavefront.com/alerts_manage.html) for an overview on how to create and manage alerts. diff --git a/docs/alerts/create-alert.sh b/docs/alerts/create-alert.sh index bb550b546..dcfdcb04c 100755 --- a/docs/alerts/create-alert.sh +++ b/docs/alerts/create-alert.sh @@ -1,11 +1,53 @@ #!/usr/bin/env bash set -eo pipefail +function download_alert() { + local github_repo="$1" + local alert_path="$2" + local git_branch="$3" + local alert_file="$4" + local response res_code + + printf "Downloading alert ..." + + response=$(mktemp) + res_code=$(curl --silent --show-error --output "${response}" --write-out "%{http_code}" \ + "https://api.github.com/repos/${github_repo}/contents/${alert_path}?ref=${git_branch}" \ + -H "Accept: application/vnd.github+json") + + if [[ ${res_code} -ne 200 ]]; then + print_err_and_exit "Unable to download alert: $(cat "${response}")" + fi + + local download_url + if [ -x "$(command -v jq)" ]; then + download_url=$(jq -r '.download_url' "${response}") + else + download_url=$(grep download_url "${response}" | tr '",' ' ' | awk '{print $3}') + fi + + res_code=$(curl --silent --show-error --output "${alert_file}" --write-out "%{http_code}" -L "${download_url}") + if [[ ${res_code} -ne 200 ]]; then + print_err_and_exit "Unable to download alert: $(cat "${alert_file}")" + fi + + echo " done." 
+} + function post_alert_to_wavefront() { - local wavefront_token=$1 - local wavefront_cluster=$2 - local alert_file=$3 - local k8s_cluster_name=$4 + local wavefront_token="$1" + local wavefront_cluster="$2" + local alert_file="$3" + local k8s_cluster_name="$4" + local alert_target="$5" + local alert_name response res_code + + if [ -x "$(command -v jq)" ]; then + alert_name=$(jq -r '.name' "${alert_file}") + echo "Creating alert: ${alert_name}" + else + echo "Creating alert: ${alert_file}" + fi response=$(mktemp) res_code=$(curl --silent --show-error --output "${response}" --write-out "%{http_code}" \ @@ -13,54 +55,100 @@ function post_alert_to_wavefront() { -H "Accept: application/json" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer ${wavefront_token}" \ - -d @<(sed "s/K8S_CLUSTER_NAME/${k8s_cluster_name}/g" "${alert_file}")) + -d @<(sed "s/K8S_CLUSTER_NAME/${k8s_cluster_name}/g" "${alert_file}" | sed "s/ALERT_TARGET/${alert_target}/g")) if [[ ${res_code} -ne 200 ]]; then - echo "Unable to create alert: " - cat "${response}" - exit 1 + print_err_and_exit "Unable to create alert: $(cat "${response}")" + fi + + local alert_id + alert_id=$(sed -n 's/.*"id":"\{0,1\}\([0-9][^,"]*\)"\{0,1\}.*/\1/p' "${response}") + + echo "Alert has been created at: https://${wavefront_cluster}.wavefront.com/alerts/${alert_id}" +} + +function get_csp_access_token() { + local csp_endpoint="$1" + local csp_token_or_secret="$2" + local csp_app_id="$3" + local csp_org_id="$4" + local csp_access_token response res_code + + printf "Retrieving the CSP access token ..." 
+ + response=$(mktemp) + + if [[ -z "${csp_app_id}" ]]; then + local csp_api_token="${csp_token_or_secret}" + res_code=$(curl --silent --show-error --output "${response}" --write-out "%{http_code}" \ + -X POST "https://${csp_endpoint}.cloud.vmware.com/csp/gateway/am/api/auth/api-tokens/authorize" \ + -H "Accept: application/json" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "api_token=${csp_api_token}") + else + local csp_credentials + csp_credentials=$(printf '%s:%s' "${csp_app_id}" "${csp_token_or_secret}" | base64) + res_code=$(curl --silent --show-error --output "${response}" --write-out "%{http_code}" \ + -X POST "https://${csp_endpoint}.cloud.vmware.com/csp/gateway/am/api/auth/authorize" \ + -H "Accept: application/json" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -H "Authorization: Basic ${csp_credentials}" \ + -d "grant_type=client_credentials&orgId=${csp_org_id}") fi if [ -x "$(command -v jq)" ]; then - alert_name=$(jq -r '.name' "${alert_file}") - echo "Alert name: ${alert_name}" + if [[ ${res_code} -ne 200 ]]; then + print_err_and_exit "Unable to retrieve the CSP access token: $(jq -r '.message' "${response}")" + fi + csp_access_token=$(jq -r '.access_token' "${response}") + else + if [[ ${res_code} -ne 200 ]]; then + print_err_and_exit "Unable to retrieve the CSP access token: $(cat "${response}")" + fi + for item in $(tr '{,}' ' ' < "${response}"); do + if echo "${item}" | grep access_token >/dev/null; then + csp_access_token=$(echo "${item}" | tr '"' ' ' | awk '{print $3}') + break + fi + done fi - alert_id=$(sed -n 's/.*id":"\([0-9]*\).*/\1/p' "${response}") + WAVEFRONT_TOKEN="${csp_access_token}" - echo "Alert has been created at: https://${wavefront_cluster}.wavefront.com/alerts/${alert_id}" + echo " done." } function check_alert_file() { - local alert_file=$1 + local alert_file="$1" if ! 
[ -f "${alert_file}" ]; then - echo "Invalid alert file: ${alert_file}" - exit 1 + print_err_and_exit "Invalid alert file: ${alert_file}" fi if [ -x "$(command -v jq)" ] && ! jq -e . "${alert_file}" &>/dev/null; then - echo "Invalid json format for alert file: ${alert_file}" - exit 1 + print_err_and_exit "Invalid json format for alert file: ${alert_file}" elif [ -x "$(command -v python)" ] \ && ! python -c "import sys,json;json.loads(sys.stdin.read())" < "${alert_file}" &>/dev/null; then - echo "Invalid json format for alert file: ${alert_file}" - exit 1 + print_err_and_exit "Invalid json format for alert file: ${alert_file}" elif [ -x "$(command -v python3)" ] \ && ! python3 -c "import sys,json;json.loads(sys.stdin.read())" < "${alert_file}" &>/dev/null; then - echo "Invalid json format for alert file: ${alert_file}" - exit 1 + print_err_and_exit "Invalid json format for alert file: ${alert_file}" fi } function check_required_argument() { - local required_arg=$1 - local failure_msg=$2 - if [[ -z ${required_arg} ]]; then - print_usage_and_exit "$failure_msg" + local required_arg="$1" + local failure_msg="$2" + if [[ -z "${required_arg}" ]]; then + print_usage_and_exit "${failure_msg}" fi } +function print_err_and_exit() { + echo "Error: $1" + exit 1 +} + function print_usage_and_exit() { echo "Failure: $1" print_usage @@ -68,39 +156,76 @@ function print_usage_and_exit() { } function print_usage() { - echo "Usage: create-alert.sh -t -c -f -n -h" - echo -e "\t-t wavefront token (required)" + echo "Usage: create-alert [flags] [options]" + echo -e "\t-t wavefront api token (optional)" echo -e "\t-c wavefront instance name (required)" - echo -e "\t-f path to alert file (required)" echo -e "\t-n kubernetes cluster name (required)" + echo -e "\t-f alert template file name (required)" + echo -e "\t-p end-point for csp authentication (optional)" + echo -e "\t-a api token for csp authentication (optional)" + echo -e "\t-i oauth app id for csp authentication (optional)" + 
echo -e "\t-s oauth app secret for csp authentication (optional)" + echo -e "\t-o oauth org id for csp authentication (optional)" + echo -e "\t-e alert target (optional)" echo -e "\t-h print usage" } function main() { # Required arguments - local WF_CLUSTER= - local ALERT_FILE= - local K8S_CLUSTER_NAME= - - while getopts 'c:t:f:n:h' opt; do + local WF_CLUSTER='' + local ALERT_FILE_NAME='' + local K8S_CLUSTER_NAME='' + + # Optional arguments + local CSP_ENDPOINT='console' + local CSP_API_TOKEN='' + local CSP_APP_ID='' + local CSP_APP_SECRET='' + + # Default arguments + local GITHUB_REPO='wavefrontHQ/observability-for-kubernetes' + local ALERTS_DIRECTORY='docs/alerts/templates' + local GIT_BRANCH='main' + local ALERT_TARGET='' + + while getopts ':t:c:n:f:p:a:i:s:o:e:d:b:h' opt; do case "${opt}" in - t) WAVEFRONT_TOKEN="${OPTARG}" ;; - c) WF_CLUSTER="${OPTARG}" ;; - f) ALERT_FILE="${OPTARG}" ;; - n) K8S_CLUSTER_NAME="${OPTARG}" ;; - h) print_usage; exit 0 ;; - \?) print_usage_and_exit "Invalid option" ;; + t) WAVEFRONT_TOKEN="${OPTARG}" ;; + c) WF_CLUSTER="${OPTARG}" ;; + n) K8S_CLUSTER_NAME="${OPTARG}" ;; + f) ALERT_FILE_NAME="${OPTARG}" ;; + p) CSP_ENDPOINT="${OPTARG}" ;; + a) CSP_API_TOKEN="${OPTARG}" ;; + i) CSP_APP_ID="${OPTARG}" ;; + s) CSP_APP_SECRET="${OPTARG}" ;; + o) CSP_ORG_ID="${OPTARG}" ;; + e) ALERT_TARGET="${OPTARG}" ;; + d) ALERTS_DIRECTORY="${OPTARG}" ;; + b) GIT_BRANCH="${OPTARG}" ;; + h) print_usage; exit 0 ;; + \?) 
print_usage_and_exit "Invalid option: -${OPTARG}" ;; esac done + # Get the CSP access token if necessary + if [[ -n "${CSP_API_TOKEN}" ]]; then + get_csp_access_token "${CSP_ENDPOINT}" "${CSP_API_TOKEN}" + elif [[ -n "${CSP_APP_ID}" ]]; then + get_csp_access_token "${CSP_ENDPOINT}" "${CSP_APP_SECRET}" "${CSP_APP_ID}" "${CSP_ORG_ID}" + fi + # Checking for required arguments check_required_argument "${WAVEFRONT_TOKEN}" "-t is required" check_required_argument "${WF_CLUSTER}" "-c is required" - check_required_argument "${ALERT_FILE}" "-f is required" check_required_argument "${K8S_CLUSTER_NAME}" "-n is required" - - check_alert_file "${ALERT_FILE}" - post_alert_to_wavefront "${WAVEFRONT_TOKEN}" "${WF_CLUSTER}" "${ALERT_FILE}" "${K8S_CLUSTER_NAME}" + check_required_argument "${ALERT_FILE_NAME}" "-f is required" + + # Download and create the alert + TEMP_FILE=$(mktemp) + download_alert "${GITHUB_REPO}" "${ALERTS_DIRECTORY}/${ALERT_FILE_NAME}" "${GIT_BRANCH}" "${TEMP_FILE}" + check_alert_file "${TEMP_FILE}" + post_alert_to_wavefront "${WAVEFRONT_TOKEN}" "${WF_CLUSTER}" "${TEMP_FILE}" "${K8S_CLUSTER_NAME}" "${ALERT_TARGET}" + rm "${TEMP_FILE}" } main "$@" diff --git a/docs/alerts/create-all-alerts.sh b/docs/alerts/create-all-alerts.sh new file mode 100755 index 000000000..d5bd12d06 --- /dev/null +++ b/docs/alerts/create-all-alerts.sh @@ -0,0 +1,246 @@ +#!/usr/bin/env bash +set -eo pipefail + +function download_alerts() { + local github_repo="$1" + local alerts_path="$2" + local git_branch="$3" + local response res_code + + printf "Downloading alerts ..." 
+ + response=$(mktemp) + res_code=$(curl --silent --show-error --output "${response}" --write-out "%{http_code}" \ + "https://api.github.com/repos/${github_repo}/contents/${alerts_path}?ref=${git_branch}" \ + -H "Accept: application/vnd.github+json") + + if [[ ${res_code} -ne 200 ]]; then + print_err_and_exit "Unable to download alerts: $(cat "${response}")" + fi + + pushd "${TEMP_DIR}" >/dev/null + local download_urls + if [ -x "$(command -v jq)" ]; then + download_urls=$(jq -r '.[].download_url' "${response}") + else + download_urls=$(grep download_url "${response}" | awk '{print $2}' | tr '",' ' ') + fi + # shellcheck disable=SC2068 + for download_url in ${download_urls[@]}; do + res_code=$(curl --silent --show-error --write-out "%{http_code}" -LO "${download_url}") + if [[ ${res_code} -ne 200 ]]; then + print_err_and_exit "Unable to download alert at: ${download_url}" + fi + done + popd >/dev/null + + echo " done." +} + +function create_alerts() { + local wavefront_token="$1" + local wavefront_cluster="$2" + local k8s_cluster_name="$3" + local alert_target="$4" + local alert_files + + pushd "${TEMP_DIR}" >/dev/null + alert_files=$(ls "${TEMP_DIR}") + # shellcheck disable=SC2068 + for alert_file in ${alert_files[@]}; do + check_alert_file "${alert_file}" + post_alert_to_wavefront "${WAVEFRONT_TOKEN}" "${WF_CLUSTER}" "${K8S_CLUSTER_NAME}" "${alert_file}" "${alert_target}" + done + popd >/dev/null + + echo "Link to alerts just created: https://${wavefront_cluster}.wavefront.com/alerts?search=%7B%22searchTerms%22%3A%5B%7B%22type%22%3A%22tagpath%22%2C%22value%22%3A%22integration.kubernetes%22%7D%2C%7B%22type%22%3A%22freetext%22%2C%22value%22%3A%22${k8s_cluster_name}%22%7D%5D%2C%22sortOrder%22%3A%22ascending%22%2C%22sortField%22%3Anull%2C%22pageNum%22%3A1%7D&tagPathTree=%7B%22integration%22%3A%7B%22wf-value%22%3A%22integration%22%7D%7D" + +} + +function post_alert_to_wavefront() { + local wavefront_token="$1" + local wavefront_cluster="$2" + local 
k8s_cluster_name="$3" + local alert_file="$4" + local alert_target="$5" + local alert_name response res_code + + if [ -x "$(command -v jq)" ]; then + alert_name=$(jq -r '.name' "${alert_file}") + echo "Creating alert: ${alert_name}" + else + echo "Creating alert: ${alert_file}" + fi + + response=$(mktemp) + res_code=$(curl --silent --show-error --output "${response}" --write-out "%{http_code}" \ + -X POST "https://${wavefront_cluster}.wavefront.com/api/v2/alert?useMultiQuery=true" \ + -H "Accept: application/json" \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer ${wavefront_token}" \ + -d @<(sed "s/K8S_CLUSTER_NAME/${k8s_cluster_name}/g" "${alert_file}" | sed "s/ALERT_TARGET/${alert_target}/g")) + + if [[ ${res_code} -ne 200 ]]; then + print_err_and_exit "Unable to create alert: $(cat "${response}")" + fi + + echo "Alert has been created." +} + +function get_csp_access_token() { + local csp_endpoint="$1" + local csp_token_or_secret="$2" + local csp_app_id="$3" + local csp_org_id="$4" + local csp_access_token response res_code + + printf "Retrieving the CSP access token ..." 
+ + response=$(mktemp) + + if [[ -z "${csp_app_id}" ]]; then + local csp_api_token="${csp_token_or_secret}" + res_code=$(curl --silent --show-error --output "${response}" --write-out "%{http_code}" \ + -X POST "https://${csp_endpoint}.cloud.vmware.com/csp/gateway/am/api/auth/api-tokens/authorize" \ + -H "Accept: application/json" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "api_token=${csp_api_token}") + else + local csp_credentials + csp_credentials=$(printf '%s:%s' "${csp_app_id}" "${csp_token_or_secret}" | base64) + res_code=$(curl --silent --show-error --output "${response}" --write-out "%{http_code}" \ + -X POST "https://${csp_endpoint}.cloud.vmware.com/csp/gateway/am/api/auth/authorize" \ + -H "Accept: application/json" \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -H "Authorization: Basic ${csp_credentials}" \ + -d "grant_type=client_credentials&orgId=${csp_org_id}") + fi + + if [ -x "$(command -v jq)" ]; then + if [[ ${res_code} -ne 200 ]]; then + print_err_and_exit "Unable to retrieve the CSP access token: $(jq -r '.message' "${response}")" + fi + csp_access_token=$(jq -r '.access_token' "${response}") + else + if [[ ${res_code} -ne 200 ]]; then + print_err_and_exit "Unable to retrieve the CSP access token: $(cat "${response}")" + fi + for item in $(tr '{,}' ' ' < "${response}"); do + if echo "${item}" | grep access_token >/dev/null; then + csp_access_token=$(echo "${item}" | tr '"' ' ' | awk '{print $3}') + break + fi + done + fi + + WAVEFRONT_TOKEN="${csp_access_token}" + + echo " done." +} + +function check_alert_file() { + local alert_file="$1" + + if ! [ -f "${alert_file}" ]; then + print_err_and_exit "Invalid alert file: ${alert_file}" + fi + + if [ -x "$(command -v jq)" ] && ! jq -e . "${alert_file}" &>/dev/null; then + print_err_and_exit "Invalid json format for alert file: ${alert_file}" + elif [ -x "$(command -v python)" ] \ + && ! 
python -c "import sys,json;json.loads(sys.stdin.read())" < "${alert_file}" &>/dev/null; then + print_err_and_exit "Invalid json format for alert file: ${alert_file}" + elif [ -x "$(command -v python3)" ] \ + && ! python3 -c "import sys,json;json.loads(sys.stdin.read())" < "${alert_file}" &>/dev/null; then + print_err_and_exit "Invalid json format for alert file: ${alert_file}" + fi +} + +function check_required_argument() { + local required_arg="$1" + local failure_msg="$2" + if [[ -z "${required_arg}" ]]; then + print_usage_and_exit "${failure_msg}" + fi +} + +function print_err_and_exit() { + echo "Error: $1" + exit 1 +} + +function print_usage_and_exit() { + echo "Failure: $1" + print_usage + exit 1 +} + +function print_usage() { + echo "Usage: create-all-alerts [flags] [options]" + echo -e "\t-t wavefront api token (optional)" + echo -e "\t-c wavefront instance name (required)" + echo -e "\t-n kubernetes cluster name (required)" + echo -e "\t-p end-point for csp authentication (optional)" + echo -e "\t-a api token for csp authentication (optional)" + echo -e "\t-i oauth app id for csp authentication (optional)" + echo -e "\t-s oauth app secret for csp authentication (optional)" + echo -e "\t-o oauth org id for csp authentication (optional)" + echo -e "\t-e alert target (optional)" + echo -e "\t-h print usage" +} + +function main() { + # Required arguments + local WF_CLUSTER='' + local K8S_CLUSTER_NAME='' + + # Optional arguments + local CSP_ENDPOINT='console' + local CSP_API_TOKEN='' + local CSP_APP_ID='' + local CSP_APP_SECRET='' + + # Default arguments + local GITHUB_REPO='wavefrontHQ/observability-for-kubernetes' + local ALERTS_DIRECTORY='docs/alerts/templates' + local GIT_BRANCH='main' + local ALERT_TARGET='' + + while getopts ':t:c:n:p:a:i:s:o:e:d:b:h' opt; do + case "${opt}" in + t) WAVEFRONT_TOKEN="${OPTARG}" ;; + c) WF_CLUSTER="${OPTARG}" ;; + n) K8S_CLUSTER_NAME="${OPTARG}" ;; + p) CSP_ENDPOINT="${OPTARG}" ;; + a) CSP_API_TOKEN="${OPTARG}" ;; + i) 
CSP_APP_ID="${OPTARG}" ;; + s) CSP_APP_SECRET="${OPTARG}" ;; + o) CSP_ORG_ID="${OPTARG}" ;; + e) ALERT_TARGET="${OPTARG}" ;; + d) ALERTS_DIRECTORY="${OPTARG}" ;; + b) GIT_BRANCH="${OPTARG}" ;; + h) print_usage; exit 0 ;; + \?) print_usage_and_exit "Invalid option: -${OPTARG}" ;; + esac + done + + # Get the CSP access token if necessary + if [[ -n "${CSP_API_TOKEN}" ]]; then + get_csp_access_token "${CSP_ENDPOINT}" "${CSP_API_TOKEN}" + elif [[ -n "${CSP_APP_ID}" ]]; then + get_csp_access_token "${CSP_ENDPOINT}" "${CSP_APP_SECRET}" "${CSP_APP_ID}" "${CSP_ORG_ID}" + fi + + # Checking for required arguments + check_required_argument "${WAVEFRONT_TOKEN}" "-t is required" + check_required_argument "${WF_CLUSTER}" "-c is required" + check_required_argument "${K8S_CLUSTER_NAME}" "-n is required" + + # Download and create all the alerts + TEMP_DIR=$(mktemp -d) + download_alerts "${GITHUB_REPO}" "${ALERTS_DIRECTORY}" "${GIT_BRANCH}" + create_alerts "${WAVEFRONT_TOKEN}" "${WF_CLUSTER}" "${K8S_CLUSTER_NAME}" "${ALERT_TARGET}" + rm -rf "${TEMP_DIR}" +} + +main "$@" diff --git a/docs/alerts/templates/container-cpu-overutilization.json.tmpl b/docs/alerts/templates/container-cpu-overutilization.json.tmpl index d188097db..4b14bec5b 100644 --- a/docs/alerts/templates/container-cpu-overutilization.json.tmpl +++ b/docs/alerts/templates/container-cpu-overutilization.json.tmpl @@ -1,26 +1,51 @@ { "name": "Container CPU Overutilization", "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a container's cpu utilization percentage is constantly high.", + "alertChartUnits": "%", "alertSources": [ + { + "name": "A", + "query": "round(sum(${B}, sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, container_name) / sum(${C}, sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, container_name) * 100)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + 
"query": "mavg(5m, align(1m, ts(\"kubernetes.pod_container.cpu.usage_rate\", cluster=\"K8S_CLUSTER_NAME\")))", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "C", + "query": "mavg(5m, align(1m, ts(\"kubernetes.pod_container.cpu.limit\", cluster=\"K8S_CLUSTER_NAME\")))", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, { "name": "Alert Condition", - "query": "round(sum(mavg(5m, align(1m, ts(\"kubernetes.pod_container.cpu.usage_rate\", cluster=\"K8S_CLUSTER_NAME\"))) by (container_name, pod_name, namespace_name)) / sum(mavg(5m, align(1m, ts(\"kubernetes.pod_container.cpu.limit\", cluster=\"K8S_CLUSTER_NAME\"))) by (container_name, pod_name, namespace_name)) * 100)", + "query": "${A}", "queryType": "WQL", "alertSourceType": ["CONDITION"], "hidden": true }, { "name": "Display Condition", - "query": "if(${Alert Condition} >= 90, ${Alert Condition})", + "query": "${A}.ge(90)", "queryType": "WQL", - "alertSourceType": ["VARIABLE","AUDIT"], + "alertSourceType": ["AUDIT"], "hidden": false } ], "conditions": { - "info": "round(sum(mavg(5m, align(1m, ts(\"kubernetes.pod_container.cpu.usage_rate\", cluster=\"K8S_CLUSTER_NAME\"))) by (container_name, pod_name, namespace_name)) / sum(mavg(5m, align(1m, ts(\"kubernetes.pod_container.cpu.limit\", cluster=\"K8S_CLUSTER_NAME\"))) by (container_name, pod_name, namespace_name)) * 100) >= 90" + "info": "${A} >= 90" }, + "conditionQBEnabled": false, "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, "minutes": 15, "resolveAfterMinutes": 2, "tags": { @@ -30,12 +55,16 @@ }, "alertTriageDashboards": [ { - "dashboardId": "integration-kubernetes-workloads", - "parameters": { - "constants": { - "cluster": "K8S_CLUSTER_NAME" - } - } + "dashboardId": "integration-kubernetes-workloads", + "parameters": { + "constants": { + "cluster": "K8S_CLUSTER_NAME" + } + } } - ] + ], + "targets": { + "info": "ALERT_TARGET" + }, + 
"includeObsoleteMetrics": false } \ No newline at end of file diff --git a/docs/alerts/templates/container-cpu-throttling.json.tmpl b/docs/alerts/templates/container-cpu-throttling.json.tmpl index 5111be911..8b7c05739 100644 --- a/docs/alerts/templates/container-cpu-throttling.json.tmpl +++ b/docs/alerts/templates/container-cpu-throttling.json.tmpl @@ -1,26 +1,51 @@ { "name": "Container CPU Throttling", "alertType": "THRESHOLD", + "additionalInformation": "", + "alertChartUnits": "%", "alertSources": [ + { + "name": "A", + "query": "round(sum(${B}, sources, cluster, cluster_uuid, container_name, pod_name, namespace_name) / sum(${C}, sources, cluster, cluster_uuid, container_name, pod_name, namespace_name) * 100)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "mavg(5m, rate(ts(\"kubernetes.cadvisor.container.cpu.cfs.throttled.periods.total.counter\", cluster=\"K8S_CLUSTER_NAME\")))", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "C", + "query": "mavg(5m, rate(ts(\"kubernetes.cadvisor.container.cpu.cfs.periods.total.counter\", cluster=\"K8S_CLUSTER_NAME\")))", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, { "name": "Alert Condition", - "query": "round(sum((mavg(5m, rate(ts(\"kubernetes.cadvisor.container.cpu.cfs.throttled.periods.total.counter\", cluster=\"K8S_CLUSTER_NAME\"))) * 300) by (container, pod, namespace)) / sum((mavg(5m, rate(ts(\"kubernetes.cadvisor.container.cpu.cfs.periods.total.counter\", cluster=\"K8S_CLUSTER_NAME\"))) * 300) by (container, pod, namespace)) * 100)", + "query": "${A}", "queryType": "WQL", "alertSourceType": ["CONDITION"], "hidden": true }, { "name": "Display Condition", - "query": "if(${Alert Condition} >= 25, ${Alert Condition})", + "query": "${A}.ge(25)", "queryType": "WQL", - "alertSourceType": ["VARIABLE","AUDIT"], + "alertSourceType": ["AUDIT"], "hidden": false } ], "conditions": { - 
"info": "round(sum((mavg(5m, rate(ts(\"kubernetes.cadvisor.container.cpu.cfs.throttled.periods.total.counter\", cluster=\"K8S_CLUSTER_NAME\"))) * 300) by (container, pod, namespace)) / sum((mavg(5m, rate(ts(\"kubernetes.cadvisor.container.cpu.cfs.periods.total.counter\", cluster=\"K8S_CLUSTER_NAME\"))) * 300) by (container, pod, namespace)) * 100) >= 25" + "info": "${A} >= 25" }, + "conditionQBEnabled": false, "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, "minutes": 15, "resolveAfterMinutes": 2, "tags": { @@ -30,12 +55,16 @@ }, "alertTriageDashboards": [ { - "dashboardId": "integration-kubernetes-workloads", - "parameters": { - "constants": { - "cluster": "K8S_CLUSTER_NAME" - } - } + "dashboardId": "integration-kubernetes-workloads", + "parameters": { + "constants": { + "cluster": "K8S_CLUSTER_NAME" + } + } } - ] -} + ], + "targets": { + "info": "ALERT_TARGET" + }, + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/container-memory-overutilization.json.tmpl b/docs/alerts/templates/container-memory-overutilization.json.tmpl new file mode 100644 index 000000000..e94a10734 --- /dev/null +++ b/docs/alerts/templates/container-memory-overutilization.json.tmpl @@ -0,0 +1,67 @@ +{ + "name": "Container Memory Overutilization", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a container's memory utilization percentage is constantly high.", + "alertChartUnits": "%", + "alertSources": [ + { + "name": "A", + "query": "round(sum(${B}, sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, container_name) / sum(${C}, sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, container_name) * 100)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "mavg(5m, align(1m, ts(\"kubernetes.pod_container.memory.working_set\", 
cluster=\"K8S_CLUSTER_NAME\")))", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "C", + "query": "mavg(5m, align(1m, ts(\"kubernetes.pod_container.memory.limit\", cluster=\"K8S_CLUSTER_NAME\")))", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": "Display Condition", + "query": "${A}.ge(90)", + "queryType": "WQL", + "alertSourceType": ["AUDIT"], + "hidden": false + } + ], + "conditions": { + "info": "${A} >= 90" + }, + "conditionQBEnabled": false, + "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, + "minutes": 15, + "resolveAfterMinutes": 2, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-workloads", + "parameters": { + "constants": { + "cluster": "K8S_CLUSTER_NAME" + } + } + } + ], + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/etcd-no-leader.json.tmpl b/docs/alerts/templates/etcd-no-leader.json.tmpl new file mode 100644 index 000000000..8c1b5f434 --- /dev/null +++ b/docs/alerts/templates/etcd-no-leader.json.tmpl @@ -0,0 +1,53 @@ +{ + "name": "etcd Server Has No Leader", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when an etcd server does not have a leader.", + "alertChartUnits": "", + "alertSources": [ + { + "name": "A", + "query": "retainDimension(ts(\"kubernetes.controlplane.etcd.server.has.leader.gauge\", cluster=\"K8S_CLUSTER_NAME\"), sources, cluster, cluster_uuid, namespace, pod)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": "Display Condition", + "query": 
"if(${A} < 1, ${A})", + "queryType": "WQL", + "alertSourceType": ["AUDIT"], + "hidden": false + } + ], + "conditions": { + "severe": "${A} < 1" + }, + "conditionQBEnabled": false, + "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, + "minutes": 10, + "resolveAfterMinutes": 2, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-control-plane", + "parameters": { + "constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ], + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/node-condition-not-ready.json.tmpl b/docs/alerts/templates/node-condition-not-ready.json.tmpl new file mode 100644 index 000000000..de622d544 --- /dev/null +++ b/docs/alerts/templates/node-condition-not-ready.json.tmpl @@ -0,0 +1,53 @@ +{ + "name": "Node Condition Not Ready", + "alertType": "THRESHOLD", + "additionalInformation": "", + "alertChartUnits": "", + "alertSources": [ + { + "name": "A", + "query": "mcount(5m, ${B}) >= 2", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "count(ts(\"kubernetes.node.status.condition\", cluster=\"K8S_CLUSTER_NAME\" AND condition=\"Ready\" AND status!=\"True\"), sources, cluster, cluster_uuid, nodename, node_role, condition, status)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION","AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} >= 1" + }, + "conditionQBEnabled": false, + "displayExpression": "${Alert Condition}", + "displayExpressionQBEnabled": false, + "minutes": 10, + "resolveAfterMinutes": 2, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-nodes", + "parameters": { + 
"constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ], + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/node-cpu-overutilization.json.tmpl b/docs/alerts/templates/node-cpu-overutilization.json.tmpl new file mode 100644 index 000000000..eb64696b9 --- /dev/null +++ b/docs/alerts/templates/node-cpu-overutilization.json.tmpl @@ -0,0 +1,64 @@ +{ + "name": "Node CPU Overutilization", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a node's cpu utilization percentage is constantly high.", + "alertChartUnits": "%", + "alertSources": [ + { + "name": "A", + "query": "round(sum(${B}, sources, cluster, cluster_uuid, nodename, node_role) * 100)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "ts(\"kubernetes.node.cpu.node_utilization\", cluster=\"K8S_CLUSTER_NAME\")", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": "Display Condition", + "query": "if(${A} >= 80, ${A})", + "queryType": "WQL", + "alertSourceType": ["AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} >= 80", + "severe": "${A} >= 90" + }, + "conditionQBEnabled": false, + "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, + "includeObsoleteMetrics": false, + "minutes": 5, + "resolveAfterMinutes": 5, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-nodes", + "parameters": { + "constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ], + "targets": { + "warn": "ALERT_TARGET" + } +} \ No newline at end of file diff --git a/docs/alerts/templates/node-cpu-request-saturation.json.tmpl b/docs/alerts/templates/node-cpu-request-saturation.json.tmpl 
new file mode 100644 index 000000000..de37c0190 --- /dev/null +++ b/docs/alerts/templates/node-cpu-request-saturation.json.tmpl @@ -0,0 +1,70 @@ +{ + "name": "Node CPU-request Saturation", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a node's cpu-request saturation exceeds the specified threshold.", + "alertChartUnits": "%", + "alertSources": [ + { + "name": "A", + "query": "round(sum(${B}, sources, cluster, cluster_uuid, nodename, node_role) / sum(${C}, sources, cluster, cluster_uuid, nodename, node_role) * 100)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "ts(\"kubernetes.node.cpu.request\", cluster=\"K8S_CLUSTER_NAME\")", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "C", + "query": "ts(\"kubernetes.node.cpu.node_allocatable\", cluster=\"K8S_CLUSTER_NAME\")", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": "Display Condition", + "query": "if(${A} >= 90, ${A})", + "queryType": "WQL", + "alertSourceType": ["AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} >= 90" + }, + "conditionQBEnabled": false, + "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, + "includeObsoleteMetrics": false, + "minutes": 5, + "resolveAfterMinutes": 5, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-nodes", + "parameters": { + "constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ], + "targets": { + "warn": "ALERT_TARGET" + } +} \ No newline at end of file diff --git a/docs/alerts/templates/node-disk-pressure.json.tmpl b/docs/alerts/templates/node-disk-pressure.json.tmpl new file mode 100644 index 000000000..df3583fe9 
--- /dev/null +++ b/docs/alerts/templates/node-disk-pressure.json.tmpl @@ -0,0 +1,53 @@ +{ + "name": "Node Disk Pressure", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a node has the problematic condition of DiskPressure.", + "alertChartUnits": "", + "alertSources": [ + { + "name": "A", + "query": "mcount(5m, ${B}) >= 2", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "count(ts(\"kubernetes.node.status.condition\", cluster=\"K8S_CLUSTER_NAME\" AND (condition=\"DiskPressure\" AND status=\"True\")), sources, cluster, cluster_uuid, nodename, node_role, status, condition)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION","AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} >= 1" + }, + "conditionQBEnabled": false, + "displayExpression": "${Alert Condition}", + "displayExpressionQBEnabled": false, + "includeObsoleteMetrics": false, + "minutes": 5, + "resolveAfterMinutes": 2, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-nodes", + "parameters": { + "constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ] +} \ No newline at end of file diff --git a/docs/alerts/templates/node-filesystem-overutilization.json.tmpl b/docs/alerts/templates/node-filesystem-overutilization.json.tmpl new file mode 100644 index 000000000..415382cdd --- /dev/null +++ b/docs/alerts/templates/node-filesystem-overutilization.json.tmpl @@ -0,0 +1,71 @@ +{ + "name": "Node Filesystem Overutilization", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a node's storage is almost full.", + "alertChartUnits": "%", + "alertSources": [ + { + "name": "A", + "query": "round(sum(${B}, sources, cluster, cluster_uuid, nodename, node_role) / 
sum(${C}, sources, cluster, cluster_uuid, nodename, node_role) * 100)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "ts(\"kubernetes.node.filesystem.usage\", cluster=\"K8S_CLUSTER_NAME\")", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "C", + "query": "ts(\"kubernetes.node.filesystem.limit\", cluster=\"K8S_CLUSTER_NAME\")", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": "Display Condition", + "query": "if(${A} >= 75, ${A})", + "queryType": "WQL", + "alertSourceType": ["AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} >= 75", + "severe": "${A} >= 95" + }, + "conditionQBEnabled": false, + "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, + "includeObsoleteMetrics": false, + "minutes": 10, + "resolveAfterMinutes": 10, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-nodes", + "parameters": { + "constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ], + "targets": { + "warn": "ALERT_TARGET" + } +} \ No newline at end of file diff --git a/docs/alerts/templates/node-memory-overutilization.json.tmpl b/docs/alerts/templates/node-memory-overutilization.json.tmpl new file mode 100644 index 000000000..f50f37ed5 --- /dev/null +++ b/docs/alerts/templates/node-memory-overutilization.json.tmpl @@ -0,0 +1,71 @@ +{ + "name": "Node Memory Overutilization", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a node's memory utilization percentage is constantly high.", + "alertChartUnits": "%", + "alertSources": [ + { + "name": "A", + "query": "round(sum(${B}, sources, cluster, cluster_uuid, nodename, node_role) / 
sum(${C}, sources, cluster, cluster_uuid, nodename, node_role) * 100)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "ts(\"kubernetes.node.memory.working_set\", cluster=\"K8S_CLUSTER_NAME\")", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "C", + "query": "ts(\"kubernetes.node.memory.node_allocatable\", cluster=\"K8S_CLUSTER_NAME\")", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": "Display Condition", + "query": "if(${A} >= 80, ${A})", + "queryType": "WQL", + "alertSourceType": ["AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} >= 80", + "severe": "${A} >= 90" + }, + "conditionQBEnabled": false, + "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, + "includeObsoleteMetrics": false, + "minutes": 10, + "resolveAfterMinutes": 10, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-nodes", + "parameters": { + "constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ], + "targets": { + "warn": "ALERT_TARGET" + } +} \ No newline at end of file diff --git a/docs/alerts/templates/node-memory-pressure.json.tmpl b/docs/alerts/templates/node-memory-pressure.json.tmpl new file mode 100644 index 000000000..6f0293ae5 --- /dev/null +++ b/docs/alerts/templates/node-memory-pressure.json.tmpl @@ -0,0 +1,60 @@ +{ + "name": "Node Memory Pressure", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a node has the problematic condition of MemoryPressure.", + "alertChartUnits": "", + "alertSources": [ + { + "name": "A", + "query": "mcount(5m, ${B}) >= 2", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + 
}, + { + "name": "B", + "query": "count(${C}, sources, cluster, cluster_uuid, nodename, node_role, status, condition)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "C", + "query": "ts(\"kubernetes.node.status.condition\", cluster=\"K8S_CLUSTER_NAME\" AND (condition=\"MemoryPressure\" AND status=\"True\"))", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION","AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} >= 1" + }, + "conditionQBEnabled": false, + "displayExpression": "${Alert Condition}", + "displayExpressionQBEnabled": false, + "includeObsoleteMetrics": false, + "minutes": 5, + "resolveAfterMinutes": 2, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-nodes", + "parameters": { + "constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ] +} \ No newline at end of file diff --git a/docs/alerts/templates/node-memory-request-saturation.json.tmpl b/docs/alerts/templates/node-memory-request-saturation.json.tmpl new file mode 100644 index 000000000..47380ece0 --- /dev/null +++ b/docs/alerts/templates/node-memory-request-saturation.json.tmpl @@ -0,0 +1,70 @@ +{ + "name": "Node Memory-request Saturation", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a node's memory-request saturation exceeds the specified threshold.", + "alertChartUnits": "%", + "alertSources": [ + { + "name": "A", + "query": "round(sum(${B}, sources, cluster, cluster_uuid, nodename, node_role) / sum(${C}, sources, cluster, cluster_uuid, nodename, node_role) * 100)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "ts(\"kubernetes.node.memory.request\", cluster=\"K8S_CLUSTER_NAME\")", + "queryType": "WQL", + 
"alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "C", + "query": "ts(\"kubernetes.node.memory.node_allocatable\", cluster=\"K8S_CLUSTER_NAME\")", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": "Display Condition", + "query": "if(${A} >= 90, ${A})", + "queryType": "WQL", + "alertSourceType": ["AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} >= 90" + }, + "conditionQBEnabled": false, + "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, + "includeObsoleteMetrics": false, + "minutes": 5, + "resolveAfterMinutes": 5, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-nodes", + "parameters": { + "constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ], + "targets": { + "warn": "ALERT_TARGET" + } +} \ No newline at end of file diff --git a/docs/alerts/templates/observability-status-unhealthy.json.tmpl b/docs/alerts/templates/observability-status-unhealthy.json.tmpl new file mode 100644 index 000000000..92cfe712c --- /dev/null +++ b/docs/alerts/templates/observability-status-unhealthy.json.tmpl @@ -0,0 +1,53 @@ +{ + "name": "Observability Status is Unhealthy", + "alertType": "THRESHOLD", + "additionalInformation": "The status of the Observability for Kubernetes is unhealthy.", + "alertChartUnits": "", + "alertSources": [ + { + "name": "A", + "query": "count(ts(\"kubernetes.observability.*.status\", cluster=\"K8S_CLUSTER_NAME\" AND status=\"Unhealthy\"), sources, cluster, cluster_uuid, message, status)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": 
"Display Condition", + "query": "if(${A} > 0, ${A})", + "queryType": "WQL", + "alertSourceType": ["AUDIT"], + "hidden": false + } + ], + "conditions": { + "severe": "${A} > 0" + }, + "conditionQBEnabled": false, + "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, + "minutes": 1, + "resolveAfterMinutes": 5, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "alertTriageDashboards": [ + { + "dashboardId": "integration-kubernetes-status", + "parameters": { + "constants": { + "cluster_name": "K8S_CLUSTER_NAME" + } + } + } + ], + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/persistent-volume-claim-overutilization.json.tmpl b/docs/alerts/templates/persistent-volume-claim-overutilization.json.tmpl new file mode 100644 index 000000000..1df85b30b --- /dev/null +++ b/docs/alerts/templates/persistent-volume-claim-overutilization.json.tmpl @@ -0,0 +1,61 @@ +{ + "name": "Persistent Volume Claim Overutilization", + "alertType": "THRESHOLD", + "additionalInformation": "Alert reports when a Persistent Volume's available storage is below the required level specified.", + "alertChartUnits": "%", + "alertSources": [ + { + "name": "A", + "query": "round(sum(${B}, sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, pvc_name) / sum(${C}, sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, pvc_name) * 100)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "mmin(5m, align(1m, ts(\"kubernetes.pod.filesystem.available\", cluster=\"K8S_CLUSTER_NAME\" AND pvc_name=\"*\")))", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "C", + "query": "mmax(5m, align(1m, ts(\"kubernetes.pod.filesystem.limit\", cluster=\"K8S_CLUSTER_NAME\" AND pvc_name=\"*\")))", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + 
"hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION"], + "hidden": true + }, + { + "name": "Display Condition", + "query": "${A}.le(15)", + "queryType": "WQL", + "alertSourceType": ["AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} <= 15", + "severe": "${A} <= 5" + }, + "conditionQBEnabled": false, + "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, + "minutes": 5, + "resolveAfterMinutes": 5, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "targets": { + "warn": "ALERT_TARGET" + }, + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/persistent-volumes-error.json.tmpl b/docs/alerts/templates/persistent-volumes-error.json.tmpl new file mode 100644 index 000000000..20a34aba0 --- /dev/null +++ b/docs/alerts/templates/persistent-volumes-error.json.tmpl @@ -0,0 +1,46 @@ +{ + "name": "Persistent Volumes Error", + "alertType": "THRESHOLD", + "additionalInformation": "", + "alertChartUnits": "", + "alertSources": [ + { + "name": "A", + "query": "mcount(5m, ${B}) >= 2", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "count(ts(\"kubernetes.pv.status.phase\", cluster=\"K8S_CLUSTER_NAME\" AND (phase=\"Failed\" OR phase=\"Pending\")), sources, cluster, cluster_uuid, pv_name, phase)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION","AUDIT"], + "hidden": false + } + ], + "conditions": { + "warn": "${A} >= 1" + }, + "conditionQBEnabled": false, + "displayExpression": "${Alert Condition}", + "displayExpressionQBEnabled": false, + "minutes": 10, + "resolveAfterMinutes": 2, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "targets": { + "warn": "ALERT_TARGET" + }, + 
"includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/persistent-volumes-no-claim.json.tmpl b/docs/alerts/templates/persistent-volumes-no-claim.json.tmpl new file mode 100644 index 000000000..fce90ff2b --- /dev/null +++ b/docs/alerts/templates/persistent-volumes-no-claim.json.tmpl @@ -0,0 +1,46 @@ +{ + "name": "Persistent Volumes No Claim", + "alertType": "THRESHOLD", + "additionalInformation": "", + "alertChartUnits": "", + "alertSources": [ + { + "name": "A", + "query": "mcount(5m, ${B}) >= 2", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "count(ts(\"kubernetes.pv.status.phase\", cluster=\"K8S_CLUSTER_NAME\" AND phase=\"Available\"), sources, cluster, cluster_uuid, pv_name, phase)", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "Alert Condition", + "query": "${A}", + "queryType": "WQL", + "alertSourceType": ["CONDITION","AUDIT"], + "hidden": false + } + ], + "conditions": { + "info": "${A} >= 1" + }, + "conditionQBEnabled": false, + "displayExpression": "${Alert Condition}", + "displayExpressionQBEnabled": false, + "minutes": 10, + "resolveAfterMinutes": 2, + "tags": { + "customerTags": [ + "integration.kubernetes" + ] + }, + "targets": { + "info": "ALERT_TARGET" + }, + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/pod-backoff-event.json.tmpl b/docs/alerts/templates/pod-backoff-event.json.tmpl index 5dfa37ef7..43aa4c4cb 100644 --- a/docs/alerts/templates/pod-backoff-event.json.tmpl +++ b/docs/alerts/templates/pod-backoff-event.json.tmpl @@ -1,26 +1,37 @@ { "name": "Pod Backoff Event", "alertType": "THRESHOLD", + "additionalInformation": "", + "alertChartUnits": "", "alertSources": [ { - "name": "Alert Condition", - "query": "(mcount(10m, count(ts(\"kubernetes.pod_container.status\", cluster=\"K8S_CLUSTER_NAME\" AND (reason=\"ImagePullBackOff\" OR 
reason=\"CrashLoopBackOff\")), sources, workload_name, pod_name, namespace_name, cluster, cluster_uuid, reason)) >= 5)", + "name": "A", + "query": "mcount(10m, ${B}) >= 5", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "count(ts(\"kubernetes.pod_container.status\", cluster=\"K8S_CLUSTER_NAME\" AND (reason=\"ImagePullBackOff\" OR reason=\"CrashLoopBackOff\")), sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, reason)", "queryType": "WQL", - "alertSourceType": ["CONDITION"], + "alertSourceType": ["VARIABLE"], "hidden": true }, { - "name": "Display Condition", - "query": "if(${Alert Condition}, ${Alert Condition})", + "name": "Alert Condition", + "query": "${A}", "queryType": "WQL", - "alertSourceType": ["VARIABLE","AUDIT"], + "alertSourceType": ["CONDITION","AUDIT"], "hidden": false } ], "conditions": { - "warn": "(mcount(10m, count(ts(\"kubernetes.pod_container.status\", cluster=\"K8S_CLUSTER_NAME\" AND (reason=\"ImagePullBackOff\" OR reason=\"CrashLoopBackOff\")), sources, workload_name, pod_name, namespace_name, cluster, cluster_uuid, reason)) >= 5) >= 1" + "warn": "${A} >= 1" }, - "displayExpression": "${Display Condition}", + "conditionQBEnabled": false, + "displayExpression": "${Alert Condition}", + "displayExpressionQBEnabled": false, "minutes": 10, "resolveAfterMinutes": 2, "tags": { @@ -30,12 +41,16 @@ }, "alertTriageDashboards": [ { - "dashboardId": "integration-kubernetes-workloads", - "parameters": { - "constants": { - "cluster": "K8S_CLUSTER_NAME" - } - } + "dashboardId": "integration-kubernetes-workloads", + "parameters": { + "constants": { + "cluster": "K8S_CLUSTER_NAME" + } + } } - ] -} + ], + "targets": { + "warn": "ALERT_TARGET" + }, + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/pod-out-of-memory-kills.json.tmpl b/docs/alerts/templates/pod-out-of-memory-kills.json.tmpl index 6d35bc36f..07ed0f8fb 
100644 --- a/docs/alerts/templates/pod-out-of-memory-kills.json.tmpl +++ b/docs/alerts/templates/pod-out-of-memory-kills.json.tmpl @@ -1,26 +1,37 @@ { "name": "Pod Out-of-memory Kills", "alertType": "THRESHOLD", + "additionalInformation": "", + "alertChartUnits": "", "alertSources": [ { - "name": "Alert Condition", - "query": "(mcount(15m, count(ts(\"kubernetes.pod_container.status\", cluster=\"K8S_CLUSTER_NAME\" AND reason=\"OOMKilled\"), sources, workload_name, pod_name, namespace_name, cluster, container_name, cluster_uuid, reason)) >= 2)", + "name": "A", + "query": "mcount(15m, ${B}) >= 2", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "count(ts(\"kubernetes.pod_container.status\", cluster=\"K8S_CLUSTER_NAME\" AND reason=\"OOMKilled\"), sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, container_name, reason)", "queryType": "WQL", - "alertSourceType": ["CONDITION"], + "alertSourceType": ["VARIABLE"], "hidden": true }, { - "name": "Display Condition", - "query": "if(${Alert Condition}, ${Alert Condition})", + "name": "Alert Condition", + "query": "${A}", "queryType": "WQL", - "alertSourceType": ["VARIABLE","AUDIT"], + "alertSourceType": ["CONDITION","AUDIT"], "hidden": false } ], "conditions": { - "warn": "(mcount(15m, count(ts(\"kubernetes.pod_container.status\", cluster=\"K8S_CLUSTER_NAME\" AND reason=\"OOMKilled\"), sources, workload_name, pod_name, namespace_name, cluster, container_name, cluster_uuid, reason)) >= 2) >= 1" + "warn": "${A} >= 1" }, - "displayExpression": "${Display Condition}", + "conditionQBEnabled": false, + "displayExpression": "${Alert Condition}", + "displayExpressionQBEnabled": false, "minutes": 15, "resolveAfterMinutes": 2, "tags": { @@ -30,12 +41,16 @@ }, "alertTriageDashboards": [ { - "dashboardId": "integration-kubernetes-workloads", - "parameters": { - "constants": { - "cluster": "K8S_CLUSTER_NAME" - } - } + "dashboardId": 
"integration-kubernetes-workloads", + "parameters": { + "constants": { + "cluster": "K8S_CLUSTER_NAME" + } + } } - ] -} + ], + "targets": { + "warn": "ALERT_TARGET" + }, + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/pod-stuck-in-pending.json.tmpl b/docs/alerts/templates/pod-stuck-in-pending.json.tmpl index b1b18c0a5..40c7ac5cf 100644 --- a/docs/alerts/templates/pod-stuck-in-pending.json.tmpl +++ b/docs/alerts/templates/pod-stuck-in-pending.json.tmpl @@ -1,26 +1,37 @@ { "name": "Pod Stuck in Pending", "alertType": "THRESHOLD", + "additionalInformation": "", + "alertChartUnits": "", "alertSources": [ { - "name": "Alert Condition", - "query": "(mcount(10m, count(ts(\"kubernetes.pod.status.phase\", phase=\"Pending\" AND cluster=\"K8S_CLUSTER_NAME\"), sources, pod_name, workload_name, namespace_name, cluster, message, cluster_uuid, reason)) >= 10)", + "name": "A", + "query": "mcount(10m, ${B}) >= 10", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "count(ts(\"kubernetes.pod.status.phase\", cluster=\"K8S_CLUSTER_NAME\" AND phase=\"Pending\"), sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, message, reason, phase)", "queryType": "WQL", - "alertSourceType": ["CONDITION"], + "alertSourceType": ["VARIABLE"], "hidden": true }, { - "name": "Display Condition", - "query": "if(${Alert Condition}, ${Alert Condition})", + "name": "Alert Condition", + "query": "${A}", "queryType": "WQL", - "alertSourceType": ["VARIABLE","AUDIT"], + "alertSourceType": ["CONDITION","AUDIT"], "hidden": false } ], "conditions": { - "warn": "(mcount(10m, count(ts(\"kubernetes.pod.status.phase\", phase=\"Pending\" AND cluster=\"K8S_CLUSTER_NAME\"), sources, pod_name, workload_name, namespace_name, cluster, message, cluster_uuid, reason)) >= 10) >= 1" + "warn": "${A} >= 1" }, - "displayExpression": "${Display Condition}", + "conditionQBEnabled": false, 
+ "displayExpression": "${Alert Condition}", + "displayExpressionQBEnabled": false, "minutes": 5, "resolveAfterMinutes": 2, "tags": { @@ -30,12 +41,16 @@ }, "alertTriageDashboards": [ { - "dashboardId": "integration-kubernetes-workloads", - "parameters": { - "constants": { - "cluster": "K8S_CLUSTER_NAME" - } - } + "dashboardId": "integration-kubernetes-workloads", + "parameters": { + "constants": { + "cluster": "K8S_CLUSTER_NAME" + } + } } - ] + ], + "targets": { + "warn": "ALERT_TARGET" + }, + "includeObsoleteMetrics": false } \ No newline at end of file diff --git a/docs/alerts/templates/pod-stuck-in-terminating.json.tmpl b/docs/alerts/templates/pod-stuck-in-terminating.json.tmpl index 879678aae..29d0f10ba 100644 --- a/docs/alerts/templates/pod-stuck-in-terminating.json.tmpl +++ b/docs/alerts/templates/pod-stuck-in-terminating.json.tmpl @@ -1,26 +1,37 @@ { "name": "Pod Stuck in Terminating", "alertType": "THRESHOLD", + "additionalInformation": "", + "alertChartUnits": "", "alertSources": [ { - "name": "Alert Condition", - "query": "(mcount(10m, count(ts(\"kubernetes.pod.terminating\", cluster=\"K8S_CLUSTER_NAME\"), sources, workload_name, pod_name, namespace_name, cluster, DeletionTimestamp, cluster_uuid)) >= 10)", + "name": "A", + "query": "mcount(10m, ${B}) >= 10", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "count(ts(\"kubernetes.pod.terminating\", cluster=\"K8S_CLUSTER_NAME\"), sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind, pod_name, reason, DeletionTimestamp)", "queryType": "WQL", - "alertSourceType": ["CONDITION"], + "alertSourceType": ["VARIABLE"], "hidden": true }, { - "name": "Display Condition", - "query": "if(${Alert Condition}, ${Alert Condition})", + "name": "Alert Condition", + "query": "${A}", "queryType": "WQL", - "alertSourceType": ["VARIABLE","AUDIT"], + "alertSourceType": ["CONDITION","AUDIT"], "hidden": false } ], "conditions": { - "warn": 
"(mcount(10m, count(ts(\"kubernetes.pod.terminating\", cluster=\"K8S_CLUSTER_NAME\"), sources, workload_name, pod_name, namespace_name, cluster, DeletionTimestamp, cluster_uuid)) >= 10) >= 1" + "warn": "${A} >= 1" }, - "displayExpression": "${Display Condition}", + "conditionQBEnabled": false, + "displayExpression": "${Alert Condition}", + "displayExpressionQBEnabled": false, "minutes": 5, "resolveAfterMinutes": 2, "tags": { @@ -30,12 +41,16 @@ }, "alertTriageDashboards": [ { - "dashboardId": "integration-kubernetes-workloads", - "parameters": { - "constants": { - "cluster": "K8S_CLUSTER_NAME" - } - } + "dashboardId": "integration-kubernetes-workloads", + "parameters": { + "constants": { + "cluster": "K8S_CLUSTER_NAME" + } + } } - ] -} + ], + "targets": { + "warn": "ALERT_TARGET" + }, + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/alerts/templates/workload-not-ready.json.tmpl b/docs/alerts/templates/workload-not-ready.json.tmpl index f1f323c23..946040901 100644 --- a/docs/alerts/templates/workload-not-ready.json.tmpl +++ b/docs/alerts/templates/workload-not-ready.json.tmpl @@ -1,26 +1,44 @@ { "name": "Workload Not Ready", "alertType": "THRESHOLD", + "additionalInformation": "", + "alertChartUnits": "", "alertSources": [ + { + "name": "A", + "query": "msum(10m, ${B}) >= 10", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, + { + "name": "B", + "query": "sum(ts(\"kubernetes.workload.status\", cluster=\"K8S_CLUSTER_NAME\"), sources, cluster, cluster_uuid, namespace_name, workload_name, workload_kind) = 0", + "queryType": "WQL", + "alertSourceType": ["VARIABLE"], + "hidden": true + }, { "name": "Alert Condition", - "query": "(msum(10m, sum(ts(\"kubernetes.workload.status\", cluster=\"K8S_CLUSTER_NAME\"), sources, workload_name, workload_kind, namespace_name, cluster, cluster_uuid) = 0) >= 10)", + "query": "${A}", "queryType": "WQL", "alertSourceType": ["CONDITION"], "hidden": true }, { "name": "Display 
Condition", - "query": "if(${Alert Condition}, ${Alert Condition})", + "query": "if(${A} >= 1, ${A})", "queryType": "WQL", - "alertSourceType": ["VARIABLE","AUDIT"], + "alertSourceType": ["AUDIT"], "hidden": false } ], "conditions": { - "warn": "(msum(10m, sum(ts(\"kubernetes.workload.status\", cluster=\"K8S_CLUSTER_NAME\"), sources, workload_name, workload_kind, namespace_name, cluster, cluster_uuid) = 0) >= 10) >= 1" + "warn": "${A} >= 1" }, + "conditionQBEnabled": false, "displayExpression": "${Display Condition}", + "displayExpressionQBEnabled": false, "minutes": 5, "resolveAfterMinutes": 2, "tags": { @@ -30,12 +48,16 @@ }, "alertTriageDashboards": [ { - "dashboardId": "integration-kubernetes-workloads", - "parameters": { - "constants": { - "cluster": "K8S_CLUSTER_NAME" - } - } + "dashboardId": "integration-kubernetes-workloads", + "parameters": { + "constants": { + "cluster": "K8S_CLUSTER_NAME" + } + } } - ] -} + ], + "targets": { + "warn": "ALERT_TARGET" + }, + "includeObsoleteMetrics": false +} \ No newline at end of file diff --git a/docs/operator/custom-configuration.md b/docs/operator/custom-configuration.md index f5e5b24a4..c598e0a37 100644 --- a/docs/operator/custom-configuration.md +++ b/docs/operator/custom-configuration.md @@ -9,9 +9,9 @@ Install the Observability for Kubernetes Operator into `observability-system` na | Component | From | To | |---|---|---| -| Observability for Kubernetes Operator | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.13.0` | `YOUR_IMAGE_REGISTRY/kubernetes-operator:2.13.0` | -| Kubernetes Metrics Collector | `projects.registry.vmware.com/tanzu_observability/kubernetes-collector:1.25.0` | `YOUR_IMAGE_REGISTRY/kubernetes-collector:1.25.0` | -| Wavefront Proxy | `projects.registry.vmware.com/tanzu_observability/proxy:13.1` | `YOUR_IMAGE_REGISTRY/proxy:13.1` | +| Observability for Kubernetes Operator | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.14.1` | 
`YOUR_IMAGE_REGISTRY/kubernetes-operator:2.14.1` | +| Kubernetes Metrics Collector | `projects.registry.vmware.com/tanzu_observability/kubernetes-collector:1.26.1` | `YOUR_IMAGE_REGISTRY/kubernetes-collector:1.26.1` | +| Wavefront Proxy | `projects.registry.vmware.com/tanzu_observability/proxy:13.2` | `YOUR_IMAGE_REGISTRY/proxy:13.2` | | Operations for Applications logging | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator-fluentbit:2.1.9` | `YOUR_IMAGE_REGISTRY/kubernetes-operator-fluentbit:2.1.9` | 2. Create a local directory called `observability`. diff --git a/operator/config/manager/component_versions.yaml b/operator/config/manager/component_versions.yaml index f9a1e1cfa..41ef91a1d 100644 --- a/operator/config/manager/component_versions.yaml +++ b/operator/config/manager/component_versions.yaml @@ -7,6 +7,6 @@ metadata: name: component-versions namespace: system data: - collector: "1.25.0" + collector: "1.26.1" logging: "2.1.9" proxy: "13.2" diff --git a/operator/config/manager/kustomization.yaml b/operator/config/manager/kustomization.yaml index d72f9d952..9b5e65c67 100644 --- a/operator/config/manager/kustomization.yaml +++ b/operator/config/manager/kustomization.yaml @@ -15,7 +15,7 @@ kind: Kustomization images: - name: controller newName: projects.registry.vmware.com/tanzu_observability/kubernetes-operator - newTag: 2.13.0 + newTag: 2.14.1 patches: - path: patches.yaml diff --git a/operator/dev-internal/deploy/wavefront-operator.yaml b/operator/dev-internal/deploy/wavefront-operator.yaml index e259eb02b..26aa5fcaa 100644 --- a/operator/dev-internal/deploy/wavefront-operator.yaml +++ b/operator/dev-internal/deploy/wavefront-operator.yaml @@ -165,7 +165,7 @@ spec: default: resources: limits: - cpu: 400m + cpu: 2000m ephemeral-storage: 1Gi memory: 512Mi requests: @@ -304,7 +304,7 @@ spec: default: resources: limits: - cpu: 200m + cpu: 1000m ephemeral-storage: 512Mi memory: 256Mi requests: @@ -719,19 +719,21 @@ spec: type: object type: 
object type: object - kubernetesEvents: - description: KubernetesEvents is deprecated, please use aria-insights-secret - instead + insights: + description: Insights properties: enable: default: false - description: Enable is whether to enable events. Defaults + description: Enable is whether to enable Insights. Defaults to false. type: boolean - externalEndpointURL: + ingestionUrl: + description: Ingestion Url is the endpoint to send kubernetes + events. + pattern: ^http(s)?:\/\/.+ type: string required: - - externalEndpointURL + - ingestionUrl type: object type: object imagePullSecret: @@ -1441,9 +1443,9 @@ subjects: --- apiVersion: v1 data: - collector: 1.25.0 + collector: 1.26.1 logging: 2.1.9 - proxy: "13.1" + proxy: "13.2" kind: ConfigMap metadata: labels: @@ -1513,7 +1515,7 @@ spec: configMapKeyRef: key: logging name: wavefront-component-versions - image: projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.13.0 + image: projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.14.1 imagePullPolicy: Always livenessProbe: httpGet: diff --git a/operator/dev-internal/docs/operator/custom-configuration.md b/operator/dev-internal/docs/operator/custom-configuration.md index f3292a383..c598e0a37 100644 --- a/operator/dev-internal/docs/operator/custom-configuration.md +++ b/operator/dev-internal/docs/operator/custom-configuration.md @@ -9,8 +9,8 @@ Install the Observability for Kubernetes Operator into `observability-system` na | Component | From | To | |---|---|---| -| Observability for Kubernetes Operator | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.13.0` | `YOUR_IMAGE_REGISTRY/kubernetes-operator:2.13.0` | -| Kubernetes Metrics Collector | `projects.registry.vmware.com/tanzu_observability/kubernetes-collector:1.25.0` | `YOUR_IMAGE_REGISTRY/kubernetes-collector:1.25.0` | +| Observability for Kubernetes Operator | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator:2.14.1` | 
`YOUR_IMAGE_REGISTRY/kubernetes-operator:2.14.1` | +| Kubernetes Metrics Collector | `projects.registry.vmware.com/tanzu_observability/kubernetes-collector:1.26.1` | `YOUR_IMAGE_REGISTRY/kubernetes-collector:1.26.1` | | Wavefront Proxy | `projects.registry.vmware.com/tanzu_observability/proxy:13.2` | `YOUR_IMAGE_REGISTRY/proxy:13.2` | | Operations for Applications logging | `projects.registry.vmware.com/tanzu_observability/kubernetes-operator-fluentbit:2.1.9` | `YOUR_IMAGE_REGISTRY/kubernetes-operator-fluentbit:2.1.9` | diff --git a/operator/release/NEXT_RELEASE_VERSION b/operator/release/NEXT_RELEASE_VERSION index b70ae75a8..68e69e405 100644 --- a/operator/release/NEXT_RELEASE_VERSION +++ b/operator/release/NEXT_RELEASE_VERSION @@ -1 +1 @@ -2.14.1 +2.15.0 diff --git a/operator/release/OPERATOR_VERSION b/operator/release/OPERATOR_VERSION index fb2c0766b..b70ae75a8 100644 --- a/operator/release/OPERATOR_VERSION +++ b/operator/release/OPERATOR_VERSION @@ -1 +1 @@ -2.13.0 +2.14.1 diff --git a/release.Jenkinsfile b/release.Jenkinsfile index 1e6058ccd..1258a4ee9 100644 --- a/release.Jenkinsfile +++ b/release.Jenkinsfile @@ -36,7 +36,7 @@ pipeline { sh 'NUMBER_OF_NODES=1 GKE_NODE_POOL=arm-pool make resize-node-pool-gke-cluster' sh 'make clean-cluster' sh './operator/hack/test/deploy/deploy-local.sh -t $WAVEFRONT_TOKEN -n $K8S_CLUSTER_NAME' - sh './operator/hack/test/run-e2e-tests.sh -t $WAVEFRONT_TOKEN -r advanced -v $OPERATOR_VERSION -n $K8S_CLUSTER_NAME' + sh './operator/hack/test/run-e2e-tests.sh -t $WAVEFRONT_TOKEN -r advanced -v $(cat operator/release/OPERATOR_VERSION) -n $K8S_CLUSTER_NAME' sh 'make clean-cluster' sh 'NUMBER_OF_NODES=0 GKE_NODE_POOL=default-pool make resize-node-pool-gke-cluster' sh 'NUMBER_OF_NODES=0 GKE_NODE_POOL=arm-pool make resize-node-pool-gke-cluster' diff --git a/scripts/promote-release-images.sh b/scripts/promote-release-images.sh index 5a19f58c6..985b714ac 100755 --- a/scripts/promote-release-images.sh +++ 
b/scripts/promote-release-images.sh @@ -15,8 +15,8 @@ git show origin/rc:operator/wavefront-operator-main.yaml > ${REPO_ROOT}/operator OPERATOR_ALPHA_IMAGE=$(cat "${REPO_ROOT}"/operator/dev-internal/deploy/wavefront-operator.yaml | yq 'select(.metadata.name == "wavefront-controller-manager" and .kind == "Deployment" ) | .spec.template.spec.containers[0].image') OPERATOR_ALPHA_TAG=$(echo ${OPERATOR_ALPHA_IMAGE} | cut -d ':' -f2) COLLECTOR_ALPHA_TAG=$(cat "${REPO_ROOT}"/operator/dev-internal/deploy/wavefront-operator.yaml | yq 'select(.metadata.name == "wavefront-component-versions" ) | .data.collector') -crane -v copy "projects.registry.vmware.com/tanzu_observability_keights_saas/kubernetes-operator:${OPERATOR_ALPHA_TAG}" "projects.registry.vmware.com/tanzu_observability/kubernetes-operator:${OPERATOR_VERSION}" -crane -v copy "projects.registry.vmware.com/tanzu_observability_keights_saas/kubernetes-collector:${COLLECTOR_ALPHA_TAG}" "projects.registry.vmware.com/tanzu_observability/kubernetes-collector:${COLLECTOR_VERSION}" +#crane -v copy "projects.registry.vmware.com/tanzu_observability_keights_saas/kubernetes-operator:${OPERATOR_ALPHA_TAG}" "projects.registry.vmware.com/tanzu_observability/kubernetes-operator:${OPERATOR_VERSION}" +#crane -v copy "projects.registry.vmware.com/tanzu_observability_keights_saas/kubernetes-collector:${COLLECTOR_ALPHA_TAG}" "projects.registry.vmware.com/tanzu_observability/kubernetes-collector:${COLLECTOR_VERSION}" # Update wavefront-operator yaml in dev-internal with release versions sed -i.bak "s/collector: ${COLLECTOR_ALPHA_TAG}/collector: ${COLLECTOR_VERSION}/g" ${REPO_ROOT}/operator/dev-internal/deploy/wavefront-operator.yaml